├── .gitignore ├── bioinformatics-toolkit-apps ├── LICENSE ├── Setup.hs ├── app │ ├── MergeMotifs.hs │ ├── MkIndex.hs │ └── MotifScan.hs └── bioinformatics-toolkit-apps.cabal └── bioinformatics-toolkit ├── LICENSE ├── README.md ├── Setup.hs ├── benchmarks └── bench.hs ├── bioinformatics-toolkit.cabal ├── cabal.project ├── docs └── main.tex ├── src └── Bio │ ├── ChIPSeq │ └── FragLen.hs │ ├── Data │ ├── Bam.hs │ ├── Bed.hs │ ├── Bed │ │ ├── Types.hs │ │ └── Utils.hs │ ├── BigWig.hs │ ├── Fasta.hs │ ├── Fastq.hs │ └── Types.hs │ ├── GO.hs │ ├── GO │ └── Parser.hs │ ├── HiC.hs │ ├── HiC │ ├── Visualize.hs │ └── Visualize │ │ └── Internal.hs │ ├── Motif.hs │ ├── Motif │ ├── Alignment.hs │ ├── Merge.hs │ └── Search.hs │ ├── RealWorld │ ├── BioGRID.hs │ ├── ENCODE.hs │ ├── Ensembl.hs │ ├── GDC.hs │ ├── GENCODE.hs │ ├── ID.hs │ ├── Reactome.hs │ └── Uniprot.hs │ ├── Seq.hs │ ├── Seq │ └── IO.hs │ └── Utils │ ├── BitVector.hs │ ├── Functions.hs │ ├── Misc.hs │ ├── Overlap.hs │ └── Types.hs └── tests ├── Tests ├── Bam.hs ├── Bed.hs ├── Fastq.hs ├── Motif.hs ├── Seq.hs └── Tools.hs ├── data ├── example.bam ├── example.bed.gz ├── example_copy.bam ├── example_intersect_peaks.bed.gz ├── genes.gtf.gz ├── motifs.fasta ├── motifs.meme ├── pairedend.bam ├── pairedend.bedpe ├── peaks.bed.gz ├── peaks.sorted.bed.gz ├── test.fastq.gz └── test_wrap.fastq.gz └── test.hs /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | cabal-dev 3 | *.o 4 | *.hi 5 | *.chi 6 | *.chs.h 7 | *.swp 8 | .virtualenv 9 | .hsenv 10 | .cabal-sandbox/ 11 | cabal.sandbox.config 12 | cabal.config 13 | .stack-work 14 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Author name here (c) 2017 2 | 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Author name here nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/app/MergeMotifs.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Main where 4 | 5 | import AI.Clustering.Hierarchical hiding (drawDendrogram) 6 | import Control.Monad (forM, forM_) 7 | import qualified Data.ByteString.Char8 as B 8 | import Data.Default 9 | import Data.Double.Conversion.ByteString (toShortest) 10 | import Data.List.Split (splitOn) 11 | import Data.Monoid ((<>)) 12 | {- 13 | import Diagrams.Backend.Cairo 14 | import Diagrams.Plots.Dendrogram 15 | import Diagrams.Prelude (dims2D, strutX, (|||)) 16 | -} 17 | import Options.Applicative 18 | import System.IO 19 | import Text.Printf 20 | 21 | import Bio.Data.Fasta 22 | import Bio.Motif 23 | import Bio.Motif.Alignment 24 | import Bio.Motif.Merge 25 | import Bio.Seq (toBS) 26 | import Bio.Utils.Functions 27 | 28 | data CMD = Merge { mergeInput :: FilePath 29 | , mode :: String 30 | , thres :: Double 31 | , alignOption :: AlignOption 32 | , mergeOutput :: FilePath 33 | } 34 | | Dist { inputA :: FilePath 35 | , inputB :: Maybe FilePath 36 | , alignOption :: AlignOption 37 | } 38 | | Cluster { clusterInput :: !FilePath 39 | , cutoff :: !Double 40 | , alignOption :: !AlignOption 41 | } 42 | 43 | mergeParser :: Parser CMD 44 | mergeParser = Merge 45 | <$> argument str (metavar "INPUT") 46 | <*> strOption 47 | ( long "mode" 48 | <> short 'm' 49 | <> value "iter" 50 | <> metavar "MODE" 51 | <> help "Merging algorithm, could be iter or tree, default is iter" ) 52 | <*> option auto 53 | ( long "thres" 54 | <> short 't' 55 | <> value 0.2 56 | <> metavar "THRESHOLD" 
57 | <> help "two motifs that have distance belowing threshold would be merged, default is 0.2" ) 58 | <*> alignParser 59 | <*> strOption 60 | ( long "output" 61 | <> short 'o' 62 | <> value "merged_output.meme" 63 | <> metavar "OUTPUT" ) 64 | 65 | distParser :: Parser CMD 66 | distParser = Dist 67 | <$> strOption 68 | ( short 'a' 69 | <> metavar "INPUT_A" ) 70 | <*> (optional . strOption) 71 | ( short 'b' 72 | <> metavar "INPUT_B" ) 73 | <*> alignParser 74 | 75 | clusterParser :: Parser CMD 76 | clusterParser = Cluster 77 | <$> argument str (metavar "INPUT") 78 | <*> option auto 79 | ( long "height" 80 | <> short 'h' 81 | <> value 0.2 82 | <> metavar "HEIGHT" 83 | <> help "Cut hierarchical tree at given height. Default: 0.2" ) 84 | <*> alignParser 85 | 86 | data AlignOption = AlignOption 87 | { gap :: Double 88 | , gapMode :: String 89 | , avgMode :: String 90 | } 91 | 92 | alignParser :: Parser AlignOption 93 | alignParser = AlignOption 94 | <$> option auto 95 | ( long "gap" 96 | <> short 'g' 97 | <> value 0.05 98 | <> metavar "GAP_PENALTY" 99 | <> help "Gap penalty, default: 0.05" ) 100 | <*> strOption 101 | ( long "gap_mode" 102 | <> value "exp" 103 | <> metavar "GAP_MODE" 104 | <> help "Gap penalty mode, one of linear, quadratic, cubic, and exp. default: exp." ) 105 | <*> strOption 106 | ( long "avg_mode" 107 | <> value "l1" 108 | <> metavar "AVERAGE_MODE" 109 | <> help "Averaging function, one of l1, l2, l3, max. default: l1." ) 110 | 111 | treeMerge :: Double -> String -> [Motif] -> AlignOption 112 | -> ([Motif], Dendrogram Motif) 113 | treeMerge th pre ms alignOpt = (zipWith f [0::Int ..] 
$ map merge $ tree `cutAt` th, tree) 114 | where 115 | f i (suffix, pwm) = Motif ((B.pack $ pre ++ "_" ++ show i ++ "_" ++ show (toIUPAC pwm)) 116 | `B.append` "(" 117 | `B.append` suffix 118 | `B.append` ")" ) pwm 119 | merge tr = ( B.intercalate "+" $ map _name $ flatten tr 120 | , dilute $ mergeTreeWeighted align tr) 121 | tree = buildTree align ms 122 | align = mkAlignFn alignOpt 123 | {-# INLINE treeMerge #-} 124 | 125 | getSuffix :: String -> String 126 | getSuffix = last . splitOn "." 127 | {-# INLINE getSuffix #-} 128 | 129 | mkAlignFn :: AlignOption -> AlignFn 130 | mkAlignFn (AlignOption gap gapMode avgMode) = alignmentBy jsd (pFn gap) avgFn 131 | where 132 | pFn = case gapMode of 133 | "linear" -> linPenal 134 | "quadratic" -> quadPenal 135 | "cubic" -> cubPenal 136 | "exp" -> expPenal 137 | _ -> error "Unknown gap mode" 138 | avgFn = case avgMode of 139 | "l1" -> l1 140 | "l2" -> l2 141 | "l3" -> l3 142 | "max" -> lInf 143 | _ -> error "Unknown average mode" 144 | {-# INLINE mkAlignFn #-} 145 | 146 | readMotif :: FilePath -> IO [Motif] 147 | readMotif fl = case getSuffix fl of 148 | "fasta" -> readFasta' fl 149 | _ -> readMEME fl 150 | 151 | writeMotif :: FilePath -> [Motif] -> IO () 152 | writeMotif fl motifs = case getSuffix fl of 153 | "fasta" -> writeFasta fl motifs 154 | _ -> writeMEME fl motifs def 155 | 156 | 157 | defaultMain :: CMD -> IO () 158 | defaultMain (Dist a b alignOpt) = do 159 | motifsA <- readMotif a 160 | pairs <- case b of 161 | Nothing -> return $ comb motifsA 162 | Just b' -> do 163 | motifsB <- readMotif b' 164 | return [(x,y) | x <- motifsA, y <- motifsB] 165 | forM_ pairs $ \(x,y) -> do 166 | let (d,_) = alignFn (_pwm x) $ (_pwm y) 167 | B.putStrLn $ B.intercalate "\t" [_name x, _name y, toShortest d] 168 | where 169 | alignFn = mkAlignFn alignOpt 170 | 171 | defaultMain (Merge inFl m th alignOpt outFl)= do 172 | motifs <- readMotif inFl 173 | let motifNumber = length motifs 174 | 175 | hPutStrLn stderr $ printf "Merging Mode: 
%s" m 176 | hPutStrLn stderr $ printf "Read %d motifs" motifNumber 177 | 178 | motifs' <- case m of 179 | "tree" -> do 180 | let (newMotifs, tree) = treeMerge th "" motifs alignOpt 181 | fn x = B.unpack (_name x) ++ ": " ++ B.unpack (toBS $ toIUPAC $ _pwm x) 182 | 183 | {- 184 | case svg of 185 | Just fl -> do 186 | let w = 80 187 | h = 5 * fromIntegral motifNumber 188 | renderCairo fl (dims2D (10*w) (10*h)) $ drawDendrogram w h th tree fn ||| strutX 40 189 | return newMotifs 190 | -} 191 | return newMotifs 192 | "iter" -> do 193 | let rs = iterativeMerge (mkAlignFn alignOpt) th motifs 194 | forM rs $ \(nm, pwm, ws) -> do 195 | let pwm' = dilute (pwm, ws) 196 | return $ Motif (B.intercalate "+" nm) pwm 197 | -- _ -> error "Unkown mode!" 198 | 199 | hPutStrLn stderr $ printf "Write %d motifs" (length motifs') 200 | 201 | writeMotif outFl motifs' 202 | 203 | defaultMain (Cluster inFl c alignOpt) = do 204 | motifs <- readMotif inFl 205 | let tree = buildTree align motifs 206 | align = mkAlignFn alignOpt 207 | forM_ (tree `cutAt` c) $ \t -> 208 | B.putStrLn $ B.intercalate "\t" $ map _name $ flatten t 209 | {-# INLINE defaultMain #-} 210 | 211 | comb :: [a] -> [(a,a)] 212 | comb (y:ys) = zip (repeat y) ys ++ comb ys 213 | comb _ = [] 214 | {-# INLINE comb #-} 215 | 216 | main :: IO () 217 | main = execParser opts >>= defaultMain 218 | where 219 | opts = info (helper <*> parser) 220 | ( fullDesc 221 | <> header (printf "Compare, align and merge motifs, version %s" v)) 222 | v = "0.2.0" :: String 223 | parser = subparser $ ( 224 | command "merge" (info (helper <*> mergeParser) $ 225 | fullDesc <> progDesc "Merge motifs") 226 | <> command "dist" (info (helper <*> distParser) $ 227 | fullDesc <> progDesc "Align and compute pairwise distances") 228 | <> command "cluster" (info (helper <*> clusterParser) $ 229 | fullDesc <> progDesc "Perform hierarchical clustering on motifs") 230 | ) 231 | -------------------------------------------------------------------------------- 
/bioinformatics-toolkit-apps/app/MkIndex.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Bio.Seq.IO (mkIndex) 4 | import System.Environment 5 | 6 | main :: IO () 7 | main = do 8 | (outF: inputs) <- getArgs 9 | mkIndex inputs outF 10 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/app/MotifScan.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Bio.Data.Bed 4 | import Bio.Data.Bed.Utils 5 | import Bio.Motif 6 | import Bio.Seq.IO 7 | import Conduit 8 | import Data.Default (def) 9 | import Data.Semigroup ((<>)) 10 | import Data.Version (showVersion) 11 | import Options.Applicative 12 | import Paths_bioinformatics_toolkit_apps (version) 13 | import System.IO (stdout) 14 | import Text.Printf 15 | 16 | 17 | data Options = Options 18 | { genomeFile :: FilePath 19 | , motifFile :: FilePath 20 | , input :: FilePath 21 | , p :: Double 22 | } deriving (Show, Read) 23 | 24 | parser :: Parser Options 25 | parser = Options 26 | <$> strArgument (metavar "GENOME") 27 | <*> strArgument (metavar "MOTIF_MEME") 28 | <*> strArgument (metavar "INPUT") 29 | <*> option auto 30 | ( long "p-value" 31 | <> short 'p' 32 | <> value 1e-5 33 | <> metavar "P-Value" 34 | <> help "p-value cutoff. 
(default: 1e-5)" ) 35 | 36 | defaultMain :: Options -> IO () 37 | defaultMain opts = do 38 | withGenome (genomeFile opts) $ \genome -> do 39 | motifs <- map (mkCutoffMotif def (p opts)) <$> readMEME (motifFile opts) 40 | runResourceT $ runConduit $ 41 | (streamBed (input opts) :: Source (ResourceT IO) BED3) .| 42 | scanMotif genome motifs .| sinkHandleBed stdout 43 | 44 | main :: IO () 45 | main = execParser opts >>= defaultMain 46 | where 47 | opts = info (helper <*> parser) ( fullDesc <> 48 | header (printf "bioinformatics-toolkit-apps-v%s" (showVersion version)) ) 49 | -------------------------------------------------------------------------------- /bioinformatics-toolkit-apps/bioinformatics-toolkit-apps.cabal: -------------------------------------------------------------------------------- 1 | name: bioinformatics-toolkit-apps 2 | version: 0.1.0 3 | -- synopsis: 4 | -- description: 5 | -- homepage: 6 | license: BSD3 7 | license-file: LICENSE 8 | author: Kai Zhang 9 | maintainer: kai@kzhang.org 10 | copyright: (c) 2017-2018 Kai Zhang 11 | category: Bioinformatics 12 | build-type: Simple 13 | extra-source-files: README.md 14 | cabal-version: >=1.10 15 | 16 | executable scan_motif 17 | hs-source-dirs: app 18 | main-is: MotifScan.hs 19 | -- ghc-options: -threaded -Wall -optl-static -optc-static -optl-pthread 20 | ghc-options: -threaded -Wall 21 | build-depends: base 22 | , bioinformatics-toolkit 23 | , conduit >=1.3.0 24 | , data-default 25 | , optparse-applicative 26 | default-language: Haskell2010 27 | 28 | executable mkindex 29 | hs-source-dirs: app 30 | main-is: MkIndex.hs 31 | -- ghc-options: -threaded -Wall -optl-static -optc-static -optl-pthread 32 | ghc-options: -threaded -Wall 33 | build-depends: base 34 | , bioinformatics-toolkit 35 | default-language: Haskell2010 36 | 37 | executable merge_motifs 38 | hs-source-dirs: app 39 | main-is: MergeMotifs.hs 40 | -- ghc-options: -threaded -Wall -optl-static -optc-static -optl-pthread 41 | ghc-options: -threaded 
-Wall 42 | build-depends: base 43 | , bioinformatics-toolkit 44 | , bytestring 45 | , conduit >=1.3.0 46 | , clustering 47 | , double-conversion 48 | , split 49 | , data-default 50 | , optparse-applicative 51 | default-language: Haskell2010 52 | 53 | source-repository head 54 | type: git 55 | location: https://github.com/kaizhang/bioinformatics-toolkit.git 56 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014-2021 Kai Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/README.md: -------------------------------------------------------------------------------- 1 | Bioinformatics Algorithms 2 | ========================= 3 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/benchmarks/bench.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | import Criterion.Main 4 | import System.Random 5 | import Data.Conduit.Zlib (ungzip, multiple, gzip) 6 | import qualified Data.ByteString.Char8 as B 7 | import Data.Default.Class 8 | import Conduit 9 | import Control.Monad.Identity 10 | import System.IO.Unsafe 11 | import AI.Clustering.Hierarchical 12 | import Data.Either 13 | 14 | import Bio.Data.Fasta 15 | import Bio.Data.Fastq 16 | import Bio.Motif 17 | import Bio.Motif.Search 18 | import Bio.Motif.Alignment 19 | import Bio.Seq 20 | 21 | dna :: DNA Basic 22 | dna = fromRight undefined $ fromBS $ B.pack $ map f $ take 5000 $ randomRs (0, 3) (mkStdGen 2) 23 | where 24 | f :: Int -> Char 25 | f x = case x of 26 | 0 -> 'A' 27 | 1 -> 'C' 28 | 2 -> 'G' 29 | 3 -> 'T' 30 | _ -> undefined 31 | 32 | pwm :: PWM 33 | pwm = toPWM [ "0.3 0.3 0.3 0.1" 34 | , "0 0.5 0 0.5" 35 | , "0.1 0.2 0.5 0.3" 36 | , "0.1 0.1 0.1 0.7" 37 | , "0 0 0 1" 38 | , "0.25 0.25 0.25 0.25" 39 | , "0.1 0.1 0.3 0.5" 40 | , "0.25 0.25 0 0.5" 41 | , "0.1 0.1 0.7 0.1" 42 | , "0 0 0 1" 43 | ] 44 | 45 | motifs :: [Motif] 46 | motifs = unsafePerformIO $ readFasta' "data/motifs.fasta" 47 | 48 | main :: IO () 49 | main = do 50 | content <- fmap B.concat $ runResourceT $ runConduit $ 51 | sourceFile 
"tests/data/test.fastq.gz" .| multiple ungzip .| sinkList 52 | let fq = B.concat $ replicate 50 content 53 | defaultMain 54 | [ bench "motif score" $ nf (scores def pwm) dna 55 | --, bgroup "TFBS scanning" [ bench "Naive" $ nf (\x -> runIdentity $ findTFBSSlow def pwm x (0.6 * optimalScore def pwm) $$ CL.consume) dna 56 | -- , bench "look ahead" $ nf (\x -> runIdentity $ findTFBS def pwm x (0.6 * optimalScore def pwm) $$ CL.consume) dna 57 | -- ] 58 | --, bench "motif merge" $ nf (\x -> fmap show $ flatten $ buildTree x) motifs 59 | , bench "Read FASTQ Fast" $ nfAppIO (\x -> runConduit $ yield x .| parseFastqC .| sinkList) fq 60 | , bench "Read FASTQ Slow" $ nfAppIO (\x -> runConduit $ yield x .| parseFastqC' .| sinkList) fq 61 | ] 62 | 63 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/bioinformatics-toolkit.cabal: -------------------------------------------------------------------------------- 1 | name: bioinformatics-toolkit 2 | version: 0.10.0 3 | synopsis: A collection of bioinformatics tools 4 | description: A collection of bioinformatics tools 5 | license: MIT 6 | license-file: LICENSE 7 | author: Kai Zhang 8 | maintainer: kai@kzhang.org 9 | copyright: (c) 2014-2022 Kai Zhang 10 | category: Bio 11 | build-type: Simple 12 | extra-source-files: README.md 13 | cabal-version: 1.18 14 | data-files: 15 | tests/data/example.bam 16 | tests/data/example.bed.gz 17 | tests/data/pairedend.bam 18 | tests/data/pairedend.bedpe 19 | tests/data/peaks.bed.gz 20 | tests/data/peaks.sorted.bed.gz 21 | tests/data/example_intersect_peaks.bed.gz 22 | tests/data/motifs.fasta 23 | tests/data/motifs.meme 24 | tests/data/test.fastq.gz 25 | tests/data/test_wrap.fastq.gz 26 | tests/data/genes.gtf.gz 27 | 28 | library 29 | hs-source-dirs: src 30 | ghc-options: -Wall 31 | 32 | exposed-modules: 33 | Bio.ChIPSeq.FragLen 34 | Bio.Data.Bed 35 | Bio.Data.Bed.Types 36 | Bio.Data.Bed.Utils 37 | Bio.Data.Bam 38 | Bio.Data.Fasta 39 | 
Bio.Data.Fastq 40 | Bio.GO 41 | Bio.GO.Parser 42 | Bio.Motif 43 | Bio.Motif.Alignment 44 | Bio.Motif.Merge 45 | Bio.Motif.Search 46 | Bio.RealWorld.BioGRID 47 | Bio.RealWorld.ENCODE 48 | Bio.RealWorld.Ensembl 49 | Bio.RealWorld.GENCODE 50 | Bio.RealWorld.GDC 51 | Bio.RealWorld.ID 52 | Bio.RealWorld.Reactome 53 | Bio.RealWorld.Uniprot 54 | Bio.Seq 55 | Bio.Seq.IO 56 | Bio.Utils.BitVector 57 | Bio.Utils.Functions 58 | Bio.Utils.Misc 59 | Bio.Utils.Overlap 60 | Bio.Utils.Types 61 | 62 | build-depends: 63 | base >=4.11 && <5.0 64 | , aeson >= 2.0 65 | , aeson-pretty 66 | , attoparsec >= 0.13.0.0 67 | , bytestring >= 0.10 68 | , bytestring-lexing >= 0.5 69 | , case-insensitive 70 | , clustering 71 | , conduit >= 1.3.0 72 | , conduit-extra 73 | , containers >= 0.5 74 | , data-ordlist 75 | , data-default-class 76 | , double-conversion 77 | , deepseq 78 | , HsHTSLib >= 1.9.2 79 | , http-conduit >= 2.1.8 80 | , hexpat 81 | , IntervalMap >= 0.5.0.0 82 | , microlens 83 | , microlens-th 84 | , matrices >= 0.5.0 85 | , mtl >= 2.1.3.1 86 | , math-functions 87 | , parallel >= 3.2 88 | , primitive 89 | , split 90 | , statistics >= 0.13.2.1 91 | , text >= 0.11 92 | , transformers >= 0.3.0.0 93 | , unordered-containers >= 0.2 94 | , word8 95 | , vector 96 | , vector-algorithms 97 | 98 | default-language: Haskell2010 99 | 100 | benchmark bench 101 | type: exitcode-stdio-1.0 102 | main-is: benchmarks/bench.hs 103 | default-language: Haskell2010 104 | build-depends: 105 | base >=4.8 && <5.0 106 | , bioinformatics-toolkit 107 | , random 108 | , criterion 109 | , clustering 110 | , bytestring 111 | , data-default-class 112 | , conduit 113 | , conduit-extra 114 | , mtl 115 | 116 | test-suite tests 117 | type: exitcode-stdio-1.0 118 | hs-source-dirs: tests 119 | main-is: test.hs 120 | other-modules: 121 | Tests.Bed 122 | , Tests.Bam 123 | , Tests.Fastq 124 | , Tests.Motif 125 | , Tests.Seq 126 | , Tests.Tools 127 | 128 | default-language: Haskell2010 129 | build-depends: 130 | base 131 | , 
bytestring 132 | , random 133 | , vector 134 | , data-default-class 135 | , microlens 136 | , tasty 137 | , tasty-golden 138 | , tasty-hunit 139 | , bioinformatics-toolkit 140 | , conduit 141 | , conduit-extra 142 | , unordered-containers 143 | , mtl 144 | , matrices 145 | 146 | source-repository head 147 | type: git 148 | location: https://github.com/kaizhang/bioinformatics-toolkit.git 149 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/cabal.project: -------------------------------------------------------------------------------- 1 | packages: *.cabal 2 | 3 | package cryptonite 4 | flags: -use_target_attributes -------------------------------------------------------------------------------- /bioinformatics-toolkit/docs/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt]{scrartcl} 2 | 3 | \usepackage[english]{babel} 4 | \usepackage[sorting=none,backend=biber]{biblatex} 5 | \usepackage{amsmath} 6 | \usepackage{graphicx} 7 | \usepackage[noend]{algpseudocode} 8 | \usepackage{algorithm} 9 | \usepackage{hyperref} 10 | \usepackage[backend=cairo, outputdir=diagrams]{diagrams-latex} 11 | 12 | \newtheorem{dfn}{Definition} 13 | 14 | \title{Document} 15 | \author{Kai Zhang} 16 | \date{} 17 | 18 | \begin{document} 19 | 20 | \maketitle 21 | 22 | \tableofcontents 23 | \newpage 24 | 25 | 1-based coordinate system A coordinate system where the first base of a sequence 26 | is one. In this system, a region is specified by a closed interval. For example, the 27 | region between the 3rd and the 7th bases inclusive is [3, 7]. The SAM, VCF, GFF and Wiggle 28 | formats are using the 1-based coordinate system. 29 | 30 | 0-based coordinate system A coordinate system where the first base of a sequence 31 | is zero. In this coordinate system, a region is specified by a half-closed-half-open interval. 
32 | For example, the region between the 3rd and the 7th bases inclusive is [2, 7). 33 | The BAM, BCFv2, BED, and PSL formats are using the 0-based coordinate system. 34 | 35 | \printbibliography 36 | 37 | \end{document} 38 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/ChIPSeq/FragLen.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | module Bio.ChIPSeq.FragLen 4 | ( fragLength 5 | , naiveCCWithSmooth 6 | ) where 7 | 8 | import Bio.Data.Bed 9 | import Lens.Micro ((^.)) 10 | import Control.Parallel.Strategies (parMap, rpar) 11 | import qualified Data.ByteString.Char8 as B 12 | import qualified Data.HashMap.Strict as M 13 | import qualified Data.HashSet as S 14 | import Data.List (foldl', maximumBy) 15 | import Data.Ord (comparing) 16 | 17 | -- | estimate fragment length for a ChIP-seq experiment 18 | fragLength :: (Int, Int) -> [BED] -> Int 19 | fragLength (start, end) beds = fst $ maximumBy (comparing snd) $ 20 | naiveCCWithSmooth 4 beds [start, start+2 .. end] 21 | {-# INLINE fragLength #-} 22 | 23 | -- sizeDistribution :: (Int, Int) -> [BED] -> (Int, Int) 24 | 25 | fromBED :: [BED] -> [(B.ByteString, (S.HashSet Int, S.HashSet Int))] 26 | fromBED = map toSet . M.toList . M.fromListWith f . map parseLine 27 | where 28 | parseLine :: BED -> (B.ByteString, ([Int], [Int])) 29 | parseLine x = case x^.strand of 30 | Just True -> (x^.chrom, ([x^.chromStart], [])) 31 | Just False -> (x^.chrom, ([], [x^.chromEnd])) 32 | _ -> error "Unknown Strand!" 
33 | f (a,b) (a',b') = (a++a', b++b') 34 | toSet (chr, (forwd, rev)) = (chr, (S.fromList forwd, S.fromList rev)) 35 | {-# INLINE fromBED #-} 36 | 37 | -- | fast relative cross-correlation with smoothing 38 | apprxCorr :: S.HashSet Int -> S.HashSet Int -> Int -> Int -> Int 39 | apprxCorr forwd rev smooth d = S.foldl' f 0 rev 40 | where 41 | f :: Int -> Int -> Int 42 | f !acc r | any (`S.member` forwd) [r-d-smooth..r-d+smooth] = acc + 1 43 | | otherwise = acc 44 | {-# INLINE apprxCorr #-} 45 | 46 | -- | calcuate cross corrlation with different shifts 47 | naiveCCWithSmooth :: Int -> [BED] -> [Int] -> [(Int, Int)] 48 | naiveCCWithSmooth smooth input range = f . fromBED $ input 49 | where 50 | cc :: [(B.ByteString, (S.HashSet Int, S.HashSet Int))] -> Int -> Int 51 | cc xs d = foldl' (+) 0 . map ((\(forwd, rev) -> apprxCorr forwd rev smooth d).snd) $ xs 52 | f xs = zip range $ parMap rpar (cc xs) range 53 | {-# INLINE naiveCCWithSmooth #-} 54 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bam.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE LambdaCase #-} 3 | module Bio.Data.Bam 4 | ( BAM 5 | , getBamHeader 6 | , streamBam 7 | , sinkBam 8 | , bamToBedC 9 | , bamToBed 10 | , bamToFragmentC 11 | , bamToFragment 12 | , bamToFastqC 13 | , bamToFastq 14 | , sortedBamToBedPE 15 | ) where 16 | 17 | import Control.Monad (mzero) 18 | import Data.List (foldl') 19 | import Bio.Data.Bed 20 | import Bio.Data.Fastq 21 | import Bio.HTS 22 | import Conduit 23 | 24 | -- | Convert bam record to bed record. Unmapped reads will be discarded. 25 | bamToBedC :: MonadIO m => BAMHeader -> ConduitT BAM BED m () 26 | bamToBedC header = mapC (bamToBed header) .| concatC 27 | {-# INLINE bamToBedC #-} 28 | 29 | -- | Convert bam record to fastq record. 
30 | bamToFastqC :: Monad m => ConduitT BAM Fastq m () 31 | bamToFastqC = mapC bamToFastq .| concatC 32 | {-# INLINE bamToFastqC #-} 33 | 34 | -- | Convert pairedend bam to fragment. 35 | bamToFragmentC :: Monad m => BAMHeader -> ConduitT BAM BED m () 36 | bamToFragmentC header = mapC (bamToFragment header) .| concatC 37 | {-# INLINE bamToFragmentC #-} 38 | 39 | bamToFragment :: BAMHeader -> BAM -> Maybe BED 40 | bamToFragment header bam 41 | | not (isFirstSegment flg) = Nothing 42 | | otherwise = do 43 | chr1 <- refName header bam 44 | chr2 <- mateRefName header bam 45 | if chr1 == chr2 46 | then return $ BED chr1 (min start1 start2) (max end1 end2) 47 | (Just $ queryName bam) Nothing Nothing 48 | else mzero 49 | where 50 | start1 = startLoc bam 51 | end1 = endLoc bam 52 | start2 = mateStartLoc bam 53 | end2 = mateStartLoc bam + ciglen cig 54 | cig = case queryAuxData ('M', 'C') bam of 55 | Just (AuxString x) -> string2Cigar x 56 | _ -> error "No MC tag. Please run samtools fixmate on file first." 57 | ciglen (CIGAR c) = foldl' f 0 c 58 | where f acc (n,x) = if x `elem` "MDN=X" then n + acc else acc 59 | flg = flag bam 60 | {-# INLINE bamToFragment #-} 61 | 62 | -- | Convert pairedend bam file to bed. the bam file must be sorted by names, 63 | -- e.g., using "samtools sort -n". This condition is checked from Bam header. 64 | sortedBamToBedPE :: Monad m => BAMHeader -> ConduitT BAM (BED, BED) m () 65 | sortedBamToBedPE header = case getSortOrder header of 66 | Queryname -> loopBedPE .| concatC 67 | _ -> error "Bam file must be sorted by NAME." 68 | where 69 | loopBedPE = (,) <$$> await <***> await >>= \case 70 | Nothing -> return () 71 | Just (bam1, bam2) -> if queryName bam1 /= queryName bam2 72 | then error "Adjacent records have different query names. Aborted." 73 | else do 74 | yield $ (,) <$> bamToBed header bam1 <*> bamToBed header bam2 75 | loopBedPE 76 | where 77 | (<$$>) = fmap . fmap 78 | (<***>) = (<*>) . 
fmap (<*>) 79 | {-# INLINE sortedBamToBedPE #-} 80 | 81 | -- | Convert BAM to BED. 82 | bamToBed :: BAMHeader -> BAM -> Maybe BED 83 | bamToBed header bam = mkBed <$> refName header bam 84 | where 85 | mkBed chr = BED chr start end nm sc str 86 | start = startLoc bam 87 | end = endLoc bam 88 | nm = Just $ queryName bam 89 | str = Just $ not $ isRev bam 90 | sc = Just $ fromIntegral $ mapq bam 91 | {-# INLINE bamToBed #-} 92 | 93 | -- | Convert BAM to Fastq. 94 | bamToFastq :: BAM -> Maybe Fastq 95 | bamToFastq bam = Fastq (queryName bam) <$> getSeq bam <*> qualityS bam 96 | {-# INLINE bamToFastq #-} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bed.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Bio.Data.Bed 5 | ( BEDLike(..) 6 | , BEDConvert(..) 7 | , BED(..) 8 | , BED3(..) 9 | , BEDGraph(..) 10 | , bdgValue 11 | , NarrowPeak(..) 12 | , npSignal 13 | , npPvalue 14 | , npQvalue 15 | , npPeak 16 | , BEDExt(..) 
17 | , _bed 18 | , _data 19 | 20 | , BEDTree 21 | , bedToTree 22 | , sortedBedToTree 23 | , queryIntersect 24 | , intersecting 25 | , within 26 | , isIntersected 27 | , sizeOverlapped 28 | , splitBed 29 | , splitBedBySize 30 | , splitBedBySizeLeft 31 | , splitBedBySizeOverlap 32 | , sortBed 33 | , intersectBed 34 | , intersectBedWith 35 | , intersectSortedBed 36 | , intersectSortedBedWith 37 | , isOverlapped 38 | , mergeBed 39 | , mergeBedWith 40 | , mergeSortedBed 41 | , mergeSortedBedWith 42 | -- , splitOverlapped 43 | , countOverlapped 44 | 45 | -- * IO 46 | , streamBed 47 | , streamBedGzip 48 | , readBed 49 | , sinkFileBed 50 | , sinkFileBedGzip 51 | , sinkHandleBed 52 | , writeBed 53 | 54 | , compareBed 55 | ) where 56 | 57 | import Conduit 58 | import Control.Arrow ((***), first) 59 | import Lens.Micro 60 | import qualified Data.ByteString.Char8 as B 61 | import qualified Data.Foldable as F 62 | import Data.Function (on) 63 | import qualified Data.HashMap.Strict as M 64 | import qualified Data.IntervalMap.Strict as IM 65 | import Data.List (groupBy, sortBy, group) 66 | import Data.Conduit.Zlib (gzip, ungzip, multiple) 67 | import qualified Data.Vector as V 68 | import qualified Data.Vector.Algorithms.Intro as I 69 | import System.IO 70 | 71 | import Bio.Data.Bed.Types 72 | import Bio.Utils.Misc (binBySize, binBySizeLeft, 73 | binBySizeOverlap, bins) 74 | 75 | -- | Convert a set of sorted bed records to interval tree, with combining 76 | -- function for equal keys. 77 | sortedBedToTree :: (BEDLike b, F.Foldable f) 78 | => (a -> a -> a) 79 | -> Sorted (f (b, a)) 80 | -> BEDTree a 81 | sortedBedToTree f (Sorted xs) = M.fromList $ 82 | map ((head *** IM.fromAscListWith f) . 
unzip) $ groupBy ((==) `on` fst) $
    map (\(b, x) -> (b^.chrom, (IM.IntervalCO (b^.chromStart) (b^.chromEnd), x))) $
    F.toList xs
{-# INLINE sortedBedToTree #-}

-- | Convert a list of (possibly unsorted) bed records to an interval tree,
-- with a combining function for values stored under equal intervals.
bedToTree :: BEDLike b
          => (a -> a -> a)   -- ^ combining function for equal keys
          -> [(b, a)]
          -> BEDTree a
bedToTree f xs = M.fromList $ map ((head *** IM.fromAscListWith f) . unzip) $
    groupBy ((==) `on` fst) $
    map (\(b, x) -> (b^.chrom, (IM.IntervalCO (b^.chromStart) (b^.chromEnd), x))) $
    -- Sort in-place via a mutable vector before grouping by chromosome.
    V.toList $ V.create $ do
        v <- V.unsafeThaw $ V.fromList xs
        I.sortBy (compareBed `on` fst) v
        return v
{-# INLINE bedToTree #-}

-- | Return all records in the tree that intersect the query region,
-- together with their associated values.
queryIntersect :: BEDLike b => b -> BEDTree a -> [(BED3, a)]
queryIntersect x tree = map (first f) $ IM.assocs $ intersecting tree x
  where
    -- Trees built by this module contain only 'IntervalCO' keys. Fail with a
    -- diagnosable message (rather than 'undefined') should that ever change.
    f (IM.IntervalCO lo hi) = BED3 chr lo hi
    f _ = error "queryIntersect: unexpected interval type (expected IntervalCO)"
    chr = x^.chrom
{-# INLINE queryIntersect #-}

-- | Return the submap of key intervals intersecting the given interval.
intersecting :: BEDLike b => BEDTree a -> b -> IM.IntervalMap Int a
intersecting tree x = IM.intersecting (M.lookupDefault IM.empty (x^.chrom) tree) $
    IM.IntervalCO (x^.chromStart) $ x^.chromEnd
{-# INLINE intersecting #-}

-- | Return the submap of key intervals completely inside the given interval.
within :: BEDLike b => BEDTree a -> b -> IM.IntervalMap Int a
within tree x = IM.within (M.lookupDefault IM.empty (x^.chrom) tree) $
    IM.IntervalCO (x^.chromStart) $ x^.chromEnd
{-# INLINE within #-}

-- | Whether the query region intersects any record in the tree.
isIntersected :: BEDLike b => BEDTree a -> b -> Bool
isIntersected tree = not . IM.null .
intersecting tree
{-# INLINE isIntersected #-}

-- | The number of overlapping bases between two bed records. Returns 0 when
-- the records are on different chromosomes or do not overlap.
sizeOverlapped :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Int
sizeOverlapped b1 b2 | b1^.chrom /= b2^.chrom = 0
                     | overlap < 0 = 0
                     | otherwise = overlap
  where
    -- The overlap of two intervals is bounded both by each interval's own
    -- size and by the distance between the opposing endpoints.
    overlap = minimum [ b1^.chromEnd - b2^.chromStart
                      , b2^.chromEnd - b1^.chromStart
                      , b1^.chromEnd - b1^.chromStart
                      , b2^.chromEnd - b2^.chromStart ]
{-# INLINE sizeOverlapped #-}

-- | split a bed region into k consecutive subregions, discarding leftovers
splitBed :: BEDConvert b => Int -> b -> [b]
splitBed k bed = map (uncurry (asBed (bed^.chrom))) $
    bins k (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBed #-}

-- | split a bed region into consecutive fixed size subregions, discarding leftovers
splitBedBySize :: BEDConvert b => Int -> b -> [b]
splitBedBySize k bed = map (uncurry (asBed (bed^.chrom))) $
    binBySize k (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBedBySize #-}

-- | split a bed region into consecutive fixed size subregions, including leftovers
splitBedBySizeLeft :: BEDConvert b => Int -> b -> [b]
splitBedBySizeLeft k bed = map (uncurry (asBed (bed^.chrom))) $
    binBySizeLeft k (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBedBySizeLeft #-}

-- | split a bed region into consecutive fixed-size subregions that overlap
-- each other by a fixed amount
splitBedBySizeOverlap :: BEDConvert b
                      => Int -- ^ bin size
                      -> Int -- ^ overlap size
                      -> b -> [b]
splitBedBySizeOverlap k o bed = map (uncurry (asBed (bed^.chrom))) $
    binBySizeOverlap k o (bed^.chromStart, bed^.chromEnd)
{-# INLINE splitBedBySizeOverlap #-}

-- | Compare bed records using only the chromosome, start and end positions.
-- Unlike the ``compare'' from the Ord type class, this function can compare
-- different types of BED data types.
-- | Compare bed records using only the chromosome, start and end positions.
-- Unlike the ``compare'' from the Ord type class, this function can compare
-- different types of BED data types.
compareBed :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Ordering
compareBed u v = compare (key u) (key v)
  where
    key r = (r^.chrom, r^.chromStart, r^.chromEnd)
{-# INLINE compareBed #-}

-- | sort BED, first by chromosome (alphabetical order), then by chromStart, last by chromEnd
sortBed :: BEDLike b => [b] -> Sorted (V.Vector b)
sortBed records = Sorted $ V.create $ do
    mv <- V.unsafeThaw $ V.fromList records
    I.sortBy compareBed mv
    return mv
{-# INLINE sortBed #-}

-- | return records in A that are overlapped with records in B
intersectBed :: (BEDLike b1, BEDLike b2, Monad m) => [b2] -> ConduitT b1 b1 m ()
intersectBed = intersectSortedBed . sortBed
{-# INLINE intersectBed #-}

-- | return records in A that are overlapped with records in B
intersectSortedBed :: (BEDLike b1, BEDLike b2, Monad m)
                   => Sorted (V.Vector b2) -> ConduitT b1 b1 m ()
intersectSortedBed (Sorted b) = filterC hitsAny
  where
    -- Keep a record iff it intersects at least one reference interval.
    hitsAny = not . IM.null . intersecting tree
    tree = sortedBedToTree (\_ _ -> ()) $ Sorted $ V.map (\x -> (x, ())) b
{-# INLINE intersectSortedBed #-}

intersectBedWith :: (BEDLike b1, BEDLike b2, Monad m)
                 => (b1 -> [b2] -> a)
                 -> [b2]
                 -> ConduitT b1 a m ()
intersectBedWith fn = intersectSortedBedWith fn .
sortBed
{-# INLINE intersectBedWith #-}

intersectSortedBedWith :: (BEDLike b1, BEDLike b2, Monad m)
                       => (b1 -> [b2] -> a)
                       -> Sorted (V.Vector b2)
                       -> ConduitT b1 a m ()
intersectSortedBedWith fn (Sorted b) = mapC apply
  where
    -- Apply the user function to each query and all reference records it hits.
    apply query = fn query $ concat $ IM.elems $ intersecting tree query
    tree = sortedBedToTree (++) $ Sorted $ V.map (\x -> (x, [x])) b
{-# INLINE intersectSortedBedWith #-}

-- | Whether two records overlap, i.e. they share a chromosome and their
-- coordinate ranges intersect.
isOverlapped :: (BEDLike b1, BEDLike b2) => b1 -> b2 -> Bool
isOverlapped b1 b2 = b1^.chrom == b2^.chrom &&
    b1^.chromStart < b2^.chromEnd && b2^.chromStart < b1^.chromEnd

-- | Merge overlapping regions.
mergeBed :: (BEDConvert b, Monad m) => [b] -> ConduitT i b m ()
mergeBed xs = yieldMany (fromSorted $ sortBed xs) .| mergeSortedBed
{-# INLINE mergeBed #-}

-- | Merge overlapping regions according to a merging function.
mergeBedWith :: (BEDLike b, Monad m)
             => ([b] -> a) -> [b] -> ConduitT i a m ()
mergeBedWith f xs = yieldMany (fromSorted $ sortBed xs) .| mergeSortedBedWith f
{-# INLINE mergeBedWith #-}

-- | Merge overlapping regions. The input stream must be sorted first.
mergeSortedBed :: (BEDConvert b, Monad m) => ConduitT b b m ()
mergeSortedBed = mergeSortedBedWith combine
  where
    -- Collapse a cluster of overlapping records into its bounding region.
    combine regions = asBed (head regions ^. chrom)
        (minimum $ map (^.chromStart) regions)
        (maximum $ map (^.chromEnd) regions)
{-# INLINE mergeSortedBed #-}

-- | Merge overlapping regions according to a merging function. The input
-- stream must be sorted first.
mergeSortedBedWith :: (BEDLike b, Monad m)
                   => ([b] -> a) -> ConduitT b a m ()
mergeSortedBedWith mergeFn = headC >>= ( maybe mempty $ \x0 ->
    loop ((x0^.chrom, x0^.chromStart, x0^.chromEnd), [x0]) )
  where
    -- Track the bounding interval of the current cluster together with the
    -- records accumulated so far; emit the cluster when it can no longer grow.
    loop ((chr, lo, hi), acc) = headC >>= maybe (yield $ mergeFn acc) step
      where
        step cur
            | chr /= chr' || lo' > hi =
                yield (mergeFn acc) >> loop ((chr', lo', hi'), [cur])
            | lo' < lo = error "input stream is not sorted"
            | hi' > hi = loop ((chr', lo, hi'), cur : acc)
            | otherwise = loop ((chr, lo, hi), cur : acc)
          where
            chr' = cur^.chrom
            lo' = cur^.chromStart
            hi' = cur^.chromEnd
{-# INLINE mergeSortedBedWith #-}

{-
-- | Split overlapped regions into non-overlapped regions. The input must be overlapped.
-- This function is usually used with `mergeBedWith`.
splitOverlapped :: BEDLike b => ([b] -> a) -> [b] -> [(BED3, a)]
splitOverlapped fun xs = filter ((>0) . size . fst) $
    evalState (F.foldrM f [] $ init anchors) $
    (\(a,b) -> (fromEither a, M.singleton (b^.chromStart, b^.chromEnd) b)) $
    last anchors
  where
    anchors = sortBy (comparing (fromEither . fst)) $ concatMap
        ( \x -> [(Left $ x^.chromStart, x), (Right $ x^.chromEnd, x)] ) xs
    f (i, x) acc = do
        (j, s) <- get
        let bed = (asBed chr (fromEither i) j, fun $ M.elems s)
            s' = case i of
                Left _ -> M.delete (x^.chromStart, x^.chromEnd) s
                Right _ -> M.insert (x^.chromStart, x^.chromEnd) x s
        put (fromEither i, s')
        return (bed:acc)
    fromEither (Left x) = x
    fromEither (Right x) = x
    chr = head xs ^. chrom
{-# INLINE splitOverlapped #-}
-}

-- | Split overlapped regions into non-overlapped regions. The input must be overlapped.
-- This function is usually used with `mergeBedWith`.
countOverlapped :: BEDLike b => [b] -> [(BED3, Int)]
-- Robustness: 'head anchors' and 'head xs' below are partial; an empty input
-- has no overlaps, so return the empty result instead of crashing.
countOverlapped [] = []
countOverlapped xs = reverse $ (\(_,_,x) -> x) $
    F.foldl' go (p0, c0, []) $ tail anchors
  where
    (Left p0, c0) = head anchors
    -- Sweep over the sorted anchor points, maintaining the running coverage
    -- count. 'Left' marks a region start, 'Right' a (closed) region end.
    go (x, accum, result) (Left p, count)
        | x == p = (p, accum + count, result)
        | otherwise = (p, accum + count, (asBed chr x $ p, accum):result)
    go (x, accum, result) (Right p, count) =
        (p+1, accum - count, (asBed chr x $ p + 1, accum):result)
    anchors = map (\x -> (head x, length x)) $ group $ sortBy cmp $ concatMap
        (\x -> [Left $ x^.chromStart, Right $ x^.chromEnd - 1]) xs
    -- At equal coordinates, starts ('Left') sort before ends ('Right').
    cmp (Left x) (Left y) = compare x y
    cmp (Right x) (Right y) = compare x y
    cmp (Left x) (Right y) = case compare x y of
        EQ -> LT
        o -> o
    cmp (Right x) (Left y) = case compare x y of
        EQ -> GT
        o -> o
    chr = head xs ^. chrom
{-# INLINE countOverlapped #-}

-- | Stream BED records from a file.
streamBed :: (MonadResource m, BEDConvert b, MonadIO m)
          => FilePath -> ConduitT i b m ()
streamBed input = sourceFile input .| bsToBed
{-# INLINE streamBed #-}

-- | Stream BED records from a gzip-compressed file (multi-member gzip is
-- supported via 'multiple').
streamBedGzip :: (BEDConvert b, MonadResource m, MonadThrow m, PrimMonad m)
              => FilePath -> ConduitT i b m ()
streamBedGzip input = sourceFile input .| multiple ungzip .| bsToBed
{-# INLINE streamBedGzip #-}

-- | Read all BED records from a file into memory.
readBed :: BEDConvert b => FilePath -> IO [b]
readBed fl = runResourceT $ runConduit $ streamBed fl .| sinkList
{-# INLINE readBed #-}

-- | Write BED records to a file.
sinkFileBed :: (BEDConvert b, MonadResource m) => FilePath -> ConduitT b o m ()
sinkFileBed output = bedToBS .| sinkFile output
{-# INLINE sinkFileBed #-}

-- | Write BED records to a gzip-compressed file.
sinkFileBedGzip :: (BEDConvert b, MonadResource m, MonadThrow m, PrimMonad m)
                => FilePath -> ConduitT b o m ()
sinkFileBedGzip output = bedToBS .| gzip .| sinkFile output
{-# INLINE sinkFileBedGzip #-}

sinkHandleBed :: (BEDConvert b, MonadIO m) => Handle ->
ConduitT b o m () 331 | sinkHandleBed hdl = bedToBS .| sinkHandle hdl 332 | {-# INLINE sinkHandleBed #-} 333 | 334 | writeBed :: BEDConvert b => FilePath -> [b] -> IO () 335 | writeBed fl beds = runResourceT $ runConduit $ yieldMany beds .| sinkFileBed fl 336 | {-# INLINE writeBed #-} 337 | 338 | bedToBS :: (BEDConvert b, Monad m) => ConduitT b B.ByteString m () 339 | bedToBS = mapC toLine .| unlinesAsciiC 340 | {-# INLINE bedToBS #-} 341 | 342 | bsToBed :: (BEDConvert b, Monad m) => ConduitT B.ByteString b m () 343 | bsToBed = linesUnboundedAsciiC .| mapC fromLine 344 | {-# INLINE bsToBed #-} 345 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bed/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE TemplateHaskell #-} 3 | {-# LANGUAGE StrictData #-} 4 | {-# LANGUAGE DeriveGeneric, DeriveAnyClass #-} 5 | 6 | module Bio.Data.Bed.Types 7 | ( BEDLike(..) 8 | , BEDConvert(..) 9 | , BED(..) 10 | , BED3(..) 11 | , BEDGraph(..) 12 | , bdgValue 13 | , NarrowPeak(..) 14 | , npSignal 15 | , npPvalue 16 | , npQvalue 17 | , npPeak 18 | , BroadPeak(..) 19 | , bpSignal 20 | , bpPvalue 21 | , bpQvalue 22 | , BEDExt(..) 23 | , _bed 24 | , _data 25 | , BEDTree 26 | , Sorted(..) 
27 | ) where 28 | 29 | import Lens.Micro 30 | import Lens.Micro.TH (makeLensesFor) 31 | import qualified Data.ByteString.Char8 as B 32 | import Data.ByteString.Lex.Integral (packDecimal) 33 | import Data.Double.Conversion.ByteString (toShortest) 34 | import qualified Data.HashMap.Strict as M 35 | import qualified Data.IntervalMap.Strict as IM 36 | import Data.Maybe (fromJust, fromMaybe) 37 | import GHC.Generics (Generic) 38 | import Control.DeepSeq (NFData) 39 | 40 | import Bio.Utils.Misc (readDouble, readInt) 41 | 42 | readDoubleNonnegative :: B.ByteString -> Maybe Double 43 | readDoubleNonnegative x | v < 0 = Nothing 44 | | otherwise = Just v 45 | where 46 | v = readDouble x 47 | {-# INLINE readDoubleNonnegative #-} 48 | 49 | readIntNonnegative :: B.ByteString -> Maybe Int 50 | readIntNonnegative x | v < 0 = Nothing 51 | | otherwise = Just v 52 | where 53 | v = readInt x 54 | {-# INLINE readIntNonnegative #-} 55 | 56 | -- | A class representing BED-like data, e.g., BED3, BED6 and BED12. BED format 57 | -- uses 0-based index (see documentation). 58 | class BEDLike b where 59 | -- | Field lens 60 | chrom :: Lens' b B.ByteString 61 | chromStart :: Lens' b Int 62 | chromEnd :: Lens' b Int 63 | name :: Lens' b (Maybe B.ByteString) 64 | score :: Lens' b (Maybe Int) 65 | strand :: Lens' b (Maybe Bool) 66 | 67 | -- | Return the size of a bed region. 
68 | size :: b -> Int 69 | size bed = bed^.chromEnd - bed^.chromStart 70 | {-# INLINE size #-} 71 | 72 | {-# MINIMAL chrom, chromStart, chromEnd, name, score, strand #-} 73 | 74 | class BEDLike b => BEDConvert b where 75 | -- | Construct bed record from chromsomoe, start location and end location 76 | asBed :: B.ByteString -> Int -> Int -> b 77 | 78 | -- | Convert bytestring to bed format 79 | fromLine :: B.ByteString -> b 80 | 81 | -- | Convert bed to bytestring 82 | toLine :: b -> B.ByteString 83 | 84 | convert :: BEDLike b' => b' -> b 85 | convert bed = asBed (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) 86 | {-# INLINE convert #-} 87 | 88 | {-# MINIMAL asBed, fromLine, toLine #-} 89 | 90 | -- * BED6 format 91 | 92 | -- | BED6 format, as described in http://genome.ucsc.edu/FAQ/FAQformat.html#format1.7 93 | data BED = BED 94 | { _bed_chrom :: B.ByteString 95 | , _bed_chromStart :: Int 96 | , _bed_chromEnd :: Int 97 | , _bed_name :: Maybe B.ByteString 98 | , _bed_score :: Maybe Int 99 | , _bed_strand :: Maybe Bool -- ^ True: "+", False: "-" 100 | } deriving (Eq, Show, Read, Generic, NFData) 101 | 102 | instance Ord BED where 103 | compare (BED x1 x2 x3 x4 x5 x6) (BED y1 y2 y3 y4 y5 y6) = 104 | compare (x1,x2,x3,x4,x5,x6) (y1,y2,y3,y4,y5,y6) 105 | 106 | instance BEDLike BED where 107 | chrom = lens _bed_chrom (\bed x -> bed { _bed_chrom = x }) 108 | chromStart = lens _bed_chromStart (\bed x -> bed { _bed_chromStart = x }) 109 | chromEnd = lens _bed_chromEnd (\bed x -> bed { _bed_chromEnd = x }) 110 | name = lens _bed_name (\bed x -> bed { _bed_name = x }) 111 | score = lens _bed_score (\bed x -> bed { _bed_score = x }) 112 | strand = lens _bed_strand (\bed x -> bed { _bed_strand = x }) 113 | 114 | instance BEDConvert BED where 115 | asBed chr s e = BED chr s e Nothing Nothing Nothing 116 | 117 | fromLine l = f $ take 6 $ B.split '\t' l 118 | where 119 | f [f1,f2,f3,f4,f5,f6] = BED f1 (readInt f2) (readInt f3) (getName f4) 120 | (getScore f5) (getStrand f6) 121 | f 
[f1,f2,f3,f4,f5] = BED f1 (readInt f2) (readInt f3) (getName f4)
            (getScore f5) Nothing
        f [f1,f2,f3,f4] = BED f1 (readInt f2) (readInt f3) (getName f4)
            Nothing Nothing
        f [f1,f2,f3] = asBed f1 (readInt f2) (readInt f3)
        f _ = error "Read BED fail: Not enough fields!"
        -- "." denotes a missing field in BED.
        getName x | x == "." = Nothing
                  | otherwise = Just x
        -- The guard accepts 0, so the error message must say "non-negative"
        -- (the old message "greater than 0" contradicted the check).
        getScore x | x == "." = Nothing
                   | s >= 0 = Just s
                   | otherwise = error "Read BED fail: score must be non-negative"
          where
            s = readInt x
        getStrand str | str == "-" = Just False
                      | str == "+" = Just True
                      | otherwise = Nothing
    {-# INLINE fromLine #-}

    toLine (BED f1 f2 f3 f4 f5 f6) = B.intercalate "\t"
        [ f1, fromJust $ packDecimal f2, fromJust $ packDecimal f3
        , fromMaybe "." f4, score', strand' ]
      where
        strand' | f6 == Just True = "+"
                | f6 == Just False = "-"
                | otherwise = "."
        score' = case f5 of
            Just x -> fromJust $ packDecimal x
            _ -> "."
149 | {-# INLINE toLine #-} 150 | 151 | convert bed = BED (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) (bed^.name) 152 | (bed^.score) (bed^.strand) 153 | 154 | -- | BED3 format 155 | data BED3 = BED3 156 | { _bed3_chrom :: B.ByteString 157 | , _bed3_chrom_start :: Int 158 | , _bed3_chrom_end :: Int 159 | } deriving (Eq, Show, Read, Generic, NFData) 160 | 161 | instance Ord BED3 where 162 | compare (BED3 x1 x2 x3) (BED3 y1 y2 y3) = compare (x1,x2,x3) (y1,y2,y3) 163 | 164 | instance BEDLike BED3 where 165 | chrom = lens _bed3_chrom (\bed x -> bed { _bed3_chrom = x }) 166 | chromStart = lens _bed3_chrom_start (\bed x -> bed { _bed3_chrom_start = x }) 167 | chromEnd = lens _bed3_chrom_end (\bed x -> bed { _bed3_chrom_end = x }) 168 | name = lens (const Nothing) (\bed _ -> bed) 169 | score = lens (const Nothing) (\bed _ -> bed) 170 | strand = lens (const Nothing) (\bed _ -> bed) 171 | 172 | instance BEDConvert BED3 where 173 | asBed = BED3 174 | 175 | fromLine l = case B.split '\t' l of 176 | (a:b:c:_) -> BED3 a (readInt b) $ readInt c 177 | _ -> error "Read BED fail: Incorrect number of fields" 178 | {-# INLINE fromLine #-} 179 | 180 | toLine (BED3 a b c) = B.intercalate "\t" 181 | [a, fromJust $ packDecimal b, fromJust $ packDecimal c] 182 | {-# INLINE toLine #-} 183 | 184 | -- | Bedgraph format. 
185 | data BEDGraph = BEDGraph 186 | { _bdg_chrom :: B.ByteString 187 | , _bdg_chrom_start :: Int 188 | , _bdg_chrom_end :: Int 189 | , _bdg_value :: Double 190 | } deriving (Eq, Show, Read, Generic, NFData) 191 | 192 | makeLensesFor [("_bdg_value", "bdgValue")] ''BEDGraph 193 | 194 | instance Ord BEDGraph where 195 | compare (BEDGraph x1 x2 x3 x4) (BEDGraph y1 y2 y3 y4) = 196 | compare (x1,x2,x3,x4) (y1,y2,y3,y4) 197 | 198 | instance BEDLike BEDGraph where 199 | chrom = lens _bdg_chrom (\bed x -> bed { _bdg_chrom = x }) 200 | chromStart = lens _bdg_chrom_start (\bed x -> bed { _bdg_chrom_start = x }) 201 | chromEnd = lens _bdg_chrom_end (\bed x -> bed { _bdg_chrom_end = x }) 202 | name = lens (const Nothing) (\bed _ -> bed) 203 | score = lens (const Nothing) (\bed _ -> bed) 204 | strand = lens (const Nothing) (\bed _ -> bed) 205 | 206 | instance BEDConvert BEDGraph where 207 | asBed a b c = BEDGraph a b c 0 208 | {-# INLINE asBed #-} 209 | 210 | fromLine l = case B.split '\t' l of 211 | (a:b:c:d:_) -> BEDGraph a (readInt b) (readInt c) $ readDouble d 212 | _ -> error "Read BEDGraph fail: Incorrect number of fields" 213 | {-# INLINE fromLine #-} 214 | 215 | toLine (BEDGraph a b c d) = B.intercalate "\t" 216 | [a, fromJust $ packDecimal b, fromJust $ packDecimal c, toShortest d] 217 | {-# INLINE toLine #-} 218 | 219 | 220 | -- | ENCODE narrowPeak format: https://genome.ucsc.edu/FAQ/FAQformat.html#format12 221 | data NarrowPeak = NarrowPeak 222 | { _npChrom :: B.ByteString 223 | , _npStart :: Int 224 | , _npEnd :: Int 225 | , _npName :: Maybe B.ByteString 226 | , _npScore :: Int 227 | , _npStrand :: Maybe Bool 228 | , _npSignal :: Double 229 | , _npPvalue :: Maybe Double 230 | , _npQvalue :: Maybe Double 231 | , _npPeak :: Maybe Int 232 | } deriving (Eq, Show, Read, Generic, NFData) 233 | 234 | makeLensesFor [ ("_npSignal", "npSignal") 235 | , ("_npPvalue", "npPvalue") 236 | , ("_npQvalue", "npQvalue") 237 | , ("_npPeak", "npPeak") 238 | ] ''NarrowPeak 239 | 240 | 
instance BEDLike NarrowPeak where 241 | chrom = lens _npChrom (\bed x -> bed { _npChrom = x }) 242 | chromStart = lens _npStart (\bed x -> bed { _npStart = x }) 243 | chromEnd = lens _npEnd (\bed x -> bed { _npEnd = x }) 244 | name = lens _npName (\bed x -> bed { _npName = x }) 245 | score = lens (Just . _npScore) (\bed x -> bed { _npScore = fromJust x }) 246 | strand = lens _npStrand (\bed x -> bed { _npStrand = x }) 247 | 248 | instance BEDConvert NarrowPeak where 249 | asBed chr s e = NarrowPeak chr s e Nothing 0 Nothing 0 Nothing Nothing Nothing 250 | 251 | fromLine = go . B.split '\t' 252 | where 253 | go [a,b,c] = convert $ BED3 a (readInt b) $ readInt c 254 | go (a:b:c:d:e:f:g:h:i:j:_) = NarrowPeak a (readInt b) (readInt c) 255 | (if d == "." then Nothing else Just d) 256 | (readInt e) 257 | (if f == "." then Nothing else if f == "+" then Just True else Just False) 258 | (readDouble g) 259 | (readDoubleNonnegative h) 260 | (readDoubleNonnegative i) 261 | (readIntNonnegative j) 262 | go x = error $ "Cannot parse line: " <> show x 263 | {-# INLINE fromLine #-} 264 | 265 | toLine (NarrowPeak a b c d e f g h i j) = B.intercalate "\t" 266 | [ a, fromJust $ packDecimal b, fromJust $ packDecimal c, fromMaybe "." d 267 | , fromJust $ packDecimal e 268 | , case f of 269 | Nothing -> "." 270 | Just True -> "+" 271 | _ -> "-" 272 | , toShortest g, fromMaybe "-1" $ fmap toShortest h 273 | , fromMaybe "-1" $ fmap toShortest i 274 | , fromMaybe "-1" $ fmap (fromJust . 
packDecimal) j 275 | ] 276 | {-# INLINE toLine #-} 277 | 278 | convert bed = NarrowPeak (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) (bed^.name) 279 | (fromMaybe 0 $ bed^.score) (bed^.strand) 0 Nothing Nothing Nothing 280 | 281 | -- | ENCODE broadPeak format: https://genome.ucsc.edu/FAQ/FAQformat.html#format13 282 | data BroadPeak = BroadPeak 283 | { _bpChrom :: B.ByteString 284 | , _bpStart :: Int 285 | , _bpEnd :: Int 286 | , _bpName :: Maybe B.ByteString 287 | , _bpScore :: Int 288 | , _bpStrand :: Maybe Bool 289 | , _bpSignal :: Double 290 | , _bpPvalue :: Maybe Double 291 | , _bpQvalue :: Maybe Double 292 | } deriving (Eq, Show, Read, Generic, NFData) 293 | 294 | makeLensesFor [ ("_bpSignal", "bpSignal") 295 | , ("_bpPvalue", "bpPvalue") 296 | , ("_bpQvalue", "bpQvalue") 297 | ] ''BroadPeak 298 | 299 | instance BEDLike BroadPeak where 300 | chrom = lens _bpChrom (\bed x -> bed { _bpChrom = x }) 301 | chromStart = lens _bpStart (\bed x -> bed { _bpStart = x }) 302 | chromEnd = lens _bpEnd (\bed x -> bed { _bpEnd = x }) 303 | name = lens _bpName (\bed x -> bed { _bpName = x }) 304 | score = lens (Just . _bpScore) (\bed x -> bed { _bpScore = fromJust x }) 305 | strand = lens _bpStrand (\bed x -> bed { _bpStrand = x }) 306 | 307 | instance BEDConvert BroadPeak where 308 | asBed chr s e = BroadPeak chr s e Nothing 0 Nothing 0 Nothing Nothing 309 | 310 | fromLine l = BroadPeak a (readInt b) (readInt c) 311 | (if d == "." then Nothing else Just d) 312 | (readInt e) 313 | (if f == "." then Nothing else if f == "+" then Just True else Just False) 314 | (readDouble g) 315 | (readDoubleNonnegative h) 316 | (readDoubleNonnegative i) 317 | where 318 | (a:b:c:d:e:f:g:h:i:_) = B.split '\t' l 319 | {-# INLINE fromLine #-} 320 | 321 | toLine (BroadPeak a b c d e f g h i) = B.intercalate "\t" 322 | [ a, fromJust $ packDecimal b, fromJust $ packDecimal c, fromMaybe "." d 323 | , fromJust $ packDecimal e 324 | , case f of 325 | Nothing -> "." 
326 | Just True -> "+" 327 | _ -> "-" 328 | , toShortest g, fromMaybe "-1" $ fmap toShortest h 329 | , fromMaybe "-1" $ fmap toShortest i 330 | ] 331 | {-# INLINE toLine #-} 332 | 333 | convert bed = BroadPeak (bed^.chrom) (bed^.chromStart) (bed^.chromEnd) (bed^.name) 334 | (fromMaybe 0 $ bed^.score) (bed^.strand) 0 Nothing Nothing 335 | 336 | 337 | data BEDExt bed a = BEDExt 338 | { _ext_bed :: bed 339 | , _ext_data :: a 340 | } deriving (Eq, Show, Read, Generic, NFData) 341 | 342 | makeLensesFor [("_ext_bed", "_bed"), ("_ext_data", "_data")] ''BEDExt 343 | 344 | instance BEDLike bed => BEDLike (BEDExt bed a) where 345 | chrom = _bed . chrom 346 | chromStart = _bed . chromStart 347 | chromEnd = _bed . chromEnd 348 | name = _bed . name 349 | score = _bed . score 350 | strand = _bed . strand 351 | 352 | instance (Read a, Show a, BEDConvert bed) => BEDConvert (BEDExt bed a) where 353 | asBed _ _ _ = error "Unable to transform arbitrary record to BEDExt" 354 | 355 | fromLine l = let (a, b) = B.breakEnd (=='\t') l 356 | in BEDExt (fromLine $ B.init a) $ read $ B.unpack b 357 | {-# INLINE fromLine #-} 358 | 359 | toLine (BEDExt bed a) = toLine bed <> "\t" <> B.pack (show a) 360 | {-# INLINE toLine #-} 361 | 362 | type BEDTree a = M.HashMap B.ByteString (IM.IntervalMap Int a) 363 | 364 | -- | a type to imply that underlying data structure is sorted 365 | newtype Sorted b = Sorted {fromSorted :: b} deriving (Show, Read, Eq) -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Bed/Utils.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE LambdaCase #-} 4 | {-# LANGUAGE RecordWildCards #-} 5 | {-# LANGUAGE BangPatterns #-} 6 | 7 | module Bio.Data.Bed.Utils 8 | ( fetchSeq 9 | , clipBed 10 | , CutoffMotif(..) 11 | , mkCutoffMotif 12 | , scanMotif 13 | , monoColonalize 14 | , BaseMap(..) 
, baseMap
    , queryBaseMap
    , rpkmBed
    , rpkmSortedBed
    , countTagsBed
    , countTagsBinBed
    , countTagsBinBed'
    , tagCountDistr
    , peakCluster
    ) where

import Conduit
import Lens.Micro
import Control.Monad.State.Strict
import qualified Data.ByteString.Char8 as B
import qualified Data.Foldable as F
import Data.Function (on)
import qualified Data.HashMap.Strict as M
import qualified Data.IntervalMap.Strict as IM
import Data.Maybe (fromJust, fromMaybe)
import qualified Data.Vector as V
import qualified Data.Vector.Algorithms.Intro as I
import qualified Data.Vector.Generic as G
import qualified Data.Vector.Generic.Mutable as GM
import qualified Data.Vector.Unboxed as U
import System.IO

import Bio.Data.Bed
import Bio.Data.Bed.Types
import Bio.Motif (Bkgd (..), Motif (..), CDF, PWM)
import qualified Bio.Motif as Motif
import qualified Bio.Motif.Search as Motif
import Bio.Seq hiding (length)
import Bio.Seq.IO
import qualified Bio.Utils.BitVector as BV

-- | Clip bed records to the given chromosome sizes. Records on unknown
-- chromosomes or lying entirely outside @[0, size)@ are dropped.
clipBed :: (BEDLike b, Monad m)
        => [(B.ByteString, Int)]   -- ^ Chromosome sizes
        -> ConduitT b b m ()
clipBed chrsize = concatMapC f
  where
    f x = case M.lookup (x^.chrom) chrsize' of
        Nothing -> Nothing
        Just n
            -- Fix: also drop records ending at or before 0; previously such a
            -- record survived clipping with chromEnd < chromStart.
            | x^.chromStart >= n || x^.chromEnd <= 0 -> Nothing
            | otherwise -> Just $ chromStart %~ max 0 $ chromEnd %~ min n $ x
    chrsize' = M.fromListWith (error "redundant chromosomes") chrsize
{-# INLINE clipBed #-}

-- | retreive sequences; reverse-complements the result for minus-strand
-- records.
fetchSeq :: BioSeq DNA a
         => Genome
         -> BED
         -> IO (Either String (DNA a))
fetchSeq g bed = do
    dna <- getSeq g (bed^.chrom, bed^.chromStart, bed^.chromEnd)
    return $ case bed^.strand of
        Just False -> rc <$> dna
        _ -> dna
{-# INLINE fetchSeq #-}

-- | Motif with predefined cutoff score.
All necessary intermediate data 77 | -- structure for motif scanning are stored. 78 | data CutoffMotif = CutoffMotif 79 | { _motif_name :: B.ByteString 80 | , _motif_pwm :: PWM 81 | , _motif_sigma :: U.Vector Double 82 | , _motif_pwm_rc :: PWM 83 | , _motif_sigma_rc :: U.Vector Double 84 | , _background :: Bkgd 85 | , _cutoff :: Double 86 | , _cdf :: CDF } 87 | 88 | mkCutoffMotif :: Bkgd 89 | -> Double -- ^ p-value 90 | -> Motif -> CutoffMotif 91 | mkCutoffMotif bg p motif = CutoffMotif (_name motif) (_pwm motif) sigma pwm' 92 | sigma' bg sc $ Motif.truncateCDF (1 - p * 10) cdf 93 | where 94 | cdf = Motif.scoreCDF bg $ _pwm motif 95 | sc = Motif.cdf' cdf $ 1 - p 96 | sigma = Motif.optimalScoresSuffix bg $ _pwm motif 97 | pwm' = Motif.rcPWM $ _pwm motif 98 | sigma' = Motif.optimalScoresSuffix bg pwm' 99 | 100 | -- | Motif score is in [0, 1000]: ( 1 / (1 + exp (-(-logP - 5))) ) * 1000. 101 | scanMotif :: (BEDLike b, MonadIO m) 102 | => Genome -> [CutoffMotif] -> ConduitT b BED m () 103 | scanMotif g motifs = awaitForever $ \bed -> do 104 | let (chr, start, end) = (bed^.chrom, bed^.chromStart, bed^.chromEnd) 105 | liftIO (getSeq g (chr, start, end)) >>= \case 106 | Left _ -> liftIO $ hPutStrLn stderr $ 107 | "Warning: no sequence for region: " ++ show (chr, start, end) 108 | Right dna -> forM_ motifs $ \CutoffMotif{..} -> do 109 | let mkBed str (i, sc) = BED chr (start + i) (start + i + n) 110 | (Just $ _motif_name) (Just $ toAffinity $ 1 - Motif.cdf _cdf sc) 111 | (Just str) 112 | n = Motif.size _motif_pwm 113 | -- Scan forward strand 114 | Motif.findTFBSWith _motif_sigma _background _motif_pwm 115 | (dna :: DNA IUPAC) _cutoff True .| mapC (mkBed True) 116 | -- Scan reverse strand 117 | Motif.findTFBSWith _motif_sigma_rc _background _motif_pwm_rc 118 | dna _cutoff True .| mapC (mkBed False) 119 | where 120 | toAffinity x' = round $ sc * 1000 121 | where 122 | sc = 1 / (1 + exp (-(x - 5))) 123 | x = negate $ logBase 10 $ max 1e-20 x' 124 | {-# INLINE scanMotif #-} 125 
-- | Process a sorted BED stream, keeping only mono-colonal tags.  Two tags
-- are clonal duplicates when they compare equal under 'compareBed' and lie on
-- the same strand; only the first of such a run is kept.  The input MUST be
-- sorted: an out-of-order record is a fatal error.
monoColonalize :: Monad m => ConduitT BED BED m ()
monoColonalize = do
    x <- headC
    case x of
        Just b -> yield b >> concatMapAccumC f b
        Nothing -> return ()
  where
    -- The accumulator is the previous record; the current record is emitted
    -- only when it differs from the previous one in position or strand.
    f cur prev = case compareBed prev cur of
        GT -> error $
            "Input is not sorted: " ++ show prev ++ " > " ++ show cur
        LT -> (cur, [cur])
        _ -> if prev^.strand == cur^.strand then (cur, []) else (cur, [cur])
{-# INLINE monoColonalize #-}

-- | One bit per genomic position per chromosome; a bit is set when at least
-- one tag starts at that position.
newtype BaseMap = BaseMap (M.HashMap B.ByteString BV.BitVector)

-- | Count the tags (starting positions) at each position in the genome.
baseMap :: PrimMonad m
        => [(B.ByteString, Int)]   -- ^ chromosomes and their sizes
        -> ConduitT BED o m BaseMap
baseMap chrs = do
    bvs <- lift $ fmap M.fromList $ forM chrs $ \(chr, n) -> do
        bv <- BV.zeros n
        return (chr, bv)

    -- Tags on chromosomes absent from the size list are silently ignored.
    -- The marked position is the 5' end: chromStart on the plus strand,
    -- chromEnd - 1 on the minus strand (strand-less tags count as plus).
    mapM_C $ \bed -> case M.lookup (bed^.chrom) bvs of
        Nothing -> return ()
        Just bv -> if fromMaybe True $ bed^.strand
            then BV.set bv $ bed^.chromStart
            else BV.set bv $ bed^.chromEnd - 1

    lift $ fmap BaseMap $ sequence $ fmap BV.unsafeFreeze bvs

-- | Look up a region in a 'BaseMap'.  Returns one Bool per base; the list is
-- reversed for minus-strand queries so it always reads 5'->3' with respect to
-- the query.  'Nothing' when the chromosome is absent from the map.
queryBaseMap :: BEDLike b => b -> BaseMap -> Maybe [Bool]
queryBaseMap bed (BaseMap bm) = case M.lookup (bed^.chrom) bm of
    Nothing -> Nothing
    Just bv ->
        let res = map (bv BV.!) [bed^.chromStart .. bed^.chromEnd - 1]
        in case bed^.strand of
            Just False -> Just $ reverse res
            _ -> Just res

-- | calculate RPKM on a set of unique regions. Regions (in bed format) would be kept in
-- memory but not tag file.
-- RPKM: Readcounts per kilobase per million reads. Only counts the starts of tags
rpkmBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
        => [b] -> ConduitT BED o m (v Double)
rpkmBed regions = do
    -- Sort the regions (remembering their original indices) so that
    -- 'rpkmSortedBed' can be used; results are scattered back afterwards.
    v <- lift $ do
        v' <- V.unsafeThaw . V.fromList . zip [0..] $ regions
        I.sortBy (compareBed `on` snd) v'
        V.unsafeFreeze v'
    let (idx, sortedRegions) = V.unzip v
        n = G.length idx
    readCount <- rpkmSortedBed $ Sorted sortedRegions

    lift $ do
        result <- GM.new n
        G.sequence_ . G.imap (\x i -> GM.unsafeWrite result i (readCount U.! x)) $ idx
        G.unsafeFreeze result
{-# INLINE rpkmBed #-}

-- | calculate RPKM on a set of regions. Regions must be sorted. The Sorted data
-- type is used to remind users to sort their data.
rpkmSortedBed :: (PrimMonad m, BEDLike b, G.Vector v Double)
              => Sorted (V.Vector b) -> ConduitT BED o m (v Double)
rpkmSortedBed (Sorted regions) = do
    vec <- lift $ GM.replicate l 0
    n <- foldMC (count vec) (0 :: Int)
    -- FIX: guard against an empty tag stream.  With n == 0 every count is
    -- zero and the result should be all zeros rather than NaN (0/0).
    let factor = fromIntegral (max 1 n) / 1e9
    lift $ fmap (G.imap (\i x -> x / factor / (fromIntegral . size) (regions V.! i)))
         $ G.unsafeFreeze vec
  where
    count v nTags tag = do
        -- A tag is counted at its 5' end: chromStart on the plus strand,
        -- chromEnd - 1 on the minus strand.
        let p | tag^.strand == Just True = tag^.chromStart
              | tag^.strand == Just False = tag^.chromEnd - 1
              | otherwise = error "Unknown strand"
            xs = concat $ IM.elems $
                IM.containing (M.lookupDefault IM.empty (tag^.chrom) intervalMap) p
        addOne v xs
        return $ succ nTags

    intervalMap = sortedBedToTree (++) . Sorted . G.toList . G.zip regions .
        G.map return . G.enumFromN 0 $ l
    addOne v' = mapM_ $ \x -> GM.unsafeRead v' x >>= GM.unsafeWrite v' x . (+1)
    l = G.length regions
{-# INLINE rpkmSortedBed #-}

-- | Count, for each region, the number of tags whose 5' end (strand-aware)
-- falls inside the region.  Returns the per-region counts together with the
-- total number of tags seen on the stream.
countTagsBed :: (PrimMonad m, BEDLike b, G.Vector v Int)
             => [b]   -- ^ regions of interest
             -> ConduitT BED o m (v Int, Int)
countTagsBed regions = do
    vec <- lift $ GM.replicate l 0
    n <- foldMC (count vec) (0 :: Int)
    vec' <- lift $ G.unsafeFreeze vec
    return (vec', n)
  where
    count v nTags tag = do
        let p | tag^.strand == Just True = tag^.chromStart
              | tag^.strand == Just False = tag^.chromEnd - 1
              | otherwise = error "Unknown strand"
            xs = concat $ IM.elems $
                IM.containing (M.lookupDefault IM.empty (tag^.chrom) intervalMap) p
        addOne v xs
        return $ succ nTags
    intervalMap = bedToTree (++) $ zip regions $ map return [0..]
    addOne v' = mapM_ $ \x -> GM.unsafeRead v' x >>= GM.unsafeWrite v' x . (+1)
    l = length regions
{-# INLINE countTagsBed #-}

-- | divide each region into consecutive bins, and count tags for each bin and
-- return the number of all tags. Note: a tag is considered to be overlapped
-- with a region only if the starting position of the tag is in the region. For
-- the common sense overlapping, use countTagsBinBed'.
countTagsBinBed :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b)
                => Int   -- ^ bin size
                -> [b]   -- ^ regions
                -> ConduitT BED o m ([v a], Int)
countTagsBinBed k beds = do
    vs <- lift $ fmap V.fromList $ forM beds $ \bed -> do
        let start = bed^.chromStart
            -- FIX: guarantee at least one bin.  A region shorter than the
            -- bin size previously produced a zero-length vector, and the
            -- clamped index below (l - 1 == -1) became an out-of-bounds
            -- unsafe write.
            num = max 1 $ ((bed^.chromEnd) - start) `div` k
            index i = (i - start) `div` k
        v <- GM.replicate num 0
        return (v, index)
    nTags <- foldMC (f vs) 0
    readCount <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
    return (readCount, nTags)
  where
    f vs n bed = do
        let pos | bed^.strand == Just True = bed^.chromStart
                | bed^.strand == Just False = bed^.chromEnd - 1
                | otherwise = error "Unknown strand"
            overlaps = concat $ IM.elems $ IM.containing
                (M.lookupDefault IM.empty (bed^.chrom) intervalMap) pos
        forM_ overlaps $ \x -> do
            let (v, idxFn) = vs `G.unsafeIndex` x
                -- Clamp into the last bin when the tag sits in the trailing
                -- partial bin of the region.
                i = let i' = idxFn pos
                        l = GM.length v
                    in if i' >= l then l - 1 else i'
            GM.unsafeModify v (+1) i
        return $ n + 1
    intervalMap = bedToTree (++) $ zip beds $ map return [0..]
{-# INLINE countTagsBinBed #-}

-- | Same as countTagsBinBed, except that tags are treated as complete intervals
-- instead of single points.
countTagsBinBed' :: (Integral a, PrimMonad m, G.Vector v a, BEDLike b1, BEDLike b2)
                 => Int    -- ^ bin size
                 -> [b1]   -- ^ regions
                 -> ConduitT b2 o m ([v a], Int)
countTagsBinBed' k beds = do
    initRC <- lift $ forM beds $ \bed -> do
        let start = bed^.chromStart
            end = bed^.chromEnd
            -- FIX: at least one bin, for the same reason as countTagsBinBed.
            num = max 1 $ (end - start) `div` k
            index i = (i - start) `div` k
        v <- GM.replicate num 0
        return (v, index)

    sink 0 $ V.fromList initRC
  where
    sink !nTags vs = do
        tag <- await
        case tag of
            Just bed -> do
                let chr = bed^.chrom
                    start = bed^.chromStart
                    end = bed^.chromEnd
                    overlaps = concat $ IM.elems $ IM.intersecting
                        (M.lookupDefault IM.empty chr intervalMap) $ IM.IntervalCO start end
                lift $ forM_ overlaps $ \x -> do
                    let (v, idxFn) = vs `G.unsafeIndex` x
                        -- Clamp the tag's span to the region's bin range.
                        lo = let i = idxFn start
                             in if i < 0 then 0 else i
                        hi = let i = idxFn end
                                 l = GM.length v
                             in if i >= l then l - 1 else i
                    forM_ [lo..hi] $ \i ->
                        GM.unsafeRead v i >>= GM.unsafeWrite v i . (+1)
                sink (nTags+1) vs

            _ -> do readCount <- lift $ mapM (G.unsafeFreeze . fst) $ G.toList vs
                    return (readCount, nTags)

    intervalMap = bedToTree (++) $ zip beds $ map return [0..]
{-# INLINE countTagsBinBed' #-}

-- | Distribution of per-position tag counts: result index i (0..99) holds the
-- number of genomic positions with exactly i tags assigned to them; counts of
-- 99 or more are clamped into the last bucket.
tagCountDistr :: PrimMonad m => G.Vector v Int => ConduitT BED o m (v Int)
tagCountDistr = loop M.empty
  where
    loop m = do
        x <- await
        case x of
            Just bed -> do
                -- Key each tag by its 5' position; minus-strand tags use a
                -- transformed coordinate (1 - chromEnd), which keeps the two
                -- strands in disjoint key ranges so they are tallied
                -- separately.
                let p | fromMaybe True (bed^.strand) = bed^.chromStart
                      | otherwise = 1 - bed^.chromEnd
                case M.lookup (bed^.chrom) m of
                    Just table -> loop $ M.insert (bed^.chrom) (M.insertWith (+) p 1 table) m
                    _ -> loop $ M.insert (bed^.chrom) (M.fromList [(p,1)]) m
            -- End of stream: histogram the accumulated per-position counts.
            _ -> lift $ do
                vec <- GM.replicate 100 0
                F.forM_ m $ \table ->
                    F.forM_ table $ \v -> do
                        let i = min 99 v
                        GM.unsafeRead vec i >>= GM.unsafeWrite vec i . (+1)
                G.unsafeFreeze vec
{-# INLINE tagCountDistr #-}

-- | cluster peaks
peakCluster :: (BEDLike b, Monad m)
            => [b]   -- ^ peaks
            -> Int   -- ^ radius
            -> Int   -- ^ cutoff
            -> ConduitT o BED m ()
peakCluster peaks r th = mergeBedWith mergeFn peaks' .| filterC g
  where
    -- Replace every peak by a fixed-width window of 2*r centred on its
    -- midpoint before merging.
    peaks' = map f peaks
    f b = let c = (b^.chromStart + b^.chromEnd) `div` 2
          in asBed (b^.chrom) (c-r) (c+r) :: BED3
    -- A merged cluster spans all member windows; its score records how many
    -- peaks it absorbed, which the cutoff filter 'g' tests against.
    mergeFn xs = asBed (head xs ^.
chrom) lo hi & score .~ Just (fromIntegral $ length xs) 346 | where 347 | lo = minimum $ map (^.chromStart) xs 348 | hi = maximum $ map (^.chromEnd) xs 349 | g b = fromJust (b^.score) >= fromIntegral th 350 | {-# INLINE peakCluster #-} 351 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/BigWig.hs: -------------------------------------------------------------------------------- 1 | module Bio.Data.BigWig 2 | ( module Data.BBI.BigWig 3 | , accumScores 4 | , normalizedScores 5 | ) where 6 | 7 | import Data.BBI.BigWig 8 | import Data.Conduit 9 | import qualified Data.Conduit.List as CL 10 | import Bio.Data.Bed 11 | 12 | -- | for each BED region, return total scores of all wig records overlapped with it 13 | accumScores :: BEDLike b => BWFile -> Conduit b IO Double 14 | accumScores bwF = CL.mapM (helper bwF) 15 | -} 16 | 17 | -- | for each BED region, return normalized scores (divided by the length) of 18 | -- all wig records overlapped with it 19 | normalizedScores :: BEDLike b => BWFile -> Conduit b IO Double 20 | normalizedScores bwF = CL.mapM $ \bed -> do 21 | x <- helper bwF bed 22 | return $ x / (fromIntegral . size) bed 23 | 24 | helper :: BEDLike b => BWFile -> b -> IO Double 25 | helper bwF bed = queryBWFile bwF (chr, start, end) $$ CL.fold f 0.0 26 | where 27 | f acc (_, s, e, v) = acc + fromIntegral (e - s) * v 28 | chr = chrom bed 29 | start = chromStart bed 30 | end = chromEnd bed 31 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Fasta.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleInstances #-} 2 | {-# LANGUAGE UndecidableInstances #-} 3 | 4 | module Bio.Data.Fasta 5 | ( FastaLike(..) 
    , fastaReader
    ) where

import Bio.Motif
import Bio.Seq
import qualified Data.ByteString.Char8 as B
import Conduit

-- | Types that can be built from a FASTA record (a header plus body lines).
class FastaLike f where
    -- | Convert a FASTA record, consisting of a record header and a record body,
    -- to a specific data type
    fromFastaRecord :: (B.ByteString, [B.ByteString]) -> f

    -- | Stream records from a FASTA file, converting each one.
    readFasta :: FilePath -> ConduitT i f (ResourceT IO) ()
    readFasta fl = fastaReader fl .| mapC fromFastaRecord

    -- | non-stream version, read whole file in memory
    readFasta' :: FilePath -> IO [f]
    readFasta' fl = runResourceT $ runConduit $ readFasta fl .| sinkList
    {-# MINIMAL fromFastaRecord #-}

-- Biological sequences: the body lines are concatenated and decoded; a decode
-- failure is fatal.
instance BioSeq s a => FastaLike (s a) where
    fromFastaRecord (_, xs) = case fromBS (B.concat xs) of
        Left err -> error err
        Right x -> x
    {-# INLINE fromFastaRecord #-}

-- Motifs: the header is the motif name, the body is fed to 'toPWM'.
instance FastaLike Motif where
    fromFastaRecord (name, mat) = Motif name (toPWM mat)
    {-# INLINE fromFastaRecord #-}

-- | Stream (header, body-lines) pairs from a FASTA file.  The leading '>' is
-- stripped from headers; empty lines are skipped.  Lines appearing before the
-- first '>' would be emitted as a malformed record -- assumes well-formed
-- input starting with a header line.
fastaReader :: FilePath
            -> ConduitT i (B.ByteString, [B.ByteString]) (ResourceT IO) ()
fastaReader fl = sourceFile fl .| linesUnboundedAsciiC .| loop []
  where
    -- acc holds the current record's lines in reverse (header last).
    loop acc = do
        x <- await
        case x of
            Just l -> case () of
               _ | B.null l -> loop acc  -- empty line, go to next line
                 | B.head l == '>' -> output (reverse acc) >> loop [B.tail l]
                 | otherwise -> loop (l:acc)
            Nothing -> output $ reverse acc
    output (x:xs) = yield (x, xs)
    output _ = return ()
{-# INLINE fastaReader #-}
--------------------------------------------------------------------------------
/bioinformatics-toolkit/src/Bio/Data/Fastq.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric, DeriveAnyClass #-}
module Bio.Data.Fastq
    ( Fastq(..)
6 | , streamFastqGzip 7 | , streamFastq 8 | , sinkFastqGzip 9 | , sinkFastq 10 | , parseFastqC 11 | , parseFastqC' 12 | , fastqToByteString 13 | , qualitySummary 14 | , trimPolyA 15 | ) where 16 | 17 | import Conduit 18 | import Data.Conduit.Zlib (ungzip, multiple, gzip) 19 | import qualified Data.ByteString.Char8 as B 20 | import qualified Data.ByteString as BS 21 | import qualified Data.Attoparsec.ByteString as A 22 | import Data.Attoparsec.ByteString.Char8 23 | import Data.Conduit.Attoparsec 24 | import GHC.Generics (Generic) 25 | import Control.DeepSeq 26 | 27 | -- | A FASTQ file normally uses four lines per sequence. 28 | -- 29 | -- * Line 1 begins with a '@' character and is followed by a sequence 30 | -- identifier and an optional description (like a FASTA title line). 31 | -- 32 | -- * Line 2 is the raw sequence letters. 33 | -- 34 | -- * Line 3 begins with a '+' character and is optionally followed by the 35 | -- same sequence identifier (and any description) again. 36 | -- 37 | -- * Line 4 encodes the quality values for the sequence in Line 2, and must 38 | -- contain the same number of symbols as letters in the sequence. 39 | data Fastq = Fastq 40 | { fastqSeqId :: B.ByteString 41 | , fastqSeq :: B.ByteString 42 | , fastqSeqQual :: B.ByteString 43 | } deriving (Show, Eq, Generic, NFData) 44 | 45 | -- | Read gzipped fastq file. 
46 | streamFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m) 47 | => FilePath -> ConduitT i Fastq m () 48 | streamFastqGzip fl = sourceFileBS fl .| multiple ungzip .| parseFastqC 49 | 50 | streamFastq :: (MonadResource m, MonadThrow m) 51 | => FilePath -> ConduitT i Fastq m () 52 | streamFastq fl = sourceFileBS fl .| parseFastqC 53 | 54 | sinkFastq :: (MonadResource m, MonadThrow m) 55 | => FilePath -> ConduitT Fastq o m () 56 | sinkFastq fl = mapC fastqToByteString .| unlinesAsciiC .| sinkFileBS fl 57 | 58 | sinkFastqGzip :: (PrimMonad m, MonadThrow m, MonadResource m) 59 | => FilePath -> ConduitT Fastq o m () 60 | sinkFastqGzip fl = mapC fastqToByteString .| unlinesAsciiC .| gzip .| sinkFileBS fl 61 | 62 | data FQBuilder = Init FQBuilder 63 | | FQ1 B.ByteString FQBuilder 64 | | FQ2 B.ByteString FQBuilder 65 | | FQ3 B.ByteString FQBuilder 66 | | Complete 67 | 68 | fqBuilder :: FQBuilder -> Fastq 69 | fqBuilder = go ([], [], []) 70 | where 71 | go acc (Init bldr) = go acc bldr 72 | go (f1,f2,f3) (FQ1 x bldr) = go (x:f1, f2, f3) bldr 73 | go (f1,f2,f3) (FQ2 x bldr) = go (f1, x:f2, f3) bldr 74 | go (f1,f2,f3) (FQ3 x bldr) = go (f1, f2, x:f3) bldr 75 | go (f1,f2,f3) Complete = Fastq (B.concat $ reverse f1) 76 | (B.concat $ reverse f2) (B.concat $ reverse f3) 77 | {-# INLINE fqBuilder #-} 78 | 79 | parseFastqC :: Monad m => ConduitT B.ByteString Fastq m () 80 | parseFastqC = await >>= maybe (error "Empty input") ( \x -> do 81 | if B.head x == '@' 82 | then loop Init 'a' $ B.tail x 83 | else error "Record does not start with \'@\'" ) 84 | where 85 | tryRead1 input | B.null input = await >>= maybe (error "Unexpected EOF") return 86 | | otherwise = return input 87 | loop acc st input = case st of 88 | 'a' -> do 89 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 90 | if B.null rest 91 | then loop (acc . FQ1 x) 'a' rest 92 | else loop (acc . 
FQ1 x) 'b' $ B.tail rest 93 | 'b' -> do 94 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 95 | if B.null rest 96 | then loop (acc . FQ2 x) 'b' rest 97 | else loop (acc . FQ2 x) 'B' $ B.tail rest 98 | 'B' -> do 99 | input' <- tryRead1 input 100 | if B.head input' == '+' 101 | then loop acc 'c' $ B.tail input' 102 | else do 103 | let (x, rest) = B.break (=='\n') input' 104 | if B.null rest 105 | then loop (acc . FQ2 x) 'b' rest 106 | else loop (acc . FQ2 x) 'B' $ B.tail rest 107 | 'c' -> do 108 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 109 | if B.null rest 110 | then loop acc 'c' rest 111 | else loop acc 'd' $ B.tail rest 112 | 'd' -> do 113 | (x, rest) <- B.break (=='\n') <$> tryRead1 input 114 | if B.null rest 115 | then loop (acc . FQ3 x) 'd' rest 116 | else loop (acc . FQ3 x) 'D' $ B.tail rest 117 | 'D' -> if B.null input 118 | then await >>= \case 119 | Nothing -> yield $ fqBuilder $ acc Complete 120 | Just input' -> if B.head input' == '@' 121 | then do 122 | yield $ fqBuilder $ acc Complete 123 | loop Init 'a' $ B.tail input' 124 | else loop acc 'd' input' 125 | else if B.head input == '@' 126 | then do 127 | yield $ fqBuilder $ acc Complete 128 | loop Init 'a' $ B.tail input 129 | else loop acc 'd' input 130 | {-# INLINE parseFastqC #-} 131 | 132 | parseFastqC' :: MonadThrow m => ConduitT B.ByteString Fastq m () 133 | parseFastqC' = conduitParser fastqParser .| mapC snd 134 | where 135 | fastqParser = do 136 | _ <- skipWhile (/='@') >> char8 '@' 137 | ident <- A.takeTill isEndOfLine 138 | endOfLine 139 | sequ <- BS.filter (not . isEndOfLine) <$> takeTill (=='+') 140 | char8 '+' >> A.skipWhile (not . isEndOfLine) >> endOfLine 141 | score <- BS.filter (not . 
isEndOfLine) <$> 142 | A.scan 0 (f (B.length sequ)) 143 | skipWhile (/='@') 144 | return $ Fastq ident sequ score 145 | where 146 | f n i x | i >= n = Nothing 147 | | isEndOfLine x = Just i 148 | | otherwise = Just $ i + 1 149 | {-# INLINE parseFastqC' #-} 150 | 151 | fastqToByteString :: Fastq -> B.ByteString 152 | fastqToByteString (Fastq a b c) = B.concat ["@", a, "\n", b, "\n+\n", c] 153 | {-# INLINE fastqToByteString #-} 154 | 155 | -- | Get the mean and variance of quality scores at every position. 156 | qualitySummary :: Monad m => ConduitT Fastq o m [(Double, Double)] 157 | qualitySummary = mapC (map fromIntegral . decodeQualSc) .| meanVarianceC 158 | 159 | meanVarianceC :: Monad m => ConduitT [Double] o m [(Double, Double)] 160 | meanVarianceC = peekC >>= \case 161 | Nothing -> error "Empty input" 162 | Just x -> fst <$> foldlC f (replicate (length x) (0,0), 0 :: Int) 163 | where 164 | f (acc, n) xs = let acc' = zipWith g acc xs in (acc', n') 165 | where 166 | n' = n + 1 167 | g (m, s) x = (m', s') 168 | where 169 | m' = m + d / fromIntegral n' 170 | s' = s + d * (x - m') 171 | d = x - m 172 | {-# INLINE meanVarianceC #-} 173 | 174 | decodeQualSc :: Fastq -> [Int] 175 | decodeQualSc = map (fromIntegral . (\x -> x - 33)) . 
BS.unpack .fastqSeqQual 176 | {-# INLINE decodeQualSc #-} 177 | 178 | pError :: Int -> Double 179 | pError x = 10 ** (negate (fromIntegral x) / 10) 180 | {-# INLINE pError #-} 181 | 182 | {- 183 | mkFastqRecord l1 l2 l3 l4 = Fastq (parseLine1 l1) (parseLine2 l2) 184 | (parseLine3 l3) (parseLine4 l4) 185 | where 186 | parseLine1 x | B.head x == '@' = B.tail x 187 | | otherwise = error $ "Parse line 1 failed: " ++ B.unpack x 188 | parseLine2 x | B.all f x = x 189 | | otherwise = error $ "Parse line 2 failed: " ++ B.unpack x 190 | where 191 | f 'C' = True 192 | f 'G' = True 193 | f 'T' = True 194 | f 'A' = True 195 | f 'N' = True 196 | f _ = False 197 | parseLine3 x | B.head x == '+' = B.tail x 198 | | otherwise = error $ "Parse line 3 failed: " ++ B.unpack x 199 | parseLine4 x | BS.all f x = x 200 | | otherwise = error $ "Parse line 4 failed: " ++ B.unpack x 201 | where 202 | f b = let b' = fromIntegral b :: Int 203 | in b' >= 33 && b' <= 126 204 | -} 205 | 206 | -- | Remove trailing 'A' 207 | trimPolyA :: Int -> Fastq -> Fastq 208 | trimPolyA n f@(Fastq a b c) 209 | | B.length trailing >= n = Fastq a b' $ B.take (B.length b') c 210 | | otherwise = f 211 | where 212 | (b', trailing) = B.spanEnd (=='A') b -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Data/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TemplateHaskell #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE FunctionalDependencies #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | 8 | module Bio.Data.Types where 9 | ( FileFormat(..) 10 | , File(..) 11 | , emptyFile 12 | , location 13 | , replication 14 | , format 15 | , keywords 16 | , Experiment(..) 
17 | , eid 18 | , control 19 | , celltype 20 | , target 21 | , files 22 | , info 23 | ) where 24 | 25 | import Data.Aeson 26 | import Data.Aeson.TH (deriveJSON, defaultOptions) 27 | import Shelly hiding (FilePath) 28 | import qualified Data.ByteString as B 29 | import qualified Data.ByteString.Char8 as BC 30 | import qualified Data.Text as T 31 | import Control.Lens (makeFields, (^.), (.~)) 32 | import qualified Data.HashMap.Strict as M 33 | import Crypto.Hash.MD5 (hash) 34 | import Numeric (showHex) 35 | 36 | data FileFormat = BamFile 37 | | BaiFile 38 | | BedFile 39 | | BedGZip 40 | | FastqFile 41 | | FastqGZip 42 | | BedgraphFile 43 | | BigWigFile 44 | | NarrowPeakFile 45 | | BroadPeakFile 46 | | Other 47 | deriving (Show, Read, Eq) 48 | 49 | data File = File 50 | { fileLocation :: !FilePath 51 | , fileReplication :: !Int 52 | , fileFormat :: !FileFormat 53 | , fileKeywords :: ![String] 54 | , fileInfo :: !(M.HashMap String String) 55 | } deriving (Show, Read, Eq) 56 | 57 | makeFields ''File 58 | 59 | emptyFile :: File 60 | emptyFile = File 61 | { fileLocation = "" 62 | , fileReplication = 0 63 | , fileFormat = Other 64 | , fileKeywords = [] 65 | , fileInfo = M.empty 66 | } 67 | 68 | data Experiment = Experiment 69 | { experimentEid :: !String 70 | , experimentCelltype :: !String 71 | , experimentTarget :: !String 72 | , experimentFiles :: ![File] 73 | , experimentInfo :: !(M.HashMap String String) 74 | , experimentControl :: !(Maybe String) 75 | } deriving (Show, Read, Eq) 76 | 77 | makeFields ''Experiment 78 | 79 | deriveJSON defaultOptions ''Format 80 | 81 | instance FromJSON File where 82 | parseJSON (Object obj) = do 83 | path <- obj .: "path" 84 | File <$> return path <*> 85 | obj .:? "rep" .!= 1 <*> 86 | obj .:? "format" .!= getFormat path <*> 87 | return [] <*> 88 | return M.empty 89 | 90 | instance FromJSON Experiment where 91 | parseJSON (Object obj) = do 92 | fls <- obj .: "files" 93 | let eid' = concat . 
map (flip showHex "") $ B.unpack $ hash $ BC.pack $ 94 | unlines $ map (^.location) fls 95 | Experiment <$> obj .:? "id" .!= eid' <*> 96 | obj .:? "celltype" .!= "" <*> 97 | obj .: "target" <*> 98 | return fls <*> 99 | return M.empty <*> 100 | obj .:? "control" 101 | 102 | getFormat :: FilePath -> Format 103 | getFormat fl = case suf of 104 | "bam" -> Bam 105 | "bed" -> Bed 106 | "gz" -> case (snd $ T.breakOnEnd "." $ T.init pre) of 107 | "bed" -> BedGZip 108 | _ -> error "Unknown file format" 109 | _ -> error "Unknown file format" 110 | where 111 | (pre, suf) = T.breakOnEnd "." $ T.toLower $ T.pack fl 112 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/GO.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Bio.GO 3 | ( GO(..) 4 | , GOId 5 | , GOMap 6 | , getGOLevel 7 | ) where 8 | 9 | import qualified Data.HashMap.Strict as M 10 | import Data.Maybe 11 | import qualified Data.Text as T 12 | 13 | data GO = GO 14 | { _oboId :: !GOId 15 | , _label :: !T.Text 16 | , _subProcessOf :: ![GOId] 17 | , _oboNS :: !T.Text 18 | } deriving (Show, Read) 19 | 20 | type GOId = Int 21 | 22 | type GOMap = M.HashMap GOId GO 23 | 24 | -- | The top level is 0. 25 | getGOLevel :: GOId -> GOMap -> Int 26 | getGOLevel gid gm = loop 0 [gid] 27 | where 28 | loop l ids | null parents = l 29 | | otherwise = loop (l+1) parents 30 | where 31 | parents = concatMap _subProcessOf $ flip mapMaybe ids $ \i -> M.lookup i gm 32 | 33 | {- 34 | getParentById :: GOId -> GOMap -> Maybe GO 35 | getParentById gid goMap = M.lookup gid goMap >>= _subProcessOf 36 | >>= (`M.lookup` goMap) 37 | {-# INLINE getParentById #-} 38 | 39 | -- | Add a GO term to the count table. Term counts will propogate from child to 40 | -- its parents. This function works for cyclical graph as well. 
41 | addTerm :: GO -> GOMap -> TermCount -> TermCount 42 | addTerm g m t = loop S.empty g t 43 | where 44 | loop visited go table 45 | | _oboId go `S.member` visited = table 46 | | otherwise = case _subProcessOf go of 47 | Nothing -> table' 48 | Just gid -> loop (S.insert (_oboId go) visited) 49 | (M.lookupDefault undefined gid m) table' 50 | where 51 | table' = M.insertWith (+) (_oboId go) 1 table 52 | 53 | enrichment :: (TermCount, Int) -- ^ Background frequency and the total number 54 | -> (TermCount, Int) -- ^ Foreground 55 | -> [(GOId, Double, Double)] 56 | enrichment (bg, bg_total) (fg, fg_total) = 57 | flip map (M.toList fg) $ \(gid, fg_count) -> 58 | let enrich = fromIntegral (fg_count * bg_total) / 59 | fromIntegral (fg_total * bg_count) 60 | bg_count = M.lookupDefault undefined gid bg 61 | p = 1 - hyperquick fg_count bg_count fg_total bg_total 62 | in (gid, enrich, p) 63 | -} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/GO/Parser.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE StrictData #-} 4 | module Bio.GO.Parser 5 | ( readOWL 6 | , readOWLAsMap 7 | , GAF(..) 
8 | , readGAF 9 | ) where 10 | 11 | import Control.Arrow ((&&&)) 12 | import Conduit 13 | import qualified Data.ByteString.Lazy.Char8 as L 14 | import qualified Data.ByteString.Char8 as B 15 | import qualified Data.HashMap.Strict as M 16 | import Data.Text.Encoding (decodeUtf8) 17 | import Text.XML.Expat.Proc 18 | import Text.XML.Expat.Tree 19 | import Data.Maybe 20 | import qualified Data.CaseInsensitive as CI 21 | 22 | import Bio.GO 23 | import Bio.Utils.Misc (readInt) 24 | 25 | readOWL :: FilePath -> IO [GO] 26 | readOWL fl = do 27 | xml <- parseThrowing defaultParseOptions <$> L.readFile fl :: IO (Node B.ByteString B.ByteString) 28 | return $ map process $ filterChildren (\x -> "owl:Class" == getName x && not (isDeprecated x)) xml 29 | where 30 | isDeprecated x = isJust $ findChild "owl:deprecated" x 31 | process record = GO id' label parent namespace 32 | where 33 | id' = case findChild "oboInOwl:id" record of 34 | Nothing -> error $ "Cannot find id field for: " <> show record 35 | Just i -> readInt $ snd $ B.breakEnd (==':') $ B.concat $ map getText $ getChildren i 36 | label = case findChild "rdfs:label" record of 37 | Nothing -> error "readOWL: cannot find label field" 38 | Just l -> decodeUtf8 $ B.concat $ map getText $ getChildren l 39 | namespace = case findChild "oboInOwl:hasOBONamespace" record of 40 | Nothing -> error "readOWL: cannot find namespace field" 41 | Just ns -> decodeUtf8 $ B.concat $ map getText $ getChildren ns 42 | parent = 43 | let f p = case lookup "rdf:resource" (getAttributes p) of 44 | Nothing -> Nothing 45 | Just at -> Just $ readInt $ snd $ B.breakEnd (=='_') at 46 | in mapMaybe f $ findChildren "rdfs:subClassOf" record 47 | 48 | readOWLAsMap :: FilePath -> IO GOMap 49 | readOWLAsMap fl = M.fromListWith errMsg . map (_oboId &&& id) <$> readOWL fl 50 | where 51 | errMsg = error "readOWLAsMap: Duplicate records." 

-- | One row of a GO Annotation File.  Field names follow the GAF 2.1 column
-- definitions; columns that the spec marks optional are Maybe.
data GAF = GAF
    { gafDb :: B.ByteString
    , gafDbId :: B.ByteString
    , gafSymbol :: CI.CI B.ByteString
    , gafQualifier :: Maybe [B.ByteString]
    , gafGoId :: GOId
    , gafDbRef :: [B.ByteString]
    , gafEvidenceCode :: B.ByteString
    , gafWithOrFrom :: Maybe [B.ByteString]
    , gafAspect :: B.ByteString
    , gafName :: Maybe B.ByteString
    , gafSynonym :: Maybe [B.ByteString]
    , gafType :: B.ByteString
    , gafTaxon :: [B.ByteString]
    , gafDate :: B.ByteString
    , gafAssignedBy :: B.ByteString
    , gafAnnotationExtension :: Maybe [B.ByteString]
    , gafGeneProductID :: Maybe B.ByteString
    } deriving (Show, Ord, Eq)

-- | GO Annotation File (GAF) Format 2.1 Parser. For details read:
-- http://geneontology.org/page/go-annotation-file-gaf-format-21.
readGAF :: FilePath -> ConduitT i GAF (ResourceT IO) ()
readGAF input = sourceFileBS input .| linesUnboundedAsciiC .|
    (dropWhileC isCom >> mapC parseLine)
  where
    -- FIX: test for emptiness before B.head.  The original order,
    -- B.head l == '!' || B.null l, evaluated the partial B.head first and
    -- crashed on blank leading lines.
    isCom l = B.null l || B.head l == '!'
{-# INLINE readGAF #-}

-- | Split one tab-separated GAF line into its 17 columns.  Fails with a
-- non-exhaustive-pattern error if the line does not have exactly 17 fields.
parseLine :: B.ByteString -> GAF
parseLine l = GAF f1 f2 (CI.mk f3) (optionals f4)
    (readInt $ snd $ B.breakEnd (==':') f5) (B.split '|' f6) f7 (optionals f8)
    f9 (optional f10) (optionals f11) f12 (B.split '|' f13) f14 f15
    (optionals f16) (optional f17)
  where
    [f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17] = B.split '\t' l
    optional x | B.null x = Nothing
               | otherwise = Just x
    optionals x | B.null x = Nothing
                | otherwise = Just $ B.split '|' x
{-# INLINE parseLine #-}
--------------------------------------------------------------------------------
/bioinformatics-toolkit/src/Bio/HiC.hs:
--------------------------------------------------------------------------------
module Bio.HiC
    ( ContactMap(..)
3 | , fromAL 4 | , mkContactMap 5 | , mkContactMap' 6 | 7 | -- * contact matrix normalization 8 | , vcNorm 9 | , vcNormVector 10 | , sqrtNorm 11 | , sqrtNormVector 12 | , obsDivExp 13 | , expectedVector 14 | ) where 15 | 16 | import Bio.SamTools.Bam 17 | import qualified Bio.SamTools.BamIndex as BI 18 | import Control.Monad (forM_, when, liftM, replicateM) 19 | import Control.Monad.Primitive 20 | import Control.Monad.Trans (lift) 21 | import qualified Data.ByteString.Char8 as B 22 | import Data.Binary (Binary(..)) 23 | import Data.Conduit 24 | import qualified Data.Conduit.List as CL 25 | import Data.List (foldl') 26 | import Data.Maybe (fromJust) 27 | import Data.Bits (shiftR) 28 | import qualified Data.Vector.Generic as G 29 | import qualified Data.Vector.Generic.Mutable as GM 30 | import qualified Data.Vector.Unboxed as U 31 | import qualified Data.Matrix.Symmetric as MS 32 | import qualified Data.Matrix.Symmetric.Mutable as MSM 33 | import Statistics.Sample (mean) 34 | 35 | import Bio.Data.Bam 36 | import Bio.Data.Bed 37 | 38 | type Matrix a = MS.SymMatrix U.Vector a 39 | 40 | -- | a specific base in the genome 41 | type Site = (B.ByteString, Int) 42 | 43 | -- | two sites interacting with each other form a contact 44 | type Contact = (Site, Site) 45 | 46 | data ContactMap = ContactMap 47 | { _chroms :: [(B.ByteString, Int)] 48 | , _resolution :: !Int 49 | , _matrix :: !(Matrix Double) 50 | } 51 | 52 | instance Binary ContactMap where 53 | put (ContactMap chroms res mat) = do 54 | put n 55 | mapM_ put chroms 56 | put res 57 | put mat 58 | where 59 | n = length chroms 60 | 61 | get = do 62 | n <- get 63 | chroms <- replicateM n get 64 | res <- get 65 | mat <- get 66 | return $ ContactMap chroms res mat 67 | 68 | -- | Read HiC contact map from associate list 69 | fromAL :: PrimMonad m 70 | => [(B.ByteString, Int)] 71 | -> Int 72 | -> Sink ((Int, Int), Double) m ContactMap 73 | fromAL chrs res = do 74 | mat <- lift $ MSM.replicate (matSize,matSize) 0 75 | CL.mapM_ 
$ \((i,j), v) -> MSM.write mat (i `div` res, j `div` res) v 76 | mat' <- lift $ MS.unsafeFreeze mat 77 | return $ ContactMap chrs res mat' 78 | where 79 | matSize = foldl' (+) 0 $ map (\(_,x) -> (x-1) `div` res + 1) chrs 80 | {-# INLINE fromAL #-} 81 | 82 | {- 83 | mkContactMap' :: FilePath -> [BED3] -> Int -> Int -> Source IO .. 84 | mkContactMap' bamFl regions w extend = do 85 | handle <- liftIO $ BI.open bamFl 86 | forM_ [0..n-1] 87 | where 88 | positions = V.scanl f 0 regions' 89 | n = V.last positions 90 | f acc (BED3 chr s e) = acc + (e - s) `div` w 91 | regions' = V.fromList regions 92 | g i = binarySearch positions i 93 | -} 94 | 95 | -- | O(n * (logN + k)). n = number of bins, N = number of tags. Generate contanct 96 | -- map using constant memory 97 | mkContactMap :: FilePath -> BED3 -> Int -> Int -> Source IO ((Int, Int), Int) 98 | mkContactMap bamFl (BED3 chr s e) w extend = do 99 | handle <- lift $ BI.open bamFl 100 | forM_ [0..n-1] $ \i -> 101 | forM_ [i..n-1] $ \j -> do 102 | let s1 = s + i * w 103 | c1 = s1 + w' 104 | s2 = s + j * w 105 | c2 = s2 + w' 106 | r <- lift $ readCount handle (w'+extend) ((chr, c1), (chr, c2)) 107 | if i == j 108 | then yield ((s1, s2), r `div` 2) 109 | else yield ((s1, s2), r) 110 | where 111 | n = (e - s) `div` w 112 | w' = w `div` 2 113 | 114 | -- | O(N + n). Store matrix in memory, faster when region is big 115 | mkContactMap' :: PrimMonad m => BED3 -> Int -> Int -> Sink Bam1 m (U.Vector Int) 116 | mkContactMap' (BED3 chr s e) w extend = do 117 | vec <- lift $ GM.replicate vecLen 0 118 | loop vec 119 | lift . liftM (G.map (`div` 2)) . 
G.unsafeFreeze $ vec 120 | where 121 | n = (e - s) `div` w 122 | e' = n * w 123 | vecLen = n * (n+1) `div` 2 124 | loop v = do 125 | x <- await 126 | case x of 127 | Just bam -> 128 | let flag = do bamChr <- targetName bam 129 | matChr <- mateTargetName bam 130 | return $ bamChr == chr && matChr == chr 131 | in case flag of 132 | Just True -> do 133 | let (p, pMate) = getStarts bam 134 | a = (p - s) `div` w 135 | b = (pMate - s) `div` w 136 | i | a < b = idx a b 137 | | otherwise = idx b a 138 | when (p >= s && pMate >= s && p < e' && pMate < e') $ 139 | lift $ GM.read v i >>= GM.write v i . (+1) 140 | loop v 141 | _ -> loop v 142 | _ -> return () 143 | idx i j = i * (2 * n - i + 1) `div` 2 + j - i 144 | 145 | -- | get starting location of bam and its mate 146 | getStarts :: Bam1 -> (Int, Int) 147 | getStarts bam = let p1 = fromIntegral . fromJust . position $ bam 148 | p2 = fromIntegral . fromJust . matePosition $ bam 149 | l = fromIntegral . fromJust . queryLength $ bam 150 | p1' | isReverse bam = p1 + l 151 | | otherwise = p1 152 | p2' | isReverse bam = p2 + l 153 | | otherwise = p2 154 | in (p1', p2') 155 | {-# INLINE getStarts #-} 156 | 157 | -- | the number of tag pairs that overlap with the region 158 | -- covered by the contact 159 | readCount :: BI.IdxHandle -- ^ bam file handler 160 | -> Int -- ^ half window width 161 | -> Contact 162 | -> IO Int 163 | readCount handle w ((c1, p1), (c2, p2)) = viewBam handle (c1, p1-w, p1+w) $$ CL.fold f 0 164 | where 165 | f acc x = 166 | case mateTargetName x of 167 | Just chr -> if chr == c2 168 | then let mp = fromIntegral . fromJust . matePosition $ x 169 | l = fromIntegral . fromJust . 
queryLength $ x 170 | in if isOverlapped r2 (mp, mp+l) 171 | then acc + 1 172 | else acc 173 | else acc 174 | _ -> acc 175 | isOverlapped (lo,hi) (lo',hi') = lo' < hi && hi' > lo 176 | r2 = (p2-w, p2+w) 177 | {-# INLINE readCount #-} 178 | 179 | -- | Vanilla coverage normalization (Lieberman-Aiden et al., 2009) 180 | vcNorm :: ContactMap -> ContactMap 181 | vcNorm c = normalizeBy (vcNormVector c) c 182 | 183 | sqrtNorm :: ContactMap -> ContactMap 184 | sqrtNorm c = normalizeBy (sqrtNormVector c) c 185 | 186 | normalizeBy :: U.Vector Double -> ContactMap -> ContactMap 187 | normalizeBy normVec c = c{_matrix = MS.SymMatrix n vec'} 188 | where 189 | (MS.SymMatrix n vec) = _matrix c 190 | vec' = U.create $ do 191 | v <- U.thaw vec 192 | loop 0 0 v 193 | return v 194 | loop i j v | i < n && j < n = do 195 | let i' = idx n i j 196 | x = (normVec U.! i) * (normVec U.! j) 197 | x' | x == 0 = 0 198 | | otherwise = 1 / x 199 | GM.read v i' >>= GM.write v i' . (*x') 200 | loop i (j+1) v 201 | | i < n = loop (i+1) (i+1) v 202 | | otherwise = return () 203 | {-# INLINE normalizeBy #-} 204 | 205 | vcNormVector :: ContactMap -> U.Vector Double 206 | vcNormVector c = U.generate n $ \i -> (U.foldl1' (+) $ mat `MS.takeRow` i) / 1e6 207 | where 208 | mat = _matrix c 209 | n = fst $ MS.dim mat 210 | {-# INLINE vcNormVector #-} 211 | 212 | sqrtNormVector :: ContactMap -> U.Vector Double 213 | sqrtNormVector = U.map sqrt . vcNormVector 214 | {-# INLINE sqrtNormVector #-} 215 | 216 | -- | O/E 217 | obsDivExp :: ContactMap -> ContactMap 218 | obsDivExp c = c{_matrix = MS.SymMatrix n vec'} 219 | where 220 | (MS.SymMatrix n vec) = _matrix c 221 | vec' = U.create $ do 222 | v <- U.thaw vec 223 | loop 0 0 v 224 | return v 225 | loop i j v | i < n && j < n = do 226 | let i' = idx n i j 227 | x = expect `U.unsafeIndex` (j-i) 228 | x' | x == 0 = 0 229 | | otherwise = 1 / x 230 | GM.read v i' >>= GM.write v i' . 
(*x') 231 | loop i (j+1) v 232 | | i < n = loop (i+1) (i+1) v 233 | | otherwise = return () 234 | expect = expectedVector c 235 | 236 | -- | Expected contact frequency between two locus with distance d. 237 | expectedAt :: Int -> ContactMap -> Double 238 | expectedAt d c = mean $ U.generate (n-d) $ \i -> MS.unsafeIndex mat (i,i+d) 239 | where 240 | mat = _matrix c 241 | n = MS.rows mat 242 | {-# INLINE expectedAt #-} 243 | 244 | expectedVector :: ContactMap -> U.Vector Double 245 | expectedVector c = U.generate n $ \d -> expectedAt d c 246 | where 247 | n = MS.rows $ _matrix c 248 | {-# INLINE expectedVector #-} 249 | 250 | -- row major upper triangular indexing 251 | idx :: Int -> Int -> Int -> Int 252 | idx n i j = (i * (2 * n - i - 1)) `shiftR` 1 + j 253 | {-# INLINE idx #-} 254 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/HiC/Visualize.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | module Bio.HiC.Visualize 3 | ( drawHiC 4 | , DrawOpt(..) 
5 | , reds 6 | , blueRed 7 | ) where 8 | 9 | import qualified Data.ByteString.Lazy as L 10 | import qualified Data.Matrix.Generic as MG 11 | import Data.Matrix.Symmetric (SymMatrix(..)) 12 | import Codec.Picture 13 | import Data.Colour 14 | import Data.Colour.Names 15 | import Data.Colour.SRGB 16 | import Data.Default.Class (Default(..)) 17 | import qualified Data.Vector as V 18 | import qualified Data.Vector.Unboxed as U 19 | 20 | import Bio.HiC 21 | 22 | data DrawOpt = DrawOpt 23 | { _range :: !(Maybe (Double, Double)) 24 | , _palette :: !(V.Vector (Colour Double)) 25 | } 26 | 27 | instance Default DrawOpt where 28 | def = DrawOpt 29 | { _range = Nothing 30 | , _palette = reds 31 | } 32 | 33 | reds :: V.Vector (Colour Double) 34 | reds = V.fromList $ interpolate 62 white red 35 | 36 | blueRed :: V.Vector (Colour Double) 37 | blueRed = V.fromList $ interpolate 30 blue white ++ interpolate 30 white red 38 | 39 | drawHiC :: FilePath -> ContactMap -> DrawOpt -> IO () 40 | drawHiC fl m opt = case encodePalettedPng pal pic of 41 | Left e -> error e 42 | Right png -> L.writeFile fl png 43 | where 44 | pic = matToImage nCol (lo,hi) mat 45 | pal = generateImage fn nCol 1 46 | where 47 | fn i _ = colorConvert $ cols V.! i 48 | cols = _palette opt 49 | nCol = V.length $ _palette opt 50 | mat@(SymMatrix _ v) = _matrix m 51 | (lo,hi) = case _range opt of 52 | Just (a,b) -> (a,b) 53 | _ -> (U.minimum v, U.maximum v) 54 | 55 | {- 56 | drawMatrix :: MG.Matrix m v Double => FilePath -> m v Double -> DrawOpt -> IO () 57 | drawMatrix fl mat opt = case encodePalettedPng pal pic of 58 | Left e -> error e 59 | Right png -> L.writeFile fl png 60 | where 61 | pic = matToImage nCol (lo,hi) mat 62 | pal = generateImage fn nCol 1 63 | where 64 | fn i _ = colorConvert $ cols V.! 
i 65 | cols = _palette opt 66 | nCol = V.length $ _palette opt 67 | (lo,hi) = case _range opt of 68 | Just (a,b) -> (a,b) 69 | _ -> (U.minimum v, U.maximum v) 70 | -} 71 | 72 | matToImage :: MG.Matrix m v Double => Int -> (Double, Double) -> m v Double -> Image Pixel8 73 | matToImage n (lo,hi) mat = generateImage drawPixel r c 74 | where 75 | drawPixel i j | x <= lo = 0 76 | | x >= hi = fromIntegral $ n - 1 77 | | otherwise = truncate $ (x - lo) / step 78 | where 79 | x = mat `MG.unsafeIndex` (i,j) 80 | step = (hi - lo) / fromIntegral n 81 | (r,c) = MG.dim mat 82 | {-# INLINE matToImage #-} 83 | 84 | indexedMatToImage :: MG.Matrix m v Pixel8 => m v Pixel8 -> Image Pixel8 85 | indexedMatToImage mat = generateImage fn r c 86 | where 87 | fn i j = mat `MG.unsafeIndex` (i,j) 88 | (r,c) = MG.dim mat 89 | {-# INLINE indexedMatToImage #-} 90 | 91 | colorConvert :: Colour Double -> PixelRGB8 92 | colorConvert c = let RGB r g b = toSRGB24 c 93 | in PixelRGB8 r g b 94 | {-# INLINE colorConvert #-} 95 | 96 | {- 97 | colorBlend :: Int -> V.Vector (Colour Double) -> V.Vector (Colour Double) 98 | colorBlend n colors | n <= m = colors 99 | | otherwise = 100 | where 101 | m = V.length colors 102 | -} 103 | 104 | interpolate :: Int -> Colour Double -> Colour Double -> [Colour Double] 105 | interpolate n c1 c2 = loop 1 106 | where 107 | loop i | i > n = [] 108 | | otherwise = blend (fromIntegral i * step) c2 c1 : loop (i+1) 109 | step = 1 / fromIntegral (n+1) 110 | {-# INLINE interpolate #-} 111 | 112 | -- | map numbers to colors 113 | colorMapSmooth :: Double -- a value from 0 to 1 114 | -> V.Vector (Colour Double) -> Colour Double 115 | colorMapSmooth x colors = blend p (colors V.! i) $ colors V.! 
(i+1) 116 | where 117 | p = fromIntegral i - x * (fromIntegral n - 1) + 1 118 | i | x == 1 = n - 2 119 | | otherwise = truncate $ x * (fromIntegral n - 1) 120 | n = V.length colors 121 | {-# INLINE colorMapSmooth #-} 122 | 123 | linearMapBound :: (Double, Double) -> (Double, Double) -> Double -> Double 124 | linearMapBound (l, u) (l', u') x 125 | | isNaN x || x < l = l' 126 | | x > u = u' 127 | | otherwise = (x - l) / (u - l) * (u' - l') + l' 128 | {-# INLINE linearMapBound #-} 129 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/HiC/Visualize/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | {-# LANGUAGE Rank2Types #-} 3 | {-# LANGUAGE CPP #-} 4 | -- most of the codes in this file are directly copied from JuicyPixel 5 | 6 | module Bio.HiC.Visualize.Internal where 7 | 8 | #if !MIN_VERSION_base(4,8,0) 9 | import Foreign.ForeignPtr.Safe( ForeignPtr, castForeignPtr ) 10 | #else 11 | import Foreign.ForeignPtr( ForeignPtr, castForeignPtr ) 12 | #endif 13 | 14 | import Foreign.Storable( Storable, sizeOf ) 15 | import Control.Monad (when) 16 | import Data.Binary (Binary(..), Get) 17 | import Data.Binary.Get( getWord8 18 | , getWord32be 19 | , getLazyByteString 20 | ) 21 | import Data.Binary.Put( runPut 22 | , putWord8 23 | , putWord32be 24 | , putLazyByteString 25 | ) 26 | import Data.Bits( xor, (.&.), unsafeShiftR ) 27 | import qualified Data.Vector.Unboxed as U 28 | import Data.Word 29 | import Data.List (foldl') 30 | import qualified Data.ByteString as B 31 | import qualified Data.ByteString.Lazy as L 32 | import qualified Data.ByteString.Lazy.Char8 as LS 33 | import Data.Vector.Storable ( Vector, unsafeToForeignPtr, unsafeFromForeignPtr0 ) 34 | import qualified Data.ByteString.Internal as S 35 | import qualified Data.Vector.Generic as G 36 | 37 | import Data.Conduit 38 | 39 | -- | Value used to identify a png chunk, 
must be 4 bytes long. 40 | type ChunkSignature = L.ByteString 41 | 42 | -- | Generic header used in PNG images. 43 | data PngIHdr = PngIHdr 44 | { width :: !Word32 -- ^ Image width in number of pixel 45 | , height :: !Word32 -- ^ Image height in number of pixel 46 | , bitDepth :: !Word8 -- ^ Number of bit per sample 47 | , colourType :: !PngImageType -- ^ Kind of png image (greyscale, true color, indexed...) 48 | , compressionMethod :: !Word8 -- ^ Compression method used 49 | , filterMethod :: !Word8 -- ^ Must be 0 50 | , interlaceMethod :: !PngInterlaceMethod -- ^ If the image is interlaced (for progressive rendering) 51 | } 52 | deriving Show 53 | 54 | -- | Data structure during real png loading/parsing 55 | data PngRawChunk = PngRawChunk 56 | { chunkLength :: Word32 57 | , chunkType :: ChunkSignature 58 | , chunkCRC :: Word32 59 | , chunkData :: L.ByteString 60 | } 61 | 62 | -- | What kind of information is encoded in the IDAT section 63 | -- of the PngFile 64 | data PngImageType = 65 | PngGreyscale 66 | | PngTrueColour 67 | | PngIndexedColor 68 | | PngGreyscaleWithAlpha 69 | | PngTrueColourWithAlpha 70 | deriving Show 71 | 72 | -- | Different known interlace methods for PNG image 73 | data PngInterlaceMethod = 74 | -- | No interlacing, basic data ordering, line by line 75 | -- from left to right. 
76 | PngNoInterlace 77 | 78 | -- | Use the Adam7 ordering, see `adam7Reordering` 79 | | PngInterlaceAdam7 80 | deriving (Enum, Show) 81 | 82 | preparePngHeader :: Int -> Int -> PngImageType -> Word8 -> PngIHdr 83 | preparePngHeader w h imgType depth = PngIHdr 84 | { width = fromIntegral w 85 | , height = fromIntegral h 86 | , bitDepth = depth 87 | , colourType = imgType 88 | , compressionMethod = 0 89 | , filterMethod = 0 90 | , interlaceMethod = PngNoInterlace 91 | } 92 | 93 | instance Binary PngRawChunk where 94 | put chunk = do 95 | putWord32be $ chunkLength chunk 96 | putLazyByteString $ chunkType chunk 97 | when (chunkLength chunk /= 0) 98 | (putLazyByteString $ chunkData chunk) 99 | putWord32be $ chunkCRC chunk 100 | 101 | get = do 102 | size <- getWord32be 103 | chunkSig <- getLazyByteString (fromIntegral $ L.length iHDRSignature) 104 | imgData <- if size == 0 105 | then return L.empty 106 | else getLazyByteString (fromIntegral size) 107 | crc <- getWord32be 108 | 109 | let computedCrc = pngComputeCrc [chunkSig, imgData] 110 | when (computedCrc `xor` crc /= 0) 111 | (fail $ "Invalid CRC : " ++ show computedCrc ++ ", " 112 | ++ show crc) 113 | return PngRawChunk { 114 | chunkLength = size, 115 | chunkData = imgData, 116 | chunkCRC = crc, 117 | chunkType = chunkSig 118 | } 119 | 120 | instance Binary PngImageType where 121 | put PngGreyscale = putWord8 0 122 | put PngTrueColour = putWord8 2 123 | put PngIndexedColor = putWord8 3 124 | put PngGreyscaleWithAlpha = putWord8 4 125 | put PngTrueColourWithAlpha = putWord8 6 126 | 127 | get = get >>= imageTypeOfCode 128 | 129 | imageTypeOfCode :: Word8 -> Get PngImageType 130 | imageTypeOfCode 0 = return PngGreyscale 131 | imageTypeOfCode 2 = return PngTrueColour 132 | imageTypeOfCode 3 = return PngIndexedColor 133 | imageTypeOfCode 4 = return PngGreyscaleWithAlpha 134 | imageTypeOfCode 6 = return PngTrueColourWithAlpha 135 | imageTypeOfCode _ = fail "Invalid png color code" 136 | 137 | instance Binary PngIHdr where 
138 | put hdr = do 139 | putWord32be 13 140 | let inner = runPut $ do 141 | putLazyByteString iHDRSignature 142 | putWord32be $ width hdr 143 | putWord32be $ height hdr 144 | putWord8 $ bitDepth hdr 145 | put $ colourType hdr 146 | put $ compressionMethod hdr 147 | put $ filterMethod hdr 148 | put $ interlaceMethod hdr 149 | crc = pngComputeCrc [inner] 150 | putLazyByteString inner 151 | putWord32be crc 152 | 153 | get = do 154 | _size <- getWord32be 155 | ihdrSig <- getLazyByteString (L.length iHDRSignature) 156 | when (ihdrSig /= iHDRSignature) 157 | (fail "Invalid PNG file, wrong ihdr") 158 | w <- getWord32be 159 | h <- getWord32be 160 | depth <- get 161 | colorType <- get 162 | compression <- get 163 | filtermethod <- get 164 | interlace <- get 165 | _crc <- getWord32be 166 | return PngIHdr { 167 | width = w, 168 | height = h, 169 | bitDepth = depth, 170 | colourType = colorType, 171 | compressionMethod = compression, 172 | filterMethod = filtermethod, 173 | interlaceMethod = interlace 174 | } 175 | 176 | instance Binary PngInterlaceMethod where 177 | get = getWord8 >>= \w -> case w of 178 | 0 -> return PngNoInterlace 179 | 1 -> return PngInterlaceAdam7 180 | _ -> fail "Invalid interlace method" 181 | 182 | put PngNoInterlace = putWord8 0 183 | put PngInterlaceAdam7 = putWord8 1 184 | 185 | -- signature 186 | 187 | -- | Signature signalling that the following data will be a png image 188 | -- in the png bit stream 189 | pngSignature :: ChunkSignature 190 | pngSignature = L.pack [137, 80, 78, 71, 13, 10, 26, 10] 191 | 192 | -- | Helper function to help pack signatures. 193 | signature :: String -> ChunkSignature 194 | signature = LS.pack 195 | 196 | -- | Signature for the header chunk of png (must be the first) 197 | iHDRSignature :: ChunkSignature 198 | iHDRSignature = signature "IHDR" 199 | 200 | -- | Signature for a palette chunk in the pgn file. Must 201 | -- occure before iDAT. 
202 | pLTESignature :: ChunkSignature 203 | pLTESignature = signature "PLTE" 204 | 205 | -- | Signature for a data chuck (with image parts in it) 206 | iDATSignature :: ChunkSignature 207 | iDATSignature = signature "IDAT" 208 | 209 | -- | Signature for the last chunk of a png image, telling 210 | -- the end. 211 | iENDSignature :: ChunkSignature 212 | iENDSignature = signature "IEND" 213 | 214 | -- | Compute the CRC of a raw buffer, as described in annex D of the PNG 215 | -- specification. 216 | pngComputeCrc :: [L.ByteString] -> Word32 217 | pngComputeCrc = (0xFFFFFFFF `xor`) . L.foldl' updateCrc 0xFFFFFFFF . L.concat 218 | where updateCrc crc val = 219 | let u32Val = fromIntegral val 220 | lutVal = pngCrcTable U.! fromIntegral ((crc `xor` u32Val) .&. 0xFF) 221 | in lutVal `xor` (crc `unsafeShiftR` 8) 222 | 223 | -- | From the Annex D of the png specification. 224 | pngCrcTable :: U.Vector Word32 225 | pngCrcTable = U.fromListN 256 [ foldl' updateCrcConstant c [zero .. 7] | c <- [0 .. 255] ] 226 | where zero = 0 :: Int -- To avoid defaulting to Integer 227 | updateCrcConstant c _ | c .&. 1 /= 0 = magicConstant `xor` (c `unsafeShiftR` 1) 228 | | otherwise = c `unsafeShiftR` 1 229 | magicConstant = 0xedb88320 :: Word32 230 | 231 | 232 | 233 | ---------------------------------------------------------------------------- 234 | 235 | endChunk :: PngRawChunk 236 | endChunk = PngRawChunk { chunkLength = 0 237 | , chunkType = iENDSignature 238 | , chunkCRC = pngComputeCrc [iENDSignature] 239 | , chunkData = L.empty 240 | } 241 | 242 | type Palette = Vector Word8 243 | 244 | preparePalette :: Palette -> PngRawChunk 245 | preparePalette pal = PngRawChunk 246 | { chunkLength = fromIntegral $ G.length pal 247 | , chunkType = pLTESignature 248 | , chunkCRC = pngComputeCrc [pLTESignature, binaryData] 249 | , chunkData = binaryData 250 | } 251 | where binaryData = L.fromChunks [toByteString pal] 252 | 253 | toByteString :: forall a. 
(Storable a) => Vector a -> B.ByteString 254 | toByteString vec = S.PS (castForeignPtr ptr) offset (len * size) 255 | where (ptr, offset, len) = unsafeToForeignPtr vec 256 | size = sizeOf (undefined :: a) 257 | 258 | generatePng :: Int -> Int -> Palette -> (Int -> Int -> Word8) -> Source m L.ByteString 259 | generatePng w h pal fn = do 260 | yield pngSignature 261 | yield $ encode header 262 | yield $ encode $ preparePalette pal 263 | loop 0 0 264 | yield $ encode endChunk 265 | where 266 | header = preparePngHeader w h PngIndexedColor 8 267 | loop i j = do 268 | fn i j 269 | 270 | genericEncodePng :: forall px. (Pixel px, PixelBaseComponent px ~ Word8) 271 | => Int -> Int -> Maybe Palette -> PngImageType -> Image px 272 | -> L.ByteString 273 | genericEncodePng w h palette imgKind 274 | image@(Image { imageWidth = w, imageHeight = h, imageData = arr }) = 275 | encode PngRawImage { header = hdr 276 | , chunks = prependPalette palette [prepareIDatChunk imgEncodedData, endChunk]} 277 | where hdr = preparePngHeader w h image imgKind 8 278 | zero = B.singleton 0 279 | 280 | compCount = componentCount (undefined :: px) 281 | 282 | prependPalette Nothing l = l 283 | prependPalette (Just p) l = preparePalette p : l 284 | 285 | lineSize = compCount * w 286 | encodeLine line = blitVector arr (line * lineSize) lineSize 287 | imgEncodedData = Z.compress . L.fromChunks 288 | $ concat [[zero, encodeLine line] | line <- [0 .. 
h - 1]] 289 | 290 | prepareIDatChunk :: Lb.ByteString -> PngRawChunk 291 | prepareIDatChunk imgData = PngRawChunk 292 | { chunkLength = fromIntegral $ Lb.length imgData 293 | , chunkType = iDATSignature 294 | , chunkCRC = pngComputeCrc [iDATSignature, imgData] 295 | , chunkData = imgData 296 | } 297 | 298 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Motif/Alignment.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE Rank2Types #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE BangPatterns #-} 5 | module Bio.Motif.Alignment 6 | ( alignment 7 | , alignmentBy 8 | , linPenal 9 | , quadPenal 10 | , cubPenal 11 | , expPenal 12 | , l1 13 | , l2 14 | , l3 15 | , lInf 16 | , AlignFn 17 | , CombineFn 18 | ) where 19 | 20 | import qualified Data.Vector.Generic as G 21 | import qualified Data.Vector.Unboxed as U 22 | import qualified Data.Matrix.Unboxed as M 23 | import Statistics.Sample (mean) 24 | 25 | import Bio.Motif 26 | import Bio.Utils.Functions 27 | 28 | -- | penalty function takes the number of gaps and matched positions as input, 29 | -- return penalty value 30 | type PenalFn = Int -> Int -> Double 31 | 32 | type DistanceFn = forall v. (G.Vector v Double, G.Vector v (Double, Double)) 33 | => v Double -> v Double -> Double 34 | 35 | type AlignFn = PWM 36 | -> PWM 37 | -> (Double, (Bool, Int)) -- ^ (distance, (on same direction, 38 | -- position w.r.t. 
the first pwm)) 39 | 40 | -- | combine distances from different positions of alignment 41 | type CombineFn = U.Vector Double -> Double 42 | 43 | alignment :: AlignFn 44 | alignment = alignmentBy jsd (expPenal 0.05) l1 45 | 46 | -- | linear penalty 47 | linPenal :: Double -> PenalFn 48 | linPenal x nGap nMatch = fromIntegral nGap * x / fromIntegral nMatch 49 | {-# INLINE linPenal #-} 50 | 51 | -- | quadratic penalty 52 | quadPenal :: Double -> PenalFn 53 | quadPenal x nGap nMatch = fromIntegral (nGap ^ (2 :: Int)) * x / fromIntegral nMatch 54 | {-# INLINE quadPenal #-} 55 | 56 | -- | cubic penalty 57 | cubPenal :: Double -> PenalFn 58 | cubPenal x nGap nMatch = fromIntegral (nGap ^ (3 :: Int)) * x / fromIntegral nMatch 59 | {-# INLINE cubPenal #-} 60 | 61 | -- | exponential penalty 62 | expPenal :: Double -> PenalFn 63 | expPenal x nGap nMatch = fromIntegral (2^nGap - 1 :: Int) * x / fromIntegral nMatch 64 | {-# INLINE expPenal #-} 65 | 66 | l1 :: CombineFn 67 | l1 = mean 68 | {-# INLINE l1 #-} 69 | 70 | l2 :: CombineFn 71 | l2 = sqrt . mean . U.map (**2) 72 | {-# INLINE l2 #-} 73 | 74 | l3 :: CombineFn 75 | l3 = (**(1/3)) . mean . 
U.map (**3) 76 | {-# INLINE l3 #-} 77 | 78 | lInf :: CombineFn 79 | lInf = U.maximum 80 | {-# INLINE lInf #-} 81 | 82 | -- internal gaps are not allowed, larger score means larger distance, so the smaller the better 83 | alignmentBy :: DistanceFn -- ^ compute the distance between two aligned pwms 84 | -> PenalFn -- ^ gap penalty 85 | -> CombineFn 86 | -> AlignFn 87 | alignmentBy fn pFn combFn m1 m2 88 | | fst forwardAlign <= fst reverseAlign = 89 | (fst forwardAlign, (True, snd forwardAlign)) 90 | | otherwise = (fst reverseAlign, (False, snd reverseAlign)) 91 | where 92 | forwardAlign | d1 < d2 = (d1,i1) 93 | | otherwise = (d2,-i2) 94 | where 95 | (d1,i1) = loop opti2 (1/0,-1) s2 s1 0 96 | (d2,i2) = loop opti1 (1/0,-1) s1 s2 0 97 | reverseAlign | d1 < d2 = (d1,i1) 98 | | otherwise = (d2,-i2) 99 | where 100 | (d1,i1) = loop opti2 (1/0,-1) s2' s1 0 101 | (d2,i2) = loop opti1 (1/0,-1) s1 s2' 0 102 | 103 | loop opti (min',i') a b@(_:xs) !i 104 | | opti U.! i >= min' = (min',i') 105 | | d < min' = loop opti (d,i) a xs (i+1) 106 | | otherwise = loop opti (min',i') a xs (i+1) 107 | where 108 | d = combFn sc + pFn nGap nMatch 109 | sc = U.fromList $ zipWith fn a b 110 | nMatch = U.length sc 111 | nGap = n1 + n2 - 2 * nMatch 112 | loop _ acc _ _ _ = acc 113 | 114 | opti1 = optimalSc n1 n2 115 | opti2 = optimalSc n2 n1 116 | 117 | optimalSc x y = U.fromList $ scanr1 f $ go 0 118 | where 119 | f v min' = min v min' 120 | go i | nM == 0 = [] 121 | | otherwise = pFn nG nM : go (i+1) 122 | where 123 | nM = min x $ y - i 124 | nG = i + abs (x - (y-i)) 125 | 126 | s1 = M.toRows . _mat $ m1 127 | s2 = M.toRows . _mat $ m2 128 | s2' = M.toRows . 
_mat $ m2' 129 | m2' = rcPWM m2 130 | n1 = length s1 131 | n2 = length s2 132 | {-# INLINE alignmentBy #-} 133 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Motif/Merge.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | module Bio.Motif.Merge 4 | ( mergePWM 5 | , mergePWMWeighted 6 | , dilute 7 | , trim 8 | , mergeTree 9 | , mergeTreeWeighted 10 | , iterativeMerge 11 | , buildTree 12 | , cutTreeBy 13 | )where 14 | 15 | import AI.Clustering.Hierarchical hiding (size) 16 | import Control.Arrow (first) 17 | import Control.Monad (forM_, when) 18 | import Control.Monad.ST (runST, ST) 19 | import qualified Data.ByteString.Char8 as B 20 | import Data.List (dropWhileEnd) 21 | import qualified Data.Matrix.Symmetric.Generic.Mutable as MSU 22 | import qualified Data.Matrix.Unboxed as MU 23 | import Data.Maybe 24 | import qualified Data.Vector as V 25 | import qualified Data.Vector.Mutable as VM 26 | import qualified Data.Vector.Unboxed as U 27 | 28 | import Bio.Motif 29 | import Bio.Motif.Alignment 30 | import Bio.Utils.Functions (kld) 31 | 32 | mergePWM :: (PWM, PWM, Int) -> PWM 33 | mergePWM (m1, m2, shift) | shift >= 0 = merge shift (_mat m1) $ _mat m2 34 | | otherwise = merge (-shift) (_mat m2) $ _mat m1 35 | where 36 | merge s a b = PWM Nothing $ MU.fromRows $ loop 0 37 | where 38 | n1 = MU.rows a 39 | n2 = MU.rows b 40 | loop i | i' < 0 || (i < n1 && i' >= n2) = MU.takeRow a i : loop (i+1) 41 | | i < n1 && i' < n2 = f (MU.takeRow a i) (MU.takeRow b i') : loop (i+1) 42 | | i >= n1 && i' < n2 = MU.takeRow b i' : loop (i+1) 43 | | otherwise = [] 44 | where 45 | i' = i - s 46 | f = U.zipWith (\x y -> (x+y)/2) 47 | 48 | mergePWMWeighted :: (PWM, [Int]) -- ^ pwm and weights at each position 49 | -> (PWM, [Int]) 50 | -> Int -- ^ shift 51 | -> (PWM, [Int]) 52 | mergePWMWeighted m1 m2 shift 53 | | 
shift >= 0 = merge shift (first _mat m1) $ first _mat m2 54 | | otherwise = merge (-shift) (first _mat m2) $ first _mat m1 55 | 56 | where 57 | merge s (p1,w1) (p2,w2) = first (PWM Nothing . MU.fromRows) $ unzip $ loop 0 58 | where 59 | a = V.fromList $ zip (MU.toRows p1) w1 60 | b = V.fromList $ zip (MU.toRows p2) w2 61 | n1 = V.length a 62 | n2 = V.length b 63 | loop i | i' < 0 || (i < n1 && i' >= n2) = a V.! i : loop (i+1) 64 | | i < n1 && i' < n2 = f (a V.! i) (b V.! i') : loop (i+1) 65 | | i >= n1 && i' < n2 = b V.! i' : loop (i+1) 66 | | otherwise = [] 67 | where 68 | i' = i - s 69 | f (xs, wx) (ys, wy) = (U.zipWith (\x y -> 70 | (fromIntegral wx * x + fromIntegral wy * y) / 71 | fromIntegral (wx + wy)) xs ys, wx + wy) 72 | 73 | -- | dilute positions in a PWM that are associated with low weights 74 | dilute :: (PWM, [Int]) -> PWM 75 | dilute (pwm, ws) = PWM Nothing $ MU.fromRows $ zipWith f ws $ MU.toRows $ _mat pwm 76 | where 77 | f w r | w < n = let d = fromIntegral $ n - w 78 | in U.map (\x -> (fromIntegral w * x + 0.25 * d) / fromIntegral n) r 79 | | otherwise = r 80 | n = maximum ws 81 | {-# INLINE dilute #-} 82 | 83 | trim :: Bkgd -> Double -> PWM -> PWM 84 | trim (BG (a,c,g,t)) cutoff pwm = PWM Nothing $ MU.fromRows $ dropWhileEnd f $ 85 | dropWhile f rs 86 | where 87 | f x = kld x bg < cutoff 88 | rs = MU.toRows $ _mat pwm 89 | bg = U.fromList [a,c,g,t] 90 | {-# INLINE trim #-} 91 | 92 | mergeTree :: AlignFn -> Dendrogram Motif -> PWM 93 | mergeTree align t = case t of 94 | Branch _ _ left right -> f (mergeTree align left) $ mergeTree align right 95 | Leaf a -> _pwm a 96 | where 97 | f a b | isSame = mergePWM (a, b, i) 98 | | otherwise = mergePWM (a, rcPWM b, i) 99 | where (_, (isSame, i)) = align a b 100 | 101 | mergeTreeWeighted :: AlignFn -> Dendrogram Motif -> (PWM, [Int]) 102 | mergeTreeWeighted align t = case t of 103 | Branch _ _ left right -> f (mergeTreeWeighted align left) $ 104 | mergeTreeWeighted align right 105 | Leaf a -> (_pwm a, 
replicate (size $ _pwm a) 1) 106 | where 107 | f (a,w1) (b,w2) 108 | | isSame = mergePWMWeighted (a,w1) (b,w2) i 109 | | otherwise = mergePWMWeighted (a,w1) (rcPWM b, reverse w2) i 110 | where (_, (isSame, i)) = align a b 111 | {-# INLINE mergeTreeWeighted #-} 112 | 113 | iterativeMerge :: AlignFn 114 | -> Double -- cutoff 115 | -> [Motif] -- ^ Motifs to be merged. Motifs must have unique name. 116 | -> [([B.ByteString], PWM, [Int])] 117 | iterativeMerge align th motifs = runST $ do 118 | motifs' <- V.unsafeThaw $ V.fromList $ flip map motifs $ \x -> 119 | Just ([_name x], _pwm x, replicate (size $ _pwm x) 1) 120 | 121 | let n = VM.length motifs' 122 | iter mat = do 123 | -- retrieve the minimum value 124 | ((i, j), (d, (isSame, pos))) <- loop ((-1,-1), (1/0, undefined)) 0 1 125 | if d < th 126 | then do 127 | Just (nm1, pwm1, w1) <- VM.unsafeRead motifs' i 128 | Just (nm2, pwm2, w2) <- VM.unsafeRead motifs' j 129 | let merged = (nm1 ++ nm2, pwm', w') 130 | (pwm',w') | isSame = mergePWMWeighted (pwm1, w1) (pwm2, w2) pos 131 | | otherwise = mergePWMWeighted (pwm1, w1) 132 | (rcPWM $ pwm2, reverse w2) pos 133 | 134 | -- update 135 | forM_ [0..n-1] $ \i' -> MSU.unsafeWrite mat (i',j) Nothing 136 | VM.unsafeWrite motifs' i $ Just merged 137 | VM.unsafeWrite motifs' j Nothing 138 | forM_ [0..n-1] $ \j' -> when (i /= j') $ do 139 | x <- VM.unsafeRead motifs' j' 140 | case x of 141 | Just (_, pwm2',_) -> do 142 | let ali | i < j' = Just $ align pwm' pwm2' 143 | | otherwise = Just $ align pwm2' pwm' 144 | MSU.unsafeWrite mat (i,j') ali 145 | _ -> return () 146 | iter mat 147 | else return () 148 | where 149 | loop ((i_min, j_min), d_min) i j 150 | | i >= n = return ((i_min, j_min), d_min) 151 | | j >= n = loop ((i_min, j_min), d_min) (i+1) (i+2) 152 | | otherwise = do 153 | x <- MSU.unsafeRead mat (i,j) 154 | case x of 155 | Just d -> if fst d < fst d_min 156 | then loop ((i,j), d) i (j+1) 157 | else loop ((i_min, j_min), d_min) i (j+1) 158 | _ -> loop ((i_min, j_min), 
d_min) i (j+1) 159 | 160 | -- initialization 161 | mat <- MSU.replicate (n,n) Nothing :: ST s (MSU.SymMMatrix VM.MVector s (Maybe (Double, (Bool, Int)))) 162 | forM_ [0..n-1] $ \i -> forM_ [i+1 .. n-1] $ \j -> do 163 | Just (_, pwm1, _) <- VM.unsafeRead motifs' i 164 | Just (_, pwm2, _) <- VM.unsafeRead motifs' j 165 | MSU.unsafeWrite mat (i,j) $ Just $ align pwm1 pwm2 166 | 167 | iter mat 168 | results <- V.unsafeFreeze motifs' 169 | return $ V.toList $ V.map fromJust $ V.filter isJust results 170 | {-# INLINE iterativeMerge #-} 171 | 172 | -- | build a guide tree from a set of motifs 173 | buildTree :: AlignFn -> [Motif] -> Dendrogram Motif 174 | buildTree align motifs = hclust Average (V.fromList motifs) δ 175 | where 176 | δ (Motif _ x) (Motif _ y) = fst $ align x y 177 | 178 | cutTreeBy :: Double -- ^ start 179 | -> Double -- ^ step 180 | -> ([Dendrogram a] -> Bool) -> Dendrogram a -> [Dendrogram a] 181 | cutTreeBy start step fn tree = go start 182 | where 183 | go x | fn clusters = clusters 184 | | x - step > 0 = go $ x - step 185 | | otherwise = clusters 186 | where clusters = cutAt tree x 187 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Motif/Search.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | module Bio.Motif.Search 5 | ( findTFBS 6 | , findTFBSWith 7 | , findTFBSSlow 8 | , maxMatchSc 9 | , optimalScoresSuffix 10 | ) where 11 | 12 | import Conduit 13 | import qualified Data.ByteString.Char8 as B 14 | import qualified Data.Matrix.Unboxed as M 15 | import Data.Ord (comparing) 16 | import qualified Data.Vector.Unboxed as U 17 | 18 | import Bio.Motif (Bkgd (..), PWM (..), scores', size) 19 | import Bio.Seq (DNA, toBS) 20 | import qualified Bio.Seq as Seq (length) 21 | 22 | -- | given a user defined threshold, look for TF binding sites on a DNA 23 | -- sequence, using look 
ahead search. This function doesn't search for binding 24 | -- sites on the reverse strand 25 | findTFBS :: Monad m 26 | => Bkgd 27 | -> PWM 28 | -> DNA a 29 | -> Double 30 | -> Bool -- ^ whether to skip ambiguous sequences. Recommend: True 31 | -- in most cases 32 | -> ConduitT i (Int, Double) m () 33 | findTFBS bg pwm dna thres skip = findTFBSWith sigma bg pwm dna thres skip 34 | where 35 | sigma = optimalScoresSuffix bg pwm 36 | {-# INLINE findTFBS #-} 37 | 38 | findTFBSWith :: Monad m 39 | => U.Vector Double -- ^ best possible match score of suffixes 40 | -> Bkgd 41 | -> PWM 42 | -> DNA a 43 | -> Double 44 | -> Bool -- ^ whether to skip ambiguous sequences. Recommend: True 45 | -- in most cases 46 | -> ConduitT i (Int, Double) m () 47 | findTFBSWith sigma bg pwm dna thres skip = loop 0 48 | where 49 | loop !i | i >= l - n + 1 = return () 50 | | otherwise = do let (d, sc) = searchFn bg pwm sigma dna i thres 51 | if d == n - 1 52 | then yield (i, sc) >> loop (i+1) 53 | else loop (i+1) 54 | l = Seq.length dna 55 | n = size pwm 56 | searchFn | skip = lookAheadSearch' 57 | | otherwise = lookAheadSearch 58 | {-# INLINE findTFBSWith #-} 59 | 60 | {- 61 | -- | a parallel version of findTFBS 62 | findTFBS' :: Bkgd 63 | -> PWM 64 | -> DNA a 65 | -> Double 66 | -> Bool 67 | -> [Int] 68 | findTFBS' bg pwm dna th skip = concat $ parMap rdeepseq f [0,step..l-n+1] 69 | where 70 | f x = loop x 71 | where 72 | loop i | i >= x+step || i >= l-n+1 = [] 73 | | otherwise = let d = fst $ searchFn bg pwm sigma dna i th 74 | in if d == n-1 75 | then i : loop (i+1) 76 | else loop (i+1) 77 | sigma = optimalScoresSuffix bg pwm 78 | l = Seq.length dna 79 | n = size pwm 80 | step = 500000 81 | searchFn | skip = lookAheadSearch' 82 | | otherwise = lookAheadSearch 83 | {-# INLINE findTFBS' #-} 84 | -} 85 | 86 | -- | use naive search 87 | findTFBSSlow :: Monad m => Bkgd -> PWM -> DNA a -> Double -> ConduitT i (Int, Double) m () 88 | findTFBSSlow bg pwm dna thres = scores' bg pwm dna .| loop 0 
89 | where 90 | loop i = do v <- await 91 | case v of 92 | Just v' -> if v' >= thres then yield (i, v') >> loop (i+1) 93 | else loop (i+1) 94 | _ -> return () 95 | {-# INLINE findTFBSSlow #-} 96 | 97 | -- | the largest possible match scores starting from every position of a DNA sequence 98 | maxMatchSc :: Bkgd -> PWM -> DNA a -> Double 99 | maxMatchSc bg pwm dna = loop (-1/0) 0 100 | where 101 | loop !max' !i | i >= l - n + 1 = max' 102 | | otherwise = if d == n - 1 then loop sc (i+1) 103 | else loop max' (i+1) 104 | where 105 | (d, sc) = lookAheadSearch bg pwm sigma dna i max' 106 | sigma = optimalScoresSuffix bg pwm 107 | l = Seq.length dna 108 | n = size pwm 109 | {-# INLINE maxMatchSc #-} 110 | 111 | optimalScoresSuffix :: Bkgd -> PWM -> U.Vector Double 112 | optimalScoresSuffix (BG (a, c, g, t)) (PWM _ pwm) = 113 | U.fromList . tail . map (last sigma -) $ sigma 114 | where 115 | sigma = scanl f 0 $ M.toRows pwm 116 | f !acc xs = let (i, s) = U.maximumBy (comparing snd) . 117 | U.zip (U.fromList ([0..3] :: [Int])) $ xs 118 | in acc + case i of 119 | 0 -> log $ s / a 120 | 1 -> log $ s / c 121 | 2 -> log $ s / g 122 | 3 -> log $ s / t 123 | _ -> undefined 124 | {-# INLINE optimalScoresSuffix #-} 125 | 126 | lookAheadSearch :: Bkgd -- ^ background nucleotide distribution 127 | -> PWM -- ^ pwm 128 | -> U.Vector Double -- ^ best possible match score of suffixes 129 | -> DNA a -- ^ DNA sequence 130 | -> Int -- ^ starting location on the DNA 131 | -> Double -- ^ threshold 132 | -> (Int, Double) -- ^ (d, sc_d), the largest d such that sc_d > th_d 133 | lookAheadSearch (BG (a, c, g, t)) pwm sigma dna start thres = loop (0, -1) 0 134 | where 135 | loop (!acc, !th_d) !d 136 | | acc < th_d = (d-2, acc) 137 | | otherwise = if d >= n 138 | then (d-1, acc) 139 | else loop (acc + sc, thres - sigma U.! d) (d+1) 140 | where 141 | sc = case toBS dna `B.index` (start + d) of 142 | 'A' -> log $! matchA / a 143 | 'C' -> log $! matchC / c 144 | 'G' -> log $! 
matchG / g 145 | 'T' -> log $! matchT / t 146 | 'N' -> 0 147 | 'V' -> log $! (matchA + matchC + matchG) / (a + c + g) 148 | 'H' -> log $! (matchA + matchC + matchT) / (a + c + t) 149 | 'D' -> log $! (matchA + matchG + matchT) / (a + g + t) 150 | 'B' -> log $! (matchC + matchG + matchT) / (c + g + t) 151 | 'M' -> log $! (matchA + matchC) / (a + c) 152 | 'K' -> log $! (matchG + matchT) / (g + t) 153 | 'W' -> log $! (matchA + matchT) / (a + t) 154 | 'S' -> log $! (matchC + matchG) / (c + g) 155 | 'Y' -> log $! (matchC + matchT) / (c + t) 156 | 'R' -> log $! (matchA + matchG) / (a + g) 157 | _ -> error "Bio.Motif.Search.lookAheadSearch: invalid nucleotide" 158 | matchA = addSome $ M.unsafeIndex (_mat pwm) (d,0) 159 | matchC = addSome $ M.unsafeIndex (_mat pwm) (d,1) 160 | matchG = addSome $ M.unsafeIndex (_mat pwm) (d,2) 161 | matchT = addSome $ M.unsafeIndex (_mat pwm) (d,3) 162 | addSome !x | x == 0 = pseudoCount 163 | | otherwise = x 164 | pseudoCount = 0.0001 165 | n = size pwm 166 | {-# INLINE lookAheadSearch #-} 167 | 168 | -- | this version skip sequences contain ambiguous bases, like "N" 169 | lookAheadSearch' :: Bkgd -- ^ background nucleotide distribution 170 | -> PWM -- ^ pwm 171 | -> U.Vector Double -- ^ best possible match score of suffixes 172 | -> DNA a -- ^ DNA sequence 173 | -> Int -- ^ starting location on the DNA 174 | -> Double -- ^ threshold 175 | -> (Int, Double) -- ^ (d, sc_d), the largest d such that sc_d > th_d 176 | lookAheadSearch' (BG (a, c, g, t)) pwm sigma dna start thres = loop (0, -1) 0 177 | where 178 | loop (!acc, !th_d) !d 179 | | acc < th_d = (d-2, acc) 180 | | otherwise = if d >= n 181 | then (d-1, acc) 182 | else loop (acc + sc, thres - sigma U.! d) (d+1) 183 | where 184 | sc = case toBS dna `B.index` (start + d) of 185 | 'A' -> log $! matchA / a 186 | 'C' -> log $! matchC / c 187 | 'G' -> log $! matchG / g 188 | 'T' -> log $! 
-- | this version skip sequences contain ambiguous bases, like "N".
-- Any non-ACGT base scores -infinity, so a window containing one can never
-- reach the threshold and is effectively discarded by the caller.
lookAheadSearch' :: Bkgd             -- ^ background nucleotide distribution
                 -> PWM              -- ^ pwm
                 -> U.Vector Double  -- ^ best possible match score of suffixes
                 -> DNA a            -- ^ DNA sequence
                 -> Int              -- ^ starting location on the DNA
                 -> Double           -- ^ threshold
                 -> (Int, Double)    -- ^ (d, sc_d), the largest d such that sc_d > th_d
lookAheadSearch' (BG (a, c, g, t)) pwm sigma dna start thres = loop (0, -1) 0
  where
    -- Same pruning scheme as 'lookAheadSearch': stop as soon as the running
    -- score plus the best possible suffix score falls below the threshold.
    loop (!acc, !th_d) !d
        | acc < th_d = (d-2, acc)
        | otherwise = if d >= n
                          then (d-1, acc)
                          else loop (acc + sc, thres - sigma U.! d) (d+1)
      where
        sc = case toBS dna `B.index` (start + d) of
            'A' -> log $! matchA / a
            'C' -> log $! matchC / c
            'G' -> log $! matchG / g
            'T' -> log $! matchT / t
            -- Ambiguous base: -infinity, so this window can never match.
            _ -> -1 / 0
        matchA = addSome $ M.unsafeIndex (_mat pwm) (d,0)
        matchC = addSome $ M.unsafeIndex (_mat pwm) (d,1)
        matchG = addSome $ M.unsafeIndex (_mat pwm) (d,2)
        matchT = addSome $ M.unsafeIndex (_mat pwm) (d,3)
        -- Replace zero-probability PWM entries so log never sees 0.
        addSome !x | x == 0 = pseudoCount
                   | otherwise = x
        pseudoCount = 0.0001
        n = size pwm
{-# INLINE lookAheadSearch' #-}

-- Experimental spacing-constraint analysis, currently disabled.
{-
data SpaceDistribution = SpaceDistribution
    { _motif1 :: Motif
    , _nSites1 :: (Int, Int)
    , _motif2 :: Motif
    , _nSites2 :: (Int, Int)
    , _same :: [(Int, Int)]
    , _opposite :: [(Int, Int)]
    } deriving (Show, Read)

-- | search for spacing constraint between two TFs
spaceConstraint :: [(Motif, Motif)]  -- ^ motifs, names must be unique
                -> Bkgd              -- ^ backgroud nucleotide distribution
                -> Double            -- ^ p-Value for motif finding
                -> Int               -- ^ half window size, typical 5
                -> Int               -- ^ max distance to search, typical 300
                -> DNA a -> [SpaceDistribution]
spaceConstraint pairs bg th w k dna = flip map pairs $ \(a, b) ->
    let (m1, site1) = HM.lookupDefault undefined (_name a) sites
        (m2, site2) = HM.lookupDefault undefined (_name b) sites
        (same, opposite) = spaceConstraintHelper site1 site2 w k
    in SpaceDistribution m1 ((U.length *** U.length) site1) m2
        ((U.length *** U.length) site2) same opposite
  where
    motifs = nubBy ((==) `on` _name) $ concatMap (\(a,b) -> [a,b]) pairs
    findSites (Motif _ pwm) = (fwd, rev)
      where
        fwd = runST $ runConduit $ findTFBS bg pwm dna cutoff True .| sinkVector
        rev = runST $ runConduit $ findTFBS bg pwm' dna cutoff' True .|
                mapC (+ (size pwm - 1)) .| sinkVector
        cutoff = pValueToScore th bg pwm
        cutoff' = pValueToScore th bg pwm'
        pwm' = rcPWM pwm
    sites = HM.fromList $ map (\m -> (_name m, (m, findSites m))) motifs
{-# INLINE spaceConstraint #-}

spaceConstraintHelper :: (U.Vector Int, U.Vector Int)
                      -> (U.Vector Int, U.Vector Int)
                      -> Int
                      -> Int
                      -> ([(Int, Int)], [(Int, Int)])
spaceConstraintHelper (fw1, rv1) (fw2, rv2) w k = (same, oppose)
  where
    rs = let rs' = [-k, -k+2*w+1 .. 0]
         in rs' ++ map (*(-1)) (reverse rs')
    fw2' = S.fromList $ U.toList fw2
    rv2' = S.fromList $ U.toList rv2
    nOverlap :: U.Vector Int -> S.HashSet Int -> Int -> Int -> Int
    nOverlap xs ys w' i = U.foldl' f 0 xs
      where
        f acc x | any (`S.member` ys) [x + i - w' .. x + i + w'] = acc + 1
                | otherwise = acc
    same = zip rs $ zipWith (+) nFF nRR
      where
        nFF = map (nOverlap fw1 fw2' w) rs
        nRR = map (nOverlap rv1 rv2' w) $ reverse rs
    oppose = zip rs $ zipWith (+) nFR nRF
      where
        nFR = map (nOverlap fw1 rv2' w) rs
        nRF = map (nOverlap rv1 fw2' w) $ reverse rs
{-# INLINE spaceConstraintHelper #-}

computePValue :: Double -> [Int] -> [(Int, Double)]
computePValue p xs = zip xs $ map (pValue n p) xs
  where
    n = foldl' (+) 0 xs

pValue :: Int -> Double -> Int -> Double
pValue n p x | n > 2000 = complCumulative (poisson (fromIntegral n * p)) $ fromIntegral x
             | otherwise = complCumulative (binomial n p) $ fromIntegral x
-}
{-# LANGUAGE OverloadedStrings #-}

module Bio.RealWorld.BioGRID
    ( TAB2(..)
    , fetchByGeneNames
    ) where

import Network.HTTP.Conduit
import Data.List
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.Char8 as B
import qualified Data.Text as T

-- | API access key appended to every request.
accessKey :: String
accessKey = "accessKey=6168b8d02b2aa2e9a45af6f3afac4461"

-- | BioGRID REST service endpoint.  No trailing slash: request paths below
-- start with '/', so a trailing slash here produced "//" in the URL.
base :: String
base = "http://webservice.thebiogrid.org"

-- | BioGRID tab2 format
data TAB2 = TAB2
    { _biogridId :: B.ByteString
    , _entrezIdA :: B.ByteString
    , _entrezIdB :: B.ByteString
    , _biogridIdA :: B.ByteString
    , _biogridIdB :: B.ByteString
    , _systematicNameA :: T.Text
    , _systematicNameB :: T.Text
    , _symbolA :: T.Text
    , _symbolB :: T.Text
    , _synonymsA :: [T.Text]
    , _synonymsB :: [T.Text]
    , _experimentalSystemName :: T.Text
    , _experimentalSystemType :: T.Text
    , _firstAuthor :: T.Text
    , _pubmedId :: B.ByteString
    , _organismIdA :: B.ByteString
    , _organismIdB :: B.ByteString
    , _throughput :: T.Text
    , _score :: Maybe Double
    , _ptm :: T.Text
    , _phenotypes :: [T.Text]
    , _qualifications :: [T.Text]
    , _tags :: [T.Text]
    , _source :: T.Text
    } deriving (Show)

-- | Parse one tab-delimited line in BioGRID TAB 2.0 format (24 columns).
parseAsTab2 :: BL.ByteString -> TAB2
parseAsTab2 l = TAB2 (bs 0) (bs 1) (bs 2) (bs 3) (bs 4)
                     (txt 5) (txt 6) (txt 7) (txt 8)
                     (multi 9) (multi 10)
                     (txt 11) (txt 12) (txt 13)
                     (bs 14) (bs 15) (bs 16)
                     (txt 17)
                     (getScore $ BL.unpack $ xs!!18)
                     (txt 19)
                     (multi 20) (multi 21) (multi 22)
                     (txt 23)
  where
    xs = BL.split '\t' l
    -- Field accessors: raw strict bytes, decoded text, '|'-separated list.
    bs i = BL.toStrict $ xs!!i
    txt i = T.pack $ BL.unpack $ xs!!i
    multi i = T.splitOn "|" $ txt i
    -- "-" marks a missing score.
    getScore "-" = Nothing
    getScore x = Just $ read x

-- | retreive first 10,000 records for the given official gene symbols
-- (human interactions only, taxId=9606).
fetchByGeneNames :: [String] -> IO [TAB2]
fetchByGeneNames genes = do
    initReq <- parseRequest $ intercalate "&" [url, geneList, tax, accessKey]
    let request = initReq { method = "GET"
                          , requestHeaders = [("Content-type", "text/plain")]
                          }
    manager <- newManager tlsManagerSettings
    r <- httpLbs request manager
    return $ map parseAsTab2 $ BL.lines $ responseBody r
  where
    -- BUG FIX: "searchNames" was previously misspelled "ture"; the service
    -- ignores unknown parameter values, silently disabling symbol search.
    url = base ++ "/interactions/?searchNames=true&includeInteractors=false"
    geneList = "geneList=" ++ intercalate "|" genes
    tax = "taxId=9606"
{-# INLINE fetchByGeneNames #-}
22 | , as 23 | , (&) 24 | 25 | -- * printing 26 | , showResult 27 | ) where 28 | 29 | import Data.Aeson 30 | import Data.Aeson.Types 31 | import Data.Aeson.Encode.Pretty (encodePretty) 32 | import qualified Data.Aeson.KeyMap as M 33 | import qualified Data.Aeson.Key as M 34 | import qualified Data.ByteString.Lazy.Char8 as B 35 | import qualified Data.ByteString.Char8 as BS 36 | import qualified Data.Sequence as S 37 | import qualified Data.Text as T 38 | import qualified Data.Vector as V 39 | import Data.Maybe (fromMaybe) 40 | import Network.HTTP.Conduit 41 | import Data.Default.Class 42 | 43 | import Bio.RealWorld.ID 44 | 45 | -- | Terms and constraints. 46 | data KeyWords = KeyWords (S.Seq String) (S.Seq String) 47 | 48 | instance Default KeyWords where 49 | def = KeyWords S.empty $ S.fromList ["frame=object", "limit=all"] 50 | 51 | instance Show KeyWords where 52 | show (KeyWords x y) = f x ++ g y 53 | where 54 | f x' | S.null x' = "" 55 | | otherwise = "searchTerm=" ++ foldr1 (\a b -> b ++ ('+':a)) x' ++ "&" 56 | g y' | S.null y' = "" 57 | | otherwise = foldr1 (\a b -> b ++ ('&':a)) y' 58 | 59 | instance Semigroup KeyWords where 60 | (<>) (KeyWords a b) (KeyWords a' b') = KeyWords (a S.>< a') (b S.>< b') 61 | 62 | base :: String 63 | base = "https://www.encodeproject.org/" 64 | 65 | -- | general search using keywords and a set of constraints. Example: 66 | -- search ["chip", "sp1"] ["type=experiment"] 67 | search :: KeyWords -> IO (Either String [Value]) 68 | search kw = do 69 | initReq <- parseRequest url 70 | let request = initReq { method = "GET" 71 | , requestHeaders = [("accept", "application/json")] 72 | } 73 | manager <- newManager tlsManagerSettings 74 | r <- httpLbs request manager 75 | return $ (eitherDecode . responseBody) r >>= 76 | parseEither (withObject "ENCODE_JSON" (.: "@graph")) 77 | where 78 | url = base ++ "search/?" ++ show kw 79 | 80 | showResult :: Value -> IO () 81 | showResult = B.putStrLn . 
encodePretty 82 | 83 | terms :: [String] -> KeyWords 84 | terms xs = KeyWords (S.fromList xs) S.empty 85 | 86 | assayIs :: String -> KeyWords 87 | assayIs x = KeyWords S.empty $ 88 | S.fromList ["type=experiment", "assay_term_name=" ++ x] 89 | 90 | organismIs :: String -> KeyWords 91 | organismIs x = KeyWords S.empty $ 92 | S.fromList ["replicates.library.biosample.donor.organism.scientific_name=" ++ x] 93 | 94 | cellIs :: String -> KeyWords 95 | cellIs x = KeyWords S.empty $ S.fromList ["biosample_term_name=" ++ x] 96 | 97 | -- | accession 98 | queryById :: EncodeAcc -> IO (Either String Value) 99 | queryById acc = jsonFromUrl $ "experiments/" ++ BS.unpack (fromID acc) 100 | 101 | getFile :: FilePath -> String -> IO () 102 | getFile out url = openUrl (base ++ url) "application/octet-stream" >>= 103 | B.writeFile out 104 | 105 | openUrl :: String -> String -> IO B.ByteString 106 | openUrl url datatype = do 107 | initReq <- parseRequest url 108 | let request = initReq { method = "GET" 109 | , requestHeaders = [("accept", BS.pack datatype)] 110 | } 111 | manager <- newManager tlsManagerSettings 112 | r <- httpLbs request manager 113 | return $ responseBody r 114 | 115 | jsonFromUrl :: String -> IO (Either String Value) 116 | jsonFromUrl url = eitherDecode <$> openUrl (base ++ url) "application/json" 117 | 118 | 119 | (|@) :: Value -> T.Text -> Value 120 | (|@) (Object obj) key = fromMaybe (error errMsg) $ M.lookup (M.fromText key) obj 121 | where 122 | errMsg = "No such key: " ++ T.unpack key ++ " In: " ++ show obj 123 | (|@) _ _ = error "not an object" 124 | {-# INLINE (|@) #-} 125 | 126 | (|!) :: Value -> Int -> Value 127 | (|!) (Array ar) i = ar V.! i 128 | (|!) _ _ = error "not an array" 129 | {-# INLINE (|!) #-} 130 | 131 | (&) :: a -> (a -> b) -> b 132 | (&) = flip ($) 133 | {-# INLINE (&) #-} 134 | 135 | as :: FromJSON a => Value -> a 136 | as = getResult . 
{-# LANGUAGE OverloadedStrings #-}
module Bio.RealWorld.Ensembl
    ( lookup
    ) where

import Prelude hiding (lookup)
import Data.Aeson
import Data.List.Split (chunksOf)
import qualified Data.ByteString.Char8 as B
import qualified Data.Aeson.KeyMap as M
import Network.HTTP.Conduit

import Bio.RealWorld.ID (BioID(..), EnsemblID)

base :: String
base = "http://rest.ensembl.org/"

-- | Batch lookup of Ensembl IDs via the POST /lookup/id endpoint, issuing
-- one request per 1000 ids and merging the resulting objects.
lookup :: [EnsemblID] -> IO (Either String Object)
-- BUG FIX: the empty list previously crashed (foldl1 over []); it now
-- returns an empty result object.
lookup [] = return $ Right M.empty
lookup xs = do
    rs <- mapM lookupHelp $ chunksOf 1000 xs
    return $ foldl1 f rs
  where
    -- Merge two Either results; the first decode error wins.
    f a b = do
        a' <- a
        b' <- b
        return $ M.union a' b'

-- | One POST request for at most 1000 ids.
lookupHelp :: [EnsemblID] -> IO (Either String Object)
lookupHelp xs = do
    initReq <- parseRequest url
    let request = initReq { method = "POST"
                          , requestHeaders = [("Content-type", "application/json")]
                          , requestBody = body
                          }
    manager <- newManager tlsManagerSettings
    r <- httpLbs request manager
    return . eitherDecode . responseBody $ r
  where
    url = base ++ "/lookup/id/"
    -- 'show' on a list of ByteStrings happens to produce a quoted,
    -- bracketed list that doubles as a JSON string array.
    ids = B.pack $ show $ map fromID xs
    body = RequestBodyBS $ B.intercalate "" ["{ \"ids\" :", ids, "}"]
{-# INLINE lookupHelp #-}

-- NIH Genomic Data Commons
{-# LANGUAGE OverloadedStrings #-}

module Bio.RealWorld.GDC
    (downloadData) where

import Conduit
import qualified Data.Text as T
import qualified Data.ByteString.Char8 as B
import Data.Maybe (fromJust)
import Network.HTTP.Conduit

baseurl :: String
baseurl = "https://api.gdc.cancer.gov/"

-- | Download data for a UUID into the given directory; the file name is
-- taken from the Content-Disposition response header.
-- NOTE(review): 'fromJust' fails if the header is absent — confirm the API
-- always sets it.
downloadData :: String     -- ^ UUID
             -> FilePath   -- ^ Output dir
             -> IO FilePath
downloadData uuid dir = do
    request <- parseRequest url
    manager <- newManager tlsManagerSettings
    runResourceT $ do
        response <- http request manager
        let filename = T.unpack $ snd $ T.breakOnEnd "filename=" $ T.pack $
                B.unpack $ fromJust $ lookup "Content-Disposition" $
                responseHeaders response
        runConduit $ responseBody response .| sinkFileBS (dir ++ "/" ++ filename)
        return filename
  where
    url = baseurl ++ "data/" ++ uuid
{-# INLINE downloadData #-}
9 | , readGenes 10 | , readGenesC 11 | , getPromoters 12 | , getDomains 13 | ) where 14 | 15 | import Conduit 16 | import qualified Data.ByteString.Char8 as B 17 | import Data.CaseInsensitive (CI, mk) 18 | import qualified Data.HashMap.Strict as M 19 | import Data.List.Ordered (nubSort) 20 | import Data.Maybe (fromMaybe, fromJust, isNothing) 21 | import Lens.Micro 22 | import Data.List (foldl') 23 | import Data.Char (toLower) 24 | import qualified Data.Vector as V 25 | 26 | import Bio.Data.Bed 27 | import Bio.Data.Bed.Types 28 | import Bio.Utils.Misc (readInt) 29 | 30 | data TranscriptType = Coding 31 | | NonCoding 32 | deriving (Show, Eq, Ord) 33 | 34 | -- | GTF's position is 1-based, but here we convert it to 0-based indexing. 35 | data Gene = Gene 36 | { geneName :: !(CI B.ByteString) 37 | , geneId :: !B.ByteString 38 | , geneChrom :: !B.ByteString 39 | , geneLeft :: !Int 40 | , geneRight :: !Int 41 | , geneStrand :: !Bool 42 | , geneTranscripts :: ![Transcript] 43 | } deriving (Show, Eq, Ord) 44 | 45 | data Transcript = Transcript 46 | { transId :: !B.ByteString 47 | , transLeft :: !Int 48 | , transRight :: !Int 49 | , transStrand :: !Bool 50 | , transExon :: ![(Int, Int)] 51 | , transUTR :: ![(Int, Int)] 52 | , transType :: TranscriptType 53 | } deriving (Show, Eq, Ord) 54 | 55 | -- | Read gene information from Gencode GTF file 56 | readGenes :: FilePath -> IO [Gene] 57 | readGenes input = runResourceT $ runConduit $ sourceFile input .| readGenesC 58 | 59 | readGenesC :: Monad m => ConduitT B.ByteString o m [Gene] 60 | readGenesC = do 61 | (genes, transcripts, exons, utrs) <- readElements 62 | let t = M.fromList $ map (\(a,b) -> (transId b, (a,b))) transcripts 63 | return $ nubGene $ M.elems $ foldl' addTranscript 64 | (M.fromList $ map (\x -> (geneId x, x)) genes) $ 65 | M.elems $ foldl' addUTR (foldl' addExon t exons) utrs 66 | {-# INLINE readGenesC #-} 67 | 68 | nubGene :: [Gene] -> [Gene] 69 | nubGene gs = nubSort $ map nubG gs 70 | where 71 | nubG g = g { 
-- | Deduplicate and sort genes, and within each gene its transcripts,
-- exons and UTRs.
nubGene :: [Gene] -> [Gene]
nubGene gs = nubSort $ map nubG gs
  where
    nubG g = g { geneTranscripts = nubSort $ map nubT $ geneTranscripts g}
    nubT t = t { transExon = nubSort $ transExon t
               , transUTR = nubSort $ transUTR t }
{-# INLINE nubGene #-}

-- | Single pass over the GTF stream collecting genes, transcripts, exons
-- and UTRs as separate association lists keyed by gene/transcript id.
readElements :: Monad m => ConduitT B.ByteString o m
    ( [Gene], [(B.ByteString, Transcript)]
    , [(B.ByteString, (Int, Int))], [(B.ByteString, (Int, Int))] )
readElements = linesUnboundedAsciiC .| foldlC f ([], [], [], [])
  where
    f acc l
        -- Skip header/comment lines.
        -- NOTE(review): B.head errors on empty lines — confirm inputs.
        | B.head l == '#' = acc
        | featType == "gene" = _1 %~ (gene:) $ acc
        | featType == "transcript" = _2 %~ ((gid, transcript):) $ acc
        | featType == "exon" = _3 %~ ((tid, exon):) $ acc
        | featType == "utr" = _4 %~ ((tid, utr):) $ acc
        | otherwise = acc
      where
        gene = Gene (mk $ fromMaybe (error "could not find \"gene_name\"") $
            getField "gene_name") gid chr lPos rPos (f7=="+") []
        transcript = Transcript tid lPos rPos (f7=="+") [] [] tTy
        exon = (lPos, rPos)
        utr = (lPos, rPos)
        [chr,_,f3,f4,f5,_,f7,_,f9] = B.split '\t' l
        gid = fromMaybe (error "could not find \"gene_id\"") $ getField "gene_id"
        tid = fromMaybe (error "could not find \"transcript_id\"") $ getField "transcript_id"
        -- Missing "transcript_type" defaults to Coding.
        tTy = case getField "transcript_type" of
            Just "protein_coding" -> Coding
            Nothing -> Coding
            _ -> NonCoding
        -- GTF is 1-based; both coordinates are shifted by -1.
        -- NOTE(review): the end also gets -1, i.e. 0-based *inclusive*
        -- (unlike BED's half-open convention) — confirm downstream usage.
        lPos = readInt f4 - 1
        rPos = readInt f5 - 1
        featType = B.map toLower f3
        -- Attribute field format: key "value"; key "value"; …
        -- B.drop 2 removes the space and opening quote, B.init the closing quote.
        getField x = fmap (B.init . B.drop 2) $ lookup x $
            map (B.break isSpace . strip) $ B.split ';' f9
        strip = fst . B.spanEnd isSpace . B.dropWhile isSpace
        isSpace = (== ' ')
{-# INLINE readElements #-}

-- | Attach an exon interval to its transcript (no-op for unknown ids).
addExon :: M.HashMap B.ByteString (a, Transcript)
        -> (B.ByteString, (Int, Int))
        -> M.HashMap B.ByteString (a, Transcript)
addExon m (key, val) = M.adjust (\(x, trans) ->
    (x, trans{transExon = val : transExon trans})) key m
{-# INLINE addExon #-}

-- | Attach a UTR interval to its transcript (no-op for unknown ids).
addUTR :: M.HashMap B.ByteString (a, Transcript)
       -> (B.ByteString, (Int, Int))
       -> M.HashMap B.ByteString (a, Transcript)
addUTR m (key, val) = M.adjust (\(x, trans) ->
    (x, trans{transUTR = val : transUTR trans})) key m
{-# INLINE addUTR #-}

-- | Attach a transcript to its gene (no-op for unknown gene ids).
addTranscript :: M.HashMap B.ByteString Gene
              -> (B.ByteString, Transcript)
              -> M.HashMap B.ByteString Gene
addTranscript m (key, val) = M.adjust (\gene ->
    gene{geneTranscripts = val : geneTranscripts gene}) key m
{-# INLINE addTranscript #-}

-- | One promoter window per distinct TSS (the gene boundary plus every
-- transcript start), oriented by strand and clipped at 0 on the left.
getPromoters :: Int   -- ^ upstream
             -> Int   -- ^ downstream
             -> Gene
             -> [BEDExt BED3 (Int, CI B.ByteString)]
getPromoters up down Gene{..} = map g $ nubSort tss
  where
    g x | geneStrand = BEDExt (asBed geneChrom (max 0 $ x - up) (x + down)) (x, geneName)
        | otherwise = BEDExt (asBed geneChrom (max 0 $ x - down) (x + up)) (x, geneName)
    tss | geneStrand = geneLeft : map transLeft geneTranscripts
        | otherwise = geneRight : map transRight geneTranscripts
{-# INLINE getPromoters #-}
size) $ concatMap f $ triplet $ 152 | [Nothing] ++ map Just basal ++ [Nothing] 153 | where 154 | f (left, Just bed, right) = 155 | [ chromStart .~ leftPos $ chromEnd .~ s $ bed 156 | , chromStart .~ e $ chromEnd .~ rightPos $ bed ] 157 | where 158 | chr = bed^.chrom 159 | s = bed^.chromStart 160 | e = bed^.chromEnd 161 | leftPos 162 | | isNothing left || chr /= fromJust left ^. chrom = max (s - ext) 0 163 | | otherwise = min s $ max (s - ext) $ fromJust left ^. chromEnd 164 | rightPos 165 | | isNothing right || chr /= fromJust right ^. chrom = e + ext -- TODO: bound check 166 | | otherwise = max e $ min (e + ext) $ fromJust right ^. chromStart 167 | f _ = undefined 168 | triplet (x1:x2:x3:xs) = (x1,x2,x3) : triplet xs 169 | triplet _ = [] 170 | basal = V.toList $ fromSorted $ sortBed genes 171 | {-# INLINE getDomains #-} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/RealWorld/ID.hs: -------------------------------------------------------------------------------- 1 | module Bio.RealWorld.ID where 2 | 3 | import qualified Data.ByteString.Char8 as B 4 | 5 | class BioID a where 6 | fromID :: a -> B.ByteString 7 | toID :: B.ByteString -> a 8 | 9 | newtype UniprotID = UniprotID B.ByteString deriving (Show, Eq) 10 | 11 | newtype UCSCID = UCSCID B.ByteString deriving (Show, Eq) 12 | 13 | newtype GOID = GOID B.ByteString deriving (Show, Eq) 14 | 15 | -- | ENCODE Accession 16 | newtype EncodeAcc = EncodeAcc B.ByteString deriving (Show, Eq) 17 | 18 | -- | Ensembl ID 19 | newtype EnsemblID = EnsemblID B.ByteString deriving (Show, Eq) 20 | 21 | instance BioID EncodeAcc where 22 | fromID (EncodeAcc x) = x 23 | toID = EncodeAcc 24 | 25 | instance BioID EnsemblID where 26 | fromID (EnsemblID x) = x 27 | toID = EnsemblID 28 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/RealWorld/Reactome.hs: 
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}

module Bio.RealWorld.Reactome
    ( getPathways
    ) where

import Data.Aeson
import GHC.Generics (Generic)
import qualified Data.Text as T
import Network.HTTP.Simple

base :: String
base = "https://reactome.org/ContentService"

-- | A generic Reactome content-service object; field names mirror the
-- JSON keys, so (de)serialization is derived via Generic.
data Obj = Obj
    { className :: Maybe T.Text
    , dbId :: Int
    , displayName :: T.Text
    , schemaClass :: Maybe T.Text
    , stId :: Maybe T.Text
    , stIdVersion :: Maybe T.Text
    } deriving (Show, Generic)

instance ToJSON Obj
instance FromJSON Obj

-- | All Reactome top level pathways for a species (name or taxon id).
getPathways :: String -> IO [Obj]
getPathways species = do
    req <- parseRequest url
    response <- httpJSON req
    return $ getResponseBody response
  where
    url = base ++ "/data/pathways/top/" ++ species

-- Unfinished pathway-analysis draft, intentionally disabled.
{-
pathwayAnalysis :: [B.ByteString]   -- ^ A list of identifiers
                -> IO B.ByteString
pathwayAnalysis ids = do
    initReq <- parseRequest base
    let request = urlEncodedBody [] initReq
    manager <- newManager tlsManagerSettings
    r <- fmap M.fromList $ runResourceT $ do
        response <- http request manager
        runConduit $ responseBody response .| linesUnboundedAsciiC .|
            (dropC 1 >> mapC ((\[a,b] -> (a,b)) . B.split '\t')) .| sinkList
    return $ map (flip M.lookup r) ids
  where
    params =
-}

{-# LANGUAGE OverloadedStrings #-}

module Bio.RealWorld.Uniprot
    ( mapID
    ) where

import Conduit
import qualified Data.ByteString.Char8 as B
import qualified Data.HashMap.Strict as M
import Network.HTTP.Conduit

-- NOTE(review): the uploadlists endpoint is the legacy Uniprot ID-mapping
-- service — verify it is still reachable.
base :: String
base = "http://www.uniprot.org/uploadlists/"

-- | Map identifiers between databases via Uniprot's ID-mapping service.
-- The tab-separated response (header dropped) is loaded into a map; the
-- result preserves the order of the query, Nothing for unmapped ids.
mapID :: [B.ByteString]   -- ^ A list of IDs
      -> B.ByteString     -- ^ From database
      -> B.ByteString     -- ^ To database
      -> IO [Maybe B.ByteString]
mapID ids from to = do
    initReq <- parseRequest base
    let request = setQueryString query initReq
            { method = "GET"
            , requestHeaders = [("User-Agent", "kk@test.org")]
            }
    manager <- newManager tlsManagerSettings
    r <- fmap M.fromList $ runResourceT $ do
        response <- http request manager
        runConduit $ responseBody response .| linesUnboundedAsciiC .|
            (dropC 1 >> mapC ((\[a,b] -> (a,b)) . B.split '\t')) .| sinkList
    return $ map (flip M.lookup r) ids
  where
    query = [ ("from", Just from)
            , ("to", Just to)
            , ("format", Just "tab")
            , ("query", Just $ B.unwords ids)
            ]
17 | -- * DNA related functions 18 | , rc 19 | , gcContent 20 | , nucleotideFreq 21 | ) where 22 | 23 | import qualified Data.ByteString.Char8 as B 24 | import Data.Char8 (toUpper) 25 | import qualified Data.HashMap.Strict as M 26 | import qualified Data.HashSet as S 27 | import Data.Proxy (Proxy (..)) 28 | import Prelude hiding (length) 29 | 30 | -- | Alphabet defined by http://www.chem.qmul.ac.uk/iupac/ 31 | -- | Standard unambiguous alphabet 32 | data Basic 33 | 34 | -- | full IUPAC alphabet, including ambiguous letters 35 | data IUPAC 36 | 37 | -- | extended alphabet 38 | data Ext 39 | 40 | -- | DNA sequence 41 | newtype DNA alphabet = DNA B.ByteString 42 | 43 | -- | RNA sequence 44 | newtype RNA alphabet = RNA B.ByteString 45 | 46 | -- | Peptide sequence 47 | newtype Peptide alphabet = Peptide B.ByteString 48 | 49 | instance Show (DNA a) where 50 | show (DNA s) = B.unpack s 51 | 52 | instance Semigroup (DNA a) where 53 | (<>) (DNA x) (DNA y) = DNA (x <> y) 54 | 55 | instance Monoid (DNA a) where 56 | mempty = DNA B.empty 57 | mconcat dnas = DNA . B.concat . map toBS $ dnas 58 | 59 | class BioSeq' s where 60 | toBS :: s a -> B.ByteString 61 | unsafeFromBS :: B.ByteString -> s a 62 | 63 | slice :: Int -> Int -> s a -> s a 64 | 65 | length :: s a -> Int 66 | length = B.length . toBS 67 | {-# MINIMAL toBS, slice, unsafeFromBS #-} 68 | 69 | instance BioSeq' DNA where 70 | toBS (DNA s) = s 71 | unsafeFromBS = DNA 72 | slice i l (DNA s) = DNA . B.take l . B.drop i $ s 73 | 74 | instance BioSeq' RNA where 75 | toBS (RNA s) = s 76 | unsafeFromBS = RNA 77 | slice i l (RNA s) = RNA . B.take l . B.drop i $ s 78 | 79 | instance BioSeq' Peptide where 80 | toBS (Peptide s) = s 81 | unsafeFromBS = Peptide 82 | slice i l (Peptide s) = Peptide . B.take l . 
B.drop i $ s 83 | 84 | class BioSeq' seq => BioSeq seq alphabet where 85 | alphabet :: Proxy (seq alphabet) -> S.HashSet Char 86 | fromBS :: B.ByteString -> Either String (seq alphabet) 87 | fromBS input = case B.mapAccumL fun Nothing input of 88 | (Nothing, r) -> Right $ unsafeFromBS r 89 | (Just e, _) -> Left $ "Bio.Seq.fromBS: unknown character: " ++ [e] 90 | where 91 | fun (Just e) x = (Just e, x) 92 | fun Nothing x = let x' = toUpper x 93 | in if x' `S.member` alphabet (Proxy :: Proxy (seq alphabet)) 94 | then (Nothing, x') 95 | else (Just x', x') 96 | {-# MINIMAL alphabet #-} 97 | 98 | instance BioSeq DNA Basic where 99 | alphabet _ = S.fromList "ACGT" 100 | 101 | instance BioSeq DNA IUPAC where 102 | alphabet _ = S.fromList "ACGTNVHDBMKWSYR" 103 | 104 | instance BioSeq DNA Ext where 105 | alphabet _ = undefined 106 | 107 | instance BioSeq RNA Basic where 108 | alphabet _ = S.fromList "ACGU" 109 | 110 | -- | O(n) Reverse complementary of DNA sequence. 111 | rc :: DNA alphabet -> DNA alphabet 112 | rc (DNA s) = DNA . B.map f . B.reverse $ s 113 | where 114 | f x = case x of 115 | 'A' -> 'T' 116 | 'C' -> 'G' 117 | 'G' -> 'C' 118 | 'T' -> 'A' 119 | _ -> x 120 | 121 | -- | O(n) Compute GC content. 122 | gcContent :: DNA alphabet -> Double 123 | gcContent = (\(a,b) -> a / fromIntegral b) . B.foldl' f (0.0,0::Int) . toBS 124 | where 125 | f (!x,!n) c = 126 | let x' = case c of 127 | 'A' -> x 128 | 'C' -> x + 1 129 | 'G' -> x + 1 130 | 'T' -> x 131 | 'H' -> x + 0.25 132 | 'D' -> x + 0.25 133 | 'V' -> x + 0.75 134 | 'B' -> x + 0.75 135 | 'S' -> x + 1 136 | 'W' -> x 137 | _ -> x + 0.5 -- "NMKYR" 138 | in (x', n+1) 139 | 140 | -- | O(n) Compute single nucleotide frequency. 141 | nucleotideFreq :: forall a . BioSeq DNA a => DNA a -> M.HashMap Char Int 142 | nucleotideFreq dna = B.foldl' f m0 . toBS $ dna 143 | where 144 | m0 = M.fromList . zip (S.toList $ alphabet (Proxy :: Proxy (DNA a))) . 
-- | The first 2048 bytes are header. Header consists of a magic string,
-- followed by chromosome information. Example:
-- \nCHR1 START SIZE
-- Genome = open handle + chromosome index + byte size of the header.
data Genome = Genome !Handle !IndexTable !Int

-- | chromosome -> (byte offset within the sequence data, chromosome size)
type IndexTable = M.HashMap B.ByteString (Int, Int)

-- NOTE(review): the magic string appears empty in this view — it may have
-- been stripped from the source; verify against the index files on disk.
magic :: B.ByteString
magic = ""

-- | Open an indexed genome file, validating the magic line and parsing the
-- chromosome index from the second line.
openGenome :: FilePath -> IO Genome
openGenome fl = do
    h <- openFile fl ReadMode
    sig <- B.hGetLine h
    if sig == magic
        then do
            header <- B.hGetLine h
            -- +2 accounts for the two newline characters consumed by hGetLine.
            return $ Genome h (getIndex header) (B.length sig + B.length header + 2)
        else error "Bio.Seq.Query.openGenome: Incorrect format"
  where
    -- The header is a flat list of (name, offset, size) triples.
    getIndex = M.fromList . map f . chunksOf 3 . B.words
      where
        f [k, v, l] = (k, (readInt v, readInt l))
        f _ = error "error"
{-# INLINE openGenome #-}

closeGenome :: Genome -> IO ()
closeGenome (Genome h _ _) = hClose h
{-# INLINE closeGenome #-}

-- | Bracketed open/close around an action on the genome.
withGenome :: FilePath -> (Genome -> IO a) -> IO a
withGenome fl fn = bracket (openGenome fl) closeGenome fn
{-# INLINE withGenome #-}

-- | A query is represented by a tuple: (chr, start, end) and is
-- zero-based index, half-close-half-open
type Query = (B.ByteString, Int, Int)

-- | Retrieve sequence.  Seeks to header + chromosome offset + start and
-- reads (end - start) bytes; the result is alphabet-validated by 'fromBS'.
-- NOTE(review): only the upper bound is checked; a negative start would
-- read from the wrong position.
getSeq :: BioSeq s a => Genome -> Query -> IO (Either String (s a))
getSeq (Genome h index headerSize) (chr, start, end) = case M.lookup chr index of
    Just (chrStart, chrSize) ->
        if end > chrSize
            then return $ Left $ "Bio.Seq.getSeq: out of index: " ++ show end ++
                ">" ++ show chrSize
            else do
                hSeek h AbsoluteSeek $ fromIntegral $ headerSize + chrStart + start
                fromBS <$> B.hGet h (end - start)
    _ -> return $ Left $ "Bio.Seq.getSeq: Cannot find " ++ show chr
{-# INLINE getSeq #-}

-- | Retrieve whole chromosome.
getChrom :: Genome -> B.ByteString -> IO (Either String (DNA IUPAC))
getChrom g chr = case lookup chr chrSize of
    Just s -> getSeq g (chr, 0, s)
    _ -> return $ Left "Unknown chromosome"
  where
    chrSize = getChrSizes g
{-# INLINE getChrom #-}

-- | Retrieve chromosome size information.
getChrSizes :: Genome -> [(B.ByteString, Int)]
getChrSizes (Genome _ table _) = map (\(k, (_, l)) -> (k, l)) . M.toList $ table
{-# INLINE getChrSizes #-}

-- | Indexing a genome.  Writes magic line, header line, then the raw
-- concatenated chromosome sequences.
mkIndex :: [FilePath]   -- ^ A list of fasta files. Each file can have multiple
                        -- chromosomes.
        -> FilePath     -- ^ output file
        -> IO ()
mkIndex fls outFl = withFile outFl WriteMode $ \outH -> do
    (chrs, dnas) <- (unzip . concat) <$> mapM readSeq fls
    B.hPutStr outH $ B.unlines [magic, mkHeader chrs]
    mapM_ (B.hPutStr outH) dnas
  where
    -- Each fasta record becomes ((chromosome name, length), sequence);
    -- only the first word of the fasta header is kept as the name.
    readSeq fl = runResourceT $ runConduit $ fastaReader fl .| conduit .| sinkList
      where
        conduit = awaitForever $ \(chrName, seqs) -> do
            let dna = B.concat seqs
            yield ((head $ B.words chrName, B.length dna), dna)
{-# INLINE mkIndex #-}
concat) <$> mapM readSeq fls 98 | B.hPutStr outH $ B.unlines [magic, mkHeader chrs] 99 | mapM_ (B.hPutStr outH) dnas 100 | where 101 | readSeq fl = runResourceT $ runConduit $ fastaReader fl .| conduit .| sinkList 102 | where 103 | conduit = awaitForever $ \(chrName, seqs) -> do 104 | let dna = B.concat seqs 105 | yield ((head $ B.words chrName, B.length dna), dna) 106 | {-# INLINE mkIndex #-} 107 | 108 | mkHeader :: [(B.ByteString, Int)] -> B.ByteString 109 | mkHeader xs = B.unwords.fst $ foldl f ([], 0) xs 110 | where 111 | f (s, i) (s', i') = (s ++ [s', B.pack $ show i, B.pack $ show i'], i + i') 112 | {-# INLINE mkHeader #-} 113 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/BitVector.hs: -------------------------------------------------------------------------------- 1 | module Bio.Utils.BitVector 2 | ( BitVector(..) 3 | , BitMVector 4 | , size 5 | , (!) 6 | , set 7 | , clear 8 | , unsafeFreeze 9 | , zeros 10 | , toList 11 | ) where 12 | 13 | import qualified Data.Vector.Unboxed as U 14 | import Control.Monad.Primitive 15 | import qualified Data.Vector.Unboxed.Mutable as UM 16 | import Data.Word 17 | import Data.Bits 18 | import Text.Printf (printf) 19 | 20 | data BitVector = BitVector Int (U.Vector Word8) 21 | 22 | data BitMVector s = BitMVector Int (UM.MVector s Word8) 23 | 24 | size :: BitVector -> Int 25 | size (BitVector n _) = n 26 | 27 | (!) :: BitVector -> Int -> Bool 28 | (!) 
= index 29 | 30 | index :: BitVector -> Int -> Bool 31 | index (BitVector n v) idx 32 | | idx >= n = error $ printf "index out of bounds (%d,%d)" idx n 33 | | otherwise = testBit (v `U.unsafeIndex` i) j 34 | where 35 | i = idx `div` 8 36 | j = idx `mod` 8 37 | 38 | set :: PrimMonad m => BitMVector (PrimState m) -> Int -> m () 39 | set (BitMVector _ mv) idx = UM.modify mv ((flip setBit) j) i 40 | where 41 | i = idx `div` 8 42 | j = idx `mod` 8 43 | 44 | clear :: PrimMonad m => BitMVector (PrimState m) -> Int -> m () 45 | clear (BitMVector _ mv) idx = UM.modify mv ((flip clearBit) j) i 46 | where 47 | i = idx `div` 8 48 | j = idx `mod` 8 49 | 50 | unsafeFreeze :: PrimMonad m => BitMVector (PrimState m) -> m BitVector 51 | unsafeFreeze (BitMVector n mv) = U.unsafeFreeze mv >>= return . BitVector n 52 | 53 | zeros :: PrimMonad m => Int -> m (BitMVector (PrimState m)) 54 | zeros n = UM.replicate n' 0 >>= return . BitMVector n 55 | where 56 | n' = if j == 0 then i else i + 1 57 | i = n `div` 8 58 | j = n `mod` 8 59 | 60 | toList :: BitVector -> [Bool] 61 | toList bv = flip map [0..n-1] $ \i -> bv ! 
i 62 | where 63 | n = size bv -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/Functions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | 4 | module Bio.Utils.Functions ( 5 | ihs 6 | , ihs' 7 | , scale 8 | , filterFDR 9 | , slideAverage 10 | , hyperquick 11 | , gaussianKDE 12 | , kld 13 | , jsd 14 | , binarySearch 15 | , binarySearchBy 16 | , binarySearchByBounds 17 | , quantileNormalization 18 | , quantileNormalization' 19 | ) where 20 | 21 | import Data.Bits (shiftR) 22 | import Data.List (foldl', groupBy) 23 | import Data.Function (on) 24 | import Data.Ord (comparing) 25 | import qualified Data.Vector as V 26 | import qualified Data.Vector.Generic as G 27 | import qualified Data.Vector.Unboxed as U 28 | import qualified Data.Matrix as M 29 | import Statistics.Sample (meanVarianceUnb, mean) 30 | import Statistics.Function (sortBy) 31 | import Control.Parallel.Strategies (parMap, rseq) 32 | import Statistics.Sample.KernelDensity (kde) 33 | 34 | -- | inverse hyperbolic sine transformation 35 | ihs :: Double -- ^ θ, determine the shape of the function 36 | -> Double -- ^ input 37 | -> Double 38 | ihs !θ !x | θ == 0 = x 39 | | otherwise = log (θ * x + sqrt (θ * θ * x * x + 1)) / θ 40 | {-# INLINE ihs #-} 41 | 42 | -- | inverse hyperbolic sine transformation with θ = 1 43 | ihs' :: Double -> Double 44 | ihs' = ihs 1 45 | 46 | -- | scale data to zero mean and 1 variance 47 | scale :: G.Vector v Double => v Double -> v Double 48 | scale xs = G.map (\x -> (x - m) / sqrt s) xs 49 | where 50 | (m,s) = meanVarianceUnb xs 51 | {-# INLINE scale #-} 52 | 53 | -- | given the p-values, filter data by controling FDR 54 | filterFDR :: G.Vector v (a, Double) 55 | => Double -- ^ desired FDR value 56 | -> v (a, Double) -- ^ input data and pvalues 57 | -> v (a, Double) 58 | filterFDR α xs = go n . 
sortBy (comparing snd) $ xs 59 | where 60 | go rank v | rank <= 0 = G.empty 61 | | snd (v `G.unsafeIndex` (rank-1)) <= fromIntegral rank * α / fromIntegral n = G.slice 0 rank v 62 | | otherwise = go (rank-1) v 63 | n = G.length xs 64 | {-# INLINE filterFDR #-} 65 | 66 | -- | Compute the sliding average for each entry in a vector 67 | slideAverage :: (Fractional a, G.Vector v a) 68 | => Int -- ^ size of HALF sliding window, 2 means a total 69 | -- window size is 5 70 | -> v a 71 | -> v a 72 | slideAverage k xs = G.generate n $ \i -> go xs (max 0 (i-k)) (min (n-1) (i+k)) 73 | where 74 | go v i j = let l = j - i + 1 75 | in G.foldl' (+) 0 (G.unsafeSlice i l v) / fromIntegral l 76 | n = G.length xs 77 | {-# INLINE slideAverage #-} 78 | 79 | hyperquick :: Int -> Int -> Int -> Int -> Double 80 | hyperquick x m _n _N = loop (m-2) s s (2*e) 81 | where 82 | loop !k !ak !bk !epsk 83 | | k < _N - (_n-x) - 1 && epsk > e = 84 | let ck = ak / bk 85 | k' = k + 1 86 | jjm = invJm _n x _N k' 87 | bk' = bk * jjm + 1 88 | ak' = ak * jjm 89 | espk' = fromIntegral (_N - (_n - x) - 1 - k') * (ck - ak' / bk') 90 | in loop k' ak' bk' espk' 91 | | otherwise = 1 - (ak / bk - epsk / 2) 92 | s = foldl' (\s' k -> 1 + s' * invJm _n x _N k) 1.0 [x..m-2] 93 | invJm _n _x _N _m = ( 1 - fromIntegral _x / fromIntegral (_m+1) ) / 94 | ( 1 - fromIntegral (_n-1-_x) / fromIntegral (_N-1-_m) ) 95 | e = 1e-20 96 | 97 | -- | Assign weights to the points according to density estimation. 98 | gaussianKDE :: Int -- ^ number of mesh points used in KDE 99 | -> U.Vector Double -> (Double -> Double) 100 | gaussianKDE n xs = \x -> 101 | let i = binarySearch points x 102 | lo = points U.! (i - 1) 103 | lo_d = den U.! (i - 1) 104 | hi = points U.! i 105 | hi_d = den U.! i 106 | hi_w = (x - lo) / (hi - lo) 107 | lo_w = 1 - hi_w 108 | in lo_w * lo_d + hi_w * hi_d 109 | where 110 | (points, den) = kde n xs 111 | 112 | -- | compute the Kullback-Leibler divergence between two valid (not check) probability distributions. 
-- | O(log n).  Return the position of the first element that compares >= the
-- query, searching only within the given inclusive index bounds.
binarySearchByBounds :: G.Vector v e
                     => (e -> a -> Ordering) -> v e -> a -> Int -> Int -> Int
binarySearchByBounds cmp vec e = loop
  where
    loop !l !u
        | u < l = l
        | otherwise = case cmp (vec G.! k) e of
            LT -> loop (k+1) u
            EQ -> loop l (k-1)
            GT -> loop l (k-1)
      where
        -- NOTE: the parentheses are essential.  `shiftR` is infixl 8 and
        -- binds tighter than (+), so the former `u + l `shiftR` 1` parsed as
        -- u + (l `shiftR` 1): since l only changes on the final LT step, the
        -- probe was always the upper bound, silently degrading the search to
        -- a linear scan from the top (O(n), contradicting the O(log n) doc).
        k = (l + u) `shiftR` 1
{-# INLINE binarySearchByBounds #-}
concatMap f . 197 | groupBy ((==) `on` (snd . snd)) . zip averages . G.toList) $ 198 | M.toColumns srtMat 199 | where 200 | f [(a,(b,_))] = [(a,b)] 201 | f xs = let m = mean $ U.fromList $ fst $ unzip xs 202 | in map (\(_,(i,_)) -> (m, i)) xs 203 | srtMat :: M.Matrix (Int, Double) 204 | srtMat = M.fromColumns $ parMap rseq (sortBy (comparing snd) . G.zip (G.enumFromN 0 n)) $ 205 | M.toColumns mat 206 | averages = parMap rseq (mean . snd . G.unzip) $ M.toRows srtMat 207 | n = M.rows mat 208 | {-# INLINE quantileNormalization' #-} -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/Misc.hs: -------------------------------------------------------------------------------- 1 | module Bio.Utils.Misc 2 | ( readInt 3 | , readDouble 4 | , bins 5 | , binBySize 6 | , binBySizeLeft 7 | , binBySizeOverlap 8 | ) where 9 | 10 | import Data.ByteString.Char8 (ByteString) 11 | import Data.ByteString.Lex.Fractional (readExponential, readSigned) 12 | import Data.ByteString.Lex.Integral (readDecimal) 13 | import Data.Maybe (fromMaybe) 14 | 15 | readInt :: ByteString -> Int 16 | readInt x = fst . fromMaybe errMsg . readSigned readDecimal $ x 17 | where 18 | errMsg = error $ "readInt: Fail to cast ByteString to Int:" ++ show x 19 | {-# INLINE readInt #-} 20 | 21 | readDouble :: ByteString -> Double 22 | readDouble x = fst . fromMaybe errMsg . readSigned readExponential $ x 23 | where 24 | errMsg = error $ "readDouble: Fail to cast ByteString to Double:" ++ show x 25 | {-# INLINE readDouble #-} 26 | 27 | -- | divide a given half-close-half-open region into fixed size 28 | -- half-close-half-open intervals, discarding leftovers 29 | binBySize :: Int -> (Int, Int) -> [(Int, Int)] 30 | binBySize step (start, end) = let xs = [start, start + step .. end] 31 | in zip xs . 
tail $ xs 32 | {-# INLINE binBySize #-} 33 | 34 | binBySizeOverlap :: Int -> Int -> (Int, Int) -> [(Int, Int)] 35 | binBySizeOverlap step overlap (start, end) 36 | | overlap >= step = error "binBySizeOverlap: overlap > step" 37 | | otherwise = go start 38 | where 39 | go i | i + overlap < end = (i, i + step) : go (i + step - overlap) 40 | | otherwise = [] 41 | {-# INLINE binBySizeOverlap #-} 42 | 43 | -- | Including leftovers, the last bin will be extended to match desirable size 44 | binBySizeLeft :: Int -> (Int, Int) -> [(Int, Int)] 45 | binBySizeLeft step (start, end) = go start 46 | where 47 | go i | i < end = (i, i + step) : go (i + step) 48 | | otherwise = [] 49 | {-# INLINE binBySizeLeft #-} 50 | 51 | -- | divide a given region into k equal size sub-regions, discarding leftovers 52 | bins :: Int -> (Int, Int) -> [(Int, Int)] 53 | bins binNum (start, end) = let k = (end - start) `div` binNum 54 | in take binNum . binBySize k $ (start, end) 55 | {-# INLINE bins #-} 56 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/src/Bio/Utils/Overlap.hs: -------------------------------------------------------------------------------- 1 | module Bio.Utils.Overlap 2 | ( overlapFragment 3 | , overlapNucl 4 | , coverage 5 | ) where 6 | 7 | import Bio.Data.Bed 8 | import Conduit 9 | import Lens.Micro ((^.)) 10 | import Control.Monad 11 | import qualified Data.ByteString.Char8 as B 12 | import Data.Function 13 | import qualified Data.HashMap.Strict as M 14 | import qualified Data.IntervalMap.Strict as IM 15 | import Data.List 16 | import qualified Data.Vector.Unboxed as V 17 | import qualified Data.Vector.Unboxed.Mutable as VM 18 | 19 | -- | convert lines of a BED file into a data structure - A hashmap of which the 20 | -- | chromosomes, and values are interval maps. 
21 | toMap :: [(B.ByteString, (Int, Int))] -> M.HashMap B.ByteString (IM.IntervalMap Int Int) 22 | toMap input = M.fromList.map create.groupBy ((==) `on` (fst.fst)) $ zip input [0..] 23 | where 24 | f ((_, x), i) = (toInterval x, i) 25 | create xs = (fst.fst.head $ xs, IM.fromDistinctAscList.map f $ xs) 26 | {-# INLINE toMap #-} 27 | 28 | coverage :: [BED] -- ^ genomic locus in BED format 29 | -> ConduitT () BED IO () -- ^ reads in BED format 30 | -> IO (V.Vector Double, Int) 31 | coverage bin tags = liftM getResult $ runConduit $ tags .| sink 32 | where 33 | sink = do 34 | v <- lift $ VM.replicate (n+1) 0 35 | mapM_C $ \t -> do 36 | let set = M.lookup (t^.chrom) featMap 37 | s = t^.chromStart 38 | e = t^.chromEnd 39 | b = (s, e) 40 | l = e - s + 1 41 | intervals = case set of 42 | Just iMap -> IM.toList . IM.intersecting iMap . toInterval $ b 43 | _ -> [] 44 | forM_ intervals (\interval -> do 45 | let i = snd interval 46 | nucl = overlap b . fst $ interval 47 | VM.write v i . (+nucl) =<< VM.read v i 48 | ) 49 | VM.write v n . (+l) =<< VM.read v n 50 | lift $ V.freeze v 51 | getResult v = (V.zipWith normalize (V.slice 0 n v) featWidth, v V.! n) 52 | featMap = toMap.map (\x -> (x^.chrom, (x^.chromStart, x^.chromEnd))) $ bin 53 | featWidth = V.fromList $ map size bin 54 | n = length bin 55 | overlap (l, u) (IM.ClosedInterval l' u') 56 | | l' >= l = if u' <= u then u'-l'+1 else u-l'+1 57 | | otherwise = if u' <= u then u'-l+1 else u-l+1 58 | overlap _ _ = 0 59 | normalize a b = fromIntegral a / fromIntegral b 60 | 61 | overlapFragment, overlapNucl :: 62 | [(Int, Int)] -- ^ Ascending order list 63 | -> [(Int, Int)] -- ^ tags in any order 64 | -> V.Vector Int 65 | overlapFragment xs ts = V.create (VM.replicate n 0 >>= go ts) 66 | where 67 | n = length xs 68 | iMap = IM.fromAscList $ zip (map toInterval xs) [0..] 69 | go ts' v = do 70 | forM_ ts' (\x -> do 71 | let indices = IM.elems . IM.intersecting iMap . toInterval $ x 72 | forM_ indices (\i -> VM.write v i . 
-- | Wrap already-sorted data in the 'Sorted' type, detecting whether the
-- ordering is ascending, descending, or all-equal.  Raises an error if the
-- data is not sorted in a single direction.
toSorted :: (F.Foldable f, Ord a) => f a -> Sorted f a
toSorted xs = Sorted direction xs
  where
    -- The fold carries (comparison against the previous element,
    -- direction established so far).  EQ means "no direction yet".
    direction = snd $ F.foldl' step (const EQ, EQ) xs
    step (cmpPrev, dir) cur
        | dir == EQ = (compare cur, dir')
        | dir' == dir || dir' == EQ = (compare cur, dir)
        | otherwise = error "data is not sorted"
      where
        dir' = cmpPrev cur
show (head bed, head bed') 44 | 45 | sortedBamToBedPETest :: Assertion 46 | sortedBamToBedPETest = do 47 | bedpe <- readBedPE "tests/data/pairedend.bedpe" :: IO [(BED3, BED3)] 48 | bedpe' <- do 49 | let input = "tests/data/pairedend.bam" 50 | header <- getBamHeader input 51 | runResourceT $ runConduit $ streamBam input .| 52 | sortedBamToBedPE header .| 53 | mapC (\(x,y) -> (convert x, convert y)) .| sinkList 54 | forM_ (zip bedpe bedpe') $ \(b1, b2) -> (b1 == b2 || b1 == swap b2) @? show (b1,b2) 55 | where 56 | readBedPE fl = do 57 | c <- B.readFile fl 58 | return $ map (f . B.split '\t') $ B.lines c 59 | f (f1:f2:f3:f4:f5:f6:_) = ( asBed f1 (readInt f2) (readInt f3) 60 | , asBed f4 (readInt f5) (readInt f6) ) 61 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Bed.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Tests.Bed (tests) where 4 | 5 | import Bio.Data.Bed 6 | import Bio.Data.Bed.Types 7 | import Bio.Data.Bed.Utils 8 | import Bio.Utils.BitVector hiding (size) 9 | import Lens.Micro 10 | import Conduit 11 | import Data.Function (on) 12 | import Data.List (sortBy, sort) 13 | import qualified Data.Vector as V 14 | import Test.Tasty 15 | import Test.Tasty.HUnit 16 | import qualified Data.HashMap.Strict as M 17 | import Data.Ord 18 | import Data.Maybe 19 | import Bio.RealWorld.GENCODE 20 | import Data.Conduit.Zlib 21 | 22 | tests :: TestTree 23 | tests = testGroup "Test: Bio.Data.Bed" 24 | [ testCase "sortBed" sortBedTest 25 | , testCase "split" splitBedTest 26 | , testCase "splitOverlapped" splitOverlappedTest 27 | , testCase "mergeBed" mergeBedTest 28 | , testCase "intersectBed" intersectBedTest 29 | , testCase "baseMap" baseMapTest 30 | , testCase "domain assignment" geneDomainTest 31 | ] 32 | 33 | sortBedTest :: Assertion 34 | sortBedTest = do 35 | beds <- runResourceT $ runConduit $ 36 | 
streamBedGzip "tests/data/peaks.bed.gz" .| sinkList :: IO [BED3] 37 | let (Sorted actual) = sortBed beds 38 | expect <- runResourceT $ runConduit $ 39 | streamBedGzip "tests/data/peaks.sorted.bed.gz" .| sinkVector 40 | expect @=? actual 41 | 42 | splitBedTest :: Assertion 43 | splitBedTest = (s1', s2', s3') @=? (s1, s2, s3) 44 | where 45 | bed = asBed "chr1" 0 99 46 | s1 = splitBedBySize 20 bed 47 | s1' = map f [(0, 20), (20, 40), (40, 60), (60, 80)] 48 | s2 = splitBedBySizeLeft 20 bed 49 | s2' = map f [(0, 20), (20, 40), (40, 60), (60, 80), (80, 100)] 50 | s3 = splitBedBySizeOverlap 20 10 bed 51 | s3' = map f [ ( 0, 20), (10, 30), (20, 40), (30, 50), (40, 60) 52 | , (50, 70), (60, 80), (70, 90), (80, 100) ] 53 | f (a,b) = asBed "chr1" a b :: BED3 54 | 55 | splitOverlappedTest :: Assertion 56 | splitOverlappedTest = expect @=? result 57 | where 58 | input :: [BED3] 59 | input = map (\(a,b) -> asBed "chr1" a b) 60 | [ (0, 100) 61 | , (10, 20) 62 | , (50, 150) 63 | , (120, 160) 64 | , (155, 200) 65 | , (155, 220) 66 | , (0, 10) 67 | , (0, 10) 68 | ] 69 | expect = map (\((a,b), x) -> (asBed "chr1" a b, x)) 70 | [ ((0, 10), 3) 71 | , ((10, 20), 2) 72 | , ((20, 50), 1) 73 | , ((50, 100), 2) 74 | , ((100, 120), 1) 75 | , ((120, 150), 2) 76 | , ((150, 155), 1) 77 | , ((155, 160), 3) 78 | , ((160, 200), 2) 79 | , ((200, 220), 1) 80 | ] 81 | --result = sortBy (compareBed `on` fst) $ splitOverlapped length input 82 | result = sortBy (compareBed `on` fst) $ countOverlapped input 83 | 84 | mergeBedTest :: Assertion 85 | mergeBedTest = expect @=? 
result 86 | where 87 | input :: [BED3] 88 | input = map (\(a,b) -> asBed "chr1" a b) 89 | [ (0, 100) 90 | , (10, 20) 91 | , (50, 150) 92 | , (120, 160) 93 | , (155, 200) 94 | , (155, 220) 95 | , (500, 1000) 96 | ] 97 | expect = map (\(a,b) -> asBed "chr1" a b) 98 | [ (0, 220) 99 | , (500, 1000) 100 | ] 101 | result = runIdentity $ runConduit $ mergeBed input .| sinkList 102 | 103 | intersectBedTest :: Assertion 104 | intersectBedTest = do 105 | expect <- runResourceT $ runConduit $ 106 | streamBedGzip "tests/data/example_intersect_peaks.bed.gz" .| sinkList :: IO [BED3] 107 | peaks <- runResourceT $ runConduit $ 108 | streamBedGzip "tests/data/peaks.bed.gz" .| sinkList :: IO [BED3] 109 | result <- runResourceT $ runConduit $ 110 | streamBedGzip "tests/data/example.bed.gz" .| intersectBed peaks .| sinkList 111 | expect @=? result 112 | 113 | baseMapTest :: Assertion 114 | baseMapTest = do 115 | BaseMap bv <- runResourceT $ runConduit $ streamBedGzip "tests/data/example.bed.gz" .| 116 | baseMap [("chr1", 300000000)] 117 | let res = M.lookupDefault undefined "chr1" $ 118 | fmap (map fst . filter snd . zip [0..] . toList) bv 119 | expect <- runResourceT $ runConduit $ streamBedGzip "tests/data/example.bed.gz" .| 120 | concatMapC f .| sinkList 121 | sort expect @=? 
sort res 122 | where 123 | f :: BED -> Maybe Int 124 | f bed = if bed^.chrom == "chr1" 125 | then Just $ if fromJust (bed^.strand) 126 | then bed^.chromStart 127 | else bed^.chromEnd - 1 128 | else Nothing 129 | 130 | geneDomainTest :: Assertion 131 | geneDomainTest = do 132 | genes <- runResourceT $ runConduit $ sourceFile "tests/data/genes.gtf.gz" .| 133 | multiple ungzip .| readGenesC 134 | let promoters = bedToTree const $ zip (concatMap (getPromoters 5000 1000) genes) $ repeat () 135 | domain = getDomains 1000000 $ concatMap (getPromoters 5000 1000) genes 136 | f x = not (isIntersected promoters x) && (c1 || c2) 137 | where 138 | c1 = isIntersected promoters (chromStart %~ (`subtract` 1) $ x) && 139 | isIntersected promoters (chromEnd %~ (+1) $ x) && 140 | size x < 1000000 141 | c2 = isIntersected promoters (chromStart %~ (`subtract` 1) $ chromEnd %~ (+1) $ x) && 142 | size x == 1000000 143 | all f domain @=? True -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Fastq.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Tests.Fastq (tests) where 4 | 5 | import Bio.Data.Fastq 6 | import Data.Conduit.Zlib (ungzip, multiple, gzip) 7 | import qualified Data.ByteString.Char8 as B 8 | import qualified Data.ByteString.Lazy.Char8 as BL 9 | import Lens.Micro 10 | import Conduit 11 | import Test.Tasty 12 | import Test.Tasty.Golden 13 | import Test.Tasty.HUnit 14 | import qualified Data.HashMap.Strict as M 15 | import Data.Ord 16 | import Data.Maybe 17 | 18 | tests :: TestTree 19 | tests = testGroup "Test: Bio.Data.Fastq" 20 | [ testCase "FASTQ IO" fastqIO 21 | ] 22 | 23 | fastqIO :: Assertion 24 | fastqIO = do 25 | a <- fmap B.concat $ runResourceT $ runConduit $ 26 | sourceFile "tests/data/test.fastq.gz" .| multiple ungzip .| sinkList 27 | b <- fmap B.concat $ runResourceT $ runConduit $ 28 | sourceFile 
"tests/data/test_wrap.fastq.gz" .| multiple ungzip .| sinkList 29 | let fq1 = B.concat $ replicate 1000 a 30 | fq2 = B.concat $ replicate 1000 b 31 | r1 = B.concat $ runIdentity $ runConduit $ yield fq1 .| 32 | parseFastqC .| mapC fastqToByteString .| unlinesAsciiC .| sinkList 33 | r2 = B.concat $ runIdentity $ runConduit $ yield fq2 .| 34 | parseFastqC .| mapC fastqToByteString .| unlinesAsciiC .| sinkList 35 | [True, True] @=? [r1 == fq1, r2 == fq1] -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Motif.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Tests.Motif (tests) where 4 | 5 | import qualified Data.ByteString.Char8 as B 6 | import Data.Conduit 7 | import qualified Data.Conduit.List as CL 8 | import Data.Default.Class 9 | import System.Random 10 | import Test.Tasty 11 | import Test.Tasty.HUnit 12 | 13 | import Bio.Data.Fasta 14 | import Bio.Motif 15 | import Bio.Motif.Search 16 | import Bio.Seq 17 | 18 | import qualified Data.Vector.Unboxed as U 19 | 20 | dna :: DNA Basic 21 | dna = case fromBS (B.pack $ map f $ take 5000 $ randomRs (0, 3) (mkStdGen 2)) of 22 | Left msg -> error msg 23 | Right x -> x 24 | where 25 | f :: Int -> Char 26 | f x = case x of 27 | 0 -> 'A' 28 | 1 -> 'C' 29 | 2 -> 'G' 30 | 3 -> 'T' 31 | _ -> undefined 32 | 33 | motifs :: IO [Motif] 34 | motifs = do 35 | m1 <- readFasta' "tests/data/motifs.fasta" 36 | -- m2 <- readMEME "tests/data/motifs.meme" 37 | return m1 38 | 39 | tests :: TestTree 40 | tests = testGroup "Test: Bio.Motif" 41 | [ --testCase "IUPAC converting" toIUPACTest 42 | testCase "TFBS scanning" findTFBSTest 43 | , testCase "Max matching score" maxScTest 44 | , testCase "pValue calculation" pValueTest 45 | , testCase "CDF truncate test" cdfTruncateTest 46 | , testCase "Reverse Complentary" reverseComplentary 47 | ] 48 | 49 | 50 | {- 51 | toIUPACTest :: Assertion 52 | 
toIUPACTest = assertEqual "toIUPAC check" expect actual 53 | where 54 | expect = "SAA" 55 | actual = show . toIUPAC $ pwm 56 | -} 57 | 58 | findTFBSTest :: Assertion 59 | findTFBSTest = do 60 | ms <- motifs 61 | let (Motif _ pwm) = head ms 62 | expect <- findTFBS def pwm dna (0.6 * optimalScore def pwm) True $$ CL.consume 63 | actual <- findTFBSSlow def pwm dna (0.6 * optimalScore def pwm) $$ CL.consume 64 | assertEqual "findTFBS" expect actual 65 | 66 | maxScTest :: Assertion 67 | maxScTest = do 68 | ms <- motifs 69 | let expect = map (\x -> maximum $ scores def (_pwm x) dna) ms 70 | actual = map (\x -> maxMatchSc def (_pwm x) dna) ms 71 | assertEqual "maxMatchSc" expect actual 72 | 73 | pValueTest :: Assertion 74 | pValueTest = do 75 | ms <- motifs 76 | let expect = map (approx . pValueToScoreExact 1e-4 def . _pwm) ms 77 | actual = map (approx . pValueToScore 1e-4 def . _pwm) ms 78 | assertEqual "pValueToScore" expect actual 79 | where 80 | approx x = round $ 10 * x 81 | 82 | cdfTruncateTest :: Assertion 83 | cdfTruncateTest = do 84 | ms <- motifs 85 | let expect = map (pValueToScore 1e-4 def . _pwm) ms 86 | actual = map (pValueToScore' 1e-4 def . 
_pwm) ms 87 | assertEqual "CDF truncate" expect actual 88 | where 89 | pValueToScore' p bg pwm = cdf' (truncateCDF 0.999 $ scoreCDF bg pwm) $ 1 - p 90 | 91 | reverseComplentary :: Assertion 92 | reverseComplentary = do 93 | ms <- motifs 94 | let expect = map (\x -> map approx $ scores def (_pwm x) dna) ms 95 | actual = map (\x -> map approx $ reverse $ scores def (rcPWM $ _pwm x) $ rc dna) ms 96 | assertEqual "reverse complentary" expect actual 97 | where 98 | approx x = round $ 100000 * x -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Seq.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Tests.Seq (tests) where 3 | 4 | import Bio.Seq 5 | import Data.Either 6 | import qualified Data.HashMap.Strict as M 7 | import Test.Tasty 8 | import Test.Tasty.HUnit 9 | 10 | tests :: TestTree 11 | tests = testGroup "Test: Bio.Seq" 12 | [ testCase "nucleotideFreq" testNuclFreq 13 | ] 14 | 15 | testNuclFreq :: Assertion 16 | testNuclFreq = do 17 | let dna = fromRight undefined $ fromBS "ACTTCCCGGGD" :: DNA IUPAC 18 | test = M.lookupDefault undefined 'C' $ nucleotideFreq dna 19 | expected = 4 20 | test @=? expected 21 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/Tests/Tools.hs: -------------------------------------------------------------------------------- 1 | module Tests.Tools (tests) where 2 | 3 | import Bio.Utils.Functions 4 | import Test.Tasty 5 | import Test.Tasty.HUnit 6 | import qualified Data.Matrix as MU 7 | import Text.Printf (printf) 8 | 9 | tests :: TestTree 10 | tests = testGroup "Test: Bio.Utils" 11 | [ quantileNormalizationTest 12 | ] 13 | 14 | quantileNormalizationTest :: TestTree 15 | quantileNormalizationTest = testGroup "quantile normalization" 16 | [ testCase "case 1" $ y1 @=? 
quantileNormalization x1 17 | , testCase "case 2" $ MU.map (printf "%.2f") y2 @=? 18 | (MU.map (printf "%.2f") (quantileNormalization x2) :: MU.Matrix String) 19 | ] 20 | where 21 | x1 :: MU.Matrix Double 22 | x1 = MU.fromLists 23 | [ [2, 4, 4, 5] 24 | , [5, 14, 4, 7] 25 | , [4, 8, 6, 9] 26 | , [3, 8, 5, 8] 27 | , [3, 9, 3, 5] ] 28 | y1 :: MU.Matrix Double 29 | y1 = MU.fromLists 30 | [ [ 3.5, 3.5, 5.25, 4.25] 31 | , [ 8.5, 8.5, 5.25, 5.5] 32 | , [ 6.5, 5.25, 8.5, 8.5] 33 | , [5.25, 5.25, 6.5, 6.5] 34 | , [5.25, 6.5, 3.5, 4.25] ] 35 | x2 :: MU.Matrix Double 36 | x2 = MU.fromLists 37 | [ [5, 4, 3] 38 | , [2, 1, 4] 39 | , [3, 4, 6] 40 | , [4, 2, 8] ] 41 | y2 :: MU.Matrix Double 42 | y2 = MU.fromLists 43 | [ [ 5.666667, 5.166667, 2 ] 44 | , [ 2, 2, 3] 45 | , [3, 5.166667, 4.666667] 46 | , [4.666667, 3, 5.666667] ] 47 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example.bam -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example_copy.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example_copy.bam 
-------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/example_intersect_peaks.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/example_intersect_peaks.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/genes.gtf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/genes.gtf.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/motifs.fasta: -------------------------------------------------------------------------------- 1 | >1_ASCCAGGCKGG 2 | 0.768791 0.07577 0.120456 0.034983 3 | 0.057276 0.532522375 0.282802875 0.12739862500000002 4 | 0.048894125 0.796682375 0.09644475 0.05797825 5 | 0.044861750000000006 0.6375725 0.263928 0.05363775 6 | 0.61916725 0.104690625 0.17939824999999998 0.096743875 7 | 0.086404375 0.0745045 0.807213125 0.031878125 8 | 0.08837187499999999 0.07052475 0.828965875 0.012137499999999999 9 | 0.039856875 0.80558425 0.08105799999999999 0.07350025 10 | 0.079739875 0.110598625 0.40112975 0.40853125 11 | 0.03881925 0.12873025 0.781449 0.05100175 12 | 0.136913 0.15827100000000002 0.5818685 0.1229465 13 | 14 | >2_NGCRGGCKGAS 15 | 0.090506 0.471424 0.2324 0.205671 16 | 0.021978 0.040675 0.927348 0.01 17 | 0.014602750000000001 0.9562965 0.012410250000000001 0.01669075 18 | 0.468245 0.021042 0.29159075 0.21912225 19 | 0.0388475 0.04228675 0.887877 0.03098875 20 | 0.05335675 0.096898 0.8321492500000001 0.017596 21 | 0.15436075 0.7520785 0.052595249999999996 0.040965499999999995 22 | 0.02820475 0.07090675 
0.31607725 0.5848115 23 | 0.02995975 0.01687 0.9428795 0.010290750000000001 24 | 0.6324460000000001 0.0383675 0.260754 0.0684325 25 | 0.056907 0.5814265000000001 0.3313415 0.030325 26 | 27 | >3_GGAGGAGGNG 28 | 0.078664 0.0663 0.823079 0.031957 29 | 0.20135799999999998 0.034205 0.7544365 0.01 30 | 0.82030325 0.07725525 0.0680665 0.034375 31 | 0.08712400000000001 0.054251 0.8422565 0.0163685 32 | 0.283462 0.0698595 0.5971305 0.049547999999999995 33 | 0.65613775 0.22489399999999998 0.0696585 0.04930975 34 | 0.0762455 0.164718 0.74042025 0.0186165 35 | 0.11206625 0.05687675 0.81332925 0.01772775 36 | 0.38844975000000004 0.26271175 0.27699425 0.07184399999999999 37 | 0.10369725 0.07755350000000001 0.7556514999999999 0.06309775000000001 38 | 39 | >4_YCGSGANCGSA 40 | 0.088404 0.484429 0.078568 0.348599 41 | 0.04927675 0.6174805 0.25066225 0.0825805 42 | 0.0349955 0.06153675 0.88377325 0.0196945 43 | 0.051606 0.4338045 0.38140925 0.1331805 44 | 0.09071325 0.10709625 0.7679435 0.03424675 45 | 0.82530675 0.1102275 0.03089375 0.033572 46 | 0.44087375 0.267475 0.246572 0.04507925 47 | 0.050244 0.81758975 0.10140475 0.0307615 48 | 0.038650500000000004 0.085901 0.7673414999999999 0.108107 49 | 0.028267999999999998 0.451347 0.444228 0.076157 50 | 0.805 0.052 0.035 0.108 51 | 52 | >5_SGCGTGCRC 53 | 0.052962999999999996 0.322791 0.4390235 0.1852225 54 | 0.149675 0.089726 0.7472715000000001 0.013327499999999999 55 | 0.044135499999999994 0.8364149999999999 0.0258065 0.0936425 56 | 0.0219595 0.01 0.9580405 0.01 57 | 0.027667 0.01 0.020536 0.941797 58 | 0.0118715 0.040971999999999995 0.937156 0.01 59 | 0.022783499999999998 0.8734415 0.053024999999999996 0.05075 60 | 0.486805 0.0319375 0.469651 0.011606 61 | 0.060225499999999994 0.7693025 0.0367025 0.13376949999999999 62 | 63 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/motifs.meme: -------------------------------------------------------------------------------- 1 
| MEME version 4 2 | 3 | ALPHABET= ACGT 4 | 5 | strands: + - 6 | 7 | Background letter frequencies 8 | A 0.303 C 0.183 G 0.209 T 0.306 9 | 10 | MOTIF lexA 11 | letter-probability matrix: alength= 4 w= 12 nsites= 14 E= 3.2e-035 12 | 0.000000 0.000000 1.000000 0.000000 13 | 0.000000 0.000000 0.000000 1.000000 14 | 0.857143 0.000000 0.071429 0.071429 15 | 0.000000 0.071429 0.000000 0.928571 16 | 0.857143 0.000000 0.071429 0.071429 17 | 0.142857 0.000000 0.000000 0.857143 18 | 0.571429 0.071429 0.214286 0.142857 19 | 0.285714 0.285714 0.000000 0.428571 20 | 1.000000 0.000000 0.000000 0.000000 21 | 0.285714 0.214286 0.000000 0.500000 22 | 0.428571 0.500000 0.000000 0.071429 23 | 0.000000 1.000000 0.000000 0.000000 24 | 25 | MOTIF YPL133C+757.pfm 26 | letter-probability matrix: alength= 4 w= 12 nsites= 1 E= 0 27 | 0.25 0.25 0.25 0.25 28 | 0 0.5 0 0.5 29 | 0 1 0 0 30 | 0 1 0 0 31 | 0.333333 0 0.333333 0.333333 32 | 0 0 0 1 33 | 0 0 0 1 34 | 1 0 0 0 35 | 0 1 0 0 36 | 0 1 0 0 37 | 0 0 1 0 38 | 0.25 0.25 0.25 0.25 39 | 40 | MOTIF 7 V_ELK1_01 41 | 42 | letter-probability matrix: alength= 4 w= 12 nsites= 4 E= 0 43 | 0.250000 0.250000 0.250000 0.250000 44 | 0.500000 0.000000 0.250000 0.250000 45 | 0.500000 0.250000 0.250000 0.000000 46 | 0.750000 0.000000 0.250000 0.000000 47 | 0.000000 0.750000 0.250000 0.000000 48 | 0.000000 0.000000 1.000000 0.000000 49 | 1.000000 0.000000 0.000000 0.000000 50 | 0.000000 0.250000 0.000000 0.750000 51 | 0.250000 0.250000 0.250000 0.250000 52 | 0.250000 0.750000 0.000000 0.000000 53 | 0.250000 0.250000 0.500000 0.000000 54 | 0.250000 0.250000 0.000000 0.500000 55 | -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/pairedend.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/pairedend.bam 
-------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/peaks.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/peaks.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/peaks.sorted.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/peaks.sorted.bed.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/test.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/test.fastq.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/data/test_wrap.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kaizhang/bioinformatics-toolkit/b8949495069a61e56ae125ee5e4f87b9a423868f/bioinformatics-toolkit/tests/data/test_wrap.fastq.gz -------------------------------------------------------------------------------- /bioinformatics-toolkit/tests/test.hs: -------------------------------------------------------------------------------- 1 | import qualified Tests.Bed as Bed 2 | import qualified Tests.Bam as Bam 3 | import qualified Tests.Motif as Motif 4 | import qualified Tests.Fastq as Fastq 5 | import qualified Tests.Seq as Seq 6 | import qualified Tests.Tools as Tools 7 | import Test.Tasty 8 | 9 | main :: IO () 10 | main = defaultMain $ 
testGroup "Main" 11 | [ Bed.tests 12 | , Bam.tests 13 | , Seq.tests 14 | , Motif.tests 15 | , Tools.tests 16 | , Fastq.tests 17 | ] 18 | --------------------------------------------------------------------------------