├── .github
└── workflows
│ └── action.yml
├── .travis.yml
├── Biobase
├── RNAcentralHTTPRequest.hs
├── RNAlien.hs
├── RNAlien
│ ├── CMstatParser.hs
│ ├── InfernalParser.hs
│ ├── Library.hs
│ ├── RNAcentralHTTP.hs
│ └── Types.hs
├── RNAlienScan.hs
├── RNAlienStatistics.hs
└── cmsearchToBED.hs
├── ChangeLog.md
├── Dockerfile.dev
├── LICENSE
├── ParserTest.hs
├── README.md
├── RNAlien.cabal
├── RNAlien.svg
├── RNAlienScan.svg
├── cabal.project
├── default.nix
├── envhs.nix
├── manual.pdf
├── overrides.nix
├── scripts
├── AlienBenchmarkCMCompare.sh
├── AlienBenchmarkModels.sh
├── alienresultstatistics.pl
├── alienstructurestatistics.pl
├── blastbenchmarkdata.pl
├── buildClanModels.pl
├── cmComparevsRfam.pl
├── cmcomparebesthitextractor.pl
├── getblastdb.sh
├── makemultiplotcsv.sh
└── nhmmerbenchmarkdata.pl
├── stack.yaml
└── test
├── single.fa
├── test.stockholm
├── testcalls
└── testmulti.fa
/.github/workflows/action.yml:
--------------------------------------------------------------------------------
# Continuous integration: on every push, build the package and run its test
# suite once per GHC / cabal-install combination of the matrix below.
name: CI
on: [push]
jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        ghc: ['8.8', '8.10', '9.0']
        cabal: ['3.2', '3.4']
        os: [ubuntu-latest]
    name: Haskell GHC ${{ matrix.ghc }}, cabal ${{ matrix.cabal }}
    steps:
      # checkout@v2 is outdated; use a maintained major version.
      - uses: actions/checkout@v4
      - name: Setup Haskell
        # haskell/actions/setup has been archived; haskell-actions/setup is
        # its maintained successor and accepts the same inputs.
        uses: haskell-actions/setup@v2
        with:
          ghc-version: ${{ matrix.ghc }}
          cabal-version: ${{ matrix.cabal }}
      - run: cabal v2-test
20 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# Travis CI: build the development Docker image, copy the compiled
# executables out of it into a release tarball, push the image and (for
# tagged builds) attach the tarball to the GitHub release.
sudo: required

language: c

services:
  - docker

before_script:
  # Log in first so that the later push is authorised.
  - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin
  - docker build --tag $DOCKER_USERNAME/rnalien -f Dockerfile.dev .

script:
  # A stopped container is sufficient for copying files out of the image.
  - docker create --name develcontainer $DOCKER_USERNAME/rnalien
  - docker images
  - mkdir RNAlien
  - docker cp develcontainer:/RNAlien RNAlien
  - docker cp develcontainer:/RNAlienStatistics RNAlien
  - docker cp develcontainer:/cmsearchToBed RNAlien
  - docker cp develcontainer:/RNAcentralHTTPRequest RNAlien
  - docker cp develcontainer:/RNAlienScan RNAlien
  - cp LICENSE RNAlien
  - tar -cvzf RNAlien.tar.gz RNAlien
  - docker push $DOCKER_USERNAME/rnalien
  # Smoke test: the executable inside the image must at least start.
  - docker run --rm $DOCKER_USERNAME/rnalien /RNAlien --help


deploy:
  provider: releases
  skip_cleanup: true
  api_key: $GITHUB_TOKEN
  file: "RNAlien.tar.gz"
  on:
    tags: true
34 |
--------------------------------------------------------------------------------
/Biobase/RNAcentralHTTPRequest.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE RecordWildCards #-}
2 | {-# LANGUAGE DeriveDataTypeable #-}
3 |
4 | -- | RNAcentralHTTPRequest
5 | -- Testcommand: dist/build/RNAcentralHTTPRequest/RNAcentralHTTPRequest -i ATACTTACCTGGCACAGGGGATACCACGATCACCAAGGTGGTTCCCCCAAGACGAGGCTCACCATTGCACTCCGGTGGCGCTGACCCTTGCAATGACCCCAAATGTGGGTTACTCGGGTGTGTAATTTCTGTTAGCTGGGGACTGCGTTCGCGCTTTCCCCTT
6 | module Main where
7 |
8 | import System.Console.CmdArgs
9 | import Biobase.RNAlien.RNAcentralHTTP
10 |
-- | Command line options of the RNAcentralHTTPRequest executable.
data Options = Options
  { inputSequence :: String -- ^ sequence passed with @-i@
  }
  deriving (Show, Data, Typeable)
14 |
-- | cmdargs description of the command line interface: flag names, help
-- texts, program summary and the standard verbosity flags.
options :: Options
options =
  Options
    { inputSequence = def &= name "i" &= help "input sequence"
    }
    &= summary "RNAcentralHTTPRequest"
    &= help "Florian Eggenhofer 2016"
    &= verbosity
19 |
-- | Builds an MD5 query from the sequence given with @-i@, sends it to
-- RNAcentral and prints the list of responses.
main :: IO ()
main = do
  parsedOptions <- cmdArgs options
  let md5Query = buildStringViaMD5Query (inputSequence parsedOptions)
  responses <- getRNACentralEntries [md5Query]
  print responses
26 |
27 |
--------------------------------------------------------------------------------
/Biobase/RNAlien.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE RecordWildCards #-}
2 | {-# LANGUAGE DeriveDataTypeable #-}
3 |
4 | -- | Unsupervised construction of RNA family models
5 | -- For more information on RNA family models consult
6 | -- Usage example: RNAlien -i /path/input.fa -c 5 -o /outdir/
7 | -- Usage example offline mode: RNAlien -i /path/input.fa -b /backup/blast/nt_v5 -o /outdir/ -c 5 -t 1396 -j
8 | module Main where
9 |
10 | import System.Console.CmdArgs
11 | import System.Directory
12 | import Biobase.RNAlien.Types
13 | import Biobase.RNAlien.Library
14 | import Data.Maybe
15 | import Data.Either.Unwrap
16 | import Data.Time
17 | import qualified System.FilePath as FP
18 | import Paths_RNAlien (version)
19 | import Data.Version (showVersion)
20 | import qualified Biobase.StockholmAlignment.Import as BS
21 | import qualified Control.Exception.Base as CE
22 | import Control.Monad
23 |
-- | Command line options of the RNAlien executable. The flag letters and
-- defaults are defined in 'options'.
data Options = Options
  { inputFastaFilePath        :: String        -- ^ path to input fasta file
  , inputAlignmentFilePath    :: String        -- ^ path to input alignment file
  , outputPath                :: String        -- ^ output directory; empty means current working directory
  , inputTaxId                :: Maybe Int     -- ^ NCBI taxonomy ID of the input RNA organism
  , inputnSCICutoff           :: Maybe Double  -- ^ nSCI threshold for accepting candidates
  , inputEvalueCutoff         :: Maybe Double  -- ^ E-value cutoff for cmsearch filtering
  , inputBlastDatabase        :: Maybe String  -- ^ blast database name, or its file path in offline mode
  , lengthFilter              :: Bool          -- ^ filter blast hits per genomic length
  , coverageFilter            :: Bool          -- ^ filter blast hits by coverage
  , singleHitperTax           :: Bool          -- ^ keep only the best blast hit per taxonomic entry
  , blastSoftmasking          :: Bool          -- ^ toggle blast query softmasking
  , inputQuerySelectionMethod :: String        -- ^ query selection method (filtering, clustering)
  , inputQueryNumber          :: Int           -- ^ number of queries used for candidate search
  , threads                   :: Int           -- ^ number of available cpu slots/cores
  , taxonomyRestriction       :: Maybe String  -- ^ optional restriction of the search space to a kingdom
  , sessionIdentificator      :: Maybe String  -- ^ optional session id replacing the generated one
  , performEvaluation         :: Bool          -- ^ run the evaluation step
  , checkSetup                :: Bool          -- ^ only print tool versions and check the connection
  , taxonomyDumpPath          :: String        -- ^ path to the NCBI taxonomy dump directory
  , offlineMode               :: Bool          -- ^ use locally installed blast and databases
  }
  deriving (Show, Data, Typeable)
46 |
-- | cmdargs description of the RNAlien command line interface: default
-- value, single-letter flag and help text of every option, followed by the
-- program summary and the standard verbosity flags.
options :: Options
options =
  Options
    { inputFastaFilePath = def
        &= name "i" &= help "Path to input fasta file"
    , inputAlignmentFilePath = def
        &= name "p" &= help "Path to input alignment file"
    , outputPath = def
        &= name "o" &= help "Path to output directory. Default: current working directory"
    , inputTaxId = Nothing
        &= name "t" &= help "NCBI taxonomy ID number of input RNA organism"
    , inputnSCICutoff = Just (1 :: Double)
        &= name "z" &= help "Only candidate sequences with a normalized structure conservation index (nSCI) higher than this value are accepted. Default: 1"
    , inputEvalueCutoff = Just (0.001 :: Double)
        &= name "e" &= help "Evalue cutoff for cmsearch filtering. Default: 0.001"
    , inputBlastDatabase = Just "nt"
        &= name "b" &= help "Specify name of blast database to use, in offline mode the filepath to the blast database (/home/user/nt_v5). Default: nt"
    , lengthFilter = True
        &= name "l" &= help "Filter blast hits per genomic length. Default: True"
    , coverageFilter = True
        &= name "a" &= help "Filter blast hits by coverage of at least 80%. Default: True"
    , singleHitperTax = False
        &= name "s" &= help "Only the best blast hit per taxonomic entry is considered. Default: False"
    , blastSoftmasking = False
        &= name "f" &= help "Toggles blast query softmasking, meaning masking of non-conserved regions on the query. Default: False"
    , inputQuerySelectionMethod = "filtering"
        &= name "m" &= help "Method for selection of queries (filtering,clustering). Default: filtering"
    , inputQueryNumber = (5 :: Int)
        &= name "n" &= help "Number of queries used for candidate search. Default: 5"
    , threads = 1
        &= name "c" &= help "Number of available cpu slots/cores. Default: 1"
    , taxonomyRestriction = Nothing
        &= name "r" &= help "Restrict search space to taxonomic kingdom (bacteria,archea,eukaryia,cellularorganisms,viruses). Default: not set"
    , sessionIdentificator = Nothing
        &= name "d" &= help "Optional session id that is used instead of automatically generated one."
    , performEvaluation = True
        &= name "x" &= help "Perform evaluation step. Default: True"
    , checkSetup = False
        &= name "g" &= help "Just prints installed tool versions and performs connection check. Default: False"
    , taxonomyDumpPath = def
        &= name "w" &= help "Path to NCBI taxonomy dump directory."
    , offlineMode = False
        &= name "j" &= help "Uses locally installed blast and databases. Default: False"
    }
    &= summary ("RNAlien " ++ alienVersion)
    &= help "Florian Eggenhofer, Ivo L. Hofacker, Christian Hoener zu Siederdissen - 2013 - 2020"
    &= verbosity
70 |
-- | Entry point of the RNAlien command line tool.
--
-- Parses the command line, creates the session directory with its log files
-- and then constructs the model. A fasta file holding exactly one sequence
-- starts construction from that sequence. Otherwise construction starts
-- from an alignment: either the multi-fasta input aligned with mlocarna, or
-- the Stockholm file given with @-p@.
--
-- Aborts via 'error' when no input sequence is available, when the Stockholm
-- file cannot be parsed, or when it contains no alignment.
main :: IO ()
main = do
  Options{..} <- cmdArgs options
  verboseLevel <- getVerbosity
  -- Generate SessionID
  sessionId <- createSessionID sessionIdentificator
  timestamp <- getCurrentTime
  currentWorkDirectory <- getCurrentDirectory
  let selectedOutputPath = if null outputPath then currentWorkDirectory else outputPath
  let temporaryDirectoryPath = FP.addTrailingPathSeparator selectedOutputPath ++ sessionId ++ "/"
  createDirectoryIfMissing False temporaryDirectoryPath
  setupCheckAlienWithLog inputQuerySelectionMethod temporaryDirectoryPath
  createDirectoryIfMissing False (temporaryDirectoryPath ++ "log")
  -- Create Log files
  writeFile (temporaryDirectoryPath ++ "Log") ("RNAlien " ++ alienVersion ++ "\n")
  writeFile (temporaryDirectoryPath ++ "log/warnings") ""
  logMessage ("Timestamp: " ++ show timestamp ++ "\n") temporaryDirectoryPath
  logMessage ("Temporary Directory: " ++ temporaryDirectoryPath ++ "\n") temporaryDirectoryPath
  let iterationNumber = 0
  -- Common tail of both input modes: write the result table, optionally
  -- append the evaluation to the log, print the summary and drop the "done"
  -- marker file.
  let finishRun runOptions constructionResults = do
        writeFile (temporaryDirectoryPath ++ "result.csv") (constructTaxonomyRecordsCSVTable constructionResults)
        when performEvaluation $ do
          resultEvaluation <- evaluateConstructionResult runOptions constructionResults
          appendFile (temporaryDirectoryPath ++ "Log") resultEvaluation
        resultSummary constructionResults runOptions
        writeFile (temporaryDirectoryPath ++ "done") ""
  singleFasta <- isSingleFasta inputFastaFilePath
  if singleFasta
    then do
      fastaInput <- readFastaFile inputFastaFilePath
      when (null fastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      let reformatedFastaInput = map reformatFasta fastaInput
      let inputSequence = head reformatedFastaInput
      initialTaxId <- setInitialTaxId offlineMode threads inputBlastDatabase temporaryDirectoryPath inputTaxId inputSequence
      let checkedTaxonomyRestriction = checkTaxonomyRestriction taxonomyRestriction
      let staticOptions = StaticOptions temporaryDirectoryPath sessionId (fromJust inputnSCICutoff) inputTaxId singleHitperTax inputQuerySelectionMethod inputQueryNumber lengthFilter coverageFilter blastSoftmasking threads inputBlastDatabase checkedTaxonomyRestriction (setVerbose verboseLevel) offlineMode [] taxonomyDumpPath
      when (setVerbose verboseLevel) (print staticOptions)
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] initialTaxId Nothing (fromJust inputEvalueCutoff) False [] [] [] Nothing
      logMessage (show initialization) temporaryDirectoryPath
      modelConstructionResults <- modelConstructer staticOptions initialization
      finishRun staticOptions modelConstructionResults
    else do
      -- multi fasta or aln input
      alignmentFilePath <- if not (null inputFastaFilePath)
        then do
          let mlocarnaFilePath = temporaryDirectoryPath ++ "input.mlocarna"
          let mlocarnaDirPath = temporaryDirectoryPath ++ "inputMLocarna"
          let mlocarnaStkPath = mlocarnaDirPath ++ "/results/result.stk"
          let alifoldPath = temporaryDirectoryPath ++ "input.alifold"
          let stockholmPath = temporaryDirectoryPath ++ "input.stockholm"
          alignSequences "mlocarna" ("--stockholm --consensus-structure alifold --tgtdir=" ++ mlocarnaDirPath ++ " --threads=" ++ show threads ++ " ") [inputFastaFilePath] [] [mlocarnaFilePath] []
          _ <- systemRNAalifold "-r --cfactor 0.6 --nfactor 0.5" mlocarnaStkPath alifoldPath
          _ <- replaceStockholmStructure mlocarnaStkPath alifoldPath stockholmPath
          return stockholmPath
        else return inputAlignmentFilePath
      alignmentInput <- BS.readExistingStockholm alignmentFilePath
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      -- Only the first alignment of the file is used. A file without any
      -- alignment is reported explicitly instead of failing inside 'head'.
      rightAlignment <- case alignmentInput of
        Left parseFailure -> error parseFailure
        Right [] -> error ("No alignment found in Stockholm file: " ++ alignmentFilePath)
        Right (firstAlignment:_) -> return firstAlignment
      let reformatedFastaInput = stockholmAlignmentToFasta rightAlignment
      when (null reformatedFastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      let inputSequence = head reformatedFastaInput
      initialTaxId <- setInitialTaxId offlineMode threads inputBlastDatabase temporaryDirectoryPath inputTaxId inputSequence
      let checkedTaxonomyRestriction = checkTaxonomyRestriction taxonomyRestriction
      let staticOptions = StaticOptions temporaryDirectoryPath sessionId (fromJust inputnSCICutoff) inputTaxId singleHitperTax inputQuerySelectionMethod inputQueryNumber lengthFilter coverageFilter blastSoftmasking threads inputBlastDatabase checkedTaxonomyRestriction (setVerbose verboseLevel) offlineMode [] taxonomyDumpPath
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] initialTaxId Nothing (fromJust inputEvalueCutoff) False [] [] [] (Just rightAlignment)
      -- A failed taxonomy lookup is logged as a warning and does not abort the run.
      currentTaxonomicContext <- CE.catch (getTaxonomicContext (offline staticOptions) (ncbiTaxonomyDumpPath staticOptions) initialTaxId (taxonomicContext initialization))
        (\e -> do let err = show (e :: CE.IOException)
                  logWarning ("Warning: Retrieving taxonomic context failed:" ++ " " ++ err) (tempDirPath staticOptions)
                  return Nothing)
      let nextModelConstructionInput = constructNext iterationNumber initialization [] [] initialTaxId currentTaxonomicContext [] [] True
      -- Build and calibrate the initial model in iteration directory "0".
      let outputDirectory = tempDirPath staticOptions ++ "0" ++ "/"
      createDirectory outputDirectory
      let fastaFilePath = outputDirectory ++ "model.fa"
      let stockholmFilepath = outputDirectory ++ "model" ++ ".stockholm"
      let cmFilepath = outputDirectory ++ "model" ++ ".cm"
      let cmCalibrateFilepath = outputDirectory ++ "model" ++ ".cmcalibrate"
      let cmBuildFilepath = outputDirectory ++ "model" ++ ".cmbuild"
      copyFile alignmentFilePath stockholmFilepath
      writeFastaFile fastaFilePath reformatedFastaInput
      let refinedAlignmentFilepath = outputDirectory ++ "modelrefined.stockholm"
      let cmBuildOptions = "--refine " ++ refinedAlignmentFilepath
      _ <- systemCMbuild cmBuildOptions stockholmFilepath cmFilepath cmBuildFilepath
      _ <- systemCMcalibrate "fast" (cpuThreads staticOptions) cmFilepath cmCalibrateFilepath
      writeFile (outputDirectory ++ "done") ""
      -- select queries
      let logDirectory = outputDirectory ++ "log"
      createDirectory logDirectory
      currentSelectedQueries <- selectQueries staticOptions initialization []
      let nextScanModelConstructionInputWithQueries = nextModelConstructionInput {selectedQueries = currentSelectedQueries}
      logMessage (iterationSummaryLog nextScanModelConstructionInputWithQueries) (tempDirPath staticOptions)
      -- NOTE(review): construction continues with the input that does NOT
      -- carry the selected queries; they are only logged above. Kept as in
      -- the original - confirm whether the queries should be passed on.
      modelConstructionResults <- modelConstructer staticOptions nextModelConstructionInput
      finishRun staticOptions modelConstructionResults
186 |
-- | Decides whether the given fasta file holds exactly one sequence.
--
-- An empty path yields 'False' without touching the file system. A file
-- that yields no sequences aborts the program via 'error'.
isSingleFasta :: String -> IO Bool
isSingleFasta inputFastaFilePath
  | null inputFastaFilePath = return False
  | otherwise = do
      fastaInput <- readFastaFile inputFastaFilePath
      case fastaInput of
        []  -> error "Please provide input fasta sequences with the cmd line parameter -i"
        [_] -> return True
        _   -> return False
195 |
-- | Printable package version, rendered from the 'version' value that is
-- imported from @Paths_RNAlien@.
alienVersion :: String
alienVersion = showVersion version
198 |
--------------------------------------------------------------------------------
/Biobase/RNAlien/CMstatParser.hs:
--------------------------------------------------------------------------------
1 | -- | This module contains parsing functions for Infernal programs
2 |
3 | module Biobase.RNAlien.CMstatParser (
4 | module Biobase.RNAlien.Types,
5 | parseCMstat,
6 | readCMstat
7 | )
8 | where
9 |
10 | import Text.ParserCombinators.Parsec
11 | import Biobase.RNAlien.Types
12 |
-- | Parse cmstat output that is already in memory as a 'String'.
-- The source name used in error messages is @parseCMstat@.
parseCMstat :: String -> Either ParseError CMstat
parseCMstat cmstatOutput = parse genParserCMstat "parseCMstat" cmstatOutput
16 |
-- | Read the file at the given path and parse its content as cmstat output.
readCMstat :: String -> IO (Either ParseError CMstat)
readCMstat = parseFromFile genParserCMstat
22 |
-- | Parser for a cmstat report containing a single model line.
--
-- Everything up to the @rel entropy@ column title is skipped, then the two
-- header rows and their underlines, then the twelve whitespace separated
-- columns of the model line are read. The input must end with a line
-- holding a single @#@.
--
-- The model name may contain any non-whitespace characters (the original
-- parser only accepted letters and therefore rejected names with digits,
-- @_@ or @-@). The consensus length requires at least one digit, like every
-- other numeric column, so that a missing value is reported at that column.
genParserCMstat :: GenParser Char st CMstat
genParserCMstat = do
  _ <- manyTill anyChar (try (string "rel entropy"))
  _ <- newline
  -- underline below "rel entropy"
  _ <- char '#'
  skipMany1 (char ' ')
  skipMany1 (char '-')
  _ <- newline
  -- column title row, up to the '#' that starts the underline row
  _ <- char '#'
  _ <- manyTill anyChar (try (string "#"))
  _ <- many1 (try (oneOf " -"))
  _ <- newline
  -- model line
  skipMany1 space
  statIndex <- many1 digit
  skipMany1 space
  statName <- many1 (noneOf " \t\n\r\f\v")
  skipMany1 space
  statAccession <- many1 (noneOf " ")
  skipMany1 space
  statSequenceNumber <- many1 digit
  skipMany1 space
  statEffectiveSequences <- many1 (oneOf "0123456789.e-")
  skipMany1 space
  statConsensusLength <- many1 digit
  skipMany1 space
  statW <- many1 digit
  skipMany1 space
  statBasepaires <- many1 digit
  skipMany1 space
  statBifurcations <- many1 digit
  skipMany1 space
  statModel <- many1 letter
  skipMany1 space
  relativeEntropyCM <- many1 (oneOf "0123456789.e-")
  skipMany1 space
  relativeEntropyHMM <- many1 (oneOf "0123456789.e-")
  _ <- newline
  _ <- char '#'
  _ <- newline
  eof
  return $ CMstat (readInt statIndex) statName statAccession (readInt statSequenceNumber) (readDouble statEffectiveSequences) (readInt statConsensusLength) (readInt statW) (readInt statBasepaires) (readInt statBifurcations) statModel (readDouble relativeEntropyCM) (readDouble relativeEntropyHMM)
64 | --
-- | Convert a string to an 'Int' with 'read'; fails at runtime if the string
-- is not a valid integer literal.
readInt :: String -> Int
readInt digits = read digits
67 |
-- | Convert a string to a 'Double' with 'read'; fails at runtime if the
-- string is not a valid numeric literal.
readDouble :: String -> Double
readDouble number = read number
70 |
--------------------------------------------------------------------------------
/Biobase/RNAlien/InfernalParser.hs:
--------------------------------------------------------------------------------
1 | -- | This module contains parsing functions for Infernal programs
2 |
3 | module Biobase.RNAlien.InfernalParser (
4 | module Biobase.RNAlien.Types,
5 | readCMSearch,
6 | readCMSearches,
7 | parseCMSearch,
8 | parseCMSearches,
9 | )
10 | where
11 |
12 | import Text.ParserCombinators.Parsec
13 | import Biobase.RNAlien.Types
14 | import qualified Data.ByteString.Char8 as B
15 |
-- | Parse the output of a single-model cmsearch run that is already in
-- memory as a 'String'.
parseCMSearch :: String -> Either ParseError CMsearch
parseCMSearch cmsearchOutput = parse genParserCMSearch "parseCMsearch" cmsearchOutput
19 |
-- | Parse cmsearch output with results for several query models that is
-- already in memory as a 'String'.
parseCMSearches :: String -> Either ParseError CMsearch
parseCMSearches cmsearchOutput = parse genParserCMSearches "parseCMsearch" cmsearchOutput
23 |
-- | Read the file at the given path and parse it as the output of a
-- single-model cmsearch run.
readCMSearch :: String -> IO (Either ParseError CMsearch)
readCMSearch = parseFromFile genParserCMSearch
29 |
-- | Read the file at the given path and parse it as cmsearch output with
-- results for several query models.
readCMSearches :: String -> IO (Either ParseError CMsearch)
readCMSearches = parseFromFile genParserCMSearches
35 |
-- | Parser for cmsearch output that contains one result section per query
-- model. The banner before the first separator line is skipped, the header
-- fields are read, and the hits of all sections (each parsed by
-- 'genParserMultipleCMSearch') are concatenated into one 'CMsearch'.
genParserCMSearches :: GenParser Char st CMsearch
genParserCMSearches = do
  _ <- manyTill anyChar (try (string separatorLine))
  _ <- newline
  cmFile <- headerValue "# query CM file:"
  sequenceDatabase <- headerValue "# target sequence database:"
  -- these header lines are only present for some invocations
  mapM_
    (optional . try . genParserCMsearchHeaderField)
    ["# CM configuration", "# database size is set to", "# truncated sequence detection"]
  workerThreads <- headerValue "# number of worker threads:"
  _ <- string separatorLine
  _ <- newline
  optional newline
  hitsPerQuery <- many1 (try genParserMultipleCMSearch)
  optional (string "[ok]\n")
  eof
  return (CMsearch cmFile sequenceDatabase workerThreads (concat hitsPerQuery))
  where
    separatorLine = "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
    -- label, whitespace, rest of the line as value, line break
    headerValue label = string label *> skipMany1 space *> many1 (noneOf "\n") <* newline
72 |
-- | Parser for the output of a cmsearch run with a single query model.
--
-- The banner before the first separator line is skipped, the header fields
-- and the hit score table are read. The hit alignments and statistics that
-- follow the table are consumed but not interpreted.
--
-- The @rank@ column title is accepted with any number of leading blanks.
-- The original listed six identical alternatives that all required exactly
-- one blank.
genParserCMSearch :: GenParser Char st CMsearch
genParserCMSearch = do
  _ <- manyTill anyChar (try (string separatorLine))
  _ <- newline
  queryCMfile' <- headerValue "# query CM file:"
  targetSequenceDatabase' <- headerValue "# target sequence database:"
  -- these header lines are only present for some invocations
  optional (try (genParserCMsearchHeaderField "# CM configuration"))
  optional (try (genParserCMsearchHeaderField "# database size is set to"))
  optional (try (genParserCMsearchHeaderField "# truncated sequence detection"))
  numberOfWorkerThreads' <- headerValue "# number of worker threads:"
  _ <- string separatorLine
  _ <- newline
  optional newline
  _ <- string "Query:"
  skipMany1 (noneOf "\n")
  _ <- newline
  optional (try (genParserCMsearchHeaderField "Accession"))
  optional (try (genParserCMsearchHeaderField "Description"))
  _ <- string "Hit scores:"
  _ <- newline
  -- column title row; only the first two titles are checked
  _ <- try (skipMany1 (char ' ') *> string "rank")
  _ <- many1 space
  _ <- string "E-value"
  -- skip the remaining titles up to the first dash of the underline row
  _ <- manyTill anyChar (try (string "-"))
  skipMany1 (try (oneOf " -"))
  _ <- newline
  optional (try (string " ------ inclusion threshold ------"))
  skipMany newline
  hitScores' <- many (try genParserCMsearchHit)
  optional (try genParserCMsearchEmptyHit)
  -- this is followed by hit alignments and internal cmsearch statistics which are not parsed
  _ <- many anyChar
  eof
  return $ CMsearch queryCMfile' targetSequenceDatabase' numberOfWorkerThreads' hitScores'
  where
    separatorLine = "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
    -- label, whitespace, rest of the line as value, line break
    headerValue label = string label *> skipMany1 space *> many1 (noneOf "\n") <* newline
146 |
-- | Parsing function for CMSearches with multiple querymodels in one modelfile, e.g. clans
--
-- Reads one result section: the @Query:@ line, the optional accession and
-- description fields, the complete column title row, and the hit table.
-- Everything up to and including the terminating @//@ line is consumed.
--
-- The @rank@ column title is accepted with any number of leading blanks.
-- The original listed six identical alternatives that all required exactly
-- one blank.
genParserMultipleCMSearch :: GenParser Char st [CMsearchHit]
genParserMultipleCMSearch = do
  _ <- string "Query:"
  _ <- many1 (noneOf "\n")
  _ <- newline
  optional (try (genParserCMsearchHeaderField "Accession"))
  optional (try (genParserCMsearchHeaderField "Description"))
  _ <- string "Hit scores:"
  _ <- newline
  _ <- try (skipMany1 (char ' ') *> string "rank")
  -- every further column title is preceded by whitespace
  mapM_
    (\columnTitle -> many1 space *> string columnTitle)
    ["E-value", "score", "bias", "sequence", "start", "end", "mdl", "trunc", "gc", "description"]
  _ <- newline
  -- underline row
  _ <- string " -"
  _ <- many1 (try (oneOf " -"))
  _ <- newline
  optional (try (string " ------ inclusion threshold ------"))
  _ <- many newline
  hitScores' <- many (try genParserCMsearchHit)
  optional (try genParserCMsearchEmptyHit)
  -- this is followed by hit alignments and internal cmsearch statistics which are not parsed
  _ <- manyTill anyChar (try (string "//\n"))
  return hitScores'
192 |
-- | Consume one header line of the form @<fieldname>: <value>@ including its
-- line break. The value is discarded; the result is always the empty string.
genParserCMsearchHeaderField :: String -> GenParser Char st String
genParserCMsearchHeaderField fieldname =
  string (fieldname ++ ":")
    *> many1 space
    *> many1 (noneOf "\n")
    *> newline
    *> return []
200 |
-- | Consume the notice cmsearch prints instead of a hit table when nothing
-- was found, plus one optional following line break. Always yields an empty
-- hit list.
--
-- The notice is accepted with any positive number of leading blanks instead
-- of exactly one, so the parser does not depend on the indentation width
-- cmsearch uses for this line.
genParserCMsearchEmptyHit :: GenParser Char st [CMsearchHit]
genParserCMsearchEmptyHit = do
  skipMany1 (char ' ')
  _ <- string "[No hits detected that satisfy reporting thresholds]"
  _ <- newline
  optional (try newline)
  return []
207 |
-- | Parser for one row of the cmsearch hit score table.
--
-- Every column is preceded by whitespace. After the row, an optional
-- inclusion threshold marker and one optional line break are consumed.
genParserCMsearchHit :: GenParser Char st CMsearchHit
genParserCMsearchHit = do
  rank <- column (string "(" *> many1 digit <* string ")")
  significance <- column (choice [char '!', char '?'])
  eValue <- column number
  score <- column number
  bias <- column number
  sequenceHeader <- column (many1 (noneOf " "))
  start <- column (many1 digit)
  end <- column (many1 digit)
  strand <- column (choice [char '+', char '-', char '.'])
  model <- column (many1 letter)
  truncation <- column (many1 (choice [alphaNum, char '\'']))
  gcContent <- column number
  description <- column (many1 (noneOf "\n"))
  _ <- newline
  optional (try (string " ------ inclusion threshold ------"))
  optional (try newline)
  return $
    CMsearchHit
      (readInt rank)
      significance
      (readDouble eValue)
      (readDouble score)
      (readDouble bias)
      (B.pack sequenceHeader)
      (readInt start)
      (readInt end)
      strand
      (B.pack model)
      (B.pack truncation)
      (readDouble gcContent)
      (B.pack description)
  where
    -- skip the whitespace in front of a column, then parse the column
    column p = many1 space *> p
    -- characters of a decimal or exponent-notation number
    number = many1 (oneOf "0123456789.e-")
242 |
243 | --
-- | Convert a string to an 'Int' with 'read'; fails at runtime if the string
-- is not a valid integer literal.
readInt :: String -> Int
readInt digits = read digits
246 |
-- | Convert a string to a 'Double' with 'read'; fails at runtime if the
-- string is not a valid numeric literal.
readDouble :: String -> Double
readDouble number = read number
249 |
--------------------------------------------------------------------------------
/Biobase/RNAlien/RNAcentralHTTP.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 |
3 | {-# LANGUAGE DeriveGeneric #-}
4 |
5 | -- | Interface for the RNAcentral REST webservice.
6 | --
7 | module Biobase.RNAlien.RNAcentralHTTP (rnaCentralHTTP,
8 | buildSequenceViaMD5Query,
9 | buildStringViaMD5Query,
10 | getRNACentralEntries,
11 | showRNAcentralAlienEvaluation,
12 | RNAcentralEntryResponse(..),
13 | RNAcentralEntry(..)
14 | ) where
15 |
16 | import Network.HTTP.Conduit
17 | import qualified Data.ByteString.Lazy.Char8 as L8
18 | --import qualified Data.ByteString.Char8 as BS8
19 | import Network.Socket
20 | import Control.Concurrent
21 | import Data.Text
22 | import Data.Aeson
23 | import GHC.Generics
24 | import qualified Data.Digest.Pure.MD5 as M
25 | import Data.Either
26 | import Biobase.Fasta.Strict
27 | import Biobase.Types.BioSequence
28 |
29 | --Datatypes
-- | Data structure for RNAcentral entry response.
-- The JSON instances are derived generically with default options, so the
-- JSON keys are the record field names.
data RNAcentralEntryResponse = RNAcentralEntryResponse
  { count    :: Int
  , next     :: Maybe Text
  , previous :: Maybe Text
  , results  :: [RNAcentralEntry] -- ^ entries contained in this response
  }
  deriving (Show, Eq, Generic)

instance ToJSON RNAcentralEntryResponse where
  toJSON = genericToJSON defaultOptions

instance FromJSON RNAcentralEntryResponse
45 |
-- | A single entry of an RNAcentral response.
-- The JSON instances are derived generically with default options, so the
-- JSON keys are the record field names. @sequence@ and @length@ clash with
-- Prelude names and have to be used qualified.
data RNAcentralEntry = RNAcentralEntry
  { url           :: Text
  , rnacentral_id :: Text
  , md5           :: Text
  , sequence      :: Text
  , length        :: Int
  , xrefs         :: Text
  , publications  :: Text
  }
  deriving (Show, Eq, Generic)

instance ToJSON RNAcentralEntry where
  toJSON = genericToJSON defaultOptions

instance FromJSON RNAcentralEntry
63 |
-- | Send the query and decode the JSON response body. A body that cannot be
-- decoded yields @Left@ with the decoder's error message.
startSession :: String -> IO (Either String RNAcentralEntryResponse)
startSession query' = eitherDecode <$> withSocketsDo (sendQuery query')
72 |
-- | Send the query to the RNAcentral REST interface and return the raw
-- response body.
--
-- The query string is appended to the API address as is. The HTTPS endpoint
-- is used so that the request is not sent unencrypted, as it was with the
-- previous plain @http://@ address.
sendQuery :: String -> IO L8.ByteString
sendQuery query' = simpleHttp (address ++ query')
  where
    address = "https://rnacentral.org/api/v1/rna/"
80 |
-- | Query the RNAcentral REST interface once, without any delay.
rnaCentralHTTP :: String -> IO (Either String RNAcentralEntryResponse)
rnaCentralHTTP = startSession
85 |
-- | Query the RNAcentral REST interface after waiting 55000 microseconds.
-- The pause keeps sequential calls below the service's limit of 20 requests
-- per second.
delayedRNACentralHTTP :: String -> IO (Either String RNAcentralEntryResponse)
delayedRNACentralHTTP query' = threadDelay 55000 >> startSession query'
91 |
-- | Run every query in order through 'delayedRNACentralHTTP' and collect the
-- per-query results (decode failures stay in the list as 'Left').
getRNACentralEntries :: [String] -> IO [Either String RNAcentralEntryResponse]
getRNACentralEntries = mapM delayedRNACentralHTTP
95 |
-- | Build the @?md5=...@ query string for a fasta record.
--
-- The sequence has every @\'U\'@ replaced by @\'T\'@ (see 'bsreplaceUT') and all
-- line breaks removed before the MD5 digest is taken.
--
-- TODO [chzs] consider using strict bytestring as long as possible.
--
-- TODO [chzs] consider giving useful typelevel names to the types in @Fasta@.
-- One may give a type-level name to the sequence identifier, and an identifier
-- (like @DNA@) to the biosequence type.
buildSequenceViaMD5Query :: Fasta () () -> String
buildSequenceViaMD5Query s = "?md5=" ++ show digest
  where
    rawSequence = L8.fromStrict (_bioSequence (_fasta s))
    normalized  = L8.filter (/= '\n') (L8.map bsreplaceUT rawSequence)
    digest      = M.md5 normalized
111 |
-- | Build the @?md5=...@ query string for a sequence given as a plain 'String'.
--
-- Applies the same normalisation as 'buildSequenceViaMD5Query': @\'U\'@ becomes
-- @\'T\'@ and line breaks are dropped before hashing.
buildStringViaMD5Query :: String -> String
buildStringViaMD5Query s = "?md5=" ++ show digest
  where
    normalized = L8.filter (/= '\n') (L8.map bsreplaceUT (L8.pack s))
    digest     = M.md5 normalized
119 |
-- | Render all entries of the successful responses as a tab-separated table
-- with a header line. Failed ('Left') responses are ignored; when no entry is
-- left a fixed "nothing found" message is returned instead of the table.
showRNAcentralAlienEvaluation :: [Either String RNAcentralEntryResponse] -> String
showRNAcentralAlienEvaluation responses
  | Prelude.null tableBody = "No matching sequences found in RNAcentral\n"
  | otherwise = "rnacentral_id\tmd5\tlength\n" ++ tableBody
  where
    foundEntries = Prelude.concatMap results (rights responses)
    tableBody = Prelude.concatMap showRNAcentralAlienEvaluationLine foundEntries
126 |
-- | One table row for an entry: RNAcentral id, md5 and length, separated by
-- tabs and terminated by a newline.
showRNAcentralAlienEvaluationLine :: RNAcentralEntry -> String
showRNAcentralAlienEvaluationLine entry = idColumn ++ "\t" ++ md5Column ++ "\t" ++ lengthColumn ++ "\n"
  where
    idColumn = unpack (rnacentral_id entry)
    md5Column = unpack (md5 entry)
    -- The record field is qualified because it clashes with Prelude's length.
    lengthColumn = show (Biobase.RNAlien.RNAcentralHTTP.length entry)
129 |
-- | Map an uppercase @\'U\'@ to @\'T\'@; every other character, including a
-- lowercase @\'u\'@, is returned unchanged.
bsreplaceUT :: Char -> Char
bsreplaceUT 'U' = 'T'
bsreplaceUT other = other
134 |
135 |
--------------------------------------------------------------------------------
/Biobase/RNAlien/Types.hs:
--------------------------------------------------------------------------------
1 | -- | This module contains data structures for RNAlien
2 |
3 | module Biobase.RNAlien.Types where
4 |
5 | import Biobase.Fasta.Strict
6 | import Biobase.Taxonomy.Import
7 | import Biobase.StockholmAlignment.Types
8 | --import Biobase.Types.BioSequence
9 | import qualified Data.ByteString.Char8 as B
10 |
-- | Static construction options.
--
-- A value is assembled positionally (e.g. in the @main@ of RNAlienScan), so
-- the order of the fields is part of the interface.
data StaticOptions = StaticOptions
  { tempDirPath            :: String
  , sessionID              :: String
  , nSCICutoff             :: Double
  , userTaxId              :: Maybe Int
  , singleHitperTaxToggle  :: Bool
  , querySelectionMethod   :: String
  , queryNumber            :: Int
  , lengthFilterToggle     :: Bool
  , coverageFilterToggle   :: Bool
  , blastSoftmaskingToggle :: Bool
  , cpuThreads             :: Int
  , blastDatabase          :: Maybe String
  , taxRestriction         :: Maybe String
  , verbositySwitch        :: Bool
  , offline                :: Bool
  , genomeFastasPath       :: String
  , ncbiTaxonomyDumpPath   :: String
  } deriving (Show)
31 |
-- | Keeps track of model construction.
--
-- Values are built positionally elsewhere, so the field order must not change.
data ModelConstruction = ModelConstruction
  { iterationNumber       :: Int
  , inputFasta            :: [Fasta () ()]
    -- unique seed sequences
  , taxRecords            :: [TaxonomyRecord]
    -- additional similar sequences - collected by full similarity to previously found entries
  , similarRecords        :: [TaxonomyRecord]
    -- Taxonomy ID of the highest node in taxonomic subtree used in search
  , upperTaxonomyLimit    :: Maybe Int
  , taxonomicContext      :: Maybe Lineage
  , evalueThreshold       :: Double
  , alignmentModeInfernal :: Bool
  , selectedQueries       :: [Fasta () ()]
  , potentialMembers      :: [SearchResult]
  , genomeFastas          :: [Fasta () ()]
  , inputAlignment        :: Maybe StockholmAlignment
  }

-- | Multi-line, human-readable report of the construction state.
-- 'alignmentModeInfernal' is not part of the report, and for 'genomeFastas'
-- only the number of entries is printed.
instance Show ModelConstruction where
  show mc = concat
    [ "Modelconstruction iteration: " ++ show (iterationNumber mc) ++ "\n"
    , "Input fasta:\n" ++ concatMap (convertString . fastaToByteString 80) (inputFasta mc)
    , "Input alignment:\n" ++ maybe "not provided" show (inputAlignment mc) ++ "\n"
    , "Taxonomy records:\n" ++ show (taxRecords mc) ++ "\n"
    , "Similar records:\n" ++ show (similarRecords mc) ++ "\n"
    , "Upper taxonomy limit: " ++ maybe "not set" show (upperTaxonomyLimit mc) ++ "\n"
    , "Taxonomic Context: " ++ maybe "not set" show (taxonomicContext mc) ++ "\n"
    , "Evalue cutoff: " ++ show (evalueThreshold mc) ++ "\n"
    , "Selected queries: \n" ++ concatMap show (selectedQueries mc)
    , "Potential Members: \n" ++ concatMap show (potentialMembers mc)
    , "Number of genomes for RNAlienScan: " ++ show (length (genomeFastas mc)) ++ "\n"
    ]
65 |
-- | Sequence records grouped under one taxonomy id.
data TaxonomyRecord = TaxonomyRecord
  { recordTaxonomyId :: Int
  , sequenceRecords  :: [SequenceRecord]
  }

-- | Prints the taxonomy id on its own line, followed by the shown list of
-- sequence records.
instance Show TaxonomyRecord where
  show record = idLine ++ show (sequenceRecords record)
    where
      idLine = "TaxonomyRecord TaxonomyId: " ++ show (recordTaxonomyId record) ++ "\n"
75 |
-- | A sequence together with its alignment state and a description.
data SequenceRecord = SequenceRecord
  { -- Sequence consisting of SeqLabel, and SeqData
    nucleotideSequence :: Fasta () ()
    -- 0 is unaligned, number is the iteration the sequence has been included into the alignment
  , aligned            :: Int
  , recordDescription  :: B.ByteString
  }

-- | Three-line report: description, alignment iteration and the sequence.
instance Show SequenceRecord where
  show record = concat
    [ "Record Description: " ++ B.unpack (recordDescription record) ++ "\n"
    , "Aligned in iteration: " ++ show (aligned record) ++ "\n"
    , "Sequence:" ++ show (nucleotideSequence record) ++ "\n"
    ]
-- | Header information of a cmsearch run together with its list of hits.
data CMsearch = CMsearch
  { queryCMfile            :: String
  , targetSequenceDatabase :: String
  , numberOfWorkerThreads  :: String
  , cmsearchHits           :: [CMsearchHit]
    -- hitAlignments :: [CMsearchHitAlignment]
    -- internalCMPipelineStatisticsSummary
  } deriving (Show, Eq, Read)
98 |
-- | One hit entry from the hit list of a 'CMsearch' result.
data CMsearchHit = CMsearchHit
  { hitRank           :: Int
  , hitSignificance   :: Char
  , hitEvalue         :: Double
  , hitScore          :: Double
  , hitBias           :: Double
  , hitSequenceHeader :: B.ByteString
  , hitStart          :: Int
  , hitEnd            :: Int
  , hitStrand         :: Char
  , hitModel          :: B.ByteString
  , hitTruncation     :: B.ByteString
  , hitGCContent      :: Double
  , hitDescription    :: B.ByteString
  } deriving (Show, Eq, Read)
115 |
-- | Candidate triples collected by a search, plus an optional blast database
-- size.
data SearchResult = SearchResult
  { candidates        :: [(Fasta () (), Int, B.ByteString)]
  , blastDatabaseSize :: Maybe Double
  }

-- | Prints all shown candidate triples back to back, then the database size.
instance Show SearchResult where
  show result = candidateLines ++ sizeLine
    where
      candidateLines = "SearchResults :\n " ++ concatMap show (candidates result) ++ "\n"
      sizeLine = "BlastDb Size: " ++ show (blastDatabaseSize result) ++ "\n"
125 |
-- | Covariance model statistics (one record per model).
data CMstat = CMstat
  { statIndex              :: Int
  , statName               :: String
  , statAccession          :: String
  , statSequenceNumber     :: Int
  , statEffectiveSequences :: Double
  , statConsensusLength    :: Int
    -- W The expected maximum length of a hit to the model.
  , statW                  :: Int
  , statBasepairs          :: Int
  , statBifurcations       :: Int
  , statModel              :: String
  , relativeEntropyCM      :: Double
  , relativeEntropyHMM     :: Double
  } deriving (Eq, Read)

-- | Labelled report with one line per field. The 'String' fields go through
-- 'show' as well, so they appear in double quotes.
instance Show CMstat where
  show s = concat
    [ "CMstat - covariance model statistics:\nIndex: " ++ show (statIndex s) ++ "\n"
    , "Name: " ++ show (statName s) ++ "\n"
    , "Accession: " ++ show (statAccession s) ++ "\n"
    , "Sequence Number: " ++ show (statSequenceNumber s) ++ "\n"
    , "Effective Sequences: " ++ show (statEffectiveSequences s) ++ "\n"
    , "Consensus length: " ++ show (statConsensusLength s) ++ "\n"
    , "Expected maximum hit-length: " ++ show (statW s) ++ "\n"
    , "Basepairs: " ++ show (statBasepairs s) ++ "\n"
    , "Bifurcations: " ++ show (statBifurcations s) ++ "\n"
    , "Modeltype: " ++ show (statModel s) ++ "\n"
    , "Relative Entropy CM: " ++ show (relativeEntropyCM s) ++ "\n"
    , "Relative Entropy HMM: " ++ show (relativeEntropyHMM s) ++ "\n"
    ]
157 |
--------------------------------------------------------------------------------
/Biobase/RNAlienScan.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE RecordWildCards #-}
2 | {-# LANGUAGE DeriveDataTypeable #-}
3 |
4 | -- | Unsupervised construction of RNA family models
5 | -- For more information on RNA family models consult
6 | -- Usage example: RNAlien -i /path/input.fa -c 5 -o /outdir/
7 | -- Usage example offline mode: RNAlien -i /path/input.fa -b /backup/blast/nt_v5 -o /outdir/ -c 5 -t 1396 -j
8 | module Main where
9 |
10 | import System.Console.CmdArgs
11 | import System.Directory
12 | import Biobase.RNAlien.Types
13 | import Biobase.RNAlien.Library
14 | import Data.Maybe
15 | import Data.Either.Unwrap
16 | import Data.Time
17 | import qualified System.FilePath as FP
18 | import Paths_RNAlien (version)
19 | import Data.Version (showVersion)
20 | --import Biobase.Fasta.Streaming
21 | import Control.Monad
22 | import qualified Biobase.StockholmAlignment.Import as BS
23 |
-- | Command line options of RNAlienScan. The flag letter, default and help
-- text of every field are declared in 'options'.
data Options = Options
  { inputFastaFilePath        :: String
  , inputAlignmentFilePath    :: String
  , inputGenomesFastaFilePath :: String
  , outputPath                :: String
  , inputnSCICutoff           :: Maybe Double
  , inputEvalueCutoff         :: Maybe Double
  , lengthFilter              :: Bool
  , coverageFilter            :: Bool
  , singleHitperTax           :: Bool
  , blastSoftmasking          :: Bool
  , inputQuerySelectionMethod :: String
  , inputQueryNumber          :: Int
  , threads                   :: Int
  , sessionIdentificator      :: Maybe String
  , performEvaluation         :: Bool
  , checkSetup                :: Bool
  } deriving (Show,Data,Typeable)
42 |
-- | cmdargs description of the RNAlienScan command line: default value, flag
-- letter and help text per field.
--
-- The annotated expressions are intentionally left inline in the record
-- construction; only the layout differs from a plain comma-terminated record.
options :: Options
options = Options
  { inputFastaFilePath = def &= name "i" &= help "Path to input fasta file"
  , inputAlignmentFilePath = def &= name "p" &= help "Path to input alignment file"
  , inputGenomesFastaFilePath = def &= name "b" &= help "Path to input genome fasta files"
  , outputPath = def &= name "o" &= help "Path to output directory. Default: current working directory"
  , inputnSCICutoff = Just (1 :: Double) &= name "z" &= help "Only candidate sequences with a normalized structure conservation index (nSCI) higher than this value are accepted. Default: 1"
  , inputEvalueCutoff = Just (0.001 :: Double) &= name "e" &= help "Evalue cutoff for cmsearch filtering. Default: 0.001"
  , lengthFilter = True &= name "l" &= help "Filter blast hits per genomic length. Default: True"
  , coverageFilter = True &= name "a" &= help "Filter blast hits by coverage of at least 80%. Default: True"
  , singleHitperTax = False &= name "s" &= help "Only the best blast hit per taxonomic entry is considered. Default: False"
  , blastSoftmasking = False &= name "f" &= help "Toggles blast query softmasking, meaning masking of non-conserved regions on the query. Default: False"
  , inputQuerySelectionMethod = "filtering" &= name "m" &= help "Method for selection of queries (filtering,clustering). Default: filtering"
  , inputQueryNumber = (5 :: Int) &= name "n" &= help "Number of queries used for candidate search. Default: 5"
  , threads = 1 &= name "c" &= help "Number of available cpu slots/cores. Default: 1"
  , sessionIdentificator = Nothing &= name "d" &= help "Optional session id that is used instead of automatically generated one."
  , performEvaluation = True &= name "x" &= help "Perform evaluation step. Default: True"
  , checkSetup = False &= name "g" &= help "Just prints installed tool versions and performs connection check. Default: False"
  } &= summary ("RNAlienScan " ++ alienVersion) &= help "Florian Eggenhofer - 2019" &= verbosity
62 |
-- | Entry point of RNAlienScan.
--
-- Creates the session directory and log files below the output path and then
-- runs in one of two modes:
--
-- * no input fasta (@-i@) given: the Stockholm alignment (@-p@) is read, an
--   initial covariance model is built and calibrated in iteration directory
--   @0@, queries are selected and the scan continues from iteration 1;
-- * otherwise: the scan starts directly from the input fasta sequences.
--
-- Both modes need genome fasta sequences (@-b@). At the end @result.csv@ is
-- written, the optional evaluation is appended to the @Log@ file, the result
-- summary is produced and an empty @done@ marker file is created.
main :: IO ()
main = do
  Options{..} <- cmdArgs options
  verboseLevel <- getVerbosity
  -- Generate SessionID
  sessionId <- createSessionID sessionIdentificator
  timestamp <- getCurrentTime
  currentWorkDirectory <- getCurrentDirectory
  let selectedOutputPath = if null outputPath then currentWorkDirectory else outputPath
  let temporaryDirectoryPath = FP.addTrailingPathSeparator selectedOutputPath ++ sessionId ++ "/"
  createDirectoryIfMissing False temporaryDirectoryPath
  setupCheckScanWithLog inputQuerySelectionMethod temporaryDirectoryPath
  createDirectoryIfMissing False (temporaryDirectoryPath ++ "log")
  -- Create Log files
  writeFile (temporaryDirectoryPath ++ "Log") ("RNAlienScan " ++ alienVersion ++ "\n")
  writeFile (temporaryDirectoryPath ++ "log/warnings") ""
  logMessage ("Timestamp: " ++ show timestamp ++ "\n") temporaryDirectoryPath
  logMessage ("Temporary Directory: " ++ temporaryDirectoryPath ++ "\n") temporaryDirectoryPath
  let iterationNumber = 0
  -- The static options are the same for both input modes.
  let staticOptions = StaticOptions temporaryDirectoryPath sessionId (fromJust inputnSCICutoff) Nothing singleHitperTax inputQuerySelectionMethod inputQueryNumber lengthFilter coverageFilter blastSoftmasking threads Nothing Nothing (setVerbose verboseLevel) True inputGenomesFastaFilePath []
  -- Shared tail of both modes: result table, optional evaluation, summary, done marker.
  let finishRun modelConstructionResults = do
        let resultTaxonomyRecordsCSVTable = constructTaxonomyRecordsCSVTable modelConstructionResults
        writeFile (temporaryDirectoryPath ++ "result.csv") resultTaxonomyRecordsCSVTable
        when performEvaluation $ do
          resultEvaluation <- evaluateConstructionResult staticOptions modelConstructionResults
          appendFile (temporaryDirectoryPath ++ "Log") resultEvaluation
        resultSummary modelConstructionResults staticOptions
        writeFile (temporaryDirectoryPath ++ "done") ""
  if null inputFastaFilePath
    then do
      alignmentInput <- BS.readExistingStockholm inputAlignmentFilePath
      inputGenomesFasta <- readFastaFile inputGenomesFastaFilePath
      -- The genomes are passed with -b; -s is the single-hit-per-taxon toggle.
      when (null inputGenomesFasta) (error "Please provide input genomes with the cmd line parameter -b")
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      -- Only the first alignment of the file is used. An empty parse result is
      -- reported explicitly instead of failing inside 'head'.
      rightAlignment <- case alignmentInput of
        Left alignmentError -> error alignmentError
        Right [] -> error ("No alignment found in input alignment file: " ++ inputAlignmentFilePath)
        Right (firstAlignment:_) -> return firstAlignment
      let reformatedFastaInput = stockholmAlignmentToFasta rightAlignment
      when (null reformatedFastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] Nothing Nothing (fromJust inputEvalueCutoff) False [] [] inputGenomesFasta (Just rightAlignment)
      logMessage (show initialization) temporaryDirectoryPath
      -- prepare next iteration
      let nextModelConstructionInput = constructNext iterationNumber initialization [] [] Nothing Nothing [] [] True
      let outputDirectory = tempDirPath staticOptions ++ "0" ++ "/"
      createDirectory outputDirectory
      let fastaFilePath = outputDirectory ++ "model" ++ ".fa"
      writeFastaFile fastaFilePath reformatedFastaInput
      let stockholmFilepath = outputDirectory ++ "model" ++ ".stockholm"
      let cmFilepath = outputDirectory ++ "model" ++ ".cm"
      let cmCalibrateFilepath = outputDirectory ++ "model" ++ ".cmcalibrate"
      let cmBuildFilepath = outputDirectory ++ "model" ++ ".cmbuild"
      copyFile inputAlignmentFilePath stockholmFilepath
      let refinedAlignmentFilepath = outputDirectory ++ "modelrefined.stockholm"
      let cmBuildOptions = "--refine " ++ refinedAlignmentFilepath
      _ <- systemCMbuild cmBuildOptions stockholmFilepath cmFilepath cmBuildFilepath
      _ <- systemCMcalibrate "fast" (cpuThreads staticOptions) cmFilepath cmCalibrateFilepath
      writeFile (outputDirectory ++ "done") ""
      -- select queries
      currentSelectedQueries <- selectQueries staticOptions nextModelConstructionInput []
      let nextScanModelConstructionInputWithQueries = initialization {iterationNumber = (1 :: Int), selectedQueries = currentSelectedQueries}
      modelConstructionResults <- scanModelConstructer staticOptions nextScanModelConstructionInputWithQueries
      finishRun modelConstructionResults
    else do
      fastaInput <- readFastaFile inputFastaFilePath
      when (null fastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      inputGenomesFasta <- readFastaFile inputGenomesFastaFilePath
      when (null inputGenomesFasta) (error "Please provide input genomes with the cmd line parameter -b")
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      let reformatedFastaInput = map reformatFasta fastaInput
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] Nothing Nothing (fromJust inputEvalueCutoff) False [] [] inputGenomesFasta Nothing
      logMessage (show initialization) temporaryDirectoryPath
      modelConstructionResults <- scanModelConstructer staticOptions initialization
      finishRun modelConstructionResults
153 |
-- | Version string shown in the program summary and written to the Log file,
-- rendered with 'showVersion' from the 'version' value of @Paths_RNAlien@.
alienVersion :: String
alienVersion = showVersion version
156 |
--------------------------------------------------------------------------------
/Biobase/RNAlienStatistics.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE RecordWildCards #-}
2 | {-# LANGUAGE DeriveDataTypeable #-}
3 |
4 | -- | Statistics for RNAlien Results
5 | -- dist/build/RNAlienStatistics/RNAlienStatistics -s bitscore -i /scratch/egg/temp/cm13676/1/model.cm -r /home/mescalin/egg/current/Data/AlienTest/cms/BsrG.cm -g /scratch/egg/temp/AlienSearch/genomes/ -o /scratch/egg/temp/AlienStatistics
6 | module Main where
7 |
8 | import System.Console.CmdArgs
9 | import Data.Either.Unwrap
10 | import System.Process
11 | import qualified Data.ByteString.Char8 as B
12 | import Biobase.RNAlien.Library
13 | import System.Directory
14 | import Biobase.Fasta.Strict
15 | import Data.List
16 | import qualified System.FilePath as FP
17 | import qualified Data.List.Split as DS
18 | import Text.Printf
19 | import Bio.RNAzParser
20 | import qualified Bio.RNAcodeParser as RC
21 | import Biobase.Types.BioSequence
22 |
-- | Command line options of RNAlienStatistics. The flag letter, default and
-- help text of every field are declared in 'options'.
data Options = Options
  { alienCovarianceModelPath :: String
  , alienrnazPath            :: String
  , alienrnacodePath         :: String
  , aliencmstatPath          :: String
  , rfamCovarianceModelPath  :: String
  , rfamFastaFilePath        :: String
  , alienFastaFilePath       :: String
  , rfamModelName            :: String
  , rfamModelId              :: String
  , rfamThreshold            :: Double
  , alienThreshold           :: Double
  , databaseSize             :: Maybe Double
  , outputDirectoryPath      :: String
  , benchmarkIndex           :: Int
  , thresholdSelection       :: String
  , linkScores               :: Bool
  , threads                  :: Int
  } deriving (Show,Data,Typeable)
42 |
-- | cmdargs description of the RNAlienStatistics command line: default value,
-- flag letter and help text per field.
--
-- The annotated expressions are intentionally left inline in the record
-- construction; only the layout differs from a plain comma-terminated record.
options :: Options
options = Options
  { alienCovarianceModelPath = def &= name "i" &= help "Path to alienCovarianceModelPath"
  , alienrnazPath = def &= name "z" &= help "Path to alienRNAzResult"
  , alienrnacodePath = def &= name "w" &= help "Path to alienRNAcodeResult"
  , aliencmstatPath = def &= name "m" &= help "Path to aliencmstatResult"
  , rfamCovarianceModelPath = def &= name "r" &= help "Path to rfamCovarianceModelPath"
  , rfamFastaFilePath = def &= name "g" &= help "Path to rfamFastaFile"
  , rfamModelName = def &= name "n" &= help "Rfam model name"
  , rfamModelId = def &= name "d" &= help "Rfam model id"
  , alienFastaFilePath = def &= name "a" &= help "Path to alienFastaFile"
  , outputDirectoryPath = def &= name "o" &= help "Path to output directory"
  , alienThreshold = 20 &= name "t" &= help "Bitscore threshold for RNAlien model hits on Rfam fasta, default 20"
  , rfamThreshold = 20 &= name "x" &= help "Bitscore threshold for Rfam model hits on Alien fasta, default 20"
  , databaseSize = Nothing &= name "k" &= help "Cmsearch database size in mega bases. default not set"
  , benchmarkIndex = 1 &= name "b" &= help "Index used to identify sRNA tagged RNA families"
  , thresholdSelection = "bitscore" &= name "s" &= help "Selection method, (bitscore, evalue), default bitscore"
  , linkScores = False &= name "l" &= help "Triggers computation of linkscores via CMCompare"
  , threads = 1 &= name "c" &= help "Number of available cpu slots/cores, default 1"
  } &= summary "RNAlienStatistics" &= help "Florian Eggenhofer - >2013" &= verbosity
63 |
-- | Run cmsearch with a covariance model on a fasta file and return the hits
-- that pass the threshold, at most one per sequence header.
--
-- The cmsearch output goes to @outputDirectory/modelType/benchmarkIndex.cmsearch@
-- and is read back with 'readCMSearch'. When a database size is given it is
-- passed to cmsearch as @-Z@. If the output cannot be read, the failure is
-- printed and an empty list is returned.
cmSearchFasta :: Int -> String -> Double -> Maybe Double -> Int -> String -> String -> String -> String -> IO [CMsearchHit]
cmSearchFasta benchmarkIndex thresholdSelection thresholdScore databaseSize cpuThreads covarianceModelPath outputDirectory modelType fastapath = do
  createDirectoryIfMissing False modelDirectory
  _ <- systemCMsearch cpuThreads databaseSizeOption covarianceModelPath fastapath searchOutputPath
  parsed <- readCMSearch searchOutputPath
  case parsed of
    Left failure -> print failure >> return []
    Right searchResult ->
      return (nubBy cmSearchSameHit (filterCMsearchHits thresholdSelection thresholdScore searchResult))
  where
    modelDirectory = outputDirectory ++ "/" ++ modelType
    searchOutputPath = modelDirectory ++ "/" ++ show benchmarkIndex ++ ".cmsearch"
    databaseSizeOption = maybe "" (\dbs -> " -Z " ++ show dbs ++ " ") databaseSize
80 |
-- | Same procedure as 'cmSearchFasta', except that the cmsearch output is read
-- back with 'readCMSearches'.
--
-- The cmsearch output goes to @outputDirectory/modelType/benchmarkIndex.cmsearch@.
-- If it cannot be read, the failure is printed and an empty list is returned;
-- otherwise the hits passing the threshold are returned, at most one per
-- sequence header.
cmSearchesFasta :: Int -> String -> Double -> Maybe Double -> Int -> String -> String -> String -> String -> IO [CMsearchHit]
cmSearchesFasta benchmarkIndex thresholdSelection thresholdScore databaseSize cpuThreads covarianceModelPath outputDirectory modelType fastapath = do
  createDirectoryIfMissing False modelDirectory
  _ <- systemCMsearch cpuThreads databaseSizeOption covarianceModelPath fastapath searchOutputPath
  parsed <- readCMSearches searchOutputPath
  case parsed of
    Left failure -> print failure >> return []
    Right searchResult ->
      return (nubBy cmSearchSameHit (filterCMsearchHits thresholdSelection thresholdScore searchResult))
  where
    modelDirectory = outputDirectory ++ "/" ++ modelType
    searchOutputPath = modelDirectory ++ "/" ++ show benchmarkIndex ++ ".cmsearch"
    databaseSizeOption = maybe "" (\dbs -> " -Z " ++ show dbs ++ " ") databaseSize
100 |
-- | Keep the hits that pass the threshold.
--
-- With the selection method @"bitscore"@ a hit passes when its score is at
-- least the threshold; with any other method a hit passes when its e-value is
-- at most the threshold.
filterCMsearchHits :: String -> Double -> CMsearch -> [CMsearchHit]
filterCMsearchHits thresholdSelection thresholdScore cmSearchResult =
  filter passes (cmsearchHits cmSearchResult)
  where
    passes hit
      | thresholdSelection == "bitscore" = hitScore hit >= thresholdScore
      | otherwise = hitEvalue hit <= thresholdScore
107 |
-- | Split the hits into (selected, rejected) by the threshold.
--
-- With the selection method @"bitscore"@ a hit is selected when its score is
-- at least the threshold; with any other method a hit is selected when its
-- e-value is at most the threshold.
partitionCMsearchHits :: String -> Double -> CMsearch -> ([CMsearchHit],[CMsearchHit])
partitionCMsearchHits thresholdSelection thresholdScore cmSearchResult =
  partition selected (cmsearchHits cmSearchResult)
  where
    selected hit
      | thresholdSelection == "bitscore" = hitScore hit >= thresholdScore
      | otherwise = hitEvalue hit <= thresholdScore
114 |
-- | Trim the first sequence of a fasta file to the region of the first
-- cmsearch hit (see 'trimCMsearchSequence') and write it to
-- @outputFolder/modelType/fastafile@.
--
-- The input is read from @genomesDirectory/fastafile@. A file without any
-- fasta record is reported with an error that names the file, instead of the
-- anonymous empty-list failure of 'head'.
trimCMsearchFastaFile :: String -> String -> String -> CMsearch -> String -> IO ()
trimCMsearchFastaFile genomesDirectory outputFolder modelType cmsearch fastafile = do
  let fastaInputPath = genomesDirectory ++ "/" ++ fastafile
  let fastaOutputPath = outputFolder ++ "/" ++ modelType ++ "/" ++ fastafile
  fastaSequences <- readFastaFile fastaInputPath
  case fastaSequences of
    [] -> error ("trimCMsearchFastaFile: no fasta sequence found in " ++ fastaInputPath)
    (firstSequence:_) -> writeFastaFile fastaOutputPath [trimCMsearchSequence cmsearch firstSequence]
122 |
-- | Cut a fasta record down to the region of the first cmsearch hit.
--
-- The new header is the shown old header followed by
-- @cmS_<start>_<end>_<strand>@ of that hit. Uses 'head' on the hit list, so a
-- result without hits fails.
-- NOTE(review): sequence and header are turned into strings with 'show';
-- confirm that this yields the bare text and not a quoted rendering.
trimCMsearchSequence :: CMsearch -> Fasta () () -> Fasta () ()
trimCMsearchSequence cmSearchResult inputSequence =
  Fasta trimmedHeader (BioSequence (B.pack trimmedResidues))
  where
    firstHit = head (cmsearchHits cmSearchResult)
    regionStart = hitStart firstHit
    regionEnd = hitEnd firstHit
    trimmedResidues = cmSearchsubString regionStart regionEnd (show (_fasta inputSequence))
    trimmedHeader = SequenceIdentifier (B.pack (show (_header inputSequence) ++ "cmS_" ++ show regionStart ++ "_" ++ show regionEnd ++ "_" ++ show (hitStrand firstHit)))
130 |
-- | Two hits count as the same hit when their sequence headers are equal.
-- With paralogs allowed.
cmSearchSameHit :: CMsearchHit -> CMsearchHit -> Bool
cmSearchSameHit hitscore1 hitscore2 =
  hitSequenceHeader hitscore1 == hitSequenceHeader hitscore2
138 |
-- | Two hits belong to the same organism when the parts of their sequence
-- headers in front of the first separator are equal. The separator is chosen
-- per header by 'selectSeparationChar'.
cmSearchSameOrganism :: CMsearchHit -> CMsearchHit -> Bool
cmSearchSameOrganism hitscore1 hitscore2 = organismOf hitscore1 == organismOf hitscore2
  where
    organismOf hit = head (DS.splitOn (selectSeparationChar header) header)
      where
        header = B.unpack (hitSequenceHeader hit)
149 |
-- | Separator used to split a sequence header: @":"@ when the header contains
-- a colon, @"/"@ otherwise.
selectSeparationChar :: String -> String
selectSeparationChar inputString =
  if ':' `elem` inputString
    then ":"
    else "/"
154 |
155 | main :: IO ()
156 | main = do
157 | Options{..} <- cmdArgs options
158 | rfamModelExists <- doesFileExist rfamCovarianceModelPath
159 | verbose <- getVerbosity
160 | rnazString <- rnazOutput verbose alienrnazPath
161 | rnacodeString <- rnaCodeOutput verbose alienrnacodePath
162 | cmStatString <- cmStatOutput verbose aliencmstatPath
163 | if rfamModelExists
164 | then do
165 | --compute linkscore
166 | linkscore <- if linkScores
167 | then compareCM rfamCovarianceModelPath alienCovarianceModelPath outputDirectoryPath
168 | else return (Left "-")
169 | rfamMaxLinkScore <- if linkScores then compareCM rfamCovarianceModelPath rfamCovarianceModelPath outputDirectoryPath else return (Left "-")
170 | alienMaxLinkscore <- if linkScores then compareCM alienCovarianceModelPath alienCovarianceModelPath outputDirectoryPath else return (Left "-")
171 | _ <- system ("cat " ++ rfamFastaFilePath ++ " | grep '>' | wc -l >" ++ outputDirectoryPath ++ FP.takeFileName rfamFastaFilePath ++ ".entries")
172 | _ <- system ("cat " ++ alienFastaFilePath ++ " | grep '>' | wc -l >" ++ outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries")
173 | rfamFastaEntries <- readFile (outputDirectoryPath ++ FP.takeFileName rfamFastaFilePath ++ ".entries")
174 | alienFastaEntries <- readFile (outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries")
175 | let rfamFastaEntriesNumber = read rfamFastaEntries :: Int
176 | let alienFastaEntriesNumber = read alienFastaEntries :: Int
177 | rfamonAlienResults <- cmSearchesFasta benchmarkIndex thresholdSelection rfamThreshold databaseSize threads rfamCovarianceModelPath outputDirectoryPath "rfamOnAlien" alienFastaFilePath
178 | alienonRfamResults <- cmSearchFasta benchmarkIndex thresholdSelection alienThreshold databaseSize threads alienCovarianceModelPath outputDirectoryPath "alienOnRfam" rfamFastaFilePath
179 | let rfamonAlienResultsNumber = length rfamonAlienResults
180 | let alienonRfamResultsNumber = length alienonRfamResults
181 | let rfamonAlienRecovery = (fromIntegral rfamonAlienResultsNumber :: Double) / (fromIntegral alienFastaEntriesNumber :: Double)
182 | let alienonRfamRecovery = (fromIntegral alienonRfamResultsNumber :: Double) / (fromIntegral rfamFastaEntriesNumber :: Double)
183 | if verbose == Loud
184 | then do
185 | putStrLn ("BenchmarkIndex: " ++ show benchmarkIndex)
186 | putStrLn ("RfamModelName: " ++ rfamModelName)
187 | putStrLn ("RfamModelId: " ++ rfamModelId)
188 | putStrLn ("Linkscore: " ++ either id show linkscore)
189 | putStrLn ("rfamMaxLinkScore: " ++ either id show rfamMaxLinkScore)
190 | putStrLn ("alienMaxLinkscore: " ++ either id show alienMaxLinkscore)
191 | putStrLn ("rfamGatheringThreshold: " ++ show rfamThreshold)
192 | putStrLn ("alienGatheringThreshold: " ++ show alienThreshold)
193 | putStrLn ("rfamFastaEntriesNumber: " ++ show rfamFastaEntriesNumber)
194 | putStrLn ("alienFastaEntriesNumber: " ++ show alienFastaEntriesNumber)
195 | putStrLn ("rfamonAlienResultsNumber: " ++ show rfamonAlienResultsNumber)
196 | putStrLn ("alienonRfamResultsNumber: " ++ show alienonRfamResultsNumber)
197 | putStrLn ("RfamonAlienRecovery: " ++ show rfamonAlienRecovery)
198 | putStrLn ("AlienonRfamRecovery: " ++ show alienonRfamRecovery)
199 | print rnazString
200 | print rnacodeString
201 | print cmStatString
202 | else
203 | putStrLn (show benchmarkIndex ++ "\t" ++ rfamModelName ++ "\t" ++ rfamModelId ++ "\t" ++ (either id show linkscore) ++ "\t" ++ (either id show rfamMaxLinkScore) ++ "\t" ++ (either id show alienMaxLinkscore) ++ "\t" ++ show rfamThreshold ++ "\t" ++ show alienThreshold ++ "\t" ++ show rfamFastaEntriesNumber ++ "\t" ++ show alienFastaEntriesNumber ++ "\t" ++ show rfamonAlienResultsNumber ++ "\t" ++ show alienonRfamResultsNumber ++ "\t" ++ printf "%.2f" rfamonAlienRecovery ++ "\t" ++ printf "%.2f" alienonRfamRecovery ++ "\t" ++ rnazString ++ "\t" ++ rnacodeString ++ "\t" ++ cmStatString)
204 | else do
205 | --compute linkscore
206 | alienMaxLinkscore <- if linkScores then compareCM alienCovarianceModelPath alienCovarianceModelPath outputDirectoryPath else return ( Left "-")
207 | _ <- system ("cat " ++ alienFastaFilePath ++ " | grep '>' | wc -l >" ++ outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries")
208 | alienFastaEntries <- readFile (outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries")
209 | let alienFastaEntriesNumber = read alienFastaEntries :: Int
210 | if verbose == Loud
211 | then do
212 | putStrLn "BenchmarkIndex:"
213 | putStrLn "RfamModelName: -"
214 | putStrLn "RfamModelId: -"
215 | putStrLn "Linkscore: -"
216 | putStrLn "rfamMaxLinkScore: -"
217 | putStrLn ("alienMaxLinkscore: " ++ either id show alienMaxLinkscore)
218 | putStrLn "rfamGatheringThreshold: -"
219 | putStrLn "alienGatheringThreshold: -"
220 | putStrLn "rfamFastaEntriesNumber: -"
221 | putStrLn ("alienFastaEntriesNumber: " ++ show alienFastaEntriesNumber)
222 | putStrLn "rfamonAlienResultsNumber: -"
223 | putStrLn "alienonRfamResultsNumber: -"
224 | putStrLn "RfamonAlienRecovery: -"
225 | putStrLn "AlienonRfamRecovery: -"
226 | print rnazString
227 | print cmStatString
228 | else
229 | putStrLn (show benchmarkIndex ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ (either id show alienMaxLinkscore) ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ show alienFastaEntriesNumber ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ rnazString ++ "\t" ++ rnacodeString ++ "\t" ++ cmStatString)
230 |
-- | Summarize the RNAz result file at @rnazPath@ as report text.
--
-- With 'Loud' verbosity the summary is a labelled multi-line listing;
-- otherwise it is one row of tab-separated values. When the file does not
-- exist, or 'readRNAz' yields a 'Left', every value is replaced by a dash
-- placeholder in the matching layout.
rnazOutput :: Verbosity -> String -> IO String
rnazOutput verbose rnazPath = do
  rnazPresent <- doesFileExist rnazPath
  if not rnazPresent
    then return unavailable
    else do
      parsedRNAz <- readRNAz rnazPath
      return (either (const unavailable) describe parsedRNAz)
  where
    -- Text for a successfully parsed RNAz result.
    describe rnaZ
      | verbose == Loud =
          concat
            [ "Mean pairwise identity: ", show (meanPairwiseIdentity rnaZ)
            , "\n Shannon entropy: ", show (shannonEntropy rnaZ)
            , "\n GC content: ", show (gcContent rnaZ)
            , "\n Mean single sequence minimum free energy: ", show (meanSingleSequenceMinimumFreeEnergy rnaZ)
            , "\n Consensus minimum free energy: ", show (consensusMinimumFreeEnergy rnaZ)
            , "\n Energy contribution: ", show (energyContribution rnaZ)
            , "\n Covariance contribution: ", show (covarianceContribution rnaZ)
            , "\n Combinations pair: ", show (combinationsPair rnaZ)
            , "\n Mean z-score: ", show (meanZScore rnaZ)
            , "\n Structure conservation index: ", show (structureConservationIndex rnaZ)
            , "\n Background model: ", backgroundModel rnaZ
            , "\n Decision model: ", decisionModel rnaZ
            , "\n SVM decision value: ", show (svmDecisionValue rnaZ)
            , "\n SVM class propability: ", show (svmRNAClassProbability rnaZ)
            , "\n Prediction: ", prediction rnaZ
            ]
      | otherwise =
          concat
            [ show (meanPairwiseIdentity rnaZ), "\t"
            , show (shannonEntropy rnaZ), "\t"
            , show (gcContent rnaZ), "\t"
            , show (meanSingleSequenceMinimumFreeEnergy rnaZ), "\t"
            , show (consensusMinimumFreeEnergy rnaZ), "\t"
            , show (energyContribution rnaZ), "\t"
            , show (covarianceContribution rnaZ), "\t"
            , show (combinationsPair rnaZ), "\t"
            , show (meanZScore rnaZ), "\t"
            , show (structureConservationIndex rnaZ), "\t"
            , show (svmDecisionValue rnaZ), "\t"
            , show (svmRNAClassProbability rnaZ), "\t"
            , prediction rnaZ
            ]
    -- Placeholder text shared by the "file missing" and "parse failed" cases.
    unavailable
      | verbose == Loud =
          concat
            [ "Mean pairwise identity: "
            , " - \n Shannon entropy: "
            , " - \n GC content: "
            , " - \n Mean single sequence minimum free energy: "
            , " - \n Consensus minimum free energy: "
            , " - \n Energy contribution: "
            , " - \n Covariance contribution: "
            , " - \n Combinations pair: "
            , " - \n Mean z-score: "
            , " - \n Structure conservation index: "
            , " - \n Background model: "
            , " - \n Decision model: "
            , " - \n SVM decision value: "
            , " - \n SVM class propability: "
            , " - \n Prediction: "
            , " - \n"
            ]
      | otherwise =
          "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-"
263 |
-- | Summarize the cmstat result file at @cmstatPath@ as report text.
--
-- 'Loud' verbosity yields one labelled value per line, any other verbosity a
-- single tab-separated row. A missing file or a 'Left' from 'readCMstat'
-- produces the same layout with dashes in place of the values.
cmStatOutput :: Verbosity -> String -> IO String
cmStatOutput verbose cmstatPath = do
  cmstatPresent <- doesFileExist cmstatPath
  if cmstatPresent
    then do
      parsedCMstat <- readCMstat cmstatPath
      case parsedCMstat of
        Right cmStat -> return (rendered cmStat)
        Left _ -> return blank
    else return blank
  where
    beLoud = verbose == Loud
    -- Text for a successfully parsed cmstat result.
    rendered cmStat =
      if beLoud
        then
          concat
            [ "statSequenceNumber: ", show (statSequenceNumber cmStat)
            , "\nstatEffectiveSequences: ", show (statEffectiveSequences cmStat)
            , "\nstatConsensusLength: ", show (statConsensusLength cmStat)
            , "\nstatW: ", show (statW cmStat)
            , "\nstatBasepairs: ", show (statBasepairs cmStat)
            , "\nstatBifurcations: ", show (statBifurcations cmStat)
            , "\nstatModel: ", statModel cmStat
            , "\nrelativeEntropyCM: ", show (relativeEntropyCM cmStat)
            , "\nrelativeEntropyHMM: ", show (relativeEntropyHMM cmStat)
            ]
        else
          concat
            [ show (statSequenceNumber cmStat), "\t"
            , show (statEffectiveSequences cmStat), "\t"
            , show (statConsensusLength cmStat), "\t"
            , show (statW cmStat), "\t"
            , show (statBasepairs cmStat), "\t"
            , show (statBifurcations cmStat), "\t"
            , statModel cmStat, "\t"
            , show (relativeEntropyCM cmStat), "\t"
            , show (relativeEntropyHMM cmStat)
            ]
    -- Placeholder text shared by the "file missing" and "parse failed" cases.
    blank =
      if beLoud
        then
          concat
            [ "statSequenceNumber: -"
            , "\nstatEffectiveSequences: -"
            , "\nstatConsensusLength: -"
            , "\nstatW: -"
            , "\nstatBasepairs: -"
            , "\nstatBifurcations: -"
            , "\nstatModel: -"
            , "\nrelativeEntropyCM: -"
            , "\nrelativeEntropyHMM: -"
            ]
        else "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-"
296 |
-- | Summarize the tabular RNAcode result at @rnaCodePath@ as report text.
--
-- The summary consists of the lowest p-value among all hits and a
-- classification that is @PROTEIN@ when that p-value is below 0.05 and
-- @OTHER@ otherwise. 'Loud' verbosity yields two labelled lines, any other
-- verbosity two tab-separated values.
--
-- Dash placeholders are returned when the file does not exist, when
-- 'RC.readRNAcodeTabular' yields a 'Left', or when the parsed result holds no
-- hits. The last case used to call 'minimum' on an empty list, which raised
-- an exception as soon as the returned string was printed.
rnaCodeOutput :: Verbosity -> String -> IO String
rnaCodeOutput verbose rnaCodePath = do
  rnacodePresent <- doesFileExist rnaCodePath
  if rnacodePresent
    then do
      inputRNACode <- RC.readRNAcodeTabular rnaCodePath
      return (either (const missing) summarize inputRNACode)
    else return missing
  where
    -- Lay out a p-value text and a classification for the chosen verbosity.
    report pvalueText classification
      | verbose == Loud = "RNAcode lowest p-value: " ++ pvalueText ++ "\nrnaCodeClassification: " ++ classification
      | otherwise = pvalueText ++ "\t" ++ classification
    -- Placeholder used whenever no p-value can be reported.
    missing = report "-" "-"
    summarize rnaCode =
      case map RC.pvalue (RC.rnacodeHits rnaCode) of
        -- No hits means there is no p-value to take the minimum of.
        [] -> missing
        pvalues ->
          let lowestPvalue = minimum pvalues
              rnaCodeClassification = if lowestPvalue < 0.05 then "PROTEIN" else "OTHER"
           in report (show lowestPvalue) rnaCodeClassification
332 |
--------------------------------------------------------------------------------
/Biobase/cmsearchToBED.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE RecordWildCards #-}
2 | {-# LANGUAGE DeriveDataTypeable #-}
3 |
4 | -- | Convert cmsearch output to Browser Extensible Data (BED) format
5 | -- Testcommand: cmsearchToBED -i /path/to/test.clustal
6 | module Main where
import Prelude
import System.Console.CmdArgs
import System.Exit (exitFailure)
import System.IO (hPutStrLn, stderr)
import Biobase.RNAlien.Library
import Data.Either.Unwrap
import qualified Data.ByteString.Char8 as B
import qualified Data.Text as T
import Data.List
14 |
-- | A complete BED track: header values followed by the feature lines.
data Bed = Bed
  { browserPostition :: T.Text -- ^ Text printed after @browser position @
  , browserSettings :: T.Text  -- ^ Printed verbatim as the second header line
  , bedName :: T.Text          -- ^ Value of the track @name@ attribute
  , bedDescription :: T.Text   -- ^ Value of the track @description@ attribute
  , bedVisibility :: Int       -- ^ Value of the track @visibility@ attribute
  , bedItemRgb :: Bool         -- ^ Printed as @itemRgb="On"@ or @itemRgb="Off"@
  , bedEntries :: [BedEntry]   -- ^ Feature lines of the track
  } deriving (Eq, Read)
24 |
-- | Renders the track as BED text: the @browser position@ line, the browser
-- settings line, the @track@ line and then every entry via its own 'show'.
instance Show Bed where
  show bed =
    concat
      [ "browser position ", T.unpack (browserPostition bed), "\n"
      , T.unpack (browserSettings bed), "\n"
      , "track name=\"", T.unpack (bedName bed), "\" "
      , "description=\"", T.unpack (bedDescription bed), "\" "
      , "visibility=", show (bedVisibility bed), " "
      , "itemRgb=\"", rgbSwitch, "\"\n"
      , concatMap show (bedEntries bed)
      ]
    where
      rgbSwitch = if bedItemRgb bed then "On" else "Off"
35 |
36 |
-- | One feature line of a BED track. The first three fields are mandatory;
-- the remaining nine are optional.
data BedEntry = BedEntry
  { chrom :: T.Text
  , chromStart :: Int
  , chromEnd :: Int
  , chromName :: Maybe T.Text
  , score :: Maybe Int
  , strand :: Maybe Char
  , thickStart :: Maybe Int
  , thickEnd :: Maybe Int
  , color :: Maybe T.Text
  , blockCount :: Maybe Int
  , blockSizes :: Maybe [Int]
  , blockStarts :: Maybe [Int]
  } deriving (Eq, Read)
51 |
-- | Renders the entry as one tab-separated line terminated by a newline.
-- Fields holding 'Nothing' become empty columns; list fields are joined with
-- commas.
instance Show BedEntry where
  show entry = intercalate "\t" columns ++ "\n"
    where
      columns =
        [ T.unpack (chrom entry)
        , show (chromStart entry)
        , show (chromEnd entry)
        , orEmpty T.unpack (chromName entry)
        , orEmpty show (score entry)
        , orEmpty (: []) (strand entry)
        , orEmpty show (thickStart entry)
        , orEmpty show (thickEnd entry)
        , orEmpty T.unpack (color entry)
        , orEmpty show (blockCount entry)
        , orEmpty commaSeparated (blockSizes entry)
        , orEmpty commaSeparated (blockStarts entry)
        ]
      -- Render a present value, or an empty column for 'Nothing'.
      orEmpty render field = maybe "" render field
      commaSeparated values = intercalate "," (map show values)
66 |
-- | Command line options of cmsearchToBED.
data Options = Options
  { cmsearchPath :: String          -- ^ Input cmsearch file
  , inputBrowserSettings :: String  -- ^ Browser settings line of the header
  , inputBedVisibility :: Int       -- ^ Track visibility
  , inputTrackName :: String        -- ^ Track name
  , inputTrackDescription :: String -- ^ Track description
  , inputItemRgb :: Bool            -- ^ Track itemRgb switch
  , inputTrackColor :: String       -- ^ Color written into each entry
  , sortBed :: Bool                 -- ^ Sort the entries before output
  , withHeader :: Bool              -- ^ Print the header before the entries
  } deriving (Show, Data, Typeable)
78 |
-- | Default values, short flag names and help texts for the command line
-- interface.
--
-- The help text of @inputItemRgb@ previously repeated the description of the
-- track color option, and the help text of @sortBed@ contained typos; both now
-- describe what the flag does. Defaults and flag names are unchanged.
options :: Options
options = Options
  { cmsearchPath = def &= name "i" &= help "Path to input cmsearch file",
    inputBrowserSettings = "browser hide all" &= name "b" &= help "Browser settings. Default: browser hide all",
    inputBedVisibility = (2 :: Int) &= name "y" &= help "Visibility setting of track. Default: 2",
    inputTrackName = "PredictedRNA" &= name "n" &= help "Name of the track. Default: PredictedRNA",
    inputTrackDescription = "RNA loci predicted by cmsearch" &= name "d" &= help "Description of the track. Default: RNA loci predicted by cmsearch",
    inputItemRgb = True &= name "r" &= help "Set the itemRgb attribute of the track to On, so entries are drawn in their RGB color. Default: True",
    inputTrackColor = "255,0,0" &= name "c" &= help "RGB Color of the track. Default: 255,0,0",
    sortBed = True &= name "s" &= help "Sort entries of Bed file by start and end coordinates. Default: True",
    withHeader = True &= name "w" &= help "Output contains bed header. Default: True"
  } &= summary "cmsearchToBED - Converts cmsearch file hits to BED file entries" &= help "Florian Eggenhofer 2016" &= verbosity
91 |
-- | Entry point: parse the cmsearch file named on the command line and print
-- it as BED, with or without the track header.
--
-- Diagnostics go to stderr so they can never end up inside redirected BED
-- output. A file that cannot be parsed ends the program with a failure exit
-- code. A file without hits only produces the diagnostic, because an empty
-- result is not a malfunction of the converter.
main :: IO ()
main = do
  Options{..} <- cmdArgs options
  parsedCmsearch <- readCMSearch cmsearchPath
  case parsedCmsearch of
    Left parseError -> do
      hPutStrLn stderr ("A problem occurred converting from cmsearch to BED format:\n " ++ show parseError)
      exitFailure
    Right cmsearch ->
      case convertcmSearchToBED cmsearch inputBrowserSettings inputTrackName inputTrackDescription inputTrackColor inputBedVisibility inputItemRgb sortBed of
        Left message -> hPutStrLn stderr message
        Right bed
          | withHeader -> print bed
          -- Without the header only the feature lines are printed.
          | otherwise -> putStr (concatMap show (bedEntries bed))
108 |
109 | --convertcmSearchToBED :: CMsearch -> String -> String -> Either String String
110 | --convertcmSearchToBED inputcmsearch trackName trackColor
111 | -- | null cmHits = Left "cmsearch file contains no hits"
112 | -- | otherwise = Right (bedHeader ++ bedEntries)
113 | -- where cmHits = cmsearchHits inputcmsearch
114 | -- bedHeader = "browser position " ++ browserPosition ++ "\nbrowser hide all\ntrack name=\"cmsearch hits\" description=\"cmsearch hits\" visibility=2 itemRgb=\"On\"\n"
115 | -- bedEntries = concatMap (cmsearchHitToBEDentry trackName trackColor) cmHits
116 | -- browserPosition = L.unpack (hitSequenceHeader firstHit) ++ ":" ++ entryStart firstHit ++ "-" ++ entryEnd firstHit
117 | -- firstHit = (head cmHits)
118 |
-- | Build a 'Bed' track from a parsed cmsearch result.
--
-- Every hit becomes one entry carrying @trackName@ and @trackColor@. The
-- entries are ordered with 'orderBedEntry' when @sortBed@ is set, and the
-- browser position is taken from the first entry of the resulting list.
-- Returns 'Left' with a message when the cmsearch result has no hits.
convertcmSearchToBED :: CMsearch -> String -> String -> String -> String -> Int -> Bool -> Bool -> Either String Bed
convertcmSearchToBED inputcmsearch inputBrowserSettings trackName trackDescription trackColor inputBedVisibility inputItemRgb sortBed =
  case orderedEntries of
    [] -> Left "cmsearch file contains no hits"
    leadingEntry : _ ->
      Right
        ( Bed
            (T.pack (positionOf leadingEntry))
            (T.pack inputBrowserSettings)
            (T.pack trackName)
            (T.pack trackDescription)
            inputBedVisibility
            inputItemRgb
            orderedEntries
        )
  where
    entries = map (cmsearchHitToBEDentry trackName trackColor) (cmsearchHits inputcmsearch)
    orderedEntries
      | sortBed = sortBy orderBedEntry entries
      | otherwise = entries
    -- "<chrom>:<start>-<end>" of the given entry.
    positionOf anEntry = T.unpack (chrom anEntry) ++ ":" ++ show (chromStart anEntry) ++ "-" ++ show (chromEnd anEntry)
130 |
-- | Convert one cmsearch hit into a single-block BED12 entry named @hitName@
-- and colored @hitColor@.
--
-- cmsearch reports hit positions as 1-based, inclusive sequence coordinates,
-- while BED uses a 0-based start and an exclusive end. The lower hit
-- coordinate is therefore decremented by one and the upper one is used as is.
-- Before this fix the start was copied unchanged, so every feature began one
-- base too late and was one base too short. The block size is the difference
-- of end and start and thus covers the whole hit.
--
-- For hits on a strand other than @+@ the hit start and end are swapped, so
-- that the BED start is taken from the hit end.
cmsearchHitToBEDentry :: String -> String -> CMsearchHit -> BedEntry
cmsearchHitToBEDentry hitName hitColor cmHit = entry
  where
    entry = BedEntry chromosome entrystart entryend (Just (T.pack hitName)) entryscore entrystrand thickstart thickend entrycolor blocks blockSize blockStart
    chromosome = T.pack (B.unpack (hitSequenceHeader cmHit))
    onPlusStrand = hitStrand cmHit == '+'
    -- 1-based inclusive bounds of the hit, lower coordinate first.
    lowerCoordinate = if onPlusStrand then hitStart cmHit else hitEnd cmHit
    upperCoordinate = if onPlusStrand then hitEnd cmHit else hitStart cmHit
    -- 0-based, half-open BED interval.
    entrystart = lowerCoordinate - 1
    entryend = upperCoordinate
    entryscore = Just (0 :: Int)
    entrystrand = Just (hitStrand cmHit)
    thickstart = Just entrystart
    thickend = Just entryend
    entrycolor = Just (T.pack hitColor)
    blocks = Just (1 :: Int)
    blockSize = Just [entryend - entrystart]
    blockStart = Just [0 :: Int]
146 |
147 |
148 | --cmsearchHitToBEDentry :: String -> String -> CMsearchHit -> String
149 | --cmsearchHitToBEDentry hitName hitColor cmHit = entryline
150 | -- where entryline = L.unpack (hitSequenceHeader cmHit) ++ "\t" ++ entryStart cmHit ++ "\t" ++ entryEnd cmHit++ "\t" ++ (hitName) ++ "\t" ++ "0" ++ "\t" ++ [(hitStrand cmHit)] ++ "\t" ++ show (hitStart cmHit) ++ "\t" ++ show (hitEnd cmHit) ++ "\t" ++ hitColor ++ "\n"
151 | --entrystart = if (hitStrand cmHit) == '+' then show (hitStart cmHit) else show (hitEnd cmHit)
152 | --entryend = if (hitStrand cmHit) == '+' then show (hitEnd cmHit) else show (hitStart cmHit)
153 |
-- | Textual start coordinate of a hit: the hit start on the @+@ strand,
-- the hit end otherwise.
entryStart :: CMsearchHit -> String
entryStart cmHit = show (if hitStrand cmHit == '+' then hitStart cmHit else hitEnd cmHit)
158 |
-- | Textual end coordinate of a hit: the hit end on the @+@ strand,
-- the hit start otherwise.
entryEnd :: CMsearchHit -> String
entryEnd cmHit = show (if hitStrand cmHit == '+' then hitEnd cmHit else hitStart cmHit)
163 |
-- | Order two entries by 'chromStart'; entries with equal starts are ordered
-- by 'orderBedEntryEnd'.
orderBedEntry :: BedEntry -> BedEntry -> Ordering
orderBedEntry firstHit secondHit =
  case compare (chromStart firstHit) (chromStart secondHit) of
    EQ -> orderBedEntryEnd firstHit secondHit
    startOrdering -> startOrdering
171 |
-- | Order two entries by 'chromEnd' alone.
orderBedEntryEnd :: BedEntry -> BedEntry -> Ordering
orderBedEntryEnd firstHit secondHit = compare (chromEnd firstHit) (chromEnd secondHit)
179 |
--------------------------------------------------------------------------------
/ChangeLog.md:
--------------------------------------------------------------------------------
1 | -*-change-log-*-
2 |
3 | ### 1.8.5 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 6. June 2021
4 |
5 | * Compatibility with ghc 9
6 | * Testing with github actions
7 |
8 | ### 1.8.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 3. January 2020
9 |
10 | * Construction start from input alignment for Scan and Alien
11 | * Alien is working fully offline, by using offline taxonomy database
12 | * Improved collection of near identical hits
13 | * RNAlien now uses parallelization
14 | * Fixes for speed regression in taxid positive set computation
15 |
16 | ### 1.7.1 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 12. September 2019
17 |
18 | * Fixed Scan tool global search step
19 |
20 | ### 1.7.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 29. August 2019
21 |
22 | * Added Scan tool
23 | * Changed tracing high similarity candidates
24 | * Fixed regression in parsing input fasta
25 |
26 | ### 1.6.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 19. June 2019
27 |
28 | * Added offline mode for blast calls and sequence retrieval
29 | * Changed to Biobase repository layout
30 | * Added statically linked executables to releases
31 |
32 | ### 1.5.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 4. March 2019
33 |
34 | * Enabled initialization from multi-line fasta
35 |
36 | ### 1.4.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 9. December 2018
37 |
38 | * Switched to Biobase libraries
39 | * RNAlien is now using json based blast requests
40 |
41 | ### 1.3.8 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 3. April 2019
42 |
43 | * Fix for outdated ca-certificates
44 |
45 | ### 1.3.7 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 13. March 2017
46 |
47 | * Removed optimization flags that prevent hackage upload
48 |
49 | ### 1.3.6 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 5. March 2017
50 |
51 | * SelectSequences moved to own repository, removed tool from package
52 | * Clustal result file is now also written without evaluation step
53 |
54 | ### 1.3.5 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 5. March 2017
55 |
56 | * Added a commandline switch to check setup and network connection, improved tempdir handling
57 |
58 | ### 1.3.4 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 2. March 2017
59 |
60 | * More changes toward bioconda compatibility, changed compiler optimization flag to -O
61 |
62 | ### 1.3.3 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 1. March 2017
63 |
64 | * Further changes to stack.yaml
65 |
66 | ### 1.3.2 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 8. February 2017
67 |
68 | * Minor fix to stack.yaml for bioconda recipe
69 |
70 | ### 1.3.1 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 6. February 2017
71 |
72 | * Updated version constraints for ClustalParser supporting multi-line consensus secondary structure
73 |
74 | ### 1.3.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 20. January 2017
75 |
76 | * Included bugfix from ViennaRNAparser concerning RNAalifold systemcall
77 |
78 | ### 1.2.9 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 8. January 2017
79 |
80 | * Dropped dependency on rnazSelectSequences.pl for evaluation step
81 | * Select sequences can now print a similarity matrix
82 | * Internal sequence selection is substantially faster due to text-metrics
83 |
84 | ### 1.2.8 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 1. January 2017
85 |
86 | * Added a commandline switch to turn the evaluation step on and off
87 |
88 | ### 1.2.7 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 13. November 2016
89 |
90 | * Fixed a bug in initial connection check with HTTPS
91 |
92 | ### 1.2.6 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 12. November 2016
93 |
94 | * Changed NCBI URL to HTTPS and updated library constraints
95 |
96 | ### 1.2.5 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 26. October 2016
97 |
98 | * Updated stack.yaml
99 |
100 | ### 1.2.4 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 24. October 2016
101 |
102 | * Support for GHC-8.0.1
103 |
104 | ### 1.2.3 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 21. October 2016
105 |
106 | * Added cmsearch output to BED12 converter for genome browser integration
107 | * Updated dependency versions and version number output
108 |
109 | ### 1.2.2 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 1. June 2016
110 |
111 | * Fixed a bug building RNAcentral query and improved formatting of
112 | corresponding output
113 |
114 | ### 1.2.1 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 30. May 2016
115 |
116 | * Added RNAcentralRequest utility
117 | * Fixed a bug in parsing RNAcentral response headers
118 |
119 | ### 1.2.0 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 30. May 2016
120 |
121 | * Added cmsearchToBED utility
122 |
123 | ### 1.1.3 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 25. April 2016
124 |
125 | * Fixed wrong description for softmasking commandline switch
126 | * Fixed encoding tabular iteration progress output
127 |
128 | ### 1.1.2 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 18. April 2016
129 |
130 | * Fixed a bug in passing softmasking to blast
131 | * Performance improvements in query selection
132 |
133 | ### 1.1.1 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 23. March 2016
134 |
135 | * Added a commandline switch for softmasking
136 | * Improved interface with Alienserver
137 |
138 | ### 1.1.0 [Florian Eggenhofer](mailto:florian.eggenhofer@univie.ac.at) 11. February 2016
139 |
140 | * Update including changes from 1st review
141 | * Cmbuild uses --refine option
142 | * Evaluation now includes RNAcode result, which is a new dependency
143 | * RNAcentral lookup for found sequences via REST interface during evaluation
144 | * Added a new alternative query selection method that filters for entries max. pairwise identity
145 | * Added softmasking to blastrequests
146 | * Paralog sequences are now included by default
147 | * Installation of RNAlien is now available via stackage
148 | * Fix several bugs including blasthit coverage filter
149 | * RNAlienStatistics can now parse cmsearch results from multiple cm files as for clans
150 | * RNAlienStatistics includes a switch for using bitscore or evalue cutoffs
151 |
152 | ### 1.0.0 [Florian Eggenhofer](mailto:florian.eggenhofer@univie.ac.at) 29. October 2015
153 |
154 | * Initial version
155 |
--------------------------------------------------------------------------------
/Dockerfile.dev:
--------------------------------------------------------------------------------
# Development image: builds the package executables on Alpine with
# --enable-executable-static, copies them to the image root and then removes
# the source tree and the build toolchain.
1 | FROM alpine:edge
2 |
# Install GHC, cabal, musl and zlib (including the static zlib library).
3 | RUN apk update
4 | RUN apk add --no-cache musl musl-dev musl-utils musl-dbg ghc ghc-dev ghc-doc cabal zlib-dev zlib zlib-static tar gzip wget
5 |
# Copy the build context into the image and build inside that directory.
6 | ADD . source
7 | WORKDIR source
8 | RUN cabal new-update && cabal new-build --enable-executable-static
# Copy each built executable from dist-newstyle to the image root.
9 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAlien/build/RNAlien/RNAlien /RNAlien
10 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAlienScan/build/RNAlienScan/RNAlienScan /RNAlienScan
11 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAlienStatistics/build/RNAlienStatistics/RNAlienStatistics /RNAlienStatistics
# NOTE(review): this path spells the executable "cmsearchToBed" while the
# source file is Biobase/cmsearchToBED.hs - confirm the name in RNAlien.cabal.
12 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/cmsearchToBed/build/cmsearchToBed/cmsearchToBed /cmsearchToBed
13 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAcentralHTTPRequest/build/RNAcentralHTTPRequest/RNAcentralHTTPRequest /RNAcentralHTTPRequest
# Clean up build products, the source tree and the packages installed above.
14 | RUN cabal new-clean
15 | RUN rm -r /source
16 | RUN apk del musl musl-dev musl-utils musl-dbg ghc ghc-dev ghc-doc cabal zlib-static zlib-dev zlib tar gzip wget
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
654 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | .
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | .
675 |
676 |
--------------------------------------------------------------------------------
/ParserTest.hs:
--------------------------------------------------------------------------------
1 | -- | Parser test script
2 | -- read from file and directly print parsing output
3 | -- runghc -package-db=.cabal-sandbox/x86_64-linux-ghc-7.8.3-packages.conf.d/ ParserTest.hs test.cmstat
4 | module Main where
5 |
6 | import System.Environment (getArgs)
7 | import System.Console.CmdArgs
8 | import System.Directory
9 | import Bio.Sequence.Fasta
10 | import Bio.RNAlienData
11 | import Bio.RNAlienLibrary
12 | import Data.Maybe
13 | import Data.Time
14 | import Data.Either.Unwrap
15 |
-- | Entry point of the parser test script.
--
--   Takes the path of a cmstat file as first command line argument, runs
--   'readCMstat' on it and prints the outcome. Any further arguments are
--   ignored.
--
--   Without an argument a usage line is printed instead of crashing on
--   @head []@. A failed parse is printed as well instead of crashing in
--   @fromRight@.
--   NOTE(review): assumes the Left (error) type returned by 'readCMstat' has a
--   Show instance (e.g. Parsec's ParseError) - confirm against the library.
main :: IO ()
main = do
  args <- getArgs
  case args of
    [] -> putStrLn "Usage: ParserTest <cmstat file>"
    (input_file:_) -> do
      parseresult <- readCMstat input_file
      -- Show the parse error or the parsed value, whichever we got.
      either print print parseresult
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | =========
3 | RNAlien is a tool for automatic construction of RNAfamily models from a single sequence.
4 |
5 | It is available as a commandline tool, for testing or construction of few sequences the webservice can be used.
6 |
7 | The source code of RNAlien is open source and available via GitHub and Hackage (License GPL-3):
8 |
9 | * [](https://github.com/eggzilla/RNAlien) [](https://travis-ci.org/eggzilla/RNAlien) [](https://hackage.haskell.org/package/RNAlien) [](https://anaconda.org/bioconda/rnalien) [](https://hub.docker.com/repository/docker/eggzilla/rnalien) 
10 |
11 |
12 | ### Installation via bioconda - recommended
13 |
14 | RNAlien can be installed with all tool dependencies via [conda](https://conda.io/docs/install/quick.html). Once you have conda installed simply type:
15 |
16 | conda create -n rnalien185 -c conda-forge -c bioconda rnalien=1.8.5
17 |
18 | Activate the environment in which RNAlien was installed to use it:
19 |
20 | conda activate rnalien185
21 |
22 | To use the offline mode of the commandline tool, the following database downloads are additionally required:
23 |
24 | * Download [NCBI Taxonomy Dump](ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz)
25 | ```bash
26 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz
27 | tar -xzf new_taxdump.tar.gz
28 | ```
29 |
30 | * Download [NCBI Blast version 5 database](https://ftp.ncbi.nlm.nih.gov/blast/db/v5)
31 | ```bash
32 | #After installing and activating the RNAlien bioconda environment use update_blastdb.pl
33 | #Show all available databases
34 | update_blastdb.pl --blastdb_version 5 --showall
35 | #Download the nt_v5 database (~about 70 GB in size)
36 | update_blastdb.pl --blastdb_version 5 nt_v5 --decompress
37 | ```
38 |
39 | ### Usage
40 |
41 | After installing with bioconda, activating the environment and downloading the database files, using the offline mode of the command line tool is recommended.
42 | Following are example calls for the files contained in the test directory of the repository.
43 | Using -c 4 and +RTS -N4 provides 4 cpu threads to the used tool dependencies (e.g. blast,..) and to RNAlien.
44 | * Single fasta input:
45 | ```bash
46 | RNAlien -i single.fa -c 4 -j -b /pathto/blast5db/nt_v5 -d single -w /pathto/new_taxdump/taxidlineage.dmp +RTS -N4
47 | ```
48 |
49 | * Multi fasta input:
50 |
51 | ```bash
52 | RNAlien -i testmulti.fa -c 4 -j -b /pathto/blast5db/nt_v5 -d single -w /pathto/new_taxdump/taxidlineage.dmp +RTS -N4
53 | ```
54 |
55 | * Stockholm alignment (with consensus structure) input
56 | ```bash
57 | RNAlien -p test.stockholm -c 4 -j -b /pathto/blast5db/nt_v5 -d aln -w /pathto/new_taxdump/taxidlineage.dmp +RTS -N4
58 | ```
59 |
60 | If you just want to try RNAlien out, or to construct a single family, the online mode can be used.
61 | It does not require database downloads and queries the required information from NCBI webservices.
62 | A stable, uninterrupted internet connection is mandatory.
63 |
64 | * Single fasta input (online-mode):
65 | ```bash
66 | RNAlien -i single.fa -c 4 -d onsingle +RTS -N4
67 | ```
68 | To display the possible commandline options run:
69 |
70 | ```bash
71 | RNAlien --help
72 | ```
73 | For detailed instruction how to use RNAlien please see the [Help page.](http://rna.tbi.univie.ac.at/rnalien/help)
74 |
75 | TaxonomyTools which can be used to visualise the organisms included in a RNAlien result can be found here (License GPL-3):
76 |
77 | * [](https://github.com/eggzilla/TaxonomyTools) [](https://travis-ci.org/eggzilla/TaxonomyTools) [](https://hackage.haskell.org/package/RNAlien)
78 |
79 |
--------------------------------------------------------------------------------
/RNAlien.cabal:
--------------------------------------------------------------------------------
1 | name: RNAlien
2 | version: 1.8.5
3 | synopsis: Unsupervized construction of RNA family models
4 | description: RNAlien is a tool for automatic construction of RNAfamily models from a single sequence.
5 | .
6 | It is available as a commandline tool, for testing or construction of few sequences the webservice can be used.
7 | .
8 | The source code of RNAlien, as well as the webserver is open source and available via GitHub (License GPL-3):
9 | .
10 | *
11 | .
12 | *
13 | .
14 | TaxonomyTools which can be used to visualise the organisms included in a RNAlien result can be found here (License GPL-3):
15 | .
16 | *
17 | .
18 | *
19 | .
20 | For instructions on how to use RNAlien please see the <http://rna.tbi.univie.ac.at/rnalien/help Help page>.
21 | .
22 | Dependencies:
23 | .
24 | *
25 | .
26 | *
27 | .
28 | *
29 | .
30 | *
31 | .
32 | *
33 | .
34 | Installation via cabal-install:
35 | .
36 | > cabal install RNAlien
37 |
38 | license: GPL-3
39 | license-file: LICENSE
40 | author: Florian Eggenhofer
41 | maintainer: egg@informatik.uni-freiburg.de
42 | copyright: Florian Eggenhofer
43 | category: Bioinformatics
44 | build-type: Simple
45 | cabal-version: >= 1.10.0
46 | tested-with: GHC == 8.8, GHC == 8.10, GHC == 9.0
47 | Extra-Source-Files:
48 | README.md ChangeLog.md
49 |
50 | source-repository head
51 | type: git
52 | location: https://github.com/eggzilla/RNAlien
53 |
54 | source-repository this
55 | type: git
56 | location: https://github.com/eggzilla/RNAlien/tree/1.8.5
57 | tag: 1.8.5
58 |
59 | executable RNAlien
60 | Hs-Source-Dirs: ./Biobase/
61 | main-is: RNAlien.hs
62 | ghc-options: -Wall
63 | default-language: Haskell2010
64 | other-modules: Paths_RNAlien
65 | build-depends: base >=4.5 && <5, cmdargs, directory,
66 | random, containers, RNAlien, time, either-unwrap, filepath,
67 | BiobaseFasta == 0.4.0.*, StockholmAlignment
68 |
69 | executable RNAlienScan
70 | Hs-Source-Dirs: ./Biobase/
71 | main-is: RNAlienScan.hs
72 | ghc-options: -Wall
73 | default-language: Haskell2010
74 | other-modules: Paths_RNAlien
75 | build-depends: base >=4.5 && <5, cmdargs, directory,
76 | random, containers, RNAlien, time, either-unwrap, filepath,
77 | BiobaseFasta == 0.4.0.*, StockholmAlignment
78 |
79 | executable RNAlienStatistics
80 | Hs-Source-Dirs: ./Biobase/
81 | main-is: RNAlienStatistics.hs
82 | ghc-options: -Wall
83 | default-language: Haskell2010
84 | other-modules: Paths_RNAlien
85 | build-depends: base >=4.5 && <5, cmdargs, cassava, vector, process, bytestring,
86 | either-unwrap, RNAlien, directory, split, filepath, ViennaRNAParser>=1.3.2,
87 | BiobaseFasta == 0.4.0.*, BiobaseTypes == 0.2.1.*
88 |
89 | executable cmsearchToBed
90 | Hs-Source-Dirs: ./Biobase/
91 | main-is: cmsearchToBED.hs
92 | ghc-options: -Wall
93 | default-language: Haskell2010
94 | other-modules: Paths_RNAlien
95 | build-depends: base >=4.5 && <5, cmdargs, either-unwrap, RNAlien, bytestring, text
96 |
97 | executable RNAcentralHTTPRequest
98 | Hs-Source-Dirs: ./Biobase/
99 | main-is: RNAcentralHTTPRequest.hs
100 | ghc-options: -Wall
101 | default-language: Haskell2010
102 | other-modules: Paths_RNAlien
103 | build-depends: base >=4.5 && <5, cmdargs, either-unwrap, RNAlien
104 |
105 | Library
106 | Hs-Source-Dirs: .
107 | ghc-options: -Wall -fno-warn-unused-do-bind -fsimpl-tick-factor=500
108 | default-language: Haskell2010
109 | build-depends: base >=4.5 && <5, cmdargs, ViennaRNAParser>=1.3.2, process, directory,
110 | parsec, random, bytestring, Taxonomy >= 2.1.0, either-unwrap, containers,
111 | ClustalParser>=1.3.0, vector, edit-distance, cassava, matrix, hierarchical-clustering,
112 | filepath, HTTP, http-conduit, hxt, network<=2.8.0.0, aeson<=1.6.0.0, text, transformers,
113 | pureMD5, http-types, text-metrics, BiobaseTypes == 0.2.1.*, BiobaseFasta == 0.4.0.*,
114 | BiobaseBlast == 0.3.3.*, BlastHTTP >= 1.4.2, BiobaseHTTP == 1.2.0, silently, StockholmAlignment>=1.3.0, BiobaseEnsembl>=0.2.0.0, parallel, attoparsec
115 | Exposed-Modules: Biobase.RNAlien.Types
116 | Biobase.RNAlien.Library
117 | Biobase.RNAlien.RNAcentralHTTP
118 | Biobase.RNAlien.InfernalParser
119 | Biobase.RNAlien.CMstatParser
120 |
--------------------------------------------------------------------------------
/RNAlien.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
108 |
--------------------------------------------------------------------------------
/RNAlienScan.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
153 |
--------------------------------------------------------------------------------
/cabal.project:
--------------------------------------------------------------------------------
1 | packages: .
2 |
3 | optional-packages: ./*/*.cabal
4 |
5 |
--------------------------------------------------------------------------------
/default.nix:
--------------------------------------------------------------------------------
# Nix derivation for the RNAlien package (cabal2nix-style layout).
#
# The version and the dependency lists mirror RNAlien.cabal: the library
# stanza feeds libraryHaskellDepends, the union of all executable stanzas
# feeds executableHaskellDepends. Update this file together with the cabal
# file, otherwise the Nix build uses a stale version or misses dependencies.
{ mkDerivation, aeson, attoparsec, base, BiobaseBlast, BiobaseEnsembl
, BiobaseFasta, BiobaseHTTP, BiobaseTypes, BlastHTTP, bytestring
, cassava, ClustalParser, cmdargs, containers, directory
, edit-distance, either-unwrap, filepath, hierarchical-clustering
, HTTP, http-conduit, http-types, hxt, matrix, network, parallel
, parsec, process, pureMD5, random, silently, split, stdenv
, StockholmAlignment, Taxonomy, text, text-metrics, time
, transformers, vector, ViennaRNAParser
}:
mkDerivation {
  pname = "RNAlien";
  # Keep in sync with the `version:` field of RNAlien.cabal.
  version = "1.8.5";
  src = ./.;
  isLibrary = true;
  isExecutable = true;
  libraryHaskellDepends = [
    aeson attoparsec base BiobaseBlast BiobaseEnsembl BiobaseFasta
    BiobaseHTTP BiobaseTypes BlastHTTP bytestring cassava ClustalParser
    cmdargs containers directory edit-distance either-unwrap filepath
    hierarchical-clustering HTTP http-conduit http-types hxt matrix
    network parallel parsec process pureMD5 random silently
    StockholmAlignment Taxonomy text text-metrics transformers vector
    ViennaRNAParser
  ];
  executableHaskellDepends = [
    base BiobaseFasta BiobaseTypes bytestring cassava cmdargs containers
    directory either-unwrap filepath process random split
    StockholmAlignment text time vector ViennaRNAParser
  ];
  description = "Unsupervized construction of RNA family models";
  license = stdenv.lib.licenses.gpl3;
}
30 |
--------------------------------------------------------------------------------
/envhs.nix:
--------------------------------------------------------------------------------
# Development environment for this checkout.
# Brings the attributes of the <nixpkgs> package set into scope and applies
# hsDevFunctions to the repository root.
# NOTE(review): hsDevFunctions is not defined in this file; presumably it is
# supplied by a local nixpkgs overlay - confirm before relying on it.
with (import <nixpkgs> {});
hsDevFunctions ./.
3 |
--------------------------------------------------------------------------------
/manual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eggzilla/RNAlien/94937c39ad20bbe7150cc865434c9feadd317090/manual.pdf
--------------------------------------------------------------------------------
/overrides.nix:
--------------------------------------------------------------------------------
# Package override set: pins `semirings` to the 0.3.1.1 release tarball from
# Hackage. The argument (`self`) is accepted but not used in this file.
self:

let
  # Fetched over HTTPS; the sha256 pins the exact tarball contents, so the
  # change of scheme does not alter what is built.
  semiringsSrc = builtins.fetchTarball {
    url = "https://hackage.haskell.org/package/semirings-0.3.1.1/semirings-0.3.1.1.tar.gz";
    sha256 = "1wi4g4xk3vjqig2mrgdc09ygwcdirlpky00xikak8cndkydcm2za";
  };
in

{ semirings = semiringsSrc;
}
12 |
--------------------------------------------------------------------------------
/scripts/AlienBenchmarkCMCompare.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#Alien Benchmark
# Grid Engine array job: runs cmComparevsRfam.pl for one benchmark family per
# task. A family is only processed once its RNAlien run has left a "done"
# marker file in the collected results directory.
# The shebang has to be the very first line of the script to take effect.
#$ -t 1-56 #This will start the job for each sRNA Rfam family
#$ -l mem_free=10G
#$ -j yes
#$ -o /scratch/egg/temp/
#$ -e /scratch/egg/temp/
#$ -l hostname="tc00|tc01|tc02|tc03|tc04"
#$ -N area54
#alienrun
if [ -f "/scr/kronos/egg/AlienStructuredResultsCollected4/${SGE_TASK_ID}/done" ]; then
    cmComparevsRfam.pl "$SGE_TASK_ID"
    sleep 1
else
    # Only report a missing marker when it really is missing.
    echo "File not found!"
fi
16 |
--------------------------------------------------------------------------------
/scripts/AlienBenchmarkModels.sh:
--------------------------------------------------------------------------------
1 | #Alien Benchmark
2 | #!/bin/bash
3 | #$ -t 1-373 #This will start the job for each sRNA Rfam family
4 | #$ -pe para 7
5 | #$ -l mem_free=34.9G
6 | #$ -j yes
7 | #$ -o /scratch/egg/temp/
8 | #$ -e /scratch/egg/temp/
9 | #$ -l hostname="xc00|xc01|xc02|xc03|xc04|xc05|xc06|xc07|xc08"
10 | #$ -N area54
11 | #alienrun
12 | if [ ! -f /scratch/egg/AlienResultsCollected/$SGE_TASK_ID/done ]; then
13 | /home/mescalin/egg/current/Projects/Haskell/RNAlien/dist/build/RNAlien/RNAlien -i /scr/kronos/egg/AliensRNATestSet/$SGE_TASK_ID.fa -c 7 -t "$( /scratch/egg/temp/$SGE_TASK_ID.alienout
14 | sleep 1
15 | echo "File not found!"
16 | fi
17 |
--------------------------------------------------------------------------------
/scripts/alienresultstatistics.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #./alienresultstatistics structured 11 bitscore
3 | use warnings;
4 | use strict;
5 | use diagnostics;
6 | #use utf8;
7 | use Data::Dumper;
8 | use Cwd;
9 | use Switch;
# Unbuffered STDOUT so progress messages appear immediately.
$|=1;

# Driver section: selects the benchmark configuration from the command line,
# loads the family list and calls alienresultstatistic() once per threshold.
# Example: ./alienresultstatistics structured 11 bitscore
if (@ARGV < 3) {
    die "Usage: $0 <benchmark type> <result iteration> <threshold selection (bitscore|evalue)>\n";
}
#decides which benchmark data to process
my $type = $ARGV[0];
#result iteration
my $currentresultnumber = $ARGV[1];
#threshold selection (bitscore, evalue)
my $threshold_selection = $ARGV[2];
#use clans for specificity check
my $use_clans = 1;
#Sequences to use (seed,full)
my $use_sequences="seed";

# Per-benchmark settings:
#   alienresult - directory containing one numbered RNAlien result folder per family
#   models      - directory with the Rfam models (<RfamID>.cm)
#   fasta_full  - directory with full alignment sequences (<RfamID>.fa)
#   fasta_seed  - directory with seed alignment sequences (<RfamID>.fa)
#   family_ids  - file listing the families, one per line
#   families    - number of families in the benchmark
#   tempdir     - scratch directory handed to RNAlienStatistics
#   prefix      - prefix of the output directory name
my %settings_for = (
    background => {
        alienresult => "/scr/coridan/egg/AlienBackgroundCollected" . "$currentresultnumber" . "/",
        models      => "/scr/coridan/egg/AlienTest/sRNAFamilies/all_models/",
        fasta_full  => "/scr/coridan/egg/rfamfamilyfasta/",
        fasta_seed  => "/scr/coridan/egg/rfamfamilyseedfasta/",
        family_ids  => "/scr/coridan/egg/backgroundFamilyNameIdGatheringCutoffSorted",
        families    => 712,
        tempdir     => "/scr/coridan/egg/temp/AlienRandomResultStatistics" . "$currentresultnumber" . "/",
        prefix      => "structuredalienbackgroundoutput",
    },
    structured => {
        alienresult => "/scr/kronos/egg/AlienStructuredResultsCollected" . "$currentresultnumber" . "/",
        models      => "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/",
        fasta_full  => "/scr/kronos/egg/rfamfamilyfasta/",
        fasta_seed  => "/scr/kronos/egg/rfamfamilyseedfasta/",
        family_ids  => "/scr/kronos/egg/structuredFamilyNameIdGatheringCutoffSorted",
        #72 was the old number, it included the first mini background set
        families    => 56,
        tempdir     => "/scr/kronos/egg/temp/AlienStructuredResultStatistics" . "$currentresultnumber" . "/",
        prefix      => "structuredalien" . $use_sequences . "output",
    },
    diverse => {
        alienresult => "/scr/kronos/egg/AlienDiverseResultsCollected" . "$currentresultnumber" . "/",
        models      => "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/",
        fasta_full  => "/scr/kronos/egg/rfamfamilyfasta/",
        fasta_seed  => "/scr/kronos/egg/rfamfamilyseedfasta2/",
        #alternative list: /scr/kronos/egg/diverse_families/result_diverse_families
        family_ids  => "/scr/kronos/egg/diverse_families/test2",
        families    => 191,
        tempdir     => "/scr/kronos/egg/temp/AlienDiverseResultStatistics" . "$currentresultnumber" . "/",
        prefix      => "diversealien" . $use_sequences . "output",
    },
    blast => {
        alienresult => "/scr/coridan/egg/blastout/",
        models      => "/scr/coridan/egg/AlienTest/sRNAFamilies/all_models/",
        fasta_full  => "/scr/coridan/egg/rfamfamilyfasta/",
        fasta_seed  => "/scr/coridan/egg/rfamfamilyseedfasta/",
        family_ids  => "/scr/coridan/egg/structuredFamilyNameIdGatheringCutoffSorted",
        families    => 56,
        tempdir     => "/scr/coridan/egg/temp/AlienBlastResultStatistics/",
        prefix      => "blastalien" . $use_sequences . "output",
    },
    nhmmer => {
        alienresult => "/scr/coridan/egg/nhmmerout/",
        models      => "/scr/coridan/egg/AlienTest/sRNAFamilies/all_models/",
        fasta_full  => "/scr/coridan/egg/rfamfamilyfasta/",
        fasta_seed  => "/scr/coridan/egg/rfamfamilyseedfasta/",
        family_ids  => "/scr/coridan/egg/structuredFamilyNameIdGatheringCutoffSorted",
        families    => 56,
        tempdir     => "/scr/coridan/egg/temp/AlienHmmerResultStatistics/",
        prefix      => "hmmer" . $use_sequences . "output",
    },
    sRNA => {
        alienresult => "/scr/kronos/egg/AlienResultsCollected" . "$currentresultnumber" . "/",
        models      => "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/",
        fasta_full  => "/scr/kronos/egg/rfamfamilyfasta/",
        fasta_seed  => "/scr/kronos/egg/rfamfamilyseedfasta/",
        family_ids  => "/scr/kronos/egg/smallRNAtaggedfamiliesNameIDThresholdTagSorted.csv",
        families    => 374,
        tempdir     => "/scr/kronos/egg/temp/AlienResultStatistics" . "$currentresultnumber" . "/",
        prefix      => "alienseedoutput",
    },
);
# Any type that is not listed explicitly is handled as the sRNA benchmark.
my $settings = exists $settings_for{$type} ? $settings_for{$type} : $settings_for{"sRNA"};

#contains all RNAlien result folders for sRNA tagged families
my $alienresult_basename = $settings->{alienresult};
#contains all Rfam Families names by family name with extension .cm
my $rfammodel_basename = $settings->{models};
#contains all full or seed alignment sequences as RfamID .fa fasta files
my $rfamfasta_basename = ($use_sequences eq "full") ? $settings->{fasta_full} : $settings->{fasta_seed};
my $RNAFamilyIdFile = $settings->{family_ids};
my $familyNumber = $settings->{families};
my $resulttempdir = $settings->{tempdir};
my $resultfileprefix = $settings->{prefix};
my $cpu_cores = 30;

# Family list, one entry per line; read by alienresultstatistic() as a
# file-level variable, so the name must stay @RNAfamilies.
my @RNAfamilies;
open(my $RNAfamilyfh, "<", $RNAFamilyIdFile)
    or die "Failed to open file $RNAFamilyIdFile: $!\n";
while(<$RNAfamilyfh>) {
    chomp;
    push @RNAfamilies, $_;
}
close $RNAfamilyfh;

# An already existing temp directory is reused as it is (it is not cleaned).
unless (-d $resulttempdir){
    mkdir $resulttempdir or die "Cannot create result tempdir: $!";
}
my $output_directory_path = "/scr/coridan/egg/$resultfileprefix$currentresultnumber/";
unless (-d $output_directory_path){
    mkdir $output_directory_path or die "Cannot create output dir: $!";
}

my $gathering_score_multiplier = 1.0;
# Intentionally left undefined: no lower bound is applied to the gathering score.
my $gathering_score_lower_bound;
if ($threshold_selection eq "bitscore"){
    # The e-value argument is only a placeholder in bitscore mode.
    alienresultstatistic($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,$gathering_score_multiplier,$gathering_score_lower_bound,"$output_directory_path" . "bs-" . $gathering_score_multiplier . ".tsv",$cpu_cores,$threshold_selection,"evalue threshold",$use_clans,$type);
}else{
    # One result table per e-value cutoff.
    my @evalues = qw(1 1e-3 1e-6 1e-9);
    foreach my $evalue (@evalues){
        my $outputfilePath = "$output_directory_path" . "ev-" . $evalue . ".tsv";
        alienresultstatistic($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,$gathering_score_multiplier,$gathering_score_lower_bound,$outputfilePath,$cpu_cores,$threshold_selection,$evalue,$use_clans,$type);
    }
}
146 |
# Runs RNAlienStatistics for every family of a benchmark and writes the
# collected rows as one tab separated table.
#
# Positional arguments:
#   $familyNumber                 number of families; result folders are numbered 1..$familyNumber
#   $alienresult_basename         directory containing the numbered RNAlien result folders
#   $rfammodel_basename           directory with the Rfam models (<RfamID>.cm)
#   $rfamfasta_basename           directory with the Rfam fasta files (<RfamID>.fa)
#   $RNAFamilyIdFile              path of the family list (accepted, but the list itself is
#                                 read from the file-level @RNAfamilies)
#   $resulttempdir                scratch directory passed to RNAlienStatistics via -o
#   $gathering_score_multiplier   factor applied to the Rfam gathering threshold
#   $gathering_score_lower_bound  optional minimum for the threshold (undef = none)
#   $outputfilePath               path of the table to write
#   $cpu_cores                    passed to RNAlienStatistics via -c
#   $thresholdSelection           "bitscore", anything else selects e-value mode
#   $evalueThreshold              threshold used in e-value mode
#   $use_clans                    accepted, but currently overridden inside the loop
#   $type                         benchmark type, forwarded to setdatabasesize()
#
# Returns 1. Dies if the output file cannot be opened.
sub alienresultstatistic{
    my $familyNumber = shift;
    my $alienresult_basename = shift;
    my $rfammodel_basename = shift;
    my $rfamfasta_basename = shift;
    my $RNAFamilyIdFile = shift;
    my $resulttempdir = shift;
    my $gathering_score_multiplier = shift;
    my $gathering_score_lower_bound = shift;
    my $outputfilePath = shift;
    my $cpu_cores = shift;
    my $thresholdSelection = shift;
    my $evalueThreshold = shift;
    my $use_clans = shift;
    my $type = shift;

    # Column names of the result table. The header line and the placeholder
    # rows for missing results are both derived from this list, so they
    # always have the same number of fields.
    my @columns = qw(
        Index RfamName RfamId Linkscore rfamMaxLS alienMaxLS
        rfamGatheringThreshold alienGatheringThreshold
        rfamFastaNumber alienFastaNumber rfamonAlienNumber alienonRfamNumber
        RfamonAlienRecovery AlienonRfamRecovery
        meanPairwiseIdentity shannonEntropy gcContent
        meanSingleSequenceMFE consensusMFE energyContribution
        covarianceContribution combinationsPair meanZScore SCI
        svmDecisionValue svmRNAClassProbability prediction
        Rclowestpv Rcclass
        statSequenceNumber statEffectiveSequences statConsensusLength
        statW statBasepairs statBifurcations statModel
        relativeEntropyCM relativeEntropyHMM
    );
    my $output = join("\t", @columns) . "\n";

    # Clan membership (Rfam id -> clan). Loading the table from
    # /scr/kronos/egg/clans/family_clan is currently disabled, so the hash
    # stays empty.
    my %clan_members;

    for(my $counter=1; $counter <= $familyNumber; $counter++){
        my $current_alienresult_folder = $alienresult_basename.$counter."/";
        my $alienModelPath = $current_alienresult_folder."result.cm";
        if(-e $alienModelPath){
            my $alienFastaPath = $current_alienresult_folder."result.fa";
            my $alienRNAzPath = $current_alienresult_folder."result.rnaz";
            my $alienRNAcodePath = $current_alienresult_folder."result.rnacode";
            my $aliencmstatPath = $current_alienresult_folder."result.cmstat";
            #retrieve family specific information: name, id, gathering threshold
            my @rfamModelNameId = split(/\s+/,$RNAfamilies[($counter - 1)]);
            my $rfamModelName = $rfamModelNameId[0];
            my $rfamModelId = $rfamModelNameId[1];
            my $rfamModelPath;
            # Clan models are deliberately switched off here, whatever the
            # caller passed in. Set this to $use_clans to honour the argument.
            my $clans_enabled = 0;
            if($clans_enabled == 1 && exists $clan_members{$rfamModelId}){
                $rfamModelPath = "/scr/kronos/egg/clans/clan_models/". "$clan_members{$rfamModelId}". ".cm";
                print "For $rfamModelId, set path to: $rfamModelPath\n";
            }else{
                $rfamModelPath = $rfammodel_basename . $rfamModelId . ".cm";
            }
            my $rfamFastaPath = $rfamfasta_basename . $rfamModelId . ".fa";
            # Missing inputs are only reported, the run continues regardless.
            if(! -e $rfamModelPath){
                print "Does not exist: $rfamModelPath ";
            }
            if(! -e $rfamFastaPath){
                print "Does not exist: $rfamFastaPath ";
            }
            if(! -e $alienFastaPath){
                print "Does not exist: $alienFastaPath";
            }
            #set threshold corresponding to bitscore or evalue cutoff
            my $threshold;
            my $databaseSize;
            if($thresholdSelection eq "bitscore"){
                my $rfamThresholdUnmodified = $rfamModelNameId[2];
                my $rfamThreshold;
                # "-" in the family list means no gathering threshold is known.
                unless ($rfamThresholdUnmodified eq "-"){
                    $rfamThreshold = $rfamThresholdUnmodified * $gathering_score_multiplier;
                }else{
                    $rfamThreshold= "0";
                }
                if(defined $gathering_score_lower_bound){
                    if($rfamThreshold < $gathering_score_lower_bound){
                        $rfamThreshold = $gathering_score_lower_bound;
                    }
                }
                $threshold = $rfamThreshold;
                $databaseSize = "";
            }else{
                $threshold = $evalueThreshold;
                $databaseSize = setdatabasesize($counter,$type);
            }
            # Build the command once so that the logged line is exactly the
            # command that was executed (including -w).
            my $command = "RNAlienStatistics $databaseSize -s $thresholdSelection -c $cpu_cores -n $rfamModelName -d $rfamModelId -b $counter -i $alienModelPath -r $rfamModelPath -a $alienFastaPath -g $rfamFastaPath -t $threshold -x $threshold -o $resulttempdir -w $alienRNAcodePath -z $alienRNAzPath -m $aliencmstatPath";
            $output = $output . `$command`;
            print $command . "\n";
        }else{
            # No model was built for this family: emit a placeholder row with
            # the index, zero alien sequences and "-" in every other column.
            my @placeholder = map {
                $_ eq "Index" ? $counter : $_ eq "alienFastaNumber" ? "0" : "-"
            } @columns;
            $output = $output . join("\t", @placeholder) . "\n";
            print "Does not exist: $alienModelPath\n";
        }
    }

    open(my $outputfh, ">", $outputfilePath)
        or die "Failed to open file: $!\n";
    print $outputfh $output;
    close $outputfh;
    return 1;
}
256 |
# Returns the database size option (" -k <size> ") used for e-value
# thresholds.
#
# Arguments:
#   $counter  family index within the benchmark; expected as a plain integer,
#             as produced by the caller's loop counter
#   $type     benchmark type
#
# For the "sRNA" and "background" benchmarks the size is always 1000. For
# "diverse" and for every other type (the structured family numbering) the
# families listed below get size 1 and all others 1000.
#
# The tables are plain hashes (family index => family name), so the function
# no longer depends on the deprecated source-filter module Switch. They are
# built inside the sub because it is called before the end of the file has
# been executed.
sub setdatabasesize{
    my $counter = shift;
    my $type = shift;

    my %diverse_families = (
        7 => "RNaseP_bact_b", 12 => "PrfA", 13 => "CopA", 14 => "FMN",
        19 => "RNAI", 20 => "SIB_RNA", 23 => "Purine", 24 => "SSU_rRNA_bacteria",
        26 => "glmS", 27 => "ctRNA_pGA1", 28 => "RNA-OUT", 29 => "ctRNA_pT181",
        36 => "ydaO-yuaA", 38 => "Pox_AX_element", 39 => "IBV_D-RNA", 45 => "ROSE",
        48 => "HCV_SLVII", 49 => "HCV_SLIV", 51 => "HIV_FE", 56 => "RNAIII",
        57 => "Thr_leader", 59 => "Leu_leader", 60 => "Trp_leader", 61 => "His_leader",
        62 => "PreQ1", 63 => "Flavivirus_DB", 67 => "L13_leader", 68 => "L19_leader",
        69 => "L20_leader", 70 => "L21_leader", 75 => "P1", 76 => "P24",
        85 => "preQ1-II", 86 => "MOCO_RNA_motif", 87 => "RF_site2", 88 => "RF_site3",
        89 => "RF_site5", 90 => "RF_site9", 91 => "PK-G12rRNA", 105 => "AHBV_epsilon",
        106 => "CRISPR-DR2", 107 => "CRISPR-DR3", 108 => "CRISPR-DR5", 109 => "CRISPR-DR7",
        110 => "CRISPR-DR35", 111 => "CRISPR-DR53", 112 => "CRISPR-DR55", 113 => "CRISPR-DR60",
        114 => "CRISPR-DR61", 115 => "CRISPR-DR65", 116 => "isrA", 117 => "istR",
        120 => "NrrF", 121 => "VrrA", 122 => "MFR", 126 => "AdoCbl-variant",
        127 => "Lnt", 128 => "cspA", 129 => "SMK_box_riboswitch", 130 => "rnk_leader",
        131 => "RatA", 132 => "blv_FSE", 133 => "FourU", 134 => "fstAT",
        135 => "HSUR", 136 => "Lambda_thermo", 138 => "MicX", 139 => "symR",
        140 => "PtaRNA1", 141 => "rdlD", 142 => "ROSE", 143 => "HIV_FS2",
        144 => "ovine_lenti_FSE", 145 => "veev_FSE", 153 => "SSU_rRNA_archaea", 155 => "HEARO",
        156 => "STnc630", 157 => "STnc370", 158 => "STnc180", 159 => "OrzO-P",
        161 => "tfoR", 162 => "IS009", 169 => "sX5", 170 => "sX11",
        174 => "hsp17", 175 => "PyrG_leader", 176 => "PyrD_leader", 177 => "Ms_AS-8",
        183 => "ohsC_RNA", 185 => "ToxI", 186 => "ROSE_3",
    );
    my %structured_families = (
        7 => "RNaseP_bact_a", 8 => "RNaseP_bact_b", 16 => "phageP-RNA", 17 => "FMN",
        19 => "S15", 20 => "SAM", 22 => "Purin", 23 => "Lysine",
        24 => "Bacterial_small_SRP", 25 => "Cobalamin", 26 => "HIV-1_DIS", 27 => "SSU_rRNA_bacteria",
        29 => "IRES_Pesti", 30 => "glmS", 32 => "ykoK", 33 => "IRES_Cripavirus",
        34 => "HIV_FE", 35 => "TCV_H5", 36 => "Glycine", 39 => "c-di-GMP-I",
        40 => "preQ1-II", 42 => "PK-G12rRNA", 43 => "HIV-1_SD", 44 => "MFR",
        45 => "AdoCbl-variant", 46 => "crcB", 47 => "c-di-GMP-II", 48 => "THF",
        51 => "Archea_SRP", 56 => "ToxI",
    );

    my $databasesize;
    if($type eq "diverse"){
        $databasesize = exists $diverse_families{$counter} ? 1 : 1000;
    }elsif($type eq "sRNA" || $type eq "background"){
        $databasesize = 1000;
    }else{
        $databasesize = exists $structured_families{$counter} ? 1 : 1000;
    }
    return " -k $databasesize ";
}
398 |
--------------------------------------------------------------------------------
/scripts/alienstructurestatistics.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #./scripts/alienstructurestatistics.pl structured 13
3 | # Computes and plots structure distance among alien benchmark sets and versus Rfam
4 |
5 | # 1. Computes the normalized distance changes over iterations
6 | # 3. Computes the average normalized distance changes over iterations
7 | # 2. Computes the distance between updated structure and normal structure over iterations
8 | # 4. Computes the average distance between updated structure and normal structure over iterations
9 | # 5. Compute the normalized distance between iteration and Rfam consensus
10 | # 6. Compute the average normalized distance between iteration and Rfam consensus
11 |
12 | use warnings;
13 | use strict;
14 | use diagnostics;
15 | use Data::Dumper;
16 | use Cwd;
17 | $|=1; #unbuffered output
18 | #decides which benchmark data to process: "structured" selects the structured set, anything else the sRNA set
19 | my $type = $ARGV[0];
20 | #result iteration, appended to the result folder names below
21 | my $currentresultnumber = $ARGV[1];
22 | #contains all RNAlien result folders for sRNA tagged families
23 | my $alienresult_basename;
24 | #contains all Rfam Families names by family name with extension .cm
25 | my $rfammodel_basename;
26 | #contains all full seed alignment sequences as RfamID .fa fasta files
27 | my $rfamfasta_basename;
28 | #contains seed alignments; distanceBetweenAlienRfamStockholms reads them as <family index>.stockholm
29 | my $rfamstockholm_basename;
30 | die "Usage: $0 <structured|sRNA> <resultnumber>\n" unless defined $type && defined $currentresultnumber;
31 | my $RNAFamilyIdFile;
32 | my $familyNumber;
33 | my $resulttempdir;
34 |
35 | if($type eq "structured"){
36 |     $alienresult_basename="/scr/coridan/egg/AlienStructuredResultsCollected" . "$currentresultnumber" . "/";
37 |     $rfamstockholm_basename = "/scr/coridan/egg/structuredfamilyrfamstockholm/";
38 |     $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyseedfasta/";
39 |     $RNAFamilyIdFile = "/scr/coridan/egg/structuredFamilyNameIdGatheringCutoffSorted";
40 |     $familyNumber = 56;
41 |     $resulttempdir = "/scr/coridan/egg/temp/AlienStructuredResultStatistics". "$currentresultnumber" . "/";
42 | }else{
43 |     #sRNA -- NOTE(review): $rfamstockholm_basename and $rfamfasta_basename are not set in this branch
44 |     $alienresult_basename="/scr/kronos/egg/AlienResultsCollected" . "$currentresultnumber" . "/";
45 |     $rfammodel_basename = "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/";
46 |     $RNAFamilyIdFile = "/scr/kronos/egg/smallRNAtaggedfamiliesNameIDThresholdTagSorted.csv";
47 |     $familyNumber = 374;
48 |     $resulttempdir = "/scr/kronos/egg/temp/AlienResultStatistics" . "$currentresultnumber" . "/";
49 | }
50 | my $iterationdistancedir = "/scr/kronos/egg/iterationdistance$currentresultnumber/"; #output folder for the .dist files
51 | #Distance comparison between first stockholms of constructions with and without structureupdate
52 | #normalizedDistanceBetweenFirstStockholms($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,"/scratch/egg/");
53 | unless(-d $iterationdistancedir){
54 |     mkdir $iterationdistancedir or die "Failed to create $iterationdistancedir: $!\n";
55 | }
56 | distanceBetweenAlienRfamStockholms($familyNumber,$alienresult_basename,$rfamstockholm_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,$iterationdistancedir);
57 | #normalizedDistanceOverIterations($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,"/scr/kronos/egg/iterationdistance$currentresultnumber/");
58 |
59 | sub distanceBetweenAlienRfamStockholms{
60 |     #Runs CompareStockholmStructure for every family folder that contains a "done" marker, comparing the Rfam
61 |     #stockholm <index>.stockholm with the RNAlien result.stockholm; all tool output goes to one .dist file. Returns 1.
62 |     my $familyNumber = shift;           #family folders 1..$familyNumber are visited
63 |     my $alienresult_basename = shift;   #path prefix; the family index is appended
64 |     my $rfamstockholm_basename = shift; #folder containing <index>.stockholm
65 |     my $rfamfasta_basename = shift;     #not used in this sub
66 |     my $RNAFamilyIdFile = shift;        #not used in this sub
67 |     my $resulttempdir = shift;          #not used in this sub
68 |     my $resultfolderpath = shift;       #prefix of the output file; also passed to the tool as -o
69 |     my $outputfilePath= $resultfolderpath . "distancestructureupdatenone.dist";
70 |     my $output = ""; #initialised so that appending and printing never operate on undef
71 |     for(my $counter=1; $counter <= $familyNumber; $counter++){
72 |         my $current_alienresult_folder= $alienresult_basename.$counter."/";
73 |         if(-e $current_alienresult_folder."done"){
74 |             #print "$alienresult_basename$counter\n";
75 |             my $fstStockholmPath = "$rfamstockholm_basename/$counter.stockholm";
76 |             my $sndStockholmPath = $current_alienresult_folder."result.stockholm";
77 |             my $inputFastaPath = $current_alienresult_folder."result.fa";
78 |             if(-e $inputFastaPath){
79 |                 #only the header (first line) of the fasta file is needed
80 |                 open(my $fastafh, "<", $inputFastaPath)
81 |                     or die "Failed to open file: $!\n";
82 |                 my $fasta_identifier = <$fastafh>;
83 |                 close $fastafh;
84 |                 #an empty fasta file yields undef; use "" so the substitution below does not warn
85 |                 $fasta_identifier = "" unless defined $fasta_identifier;
86 |                 chomp $fasta_identifier;
87 |                 $fasta_identifier =~ s/>//;
88 |                 #$fasta_identifier =~ s/\\K.+$//;
89 |                 #NOTE(review): identifier and paths are interpolated unquoted into the shell command below
90 |                 if(-e $fstStockholmPath){
91 |                     $output = $output . `~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i $fasta_identifier -a $fstStockholmPath -r $sndStockholmPath -d P -o $resultfolderpath`;
92 |                 }else{
93 |                     $output = $output . "no stockholm found\n";
94 |                 }
95 |             }
96 |         }else{
97 |             #NOTE(review): reached when the "done" marker is missing, although the message names the input fasta
98 |             $output = $output . "no inputfasta found\n";
99 |         }
100 |     }
101 |
102 |     open(my $outputfh, ">", $outputfilePath)
103 |         or die "Failed to open file: $!\n";
104 |     print $outputfh $output;
105 |     close $outputfh;
106 |     return 1;
107 | }
108 |
109 |
110 | sub normalizedDistanceBetweenFirstStockholms{
111 |     #For every family folder with a "done" marker, compares the first model.stockholm found in the hard-coded
112 |     #result collections 12 and 13 via CompareStockholmStructure; all tool output goes to one .dist file. Returns 1.
113 |     my $familyNumber = shift;          #family folders 1..$familyNumber are visited
114 |     my $alienresult_basename = shift;  #path prefix; the family index is appended
115 |     my $rfammodel_basename = shift;    #not used in this sub
116 |     my $rfamfasta_basename = shift;    #not used in this sub
117 |     my $RNAFamilyIdFile = shift;       #not used in this sub
118 |     my $resulttempdir = shift;         #not used in this sub
119 |     my $resultfolderpath = shift;      #prefix of the output file
120 |     my $outputfilePath= $resultfolderpath . "distancestructureupdatenone.dist";
121 |     my $output = ""; #initialised so that appending and printing never operate on undef
122 |     for(my $counter=1; $counter <= $familyNumber; $counter++){
123 |         my $current_alienresult_folder= $alienresult_basename.$counter."/";
124 |         if(-e $current_alienresult_folder."done"){
125 |             #print "$alienresult_basename$counter\n";
126 |             my $fstStockholmPath = findStockholm("/scratch/egg/AlienStructuredResultsCollected12/$counter/");
127 |             my $sndStockholmPath = findStockholm("/scratch/egg/AlienStructuredResultsCollected13/$counter/");
128 |             my $inputFastaPath = findInputFasta($current_alienresult_folder);
129 |             if(defined $inputFastaPath && -e $inputFastaPath){
130 |                 #only the header (first line) of the fasta file is needed
131 |                 open(my $fastafh, "<", $inputFastaPath)
132 |                     or die "Failed to open file: $!\n";
133 |                 my $fasta_identifier = <$fastafh>;
134 |                 close $fastafh;
135 |                 #an empty fasta file yields undef; use "" so the substitutions below do not warn
136 |                 $fasta_identifier = "" unless defined $fasta_identifier;
137 |                 chomp $fasta_identifier;
138 |                 $fasta_identifier =~ s/>//;
139 |                 $fasta_identifier =~ s/\\K.+$//;
140 |                 #the finder subs may yield no usable path, so both alignments are checked before running the tool
141 |                 if(defined $fstStockholmPath && defined $sndStockholmPath && -e $fstStockholmPath && -e $sndStockholmPath){
142 |                     $output = $output . `~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i $fasta_identifier -a $fstStockholmPath -r $sndStockholmPath -o /scratch/egg/temp/`;
143 |                 }else{
144 |                     $output = $output . "no stockholm found\n";
145 |                 }
146 |             }
147 |         }else{
148 |             #NOTE(review): reached when the "done" marker is missing, although the message names the input fasta
149 |             $output = $output . "no inputfasta found\n";
150 |         }
151 |     }
152 |
153 |     open(my $outputfh, ">", $outputfilePath)
154 |         or die "Failed to open file: $!\n";
155 |     print $outputfh $output;
156 |     close $outputfh;
157 |     return 1;
158 | }
159 |
160 | sub normalizedDistanceOverIterations{
161 | #For each family index, compares every iteration's model.stockholm against a reference stockholm via
162 | #CompareStockholmStructure and writes one <index>_iterationstructure.dist file per index. Returns 1.
163 | my $familyNumber = shift; #family folders 1..$familyNumber are visited
164 | my $alienresult_basename = shift; #path prefix; the family index is appended
165 | my $rfammodel_basename = shift; #not used in this sub
166 | my $rfamfasta_basename = shift; #not used in this sub
167 | my $RNAFamilyIdFile = shift; #not used in this sub
168 | my $resulttempdir = shift; #not used in this sub
169 | my $resultfolderpath = shift; #prefix of the per-family output files
170 | for(my $counter=1; $counter <= $familyNumber; $counter++){
171 | my $output = ""; #collected per family, written at the end of each loop pass
172 | my $current_alienresult_folder= $alienresult_basename.$counter."/";
173 | if(-e $alienresult_basename . $counter."/done"){ #only folders with a "done" marker are compared
174 | #print "$alienresult_basename$counter\n";
175 | my $referenceStockholmPath = findStockholm("/scratch/egg/AlienStructuredResultsCollected13/$counter/"); #reference collection is hard-coded
176 | my $inputFastaPath = findInputFasta($current_alienresult_folder);
177 | my $iterationNumber = findIterationNumber($current_alienresult_folder);
178 | if(-e $inputFastaPath){
179 | my @fastacontent;
180 | open(my $fastafh, "<", $inputFastaPath)
181 | or die "Failed to open file: $!\n";
182 | while(<$fastafh>) {
183 | chomp;
184 | push @fastacontent, $_;
185 | }
186 | close $fastafh;
187 | my $fasta_identifier = $fastacontent[0]; #only the first line (fasta header) is used
188 | $fasta_identifier =~ s/>//; #drop the first ">"
189 | $fasta_identifier =~ s/\\K.+$//; #removes from a literal backslash followed by "K" to the end -- NOTE(review): confirm intent
190 | if(-e $referenceStockholmPath){
191 | for(my $iteration = 0; $iteration <= $iterationNumber; $iteration++){ #upper bound is inclusive
192 | my $currentStockholmPath = $current_alienresult_folder . $iteration . "/model.stockholm";
193 | if(-e $currentStockholmPath){
194 | $output = $output . "$iteration\t" . `~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i $fasta_identifier -a $referenceStockholmPath -r $currentStockholmPath -o /scratch/egg/temp/`;
195 | }else{
196 | #iterations without a model.stockholm are reported as NA
197 | $output = $output . "$iteration\tNA\n"
198 | }
199 | }
200 | }else{
201 | $output = $output . "no stockholm found\n";
202 | }
203 | }
204 | }else{ #NOTE(review): this branch belongs to the "done" check above, although the message names the input fasta
205 | $output = $output . "no inputfasta found\n";
206 | }
207 | my $outputfilePath = $resultfolderpath . $counter . "_iterationstructure.dist"; #written for every index, done or not
208 | open(my $outputfh, ">", $outputfilePath)
209 | or die "Failed to open file: $!\n";
210 | print $outputfh $output;
211 | close $outputfh;
212 | }
213 | return 1;
214 | }
215 |
216 | sub findIterationNumber{
217 |     #Returns the index of the first missing iteration subfolder (<folder>/0, <folder>/1, ...).
218 |     #Indices 0..50 are probed; 51 is returned when all of them exist.
219 |     #Parameter: path of one RNAlien result folder.
220 |     my $current_alienresult_folder = shift;
221 |     my $iteration = 0;
222 |     #same cap as before: stop once the index exceeds 50
223 |     while($iteration <= 50){
224 |         my $currentpath = $current_alienresult_folder."/".$iteration;
225 |         unless(-d $currentpath){
226 |             return $iteration;
227 |         }
228 |         $iteration++;
229 |     }
230 |     #The sub used to fall off the end of the loop here, which leaves the return
231 |     #value unspecified; return the capped index so callers always get a number.
232 |     return $iteration;
233 | }
234 |
235 | sub findInputFasta{
236 |     #Returns the path of the first existing <folder>/<iteration>/input.fa for iteration 0..50,
237 |     #or the empty string when none of these files exists.
238 |     #Parameter: path of one RNAlien result folder.
239 |     my $current_alienresult_folder = shift;
240 |     my $iteration = 0;
241 |     while($iteration <= 50){
242 |         my $currentfastapath = $current_alienresult_folder."/".$iteration."/input.fa";
243 |         if(-e $currentfastapath){
244 |             return $currentfastapath;
245 |         }
246 |         $iteration++;
247 |     }
248 |     #The sub used to fall off the end of the loop here, which leaves the return value
249 |     #unspecified. An empty string keeps the callers' "-e" tests false without
250 |     #triggering "uninitialized value" warnings.
251 |     return "";
252 | }
253 |
254 | sub findStockholm{
255 |     #Returns the path of the first existing <folder>/<iteration>/model.stockholm for iteration 0..50,
256 |     #or the empty string when none of these files exists.
257 |     #Parameter: path of one RNAlien result folder.
258 |     my $current_alienresult_folder = shift;
259 |     my $iteration = 0;
260 |     while($iteration <= 50){
261 |         my $currentstockholmpath = $current_alienresult_folder."/".$iteration."/model.stockholm";
262 |         if(-e $currentstockholmpath){
263 |             return $currentstockholmpath;
264 |         }
265 |         $iteration++;
266 |     }
267 |     #The sub used to fall off the end of the loop here, which leaves the return value
268 |     #unspecified. An empty string keeps the callers' "-e" tests false without
269 |     #triggering "uninitialized value" warnings.
270 |     return "";
271 | }
272 |
273 | # sub normalizedDistanceChangeOverIterations{
274 | # #retrieve common sequence identifier
275 | # #compare stockholmstructre and parse result back
276 | # my $familyNumber = shift;
277 | # my $alienresult_basename = shift;
278 | # my $rfammodel_basename = shift;
279 | # my $rfamfasta_basename = shift;
280 | # my $RNAFamilyIdFile = shift;
281 | # my $resulttempdir = shift;
282 | # my $gathering_score_multiplier = shift;
283 | # my $gathering_score_lower_bound = shift;
284 | # my $outputfilePath = shift;
285 | # my $output;
286 | # for(my $counter=1; $counter <= $familyNumber; $counter++){
287 | # my $current_alienresult_folder= $alienresult_basename.$counter."/";
288 | # if(-e $alienresult_basename.$counter."/done"){
289 | # my $alienModelPath = $current_alienresult_folder."result.cm";
290 | # my $alienFastaPath = $current_alienresult_folder."result.fa";
291 | # my @rfamModelNameId = split(/\s+/,$RNAfamilies[($counter - 1)]);
292 | # my $rfamModelName = $rfamModelNameId[0];
293 | # my $rfamModelId = $rfamModelNameId[1];
294 | # my $rfamModelPath = $rfammodel_basename . $rfamModelId . ".cm";
295 | # my $rfamFastaPath =$rfamfasta_basename . $rfamModelId . ".fa";
296 | # if(! -e $rfamModelPath){
297 | # print "Does not exist: $rfamModelPath ";
298 | # }
299 | # if(! -e $rfamFastaPath){
300 | # print "Does not exist: $rfamFastaPath ";
301 | # }
302 |
303 | # if(! -e $alienModelPath){
304 | # print "Does not exist: $alienModelPath ";
305 | # }
306 | # if(! -e $alienFastaPath){
307 | # print "Does not exist: $alienFastaPath";
308 | # }
309 | # $output = $output . `RNAlienStatistics -c 20 -n $rfamModelName -d $rfamModelId -b $counter -i $alienModelPath -r $rfamModelPath -a $alienFastaPath -g $rfamFastaPath -t $rfamThreshold -x $rfamThreshold -o $resulttempdir`;
310 | # #~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i AB001721.1 -a /scratch/egg/AlienStructuredResultsCollected13/1/1/model.stockholm -r /scratch/egg/AlienStructuredResultsCollected13/1/9/model.stockholm -o /scratch/egg/temp/
311 | # }
312 | # }
313 | # open(my $outputfh, ">", $outputfilePath)
314 | # or die "Failed to open file: $!\n";
315 | # print $outputfh $output;
316 | # close $outputfh;
317 | # return 1;
318 | # }
319 |
320 | # sub averageNormalizedDistanceChangesOverIterations{
321 | # #summarize familywise results of NormalizedDistanceChangesOverIterations
322 | # return 1;
323 | # }
324 |
325 | # sub normalizedDistanceChangeOverIterations{
326 | # return 1;
327 | # }
328 |
329 | # sub normalizedDistanceChangeOverIterations{
330 | # return 1;
331 | # }
332 |
333 | # sub normalizedDistanceChangeOverIterations{
334 | # return 1;
335 | # }
336 |
337 | # sub normalizedDistanceChangeOverIterations{
338 | # return 1;
339 | # }
340 |
--------------------------------------------------------------------------------
/scripts/blastbenchmarkdata.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #Copies blastout/<index>.fasta and blastout/<index>.cm for the indices 1..56 into
3 | #blastout/<index>/result.fa and blastout/<index>/result.cm.
4 | #The commented-out commands appear to be the earlier pipeline steps that produced these inputs.
5 | use strict;
6 | use warnings;
7 | use File::Copy qw(copy);
8 | #blastn -db nt -evalue 0.001 -query "/scratch/egg/structuredRNATestSet/1.fa"
9 | #$ blastx -db myDB -query myQuery -out myContigList.txt -outfmt "6 sallacc"
10 | #$ blastdbcmd -db myBlastDBName -dbtype prot -entry_batch myContigList.txt -outfmt %f -out myHitContigs.fasta
11 | #
12 | for my $counter (1..56){
13 |     print "$counter\n";
14 |     # #`blastn -db nt -evalue 0.001 -soft_masking true -query structuredRNATestSet/$counter.fa -out blastout/$counter.txt -outfmt \"6 sallacc qcovs sseq\"`;
15 |     # open(my $blastfh, "<", "blastout/$counter.txt") or die "Failed to open file: $!\n";
16 |     # open(my $fastafh, ">", "blastout/$counter.fasta") or die "Failed to open file: $!\n";
17 |     # my @sequences;
18 |     # my $counter2=0;
19 |     # while(<$blastfh>) {
20 |     # chomp;
21 |     # #add to hash
22 |     # my @line = split('\t',$_);
23 |     # my $unique=1;
24 |     # foreach my $seq (@sequences){
25 |     # #print "$line[1] $seq\n";
26 |     # if($line[2] eq $seq){
27 |     # $unique = 0;
28 |     # }
29 |     # }
30 |     # if($unique){
31 |     # push (@sequences, $line[2]);
32 |     # my $printseq= $line[2];
33 |     # $printseq =~ s/-//g;
34 |     #
35 |     # if($line[1]>=80){
36 |     # print $fastafh ">$line[0]_$counter2\n$printseq\n";
37 |     # #print ">$line[0]\n$line[1]\n";
38 |     # }
39 |     # }
40 |     # #print @sequences;
41 |     # $counter2++;
42 |     # }
43 |     # close $blastfh;
44 |     # close $fastafh;
45 |     #
46 |     #
47 |
48 |     #blastdbcmd -db nt -dbtype nucl -entry_batch blastout/$counter.txt -outfmt %f -out blastout/$counter.fasta`;
49 |
50 |     #`mlocarna --skip-pp --fast-mea --free-endgaps --threads 3 blastout/$counter.fasta --tgtdir blastout/$counter.mlocarna`;
51 |     #if(-e "blastout/$counter.mlocarna/results/result.aln"){
52 |     #`cp blastout/$counter.mlocarna/results/result.aln blastout/$counter.clustal`;
53 |     #`RNAalifold -r --cfactor 0.6 --nfactor 0.5 < blastout/$counter.clustal > blastout/$counter.alifold`;
54 |     #`/scratch/egg/alienhmmerblast/convertalignments.pl -i blastout/$counter.clustal -o blastout/$counter.stockholm -f stockholm`;
55 |     #}else{
56 |     #`/scratch/egg/alienhmmerblast/convertalignments.pl -i blastout/$counter.fasta -o blastout/$counter.stockholm -f stockholm`;
57 |     # `RNAfold < blastout/$counter.fasta > blastout/$counter.fold`;
58 |     #}
59 |     #`cmbuild --refine blastout/$counter.refine blastout/$counter.cm blastout/$counter.stockholm > blastout/$counter.log`;
60 |     #`cmcalibrate blastout/$counter.cm`;
61 |     #Copying to folders with running index for RNAlienStatistics wrapper script
62 |     my $targetdir = "blastout/$counter";
63 |     #an existing folder is reused, so the script can be re-run; failures are reported and the loop continues
64 |     unless(-d $targetdir || mkdir $targetdir){
65 |         warn "Failed to create $targetdir: $!\n";
66 |         next;
67 |     }
68 |     copy("blastout/$counter.fasta", "$targetdir/result.fa")
69 |         or warn "Failed to copy blastout/$counter.fasta: $!\n";
70 |     copy("blastout/$counter.cm", "$targetdir/result.cm")
71 |         or warn "Failed to copy blastout/$counter.cm: $!\n";
72 | }
73 |
--------------------------------------------------------------------------------
/scripts/buildClanModels.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | #Write RNA family models of clan members in one file:
4 | #reads clan_membership.txt (tab separated: clan id, member family) and concatenates
5 | #all_models/<member>.cm of every member into clan_models/<clan>.cm
6 |
7 | use strict;
8 | use warnings;
9 | use Data::Dumper qw(Dumper);
10 |
11 | #Read in clan_membership.txt to find the clan member families
12 | #Build hash with clan id as key and members as values
13 | my $clanMembersFile = "clan_membership.txt";
14 | my %clan_members;
15 | open(my $clanMembersfh, "<", $clanMembersFile)
16 |     or die "Failed to open file: $!\n";
17 | while(my $entry = <$clanMembersfh>) {
18 |     chomp $entry;
19 |     my ($clan, $member) = split(/\t/, $entry);
20 |     #skip blank or incomplete lines instead of storing undefined clan ids or members
21 |     next unless defined $clan && length $clan && defined $member && length $member;
22 |     push( @{ $clan_members{$clan} }, $member);
23 | }
24 | close $clanMembersfh;
25 |
26 | #print Dumper \%clan_members;
27 |
28 | #Write member covariance model into clan covariance model in clan_models subdirectory
29 | foreach my $clan (keys %clan_members){
30 |     my $clanModelPath = "clan_models/$clan.cm";
31 |     #opening with ">" creates the file or empties a previous clan model (replaces rm + touch)
32 |     open(my $clanfh, ">", $clanModelPath)
33 |         or die "Failed to open file $clanModelPath: $!\n";
34 |     foreach my $member (@{ $clan_members{$clan} }) {
35 |         my $memberModelPath = "all_models/$member.cm";
36 |         my $memberfh;
37 |         #a missing member model is reported and skipped, the remaining members are still written
38 |         unless(open($memberfh, "<", $memberModelPath)){
39 |             warn "Skipping $memberModelPath: $!\n";
40 |             next;
41 |         }
42 |         print $clanfh $_ while <$memberfh>;
43 |         close $memberfh;
44 |     }
45 |     close $clanfh
46 |         or die "Failed to write file $clanModelPath: $!\n";
47 | }
48 |
--------------------------------------------------------------------------------
/scripts/cmComparevsRfam.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #Summarises CMCompare result tables: for every family index 1..56 with an existing result file it prints
3 | #the best and second-best linked model (model id and link score), first for the <index>.alienresult
4 | #files and then for the <index>.rfamresult files below $cmcompareresult_basename.
5 |
6 | use warnings;
7 | use strict;
8 | use diagnostics;
9 | use utf8;
10 | use Data::Dumper;
11 | use List::Util qw(min max);
12 | use File::Basename;
13 | use Cwd;
14 | $|=1;
15 |
16 | #contains all RNAlien result folders for sRNA tagged families
17 | #my $alienresult_basename="/scratch/egg/AlienTestResult5/temp/";
18 | #my $aliencollected_basename="/scratch/egg/AlienResultsCollected/";
19 |
20 | my $cmcompareresult_basename="/scratch/egg/cmcomparestructuredResultscollected/";
21 | my $familyNumber = 56;
22 |
23 | foreach my $extension (".alienresult", ".rfamresult"){
24 |     printBestHits($cmcompareresult_basename, $extension, $familyNumber);
25 | }
26 |
27 | #Prints the table header and one line per existing file <basename><index><extension>, index 1..$familyNumber.
28 | #A hit must have a link score above 2 to count as best and above 1 to count as second best;
29 | #columns without a qualifying hit are printed as NA.
30 | sub printBestHits{
31 |     my ($basename, $extension, $familyNumber) = @_;
32 |     print "index\tbestModelID\tbestModelLink\t2ndbestModelID\t2ndbestModelLink\n";
33 |     for my $counter (1..$familyNumber){
34 |         my $resultfile = $basename.$counter.$extension;
35 |         next unless -e $resultfile;
36 |         open(my $resultfh, "<", $resultfile)
37 |             or die "Failed to open file: $!\n";
38 |         my ($bestentry, $sndbestentry);
39 |         my ($bestlinkscore, $sndbestlinkscore) = (2, 1);
40 |         while(my $line = <$resultfh>){
41 |             chomp $line;
42 |             #/scr/kronos/egg/AlienStructuredResultsCollected4/8/result.cm /scratch/egg/all_models//RF00001.cm -4.642 -2.479
43 |             my @fields = split(/\s+/,$line);
44 |             #skip lines that do not carry both score columns
45 |             next unless defined $fields[3];
46 |             #the link score is the smaller of the two score columns
47 |             my $linkscore = min($fields[2],$fields[3]);
48 |             #model id = file name without the trailing .cm
49 |             my ($modelid) = fileparse($fields[1], qr/\.cm/);
50 |             my $entry = $modelid . "\t" . $linkscore;
51 |             if($linkscore > $bestlinkscore){
52 |                 #the previous best hit (if any) becomes the second-best hit
53 |                 if(defined $bestentry){
54 |                     ($sndbestentry, $sndbestlinkscore) = ($bestentry, $bestlinkscore);
55 |                 }
56 |                 ($bestentry, $bestlinkscore) = ($entry, $linkscore);
57 |             }elsif($linkscore > $sndbestlinkscore){
58 |                 ($sndbestentry, $sndbestlinkscore) = ($entry, $linkscore);
59 |             }
60 |         }
61 |         close $resultfh;
62 |         #keep the five-column layout when no hit passed a threshold
63 |         $bestentry = "NA\tNA" unless defined $bestentry;
64 |         $sndbestentry = "NA\tNA" unless defined $sndbestentry;
65 |         print $counter . "\t" . $bestentry . "\t" . $sndbestentry . "\n";
66 |     }
67 |     return 1;
68 | }
69 |
--------------------------------------------------------------------------------
/scripts/cmcomparebesthitextractor.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #Extracts the best CMCompare hits: for every family index 1..56 with an existing result file it prints
3 | #the best and second-best linked model (model id and link score), first for the <index>.alienresult
4 | #files and then for the <index>.rfamresult files below $cmcompareresult_basename.
5 |
6 | use warnings;
7 | use strict;
8 | use diagnostics;
9 | use utf8;
10 | use Data::Dumper;
11 | use List::Util qw(min max);
12 | use File::Basename;
13 | use Cwd;
14 | $|=1;
15 |
16 | #contains all RNAlien result folders for sRNA tagged families
17 | #my $alienresult_basename="/scratch/egg/AlienTestResult5/temp/";
18 | #my $aliencollected_basename="/scratch/egg/AlienResultsCollected/";
19 |
20 | my $cmcompareresult_basename="/scratch/egg/cmcomparestructuredResultscollected/";
21 | my $familyNumber = 56;
22 |
23 | foreach my $extension (".alienresult", ".rfamresult"){
24 |     printBestHits($cmcompareresult_basename, $extension, $familyNumber);
25 | }
26 |
27 | #Prints the table header and one line per existing file <basename><index><extension>, index 1..$familyNumber.
28 | #A hit must have a link score above 2 to count as best and above 1 to count as second best;
29 | #columns without a qualifying hit are printed as NA.
30 | sub printBestHits{
31 |     my ($basename, $extension, $familyNumber) = @_;
32 |     print "index\tbestModelID\tbestModelLink\t2ndbestModelID\t2ndbestModelLink\n";
33 |     for my $counter (1..$familyNumber){
34 |         my $resultfile = $basename.$counter.$extension;
35 |         next unless -e $resultfile;
36 |         open(my $resultfh, "<", $resultfile)
37 |             or die "Failed to open file: $!\n";
38 |         my ($bestentry, $sndbestentry);
39 |         my ($bestlinkscore, $sndbestlinkscore) = (2, 1);
40 |         while(my $line = <$resultfh>){
41 |             chomp $line;
42 |             #/scr/kronos/egg/AlienStructuredResultsCollected4/8/result.cm /scratch/egg/all_models//RF00001.cm -4.642 -2.479
43 |             my @fields = split(/\s+/,$line);
44 |             #skip lines that do not carry both score columns
45 |             next unless defined $fields[3];
46 |             #the link score is the smaller of the two score columns
47 |             my $linkscore = min($fields[2],$fields[3]);
48 |             #model id = file name without the trailing .cm
49 |             my ($modelid) = fileparse($fields[1], qr/\.cm/);
50 |             my $entry = $modelid . "\t" . $linkscore;
51 |             if($linkscore > $bestlinkscore){
52 |                 #the previous best hit (if any) becomes the second-best hit
53 |                 if(defined $bestentry){
54 |                     ($sndbestentry, $sndbestlinkscore) = ($bestentry, $bestlinkscore);
55 |                 }
56 |                 ($bestentry, $bestlinkscore) = ($entry, $linkscore);
57 |             }elsif($linkscore > $sndbestlinkscore){
58 |                 ($sndbestentry, $sndbestlinkscore) = ($entry, $linkscore);
59 |             }
60 |         }
61 |         close $resultfh;
62 |         #keep the five-column layout when no hit passed a threshold
63 |         $bestentry = "NA\tNA" unless defined $bestentry;
64 |         $sndbestentry = "NA\tNA" unless defined $sndbestentry;
65 |         print $counter . "\t" . $bestentry . "\t" . $sndbestentry . "\n";
66 |     }
67 |     return 1;
68 | }
69 |
--------------------------------------------------------------------------------
/scripts/getblastdb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | declare -a ntfiles=("nt.00.tar.gz" "nt.01.tar.gz" "nt.02.tar.gz" "nt.03.tar.gz"
3 | "nt.04.tar.gz" "nt.05.tar.gz" "nt.06.tar.gz" "nt.07.tar.gz"
4 | "nt.08.tar.gz" "nt.09.tar.gz" "nt.10.tar.gz" "nt.11.tar.gz"
5 | "nt.12.tar.gz" "nt.13.tar.gz" "nt.14.tar.gz" "nt.15.tar.gz"
6 | "nt.16.tar.gz" "nt.17.tar.gz" "nt.18.tar.gz" "nt.19.tar.gz"
7 | "nt.20.tar.gz" "nt.21.tar.gz" "nt.22.tar.gz" "nt.23.tar.gz"
8 | "nt.24.tar.gz" "nt.25.tar.gz" "nt.26.tar.gz" "nt.27.tar.gz"
9 | "nt.28.tar.gz" "nt.29.tar.gz" "nt.30.tar.gz" "nt.31.tar.gz"
10 | "nt.32.tar.gz" "nt.33.tar.gz" "nt.34.tar.gz" "nt.35.tar.gz"
11 | "nt.36.tar.gz" "nt.37.tar.gz" "nt.38.tar.gz" "nt.39.tar.gz"
12 | "nt.40.tar.gz" "nt.41.tar.gz" "nt.42.tar.gz" "nt.43.tar.gz"
13 | "nt.44.tar.gz" "nt.45.tar.gz" "nt.46.tar.gz" "nt.47.tar.gz"
14 | "nt.48.tar.gz" "nt.49.tar.gz" "nt.50.tar.gz" "nt.51.tar.gz"
15 | "nt.52.tar.gz" "nt.53.tar.gz" "nt.54.tar.gz" "nt.55.tar.gz")
16 | for f in "${ntfiles[@]}"
17 | do
18 | echo "$f"
19 | wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/$f
20 | tar zxvpf $f
21 | done
22 |
23 |
--------------------------------------------------------------------------------
/scripts/makemultiplotcsv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Splits columns 13 and 14 of every structuredalienseedoutput4-*.csv into per-file CSVs below
3 | # recoverymultiplot2/rfamonalien/ and recoverymultiplot2/alienonrfam/, sorts each of them
4 | # numerically and merges the sorted files column-wise into an "allsorted" file per folder.
5 | # -p: re-running with existing folders is not an error
6 | mkdir -p recoverymultiplot2/rfamonalien recoverymultiplot2/alienonrfam
7 | echo "" > recoverymultiplot2/filenames
8 | counter=1
9 | for i in structuredalienseedoutput4-*.csv; do
10 |     echo "recoverymultiplot2/rfamonalien/$counter.sorted.csv " >> recoverymultiplot2/filenames
11 |     echo "$i"
12 |     cut -d $'\t' -f 13 "$i" > "recoverymultiplot2/rfamonalien/$counter.csv"
13 |     cut -d $'\t' -f 14 "$i" > "recoverymultiplot2/alienonrfam/$counter.csv"
14 |     sort -k 1 -n "recoverymultiplot2/rfamonalien/$counter.csv" > "recoverymultiplot2/rfamonalien/$counter.sorted.csv"
15 |     sort -k 1 -n "recoverymultiplot2/alienonrfam/$counter.csv" > "recoverymultiplot2/alienonrfam/$counter.sorted.csv"
16 |     counter=$((counter + 1))
17 | done
18 | # NOTE(review): the next two statements are reconstructed -- the text between "$(<" and the first
19 | # ">" redirect was lost in this copy of the script; confirm against the upstream file.
20 | usedfilenames=$(<recoverymultiplot2/filenames)
21 | # word splitting of the file list is intended, hence no quotes
22 | pr -mts $usedfilenames > recoverymultiplot2/rfamonalien/allsorted
23 | # the list names the rfamonalien files; map it to the alienonrfam folder for the second merge
24 | pr -mts ${usedfilenames//rfamonalien/alienonrfam} > recoverymultiplot2/alienonrfam/allsorted
25 |
--------------------------------------------------------------------------------
/scripts/nhmmerbenchmarkdata.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #Copies nhmmerout/<index>.fa and nhmmerout/<index>.cm for the indices 1..56 (index 27 is skipped) into
3 | #nhmmerout/<index>/result.fa and nhmmerout/<index>/result.cm.
4 | #The commented-out commands appear to be the earlier pipeline steps that produced these inputs.
5 | use warnings;
6 | use strict;
7 | use File::Copy qw(copy);
8 | for my $counter (1..56){
9 |     print "$counter\n";
10 |     #'time nhmmer -E 0.001 -A nhmmerout/$counter.sto -o nhmmerout/$counter.hmmer /scratch/egg/structuredRNATestSet/$counter.fa nt`;
11 |     #`~egg/Tools/hmmer-3.1b2-linux-intel-x86_64/easel/miniapps/esl-reformat fasta nhmmerout/$counter.sto > nhmmerout/$counter.fa`;
12 |     next if $counter == 27;
13 |     #if(-e "nhmmerout/$counter.sto"){
14 |     #`/scratch/egg/alienhmmerblast/convertalignments.pl -g stockholm -i nhmmerout/$counter.sto -o nhmmerout/$counter.clustal -f clustalw`;
15 |     #`rnazSelectSeqs.pl nhmmerout/$counter.clustal`;
16 |     #`RNAalifold -r --cfactor 0.6 --nfactor 0.5 < nhmmerout/$counter.clustal > nhmmerout/$counter.alifold`;
17 |     # `/scratch/egg/alienhmmerblast/convertalignments.pl -g stockholm -i nhmmerout/$counter.sto -o nhmmerout/$counter.stockholm -f stockholm`;
18 |     #}else{
19 |     #copy input sequence in case of no hits
20 |     # `/scratch/egg/alienhmmerblast/convertalignments.pl -g fasta -i nhmmerout/$counter.fa -o nhmmerout/$counter.stockholm -f stockholm`;
21 |     # #`RNAfold < nhmmerout/$counter.fasta > nhmmerout/$counter.fold`;
22 |     #}
23 |     #Manually insert consensus structure line
24 |     #`cp nhmmerout/$counter.stockholm nhmmerout/$counter.stockholm.bak`;
25 |     #`grep -v "#=GS" nhmmerout/$counter.stockholm.bak | grep -v "#=GR" > nhmmerout/$counter.stockholm`;
26 |     #`cmbuild --refine nhmmerout/$counter.refine nhmmerout/$counter.cm nhmmerout/$counter.stockholm > nhmmerout/$counter.log`;
27 |     #`cmcalibrate --cpu 30 nhmmerout/$counter.cm`;
28 |
29 |     #Copying to folders with running index for RNAlienStatistics wrapper script
30 |     my $targetdir = "nhmmerout/$counter";
31 |     #an existing folder is reused, so the script can be re-run; failures are reported and the loop continues
32 |     unless(-d $targetdir || mkdir $targetdir){
33 |         warn "Failed to create $targetdir: $!\n";
34 |         next;
35 |     }
36 |     copy("nhmmerout/$counter.fa", "$targetdir/result.fa")
37 |         or warn "Failed to copy nhmmerout/$counter.fa: $!\n";
38 |     copy("nhmmerout/$counter.cm", "$targetdir/result.cm")
39 |         or warn "Failed to copy nhmmerout/$counter.cm: $!\n";
40 | }
41 |
--------------------------------------------------------------------------------
/stack.yaml:
--------------------------------------------------------------------------------
1 | flags: {}
2 | packages:
3 | - '.'
4 | extra-deps: []
5 | compiler-check: newer-minor
6 | resolver: lts-9.21
7 |
--------------------------------------------------------------------------------
/test/single.fa:
--------------------------------------------------------------------------------
1 | >AARQ02000011.1/391-585
2 | AAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAA
3 | AAGUAUCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCU
4 | GUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAG
5 | UAAUGAAACAUGCUU
6 |
--------------------------------------------------------------------------------
/test/test.stockholm:
--------------------------------------------------------------------------------
1 | # STOCKHOLM 1.0
2 | #=GF AU Infernal 1.1.2
3 |
4 | #=GS CP008770.1:757226-757421 DE Listeria monocytogenes strain 88-1059 genome
5 | #=GS CP007196.1:749187-749382 DE Listeria monocytogenes serotype 3c str. 10-5027, complete genome
6 | #=GS CP007169.1:797163-797359 DE Listeria monocytogenes serotype 1/2b str. 10-0811, complete genome
7 |
8 | AARQ02000011.1/391-585 -AAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAAAAGUA.UCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAACAUGCUU
9 | #=GR AARQ02000011.1/391-585 PP .*****************************************************************.**********************************************************************************************************************************
10 | CP008770.1:757226-757421 CAAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAAAAGUA.UCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAAUAUGCUU
11 | #=GR CP008770.1:757226-757421 PP ******************************************************************.**********************************************************************************************************************************
12 | CP007196.1:749187-749382 CAAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAAAAGUA.UCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAAUAUGCUU
13 | #=GR CP007196.1:749187-749382 PP ******************************************************************.**********************************************************************************************************************************
14 | CP007169.1:797163-797359 CAAUUAAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUACUUGAAGGUGAAAUCCCUGAAAAGUAAACAGUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCACUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAACAUGUUU
15 | #=GR CP007169.1:797163-797359 PP ******************************************************************9**********************************************************************************************************************************
16 | #=GC SS_cons :::::::::::(((((<<--<<<<<<<<<<--------<<<<<--<<<-<____>>>>--->>>>>.>>>>>>>>>>----->>,<<<<<<<--<<<<____>>>>---->>>>>>><<<<<<<____>>>>>>>,,,,,<<<<<<<-<<---<<<<<<<<____>>>>>>>>----->>>>>>>->>,,,,)))))
17 | #=GC RF cAAUUgAAUAGaagCgCCAGaaCuGaucGgGAcGAAAAugCuuGAaGGUGAAAUCCCuGAAaaGca.cCgauCaGuuGAcGAGGAGGaGacuAAcCGaAGUUuCGgcGGGaguCuCCCGgCuGcGcAUgCaGcCGUUAAGuCuuaCuUaCAAAcCacuuGGGUGACCaaguGgAcAGAGuaGuaaUGaAAcAcGcuu
18 | //
19 |
--------------------------------------------------------------------------------
/test/testcalls:
--------------------------------------------------------------------------------
1 | #Build with profiling enabled
2 | cabal new-build --enable-profiling --ghc-options="-rtsopts -threaded" --reinstall
3 | #Single input
4 | #Offline
5 | #Single fasta
6 | nohup RNAlien -i test/single.fa -c 7 -j -b /work/work/blast5db/nt_v5 -d single -w /work/work/new_taxdump/taxidlineage.dmp +RTS -p -N7&
7 | #Multi fasta
8 | nohup RNAlien -i test/testmulti.fa -c 7 -j -b /work/work/blast5db/nt_v5 -d multi -w /work/work/new_taxdump/taxidlineage.dmp +RTS -p -N7&
9 | #Stockholm alignment
10 | nohup RNAlien -p test/test.stockholm -c 7 -j -b /work/work/blast5db/nt_v5 -d aln -w /work/work/new_taxdump/taxidlineage.dmp +RTS -p -N7&
11 | #Online
12 | #Single fasta
13 | nohup RNAlien -i test/single.fa -c 7 -d onsingle +RTS -p -N7&
14 | #Multi fasta
15 | nohup RNAlien -i test/testmulti.fa -c 7 -d onmulti +RTS -p -N7&
16 | #Stockholm alignment
17 | nohup RNAlien -p test/test.stockholm -c 7 -d onaln +RTS -p -N7&
18 | #Scan
19 | #Single fasta
20 | RNAlienScan -i test/single.fa -b test/scan.fa -c 7 -d scansingle +RTS -p -N7&
21 | #Multi fasta
22 | RNAlienScan -i test/testmulti.fa -b test/scan.fa -c 7 -d scanmulti +RTS -p -N7&
23 | #Stockholm alignment
24 | RNAlienScan -i test/test.stockholm -b test/scan.fa -c 7 -d scanaln +RTS -p -N7&
25 |
26 |
27 |
--------------------------------------------------------------------------------
/test/testmulti.fa:
--------------------------------------------------------------------------------
1 | >AARQ02000011.1/391-585
2 | AATTGAATAGAAGCGCCAGAACTGATTGGGACGAAAATGCTTGAAGGTGAAATCCCTGAAAAGTATCGATCAGTTGACGA
3 | GGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCATTTGGG
4 | TGACCAAGTGGACAGAGTAGTAATGAAACATGCTT
5 | >CP008770.1:757226-757421 Listeria monocytogenes strain 88-1059 genome
6 | CAATTGAATAGAAGCGCCAGAACTGATTGGGACGAAAATGCTTGAAGGTGAAATCCCTGAAAAGTATCGATCAGTTGACG
7 | AGGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCATTTGG
8 | GTGACCAAGTGGACAGAGTAGTAATGAAATATGCTT
9 | >CP007196.1:749187-749382 Listeria monocytogenes serotype 3c str. 10-5027, complete genome
10 | CAATTGAATAGAAGCGCCAGAACTGATTGGGACGAAAATGCTTGAAGGTGAAATCCCTGAAAAGTATCGATCAGTTGACG
11 | AGGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCATTTGG
12 | GTGACCAAGTGGACAGAGTAGTAATGAAATATGCTT
13 | >CP007169.1:797163-797359 Listeria monocytogenes serotype 1/2b str. 10-0811, complete genome
14 | CAATTAAATAGAAGCGCCAGAACTGATTGGGACGAAAATACTTGAAGGTGAAATCCCTGAAAAGTAAACAGTCAGTTGAC
15 | GAGGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCACTTG
16 | GGTGACCAAGTGGACAGAGTAGTAATGAAACATGTTT
17 |
--------------------------------------------------------------------------------