├── .github └── workflows │ └── action.yml ├── .travis.yml ├── Biobase ├── RNAcentralHTTPRequest.hs ├── RNAlien.hs ├── RNAlien │ ├── CMstatParser.hs │ ├── InfernalParser.hs │ ├── Library.hs │ ├── RNAcentralHTTP.hs │ └── Types.hs ├── RNAlienScan.hs ├── RNAlienStatistics.hs └── cmsearchToBED.hs ├── ChangeLog.md ├── Dockerfile.dev ├── LICENSE ├── ParserTest.hs ├── README.md ├── RNAlien.cabal ├── RNAlien.svg ├── RNAlienScan.svg ├── cabal.project ├── default.nix ├── envhs.nix ├── manual.pdf ├── overrides.nix ├── scripts ├── AlienBenchmarkCMCompare.sh ├── AlienBenchmarkModels.sh ├── alienresultstatistics.pl ├── alienstructurestatistics.pl ├── blastbenchmarkdata.pl ├── buildClanModels.pl ├── cmComparevsRfam.pl ├── cmcomparebesthitextractor.pl ├── getblastdb.sh ├── makemultiplotcsv.sh └── nhmmerbenchmarkdata.pl ├── stack.yaml └── test ├── single.fa ├── test.stockholm ├── testcalls └── testmulti.fa /.github/workflows/action.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push] 3 | jobs: 4 | build: 5 | runs-on: ${{ matrix.os }} 6 | strategy: 7 | matrix: 8 | ghc: ['8.8', '8.10', '9.0'] 9 | cabal: ['3.2', '3.4'] 10 | os: [ubuntu-latest] 11 | name: Haskell GHC ${{ matrix.ghc }}, cabal ${{ matrix.cabal }} 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Setup Haskell 15 | uses: haskell/actions/setup@v1 16 | with: 17 | ghc-version: ${{ matrix.ghc }} 18 | cabal-version: ${{ matrix.cabal }} 19 | - run: cabal v2-test 20 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: c 4 | 5 | services: 6 | - docker 7 | 8 | before_script: 9 | - echo "$DOCKER_PASSWORD" | docker login -u "$DOCKER_USERNAME" --password-stdin 10 | - docker build --tag $DOCKER_USERNAME/rnalien -f Dockerfile.dev . 

script:
  - docker create --name develcontainer $DOCKER_USERNAME/rnalien
  - docker images
  - mkdir RNAlien
  - docker cp develcontainer:/RNAlien RNAlien
  - docker cp develcontainer:/RNAlienStatistics RNAlien
  - docker cp develcontainer:/cmsearchToBed RNAlien
  - docker cp develcontainer:/RNAcentralHTTPRequest RNAlien
  - docker cp develcontainer:/RNAlienScan RNAlien
  - cp LICENSE RNAlien
  - tar -cvzf RNAlien.tar.gz RNAlien
  - docker push $DOCKER_USERNAME/rnalien
  - docker run --rm $DOCKER_USERNAME/rnalien /RNAlien --help


deploy:
  provider: releases
  skip_cleanup: true
  api_key: $GITHUB_TOKEN
  file: "RNAlien.tar.gz"
  on:
    tags: true

--------------------------------------------------------------------------------
/Biobase/RNAcentralHTTPRequest.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}

-- | RNAcentralHTTPRequest
--   Command line tool that builds an MD5 based query from one input sequence,
--   sends it via 'getRNACentralEntries' and prints the returned entries.
--   Testcommand: dist/build/RNAcentralHTTPRequest/RNAcentralHTTPRequest -i ATACTTACCTGGCACAGGGGATACCACGATCACCAAGGTGGTTCCCCCAAGACGAGGCTCACCATTGCACTCCGGTGGCGCTGACCCTTGCAATGACCCCAAATGTGGGTTACTCGGGTGTGTAATTTCTGTTAGCTGGGGACTGCGTTCGCGCTTTCCCCTT
module Main where

import Control.Monad (when)
import System.Console.CmdArgs
import System.Exit (exitFailure)
import System.IO (hPutStrLn, stderr)
import Biobase.RNAlien.RNAcentralHTTP

-- | Command line options of the tool.
data Options = Options
  { inputSequence :: String -- ^ sequence handed over with @-i@
  } deriving (Show,Data,Typeable)

-- | Option defaults and cmdargs annotations.
options :: Options
options = Options
  { inputSequence = def &= name "i" &= help "input sequence"
  } &= summary "RNAcentralHTTPRequest" &= help "Florian Eggenhofer 2016" &= verbosity

-- | Parse the command line, query RNAcentral for the input sequence and print the result.
main :: IO ()
main = do
  Options{..} <- cmdArgs options
  -- Without this guard the MD5 of the empty string would be sent as query,
  -- which can never describe a real sequence.
  when (null inputSequence) $ do
    hPutStrLn stderr "Please provide an input sequence with the cmd line parameter -i"
    exitFailure
  let query = buildStringViaMD5Query inputSequence
  rnacentralentries <- getRNACentralEntries [query]
  print rnacentralentries

--------------------------------------------------------------------------------
/Biobase/RNAlien.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}

-- | Unsupervised construction of RNA family models
--   For more information on RNA family models consult
--   Usage example: RNAlien -i /path/input.fa -c 5 -o /outdir/
--   Usage example offline mode: RNAlien -i /path/input.fa -b /backup/blast/nt_v5 -o /outdir/ -c 5 -t 1396 -j
module Main where

import System.Console.CmdArgs
import System.Directory
import Biobase.RNAlien.Types
import Biobase.RNAlien.Library
import Data.Maybe
import Data.Either.Unwrap
import Data.Time
import qualified System.FilePath as FP
import Paths_RNAlien (version)
import Data.Version (showVersion)
import qualified Biobase.StockholmAlignment.Import as BS
import qualified Control.Exception.Base as CE
import Control.Monad

-- | Command line options of RNAlien; the meaning of every field is given by
--   the help text attached to it in 'options'.
data Options = Options
  { inputFastaFilePath :: String,
    inputAlignmentFilePath :: String,
    outputPath :: String,
    inputTaxId :: Maybe Int,
    inputnSCICutoff :: Maybe Double,
    inputEvalueCutoff :: Maybe Double,
    inputBlastDatabase :: Maybe String,
    lengthFilter :: Bool,
    coverageFilter :: Bool,
    singleHitperTax :: Bool,
    blastSoftmasking :: Bool,
    inputQuerySelectionMethod :: String,
    inputQueryNumber :: Int,
    threads :: Int,
    taxonomyRestriction :: Maybe String,
    sessionIdentificator :: Maybe String,
    performEvaluation :: Bool,
    checkSetup :: Bool,
    taxonomyDumpPath :: String,
    offlineMode :: Bool
  } deriving (Show,Data,Typeable)

-- | Option defaults, short flags and help texts.
options :: Options
options = Options
  { inputFastaFilePath = def &= name "i" &= help "Path to input fasta file",
    inputAlignmentFilePath = def &= name "p" &= help "Path to input alignment file",
    outputPath = def &= name "o" &= help "Path to output directory. Default: current working directory",
    inputTaxId = Nothing &= name "t" &= help "NCBI taxonomy ID number of input RNA organism",
    inputnSCICutoff = Just (1 :: Double) &= name "z" &= help "Only candidate sequences with a normalized structure conservation index (nSCI) higher than this value are accepted. Default: 1",
    inputEvalueCutoff = Just (0.001 :: Double) &= name "e" &= help "Evalue cutoff for cmsearch filtering. Default: 0.001",
    inputBlastDatabase = Just "nt" &= name "b" &= help "Specify name of blast database to use, in offline mode the filepath to the blast database (/home/user/nt_v5). Default: nt",
    lengthFilter = True &= name "l" &= help "Filter blast hits per genomic length. Default: True",
    coverageFilter = True &= name "a" &= help "Filter blast hits by coverage of at least 80%. Default: True",
    singleHitperTax = False &= name "s" &= help "Only the best blast hit per taxonomic entry is considered. Default: False",
    blastSoftmasking = False &= name "f" &= help "Toggles blast query softmasking, meaning masking of non-conserved regions on the query. Default: False",
    inputQuerySelectionMethod = "filtering" &= name "m" &= help "Method for selection of queries (filtering,clustering). Default: filtering",
    inputQueryNumber = (5 :: Int) &= name "n" &= help "Number of queries used for candidate search. Default: 5",
    threads = 1 &= name "c" &= help "Number of available cpu slots/cores. Default: 1",
    taxonomyRestriction = Nothing &= name "r" &= help "Restrict search space to taxonomic kingdom (bacteria,archea,eukaryia,cellularorganisms,viruses). Default: not set",
    sessionIdentificator = Nothing &= name "d" &= help "Optional session id that is used instead of automatically generated one.",
    performEvaluation = True &= name "x" &= help "Perform evaluation step. Default: True",
    checkSetup = False &= name "g" &= help "Just prints installed tool versions and performs connection check. Default: False",
    taxonomyDumpPath = def &= name "w" &= help "Path to NCBI taxonomy dump directory.",
    offlineMode = False &= name "j" &= help "Uses locally installed blast and databases. Default: False"
  } &= summary ("RNAlien " ++ alienVersion) &= help "Florian Eggenhofer, Ivo L. Hofacker, Christian Hoener zu Siederdissen - 2013 - 2020" &= verbosity

-- | Entry point. Sets up the session directory and log files and then runs
--   model construction either from a single fasta sequence or, for multi fasta
--   or alignment input, from a Stockholm alignment.
main :: IO ()
main = do
  Options{..} <- cmdArgs options
  verboseLevel <- getVerbosity
  -- Generate SessionID
  sessionId <- createSessionID sessionIdentificator
  timestamp <- getCurrentTime
  currentWorkDirectory <- getCurrentDirectory
  let selectedOutputPath = if null outputPath then currentWorkDirectory else outputPath
  let temporaryDirectoryPath = FP.addTrailingPathSeparator selectedOutputPath ++ sessionId ++ "/"
  createDirectoryIfMissing False temporaryDirectoryPath
  setupCheckAlienWithLog inputQuerySelectionMethod temporaryDirectoryPath
  createDirectoryIfMissing False (temporaryDirectoryPath ++ "log")
  -- Create Log files
  writeFile (temporaryDirectoryPath ++ "Log") ("RNAlien " ++ alienVersion ++ "\n")
  writeFile (temporaryDirectoryPath ++ "log/warnings") ("")
  logMessage ("Timestamp: " ++ (show timestamp) ++ "\n") temporaryDirectoryPath
  logMessage ("Temporary Directory: " ++ temporaryDirectoryPath ++ "\n") temporaryDirectoryPath
  let iterationNumber = 0
  -- Shared epilogue of both input modes: write the result table, optionally
  -- append the evaluation to the Log, print the summary and drop the "done" marker.
  let finalizeRun runOptions constructionResults = do
        writeFile (temporaryDirectoryPath ++ "result.csv") (constructTaxonomyRecordsCSVTable constructionResults)
        when performEvaluation $ do
          resultEvaluation <- evaluateConstructionResult runOptions constructionResults
          appendFile (temporaryDirectoryPath ++ "Log") resultEvaluation
        resultSummary constructionResults runOptions
        writeFile (temporaryDirectoryPath ++ "done") ""
  singleFasta <- isSingleFasta inputFastaFilePath
  if singleFasta
    then do
      fastaInput <- readFastaFile inputFastaFilePath
      when (null fastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      let reformatedFastaInput = map reformatFasta fastaInput
      let inputSequence = head reformatedFastaInput
      initialTaxId <- setInitialTaxId offlineMode threads inputBlastDatabase temporaryDirectoryPath inputTaxId inputSequence
      let checkedTaxonomyRestriction = checkTaxonomyRestriction taxonomyRestriction
      let staticOptions = StaticOptions temporaryDirectoryPath sessionId (fromJust inputnSCICutoff) inputTaxId singleHitperTax inputQuerySelectionMethod inputQueryNumber lengthFilter coverageFilter blastSoftmasking threads inputBlastDatabase checkedTaxonomyRestriction (setVerbose verboseLevel) offlineMode [] taxonomyDumpPath
      when (setVerbose verboseLevel) (print staticOptions)
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] initialTaxId Nothing (fromJust inputEvalueCutoff) False [] [] [] Nothing
      logMessage (show initialization) temporaryDirectoryPath
      modelConstructionResults <- modelConstructer staticOptions initialization
      finalizeRun staticOptions modelConstructionResults
    else do
      -- multi fasta or alignment input: multi fasta input is aligned first,
      -- an alignment handed over with -p is used as it is
      alignmentFilePath <- if not (null inputFastaFilePath)
        then do
          let mlocarnaFilePath = temporaryDirectoryPath ++ "input.mlocarna"
          let mlocarnaDirPath = temporaryDirectoryPath ++ "inputMLocarna"
          let mlocarnaStkPath = mlocarnaDirPath ++ "/results/result.stk"
          let alifoldPath = temporaryDirectoryPath ++ "input.alifold"
          let stockholmPath = temporaryDirectoryPath ++ "input.stockholm"
          alignSequences "mlocarna" ("--stockholm --consensus-structure alifold --tgtdir=" ++ mlocarnaDirPath ++ " --threads=" ++ show threads ++ " ") [inputFastaFilePath] [] [mlocarnaFilePath] []
          _ <- systemRNAalifold "-r --cfactor 0.6 --nfactor 0.5" mlocarnaStkPath alifoldPath
          _ <- replaceStockholmStructure mlocarnaStkPath alifoldPath stockholmPath
          return stockholmPath
        else return inputAlignmentFilePath
      -- Neither -i nor -p was given: fail with a clear message instead of
      -- trying to read a Stockholm file from an empty path.
      when (null alignmentFilePath) (error "Please provide an input fasta file (-i) or an input alignment file (-p)")
      alignmentInput <- BS.readExistingStockholm alignmentFilePath
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      when (isLeft alignmentInput) (error (fromLeft alignmentInput))
      let rightAlignment = head $ fromRight alignmentInput
      let reformatedFastaInput = stockholmAlignmentToFasta rightAlignment
      when (null reformatedFastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      let inputSequence = head reformatedFastaInput
      initialTaxId <- setInitialTaxId offlineMode threads inputBlastDatabase temporaryDirectoryPath inputTaxId inputSequence
      let checkedTaxonomyRestriction = checkTaxonomyRestriction taxonomyRestriction
      let staticOptions = StaticOptions temporaryDirectoryPath sessionId (fromJust inputnSCICutoff) inputTaxId singleHitperTax inputQuerySelectionMethod inputQueryNumber lengthFilter coverageFilter blastSoftmasking threads inputBlastDatabase checkedTaxonomyRestriction (setVerbose verboseLevel) offlineMode [] taxonomyDumpPath
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] initialTaxId Nothing (fromJust inputEvalueCutoff) False [] [] [] (Just rightAlignment)
      -- A failing taxonomy lookup is only logged as warning, construction continues without context.
      currentTaxonomicContext <- CE.catch
        (getTaxonomicContext (offline staticOptions) (ncbiTaxonomyDumpPath staticOptions) initialTaxId (taxonomicContext initialization))
        (\e -> do
            let err = show (e :: CE.IOException)
            logWarning ("Warning: Retrieving taxonomic context failed:" ++ " " ++ err) (tempDirPath staticOptions)
            return Nothing)
      let nextModelConstructionInput = constructNext iterationNumber initialization [] [] initialTaxId currentTaxonomicContext [] [] True
      -- Build and calibrate the initial model from the input alignment in iteration directory "0"
      let outputDirectory = tempDirPath staticOptions ++ "0" ++ "/"
      createDirectory outputDirectory
      let fastaFilePath = outputDirectory ++ "model.fa"
      let stockholmFilepath = outputDirectory ++ "model" ++ ".stockholm"
      let cmFilepath = outputDirectory ++ "model" ++ ".cm"
      let cmCalibrateFilepath = outputDirectory ++ "model" ++ ".cmcalibrate"
      let cmBuildFilepath = outputDirectory ++ "model" ++ ".cmbuild"
      copyFile alignmentFilePath stockholmFilepath
      writeFastaFile fastaFilePath reformatedFastaInput
      let refinedAlignmentFilepath = outputDirectory ++ "modelrefined.stockholm"
      let cmBuildOptions = "--refine " ++ refinedAlignmentFilepath
      _ <- systemCMbuild cmBuildOptions stockholmFilepath cmFilepath cmBuildFilepath
      _ <- systemCMcalibrate "fast" (cpuThreads staticOptions) cmFilepath cmCalibrateFilepath
      writeFile (outputDirectory ++ "done") ""
      -- select queries
      let logDirectory = outputDirectory ++ "log"
      createDirectory logDirectory
      currentSelectedQueries <- selectQueries staticOptions initialization []
      let nextModelConstructionInputWithQueries = nextModelConstructionInput {selectedQueries = currentSelectedQueries}
      logMessage (iterationSummaryLog nextModelConstructionInputWithQueries) (tempDirPath staticOptions)
      -- Continue with the record that carries the selected queries, so the
      -- state that was just logged is the state that is actually used.
      modelConstructionResults <- modelConstructer staticOptions nextModelConstructionInputWithQueries
      finalizeRun staticOptions modelConstructionResults

-- | True if the fasta file at the given path contains exactly one entry.
--   An empty path yields False; a file without entries aborts with an error.
isSingleFasta :: String -> IO Bool
isSingleFasta inputFastaFilePath
  | null inputFastaFilePath = return False
  | otherwise = do
      fastaInput <- readFastaFile inputFastaFilePath
      when (null fastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      return (length fastaInput == (1 :: Int))

-- | Version string taken from the cabal package description.
alienVersion :: String
alienVersion = showVersion version

--------------------------------------------------------------------------------
/Biobase/RNAlien/CMstatParser.hs:
--------------------------------------------------------------------------------
-- | This module contains parsing functions for Infernal programs

module Biobase.RNAlien.CMstatParser (
                       module Biobase.RNAlien.Types,
                       parseCMstat,
                       readCMstat
                      )
where

import Text.ParserCombinators.Parsec
import Biobase.RNAlien.Types

-- |
-- Parse cmstat output that is already held in memory as a String.
parseCMstat :: String -> Either ParseError CMstat
parseCMstat = parse genParserCMstat "parseCMstat"

-- | Read the file at the given path and parse its content as cmstat output.
readCMstat :: String -> IO (Either ParseError CMstat)
readCMstat = parseFromFile genParserCMstat

-- | Parser for a cmstat report containing a single model line.
--   Everything up to the "rel entropy" column header is skipped, then the two
--   separator lines, the whitespace separated value columns and the closing
--   "#" line are consumed.
genParserCMstat :: GenParser Char st CMstat
genParserCMstat = do
  -- skip preamble and column headers
  _ <- manyTill anyChar (try (string "rel entropy"))
  _ <- newline
  _ <- char '#'
  skipMany1 (char ' ')
  skipMany1 (char '-')
  _ <- newline
  _ <- char '#'
  _ <- manyTill anyChar (try (string "#"))
  _ <- many1 (try (oneOf " -"))
  _ <- newline
  -- value columns
  skipMany1 space
  _statIndex <- many1 digit
  skipMany1 space
  -- Model names are whitespace delimited tokens and may contain digits,
  -- '_' or '-', so accept any non-space character like for the accession.
  _statName <- many1 (noneOf " ")
  skipMany1 space
  _statAccession <- many1 (noneOf " ")
  skipMany1 space
  _statSequenceNumber <- many1 digit
  skipMany1 space
  _statEffectiveSequences <- many1 (oneOf "0123456789.e-")
  skipMany1 space
  -- many1 (not many): an empty match must be a parse error at this column
  -- and must never reach readInt
  _statConsensusLength <- many1 digit
  skipMany1 space
  _statW <- many1 digit
  skipMany1 space
  _statBasepaires <- many1 digit
  skipMany1 space
  _statBifurcations <- many1 digit
  skipMany1 space
  _statModel <- many1 letter
  skipMany1 space
  _relativeEntropyCM <- many1 (oneOf "0123456789.e-")
  skipMany1 space
  _relativeEntropyHMM <- many1 (oneOf "0123456789.e-")
  _ <- newline
  _ <- char '#'
  _ <- newline
  _ <- eof
  return $ CMstat (readInt _statIndex) _statName _statAccession (readInt _statSequenceNumber) (readDouble _statEffectiveSequences) (readInt _statConsensusLength) (readInt _statW) (readInt _statBasepaires) (readInt _statBifurcations) _statModel (readDouble _relativeEntropyCM) (readDouble _relativeEntropyHMM)

-- | Convert a parsed digit string to Int.
readInt :: String -> Int
readInt = read

-- | Convert a parsed numeric string to Double.
readDouble :: String -> Double
readDouble = read

--------------------------------------------------------------------------------
/Biobase/RNAlien/InfernalParser.hs:
--------------------------------------------------------------------------------
-- | This module contains parsing functions for Infernal programs

module Biobase.RNAlien.InfernalParser (
                       module Biobase.RNAlien.Types,
                       readCMSearch,
                       readCMSearches,
                       parseCMSearch,
                       parseCMSearches,
                      )
where

import Text.ParserCombinators.Parsec
import Biobase.RNAlien.Types
import qualified Data.ByteString.Char8 as B

-- | Parse cmsearch output with a single query block from a String
parseCMSearch :: String -> Either ParseError CMsearch
parseCMSearch = parse genParserCMSearch "parseCMsearch"

-- | Parse cmsearch output with several query blocks from a String
parseCMSearches :: String -> Either ParseError CMsearch
parseCMSearches = parse genParserCMSearches "parseCMsearch"

-- | Read the file at filePath and parse it as cmsearch output with a single query block
readCMSearch :: String -> IO (Either ParseError CMsearch)
readCMSearch filePath = do
  parsedFile <- parseFromFile genParserCMSearch filePath
  return parsedFile

-- | Read the file at filePath and parse it as cmsearch output with several query blocks
readCMSearches :: String -> IO (Either ParseError CMsearch)
readCMSearches filePath = do
  parsedFile <- parseFromFile genParserCMSearches filePath
  return parsedFile

-- | Parser for cmsearch output containing one or more query blocks.
--   The header fields are parsed once, then every query block is parsed by
--   'genParserMultipleCMSearch' and all hits are concatenated into one list.
genParserCMSearches :: GenParser Char st CMsearch
genParserCMSearches = do
  --_ <- string "# cmsearch :: search CM(s) against a sequence database"
  --_ <- newline
  --_ <- string "# INFERNAL "
  --_ <- many1 (noneOf "\n")
  --_ <- newline
  --_ <- string "# Copyright (C) 201"
  --_ <- many1 (noneOf "\n")
  --_ <- newline
  --_ <- manyTill anyChar (try (string "# Freely distributed under the GNU General Public License (GPLv3).") --Freely distributed under a BSD open source license.)
  --_ <- newline
  -- The banner is not parsed explicitly anymore, everything up to the first separator line is skipped
  _ <- manyTill anyChar (try (string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"))
  _ <- newline
  _ <- string "# query CM file:"
  skipMany1 space
  queryCMfile' <- many1 (noneOf "\n")
  newline
  _ <- string "# target sequence database:"
  skipMany1 space
  targetSequenceDatabase' <- many1 (noneOf "\n")
  _ <- newline
  -- optional header fields, consumed and discarded
  optional (try (genParserCMsearchHeaderField "# CM configuration"))
  optional (try (genParserCMsearchHeaderField "# database size is set to"))
  optional (try (genParserCMsearchHeaderField "# truncated sequence detection"))
  _ <- string "# number of worker threads:"
  skipMany1 space
  numberOfWorkerThreads' <- many1 (noneOf "\n")
  _ <- newline
  _ <- string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
  _ <- newline
  _ <- optional newline
  cmSearchesHits <- many1 (try genParserMultipleCMSearch)
  _ <- optional (string "[ok]\n")
  _ <- eof
  return $ CMsearch queryCMfile' targetSequenceDatabase' numberOfWorkerThreads' (concat cmSearchesHits)

-- | Parser for cmsearch output containing a single query block.
--   Header fields and the hit score table are parsed, the remainder of the
--   input is consumed without being interpreted.
genParserCMSearch :: GenParser Char st CMsearch
genParserCMSearch = do
  --_ <- string "# cmsearch :: search CM(s) against a sequence database"
  --_ <- newline
  --_ <- string "# INFERNAL "
  --skipMany1 (noneOf "\n")
  --_ <- newline
  --_ <- string "# Copyright (C) 201"
  --_ <- many1 (noneOf "\n")
  --_ <- newline
  --_ <- string "# Freely distributed under the GNU General Public License (GPLv3)."
  --_ <- newline
  -- The banner is not parsed explicitly anymore, everything up to the first separator line is skipped
  _ <- manyTill anyChar (try (string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"))
  _ <- newline
  _ <- string "# query CM file:"
  _ <- skipMany1 space
  queryCMfile' <- many1 (noneOf "\n")
  _ <- newline
  _ <- string "# target sequence database:"
  skipMany1 space
  targetSequenceDatabase' <- many1 (noneOf "\n")
  _ <- newline
  -- optional header fields, consumed and discarded
  _ <- optional (try (genParserCMsearchHeaderField "# CM configuration"))
  _ <- optional (try (genParserCMsearchHeaderField "# database size is set to"))
  _ <- optional (try (genParserCMsearchHeaderField "# truncated sequence detection"))
  _ <- string "# number of worker threads:"
  skipMany1 space
  numberOfWorkerThreads' <- many1 (noneOf "\n")
  _ <- newline
  _ <- string "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -"
  _ <- newline
  _ <- optional newline
  _ <- string "Query:"
  skipMany1 (noneOf "\n")
  _ <- newline
  _ <- optional (try (genParserCMsearchHeaderField "Accession"))
  _ <- optional (try (genParserCMsearchHeaderField "Description"))
  _ <- string "Hit scores:"
  _ <- newline
  -- NOTE(review): the six alternatives below are textually identical here; presumably they
  -- once differed in the number of leading blanks - confirm against the upstream file
  _ <- choice [try (string " rank"), try (string " rank") , try (string " rank"), try (string " rank"),try (string " rank"),try (string " rank")]
  many1 space
  string "E-value"
  -- The remaining column titles are not matched one by one (see the disabled lines below);
  -- the input is skipped up to the first '-' instead
  --many1 space
  --string "score"
  --many1 space
  --string "bias"
  --many1 space
  --string "sequence"
  --many1 space
  --string "start"
  --many1 space
  --string "end"
  --many1 space
  --string "mdl"
  --many1 space
  --string "trunc"
  --many1 space
  --string "gc"
  --many1 space
  --string "description"
  --newline
  _ <- manyTill anyChar (try (string "-"))
  --string " -"
  skipMany1 (try (oneOf " -"))
  _ <- newline
  optional (try (string " ------ inclusion threshold ------"))
  skipMany newline
  hitScores' <- many (try genParserCMsearchHit) --`endBy` (try (string "Hit alignments:"))
  optional (try genParserCMsearchEmptyHit)
  -- this is followed by hit alignments and internal cmsearch statistics which are not parsed
  _ <- many anyChar
  _ <- eof
  return $ CMsearch queryCMfile' targetSequenceDatabase' numberOfWorkerThreads' hitScores'

-- | Parsing function for CMSearches with multiple querymodels in one modelfile, e.g. clans
--   Parses one query block up to and including its terminating "//" line and returns its hits.
genParserMultipleCMSearch :: GenParser Char st [CMsearchHit]
genParserMultipleCMSearch = do
  --optional newline
  --optional string "//"
  string "Query:"
  many1 (noneOf "\n")
  newline
  optional (try (genParserCMsearchHeaderField "Accession"))
  optional (try (genParserCMsearchHeaderField "Description"))
  string "Hit scores:"
  newline
  -- NOTE(review): the six alternatives below are textually identical here; presumably they
  -- once differed in the number of leading blanks - confirm against the upstream file
  choice [try (string " rank"), try (string " rank") , try (string " rank"), try (string " rank"),try (string " rank"),try (string " rank")]
  -- column titles of the hit table
  many1 space
  string "E-value"
  many1 space
  string "score"
  many1 space
  string "bias"
  many1 space
  string "sequence"
  many1 space
  string "start"
  many1 space
  string "end"
  many1 space
  string "mdl"
  many1 space
  string "trunc"
  many1 space
  string "gc"
  many1 space
  string "description"
  newline
  -- underline row of the hit table
  string " -"
  many1 (try (oneOf " -"))
  newline
  optional (try (string " ------ inclusion threshold ------"))
  many newline
  hitScores' <- many (try genParserCMsearchHit) --`endBy` (try (string "Hit alignments:"))
  optional (try genParserCMsearchEmptyHit)
  -- this is followed by hit alignments and internal cmsearch statistics which are not parsed
  --many anyChar
  -- skip to the end of the current query block
  manyTill anyChar (try (string "//\n"))
  return hitScores'

-- | Consumes one header line of the form "fieldname: value"; the value is discarded
--   and an empty String is returned.
genParserCMsearchHeaderField :: String -> GenParser Char st String
genParserCMsearchHeaderField fieldname = do
  string (fieldname ++ ":")
  many1 space
  many1 (noneOf "\n")
  newline
  return []

-- | Consumes the notice printed when no hit was reported and returns an empty hit list.
genParserCMsearchEmptyHit :: GenParser Char st [CMsearchHit]
genParserCMsearchEmptyHit = do
  string " [No hits detected that satisfy reporting thresholds]"
  newline
  optional (try newline)
  return []

-- | Parses one row of the hit score table into a 'CMsearchHit'.
--   A directly following inclusion threshold marker line is consumed as well.
genParserCMsearchHit :: GenParser Char st CMsearchHit
genParserCMsearchHit = do
  many1 space
  string "("
  hitRank' <- many1 digit
  string ")"
  many1 space
  hitSignificant' <- choice [char '!', char '?']
  many1 space
  hitEValue' <- many1 (oneOf "0123456789.e-")
  many1 space
  hitScore' <- many1 (oneOf "0123456789.e-")
  many1 space
  hitBias' <- many1 (oneOf "0123456789.e-")
  many1 space
  hitSequenceHeader' <- many1 (noneOf " ")
  many1 space
  hitStart' <- many1 digit
  many1 space
  hitEnd' <- many1 digit
  many1 space
  hitStrand' <- choice [char '+', char '-', char '.']
  many1 space
  hitModel' <- many1 letter
  many1 space
  hitTruncation' <- many1 (choice [alphaNum, char '\''])
  many1 space
  hitGCcontent' <- many1 (oneOf "0123456789.e-")
  many1 space
  hitDescription' <- many1 (noneOf "\n")
  newline
  optional (try (string " ------ inclusion threshold ------"))
  optional (try newline)
  return $ CMsearchHit (readInt hitRank') hitSignificant' (readDouble hitEValue') (readDouble hitScore') (readDouble hitBias') (B.pack hitSequenceHeader') (readInt hitStart') (readInt hitEnd') hitStrand' (B.pack hitModel') (B.pack hitTruncation') (readDouble hitGCcontent') (B.pack hitDescription')

-- | Convert a parsed digit string to Int (partial, relies on the parser having validated the characters)
readInt :: String -> Int
readInt = read

-- | Convert a parsed numeric string to Double (partial, relies on the parser having validated the characters)
readDouble :: String -> Double
readDouble = read

--------------------------------------------------------------------------------
/Biobase/RNAlien/RNAcentralHTTP.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings #-}

{-# LANGUAGE DeriveGeneric #-}

-- | Interface for the RNAcentral REST webservice.
--
module Biobase.RNAlien.RNAcentralHTTP (rnaCentralHTTP,
                      buildSequenceViaMD5Query,
                      buildStringViaMD5Query,
                      getRNACentralEntries,
                      showRNAcentralAlienEvaluation,
                      RNAcentralEntryResponse(..),
                      RNAcentralEntry(..)
                      ) where

import Network.HTTP.Conduit
import qualified Data.ByteString.Lazy.Char8 as L8
--import qualified Data.ByteString.Char8 as BS8
import Network.Socket
import Control.Concurrent
import Data.Text
import Data.Aeson
import GHC.Generics
import qualified Data.Char as C
import qualified Data.Digest.Pure.MD5 as M
import Data.Either
import Biobase.Fasta.Strict
import Biobase.Types.BioSequence

--Datatypes
-- | Data structure for RNAcentral entry response
data RNAcentralEntryResponse = RNAcentralEntryResponse
  {
    count :: Int,
    next :: Maybe Text,
    previous :: Maybe Text,
    results :: [RNAcentralEntry]
  }
  deriving (Show, Eq, Generic)

instance ToJSON RNAcentralEntryResponse where
  toJSON = genericToJSON defaultOptions
  --toEncoding = genericToEncoding defaultOptions

instance FromJSON RNAcentralEntryResponse

-- | Data structure for a single entry of the response
data RNAcentralEntry = RNAcentralEntry
  {
    url :: Text,
    rnacentral_id :: Text,
    md5 :: Text,
    sequence :: Text,
    length :: Int,
    xrefs :: Text,
    publications :: Text
  }
  deriving (Show, Eq, Generic)

instance ToJSON RNAcentralEntry where
  toJSON = genericToJSON defaultOptions
  --toEncoding = genericToEncoding defaultOptions

instance FromJSON RNAcentralEntry

-- | Send query and decode the returned JSON; a decoding failure is reported as Left
startSession :: String -> IO (Either String RNAcentralEntryResponse)
startSession query' = do
  responseBody <- withSocketsDo
      $ sendQuery query'
  --putStr (L8.unpack responseBody)
  return (eitherDecode responseBody :: Either String RNAcentralEntryResponse)

-- | Send query and return the raw response body
sendQuery :: String -> IO L8.ByteString
sendQuery query' = do
  let address = "http://rnacentral.org/api/v1/rna/"
  let request = address ++ query'
  --putStrLn request
  simpleHttp request

-- | Function for querying the RNAcentral REST interface.
rnaCentralHTTP :: String -> IO (Either String RNAcentralEntryResponse)
rnaCentralHTTP query' =
  startSession query'

-- | Function for delayed queries to the RNAcentral REST interface. Enforces the maximum 20 requests per second policy.
--   Every request is preceded by a pause of 55000 microseconds.
delayedRNACentralHTTP :: String -> IO (Either String RNAcentralEntryResponse)
delayedRNACentralHTTP query' = do
  threadDelay 55000
  startSession query'

-- | Run all queries one after another, each one delayed, and collect the responses in input order.
getRNACentralEntries :: [String] -> IO [Either String RNAcentralEntryResponse]
getRNACentralEntries queries = do
  mapM delayedRNACentralHTTP queries

-- | Build a query from a input sequence
--
-- TODO [chzs] consider using strict bytestring as long as possible.
--
-- TODO [chzs] consider giving useful typelevel names to the types in @Fasta@.
-- One may give a type-level name to the sequence identifier, and an identifier
-- (like @DNA@) to the biosequence type.

buildSequenceViaMD5Query :: Fasta () () -> String
buildSequenceViaMD5Query s = md5Query (L8.fromStrict . _bioSequence $ _fasta s)

-- | Build a query from a input string
buildStringViaMD5Query :: String -> String
buildStringViaMD5Query s = md5Query (L8.pack s)

-- | Shared query construction: the sequence is upper-cased, U is replaced by T,
--   linebreaks are removed and the MD5 checksum of the result is appended to "?md5=".
--   Upper-casing makes lowercase input (u as well as a, c, g) yield the same
--   checksum as its uppercase spelling.
md5Query :: L8.ByteString -> String
md5Query rawSequence = "?md5=" ++ show (M.md5 withoutLinebreaks)
  where dnaSequence = L8.map (bsreplaceUT . C.toUpper) rawSequence
        withoutLinebreaks = L8.filter (/= '\n') dnaSequence

-- | Tab separated table (rnacentral_id, md5, length) of all entries found in the
--   successful responses, or a notice if there are none.
showRNAcentralAlienEvaluation :: [Either String RNAcentralEntryResponse] -> String
showRNAcentralAlienEvaluation responses = output
  where resultEntries = Prelude.concatMap results (rights responses)
        resulthead = "rnacentral_id\tmd5\tlength\n"
        resultentries = Prelude.concatMap showRNAcentralAlienEvaluationLine resultEntries
        output = if Prelude.null resultentries then "No matching sequences found in RNAcentral\n" else resulthead ++ resultentries

-- | One line of the evaluation table
showRNAcentralAlienEvaluationLine :: RNAcentralEntry -> String
showRNAcentralAlienEvaluationLine entry = unpack (rnacentral_id entry) ++ "\t" ++ unpack (md5 entry) ++ "\t" ++ show (Biobase.RNAlien.RNAcentralHTTP.length entry) ++"\n"

-- | Replace an uppercase U by T, every other character is returned unchanged
bsreplaceUT :: Char -> Char
bsreplaceUT a
  | a == 'U' = 'T'
  | otherwise = a

--------------------------------------------------------------------------------
/Biobase/RNAlien/Types.hs:
--------------------------------------------------------------------------------
-- | This module contains data structures for RNAlien

module Biobase.RNAlien.Types where

import Biobase.Fasta.Strict
import Biobase.Taxonomy.Import
import Biobase.StockholmAlignment.Types
--import Biobase.Types.BioSequence
import qualified Data.ByteString.Char8 as B

-- | Static construction options
data StaticOptions = StaticOptions
  { tempDirPath :: String,
    sessionID :: String,
    nSCICutoff :: Double,
    userTaxId :: Maybe Int,
    singleHitperTaxToggle :: Bool,
    querySelectionMethod :: String,
    queryNumber :: Int,
    lengthFilterToggle :: Bool,
    coverageFilterToggle :: Bool,
    blastSoftmaskingToggle :: Bool,
    cpuThreads :: Int,
    blastDatabase :: Maybe String,
    taxRestriction :: Maybe String,
    verbositySwitch :: Bool,
    offline :: Bool,
    genomeFastasPath :: String,
    ncbiTaxonomyDumpPath :: String
  } deriving (Show)

-- | Keeps track of model construction
data ModelConstruction = ModelConstruction
  { iterationNumber :: Int,
    inputFasta :: [Fasta () ()],
    --unique seed sequencs
    taxRecords :: [TaxonomyRecord],
    --additional similar sequences - collected by full similarity to previously found entries
    similarRecords :: [TaxonomyRecord],
    --Taxonomy ID of the highest node in taxonomic subtree used in search
    upperTaxonomyLimit :: Maybe Int,
    taxonomicContext :: Maybe Lineage,
    evalueThreshold :: Double,
    alignmentModeInfernal :: Bool,
    selectedQueries :: [Fasta () ()],
    potentialMembers :: [SearchResult],
    genomeFastas :: [Fasta () ()],
    inputAlignment :: Maybe StockholmAlignment
  }

-- | Multi-line report of the construction state. The alignment mode flag
--   is not part of the report.
instance Show ModelConstruction where
  show construction = concat
    [ "Modelconstruction iteration: " ++ show (iterationNumber construction) ++ "\n"
    , "Input fasta:\n" ++ concatMap (convertString . fastaToByteString 80) (inputFasta construction)
    , "Input alignment:\n" ++ maybe "not provided" show (inputAlignment construction) ++ "\n"
    , "Taxonomy records:\n" ++ show (taxRecords construction) ++ "\n"
    , "Similar records:\n" ++ show (similarRecords construction) ++ "\n"
    , "Upper taxonomy limit: " ++ maybe "not set" show (upperTaxonomyLimit construction) ++ "\n"
    , "Taxonomic Context: " ++ maybe "not set" show (taxonomicContext construction) ++ "\n"
    , "Evalue cutoff: " ++ show (evalueThreshold construction) ++ "\n"
    , "Selected queries: \n" ++ concatMap show (selectedQueries construction)
    , "Potential Members: \n" ++ concatMap show (potentialMembers construction)
    , "Number of genomes for RNAlienScan: " ++ show (length (genomeFastas construction)) ++ "\n"
    ]

-- | All sequence records collected for one taxonomy id
data TaxonomyRecord = TaxonomyRecord
  { recordTaxonomyId :: Int,
    sequenceRecords :: [SequenceRecord]
  }

instance Show TaxonomyRecord where
  show record = taxonomyLine ++ show (sequenceRecords record)
    where taxonomyLine = "TaxonomyRecord TaxonomyId: " ++ show (recordTaxonomyId record) ++ "\n"

data SequenceRecord = SequenceRecord
  { --Sequence consisting of SeqLabel, and SeqData
    nucleotideSequence :: Fasta () (),
    -- 0 is unaligned, number is the iteration the sequence has been included into the alignment
    aligned :: Int,
    recordDescription :: B.ByteString
  }

instance Show SequenceRecord where
  show record = concat
    [ "Record Description: " ++ B.unpack (recordDescription record) ++ "\n"
    , "Aligned in iteration: " ++ show (aligned record) ++ "\n"
    , "Sequence:" ++ show (nucleotideSequence record) ++ "\n"
    ]

-- | Parsed cmsearch output: invocation parameters and the list of hits
data CMsearch = CMsearch
  { queryCMfile :: String,
    targetSequenceDatabase :: String,
    numberOfWorkerThreads :: String,
    cmsearchHits :: [CMsearchHit]
--    hitAlignments :: [CMsearchHitAlignment]
--    internalCMPipelineStatisticsSummary
  } deriving (Show, Eq, Read)

-- | One hit line of a cmsearch result
data CMsearchHit = CMsearchHit
  { hitRank :: Int,
    hitSignificance :: Char,
    hitEvalue :: Double,
    hitScore :: Double,
    hitBias :: Double,
    hitSequenceHeader :: B.ByteString,
    hitStart :: Int,
    hitEnd :: Int,
    hitStrand :: Char,
    hitModel :: B.ByteString,
    hitTruncation :: B.ByteString,
    hitGCContent :: Double,
    hitDescription :: B.ByteString
  } deriving (Show, Eq, Read)

-- | Candidate triples of a search together with the optional database size
data SearchResult = SearchResult
  { candidates :: [(Fasta () (),Int,B.ByteString)],
    blastDatabaseSize :: Maybe Double
  }

instance Show SearchResult where
  show result = candidateBlock ++ databaseSizeLine
    where candidateBlock = "SearchResults :\n " ++ concatMap show (candidates result) ++ "\n"
          databaseSizeLine = "BlastDb Size: " ++ show (blastDatabaseSize result) ++ "\n"

-- | Fields of one cmstat summary line
data CMstat = CMstat
  { statIndex :: Int,
    statName :: String,
    statAccession :: String,
    statSequenceNumber :: Int,
    statEffectiveSequences :: Double,
    statConsensusLength :: Int,
    -- W The expected maximum length of a hit to the model.
    statW :: Int,
    statBasepairs :: Int,
    statBifurcations :: Int,
    statModel :: String,
    relativeEntropyCM :: Double,
    relativeEntropyHMM :: Double
  } deriving (Eq, Read)

-- | One labelled line per field; every line ends with a newline.
instance Show CMstat where
  show stat = concatMap labelledLine
    [ ("CMstat - covariance model statistics:\nIndex: ", show (statIndex stat))
    , ("Name: ", show (statName stat))
    , ("Accession: ", show (statAccession stat))
    , ("Sequence Number: ", show (statSequenceNumber stat))
    , ("Effective Sequences: ", show (statEffectiveSequences stat))
    , ("Consensus length: ", show (statConsensusLength stat))
    , ("Expected maximum hit-length: ", show (statW stat))
    , ("Basepairs: ", show (statBasepairs stat))
    , ("Bifurcations: ", show (statBifurcations stat))
    , ("Modeltype: ", show (statModel stat))
    , ("Relative Entropy CM: ", show (relativeEntropyCM stat))
    , ("Relative Entropy HMM: ", show (relativeEntropyHMM stat))
    ]
    where labelledLine (label, value) = label ++ value ++ "\n"

--------------------------------------------------------------------------------
/Biobase/RNAlienScan.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}

-- | Unsupervized construction of RNA family models
--   For more information on RNA family models consult
--   Usage example: RNAlien -i /path/input.fa -c 5 -o /outdir/
--   Usage example offline mode: RNAlien -i /path/input.fa -b /backup/blast/nt_v5 -o /outdir/ -c 5 -t 1396 -j
module Main where

import System.Console.CmdArgs
import System.Directory
import Biobase.RNAlien.Types
import
Biobase.RNAlien.Library
import Data.Maybe
import Data.Either.Unwrap
import Data.Time
import qualified System.FilePath as FP
import Paths_RNAlien (version)
import Data.Version (showVersion)
--import Biobase.Fasta.Streaming
import Control.Monad
import qualified Biobase.StockholmAlignment.Import as BS

-- | Command line options of RNAlienScan
data Options = Options
  { inputFastaFilePath :: String,
    inputAlignmentFilePath :: String,
    inputGenomesFastaFilePath :: String,
    outputPath :: String,
    inputnSCICutoff :: Maybe Double,
    inputEvalueCutoff :: Maybe Double,
    lengthFilter :: Bool,
    coverageFilter :: Bool,
    singleHitperTax :: Bool,
    blastSoftmasking :: Bool,
    inputQuerySelectionMethod :: String,
    inputQueryNumber :: Int,
    threads :: Int,
    sessionIdentificator :: Maybe String,
    performEvaluation :: Bool,
    checkSetup :: Bool
  } deriving (Show,Data,Typeable)

-- | Defaults, short flag names and help texts of the command line options
options :: Options
options = Options
  { inputFastaFilePath = def &= name "i" &= help "Path to input fasta file",
    inputAlignmentFilePath = def &= name "p" &= help "Path to input alignment file",
    inputGenomesFastaFilePath = def &= name "b" &= help "Path to input genome fasta files",
    outputPath = def &= name "o" &= help "Path to output directory. Default: current working directory",
    inputnSCICutoff = Just (1 :: Double) &= name "z" &= help "Only candidate sequences with a normalized structure conservation index (nSCI) higher than this value are accepted. Default: 1",
    inputEvalueCutoff = Just (0.001 :: Double) &= name "e" &= help "Evalue cutoff for cmsearch filtering. Default: 0.001",
    lengthFilter = True &= name "l" &= help "Filter blast hits per genomic length. Default: True",
    coverageFilter = True &= name "a" &= help "Filter blast hits by coverage of at least 80%. Default: True",
    singleHitperTax = False &= name "s" &= help "Only the best blast hit per taxonomic entry is considered. Default: False",
    blastSoftmasking = False &= name "f" &= help "Toggles blast query softmasking, meaning masking of non-conserved regions on the query. Default: False",
    inputQuerySelectionMethod = "filtering" &= name "m" &= help "Method for selection of queries (filtering,clustering). Default: filtering",
    inputQueryNumber = (5 :: Int) &= name "n" &= help "Number of queries used for candidate search. Default: 5",
    threads = 1 &= name "c" &= help "Number of available cpu slots/cores. Default: 1",
    sessionIdentificator = Nothing &= name "d" &= help "Optional session id that is used instead of automatically generated one.",
    performEvaluation = True &= name "x" &= help "Perform evaluation step. Default: True",
    checkSetup = False &= name "g" &= help "Just prints installed tool versions and performs connection check. Default: False"
  } &= summary ("RNAlienScan " ++ alienVersion) &= help "Florian Eggenhofer - 2019" &= verbosity

-- | Entry point of RNAlienScan.
--   Creates the session directory and log files, then starts the scan either
--   from an input alignment (no fasta path given, parameter -p) or from input
--   fasta sequences (parameter -i). Both modes need genome fasta input
--   (parameter -b) and finish by writing result.csv, the optional evaluation,
--   the result summary and the "done" marker file.
main :: IO ()
main = do
  Options{..} <- cmdArgs options
  verboseLevel <- getVerbosity
  -- Generate SessionID
  sessionId <- createSessionID sessionIdentificator
  timestamp <- getCurrentTime
  currentWorkDirectory <- getCurrentDirectory
  let selectedOutputPath = if null outputPath then currentWorkDirectory else outputPath
  let temporaryDirectoryPath = FP.addTrailingPathSeparator selectedOutputPath ++ sessionId ++ "/"
  createDirectoryIfMissing False temporaryDirectoryPath
  setupCheckScanWithLog inputQuerySelectionMethod temporaryDirectoryPath
  createDirectoryIfMissing False (temporaryDirectoryPath ++ "log")
  -- Create Log files
  writeFile (temporaryDirectoryPath ++ "Log") ("RNAlienScan " ++ alienVersion ++ "\n")
  writeFile (temporaryDirectoryPath ++ "log/warnings") ("")
  logMessage ("Timestamp: " ++ (show timestamp) ++ "\n") temporaryDirectoryPath
  logMessage ("Temporary Directory: " ++ temporaryDirectoryPath ++ "\n") temporaryDirectoryPath
  let iterationNumber = 0
  -- The cutoffs fall back to the defaults documented in the help texts
  let nSCICutoffValue = fromMaybe (1 :: Double) inputnSCICutoff
  let evalueCutoffValue = fromMaybe (0.001 :: Double) inputEvalueCutoff
  -- Genomes are passed with -b (-s toggles the single hit per taxonomy filter)
  let missingGenomesMessage = "Please provide input genomes with the cmd line parameter -b"
  -- Identical for both input modes, therefore constructed only once
  let staticOptions = StaticOptions temporaryDirectoryPath sessionId nSCICutoffValue Nothing singleHitperTax inputQuerySelectionMethod inputQueryNumber lengthFilter coverageFilter blastSoftmasking threads Nothing Nothing (setVerbose verboseLevel) True inputGenomesFastaFilePath []
  -- Shared final steps of both input modes
  let finishRun modelConstructionResults = do
        let resultTaxonomyRecordsCSVTable = constructTaxonomyRecordsCSVTable modelConstructionResults
        writeFile (temporaryDirectoryPath ++ "result.csv") resultTaxonomyRecordsCSVTable
        when performEvaluation $ do
          resultEvaluation <- evaluateConstructionResult staticOptions modelConstructionResults
          appendFile (temporaryDirectoryPath ++ "Log") resultEvaluation
        resultSummary modelConstructionResults staticOptions
        writeFile (temporaryDirectoryPath ++ "done") ""
  if null inputFastaFilePath
    then do
      alignmentInput <- BS.readExistingStockholm inputAlignmentFilePath
      inputGenomesFasta <- readFastaFile inputGenomesFastaFilePath
      when (null inputGenomesFasta) (error missingGenomesMessage)
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      -- Only the first alignment of the file is used; a parse error or a file
      -- without any alignment stops the run with an explicit message
      rightAlignment <- case alignmentInput of
        Left parseError -> error parseError
        Right [] -> error ("No alignment found in input alignment file: " ++ inputAlignmentFilePath)
        Right (firstAlignment:_) -> return firstAlignment
      let reformatedFastaInput = stockholmAlignmentToFasta rightAlignment
      when (null reformatedFastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i or an input alignment containing sequences with the cmd line parameter -p")
      --let initialization = ModelConstruction iterationNumber reformatedFastaInput [] Nothing Nothing (fromJust inputEvalueCutoff) False [] [] [] alignmentInput
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] Nothing Nothing evalueCutoffValue False [] [] inputGenomesFasta (Just rightAlignment)
      logMessage (show initialization) temporaryDirectoryPath
      --logVerboseMessage (verbositySwitch staticOptions) ("Alignment construction with candidates - infernal mode\n") (tempDirPath staticOptions)
      --prepare next iteration
      let nextModelConstructionInput = constructNext iterationNumber initialization [] [] Nothing Nothing [] [] True
      -- Iteration directory 0 receives the model built from the input alignment
      let outputDirectory = tempDirPath staticOptions ++ "0" ++ "/"
      createDirectory outputDirectory
      let fastaFilePath = outputDirectory ++ "model" ++ ".fa"
      writeFastaFile fastaFilePath reformatedFastaInput
      let stockholmFilepath = outputDirectory ++ "model" ++ ".stockholm"
      let cmFilepath = outputDirectory ++ "model" ++ ".cm"
      let cmCalibrateFilepath = outputDirectory ++ "model" ++ ".cmcalibrate"
      let cmBuildFilepath = outputDirectory ++ "model" ++ ".cmbuild"
      copyFile inputAlignmentFilePath stockholmFilepath
      let refinedAlignmentFilepath = outputDirectory ++ "modelrefined.stockholm"
      let cmBuildOptions ="--refine " ++ refinedAlignmentFilepath
      _ <- systemCMbuild cmBuildOptions stockholmFilepath cmFilepath cmBuildFilepath
      _ <- systemCMcalibrate "fast" (cpuThreads staticOptions) cmFilepath cmCalibrateFilepath
      writeFile (outputDirectory ++ "done") ""
      --select queries
      currentSelectedQueries <- selectQueries staticOptions nextModelConstructionInput []
      --let nextScanModelConstructionInputWithQueries = nextModelConstructionInput {selectedQueries = currentSelectedQueries}
      --logMessage (iterationSummaryLog nextScanModelConstructionInputWithQueries) (tempDirPath staticOptions)
      let nextScanModelConstructionInputWithQueries = initialization {iterationNumber = (1 :: Int), selectedQueries = currentSelectedQueries}
      modelConstructionResults <- scanModelConstructer staticOptions nextScanModelConstructionInputWithQueries -- nextScanModelConstructionInputWithQueries
      --modelConstructionResults <- alignmentConstructionWithoutCandidates "scan" Nothing Nothing staticOptions nextScanModelConstructionInputWithQueries
      finishRun modelConstructionResults
    else do
      fastaInput <- readFastaFile inputFastaFilePath
      when (null fastaInput) (error "Please provide input fasta sequences with the cmd line parameter -i")
      inputGenomesFasta <- readFastaFile inputGenomesFastaFilePath
      when (null inputGenomesFasta) (error missingGenomesMessage)
      logToolVersions inputQuerySelectionMethod temporaryDirectoryPath
      let reformatedFastaInput = map reformatFasta fastaInput
      let initialization = ModelConstruction iterationNumber reformatedFastaInput [] [] Nothing Nothing evalueCutoffValue False [] [] inputGenomesFasta Nothing
      logMessage (show initialization) temporaryDirectoryPath
      modelConstructionResults <- scanModelConstructer staticOptions initialization
      finishRun modelConstructionResults

-- | Version string of the package as generated by cabal
alienVersion :: String
alienVersion = showVersion version

--------------------------------------------------------------------------------
/Biobase/RNAlienStatistics.hs:
-------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecordWildCards #-} 2 | {-# LANGUAGE DeriveDataTypeable #-} 3 | 4 | -- | Statistics for RNAlien Results 5 | -- dist/build/RNAlienStatistics/RNAlienStatistics -s bitscore -i /scratch/egg/temp/cm13676/1/model.cm -r /home/mescalin/egg/current/Data/AlienTest/cms/BsrG.cm -g /scratch/egg/temp/AlienSearch/genomes/ -o /scratch/egg/temp/AlienStatistics 6 | module Main where 7 | 8 | import System.Console.CmdArgs 9 | import Data.Either.Unwrap 10 | import System.Process 11 | import qualified Data.ByteString.Char8 as B 12 | import Biobase.RNAlien.Library 13 | import System.Directory 14 | import Biobase.Fasta.Strict 15 | import Data.List 16 | import qualified System.FilePath as FP 17 | import qualified Data.List.Split as DS 18 | import Text.Printf 19 | import Bio.RNAzParser 20 | import qualified Bio.RNAcodeParser as RC 21 | import Biobase.Types.BioSequence 22 | 23 | data Options = Options 24 | { alienCovarianceModelPath :: String, 25 | alienrnazPath :: String, 26 | alienrnacodePath :: String, 27 | aliencmstatPath :: String, 28 | rfamCovarianceModelPath :: String, 29 | rfamFastaFilePath :: String, 30 | alienFastaFilePath :: String, 31 | rfamModelName :: String, 32 | rfamModelId :: String, 33 | rfamThreshold :: Double, 34 | alienThreshold :: Double, 35 | databaseSize :: Maybe Double, 36 | outputDirectoryPath :: String, 37 | benchmarkIndex :: Int, 38 | thresholdSelection :: String, 39 | linkScores :: Bool, 40 | threads :: Int 41 | } deriving (Show,Data,Typeable) 42 | 43 | options :: Options 44 | options = Options 45 | { alienCovarianceModelPath = def &= name "i" &= help "Path to alienCovarianceModelPath", 46 | alienrnazPath = def &= name "z" &= help "Path to alienRNAzResult", 47 | alienrnacodePath = def &= name "w" &= help "Path to alienRNAcodeResult", 48 | aliencmstatPath = def &= name "m" &= help "Path to aliencmstatResult", 49 | rfamCovarianceModelPath = def &= name "r" &= help 
"Path to rfamCovarianceModelPath", 50 | rfamFastaFilePath = def &= name "g" &= help "Path to rfamFastaFile", 51 | rfamModelName = def &= name "n" &= help "Rfam model name", 52 | rfamModelId = def &= name "d" &= help "Rfam model id", 53 | alienFastaFilePath = def &= name "a" &= help "Path to alienFastaFile", 54 | outputDirectoryPath = def &= name "o" &= help "Path to output directory", 55 | alienThreshold = 20 &= name "t" &= help "Bitscore threshold for RNAlien model hits on Rfam fasta, default 20", 56 | rfamThreshold = 20 &= name "x" &= help "Bitscore threshold for Rfam model hits on Alien fasta, default 20", 57 | databaseSize = Nothing &= name "k" &= help "Cmsearch database size in mega bases. default not set", 58 | benchmarkIndex = 1 &= name "b" &= help "Index used to identify sRNA tagged RNA families", 59 | thresholdSelection = "bitscore" &= name "s" &= help "Selection method, (bitscore, evalue), default bitscore", 60 | linkScores = False &= name "l" &= help "Triggers computation of linkscores via CMCompare", 61 | threads = 1 &= name "c" &= help "Number of available cpu slots/cores, default 1" 62 | } &= summary "RNAlienStatistics" &= help "Florian Eggenhofer - >2013" &= verbosity 63 | 64 | --cmSearchFasta threads rfamCovarianceModelPath outputDirectoryPath "Rfam" False genomesDirectoryPath 65 | cmSearchFasta :: Int -> String -> Double -> Maybe Double -> Int -> String -> String -> String -> String -> IO [CMsearchHit] 66 | cmSearchFasta benchmarkIndex thresholdSelection thresholdScore databaseSize cpuThreads covarianceModelPath outputDirectory modelType fastapath = do 67 | createDirectoryIfMissing False (outputDirectory ++ "/" ++ modelType) 68 | _ <- systemCMsearch cpuThreads (maybe "" (\dbs -> " -Z " ++ show dbs ++ " ") databaseSize) covarianceModelPath fastapath (outputDirectory ++ "/" ++ modelType ++ "/" ++ show benchmarkIndex ++ ".cmsearch") 69 | --_ <- systemCMsearch cpuThreads " " covarianceModelPath fastapath (outputDirectory ++ "/" ++ modelType ++ "/" ++ 
(show benchmarkIndex) ++ ".cmsearch") 70 | result <- readCMSearch (outputDirectory ++ "/" ++ modelType ++ "/" ++ show benchmarkIndex ++ ".cmsearch") 71 | if isLeft result 72 | then do 73 | print (fromLeft result) 74 | return [] 75 | else do 76 | let rightResults = fromRight result 77 | let significantHits = filterCMsearchHits thresholdSelection thresholdScore rightResults 78 | let uniquesignificantHits = nubBy cmSearchSameHit significantHits 79 | return uniquesignificantHits 80 | 81 | --cmSearchFasta threads rfamCovarianceModelPath outputDirectoryPath "Rfam" False genomesDirectoryPath 82 | cmSearchesFasta :: Int -> String -> Double -> Maybe Double -> Int -> String -> String -> String -> String -> IO [CMsearchHit] 83 | cmSearchesFasta benchmarkIndex thresholdSelection thresholdScore databaseSize cpuThreads covarianceModelPath outputDirectory modelType fastapath = do 84 | createDirectoryIfMissing False (outputDirectory ++ "/" ++ modelType) 85 | _ <- systemCMsearch cpuThreads (maybe "" (\dbs -> " -Z " ++ show dbs ++ " ") databaseSize) covarianceModelPath fastapath (outputDirectory ++ "/" ++ modelType ++ "/" ++ show benchmarkIndex ++ ".cmsearch") 86 | --_ <- systemCMsearch cpuThreads " " covarianceModelPath fastapath (outputDirectory ++ "/" ++ modelType ++ "/" ++ (show benchmarkIndex) ++ ".cmsearch") 87 | result <- readCMSearches (outputDirectory ++ "/" ++ modelType ++ "/" ++ show benchmarkIndex ++ ".cmsearch") 88 | if isLeft result 89 | then do 90 | print (fromLeft result) 91 | return [] 92 | else do 93 | let rightResults = fromRight result 94 | let significantHits = filterCMsearchHits thresholdSelection thresholdScore rightResults 95 | --putStrLn ("significant Hits " ++ show (length significantHits)) 96 | let uniquesignificantHits = nubBy cmSearchSameHit significantHits 97 | --putStrLn ("unique significant Hits " ++ show (length uniquesignificantHits)) 98 | --let organismUniquesignificantHits = nubBy cmSearchSameOrganism significantHits 99 | return 
uniquesignificantHits 100 | 101 | filterCMsearchHits :: String -> Double -> CMsearch -> [CMsearchHit] 102 | filterCMsearchHits thresholdSelection thresholdScore cmSearchResult 103 | | thresholdSelection == "bitscore" = bitscorefiltered 104 | | otherwise = evaluefiltered 105 | where bitscorefiltered = filter (\hit -> hitScore hit >= thresholdScore) (cmsearchHits cmSearchResult) 106 | evaluefiltered = filter (\hit -> hitEvalue hit <= thresholdScore) (cmsearchHits cmSearchResult) 107 | 108 | partitionCMsearchHits :: String -> Double -> CMsearch -> ([CMsearchHit],[CMsearchHit]) 109 | partitionCMsearchHits thresholdSelection thresholdScore cmSearchResult 110 | | thresholdSelection == "bitscore" = (bitscoreselected,bitscorerejected) 111 | | otherwise = (evalueselected,evaluerejected) 112 | where (bitscoreselected,bitscorerejected) = partition (\hit -> hitScore hit >= thresholdScore) (cmsearchHits cmSearchResult) 113 | (evalueselected,evaluerejected) = partition (\hit -> hitEvalue hit <= thresholdScore) (cmsearchHits cmSearchResult) 114 | 115 | trimCMsearchFastaFile :: String -> String -> String -> CMsearch -> String -> IO () 116 | trimCMsearchFastaFile genomesDirectory outputFolder modelType cmsearch fastafile = do 117 | let fastaInputPath = genomesDirectory ++ "/" ++ fastafile 118 | let fastaOutputPath = outputFolder ++ "/" ++ modelType ++ "/" ++ fastafile 119 | fastaSequences <- readFastaFile fastaInputPath 120 | let trimmedSequence = trimCMsearchSequence cmsearch (head fastaSequences) 121 | writeFastaFile fastaOutputPath [trimmedSequence] 122 | 123 | trimCMsearchSequence :: CMsearch -> Fasta () () -> Fasta () () 124 | trimCMsearchSequence cmSearchResult inputSequence = subSequence 125 | where hitScoreEntry = head (cmsearchHits cmSearchResult) 126 | sequenceString = show (_fasta inputSequence) 127 | sequenceSubstring = cmSearchsubString (hitStart hitScoreEntry) (hitEnd hitScoreEntry) sequenceString 128 | newSequenceHeader = SequenceIdentifier (B.pack (show (_header 
inputSequence) ++ "cmS_" ++ show (hitStart hitScoreEntry) ++ "_" ++ show (hitEnd hitScoreEntry) ++ "_" ++ show (hitStrand hitScoreEntry))) 129 | subSequence = Fasta newSequenceHeader (BioSequence (B.pack sequenceSubstring)) 130 | 131 | --With paralogs allowed 132 | cmSearchSameHit :: CMsearchHit -> CMsearchHit -> Bool 133 | cmSearchSameHit hitscore1 hitscore2 134 | | unpackedSeqHeader1 == unpackedSeqHeader2 = True 135 | | otherwise = False 136 | where unpackedSeqHeader1 = B.unpack (hitSequenceHeader hitscore1) 137 | unpackedSeqHeader2 = B.unpack (hitSequenceHeader hitscore2) 138 | 139 | cmSearchSameOrganism :: CMsearchHit -> CMsearchHit -> Bool 140 | cmSearchSameOrganism hitscore1 hitscore2 141 | | hitOrganism1 == hitOrganism2 = True 142 | | otherwise = False 143 | where unpackedSeqHeader1 = B.unpack (hitSequenceHeader hitscore1) 144 | unpackedSeqHeader2 = B.unpack (hitSequenceHeader hitscore2) 145 | separationcharacter1 = selectSeparationChar unpackedSeqHeader1 146 | separationcharacter2 = selectSeparationChar unpackedSeqHeader2 147 | hitOrganism1 = head (DS.splitOn separationcharacter1 unpackedSeqHeader1) 148 | hitOrganism2 = head (DS.splitOn separationcharacter2 unpackedSeqHeader2) 149 | 150 | selectSeparationChar :: String -> String 151 | selectSeparationChar inputString 152 | | any ((== ':')) inputString = ":" 153 | | otherwise = "/" 154 | 155 | main :: IO () 156 | main = do 157 | Options{..} <- cmdArgs options 158 | rfamModelExists <- doesFileExist rfamCovarianceModelPath 159 | verbose <- getVerbosity 160 | rnazString <- rnazOutput verbose alienrnazPath 161 | rnacodeString <- rnaCodeOutput verbose alienrnacodePath 162 | cmStatString <- cmStatOutput verbose aliencmstatPath 163 | if rfamModelExists 164 | then do 165 | --compute linkscore 166 | linkscore <- if linkScores 167 | then compareCM rfamCovarianceModelPath alienCovarianceModelPath outputDirectoryPath 168 | else return (Left "-") 169 | rfamMaxLinkScore <- if linkScores then compareCM 
rfamCovarianceModelPath rfamCovarianceModelPath outputDirectoryPath else return (Left "-") 170 | alienMaxLinkscore <- if linkScores then compareCM alienCovarianceModelPath alienCovarianceModelPath outputDirectoryPath else return (Left "-") 171 | _ <- system ("cat " ++ rfamFastaFilePath ++ " | grep '>' | wc -l >" ++ outputDirectoryPath ++ FP.takeFileName rfamFastaFilePath ++ ".entries") 172 | _ <- system ("cat " ++ alienFastaFilePath ++ " | grep '>' | wc -l >" ++ outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries") 173 | rfamFastaEntries <- readFile (outputDirectoryPath ++ FP.takeFileName rfamFastaFilePath ++ ".entries") 174 | alienFastaEntries <- readFile (outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries") 175 | let rfamFastaEntriesNumber = read rfamFastaEntries :: Int 176 | let alienFastaEntriesNumber = read alienFastaEntries :: Int 177 | rfamonAlienResults <- cmSearchesFasta benchmarkIndex thresholdSelection rfamThreshold databaseSize threads rfamCovarianceModelPath outputDirectoryPath "rfamOnAlien" alienFastaFilePath 178 | alienonRfamResults <- cmSearchFasta benchmarkIndex thresholdSelection alienThreshold databaseSize threads alienCovarianceModelPath outputDirectoryPath "alienOnRfam" rfamFastaFilePath 179 | let rfamonAlienResultsNumber = length rfamonAlienResults 180 | let alienonRfamResultsNumber = length alienonRfamResults 181 | let rfamonAlienRecovery = (fromIntegral rfamonAlienResultsNumber :: Double) / (fromIntegral alienFastaEntriesNumber :: Double) 182 | let alienonRfamRecovery = (fromIntegral alienonRfamResultsNumber :: Double) / (fromIntegral rfamFastaEntriesNumber :: Double) 183 | if verbose == Loud 184 | then do 185 | putStrLn ("BenchmarkIndex: " ++ show benchmarkIndex) 186 | putStrLn ("RfamModelName: " ++ rfamModelName) 187 | putStrLn ("RfamModelId: " ++ rfamModelId) 188 | putStrLn ("Linkscore: " ++ either id show linkscore) 189 | putStrLn ("rfamMaxLinkScore: " ++ either id show rfamMaxLinkScore) 190 | 
putStrLn ("alienMaxLinkscore: " ++ either id show alienMaxLinkscore) 191 | putStrLn ("rfamGatheringThreshold: " ++ show rfamThreshold) 192 | putStrLn ("alienGatheringThreshold: " ++ show alienThreshold) 193 | putStrLn ("rfamFastaEntriesNumber: " ++ show rfamFastaEntriesNumber) 194 | putStrLn ("alienFastaEntriesNumber: " ++ show alienFastaEntriesNumber) 195 | putStrLn ("rfamonAlienResultsNumber: " ++ show rfamonAlienResultsNumber) 196 | putStrLn ("alienonRfamResultsNumber: " ++ show alienonRfamResultsNumber) 197 | putStrLn ("RfamonAlienRecovery: " ++ show rfamonAlienRecovery) 198 | putStrLn ("AlienonRfamRecovery: " ++ show alienonRfamRecovery) 199 | print rnazString 200 | print rnacodeString 201 | print cmStatString 202 | else 203 | putStrLn (show benchmarkIndex ++ "\t" ++ rfamModelName ++ "\t" ++ rfamModelId ++ "\t" ++ (either id show linkscore) ++ "\t" ++ (either id show rfamMaxLinkScore) ++ "\t" ++ (either id show alienMaxLinkscore) ++ "\t" ++ show rfamThreshold ++ "\t" ++ show alienThreshold ++ "\t" ++ show rfamFastaEntriesNumber ++ "\t" ++ show alienFastaEntriesNumber ++ "\t" ++ show rfamonAlienResultsNumber ++ "\t" ++ show alienonRfamResultsNumber ++ "\t" ++ printf "%.2f" rfamonAlienRecovery ++ "\t" ++ printf "%.2f" alienonRfamRecovery ++ "\t" ++ rnazString ++ "\t" ++ rnacodeString ++ "\t" ++ cmStatString) 204 | else do 205 | --compute linkscore 206 | alienMaxLinkscore <- if linkScores then compareCM alienCovarianceModelPath alienCovarianceModelPath outputDirectoryPath else return ( Left "-") 207 | _ <- system ("cat " ++ alienFastaFilePath ++ " | grep '>' | wc -l >" ++ outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries") 208 | alienFastaEntries <- readFile (outputDirectoryPath ++ FP.takeFileName alienFastaFilePath ++ ".entries") 209 | let alienFastaEntriesNumber = read alienFastaEntries :: Int 210 | if verbose == Loud 211 | then do 212 | putStrLn "BenchmarkIndex:" 213 | putStrLn "RfamModelName: -" 214 | putStrLn "RfamModelId: -" 215 | 
putStrLn "Linkscore: -" 216 | putStrLn "rfamMaxLinkScore: -" 217 | putStrLn ("alienMaxLinkscore: " ++ either id show alienMaxLinkscore) 218 | putStrLn "rfamGatheringThreshold: -" 219 | putStrLn "alienGatheringThreshold: -" 220 | putStrLn "rfamFastaEntriesNumber: -" 221 | putStrLn ("alienFastaEntriesNumber: " ++ show alienFastaEntriesNumber) 222 | putStrLn "rfamonAlienResultsNumber: -" 223 | putStrLn "alienonRfamResultsNumber: -" 224 | putStrLn "RfamonAlienRecovery: -" 225 | putStrLn "AlienonRfamRecovery: -" 226 | print rnazString 227 | print cmStatString 228 | else 229 | putStrLn (show benchmarkIndex ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ (either id show alienMaxLinkscore) ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ show alienFastaEntriesNumber ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ rnazString ++ "\t" ++ rnacodeString ++ "\t" ++ cmStatString) 230 | {-| Render the RNAz result file at rnazPath as text: a labelled multi-line report when verbose is Loud, otherwise one row of tab-separated values (background and decision model are only part of the Loud report). When the file does not exist or readRNAz returns a Left, every value is replaced by the placeholder "-". -} 231 | rnazOutput :: Verbosity -> String -> IO String 232 | rnazOutput verbose rnazPath = do 233 | rnazPresent <- doesFileExist rnazPath 234 | if rnazPresent 235 | then do 236 | inputRNAz <- readRNAz rnazPath 237 | if isRight inputRNAz 238 | then do 239 | let rnaZ = fromRight inputRNAz 240 | if verbose == Loud 241 | then do 242 | let output = "Mean pairwise identity: " ++ show (meanPairwiseIdentity rnaZ) ++ "\n Shannon entropy: " ++ show (shannonEntropy rnaZ) ++ "\n GC content: " ++ show (gcContent rnaZ) ++ "\n Mean single sequence minimum free energy: " ++ show (meanSingleSequenceMinimumFreeEnergy rnaZ) ++ "\n Consensus minimum free energy: " ++ show (consensusMinimumFreeEnergy rnaZ) ++ "\n Energy contribution: " ++ show (energyContribution rnaZ) ++ "\n Covariance contribution: " ++ show (covarianceContribution rnaZ) ++ "\n Combinations pair: " ++ show (combinationsPair rnaZ) ++ "\n Mean z-score: " ++ show (meanZScore rnaZ) ++ "\n Structure conservation index: " ++ show (structureConservationIndex rnaZ) ++ "\n Background model: " ++ 
backgroundModel rnaZ ++ "\n Decision model: " ++ decisionModel rnaZ ++ "\n SVM decision value: " ++ show (svmDecisionValue rnaZ) ++ "\n SVM class propability: " ++ show (svmRNAClassProbability rnaZ) ++ "\n Prediction: " ++ prediction rnaZ 243 | return output 244 | else do 245 | let output = show (meanPairwiseIdentity rnaZ) ++ "\t" ++ show (shannonEntropy rnaZ) ++ "\t" ++ show (gcContent rnaZ) ++ "\t" ++ show (meanSingleSequenceMinimumFreeEnergy rnaZ) ++ "\t" ++ show (consensusMinimumFreeEnergy rnaZ) ++ "\t" ++ show (energyContribution rnaZ) ++ "\t" ++ show (covarianceContribution rnaZ) ++ "\t" ++ show (combinationsPair rnaZ) ++ "\t" ++ show (meanZScore rnaZ) ++ "\t" ++ show (structureConservationIndex rnaZ) ++ "\t" ++ show (svmDecisionValue rnaZ) ++ "\t" ++ show (svmRNAClassProbability rnaZ) ++ "\t" ++ prediction rnaZ 246 | return output 247 | else 248 | if (verbose == Loud) 249 | then do 250 | let output = "Mean pairwise identity: " ++ " - \n Shannon entropy: " ++ " - \n GC content: " ++ " - \n Mean single sequence minimum free energy: " ++ " - \n Consensus minimum free energy: " ++ " - \n Energy contribution: " ++ " - \n Covariance contribution: " ++ " - \n Combinations pair: " ++ " - \n Mean z-score: " ++ " - \n Structure conservation index: " ++ " - \n Background model: " ++ " - \n Decision model: " ++ " - \n SVM decision value: " ++ " - \n SVM class propability: " ++ " - \n Prediction: " ++ " - \n" 251 | return output 252 | else do 253 | let output = "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" 254 | return output 255 | else 256 | if (verbose == Loud) 257 | then do 258 | let output = "Mean pairwise identity: " ++ " - \n Shannon entropy: " ++ " - \n GC content: " ++ " - \n Mean single sequence minimum free energy: " ++ " - \n Consensus minimum free energy: " ++ " - \n Energy contribution: " ++ " - \n Covariance contribution: 
" ++ " - \n Combinations pair: " ++ " - \n Mean z-score: " ++ " - \n Structure conservation index: " ++ " - \n Background model: " ++ " - \n Decision model: " ++ " - \n SVM decision value: " ++ " - \n SVM class propability: " ++ " - \n Prediction: " ++ " - \n" 259 | return output 260 | else do 261 | let output = "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" ++ "\t" ++ "-" 262 | return output 263 | {-| Render the cmstat result file at cmstatPath as text: labelled lines when verbose is Loud, otherwise nine tab-separated values. When the file does not exist or readCMstat returns a Left, every value is replaced by the placeholder "-". -} 264 | cmStatOutput :: Verbosity -> String -> IO String 265 | cmStatOutput verbose cmstatPath = do 266 | cmstatPresent <- doesFileExist cmstatPath 267 | if cmstatPresent 268 | then do 269 | inputCMstat <- readCMstat cmstatPath 270 | if isRight inputCMstat 271 | then do 272 | let cmStat = fromRight inputCMstat 273 | if verbose == Loud 274 | then do 275 | let output = "statSequenceNumber: " ++ show (statSequenceNumber cmStat) ++ "\nstatEffectiveSequences: " ++ show (statEffectiveSequences cmStat) ++ "\nstatConsensusLength: " ++ show (statConsensusLength cmStat) ++ "\nstatW: " ++ show (statW cmStat) ++ "\nstatBasepairs: " ++ show (statBasepairs cmStat) ++ "\nstatBifurcations: " ++ show (statBifurcations cmStat) ++ "\nstatModel: " ++ statModel cmStat ++ "\nrelativeEntropyCM: " ++ show (relativeEntropyCM cmStat) ++ "\nrelativeEntropyHMM: " ++ show (relativeEntropyHMM cmStat) 276 | return output 277 | else do 278 | let output = show (statSequenceNumber cmStat) ++ "\t" ++ show (statEffectiveSequences cmStat) ++ "\t" ++ show (statConsensusLength cmStat) ++ "\t" ++ show (statW cmStat) ++ "\t" ++ show (statBasepairs cmStat) ++ "\t" ++ show (statBifurcations cmStat) ++ "\t" ++ statModel cmStat ++ "\t" ++ show (relativeEntropyCM cmStat) ++ "\t" ++ show (relativeEntropyHMM cmStat) 279 | return output 280 | else 281 | if (verbose == Loud) 282 | then do 283 | let output = "statSequenceNumber: -" ++ "\nstatEffectiveSequences: -" ++ "\nstatConsensusLength: -" ++ 
"\nstatW: -" ++ "\nstatBasepairs: -" ++ "\nstatBifurcations: -" ++ "\nstatModel: -" ++ "\nrelativeEntropyCM: -" ++ "\nrelativeEntropyHMM: -" 284 | return output 285 | else do 286 | let output = "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-" 287 | return output 288 | else 289 | if (verbose == Loud) 290 | then do 291 | let output = "statSequenceNumber: -" ++ "\nstatEffectiveSequences: -" ++ "\nstatConsensusLength: -" ++ "\nstatW: -" ++ "\nstatBasepairs: -" ++ "\nstatBifurcations: -" ++ "\nstatModel: -" ++ "\nrelativeEntropyCM: -" ++ "\nrelativeEntropyHMM: -" 292 | return output 293 | else do 294 | let output = "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-\t" ++ "-" 295 | return output 296 | {-| Report the lowest p-value among the RNAcode hits read from rnaCodePath together with a classification (PROTEIN when that p-value is below 0.05, OTHER otherwise): two labelled lines when verbose is Loud, otherwise two tab-separated values. When the file does not exist, readRNAcodeTabular returns a Left, or the parsed result holds no hits, both values are replaced by the placeholder "-". -} 297 | rnaCodeOutput :: Verbosity -> String -> IO String 298 | rnaCodeOutput verbose rnaCodePath = do 299 | rnacodePresent <- doesFileExist rnaCodePath 300 | if rnacodePresent 301 | then do 302 | inputRNACode <- RC.readRNAcodeTabular rnaCodePath 303 | if isRight inputRNACode && not (null (RC.rnacodeHits (fromRight inputRNACode))) {- minimum is partial: an empty hit list would throw, so that case takes the placeholder branch below; fromRight is only evaluated after isRight succeeded -} 304 | then do 305 | let rnaCode = fromRight inputRNACode 306 | let lowestPvalue = minimum (map RC.pvalue (RC.rnacodeHits rnaCode)) 307 | let rnaCodeClassification = if lowestPvalue < 0.05 then "PROTEIN" else "OTHER" 308 | if verbose == Loud 309 | then do 310 | let output = "RNAcode lowest p-value: " ++ show lowestPvalue ++ "\nrnaCodeClassification: " ++ rnaCodeClassification 311 | return output 312 | else do 313 | let output = show lowestPvalue ++ "\t" ++ rnaCodeClassification 314 | return output 315 | else 316 | if (verbose == Loud) 317 | then do 318 | let output = "RNAcode lowest p-value: " ++ "-" ++ "\nrnaCodeClassification: " ++ "-" 319 | return output 320 | else do 321 | let output = "-\t" ++ "-" 322 | --let output = show (fromLeft inputRNACode) 323 | return output 324 | else 325 | if (verbose == Loud) 326 | then do 327 | let output = "RNAcode lowest p-value: " ++ "-" ++ "\nrnaCodeClassification: " ++ "-" 328 | return output 329 | else do 330 | let output = 
"-\t" ++ "-" 331 | return output 332 | -------------------------------------------------------------------------------- /Biobase/cmsearchToBED.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecordWildCards #-} 2 | {-# LANGUAGE DeriveDataTypeable #-} 3 | 4 | -- | Convert cmsearch output to Browser Extensible Data (BED) format 5 | -- Testcommand: cmsearchToBED -i /path/to/test.clustal 6 | module Main where 7 | import Prelude 8 | import System.Console.CmdArgs 9 | import Biobase.RNAlien.Library 10 | import Data.Either.Unwrap 11 | import qualified Data.ByteString.Char8 as B 12 | import qualified Data.Text as T 13 | import Data.List 14 | 15 | data Bed = Bed 16 | { browserPostition :: T.Text, 17 | browserSettings :: T.Text, 18 | bedName :: T.Text, 19 | bedDescription :: T.Text, 20 | bedVisibility :: Int, 21 | bedItemRgb :: Bool, 22 | bedEntries :: [BedEntry] 23 | } deriving (Eq, Read) 24 | 25 | instance Show Bed where 26 | show (Bed _browserPostition _browserSettings _bedName _bedDescription _bedVisibility _bedItemRgb _bedEntries) = a ++ b ++ c ++ d ++ e ++ f ++ g 27 | where a = "browser position " ++ T.unpack _browserPostition ++ "\n" 28 | b = T.unpack _browserSettings ++ "\n" 29 | c = "track name=\"" ++ T.unpack _bedName ++ "\" " 30 | d = "description=\"" ++ T.unpack _bedDescription ++ "\" " 31 | e = "visibility=" ++ show _bedVisibility ++ " " 32 | f = "itemRgb=\"" ++ itemRbg ++ "\"\n" 33 | itemRbg = if _bedItemRgb then "On" else "Off" 34 | g = concatMap show _bedEntries 35 | 36 | 37 | data BedEntry = BedEntry 38 | { chrom :: T.Text, 39 | chromStart :: Int, 40 | chromEnd :: Int, 41 | chromName :: Maybe T.Text, 42 | score :: Maybe Int, 43 | strand :: Maybe Char, 44 | thickStart :: Maybe Int, 45 | thickEnd :: Maybe Int, 46 | color :: Maybe T.Text, 47 | blockCount :: Maybe Int, 48 | blockSizes :: Maybe [Int], 49 | blockStarts :: Maybe [Int] 50 | } deriving (Eq, Read) 51 | 52 | instance Show BedEntry where 53 | show 
(BedEntry _chrom _chromStart _chromEnd _chromName _score _strand _thickStart _thickEnd _color _blockCount _blockSizes _blockStarts) = a ++ b ++ c ++ d ++ e ++ f ++ g ++ h ++ i ++ j ++ k ++ l 54 | where a = T.unpack _chrom ++ "\t" 55 | b = show _chromStart ++ "\t" 56 | c = show _chromEnd ++ "\t" 57 | d = maybe "" T.unpack _chromName ++ "\t" 58 | e = maybe "" show _score ++ "\t" 59 | f = maybe "" ((: [])) _strand ++ "\t" 60 | g = maybe "" show _thickStart ++ "\t" 61 | h = maybe "" show _thickEnd ++ "\t" 62 | i = maybe "" T.unpack _color ++ "\t" 63 | j = maybe "" show _blockCount ++ "\t" 64 | k = maybe "" (intercalate "," . map show) _blockSizes ++ "\t" 65 | l = maybe "" (intercalate "," . map show) _blockStarts ++ "\n" 66 | 67 | data Options = Options 68 | { cmsearchPath :: String, 69 | inputBrowserSettings :: String, 70 | inputBedVisibility :: Int, 71 | inputTrackName :: String, 72 | inputTrackDescription :: String, 73 | inputItemRgb :: Bool, 74 | inputTrackColor :: String, 75 | sortBed :: Bool, 76 | withHeader :: Bool 77 | } deriving (Show,Data,Typeable) 78 | {-| Command line switches of cmsearchToBED with their short names, default values and help texts. The r switch controls the itemRgb attribute written into the track line (On or Off), while the c switch sets the color string attached to every entry. -} 79 | options :: Options 80 | options = Options 81 | { cmsearchPath = def &= name "i" &= help "Path to input cmsearch file", 82 | inputBrowserSettings = "browser hide all" &= name "b" &= help "Browser settings. Default: browser hide all", 83 | inputBedVisibility = (2 :: Int) &= name "y" &= help "Visibility setting of track. Default: 2", 84 | inputTrackName = "PredictedRNA" &= name "n" &= help "Name of the track Default: PredictedRNA", 85 | inputTrackDescription = "RNA loci predicted by cmsearch" &= name "d" &= help "Description of the track. Default: RNA loci predicted by cmsearch", 86 | inputItemRgb = True &= name "r" &= help "Switch the itemRgb attribute of the track On or Off. Default: True", 87 | inputTrackColor = "255,0,0" &= name "c" &= help "RGB Color of the track. Default: 255,0,0", 88 | sortBed = True &= name "s" &= help "Sort entries of Bed file by start and end coordinates. 
Default: True", 89 | withHeader = True &= name "w" &= help "Output contains bed header. Default: True" 90 | } &= summary "cmsearchToBED - Converts cmsearch file hits to BED file entries" &= help "Florian Eggenhofer 2016" &= verbosity 91 | 92 | main :: IO () 93 | main = do 94 | Options{..} <- cmdArgs options 95 | parsedCmsearch <- readCMSearch cmsearchPath 96 | if isRight parsedCmsearch 97 | then do 98 | let outputBED = convertcmSearchToBED (fromRight parsedCmsearch) inputBrowserSettings inputTrackName inputTrackDescription inputTrackColor inputBedVisibility inputItemRgb sortBed 99 | if isRight outputBED 100 | then 101 | if withHeader 102 | then print (fromRight outputBED) 103 | else do 104 | let output = concatMap show (bedEntries (fromRight outputBED)) 105 | putStr output 106 | else putStr (fromLeft outputBED) 107 | else putStr ("A problem occured converting from cmsearch to BED format:\n " ++ show (fromLeft parsedCmsearch)) 108 | 109 | --convertcmSearchToBED :: CMsearch -> String -> String -> Either String String 110 | --convertcmSearchToBED inputcmsearch trackName trackColor 111 | -- | null cmHits = Left "cmsearch file contains no hits" 112 | -- | otherwise = Right (bedHeader ++ bedEntries) 113 | -- where cmHits = cmsearchHits inputcmsearch 114 | -- bedHeader = "browser position " ++ browserPosition ++ "\nbrowser hide all\ntrack name=\"cmsearch hits\" description=\"cmsearch hits\" visibility=2 itemRgb=\"On\"\n" 115 | -- bedEntries = concatMap (cmsearchHitToBEDentry trackName trackColor) cmHits 116 | -- browserPosition = L.unpack (hitSequenceHeader firstHit) ++ ":" ++ entryStart firstHit ++ "-" ++ entryEnd firstHit 117 | -- firstHit = (head cmHits) 118 | 119 | convertcmSearchToBED :: CMsearch -> String -> String -> String -> String -> Int -> Bool -> Bool -> Either String Bed 120 | convertcmSearchToBED inputcmsearch inputBrowserSettings trackName trackDescription trackColor inputBedVisibility inputItemRgb sortBed 121 | | null cmHits = Left "cmsearch file contains 
no hits" 122 | | otherwise = Right bed 123 | where cmHits = cmsearchHits inputcmsearch 124 | --bedHeader = "browser position " ++ browserPosition ++ "\nbrowser hide all\ntrack name=\"cmsearch hits\" description=\"cmsearch hits\" visibility=2 itemRgb=\"On\"\n" 125 | bedEntries = map (cmsearchHitToBEDentry trackName trackColor) cmHits 126 | sortedBedEntries = if sortBed then sortBy orderBedEntry bedEntries else bedEntries 127 | currentBrowserPosition = T.unpack (chrom firstEntry) ++ ":" ++ show (chromStart firstEntry) ++ "-" ++ show (chromEnd firstEntry) 128 | firstEntry = head sortedBedEntries 129 | bed = Bed (T.pack currentBrowserPosition) (T.pack inputBrowserSettings) (T.pack trackName) (T.pack trackDescription) inputBedVisibility inputItemRgb sortedBedEntries 130 | {-| Translate one cmsearch hit into a BedEntry named hitName and colored hitColor. The chrom field is the hit sequence header. Start and end are taken as hitStart and hitEnd on the '+' strand and swapped otherwise (presumably because hits on the other strand are reported with start greater than end; confirm against the cmsearch parser). Score is fixed to 0, the thick region spans the whole entry, and the entry is a single block starting at offset 0 with size end minus start. -} 131 | cmsearchHitToBEDentry :: String -> String -> CMsearchHit -> BedEntry 132 | cmsearchHitToBEDentry hitName hitColor cmHit = entry 133 | where entry = BedEntry chromosome entrystart entryend (Just (T.pack hitName)) entryscore entrystrand thickstart thickend entrycolor blocks blockSize blockStart 134 | chromosome = T.pack (B.unpack (hitSequenceHeader cmHit)) 135 | --entryline = L.unpack (hitSequenceHeader cmHit) ++ "\t" ++ entryStart cmHit ++ "\t" ++ entryEnd cmHit++ "\t" ++ (hitName) ++ "\t" ++ "0" ++ "\t" ++ [(hitStrand cmHit)] ++ "\t" ++ show (hitStart cmHit) ++ "\t" ++ show (hitEnd cmHit) ++ "\t" ++ hitColor ++ "\n" 136 | entrystart = if hitStrand cmHit == '+' then hitStart cmHit else hitEnd cmHit 137 | entryend = if hitStrand cmHit == '+' then hitEnd cmHit else hitStart cmHit 138 | entryscore = Just (0 :: Int) 139 | entrystrand = Just (hitStrand cmHit) 140 | thickstart = Just entrystart 141 | thickend = Just entryend 142 | entrycolor = Just (T.pack hitColor) 143 | blocks = Just (1 :: Int) 144 | blockSize = Just [entryend - entrystart] 145 | blockStart = Just [0 :: Int] 146 | 147 | 148 | --cmsearchHitToBEDentry :: String -> String -> CMsearchHit -> String 149 | --cmsearchHitToBEDentry hitName 
hitColor cmHit = entryline 150 | -- where entryline = L.unpack (hitSequenceHeader cmHit) ++ "\t" ++ entryStart cmHit ++ "\t" ++ entryEnd cmHit++ "\t" ++ (hitName) ++ "\t" ++ "0" ++ "\t" ++ [(hitStrand cmHit)] ++ "\t" ++ show (hitStart cmHit) ++ "\t" ++ show (hitEnd cmHit) ++ "\t" ++ hitColor ++ "\n" 151 | --entrystart = if (hitStrand cmHit) == '+' then show (hitStart cmHit) else show (hitEnd cmHit) 152 | --entryend = if (hitStrand cmHit) == '+' then show (hitEnd cmHit) else show (hitStart cmHit) 153 | {-| Start coordinate of a hit as a String: hitStart on the '+' strand, hitEnd otherwise. -} 154 | entryStart :: CMsearchHit -> String 155 | entryStart cmHit 156 | | hitStrand cmHit == '+' = show (hitStart cmHit) 157 | | otherwise = show (hitEnd cmHit) 158 | {-| End coordinate of a hit as a String: hitEnd on the '+' strand, hitStart otherwise. -} 159 | entryEnd :: CMsearchHit -> String 160 | entryEnd cmHit 161 | | hitStrand cmHit == '+' = show (hitEnd cmHit) 162 | | otherwise = show (hitStart cmHit) 163 | {-| Ordering used when sorting BED entries: compares chromStart first and falls back to orderBedEntryEnd for equal starts. The chrom field is not part of the comparison. -} 164 | orderBedEntry :: BedEntry -> BedEntry -> Ordering 165 | orderBedEntry firstHit secondHit 166 | | start1 > start2 = GT 167 | | start1 < start2 = LT 168 | | otherwise = orderBedEntryEnd firstHit secondHit 169 | where start1 = chromStart firstHit 170 | start2 = chromStart secondHit 171 | {-| Ordering of two BED entries by their chromEnd field only. -} 172 | orderBedEntryEnd :: BedEntry -> BedEntry -> Ordering 173 | orderBedEntryEnd firstHit secondHit 174 | | end1 > end2 = GT 175 | | end1 < end2 = LT 176 | | otherwise = EQ 177 | where end1 = chromEnd firstHit 178 | end2 = chromEnd secondHit 179 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | -*-change-log-*- 2 | 3 | ### 1.8.5 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 6. June 2021 4 | 5 | * Compatibility with ghc 9 6 | * Testing with github actions 7 | 8 | ### 1.8.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 3. 
January 2020 9 | 10 | * Construction start from input alignment for Scan and Alien 11 | * Alien is working fully offline, by using offline taxonomy database 12 | * Improved collection of near identical hits 13 | * RNAlien now uses parallelization 14 | * Fixes for speed regression in taxid positive set computation 15 | 16 | ### 1.7.1 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 12. September 2019 17 | 18 | * Fixed Scan tool global search step 19 | 20 | ### 1.7.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 29. August 2019 21 | 22 | * Added Scan tool 23 | * Changed tracing high similarity candidates 24 | * Fixed regression in parsing input fasta 25 | 26 | ### 1.6.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 19. June 2019 27 | 28 | * Added offline mode for blast calls and sequence retrieval 29 | * Changed to Biobase repository layout 30 | * Added statically linked executables to releases 31 | 32 | ### 1.5.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 4. March 2019 33 | 34 | * Enabled initialization from multi-line fasta 35 | 36 | ### 1.4.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 9. December 2018 37 | 38 | * Switched to Biobase libraries 39 | * RNAlien is now using json based blast requests 40 | 41 | ### 1.3.8 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 3. April 2019 42 | 43 | * Fix for outdated ca-certificates 44 | 45 | ### 1.3.7 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 13. March 2017 46 | 47 | * Removed optimization flags that prevent hackage upload 48 | 49 | ### 1.3.6 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 5. March 2017 50 | 51 | * SelectSequences moved to own repository, removed tool from package 52 | * Clustal result file is now also written without evaluation step 53 | 54 | ### 1.3.5 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 5. 
March 2017 55 | 56 | * Added a commandline switch to check setup and network connection, improved tempdir handling 57 | 58 | ### 1.3.4 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 2. March 2017 59 | 60 | * More changes toward bioconda compatibility, changed compiler optimization flag to -O 61 | 62 | ### 1.3.3 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 1. March 2017 63 | 64 | * Further changes to stack.yaml 65 | 66 | ### 1.3.2 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 8. February 2017 67 | 68 | * Minor fix to stack.yaml for bioconda recipe 69 | 70 | ### 1.3.1 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 6. February 2017 71 | 72 | * Updated version constraints for ClustalParser supporting multi-line consensus secondary structure 73 | 74 | ### 1.3.0 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 20. January 2017 75 | 76 | * Included bugfix from ViennaRNAparser concerning RNAalifold systemcall 77 | 78 | ### 1.2.9 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 8. January 2017 79 | 80 | * Dropped dependency on rnazSelectSequences.pl for evaluation step 81 | * Select sequences can now print a similarity matrix 82 | * Internal sequence selection is substantially faster due to text-metrics 83 | 84 | ### 1.2.8 [Florian Eggenhofer](mailto:egg@cs.uni-freiburg.de) 1. January 2017 85 | 86 | * Added a commandline switch to turn the evaluation step on and off 87 | 88 | ### 1.2.7 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 13. November 2016 89 | 90 | * Fixed a bug in initial connection check with HTTPS 91 | 92 | ### 1.2.6 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 12. November 2016 93 | 94 | * Changed NCBI URL to HTTPS and updated library constraints 95 | 96 | ### 1.2.5 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 26. October 2016 97 | 98 | * Updated stack.yaml 99 | 100 | ### 1.2.4 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 24. 
October 2016 101 | 102 | * Support for GHC-8.0.1 103 | 104 | ### 1.2.3 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 21. October 2016 105 | 106 | * Added cmsearch output to BED12 converter for genome browser integration 107 | * Updated dependency versions and version number output 108 | 109 | ### 1.2.2 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 1. June 2016 110 | 111 | * Fixed a bug building RNAcentral query and improved formatting of 112 | corresponding output 113 | 114 | ### 1.2.1 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 30. May 2016 115 | 116 | * Added RNAcentralRequest utility 117 | * Fixed a bug in parsing RNAcentral response headers 118 | 119 | ### 1.2.0 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 30. May 2016 120 | 121 | * Added cmsearchToBED utility 122 | 123 | ### 1.1.3 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 25. April 2016 124 | 125 | * Fixed wrong description for softmasking commandline switch 126 | * Fixed encoding tabular iteration progress output 127 | 128 | ### 1.1.2 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 18. April 2016 129 | 130 | * Fixed a bug in passing softmasking to blast 131 | * Performance improvements in query selection 132 | 133 | ### 1.1.1 [Florian Eggenhofer](mailto:egg@informatik.uni-freiburg.de) 23. March 2016 134 | 135 | * Added a commandline switch for softmasking 136 | * Improved interface with Alienserver 137 | 138 | ### 1.1.0 [Florian Eggenhofer](mailto:florian.eggenhofer@univie.ac.at) 11. February 2016 139 | 140 | * Update including changes from 1st review 141 | * Cmbuild uses --refine option 142 | * Evaluation now includes RNAcode result, which is a new dependency 143 | * RNAcentral lookup for found sequences via REST interface during evaluation 144 | * Added a new alternative query selection method that filters for entries max. 
pairwise identity 145 | * Added softmasking to blastrequests 146 | * Paralog sequences are now included by default 147 | * Installation of RNAlien is now available via stackage 148 | * Fix several bugs including blasthit coverage filter 149 | * RNAlienStatistics can now parse cmsearch results from multiple cm files as for clans 150 | * RNAlienStatistics includes a switch for using bitscore or evalue cutoffs 151 | 152 | ### 1.0.0 [Florian Eggenhofer](florian.eggenhofer@univie.ac.at) 29. October 2015 153 | 154 | * Initial version 155 | -------------------------------------------------------------------------------- /Dockerfile.dev: -------------------------------------------------------------------------------- 1 | FROM alpine:edge 2 | 3 | RUN apk update 4 | RUN apk add --no-cache musl musl-dev musl-utils musl-dbg ghc ghc-dev ghc-doc cabal zlib-dev zlib zlib-static tar gzip wget 5 | 6 | ADD . source 7 | WORKDIR source 8 | RUN cabal new-update && cabal new-build --enable-executable-static 9 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAlien/build/RNAlien/RNAlien /RNAlien 10 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAlienScan/build/RNAlienScan/RNAlienScan /RNAlienScan 11 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAlienStatistics/build/RNAlienStatistics/RNAlienStatistics /RNAlienStatistics 12 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/cmsearchToBed/build/cmsearchToBed/cmsearchToBed /cmsearchToBed 13 | RUN cp /source/dist-newstyle/build/x86_64-linux/ghc-*/RNAlien-*/x/RNAcentralHTTPRequest/build/RNAcentralHTTPRequest/RNAcentralHTTPRequest /RNAcentralHTTPRequest 14 | RUN cabal new-clean 15 | RUN rm -r /source 16 | RUN apk del musl musl-dev musl-utils musl-dbg ghc ghc-dev ghc-doc cabal zlib-static zlib-dev zlib tar gzip wget 17 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. 
You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 
70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. 
If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. 
For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 
174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 
209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. 
Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 
675 | 676 | -------------------------------------------------------------------------------- /ParserTest.hs: -------------------------------------------------------------------------------- 1 | -- | Parser test script 2 | -- read from file and directly print parsing output 3 | -- runghc -package-db=.cabal-sandbox/x86_64-linux-ghc-7.8.3-packages.conf.d/ ParserTest.hs test.cmstat 4 | module Main where 5 | 6 | import System.Environment (getArgs) 7 | import System.Console.CmdArgs 8 | import System.Directory 9 | import Bio.Sequence.Fasta 10 | import Bio.RNAlienData 11 | import Bio.RNAlienLibrary 12 | import Data.Maybe 13 | import Data.Time 14 | import Data.Either.Unwrap 15 | 16 | main :: IO () 17 | main = do 18 | args <- getArgs 19 | let input_file = (head args) 20 | parseresult <- readCMstat input_file 21 | print (fromRight parseresult) 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![RNAlien](http://www.tbi.univie.ac.at/~egg/RNAlien.png "RNAlien") 2 | ========= 3 | RNAlien is a tool for automatic construction of RNAfamily models from a single sequence. 4 | 5 | It is available as a commandline tool, for testing or construction of few sequences the webservice can be used. 
6 | 7 | The source code of RNAlien is open source and available via GitHub and Hackage (License GPL-3): 8 | 9 | * [![GitHub](https://img.shields.io/github/tag/eggzilla/RNAlien.svg)](https://github.com/eggzilla/RNAlien) [![Build Status](https://travis-ci.org/eggzilla/RNAlien.svg?branch=master)](https://travis-ci.org/eggzilla/RNAlien) [![Hackage](https://img.shields.io/hackage/v/RNAlien.svg)](https://hackage.haskell.org/package/RNAlien) [![Bioconda](https://anaconda.org/bioconda/rnalien/badges/version.svg)](https://anaconda.org/bioconda/rnalien) [![Docker Image Version (latest by date)](https://img.shields.io/docker/v/eggzilla/rnalien)](https://hub.docker.com/repository/docker/eggzilla/rnalien) ![github action: master](https://github.com/eggzilla/RNAlien/actions/workflows/action.yml/badge.svg) 10 | 11 | 12 | ### Installation via bioconda - recommended 13 | 14 | RNAlien can be installed with all tool dependencies via [conda](https://conda.io/docs/install/quick.html). Once you have conda installed simply type: 15 | 16 | conda create -n rnalien185 -c conda-forge -c bioconda rnalien=1.8.5 17 | 18 | Activate the environment in which RNAlien was installed to use it: 19 | 20 | conda activate rnalien185 21 | 22 | To use the offline-mode of the commandline tool additionally following database downloads are required: 23 | 24 | * Download [NCBI Taxonomy Dump](ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz) 25 | ```bash 26 | wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz 27 | tar -xzf new_taxdump.tar.gz 28 | ``` 29 | 30 | * Download [NCBI Blast version 5 database](https://ftp.ncbi.nlm.nih.gov/blast/db/v5) 31 | ```bash 32 | #After installing and activating the RNAlien bioconda environment use update_blastdb.pl 33 | #Show all available databases 34 | update_blastdb.pl --blastdb_version 5 --showall 35 | #Download the nt_v5 database (~about 70 GB in size) 36 | update_blastdb.pl --blastdb_version 5 nt_v5 --decompress 37 | ``` 38 | 
39 | ### Usage 40 | 41 | After installation with bioconda, activating the environment and downloading the files, using the offline mode of the command line tool is recommended. 42 | Following are example calls for the files contained in the test directory of the repository. 43 | Using -c 4 and +RTS -N4 provides 4 cpu threads to the used tool dependencies (e.g. blast, ...) and to RNAlien. 44 | * Single fasta input: 45 | ```bash 46 | RNAlien -i single.fa -c 4 -j -b /pathto/blast5db/nt_v5 -d single -w /pathto/new_taxdump/taxidlineage.dmp +RTS -N4 47 | ``` 48 | 49 | * Multi fasta input: 50 | 51 | ```bash 52 | RNAlien -i testmulti.fa -c 4 -j -b /pathto/blast5db/nt_v5 -d single -w /pathto/new_taxdump/taxidlineage.dmp +RTS -N4 53 | ``` 54 | 55 | * Stockholm alignment (with consensus structure) input: 56 | ```bash 57 | RNAlien -p test.stockholm -c 4 -j -b /pathto/blast5db/nt_v5 -d aln -w /pathto/new_taxdump/taxidlineage.dmp +RTS -N4 58 | ``` 59 | 60 | If you just want to try RNAlien out, or to construct a single family, the online mode can be used. 61 | It does not require database downloads and queries the required information from NCBI webservices. 62 | A stable, uninterrupted internet connection is mandatory. 
63 | 64 | * Single fasta input (online-mode): 65 | ```bash 66 | RNAlien -i single.fa -c 4 -d onsingle +RTS -N4 67 | ``` 68 | To display the possible commandline options run: 69 | 70 | ```bash 71 | RNAlien --help 72 | ``` 73 | For detailed instructions on how to use RNAlien, please see the [Help page](http://rna.tbi.univie.ac.at/rnalien/help). 74 | 75 | TaxonomyTools, which can be used to visualise the organisms included in an RNAlien result, can be found here (License GPL-3): 76 | 77 | * [![GitHub](https://img.shields.io/github/tag/eggzilla/TaxonomyTools.svg)](https://github.com/eggzilla/TaxonomyTools) [![Build Status](https://travis-ci.org/eggzilla/TaxonomyTools.svg?branch=master)](https://travis-ci.org/eggzilla/TaxonomyTools) [![Hackage](https://img.shields.io/hackage/v/TaxonomyTools.svg)](https://hackage.haskell.org/package/TaxonomyTools) 78 | 79 | -------------------------------------------------------------------------------- /RNAlien.cabal: -------------------------------------------------------------------------------- 1 | name: RNAlien 2 | version: 1.8.5 3 | synopsis: Unsupervized construction of RNA family models 4 | description: RNAlien is a tool for automatic construction of RNAfamily models from a single sequence. 5 | . 6 | It is available as a commandline tool, for testing or construction of few sequences the webservice can be used. 7 | . 8 | The source code of RNAlien, as well as the webserver is open source and available via GitHub (License GPL-3): 9 | . 10 | * 11 | . 12 | * 13 | . 14 | TaxonomyTools which can be used to visualise the organisms included in a RNAlien result can be found here (License GPL-3): 15 | . 16 | * 17 | . 18 | * 19 | . 20 | For instruction how to use RNAlien please see the . 21 | . 22 | Dependencies: 23 | . 24 | * 25 | . 26 | * 27 | . 28 | * 29 | . 30 | * 31 | . 32 | * 33 | . 34 | Installation via cabal-install: 35 | . 
36 | > cabal install RNAlien 37 | 38 | license: GPL-3 39 | license-file: LICENSE 40 | author: Florian Eggenhofer 41 | maintainer: egg@informatik.uni-freiburg.de 42 | copyright: Florian Eggenhofer 43 | category: Bioinformatics 44 | build-type: Simple 45 | cabal-version: >= 1.10.0 46 | tested-with: GHC == 8.8, GHC == 8.10, GHC == 9.0 47 | Extra-Source-Files: 48 | README.md ChangeLog.md 49 | 50 | source-repository head 51 | type: git 52 | location: https://github.com/eggzilla/RNAlien 53 | 54 | source-repository this 55 | type: git 56 | location: https://github.com/eggzilla/RNAlien/tree/1.8.5 57 | tag: 1.8.5 58 | 59 | executable RNAlien 60 | Hs-Source-Dirs: ./Biobase/ 61 | main-is: RNAlien.hs 62 | ghc-options: -Wall 63 | default-language: Haskell2010 64 | other-modules: Paths_RNAlien 65 | build-depends: base >=4.5 && <5, cmdargs, directory, 66 | random, containers, RNAlien, time, either-unwrap, filepath, 67 | BiobaseFasta == 0.4.0.*, StockholmAlignment 68 | 69 | executable RNAlienScan 70 | Hs-Source-Dirs: ./Biobase/ 71 | main-is: RNAlienScan.hs 72 | ghc-options: -Wall 73 | default-language: Haskell2010 74 | other-modules: Paths_RNAlien 75 | build-depends: base >=4.5 && <5, cmdargs, directory, 76 | random, containers, RNAlien, time, either-unwrap, filepath, 77 | BiobaseFasta == 0.4.0.*, StockholmAlignment 78 | 79 | executable RNAlienStatistics 80 | Hs-Source-Dirs: ./Biobase/ 81 | main-is: RNAlienStatistics.hs 82 | ghc-options: -Wall 83 | default-language: Haskell2010 84 | other-modules: Paths_RNAlien 85 | build-depends: base >=4.5 && <5, cmdargs, cassava, vector, process, bytestring, 86 | either-unwrap, RNAlien, directory, split, filepath, ViennaRNAParser>=1.3.2, 87 | BiobaseFasta == 0.4.0.*, BiobaseTypes == 0.2.1.* 88 | 89 | executable cmsearchToBed 90 | Hs-Source-Dirs: ./Biobase/ 91 | main-is: cmsearchToBED.hs 92 | ghc-options: -Wall 93 | default-language: Haskell2010 94 | other-modules: Paths_RNAlien 95 | build-depends: base >=4.5 && <5, cmdargs, either-unwrap, 
RNAlien, bytestring, text 96 | 97 | executable RNAcentralHTTPRequest 98 | Hs-Source-Dirs: ./Biobase/ 99 | main-is: RNAcentralHTTPRequest.hs 100 | ghc-options: -Wall 101 | default-language: Haskell2010 102 | other-modules: Paths_RNAlien 103 | build-depends: base >=4.5 && <5, cmdargs, either-unwrap, RNAlien 104 | 105 | Library 106 | Hs-Source-Dirs: . 107 | ghc-options: -Wall -fno-warn-unused-do-bind -fsimpl-tick-factor=500 108 | default-language: Haskell2010 109 | build-depends: base >=4.5 && <5, cmdargs, ViennaRNAParser>=1.3.2, process, directory, 110 | parsec, random, bytestring, Taxonomy >= 2.1.0, either-unwrap, containers, 111 | ClustalParser>=1.3.0, vector, edit-distance, cassava, matrix, hierarchical-clustering, 112 | filepath, HTTP, http-conduit, hxt, network<=2.8.0.0, aeson<=1.6.0.0, text, transformers, 113 | pureMD5, http-types, text-metrics, BiobaseTypes == 0.2.1.*, BiobaseFasta == 0.4.0.*, 114 | BiobaseBlast == 0.3.3.*, BlastHTTP >= 1.4.2, BiobaseHTTP == 1.2.0, silently, StockholmAlignment>=1.3.0, BiobaseEnsembl>=0.2.0.0, parallel, attoparsec 115 | Exposed-Modules: Biobase.RNAlien.Types 116 | Biobase.RNAlien.Library 117 | Biobase.RNAlien.RNAcentralHTTP 118 | Biobase.RNAlien.InfernalParser 119 | Biobase.RNAlien.CMstatParser 120 | -------------------------------------------------------------------------------- /RNAlien.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 20 | 42 | 44 | 45 | 47 | image/svg+xml 48 | 50 | 51 | 52 | 53 | 54 | 59 | RNAlien 70 | 78 | 88 | 96 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /RNAlienScan.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 20 | 44 | 49 | 50 | 52 | 53 | 55 | image/svg+xml 56 | 58 | 59 | 60 | 61 | 62 | 67 | RNAlienScan 78 | 86 | 93 | 101 | 108 | 111 | 124 | 137 | 150 | 151 | 152 | 153 | 
-------------------------------------------------------------------------------- /cabal.project: -------------------------------------------------------------------------------- 1 | packages: . 2 | 3 | optional-packages: ./*/*.cabal 4 | 5 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | { mkDerivation, aeson, base, BiobaseBlast, BiobaseFasta, BlastHTTP 2 | , bytestring, cassava, ClustalParser, cmdargs, containers 3 | , directory, edit-distance, either-unwrap, filepath 4 | , hierarchical-clustering, HTTP, http-conduit, http-types, hxt 5 | , matrix, network, parsec, process, pureMD5, random, split, stdenv 6 | , Taxonomy, text, text-metrics, time, transformers, vector 7 | , ViennaRNAParser 8 | }: 9 | mkDerivation { 10 | pname = "RNAlien"; 11 | version = "1.5.0"; 12 | src = ./.; 13 | isLibrary = true; 14 | isExecutable = true; 15 | libraryHaskellDepends = [ 16 | aeson base BiobaseBlast BiobaseFasta BlastHTTP bytestring cassava 17 | ClustalParser cmdargs containers directory edit-distance 18 | either-unwrap filepath hierarchical-clustering HTTP http-conduit 19 | http-types hxt matrix network parsec process pureMD5 random 20 | Taxonomy text text-metrics transformers vector ViennaRNAParser 21 | ]; 22 | executableHaskellDepends = [ 23 | base BiobaseFasta bytestring cassava cmdargs containers directory 24 | either-unwrap filepath process random split text time vector 25 | ViennaRNAParser 26 | ]; 27 | description = "Unsupervized construction of RNA family models"; 28 | license = stdenv.lib.licenses.gpl3; 29 | } 30 | -------------------------------------------------------------------------------- /envhs.nix: -------------------------------------------------------------------------------- 1 | with (import {}); 2 | hsDevFunctions ./. 
3 | -------------------------------------------------------------------------------- /manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eggzilla/RNAlien/94937c39ad20bbe7150cc865434c9feadd317090/manual.pdf -------------------------------------------------------------------------------- /overrides.nix: -------------------------------------------------------------------------------- 1 | self: 2 | 3 | let 4 | tgz = builtins.fetchTarball { 5 | url = "http://hackage.haskell.org/package/semirings-0.3.1.1/semirings-0.3.1.1.tar.gz"; 6 | sha256 = "1wi4g4xk3vjqig2mrgdc09ygwcdirlpky00xikak8cndkydcm2za"; 7 | }; 8 | in 9 | 10 | { semirings = tgz; 11 | } 12 | -------------------------------------------------------------------------------- /scripts/AlienBenchmarkCMCompare.sh: -------------------------------------------------------------------------------- 1 | #Alien Benchmark 2 | #!/bin/bash 3 | #$ -t 1-56 #This will start the job for each sRNA Rfam family 4 | #$ -l mem_free=10G 5 | #$ -j yes 6 | #$ -o /scratch/egg/temp/ 7 | #$ -e /scratch/egg/temp/ 8 | #$ -l hostname="tc00|tc01|tc02|tc03|tc04" 9 | #$ -N area54 10 | #alienrun 11 | if [ -f /scr/kronos/egg/AlienStructuredResultsCollected4/$SGE_TASK_ID/done ]; then 12 | cmComparevsRfam.pl $SGE_TASK_ID 13 | sleep 1 14 | echo "File not found!" 15 | fi 16 | -------------------------------------------------------------------------------- /scripts/AlienBenchmarkModels.sh: -------------------------------------------------------------------------------- 1 | #Alien Benchmark 2 | #!/bin/bash 3 | #$ -t 1-373 #This will start the job for each sRNA Rfam family 4 | #$ -pe para 7 5 | #$ -l mem_free=34.9G 6 | #$ -j yes 7 | #$ -o /scratch/egg/temp/ 8 | #$ -e /scratch/egg/temp/ 9 | #$ -l hostname="xc00|xc01|xc02|xc03|xc04|xc05|xc06|xc07|xc08" 10 | #$ -N area54 11 | #alienrun 12 | if [ ! 
-f /scratch/egg/AlienResultsCollected/$SGE_TASK_ID/done ]; then 13 | /home/mescalin/egg/current/Projects/Haskell/RNAlien/dist/build/RNAlien/RNAlien -i /scr/kronos/egg/AliensRNATestSet/$SGE_TASK_ID.fa -c 7 -t "$( /scratch/egg/temp/$SGE_TASK_ID.alienout 14 | sleep 1 15 | echo "File not found!" 16 | fi 17 | -------------------------------------------------------------------------------- /scripts/alienresultstatistics.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #./alienresultstatistics structured 11 bitscore 3 | use warnings; 4 | use strict; 5 | use diagnostics; 6 | #use utf8; 7 | use Data::Dumper; 8 | use Cwd; 9 | use Switch; 10 | $|=1; 11 | #decideds which benchmark data to process 12 | my $type = $ARGV[0]; 13 | #result iteration 14 | my $currentresultnumber = $ARGV[1]; 15 | #threshold selection (bitscore, evalue) 16 | my $threshold_selection = $ARGV[2]; 17 | #use clans for specificity check 18 | my $use_clans = 1; 19 | #Sequences to use (seed,full) 20 | my $use_sequences="seed"; 21 | 22 | 23 | #contains all RNAlien result folders for sRNA tagged families 24 | my $alienresult_basename; 25 | #contains all Rfam Families names by family name with extension .cm 26 | my $rfammodel_basename; 27 | #contains all full seed alignment sequences as RfamID .fa fasta files 28 | my $rfamfasta_basename; 29 | my $RNAFamilyIdFile; 30 | my $familyNumber; 31 | my $resulttempdir; 32 | my $resultfileprefix; 33 | my $cpu_cores = 30; 34 | 35 | 36 | if($type eq "background"){ 37 | $alienresult_basename="/scr/coridan/egg/AlienBackgroundCollected" . "$currentresultnumber" . 
"/"; 38 | $rfammodel_basename = "/scr/coridan/egg/AlienTest/sRNAFamilies/all_models/"; 39 | if($use_sequences eq "full"){ 40 | $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyfasta/"; #full fasta 41 | }else{ 42 | $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyseedfasta/"; #seed fasta 43 | } 44 | $RNAFamilyIdFile = "/scr/coridan/egg/backgroundFamilyNameIdGatheringCutoffSorted"; 45 | $familyNumber = 712; 46 | $resulttempdir = "/scr/coridan/egg/temp/AlienRandomResultStatistics". "$currentresultnumber" . "/"; 47 | $resultfileprefix = "structuredalienbackgroundoutput"; 48 | }elsif($type eq "structured"){ 49 | $alienresult_basename="/scr/kronos/egg/AlienStructuredResultsCollected" . "$currentresultnumber" . "/"; 50 | $rfammodel_basename = "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/"; 51 | if($use_sequences eq "full"){ 52 | $rfamfasta_basename = "/scr/kronos/egg/rfamfamilyfasta/"; #full fasta 53 | }else{ 54 | $rfamfasta_basename = "/scr/kronos/egg/rfamfamilyseedfasta/"; #seed fasta 55 | } 56 | $RNAFamilyIdFile = "/scr/kronos/egg/structuredFamilyNameIdGatheringCutoffSorted"; 57 | $familyNumber = 56; 58 | #$familyNumber = 72; old number includes first mini background set 59 | $resulttempdir = "/scr/kronos/egg/temp/AlienStructuredResultStatistics". "$currentresultnumber" . "/"; 60 | $resultfileprefix = "structuredalien". $use_sequences ."output"; 61 | }elsif($type eq "diverse"){ 62 | $alienresult_basename="/scr/kronos/egg/AlienDiverseResultsCollected" . "$currentresultnumber" . 
"/"; 63 | $rfammodel_basename = "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/"; 64 | if($use_sequences eq "full"){ 65 | $rfamfasta_basename = "/scr/kronos/egg/rfamfamilyfasta/"; #full fasta 66 | }else{ 67 | $rfamfasta_basename = "/scr/kronos/egg/rfamfamilyseedfasta2/"; #seed fasta 68 | } 69 | 70 | #$RNAFamilyIdFile = "/scr/kronos/egg/diverse_families/result_diverse_families"; 71 | $RNAFamilyIdFile = "/scr/kronos/egg/diverse_families/test2"; 72 | $familyNumber = 191; 73 | $resulttempdir = "/scr/kronos/egg/temp/AlienDiverseResultStatistics". "$currentresultnumber" . "/"; 74 | $resultfileprefix = "diversealien" . $use_sequences . "output"; 75 | }elsif($type eq "blast"){ 76 | $alienresult_basename="/scr/coridan/egg/blastout/"; 77 | $rfammodel_basename = "/scr/coridan/egg/AlienTest/sRNAFamilies/all_models/"; 78 | if($use_sequences eq "full"){ 79 | $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyfasta/"; #full fasta 80 | }else{ 81 | $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyseedfasta/"; #seed fasta 82 | } 83 | 84 | $RNAFamilyIdFile = "/scr/coridan/egg/structuredFamilyNameIdGatheringCutoffSorted"; 85 | $familyNumber = 56; 86 | $resulttempdir = "/scr/coridan/egg/temp/AlienBlastResultStatistics/"; 87 | $resultfileprefix = "blastalien" . $use_sequences . "output"; 88 | }elsif($type eq "nhmmer"){ 89 | $alienresult_basename="/scr/coridan/egg/nhmmerout/"; 90 | $rfammodel_basename = "/scr/coridan/egg/AlienTest/sRNAFamilies/all_models/"; 91 | if($use_sequences eq "full"){ 92 | $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyfasta/"; #full fasta 93 | }else{ 94 | $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyseedfasta/"; #seed fasta 95 | } 96 | $RNAFamilyIdFile = "/scr/coridan/egg/structuredFamilyNameIdGatheringCutoffSorted"; 97 | $familyNumber = 56; 98 | $resulttempdir = "/scr/coridan/egg/temp/AlienHmmerResultStatistics/"; 99 | $resultfileprefix = "hmmer" . $use_sequences . 
"output"; 100 | }else{ 101 | #sRNA 102 | $alienresult_basename="/scr/kronos/egg/AlienResultsCollected" . "$currentresultnumber" . "/"; 103 | $rfammodel_basename = "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/"; 104 | if($use_sequences eq "full"){ 105 | $rfamfasta_basename = "/scr/kronos/egg/rfamfamilyfasta/"; #full fasta 106 | }else{ 107 | $rfamfasta_basename = "/scr/kronos/egg/rfamfamilyseedfasta/"; #seed fasta 108 | } 109 | $RNAFamilyIdFile = "/scr/kronos/egg/smallRNAtaggedfamiliesNameIDThresholdTagSorted.csv"; 110 | $familyNumber = 374; 111 | $resulttempdir = "/scr/kronos/egg/temp/AlienResultStatistics" . "$currentresultnumber" . "/"; 112 | $resultfileprefix = "alienseedoutput"; 113 | } 114 | 115 | my @RNAfamilies; 116 | open(my $RNAfamilyfh, "<", $RNAFamilyIdFile) 117 | or die "Failed to open file: $!\n"; 118 | while(<$RNAfamilyfh>) { 119 | chomp; 120 | push @RNAfamilies, $_; 121 | } 122 | close $RNAfamilyfh; 123 | unless (-d $resulttempdir){ 124 | mkdir $resulttempdir or die "Cannot create result tempdir: $!"; 125 | }else{ 126 | #system "rm -r $resulttempdir" or die "Cannot create result tempdir: $!"; 127 | #mkdir $resulttempdir or die "Cannot create result tempdir: $!"; 128 | } 129 | my $output_directory_path = "/scr/coridan/egg/$resultfileprefix$currentresultnumber/"; 130 | unless (-d $output_directory_path){ 131 | mkdir $output_directory_path or die "Cannot create output dir: $!"; 132 | } 133 | 134 | 135 | my $gathering_score_multiplier = 1.0; 136 | my $gathering_score_lower_bound; 137 | if ($threshold_selection eq "bitscore"){ 138 | alienresultstatistic($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,$gathering_score_multiplier,$gathering_score_lower_bound,"$output_directory_path" . "bs-" . $gathering_score_multiplier . 
".tsv",$cpu_cores,$threshold_selection,"evalue threshold",$use_clans,$type); 139 | }else{ 140 | my @evalues = qw(1 1e-3 1e-6 1e-9); 141 | foreach my $evalue (@evalues){ 142 | my $outputfilePath = "$output_directory_path" . "ev-" . $evalue . ".tsv"; 143 | alienresultstatistic($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,$gathering_score_multiplier,$gathering_score_lower_bound,$outputfilePath,$cpu_cores,$threshold_selection,$evalue,$use_clans,$type); 144 | } 145 | } 146 | 147 | sub alienresultstatistic{ 148 | my $familyNumber = shift; 149 | my $alienresult_basename = shift; 150 | my $rfammodel_basename = shift; 151 | my $rfamfasta_basename = shift; 152 | my $RNAFamilyIdFile = shift; 153 | my $resulttempdir = shift; 154 | my $gathering_score_multiplier = shift; 155 | my $gathering_score_lower_bound = shift; 156 | my $outputfilePath = shift; 157 | my $cpu_cores = shift; 158 | my $thresholdSelection = shift; 159 | my $evalueThreshold = shift; 160 | my $use_clans = shift; 161 | my $type = shift; 162 | my $output="Index\tRfamName\tRfamId\tLinkscore\trfamMaxLS\talienMaxLS\trfamGatheringThreshold\talienGatheringThreshold\trfamFastaNumber\talienFastaNumber\trfamonAlienNumber\talienonRfamNumber\tRfamonAlienRecovery\tAlienonRfamRecovery\tmeanPairwiseIdentity\tshannonEntropy\tgcContent\tmeanSingleSequenceMFE\tconsensusMFE\tenergyContribution\tcovarianceContribution\tcombinationsPair\tmeanZScore\tSCI\tsvmDecisionValue\tsvmRNAClassProbability\tprediction\tRclowestpv\tRcclass\tstatSequenceNumber\tstatEffectiveSequences\tstatConsensusLength\tstatW\tstatBasepairs\tstatBifurcations\tstatModel\trelativeEntropyCM\trelativeEntropyHMM\n"; 163 | my $clanMembersFile = "/scr/kronos/egg/clans/family_clan"; 164 | my %clan_members; 165 | #open(my $clanMembersfh, "<", $clanMembersFile) 166 | # or die "Failed to open file: $!\n"; 167 | # while(<$clanMembersfh>) { 168 | # chomp; 169 | #add to hash 170 | # my @line = split('\t',$_); 
171 | #print "$line[0] - $line[1]"; 172 | #push( @{ $clan_members {$line[0] } }, $line[1]); 173 | #$clan_members{$line[0]}=$line[1]; 174 | # } 175 | # close $clanMembersfh; 176 | 177 | for(my $counter=1; $counter <= $familyNumber; $counter++){ 178 | my $current_alienresult_folder= $alienresult_basename.$counter."/"; 179 | if(-e $alienresult_basename.$counter."/result.cm"){ 180 | my $alienModelPath = $current_alienresult_folder."result.cm"; 181 | my $alienFastaPath = $current_alienresult_folder."result.fa"; 182 | my $alienRNAzPath = $current_alienresult_folder."result.rnaz"; 183 | my $alienRNAcodePath = $current_alienresult_folder."result.rnacode"; 184 | my $aliencmstatPath = $current_alienresult_folder."result.cmstat"; 185 | #retrieve family specific information 186 | my @rfamModelNameId = split(/\s+/,$RNAfamilies[($counter - 1)]); 187 | #my @rfamModelNameId = split(/\s+/,$RNAfamilies[($counter)]); 188 | my $rfamModelName = $rfamModelNameId[0]; 189 | my $rfamModelId = $rfamModelNameId[1]; 190 | my $rfamModelPath; 191 | my $use_clans=0; 192 | if($use_clans == 1){ 193 | #check if key exists 194 | if(exists $clan_members{$rfamModelId}){ 195 | #my $clan_for_rfammodel = $clan_members{$rfamModelId}; 196 | $rfamModelPath = "/scr/kronos/egg/clans/clan_models/". "$clan_members{$rfamModelId}". ".cm"; 197 | print "For $rfamModelId, set path to: /scr/kronos/egg/clans/clan_models/". "$clan_members{$rfamModelId}\n"; 198 | }else{ 199 | $rfamModelPath = $rfammodel_basename . $rfamModelId . ".cm"; 200 | print "For $rfamModelId, set path to: $rfammodel_basename . $rfamModelId" . ".cm\n"; 201 | } 202 | }else{ 203 | $rfamModelPath = $rfammodel_basename . $rfamModelId . ".cm"; 204 | } 205 | #my $rfamModelPath = $rfammodel_basename . $rfamModelId . ".cm"; 206 | my $rfamFastaPath =$rfamfasta_basename . $rfamModelId . ".fa"; 207 | if(! -e $rfamModelPath){ 208 | print "Does not exist: $rfamModelPath "; 209 | } 210 | if(! 
-e $rfamFastaPath){ 211 | print "Does not exist: $rfamFastaPath "; 212 | } 213 | 214 | if(! -e $alienModelPath){ 215 | print "Does not exist: $alienModelPath "; 216 | } 217 | if(! -e $alienFastaPath){ 218 | print "Does not exist: $alienFastaPath"; 219 | } 220 | #set threshold corresponding to bitscore or evalue cutoff 221 | my $threshold; 222 | my $databaseSize; 223 | if($thresholdSelection eq "bitscore"){ 224 | my $rfamThresholdUnmodified = $rfamModelNameId[2]; 225 | my $rfamThreshold; 226 | unless ($rfamThresholdUnmodified eq "-"){ 227 | $rfamThreshold = $rfamThresholdUnmodified * $gathering_score_multiplier; 228 | }else{ 229 | $rfamThreshold= "0"; 230 | } 231 | if(defined $gathering_score_lower_bound){ 232 | if($rfamThreshold < $gathering_score_lower_bound){ 233 | $rfamThreshold = $gathering_score_lower_bound; 234 | } 235 | } 236 | $threshold = $rfamThreshold; 237 | $databaseSize = ""; 238 | }else{ 239 | $threshold = $evalueThreshold; 240 | $databaseSize = setdatabasesize($counter,$type); 241 | } 242 | $output = $output . `RNAlienStatistics $databaseSize -s $thresholdSelection -c $cpu_cores -n $rfamModelName -d $rfamModelId -b $counter -i $alienModelPath -r $rfamModelPath -a $alienFastaPath -g $rfamFastaPath -t $threshold -x $threshold -o $resulttempdir -w $alienRNAcodePath -z $alienRNAzPath -m $aliencmstatPath`; 243 | print "RNAlienStatistics $databaseSize -s $thresholdSelection -c $cpu_cores -n $rfamModelName -d $rfamModelId -b $counter -i $alienModelPath -r $rfamModelPath -a $alienFastaPath -g $rfamFastaPath -t $threshold -x $threshold -o $resulttempdir -z $alienRNAzPath -m $aliencmstatPath"."\n"; 244 | }else{ 245 | $output = $output . "$counter" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" ."-" . "\t" . "-" . "\t" . "-" . "\t" . "0" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . "\t" . "-" . 
"\t" . "-" . "\t" . "-" . "\t" . "-\t" . "-" . "\t" . "-\t" . "-\t" . "-\t" . "-\t" . "-\t" . "-\t" . "-\t" . "-\t" . "-\n"; 246 | print "Does not exist $alienresult_basename.$counter/done"; 247 | } 248 | } 249 | 250 | open(my $outputfh, ">", $outputfilePath) 251 | or die "Failed to open file: $!\n"; 252 | print $outputfh $output; 253 | close $outputfh; 254 | return 1; 255 | } 256 | 257 | sub setdatabasesize{ 258 | my $counter = shift; 259 | my $type = shift; 260 | my $databasesize; 261 | if($type eq "diverse"){ 262 | switch ($counter) { 263 | case 7 { $databasesize = 1; } #RNaseP_bact_b 264 | case 12 { $databasesize = 1; } #PrfA 265 | case 13 { $databasesize = 1; } #CopA 266 | case 14 { $databasesize = 1; } #FMN 267 | case 19 { $databasesize = 1; } #RNAI 268 | case 20 { $databasesize = 1; } #SIB_RNA 269 | case 23 { $databasesize = 1; } #Purine 270 | case 24 { $databasesize = 1; } #SSU_rRNA_bacteria 271 | case 26 { $databasesize = 1; } #glmS 272 | case 27 { $databasesize = 1; } #ctRNA_pGA1 273 | case 28 { $databasesize = 1; } #RNA-OUT 274 | case 29 { $databasesize = 1; } #ctRNA_pT181 275 | case 36 { $databasesize = 1; } #ydaO-yuaA 276 | case 38 { $databasesize = 1; } #Pox_AX_element 277 | case 39 { $databasesize = 1; } #IBV_D-RNA 278 | case 45 { $databasesize = 1; } #ROSE 279 | case 48 { $databasesize = 1; } #HCV_SLVII 280 | case 49 { $databasesize = 1; } #HCV_SLIV 281 | case 51 { $databasesize = 1; } #HIV_FE 282 | case 56 { $databasesize = 1; } #RNAIII 283 | case 57 { $databasesize = 1; } #Thr_leader 284 | case 59 { $databasesize = 1; } #Leu_leader 285 | case 60 { $databasesize = 1; } #Trp_leader 286 | case 61 { $databasesize = 1; } #His_leader 287 | case 62 { $databasesize = 1; } #PreQ1 288 | case 63 { $databasesize = 1; } #Flavivirus_DB 289 | case 67 { $databasesize = 1; } #L13_leader 290 | case 68 { $databasesize = 1; } #L19_leader 291 | case 69 { $databasesize = 1; } #L20_leader 292 | case 70 { $databasesize = 1; } #L21_leader 293 | case 75 { $databasesize = 
1; } #P1 294 | case 76 { $databasesize = 1; } #P24 295 | case 85 { $databasesize = 1; } #preQ1-II 296 | case 86 { $databasesize = 1; } #MOCO_RNA_motif 297 | case 87 { $databasesize = 1; } #RF_site2 298 | case 88 { $databasesize = 1; } #RF_site3 299 | case 89 { $databasesize = 1; } #RF_site5 300 | case 90 { $databasesize = 1; } #RF_site9 301 | case 91 { $databasesize = 1; } #PK-G12rRNA 302 | case 105 { $databasesize = 1; } #AHBV_epsilon 303 | case 106 { $databasesize = 1; } #CRISPR-DR2 304 | case 107 { $databasesize = 1; } #CRISPR-DR3 305 | case 108 { $databasesize = 1; } #CRISPR-DR5 306 | case 109 { $databasesize = 1; } #CRISPR-DR7 307 | case 110 { $databasesize = 1; } #CRISPR-DR35 308 | case 111 { $databasesize = 1; } #CRISPR-DR53 309 | case 112 { $databasesize = 1; } #CRISPR-DR55 310 | case 113 { $databasesize = 1; } #CRISPR-DR60 311 | case 114 { $databasesize = 1; } #CRISPR-DR61 312 | case 115 { $databasesize = 1; } #CRISPR-DR65 313 | case 116 { $databasesize = 1; } #isrA 314 | case 117 { $databasesize = 1; } #istR 315 | case 120 { $databasesize = 1; } #NrrF 316 | case 121 { $databasesize = 1; } #VrrA 317 | case 122 { $databasesize = 1; } #MFR 318 | case 126 { $databasesize = 1; } #AdoCbl-variant 319 | case 127 { $databasesize = 1; } #Lnt 320 | case 128 { $databasesize = 1; } #cspA 321 | case 129 { $databasesize = 1; } #SMK_box_riboswitch 322 | case 130 { $databasesize = 1; } #rnk_leader 323 | case 131 { $databasesize = 1; } #RatA 324 | case 132 { $databasesize = 1; } #blv_FSE 325 | case 133 { $databasesize = 1; } #FourU 326 | case 134 { $databasesize = 1; } #fstAT 327 | case 135 { $databasesize = 1; } #HSUR 328 | case 136 { $databasesize = 1; } #Lambda_thermo 329 | case 138 { $databasesize = 1; } #MicX 330 | case 139 { $databasesize = 1; } #symR 331 | case 140 { $databasesize = 1; } #PtaRNA1 332 | case 141 { $databasesize = 1; } #rdlD 333 | case 142 { $databasesize = 1; } #ROSE 334 | case 143 { $databasesize = 1; } #HIV_FS2 335 | case 144 { $databasesize = 1; } 
#ovine_lenti_FSE 336 | case 145 { $databasesize = 1; } #veev_FSE 337 | case 153 { $databasesize = 1; } #SSU_rRNA_archaea 338 | case 155 { $databasesize = 1; } #HEARO 339 | case 156 { $databasesize = 1; } #STnc630 340 | case 157 { $databasesize = 1; } #STnc370 341 | case 158 { $databasesize = 1; } #STnc180 342 | case 159 { $databasesize = 1; } #OrzO-P 343 | case 161 { $databasesize = 1; } #tfoR 344 | case 162 { $databasesize = 1; } #IS009 345 | case 169 { $databasesize = 1; } #sX5 346 | case 170 { $databasesize = 1; } #sX11 347 | case 174 { $databasesize = 1; } #hsp17 348 | case 175 { $databasesize = 1; } #PyrG_leader 349 | case 176 { $databasesize = 1; } #PyrD_leader 350 | case 177 { $databasesize = 1; } #Ms_AS-8 351 | case 183 { $databasesize = 1; } #ohsC_RNA 352 | case 185 { $databasesize = 1; } #ToxI 353 | case 186 { $databasesize = 1; } #ROSE_3 354 | else { $databasesize = 1000; } 355 | } 356 | }elsif($type eq "sRNA"){ 357 | $databasesize = 1000; 358 | }elsif($type eq "background"){ 359 | $databasesize = 1000; 360 | }else{ 361 | switch ($counter) { 362 | case 7 { $databasesize = 1; } #RNaseP_bact_a 363 | case 8 { $databasesize = 1; } #RNaseP_bact_b 364 | case 16 { $databasesize = 1; } #phageP-RNA 365 | case 17 { $databasesize = 1; } #FMN 366 | case 19 { $databasesize = 1; } #S15 367 | case 20 { $databasesize = 1; } #SAM 368 | case 22 { $databasesize = 1; } #Purin 369 | case 23 { $databasesize = 1; } #Lysine 370 | case 24 { $databasesize = 1; } #Bacterial_small_SRP 371 | case 25 { $databasesize = 1; } #Cobalamin 372 | case 26 { $databasesize = 1; } #HIV-1_DIS 373 | case 27 { $databasesize = 1; } #SSU_rRNA_bacteria 374 | case 29 { $databasesize = 1; } #IRES_Pesti 375 | case 30 { $databasesize = 1; } #glmS 376 | case 32 { $databasesize = 1; } #ykoK 377 | case 33 { $databasesize = 1; } #IRES_Cripavirus 378 | case 34 { $databasesize = 1; } #HIV_FE 379 | case 35 { $databasesize = 1; } #TCV_H5 380 | case 36 { $databasesize = 1; } #Glycine 381 | case 39 { $databasesize 
= 1; } #c-di-GMP-I 382 | case 40 { $databasesize = 1; } #preQ1-II 383 | case 42 { $databasesize = 1; } #PK-G12rRNA 384 | case 43 { $databasesize = 1; } #HIV-1_SD 385 | case 44 { $databasesize = 1; } #MFR 386 | case 45 { $databasesize = 1; } #AdoCbl-variant 387 | case 46 { $databasesize = 1; } #crcB 388 | case 47 { $databasesize = 1; } #c-di-GMP-II 389 | case 48 { $databasesize = 1; } #THF 390 | case 51 { $databasesize = 1; } #Archea_SRP 391 | case 56 { $databasesize = 1; } #ToxI 392 | else { $databasesize = 1000; } 393 | } 394 | } 395 | return " -k $databasesize "; 396 | 397 | } 398 | -------------------------------------------------------------------------------- /scripts/alienstructurestatistics.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #./scripts/alienstructurestatistics.pl structured 13 3 | # Computes and plots structure distance among alien benchmark sets and versus Rfam 4 | 5 | # 1. Computes the normalized distance changes over iterations 6 | # 3. Computes the average normalized distance changes over iterations 7 | # 2. Computes the distance between updated structure and normal structure over iterations 8 | # 4. Computes the average distance between updated structure and normal structure over iterations 9 | # 5. Compute the normalized distance between iteration and Rfam consensus 10 | # 6. 
Compute the average normalized distance between iteration and Rfam consensus 11 | 12 | use warnings; 13 | use strict; 14 | use diagnostics; 15 | use Data::Dumper; 16 | use Cwd; 17 | $|=1; 18 | #decides which benchmark data to process 19 | my $type = $ARGV[0]; 20 | #result iteration 21 | my $currentresultnumber = $ARGV[1]; 22 | #contains all RNAlien result folders for sRNA tagged families 23 | my $alienresult_basename; 24 | #contains all Rfam Families names by family name with extension .cm 25 | my $rfammodel_basename; 26 | #contains all full seed alignment sequences as RfamID .fa fasta files 27 | my $rfamfasta_basename; 28 | #contains seed alignments as RfamID .fa fasta files 29 | my $rfamstockholm_basename; 30 | 31 | my $RNAFamilyIdFile; 32 | my $familyNumber; 33 | my $resulttempdir; 34 | 35 | if($type eq "structured"){ 36 | $alienresult_basename="/scr/coridan/egg/AlienStructuredResultsCollected" . "$currentresultnumber" . "/"; 37 | $rfamstockholm_basename = "/scr/coridan/egg/structuredfamilyrfamstockholm/"; 38 | $rfamfasta_basename = "/scr/coridan/egg/rfamfamilyseedfasta/"; 39 | $RNAFamilyIdFile = "/scr/coridan/egg/structuredFamilyNameIdGatheringCutoffSorted"; 40 | $familyNumber = 56; 41 | $resulttempdir = "/scr/coridan/egg/temp/AlienStructuredResultStatistics". "$currentresultnumber" . "/"; 42 | }else{ 43 | #sRNA 44 | $alienresult_basename="/scr/kronos/egg/AlienResultsCollected" . "$currentresultnumber" . "/"; 45 | $rfammodel_basename = "/scr/kronos/egg/AlienTest/sRNAFamilies/all_models/"; 46 | $RNAFamilyIdFile = "/scr/kronos/egg/smallRNAtaggedfamiliesNameIDThresholdTagSorted.csv"; 47 | $familyNumber = 374; 48 | $resulttempdir = "/scr/kronos/egg/temp/AlienResultStatistics" . "$currentresultnumber" . 
"/"; 49 | } 50 | 51 | #Distance comparison between first stockholms of constructions with and without structureupdate 52 | #normalizedDistanceBetweenFirstStockholms($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,"/scratch/egg/"); 53 | unless(-d "/scr/kronos/egg/iterationdistance$currentresultnumber/"){ 54 | mkdir "/scr/kronos/egg/iterationdistance$currentresultnumber/"; 55 | } 56 | distanceBetweenAlienRfamStockholms($familyNumber,$alienresult_basename,$rfamstockholm_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,"/scr/kronos/egg/iterationdistance$currentresultnumber/"); 57 | #normalizedDistanceOverIterations($familyNumber,$alienresult_basename,$rfammodel_basename,$rfamfasta_basename,$RNAFamilyIdFile,$resulttempdir,"/scr/kronos/egg/iterationdistance$currentresultnumber/"); 58 | 59 | sub distanceBetweenAlienRfamStockholms{ 60 | #retrieve common sequence identifier 61 | #compare stockholmstructre and parse result back 62 | my $familyNumber = shift; 63 | my $alienresult_basename = shift; 64 | my $rfamstockholm_basename = shift; 65 | my $rfamfasta_basename = shift; 66 | my $RNAFamilyIdFile = shift; 67 | my $resulttempdir = shift; 68 | my $resultfolderpath = shift; 69 | my $outputfilePath= $resultfolderpath . 
"distancestructureupdatenone.dist"; 70 | my $output; 71 | for(my $counter=1; $counter <= $familyNumber; $counter++){ 72 | my $current_alienresult_folder= $alienresult_basename.$counter."/"; 73 | if(-e $alienresult_basename.$counter."/done"){ 74 | #print "$alienresult_basename$counter\n"; 75 | my $fstStockholmPath = "$rfamstockholm_basename/$counter.stockholm"; 76 | my $sndStockholmPath = "$alienresult_basename"."$counter"."/result.stockholm"; 77 | my $inputFastaPath = "$alienresult_basename"."$counter"."/result.fa"; 78 | if(-e $inputFastaPath){ 79 | my @fastacontent; 80 | open(my $fastafh, "<", $inputFastaPath) 81 | or die "Failed to open file: $!\n"; 82 | while(<$fastafh>) { 83 | chomp; 84 | push @fastacontent, $_; 85 | } 86 | close $fastafh; 87 | my $fasta_identifier = $fastacontent[0]; 88 | $fasta_identifier =~ s/>//; 89 | #$fasta_identifier =~ s/\\K.+$//; 90 | if(-e $fstStockholmPath){ 91 | $output = $output . `~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i $fasta_identifier -a $fstStockholmPath -r $sndStockholmPath -d P -o $resultfolderpath`; 92 | }else{ 93 | $output = $output . "no stockholm found\n"; 94 | } 95 | 96 | } 97 | }else{ 98 | $output = $output . "no inputfasta found\n"; 99 | } 100 | } 101 | 102 | open(my $outputfh, ">", $outputfilePath) 103 | or die "Failed to open file: $!\n"; 104 | print $outputfh $output; 105 | close $outputfh; 106 | return 1; 107 | } 108 | 109 | 110 | sub normalizedDistanceBetweenFirstStockholms{ 111 | #retrieve common sequence identifier 112 | #compare stockholmstructre and parse result back 113 | my $familyNumber = shift; 114 | my $alienresult_basename = shift; 115 | my $rfammodel_basename = shift; 116 | my $rfamfasta_basename = shift; 117 | my $RNAFamilyIdFile = shift; 118 | my $resulttempdir = shift; 119 | my $resultfolderpath = shift; 120 | my $outputfilePath= $resultfolderpath . 
"distancestructureupdatenone.dist"; 121 | my $output; 122 | for(my $counter=1; $counter <= $familyNumber; $counter++){ 123 | my $current_alienresult_folder= $alienresult_basename.$counter."/"; 124 | if(-e $alienresult_basename.$counter."/done"){ 125 | #print "$alienresult_basename$counter\n"; 126 | my $fstStockholmPath = findStockholm("/scratch/egg/AlienStructuredResultsCollected12/$counter/"); 127 | my $sndStockholmPath = findStockholm("/scratch/egg/AlienStructuredResultsCollected13/$counter/"); 128 | my $inputFastaPath = findInputFasta($current_alienresult_folder); 129 | if(-e $inputFastaPath){ 130 | my @fastacontent; 131 | open(my $fastafh, "<", $inputFastaPath) 132 | or die "Failed to open file: $!\n"; 133 | while(<$fastafh>) { 134 | chomp; 135 | push @fastacontent, $_; 136 | } 137 | close $fastafh; 138 | my $fasta_identifier = $fastacontent[0]; 139 | $fasta_identifier =~ s/>//; 140 | $fasta_identifier =~ s/\\K.+$//; 141 | if(-e $fstStockholmPath){ 142 | $output = $output . `~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i $fasta_identifier -a $fstStockholmPath -r $sndStockholmPath -o /scratch/egg/temp/`; 143 | }else{ 144 | $output = $output . "no stockholm found\n"; 145 | } 146 | 147 | } 148 | }else{ 149 | $output = $output . 
"no inputfasta found\n"; 150 | } 151 | } 152 | 153 | open(my $outputfh, ">", $outputfilePath) 154 | or die "Failed to open file: $!\n"; 155 | print $outputfh $output; 156 | close $outputfh; 157 | return 1; 158 | } 159 | 160 | sub normalizedDistanceOverIterations{ 161 | #retrieve common sequence identifier 162 | #compare stockholmstructre and parse result back 163 | my $familyNumber = shift; 164 | my $alienresult_basename = shift; 165 | my $rfammodel_basename = shift; 166 | my $rfamfasta_basename = shift; 167 | my $RNAFamilyIdFile = shift; 168 | my $resulttempdir = shift; 169 | my $resultfolderpath = shift; 170 | for(my $counter=1; $counter <= $familyNumber; $counter++){ 171 | my $output = ""; 172 | my $current_alienresult_folder= $alienresult_basename.$counter."/"; 173 | if(-e $alienresult_basename . $counter."/done"){ 174 | #print "$alienresult_basename$counter\n"; 175 | my $referenceStockholmPath = findStockholm("/scratch/egg/AlienStructuredResultsCollected13/$counter/"); 176 | my $inputFastaPath = findInputFasta($current_alienresult_folder); 177 | my $iterationNumber = findIterationNumber($current_alienresult_folder); 178 | if(-e $inputFastaPath){ 179 | my @fastacontent; 180 | open(my $fastafh, "<", $inputFastaPath) 181 | or die "Failed to open file: $!\n"; 182 | while(<$fastafh>) { 183 | chomp; 184 | push @fastacontent, $_; 185 | } 186 | close $fastafh; 187 | my $fasta_identifier = $fastacontent[0]; 188 | $fasta_identifier =~ s/>//; 189 | $fasta_identifier =~ s/\\K.+$//; 190 | if(-e $referenceStockholmPath){ 191 | for(my $iteration = 0; $iteration <= $iterationNumber; $iteration++){ 192 | my $currentStockholmPath = $current_alienresult_folder . $iteration . "/model.stockholm"; 193 | if(-e $currentStockholmPath){ 194 | $output = $output . "$iteration\t" . 
`~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i $fasta_identifier -a $referenceStockholmPath -r $currentStockholmPath -o /scratch/egg/temp/`; 195 | }else{ 196 | #print "$currentStockholmPath\n"; 197 | $output = $output . "$iteration\tNA\n" 198 | } 199 | } 200 | }else{ 201 | $output = $output . "no stockholm found\n"; 202 | } 203 | } 204 | }else{ 205 | $output = $output . "no inputfasta found\n"; 206 | } 207 | my $outputfilePath = $resultfolderpath . $counter . "_iterationstructure.dist"; 208 | open(my $outputfh, ">", $outputfilePath) 209 | or die "Failed to open file: $!\n"; 210 | print $outputfh $output; 211 | close $outputfh; 212 | } 213 | return 1; 214 | } 215 | 216 | sub findIterationNumber{ 217 | my $current_alienresult_folder = shift; 218 | my $continue = 1; 219 | my $iteration = 0; 220 | while($continue){ 221 | my $currentpath = $current_alienresult_folder."/".$iteration; 222 | #print $currentfastapath; 223 | unless(-d $currentpath){ 224 | $continue = 0; 225 | return $iteration; 226 | }else{ 227 | $iteration++; 228 | } 229 | if($iteration>50){ 230 | $continue = 0; 231 | } 232 | } 233 | } 234 | 235 | sub findInputFasta{ 236 | my $current_alienresult_folder = shift; 237 | my $continue = 1; 238 | my $iteration = 0; 239 | while($continue){ 240 | my $currentfastapath = $current_alienresult_folder."/".$iteration."/input.fa"; 241 | #print $currentfastapath; 242 | if(-e $currentfastapath){ 243 | $continue = 0; 244 | return $currentfastapath; 245 | }else{ 246 | $iteration++; 247 | } 248 | if($iteration>50){ 249 | $continue = 0; 250 | } 251 | } 252 | } 253 | 254 | sub findStockholm{ 255 | my $current_alienresult_folder = shift; 256 | my $continue = 1; 257 | my $iteration = 0; 258 | while($continue){ 259 | my $currentstockholmpath = $current_alienresult_folder."/".$iteration."/model.stockholm"; 260 | if(-e $currentstockholmpath){ 261 | $continue = 0; 262 | return $currentstockholmpath; 263 | }else{ 264 | 
$iteration++; 265 | } 266 | if($iteration>50){ 267 | $continue = 0; 268 | } 269 | } 270 | 271 | } 272 | 273 | # sub normalizedDistanceChangeOverIterations{ 274 | # #retrieve common sequence identifier 275 | # #compare stockholmstructre and parse result back 276 | # my $familyNumber = shift; 277 | # my $alienresult_basename = shift; 278 | # my $rfammodel_basename = shift; 279 | # my $rfamfasta_basename = shift; 280 | # my $RNAFamilyIdFile = shift; 281 | # my $resulttempdir = shift; 282 | # my $gathering_score_multiplier = shift; 283 | # my $gathering_score_lower_bound = shift; 284 | # my $outputfilePath = shift; 285 | # my $output; 286 | # for(my $counter=1; $counter <= $familyNumber; $counter++){ 287 | # my $current_alienresult_folder= $alienresult_basename.$counter."/"; 288 | # if(-e $alienresult_basename.$counter."/done"){ 289 | # my $alienModelPath = $current_alienresult_folder."result.cm"; 290 | # my $alienFastaPath = $current_alienresult_folder."result.fa"; 291 | # my @rfamModelNameId = split(/\s+/,$RNAfamilies[($counter - 1)]); 292 | # my $rfamModelName = $rfamModelNameId[0]; 293 | # my $rfamModelId = $rfamModelNameId[1]; 294 | # my $rfamModelPath = $rfammodel_basename . $rfamModelId . ".cm"; 295 | # my $rfamFastaPath =$rfamfasta_basename . $rfamModelId . ".fa"; 296 | # if(! -e $rfamModelPath){ 297 | # print "Does not exist: $rfamModelPath "; 298 | # } 299 | # if(! -e $rfamFastaPath){ 300 | # print "Does not exist: $rfamFastaPath "; 301 | # } 302 | 303 | # if(! -e $alienModelPath){ 304 | # print "Does not exist: $alienModelPath "; 305 | # } 306 | # if(! -e $alienFastaPath){ 307 | # print "Does not exist: $alienFastaPath"; 308 | # } 309 | # $output = $output . 
`RNAlienStatistics -c 20 -n $rfamModelName -d $rfamModelId -b $counter -i $alienModelPath -r $rfamModelPath -a $alienFastaPath -g $rfamFastaPath -t $rfamThreshold -x $rfamThreshold -o $resulttempdir`; 310 | # #~egg/current/Projects/Haskell/StockholmTools/dist/build/CompareStockholmStructure/CompareStockholmStructure -i AB001721.1 -a /scratch/egg/AlienStructuredResultsCollected13/1/1/model.stockholm -r /scratch/egg/AlienStructuredResultsCollected13/1/9/model.stockholm -o /scratch/egg/temp/ 311 | # } 312 | # } 313 | # open(my $outputfh, ">", $outputfilePath) 314 | # or die "Failed to open file: $!\n"; 315 | # print $outputfh $output; 316 | # close $outputfh; 317 | # return 1; 318 | # } 319 | 320 | # sub averageNormalizedDistanceChangesOverIterations{ 321 | # #summarize familywise results of NormalizedDistanceChangesOverIterations 322 | # return 1; 323 | # } 324 | 325 | # sub normalizedDistanceChangeOverIterations{ 326 | # return 1; 327 | # } 328 | 329 | # sub normalizedDistanceChangeOverIterations{ 330 | # return 1; 331 | # } 332 | 333 | # sub normalizedDistanceChangeOverIterations{ 334 | # return 1; 335 | # } 336 | 337 | # sub normalizedDistanceChangeOverIterations{ 338 | # return 1; 339 | # } 340 | -------------------------------------------------------------------------------- /scripts/blastbenchmarkdata.pl: -------------------------------------------------------------------------------- 1 | #!/bin/perl 2 | use strict; 3 | use warnings; 4 | #blastn -db nt -evalue 0.001 -query "/scratch/egg/structuredRNATestSet/1.fa" 5 | #$ blastx -db myDB -query myQuery -out myContigList.txt -outfmt "6 sallacc" 6 | #$ blastdbcmd -db myBlastDBName -dbtype prot -entry_batch myContigList.txt -outfmt %f -out myHitContigs.fasta 7 | # 8 | my $counter=1; 9 | for(1..56){ 10 | print "$counter\n"; 11 | # #`blastn -db nt -evalue 0.001 -soft_masking true -query structuredRNATestSet/$counter.fa -out blastout/$counter.txt -outfmt \"6 sallacc qcovs sseq\"`; 12 | # open(my $blastfh, "<", 
"blastout/$counter.txt") or die "Failed to open file: $!\n"; 13 | # open(my $fastafh, ">", "blastout/$counter.fasta") or die "Failed to open file: $!\n"; 14 | # my @sequences; 15 | # my $counter2=0; 16 | # while(<$blastfh>) { 17 | # chomp; 18 | # #add to hash 19 | # my @line = split('\t',$_); 20 | # my $unique=1; 21 | # foreach my $seq (@sequences){ 22 | # #print "$line[1] $seq\n"; 23 | # if($line[2] eq $seq){ 24 | # $unique = 0; 25 | # } 26 | # } 27 | # if($unique){ 28 | # push (@sequences, $line[2]); 29 | # my $printseq= $line[2]; 30 | # $printseq =~ s/-//g; 31 | # 32 | # if($line[1]>=80){ 33 | # print $fastafh ">$line[0]_$counter2\n$printseq\n"; 34 | # #print ">$line[0]\n$line[1]\n"; 35 | # } 36 | # } 37 | # #print @sequences; 38 | # $counter2++; 39 | # } 40 | # close $blastfh; 41 | # close $fastafh; 42 | # 43 | # 44 | 45 | #blastdbcmd -db nt -dbtype nucl -entry_batch blastout/$counter.txt -outfmt %f -out blastout/$counter.fasta`; 46 | 47 | #`mlocarna --skip-pp --fast-mea --free-endgaps --threads 3 blastout/$counter.fasta --tgtdir blastout/$counter.mlocarna`; 48 | #if(-e "blastout/$counter.mlocarna/results/result.aln"){ 49 | #`cp blastout/$counter.mlocarna/results/result.aln blastout/$counter.clustal`; 50 | #`RNAalifold -r --cfactor 0.6 --nfactor 0.5 < blastout/$counter.clustal > blastout/$counter.alifold`; 51 | #`/scratch/egg/alienhmmerblast/convertalignments.pl -i blastout/$counter.clustal -o blastout/$counter.stockholm -f stockholm`; 52 | #}else{ 53 | #`/scratch/egg/alienhmmerblast/convertalignments.pl -i blastout/$counter.fasta -o blastout/$counter.stockholm -f stockholm`; 54 | # `RNAfold < blastout/$counter.fasta > blastout/$counter.fold`; 55 | #} 56 | #`cmbuild --refine blastout/$counter.refine blastout/$counter.cm blastout/$counter.stockholm > blastout/$counter.log`; 57 | #`cmcalibrate blastout/$counter.cm`; 58 | #Copying to folders with running index for RNAlienStatistics wrapper script 59 | `mkdir blastout/$counter`; 60 | `cp blastout/$counter.fasta 
blastout/$counter/result.fa`; 61 | `cp blastout/$counter.cm blastout/$counter/result.cm`; 62 | 63 | $counter++; 64 | 65 | } 66 | -------------------------------------------------------------------------------- /scripts/buildClanModels.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #Write RNA family models of clan members in one file 4 | 5 | use strict; 6 | use warnings; 7 | use Data::Dumper qw(Dumper); 8 | 9 | #Read in clan_membership.txt to find the clan member families 10 | #Build hash with clan id as key and members as values 11 | 12 | 13 | my $clanMembersFile = "clan_membership.txt"; 14 | my %clan_members; 15 | open(my $clanMembersfh, "<", $clanMembersFile) 16 | or die "Failed to open file: $!\n"; 17 | while(<$clanMembersfh>) { 18 | chomp; 19 | #add to hash 20 | my @line = split('\t',$_); 21 | #print "$line[0] - $line[1]"; 22 | push( @{ $clan_members {$line[0] } }, $line[1]); 23 | } 24 | close $clanMembersfh; 25 | 26 | #print Dumper \%clan_members; 27 | 28 | #Write member covariance model into clan covariance model in clan_models subdirectory 29 | foreach my $clan (keys %clan_members){ 30 | my @members = @{$clan_members{$clan}}; 31 | #print "@members\n"; 32 | `rm clan_models/$clan.cm`; 33 | `touch clan_models/$clan.cm`; 34 | foreach my $member (@members) { 35 | `cat all_models/$member.cm >> clan_models/$clan.cm`; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /scripts/cmComparevsRfam.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | use diagnostics; 6 | use utf8; 7 | use Data::Dumper; 8 | use List::Util qw(min max); 9 | use File::Basename; 10 | use Cwd; 11 | $|=1; 12 | 13 | my $counter = 1; 14 | #contains all RNAlien result folders for sRNA tagged families 15 | #my $alienresult_basename="/scratch/egg/AlienTestResult5/temp/"; 16 | #my 
$aliencollected_basename="/scratch/egg/AlienResultsCollected/"; 17 | 18 | my $cmcompareresult_basename="/scratch/egg/cmcomparestructuredResultscollected/"; 19 | print "index\tbestModelID\tbestModelLink\t2ndbestModelID\t2ndbestModelLink\n"; 20 | 21 | for(1..56){ 22 | my $current_alienresult_file = $cmcompareresult_basename.$counter.".alienresult"; 23 | if(-e $current_alienresult_file){ 24 | my @resultlines; 25 | open(my $resultfh, "<", $current_alienresult_file) 26 | or die "Failed to open file: $!\n"; 27 | while(<$resultfh>) { 28 | chomp; 29 | push @resultlines, $_; 30 | } 31 | close $resultfh; 32 | my $bestentry; 33 | my $sndbestentry; 34 | my $bestlinkscore = 2; 35 | my $sndbestlinkscore = 1; 36 | foreach my $line (@resultlines){ 37 | #/scr/kronos/egg/AlienStructuredResultsCollected4/8/result.cm /scratch/egg/all_models//RF00001.cm -4.642 -2.479 38 | my @fields = split(/\s+/,$line); 39 | #print $fields[0].$fields[1].$fields[2].$fields[3]."\n"; 40 | my @scores = ($fields[2],$fields[3]); 41 | my $linkscore = min @scores; 42 | #print $linkscore."\n"; 43 | if($linkscore > $bestlinkscore){ 44 | $bestlinkscore=$linkscore; 45 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 46 | $filename =~ s/.cm//; 47 | $bestentry = $filename . "\t" . $linkscore; 48 | }elsif($linkscore > $sndbestlinkscore){ 49 | $sndbestlinkscore = $linkscore; 50 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 51 | $filename =~ s/.cm//; 52 | $sndbestentry = $filename . "\t" . $linkscore; 53 | } 54 | } 55 | #print "$counter-$current_alienresult_file\n"; 56 | print $counter . "\t" . $bestentry . "\t" . $sndbestentry . 
"\n"; 57 | 58 | } 59 | $counter++; 60 | } 61 | 62 | $counter = 1; 63 | print "index\tbestModelID\tbestModelLink\t2ndbestModelID\t2ndbestModelLink\n"; 64 | for(1..56){ 65 | my $current_rfamresult_file= $cmcompareresult_basename.$counter.".rfamresult"; 66 | if(-e $current_rfamresult_file){ 67 | my @resultlines; 68 | open(my $resultfh, "<", $current_rfamresult_file) 69 | or die "Failed to open file: $!\n"; 70 | while(<$resultfh>) { 71 | chomp; 72 | push @resultlines, $_; 73 | } 74 | close $resultfh; 75 | my $bestentry; 76 | my $sndbestentry; 77 | my $bestlinkscore = 2; 78 | my $sndbestlinkscore = 1; 79 | foreach my $line (@resultlines){ 80 | #/scr/kronos/egg/AlienStructuredResultsCollected4/8/result.cm /scratch/egg/all_models//RF00001.cm -4.642 -2.479 81 | my @fields = split(/\s+/,$line); 82 | #print $fields[0].$fields[1].$fields[2].$fields[3]."\n"; 83 | my @scores = ($fields[2],$fields[3]); 84 | my $linkscore = min @scores; 85 | #print $linkscore."\n"; 86 | if($linkscore > $bestlinkscore){ 87 | $bestlinkscore=$linkscore; 88 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 89 | $filename =~ s/.cm//; 90 | $bestentry = $filename . "\t" . $linkscore; 91 | }elsif($linkscore > $sndbestlinkscore){ 92 | $sndbestlinkscore = $linkscore; 93 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 94 | $filename =~ s/.cm//; 95 | $sndbestentry = $filename . "\t" . $linkscore; 96 | } 97 | } 98 | #print "$counter-$current_rfamresult_file\n"; 99 | print $counter . "\t" . $bestentry . "\t" . $sndbestentry . 
"\n"; 100 | 101 | } 102 | $counter++; 103 | } 104 | -------------------------------------------------------------------------------- /scripts/cmcomparebesthitextractor.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | use diagnostics; 6 | use utf8; 7 | use Data::Dumper; 8 | use List::Util qw(min max); 9 | use File::Basename; 10 | use Cwd; 11 | $|=1; 12 | 13 | my $counter = 1; 14 | #contains all RNAlien result folders for sRNA tagged families 15 | #my $alienresult_basename="/scratch/egg/AlienTestResult5/temp/"; 16 | #my $aliencollected_basename="/scratch/egg/AlienResultsCollected/"; 17 | 18 | my $cmcompareresult_basename="/scratch/egg/cmcomparestructuredResultscollected/"; 19 | print "index\tbestModelID\tbestModelLink\t2ndbestModelID\t2ndbestModelLink\n"; 20 | 21 | for(1..56){ 22 | my $current_alienresult_file = $cmcompareresult_basename.$counter.".alienresult"; 23 | if(-e $current_alienresult_file){ 24 | my @resultlines; 25 | open(my $resultfh, "<", $current_alienresult_file) 26 | or die "Failed to open file: $!\n"; 27 | while(<$resultfh>) { 28 | chomp; 29 | push @resultlines, $_; 30 | } 31 | close $resultfh; 32 | my $bestentry; 33 | my $sndbestentry; 34 | my $bestlinkscore = 2; 35 | my $sndbestlinkscore = 1; 36 | foreach my $line (@resultlines){ 37 | #/scr/kronos/egg/AlienStructuredResultsCollected4/8/result.cm /scratch/egg/all_models//RF00001.cm -4.642 -2.479 38 | my @fields = split(/\s+/,$line); 39 | #print $fields[0].$fields[1].$fields[2].$fields[3]."\n"; 40 | my @scores = ($fields[2],$fields[3]); 41 | my $linkscore = min @scores; 42 | #print $linkscore."\n"; 43 | if($linkscore > $bestlinkscore){ 44 | $bestlinkscore=$linkscore; 45 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 46 | $filename =~ s/.cm//; 47 | $bestentry = $filename . "\t" . 
$linkscore; 48 | }elsif($linkscore > $sndbestlinkscore){ 49 | $sndbestlinkscore = $linkscore; 50 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 51 | $filename =~ s/.cm//; 52 | $sndbestentry = $filename . "\t" . $linkscore; 53 | } 54 | } 55 | #print "$counter-$current_alienresult_file\n"; 56 | print $counter . "\t" . $bestentry . "\t" . $sndbestentry . "\n"; 57 | 58 | } 59 | $counter++; 60 | } 61 | 62 | $counter = 1; 63 | print "index\tbestModelID\tbestModelLink\t2ndbestModelID\t2ndbestModelLink\n"; 64 | for(1..56){ 65 | my $current_rfamresult_file= $cmcompareresult_basename.$counter.".rfamresult"; 66 | if(-e $current_rfamresult_file){ 67 | my @resultlines; 68 | open(my $resultfh, "<", $current_rfamresult_file) 69 | or die "Failed to open file: $!\n"; 70 | while(<$resultfh>) { 71 | chomp; 72 | push @resultlines, $_; 73 | } 74 | close $resultfh; 75 | my $bestentry; 76 | my $sndbestentry; 77 | my $bestlinkscore = 2; 78 | my $sndbestlinkscore = 1; 79 | foreach my $line (@resultlines){ 80 | #/scr/kronos/egg/AlienStructuredResultsCollected4/8/result.cm /scratch/egg/all_models//RF00001.cm -4.642 -2.479 81 | my @fields = split(/\s+/,$line); 82 | #print $fields[0].$fields[1].$fields[2].$fields[3]."\n"; 83 | my @scores = ($fields[2],$fields[3]); 84 | my $linkscore = min @scores; 85 | #print $linkscore."\n"; 86 | if($linkscore > $bestlinkscore){ 87 | $bestlinkscore=$linkscore; 88 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 89 | $filename =~ s/.cm//; 90 | $bestentry = $filename . "\t" . $linkscore; 91 | }elsif($linkscore > $sndbestlinkscore){ 92 | $sndbestlinkscore = $linkscore; 93 | my ($filename, $dirs, $suffix) = fileparse($fields[1]); 94 | $filename =~ s/.cm//; 95 | $sndbestentry = $filename . "\t" . $linkscore; 96 | } 97 | } 98 | #print "$counter-$current_rfamresult_file\n"; 99 | print $counter . "\t" . $bestentry . "\t" . $sndbestentry . 
"\n"; 100 | 101 | } 102 | $counter++; 103 | } 104 | -------------------------------------------------------------------------------- /scripts/getblastdb.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | declare -a ntfiles=("nt.00.tar.gz" "nt.01.tar.gz" "nt.02.tar.gz" "nt.03.tar.gz" 3 | "nt.04.tar.gz" "nt.05.tar.gz" "nt.06.tar.gz" "nt.07.tar.gz" 4 | "nt.08.tar.gz" "nt.09.tar.gz" "nt.10.tar.gz" "nt.11.tar.gz" 5 | "nt.12.tar.gz" "nt.13.tar.gz" "nt.14.tar.gz" "nt.15.tar.gz" 6 | "nt.16.tar.gz" "nt.17.tar.gz" "nt.18.tar.gz" "nt.19.tar.gz" 7 | "nt.20.tar.gz" "nt.21.tar.gz" "nt.22.tar.gz" "nt.23.tar.gz" 8 | "nt.24.tar.gz" "nt.25.tar.gz" "nt.26.tar.gz" "nt.27.tar.gz" 9 | "nt.28.tar.gz" "nt.29.tar.gz" "nt.30.tar.gz" "nt.31.tar.gz" 10 | "nt.32.tar.gz" "nt.33.tar.gz" "nt.34.tar.gz" "nt.35.tar.gz" 11 | "nt.36.tar.gz" "nt.37.tar.gz" "nt.38.tar.gz" "nt.39.tar.gz" 12 | "nt.40.tar.gz" "nt.41.tar.gz" "nt.42.tar.gz" "nt.43.tar.gz" 13 | "nt.44.tar.gz" "nt.45.tar.gz" "nt.46.tar.gz" "nt.47.tar.gz" 14 | "nt.48.tar.gz" "nt.49.tar.gz" "nt.50.tar.gz" "nt.51.tar.gz" 15 | "nt.52.tar.gz" "nt.53.tar.gz" "nt.54.tar.gz" "nt.55.tar.gz") 16 | for f in "${ntfiles[@]}" 17 | do 18 | echo "$f" 19 | wget ftp://ftp.ncbi.nlm.nih.gov/blast/db/v5/$f 20 | tar zxvpf $f 21 | done 22 | 23 | -------------------------------------------------------------------------------- /scripts/makemultiplotcsv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | mkdir recoverymultiplot2 3 | touch recoverymultiplot2/filenames 4 | mkdir recoverymultiplot2/rfamonalien 5 | mkdir recoverymultiplot2/alienonrfam 6 | echo "" > recoverymultiplot2/filenames 7 | counter=1 8 | for i in structuredalienseedoutput4-*.csv; do 9 | echo "recoverymultiplot2/rfamonalien/$counter.sorted.csv " >> recoverymultiplot2/filenames 10 | echo "$i" 11 | cut -d $'\t' -f 13 $i > recoverymultiplot2/rfamonalien/$counter.csv; 12 | cut -d $'\t' -f 14 $i > 
recoverymultiplot2/alienonrfam/$counter.csv; 13 | sort -k 1 -n recoverymultiplot2/rfamonalien/$counter.csv > recoverymultiplot2/rfamonalien/$counter.sorted.csv; 14 | sort -k 1 -n recoverymultiplot2/alienonrfam/$counter.csv > recoverymultiplot2/alienonrfam/$counter.sorted.csv; 15 | counter=$[$counter +1] 16 | done 17 | usedfilenames=$( recoverymultiplot2/rfamonalien/allsorted 20 | pr -mts $usedfilenames > recoverymultiplot2/alienonrfam/allsorted 21 | 22 | -------------------------------------------------------------------------------- /scripts/nhmmerbenchmarkdata.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | #Prepares nhmmer benchmark results for the RNAlienStatistics wrapper script: 3 | #for every index 1..56 except 27, nhmmerout/N.fa and nhmmerout/N.cm are copied 4 | #to nhmmerout/N/result.fa and nhmmerout/N/result.cm. 5 | #The commented-out commands are kept as a record of currently disabled steps. 6 | use warnings; 7 | use strict; 8 | use File::Copy qw(copy); 9 | use File::Path qw(make_path); 10 | for my $counter (1..56){ 11 | print "$counter\n"; 12 | #`time nhmmer -E 0.001 -A nhmmerout/$counter.sto -o nhmmerout/$counter.hmmer /scratch/egg/structuredRNATestSet/$counter.fa nt`; 13 | #`~egg/Tools/hmmer-3.1b2-linux-intel-x86_64/easel/miniapps/esl-reformat fasta nhmmerout/$counter.sto > nhmmerout/$counter.fa`; 14 | #Index 27 is printed but not processed any further 15 | next if $counter == 27; 16 | #if(-e "nhmmerout/$counter.sto"){ 17 | #`/scratch/egg/alienhmmerblast/convertalignments.pl -g stockholm -i nhmmerout/$counter.sto -o nhmmerout/$counter.clustal -f clustalw`; 18 | #`rnazSelectSeqs.pl nhmmerout/$counter.clustal`; 19 | #`RNAalifold -r --cfactor 0.6 --nfactor 0.5 < nhmmerout/$counter.clustal > nhmmerout/$counter.alifold`; 20 | # `/scratch/egg/alienhmmerblast/convertalignments.pl -g stockholm -i nhmmerout/$counter.sto -o nhmmerout/$counter.stockholm -f stockholm`; 21 | #}else{ 22 | #copy input sequence in case of no hits 23 | # `/scratch/egg/alienhmmerblast/convertalignments.pl -g fasta -i nhmmerout/$counter.fa -o nhmmerout/$counter.stockholm -f stockholm`; 24 | # #`RNAfold < nhmmerout/$counter.fasta > nhmmerout/$counter.fold`; 25 | #} 26 | #Manually insert consensus structure line 27 | #`cp nhmmerout/$counter.stockholm nhmmerout/$counter.stockholm.bak`; 28 | #`grep -v "#=GS" nhmmerout/$counter.stockholm.bak | 
grep -v "#=GR" > nhmmerout/$counter.stockholm`; 29 | #`cmbuild --refine nhmmerout/$counter.refine nhmmerout/$counter.cm nhmmerout/$counter.stockholm > nhmmerout/$counter.log`; 30 | #`cmcalibrate --cpu 30 nhmmerout/$counter.cm`; 31 | 32 | #Copying to folders with running index for RNAlienStatistics wrapper script 33 | my $resultdir = "nhmmerout/$counter"; 34 | #make_path does not complain when the folder already exists, so the script can be re-run quietly 35 | make_path($resultdir, {error => \my $patherrors}); 36 | warn "Could not create $resultdir\n" if @$patherrors; 37 | for my $extension ("fa","cm"){ 38 | copy("nhmmerout/$counter.$extension", "$resultdir/result.$extension") or warn "Could not copy nhmmerout/$counter.$extension: $!\n"; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | flags: {} 2 | packages: 3 | - '.' 4 | extra-deps: [] 5 | compiler-check: newer-minor 6 | resolver: lts-9.21 7 | -------------------------------------------------------------------------------- /test/single.fa: -------------------------------------------------------------------------------- 1 | >AARQ02000011.1/391-585 2 | AAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAA 3 | AAGUAUCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCU 4 | GUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAG 5 | UAAUGAAACAUGCUU 6 | -------------------------------------------------------------------------------- /test/test.stockholm: -------------------------------------------------------------------------------- 1 | # STOCKHOLM 1.0 2 | #=GF AU Infernal 1.1.2 3 | 4 | #=GS CP008770.1:757226-757421 DE Listeria monocytogenes strain 88-1059 genome 5 | #=GS CP007196.1:749187-749382 DE Listeria monocytogenes serotype 3c str. 10-5027, complete genome 6 | #=GS CP007169.1:797163-797359 DE Listeria monocytogenes serotype 1/2b str. 
10-0811, complete genome 7 | 8 | AARQ02000011.1/391-585 -AAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAAAAGUA.UCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAACAUGCUU 9 | #=GR AARQ02000011.1/391-585 PP .*****************************************************************.********************************************************************************************************************************** 10 | CP008770.1:757226-757421 CAAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAAAAGUA.UCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAAUAUGCUU 11 | #=GR CP008770.1:757226-757421 PP ******************************************************************.********************************************************************************************************************************** 12 | CP007196.1:749187-749382 CAAUUGAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUGCUUGAAGGUGAAAUCCCUGAAAAGUA.UCGAUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCAUUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAAUAUGCUU 13 | #=GR CP007196.1:749187-749382 PP ******************************************************************.********************************************************************************************************************************** 14 | CP007169.1:797163-797359 CAAUUAAAUAGAAGCGCCAGAACUGAUUGGGACGAAAAUACUUGAAGGUGAAAUCCCUGAAAAGUAAACAGUCAGUUGACGAGGAGGAGAUUAAUCGAAGUUUCGGCGGGAGUCUCCCGGCUGUGCAUGCAGUCGUUAAGUCUUACUUACAAAUCACUUGGGUGACCAAGUGGACAGAGUAGUAAUGAAACAUGUUU 15 | #=GR CP007169.1:797163-797359 PP ******************************************************************9********************************************************************************************************************************** 16 | #=GC SS_cons 
:::::::::::(((((<<--<<<<<<<<<<--------<<<<<--<<<-<____>>>>--->>>>>.>>>>>>>>>>----->>,<<<<<<<--<<<<____>>>>---->>>>>>><<<<<<<____>>>>>>>,,,,,<<<<<<<-<<---<<<<<<<<____>>>>>>>>----->>>>>>>->>,,,,))))) 17 | #=GC RF cAAUUgAAUAGaagCgCCAGaaCuGaucGgGAcGAAAAugCuuGAaGGUGAAAUCCCuGAAaaGca.cCgauCaGuuGAcGAGGAGGaGacuAAcCGaAGUUuCGgcGGGaguCuCCCGgCuGcGcAUgCaGcCGUUAAGuCuuaCuUaCAAAcCacuuGGGUGACCaaguGgAcAGAGuaGuaaUGaAAcAcGcuu 18 | // 19 | -------------------------------------------------------------------------------- /test/testcalls: -------------------------------------------------------------------------------- 1 | #Build with profiling enabled 2 | cabal new-build --enable-profiling --ghc-options="-rtsopts -threaded" --reinstall 3 | #Single input 4 | #Offline 5 | #Single fasta 6 | nohup RNAlien -i test/single.fa -c 7 -j -b /work/work/blast5db/nt_v5 -d single -w /work/work/new_taxdump/taxidlineage.dmp +RTS -p -N7& 7 | #Multi fasta 8 | nohup RNAlien -i test/testmulti.fa -c 7 -j -b /work/work/blast5db/nt_v5 -d multi -w /work/work/new_taxdump/taxidlineage.dmp +RTS -p -N7& 9 | #Stockholm alignment 10 | nohup RNAlien -p test/test.stockholm -c 7 -j -b /work/work/blast5db/nt_v5 -d aln -w /work/work/new_taxdump/taxidlineage.dmp +RTS -p -N7& 11 | #Online 12 | #Single fasta 13 | nohup RNAlien -i test/single.fa -c 7 -d onsingle +RTS -p -N7& 14 | #Multi fasta 15 | nohup RNAlien -i test/testmulti.fa -c 7 -d onmulti +RTS -p -N7& 16 | #Stockholm alignment 17 | nohup RNAlien -p test/test.stockholm -c 7 -d onaln +RTS -p -N7& 18 | #Scan 19 | #Single fasta 20 | RNAlienScan -i test/single.fa -b test/scan.fa -c 7 -d scansingle +RTS -p -N7& 21 | #Multi fasta 22 | RNAlienScan -i test/testmulti.fa -b test/scan.fa -c 7 -d scanmulti +RTS -p -N7& 23 | #Stockholm alignment 24 | RNAlienScan -i test/test.stockholm -b test/scan.fa -c 7 -d scanaln +RTS -p -N7& 25 | 26 | 27 | -------------------------------------------------------------------------------- /test/testmulti.fa: 
-------------------------------------------------------------------------------- 1 | >AARQ02000011.1/391-585 2 | AATTGAATAGAAGCGCCAGAACTGATTGGGACGAAAATGCTTGAAGGTGAAATCCCTGAAAAGTATCGATCAGTTGACGA 3 | GGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCATTTGGG 4 | TGACCAAGTGGACAGAGTAGTAATGAAACATGCTT 5 | >CP008770.1:757226-757421 Listeria monocytogenes strain 88-1059 genome 6 | CAATTGAATAGAAGCGCCAGAACTGATTGGGACGAAAATGCTTGAAGGTGAAATCCCTGAAAAGTATCGATCAGTTGACG 7 | AGGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCATTTGG 8 | GTGACCAAGTGGACAGAGTAGTAATGAAATATGCTT 9 | >CP007196.1:749187-749382 Listeria monocytogenes serotype 3c str. 10-5027, complete genome 10 | CAATTGAATAGAAGCGCCAGAACTGATTGGGACGAAAATGCTTGAAGGTGAAATCCCTGAAAAGTATCGATCAGTTGACG 11 | AGGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCATTTGG 12 | GTGACCAAGTGGACAGAGTAGTAATGAAATATGCTT 13 | >CP007169.1:797163-797359 Listeria monocytogenes serotype 1/2b str. 10-0811, complete genome 14 | CAATTAAATAGAAGCGCCAGAACTGATTGGGACGAAAATACTTGAAGGTGAAATCCCTGAAAAGTAAACAGTCAGTTGAC 15 | GAGGAGGAGATTAATCGAAGTTTCGGCGGGAGTCTCCCGGCTGTGCATGCAGTCGTTAAGTCTTACTTACAAATCACTTG 16 | GGTGACCAAGTGGACAGAGTAGTAATGAAACATGTTT 17 | --------------------------------------------------------------------------------