├── .gitignore ├── stack.yaml ├── favicon.ico ├── _site ├── favicon.ico ├── images │ ├── ORCID-iD_icon_BW_16x16.png │ ├── 2020-04-02-covid-19 │ │ ├── 0*0nXhv3wBHICs8oU9.webp │ │ ├── 0*6GxdzZ8ff8750eUI.webp │ │ ├── 1*ARAUUR6FfmsgiJu1ocjQ-A.webp │ │ ├── 1*KM-S2Z7BJotlspqUr8Te5g.webp │ │ ├── 1*STZnkSEKJRVMBzelagdi-A.webp │ │ ├── 1*WAinSw5vnzOzm5aAjgIXCg.webp │ │ ├── 1*bvu6XdbTRlk975p7bpVl2Q.webp │ │ └── 1*pXhoiK8_kaJ38oawNTGwag.webp │ └── 2021-12-05-shake-II │ │ ├── 1_9JOrZ76udsvr1kKBippbYg.webp │ │ └── 1_cFrMhLDcSVAt6zmR1BsCjg.webp ├── css │ ├── default.css │ └── syntax.css ├── index.html └── posts │ ├── 2021-12-05-shake-I.html │ ├── 2025-02-21-poseidon-git-pr-editing.html │ ├── 2023-12-31-poseidon-end-of-year-2023.html │ ├── 2020-04-02-covid-19.html │ └── 2021-05-06-lambdar.html ├── index.html ├── images ├── ORCID-iD_icon_BW_16x16.png ├── 2020-04-02-covid-19 │ ├── 0*0nXhv3wBHICs8oU9.webp │ ├── 0*6GxdzZ8ff8750eUI.webp │ ├── 1*ARAUUR6FfmsgiJu1ocjQ-A.webp │ ├── 1*KM-S2Z7BJotlspqUr8Te5g.webp │ ├── 1*STZnkSEKJRVMBzelagdi-A.webp │ ├── 1*WAinSw5vnzOzm5aAjgIXCg.webp │ ├── 1*bvu6XdbTRlk975p7bpVl2Q.webp │ └── 1*pXhoiK8_kaJ38oawNTGwag.webp └── 2021-12-05-shake-II │ ├── 1_9JOrZ76udsvr1kKBippbYg.webp │ └── 1_cFrMhLDcSVAt6zmR1BsCjg.webp ├── templates ├── post-list.html ├── post.html └── default.html ├── blog.cabal ├── stack.yaml.lock ├── site.hs ├── css └── default.css └── posts ├── 2025-02-21-poseidon-git-pr-editing.md ├── 2021-12-05-shake-I.md ├── 2017-12-28-custom-bars-rcppprogress.markdown ├── 2023-12-31-poseidon-end-of-year-2023.md ├── 2020-04-02-covid-19.md ├── 2021-05-06-lambdar.md └── 2021-12-05-shake-II.md /.gitignore: -------------------------------------------------------------------------------- 1 | _cache/ 2 | .stack-work/ 3 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-22.43 2 | 3 | packages: 4 | - . 
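# Usage note (an assumption based on the `site` executable declared in blog.cabal,
# not something stated elsewhere in this repository): the generator and the site
# should build with
#   stack build
#   stack exec site build
# and a local preview with recompilation should be available via
#   stack exec site watch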
5 | -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/favicon.ico -------------------------------------------------------------------------------- /_site/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/favicon.ico -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | --- 2 | title: Posts 3 | --- 4 | 5 | $partial("templates/post-list.html")$ 6 | -------------------------------------------------------------------------------- /images/ORCID-iD_icon_BW_16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/ORCID-iD_icon_BW_16x16.png -------------------------------------------------------------------------------- /_site/images/ORCID-iD_icon_BW_16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/ORCID-iD_icon_BW_16x16.png -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp -------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp -------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp -------------------------------------------------------------------------------- /images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp -------------------------------------------------------------------------------- /images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp -------------------------------------------------------------------------------- /images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp -------------------------------------------------------------------------------- /templates/post-list.html: -------------------------------------------------------------------------------- 1 | 8 | -------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp -------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp -------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp -------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp 
-------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp -------------------------------------------------------------------------------- /_site/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp -------------------------------------------------------------------------------- /_site/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp -------------------------------------------------------------------------------- /_site/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp -------------------------------------------------------------------------------- /templates/post.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | Posted 4 | $if(origin)$ 5 | originally here 6 | $endif$ 7 | on $date$ 8 | $if(author)$ 9 | by $author$ 10 | $endif$ 11 |
12 |
13 | $body$ 14 |
15 |
16 | -------------------------------------------------------------------------------- /blog.cabal: -------------------------------------------------------------------------------- 1 | name: blog 2 | version: 0.1.0.0 3 | build-type: Simple 4 | cabal-version: >= 1.10 5 | 6 | executable site 7 | main-is: site.hs 8 | build-depends: base == 4.* 9 | , hakyll == 4.16.* 10 | , pandoc 11 | ghc-options: -threaded -rtsopts -with-rtsopts=-N 12 | default-language: Haskell2010 13 | -------------------------------------------------------------------------------- /stack.yaml.lock: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by Stack. 2 | # You should not edit this file by hand. 3 | # For more information, please see the documentation at: 4 | # https://docs.haskellstack.org/en/stable/topics/lock_files 5 | 6 | packages: [] 7 | snapshots: 8 | - completed: 9 | sha256: 08bd13ce621b41a8f5e51456b38d5b46d7783ce114a50ab604d6bbab0d002146 10 | size: 720271 11 | url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/22/43.yaml 12 | original: lts-22.43 13 | -------------------------------------------------------------------------------- /templates/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Clemens' blog - $title$ 8 | 9 | 10 | 11 | 12 |
13 | 19 | 20 | 29 |
30 | 31 |
32 |

$title$

33 | $body$ 34 |
35 | 36 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /_site/css/default.css: -------------------------------------------------------------------------------- 1 | pre{font-size:1.4rem}body{background-color:#232629;color:white;font-size:1.6rem}a:link{color:pink;background-color:transparent;text-decoration:none}a:visited{color:pink;background-color:transparent;text-decoration:none}a:hover{color:red;background-color:transparent;text-decoration:underline}a:active{color:white;background-color:transparent;text-decoration:underline}html{font-size:62.5%}header{border-bottom:0.2rem solid white}nav{text-align:right}nav a{font-size:1.8rem;font-weight:bold;text-decoration:none;text-transform:uppercase}footer{margin-top:3rem;padding:1.2rem 0;border-top:0.2rem solid white;font-size:1.2rem;color:#c1c1c1}h1{font-size:2.4rem}h2{font-size:2rem}article .header{font-size:1.4rem;font-style:italic;color:#c1c1c1}.logo a{font-weight:bold;color:white;text-decoration:none}@media (max-width:319px){body{width:90%;margin:0;padding:0 5%}header{margin:4.2rem 0}nav{margin:0 auto 3rem;text-align:center}footer{text-align:center}.logo{text-align:center;margin:1rem auto 3rem}.logo a{font-size:2.4rem}nav a{display:block;line-height:1.6}}@media (min-width:320px){body{width:90%;margin:0;padding:0 5%}header{margin:4.2rem 0}nav{margin:0 auto 3rem;text-align:center}footer{text-align:center}.logo{text-align:center;margin:1rem auto 3rem}.logo a{font-size:2.4rem}nav a{display:inline;margin:0 0.6rem}}@media (min-width:640px){body{width:80rem;margin:0 auto;padding:0}header{margin:0 0 3rem;padding:1.2rem 0}nav{margin:0;text-align:right}nav a{margin:0 0 0 1.2rem;display:inline}footer{text-align:right}.logo{margin:0;text-align:left}.logo a{float:left;font-size:1.8rem}} -------------------------------------------------------------------------------- /site.hs: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 2 | {-# LANGUAGE OverloadedStrings #-} 3 | import Data.Monoid (mappend) 4 | import Hakyll 5 | import Text.Pandoc.Highlighting (Style, breezeDark, styleToCss) 6 | import Text.Pandoc.Options (ReaderOptions (..), WriterOptions (..)) 7 | 8 | -------------------------------------------------------------------------------- 9 | main :: IO () 10 | main = hakyll $ do 11 | match "favicon.ico" $ do 12 | route idRoute 13 | compile copyFileCompiler 14 | 15 | match "images/**" $ do 16 | route idRoute 17 | compile copyFileCompiler 18 | 19 | match "css/*" $ do 20 | route idRoute 21 | compile compressCssCompiler 22 | 23 | -- css for syntax highlighting 24 | create ["css/syntax.css"] $ do 25 | route idRoute 26 | compile $ do 27 | makeItem $ styleToCss pandocCodeStyle 28 | 29 | --match (fromList ["about.rst", "contact.markdown"]) $ do 30 | match (fromList []) $ do 31 | route $ setExtension "html" 32 | compile $ pandocCompiler' 33 | >>= loadAndApplyTemplate "templates/default.html" defaultContext 34 | >>= relativizeUrls 35 | 36 | match "posts/*" $ do 37 | route $ setExtension "html" 38 | compile $ pandocCompiler' 39 | >>= loadAndApplyTemplate "templates/post.html" postCtx 40 | >>= loadAndApplyTemplate "templates/default.html" postCtx 41 | >>= relativizeUrls 42 | 43 | match "index.html" $ do 44 | route idRoute 45 | compile $ do 46 | posts <- recentFirst =<< loadAll "posts/*" 47 | let indexCtx = 48 | listField "posts" postCtx (return posts) `mappend` 49 | defaultContext 50 | 51 | 
getResourceBody 52 | >>= applyAsTemplate indexCtx 53 | >>= loadAndApplyTemplate "templates/default.html" indexCtx 54 | >>= relativizeUrls 55 | 56 | match "templates/*" $ compile templateBodyCompiler 57 | 58 | 59 | -------------------------------------------------------------------------------- 60 | postCtx :: Context String 61 | postCtx = 62 | dateField "date" "%B %e, %Y" `mappend` 63 | defaultContext 64 | 65 | pandocCodeStyle :: Style 66 | pandocCodeStyle = breezeDark 67 | 68 | pandocCompiler' :: Compiler (Item String) 69 | pandocCompiler' = 70 | pandocCompilerWith 71 | defaultHakyllReaderOptions 72 | defaultHakyllWriterOptions 73 | { writerHighlightStyle = Just pandocCodeStyle 74 | } 75 | -------------------------------------------------------------------------------- /_site/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Clemens' blog - Posts 8 | 9 | 10 | 11 | 12 |
13 | 19 | 20 | 29 |
30 | 31 |
32 |

Posts

33 | 64 | 65 | 66 |
67 | 68 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /css/default.css: -------------------------------------------------------------------------------- 1 | /* custom */ 2 | 3 | pre { 4 | font-size: 1.4rem; 5 | } 6 | 7 | body { 8 | background-color: #232629; 9 | color: white; 10 | font-size: 1.6rem; 11 | } 12 | 13 | a:link { 14 | color: pink; 15 | background-color: transparent; 16 | text-decoration: none; 17 | } 18 | 19 | a:visited { 20 | color: pink; 21 | background-color: transparent; 22 | text-decoration: none; 23 | } 24 | 25 | a:hover { 26 | color: red; 27 | background-color: transparent; 28 | text-decoration: underline; 29 | } 30 | 31 | a:active { 32 | color: white; 33 | background-color: transparent; 34 | text-decoration: underline; 35 | } 36 | 37 | /* default hakyll */ 38 | 39 | html { 40 | font-size: 62.5%; 41 | } 42 | 43 | /* 44 | body { 45 | font-size: 1.6rem; 46 | color: #000; 47 | } 48 | */ 49 | 50 | header { 51 | border-bottom: 0.2rem solid white; 52 | } 53 | 54 | nav { 55 | text-align: right; 56 | } 57 | 58 | nav a { 59 | font-size: 1.8rem; 60 | font-weight: bold; 61 | text-decoration: none; 62 | text-transform: uppercase; 63 | } 64 | 65 | footer { 66 | margin-top: 3rem; 67 | padding: 1.2rem 0; 68 | border-top: 0.2rem solid white; 69 | font-size: 1.2rem; 70 | color: #c1c1c1; 71 | } 72 | 73 | h1 { 74 | font-size: 2.4rem; 75 | } 76 | 77 | h2 { 78 | font-size: 2rem; 79 | } 80 | 81 | article .header { 82 | font-size: 1.4rem; 83 | font-style: italic; 84 | color: #c1c1c1; 85 | } 86 | 87 | .logo a { 88 | font-weight: bold; 89 | color: white; 90 | text-decoration: none; 91 | } 92 | 93 | @media (max-width: 319px) { 94 | body { 95 | width: 90%; 96 | margin: 0; 97 | padding: 0 5%; 98 | } 99 | header { 100 | margin: 4.2rem 0; 101 | } 102 | nav { 103 | margin: 0 auto 3rem; 104 | text-align: center; 105 | } 106 | footer { 107 | text-align: center; 108 | } 109 | .logo { 110 | text-align: center; 111 | margin: 1rem auto 3rem; 112 | } 113 | .logo a { 114 | font-size: 2.4rem; 115 | } 116 | nav a { 117 | display: block; 118 | line-height: 1.6; 119 | } 120 | } 121 | 122 | @media (min-width: 320px) { 123 | body { 124 | width: 90%; 125 | margin: 0; 126 | padding: 0 5%; 127 | } 128 | header { 129 | margin: 4.2rem 0; 130 | } 131 | nav { 132 | margin: 0 auto 3rem; 133 | text-align: center; 134 | } 135 | footer { 136 | text-align: center; 137 | } 138 | .logo { 139 | text-align: center; 140 | margin: 1rem auto 3rem; 141 | } 142 | .logo a { 143 | font-size: 2.4rem; 144 | } 145 | nav a { 146 | display: inline; 147 | margin: 0 0.6rem; 148 | } 149 | } 150 | 151 | @media (min-width: 640px) { 152 | body { 153 | width: 80rem; 154 | margin: 0 auto; 155 | padding: 0; 156 | } 157 | header { 158 | margin: 0 0 3rem; 159 | padding: 1.2rem 0; 160 | } 161 | nav { 162 | margin: 0; 163 | text-align: right; 164 | } 165 | nav a { 166 | margin: 0 0 0 1.2rem; 167 | display: inline; 168 | } 169 | footer { 170 | text-align: right; 171 | } 172 | .logo { 173 | margin: 0; 174 | text-align: left; 175 | } 176 | .logo a { 177 | float: left; 178 | font-size: 1.8rem; 179 | } 180 | } 181 | -------------------------------------------------------------------------------- /_site/css/syntax.css: -------------------------------------------------------------------------------- 1 | pre > code.sourceCode { white-space: pre; position: relative; } 2 | pre > code.sourceCode > span { line-height: 1.25; } 3 | pre > code.sourceCode > span:empty { height: 1.2em; } 4 | .sourceCode { overflow: 
visible; } 5 | code.sourceCode > span { color: inherit; text-decoration: inherit; } 6 | div.sourceCode { margin: 1em 0; } 7 | pre.sourceCode { margin: 0; } 8 | @media screen { 9 | div.sourceCode { overflow: auto; } 10 | } 11 | @media print { 12 | pre > code.sourceCode { white-space: pre-wrap; } 13 | pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; } 14 | } 15 | pre.numberSource code 16 | { counter-reset: source-line 0; } 17 | pre.numberSource code > span 18 | { position: relative; left: -4em; counter-increment: source-line; } 19 | pre.numberSource code > span > a:first-child::before 20 | { content: counter(source-line); 21 | position: relative; left: -1em; text-align: right; vertical-align: baseline; 22 | border: none; display: inline-block; 23 | -webkit-touch-callout: none; -webkit-user-select: none; 24 | -khtml-user-select: none; -moz-user-select: none; 25 | -ms-user-select: none; user-select: none; 26 | padding: 0 4px; width: 4em; 27 | background-color: #232629; 28 | color: #7a7c7d; 29 | } 30 | pre.numberSource { margin-left: 3em; border-left: 1px solid #7a7c7d; padding-left: 4px; } 31 | div.sourceCode 32 | { color: #cfcfc2; background-color: #232629; } 33 | @media screen { 34 | pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } 35 | } 36 | code span { color: #cfcfc2; } /* Normal */ 37 | code span.al { color: #95da4c; background-color: #4d1f24; font-weight: bold; } /* Alert */ 38 | code span.an { color: #3f8058; } /* Annotation */ 39 | code span.at { color: #2980b9; } /* Attribute */ 40 | code span.bn { color: #f67400; } /* BaseN */ 41 | code span.bu { color: #7f8c8d; } /* BuiltIn */ 42 | code span.cf { color: #fdbc4b; font-weight: bold; } /* ControlFlow */ 43 | code span.ch { color: #3daee9; } /* Char */ 44 | code span.cn { color: #27aeae; font-weight: bold; } /* Constant */ 45 | code span.co { color: #7a7c7d; } /* Comment */ 46 | code span.cv { color: #7f8c8d; } /* CommentVar */ 47 | code span.do { color: #a43340; } /* Documentation */ 48 | code span.dt { color: #2980b9; } /* DataType */ 49 | code span.dv { color: #f67400; } /* DecVal */ 50 | code span.er { color: #da4453; text-decoration: underline; } /* Error */ 51 | code span.ex { color: #0099ff; font-weight: bold; } /* Extension */ 52 | code span.fl { color: #f67400; } /* Float */ 53 | code span.fu { color: #8e44ad; } /* Function */ 54 | code span.im { color: #27ae60; } /* Import */ 55 | code span.in { color: #c45b00; } /* Information */ 56 | code span.kw { color: #cfcfc2; font-weight: bold; } /* Keyword */ 57 | code span.op { color: #cfcfc2; } /* Operator */ 58 | code span.ot { color: #27ae60; } /* Other */ 59 | code span.pp { color: #27ae60; } /* Preprocessor */ 60 | code span.re { color: #2980b9; background-color: #153042; } /* RegionMarker */ 61 | code span.sc { color: #3daee9; } /* SpecialChar */ 62 | code span.ss { color: #da4453; } /* SpecialString */ 63 | code span.st { color: #f44f4f; } /* String */ 64 | code span.va { color: #27aeae; } /* Variable */ 65 | code span.vs { color: #da4453; } /* VerbatimString */ 66 | code span.wa { color: #da4453; } /* Warning */ 67 | -------------------------------------------------------------------------------- /posts/2025-02-21-poseidon-git-pr-editing.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Editing a pull request branch created from a fork" 3 | author: "Clemens Schmid" 4 | origin: https://blog.poseidon-adna.org/posts/archive_editing_git.html 5 | 
--- 6 | 7 | At the time of writing, the [Poseidon community archive](https://github.com/poseidon-framework/community-archive) has 14 open pull requests -- most of which were opened by various community members to add new packages to the archive. This is certainly a pleasant development, because it indicates that the archive is being adopted, but it also comes with technical and administrative challenges. As an editor for the archive I recently had to step up my Git skills to address a particular issue I was facing. 8 | 9 | Multiple times already I have found myself needing to edit a submission pull request before merging. This arose, for example, when a package author prepared a package almost perfectly, but I still wanted to apply some additional minor changes before merging. Or an author or reviewer had struggled with Git, manoeuvred themselves into a predicament, and needed my help to untangle the knot without [starting from scratch](https://xkcd.com/1597). So here is what I came up with to allow me to do that efficiently. 10 | 11 | ## Workflow 12 | 13 | GitHub's documentation includes a helpful tutorial on how to [commit changes to a pull request branch created from a fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/committing-changes-to-a-pull-request-branch-created-from-a-fork). It already covers the basic workflow for editing a fork. The article highlights a number of conditions for this to be possible: 14 | 15 | > You can only make commits on pull request branches that: 16 | > 17 | > - Are opened in a repository that you have push access to and that were created from a fork of that repository 18 | > - Are on a user-owned fork 19 | > - Have permission granted from the pull request creator 20 | > - Don't have branch restrictions that will prevent you from committing 21 | 22 | All of these are met in my case. But two additional challenges complicate the matter: i) the community-archive uses Git LFS for the large data files, and ii) I need to do this so frequently that cloning every fork feels unnecessarily cumbersome. The following workflow considers this special situation. 23 | 24 | ### 1. Clone the fork repository {#sec-clone} 25 | 26 | ```bash 27 | GIT_LFS_SKIP_SMUDGE=1 git clone git@github.com:USERNAME/FORK-OF-THE-REPOSITORY.git 28 | ``` 29 | 30 | Note that this workflow assumes that you have installed and configured Git LFS on your system. Cloning the repo with the `GIT_LFS_SKIP_SMUDGE` environment variable prevents downloading the LFS-tracked files despite Git LFS being enabled. This saves bandwidth and costs for us on GitHub. 31 | 32 | ### 2. (if necessary) Switch to the pull request branch 33 | 34 | ```bash 35 | git switch PR-BRANCH 36 | ``` 37 | 38 | This is only necessary if the PR branch is not the main/master branch. 39 | 40 | ### 3. (if necessary) Download individual LFS-tracked files 41 | 42 | ```bash 43 | git lfs pull --include "PATH-TO-FILE" 44 | ``` 45 | 46 | To validate a package completely, it can be necessary to also access the genotype data. But because we cloned above with `GIT_LFS_SKIP_SMUDGE=1`, this data is not in our clone now. Fortunately we can selectively download it. `PATH-TO-FILE` can also include wildcards. 47 | 48 | ### 4. Edit the files as desired 49 | 50 | Remember to commit the changes. 51 | 52 | ### 5. Push the changes 53 | 54 | This should work with `git push`. 
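Taken together with step 4, a minimal sketch of the commit-and-push sequence could look like this (the file path, commit message and branch name are placeholders):

```bash
# stage and commit the edits made in step 4
git add PATH-TO-EDITED-FILE
git commit -m "Apply editorial fixes before merging"
# push back to the contributor's fork, onto the PR branch
git push origin PR-BRANCH
```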
But yet again, Git LFS complicates things, raising the following error message: 55 | 56 | ```txt 57 | error: Authentication error: Authentication required: You must have push access to verify locks 58 | error: failed to push some refs to 'github.com:USERNAME/FORK-OF-THE-REPOSITORY.git' 59 | ``` 60 | 61 | This is caused by a limitation of GitHub's Git LFS implementation. A long thread discusses the issue here: [Authentication required : You must have push access to verify locks error.](https://github.com/git-lfs/git-lfs/issues/2291#issuecomment-305887405) Multiple solutions are suggested there. One reliable workaround is to [delete the git hook `.git/hooks/pre-push`](https://github.com/git-lfs/git-lfs/issues/2291#issuecomment-305887405). 62 | 63 | ```bash 64 | rm .git/hooks/pre-push 65 | git push 66 | ``` 67 | 68 | This resolved the issue for me -- specifically because I never had to edit any of the genotype data files when working on a PR fork. I don't know how this hack affects the handling of LFS-tracked files. 69 | 70 | ### 6. (optional) Moving to another fork in the same clone 71 | 72 | If the changes in a fork A are already merged into the master branch of the main archive repository, then a little trick allows us to switch to another fork B in the same clone. 73 | 74 | ```bash 75 | git remote -v 76 | git remote set-url origin git@github.com:poseidon-framework/community-archive.git 77 | git switch master 78 | git pull 79 | git remote set-url origin git@github.com:USERNAME/FORK-OF-THE-NEXT-REPOSITORY.git 80 | git pull 81 | ``` 82 | 83 | We set the remote URL to the main repository, switch to the master branch, and pull. The commits from A are already there, so we have a clean state again. From here we can set a new remote URL for a fork B and pull. This effectively saves us from creating a fresh clone (as described in [step 1](#sec-clone)). -------------------------------------------------------------------------------- /posts/2021-12-05-shake-I.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Workflow management with Haskell Shake I: Discovery" 3 | author: Clemens Schmid 4 | origin: https://medium.com/@nevrome/my-workflow-automation-journey-discovering-shake-haskell-5c270b93ff2b 5 | --- 6 | 7 | *This is part I of a two-part blog post. See [part II](/posts/2021-12-05-shake-II.html) for a little showcase of Shake.* 8 | 9 | Workflow management, that is, software to organize and run data analysis scripts, is one of those fortunate domains where dozens of open source solutions are competing for our attention. There’s probably something for every taste (see e.g. the extensive list [here](https://github.com/pditommaso/awesome-pipeline)), and many of these projects are actively maintained or at least comparatively easy to resurrect. This post is an attempt to describe my personal journey towards a tool that fits me, in the hope of motivating you to go searching as well. 10 | 11 | ## My user story 12 | 13 | My PhD research is located somewhere between Bioinformatics and Archaeoinformatics (yep — that’s [a thing](https://caa-international.org/)) and I work with large and high-dimensional datasets. Not really Big data, but big enough to require a high-performance computing environment to run analyses in reasonable time. Space, time and (ancient)DNA meet in my data, so my code necessarily relies on a variety of software libraries from different domains. 
In the last two years I piled scripts on top of scripts and thus created a complex network of interlinked code for data preparation, analysis and visualization. 14 | 15 | This is my personal user story. It eventually brought me to a point where I realized that I had to introduce a more sophisticated system for dependency management and workflow automation. The former is especially important for reproducibility, and the latter to propagate changes, so as to always maintain an up-to-date version of derived data products and plots. I needed a system that defines, runs and monitors a pipeline of code across different interacting scripts. 16 | 17 | As I share these challenges with a large number of people working professionally with computers, there are many excellent solutions for exactly these challenges out there. I just had to pick what fits me, my tasks and my interests. So I decided to follow my gut feelings and ended up with the containerization solutions Docker and Singularity to encapsulate my development environment (which will only be mentioned in passing here), and the build system [**Shake**](https://shakebuild.com/) to orchestrate my analysis pipeline. 18 | 19 | ### Why Shake, of all things? 20 | 21 | The first options I considered for pipeline management were [**Nextflow**](https://www.nextflow.io/) and [**Snakemake**](https://snakemake.readthedocs.io/en/stable/). Both are very popular among my colleagues in bioinformatics. At our department there seems to be an even divide between strong fans of the former and the latter. I personally did not want to deal with either Groovy or Python, though, which Nextflow and Snakemake respectively use as their underlying configuration language. Ideally I wanted to write the pipeline definition in a language and framework I’m already familiar with. That’s not (only) laziness. By working in either R or Haskell, with which I feel most comfortable, I could more easily leverage the power of these languages. 22 | 23 | So then I gave some scrutiny to [**targets**](https://books.ropensci.org/targets/walkthrough.html), an implementation of a pipelining tool in R. This might have worked for me, but it gave me the impression of being too focused on workflows within R. R is certainly an important component of my personal tech stack right now, but I wanted to be prepared for whatever the future might bring. I also — and that’s very shallow — didn’t like targets’ syntax from what I saw in the example code, where every computation in a pipeline got crammed into a single list object. 24 | 25 | At this point I realized I would really like to solve this in Haskell, as the language had become something of a personal passion anyway. A functional, strongly typed language should also — at least in theory — be a good fit to formalize build rules. I did some research and came across three Haskell tools that seem to offer workflow management: [**Funflow**](https://github.com/tweag/funflow), [**Porcupine**](https://github.com/tweag/porcupine) and [**Bioshake**](https://github.com/PapenfussLab/bioshake). 
Instead of diving into them one after the other, I took a step back and asked the excellent Haskell community on reddit for advice: [Experiences with workflow managers implemented in Haskell (funflow, porcupine, bioshake, ?)](https://old.reddit.com/r/haskell/comments/q0esys/experiences_with_workflow_managers_implemented_in/) 26 | 27 | Fortunately [Justin Bedő](https://github.com/jbedo), the author of Bioshake, saw the post and gave me some insights into his implementation. At the time he had already moved one step further, and had discontinued the development of Bioshake for his new solution [**BioNix**](https://github.com/PapenfussLab/bionix), which solves both (!) dependency and workflow management with the fascinating [Nix](https://nixos.org/) infrastructure. As Nix is a big world on its own, I couldn’t follow him there. So I instead gave the Bioshake documentation a good read. And there I realized that Bioshake heavily relies on Shake internally: understanding Shake seemed to be a prerequisite for figuring out Bioshake. And Shake alone already turned out to be powerful and flexible enough for my current needs! 28 | 29 | I had reached the end of my software exploration journey. 30 | 31 | Your journey towards a workflow management solution would certainly be different, and you would most likely reach different conclusions. But I encourage you to explore this realm if you think you share a user story similar to mine. You can keep reading [here](/posts/2021-12-05-shake-II.html) if you want to see how I configured Shake to help me with my challenges. 32 | -------------------------------------------------------------------------------- /_site/posts/2021-12-05-shake-I.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Clemens' blog - Workflow management with Haskell Shake I: Discovery 8 | 9 | 10 | 11 | 12 |
13 | 19 | 20 | 29 |
30 | 31 |
32 |

Workflow management with Haskell Shake I: Discovery

33 |
34 |
35 | Posted 36 | 37 | originally here 38 | 39 | on December 5, 2021 40 | 41 | by Clemens Schmid 42 | 43 |
44 |
45 |

This is part I of a two part blog post. See part II for a little showcase of Shake.

46 |

Workflow management, so software to organize and run data analysis scripts, is one of these fortunate domains, where dozens of open source solutions are competing for our attention. There’s probably something for every taste (see e.g. the extensive list here), and many of these projects are actively maintained or at least comparatively easy to resurrect. This post is an attempt to describe my personal journey for a tool that fits me, in the hope to motivate you to go searching as well.

47 |

My user story

48 |

My PhD research is located somewhere between Bioinformatics and Archaeoinformatics (yep — that’s a thing) and I work with large and high-dimensional datasets. Not really Big data, but big enough to require a high performance computing environment to run analyses in reasonable time. Space, time and (ancient)DNA meet in my data, so my code necessarily relies on a variety of software libraries from different domains. In the last two years I piled scripts on top of scripts and thus created a complex network of interlinked code for data preparation, analysis and visualization.

49 |

This is my personal user story. It eventually brought me to a point where I realized that I have to introduce a more sophisticated system for dependency management and workflow automation. The former is especially important for reproducibility, and the latter to propagate changes, so to always maintain an up-to-date version of derived data products and plots. I needed a system that defines, runs and monitors a pipeline of code across different interacting scripts.

50 |

As I share these challenges with a large number of people working professionally with computers, there are many excellent solutions for exactly these challenges out there. I just had to pick what fits me, my tasks and my interests. So I decided to follow my gut feelings and ended up with the containerization solutions docker and singularity to encapsulate my development environment (which will only be mentioned in passing here), and the build system Shake to orchestrate my analysis pipeline.

51 |

Why Shake, of all things?

52 |

The first options I considered for pipeline management were Nextflow and Snakemake. Both are very popular among my colleagues in bioinformatics. At our department there seems to be an even divide between strong fans of the former and the latter. I personally did not want to deal with either Groovy or Python, though, which Nextflow and Snakemake respectively use as their underlying configuration language. Ideally I wanted to write the pipeline definition in a language and framework I’m already familiar with. That’s not (only) laziness. By working in either R or Haskell, with which I feel most comfortable, I could more easily leverage the power of these languages.

53 |

So then I gave some scrutiny to targets, an implementation of a pipelining tool in R. This might have worked for me, but it gave me the impression to be too focused on workflows within R. R is certainly an important component of my personal tech stack right now, but I wanted to be prepared for whatever the future might bring. I also — and that’s very shallow— didn’t like target’s syntax from what I saw in the example code, where every computation in a pipeline got crammed into a single list object.

54 |

At this point I realized I would really like to solve this in Haskell, as the language became something of a personal passion anyway. A functional, strongly typed language should also — at least in theory — be a good fit to formalize building rules. I did some research and came across three Haskell tools that seem to offer workflow management: Funflow, Porcupine and Bioshake. Instead of diving into them one after the other, I took a step back and asked the excellent Haskell community on reddit for advice: Experiences with workflow managers implemented in Haskell (funflow, porcupine, bioshake, ?)

55 |

Fortunately Justin Bedő, the author of Bioshake, saw the post and gave me some insights into his implementation. At the time he had already moved one step further, and had discontinued the development of Bioshake for his new solution BioNix, which solves both (!) dependency and workflow management with the fascinating Nix infrastructure. As Nix is a big world on its own, I couldn’t follow him there. So I instead gave the Bioshake documentation a good read. And there I realized that Bioshake heavily relies on Shake internally: understanding Shake seemed to be a prerequisite for figuring out Bioshake. And Shake alone already turned out to be powerful and flexible enough for my current needs!

56 |

I had reached the end of my software exploration journey.

57 |

Your journey for a workflow management solution would certainly be different, and you would most likely reach different conclusions. But I encourage you to explore this realm, if you think you share a user story similar to mine. You can keep reading here, if you want to see how I configured Shake to help me with my challenges.

58 |
59 |
60 | 61 |
62 | 63 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /_site/posts/2025-02-21-poseidon-git-pr-editing.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Clemens' blog - Editing a pull request branch created from a fork 8 | 9 | 10 | 11 | 12 |
13 | 19 | 20 | 29 |
30 | 31 |
32 |

Editing a pull request branch created from a fork

33 |
34 |
35 | Posted 36 | 37 | originally here 38 | 39 | on February 21, 2025 40 | 41 | by Clemens Schmid 42 | 43 |
44 |
45 |

At the time of writing, the Poseidon community archive has 14 open pull requests – most of which were opened by various community members to add new packages to the archive. What certainly is a pleasant development, because it indicates that the archive gets adopted, also comes with technical and administrative challenges. As an editor for the archive I recently had to step up my Git skills to address a particular issue I was facing.

46 |

Already multiple times I found myself in the situation that I needed to edit a submission pull request before merging. This arose for example, when a package author prepared a package almost perfectly, but I still wanted to apply some additional minor changes before merging. Or an author or reviewer had struggled with Git, manoeuvred themselves into a predicament, and needed my help to untangle the knot without starting from scratch. So here is what I came up with to allow me to do that efficiently.

47 |

Workflow

48 |

GitHub’s documentation includes a helpful tutorial how to commit changes to a pull request branch created from a fork. It already covers the basic workflow how to edit a fork. The article highlights a number of conditions for this to be possible:

49 |
50 |

You can only make commits on pull request branches that:

51 |
    52 |
  • Are opened in a repository that you have push access to and that were created from a fork of that repository
  • 53 |
  • Are on a user-owned fork
  • 54 |
  • Have permission granted from the pull request creator
  • 55 |
  • Don’t have branch restrictions that will prevent you from committing
  • 56 |
57 |
58 |

All of these are met in my case. But two additional challenges complicate the matter: i) the community-archive uses Git LFS for the large data files, and ii) I need to do this so frequently, that cloning every fork feels unnecessarily cumbersome. The following workflow considers this special situation.

59 |

1. Clone the fork repository

60 |
GIT_LFS_SKIP_SMUDGE=1 git clone git@github.com:USERNAME/FORK-OF-THE-REPOSITORY.git
61 |

Note that this workflow assumes that you have installed and configured Git LFS on your system. Cloning the repo with the GIT_LFS_SKIP_SMUDGE environment variable prevents downloading the LFS-tracked files despite Git LFS being enabled. This saves bandwidth and costs for us on GitHub.

62 |

2. (if necessary) Switch to the pull request branch

63 |
git switch PR-BRANCH
64 |

This is only necessary, if the PR branch is not the main/master branch.

65 |

3. (if necessary) Download individual LFS-tracked files

66 |
git lfs pull --include "PATH-TO-FILE"
67 |

To validate a package completely it can be necessary to also access the genotype data. But because we cloned above with GIT_LFS_SKIP_SMUDGE=1, this data is not in our clone now. Fortunately we can selectively download it. PATH-TO-FILE can also include wildcards.

68 |

4. Edit the files as desired

69 |

Remember to commit the changes.

70 |

5. Push the changes

71 |

This should work with git push. But yet again, Git LFS complicates things, raising the following error message:

72 |
error: Authentication error: Authentication required: You must have push access to verify locks
73 | error: failed to push some refs to 'github.com:USERNAME/FORK-OF-THE-REPOSITORY.git'
74 |

This is caused by a limitation of GitHub’s Git LFS implementation. A long thread here discusses the issue: Authentication required : You must have push access to verify locks error. Multiple solutions are suggested there. One reliable workaround is to delete the git hook .git/hooks/pre-push.

75 |
rm .git/hooks/pre-push
76 | git push
77 |

This resolved the issue for me – specifically because I never had to edit any of the genotype data files when working on a PR fork. I don’t know how this hack affects the handling of LFS-tracked files.

78 |

6. (optional) Moving to another fork in the same clone

79 |

If the changes in a fork A are already merged into the master branch of the main archive repository, then a little trick allows to switch to another fork B in the same clone.

80 |
git remote -v
81 | git remote set-url origin git@github.com:poseidon-framework/community-archive.git
82 | git switch master
83 | git pull
84 | git remote set-url origin git@github.com:USERNAME/FORK-OF-THE-NEXT-REPOSITORY.git
85 | git pull
86 |

We set the remote URL to the main repository, switch to the master branch, and pull. The commits from A are already there, so we have a clean state again. From here we can set a new remote URL for a fork B and pull. This effectively saves us from creating a fresh clone (as described in step 1).

87 |
88 |
89 | 90 |
91 | 92 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /posts/2017-12-28-custom-bars-rcppprogress.markdown: -------------------------------------------------------------------------------- 1 | --- 2 | title: Custom progress bars for RcppProgress 3 | author: Clemens Schmid and Karl Forner 4 | origin: https://gallery.rcpp.org/articles/custom-bars-rcppprogress 5 | --- 6 | 7 | [RcppProgress](http://cran.r-project.org/web/packages/RcppProgress/index.html) 8 | is a tool to help you monitor the execution time of your C++ code, by 9 | providing a way to interrupt the execution inside the C++ code, and also to 10 | display a progress bar indicative of the state of your computation. Additionally, 11 | it is compatible with multi-threaded code, for example using OpenMP. 12 | [The initial (yet updated) article](https://gallery.rcpp.org/articles/using-rcppprogress/) explains the 13 | basic setup. 14 | 15 | Since version 0.4 it became more simple to create custom progress bars. In this new 16 | article we will show how to do this. Our final example displays a progress bar which 17 | provides an estimation of the remaining time (ETA) to finish a computation. 18 | 19 | ## A minimal example 20 | 21 | Imagine you added a progress bar with RcppProgress to your function 22 | `long_computation()` following the example from the first article mentioned above. 23 | 24 | ```cpp {.numberLines} 25 | // [[Rcpp::depends(RcppProgress)]] 26 | #include 27 | #include 28 | // [[Rcpp::export]] 29 | double long_computation(int nb, bool display_progress=true) { 30 | double sum = 0; 31 | Progress p(nb*nb, display_progress); 32 | for (int i = 0; i < nb; ++i) { 33 | if (Progress::check_abort() ) 34 | return -1.0; 35 | for (int j = 0; j < nb; ++j) { 36 | p.increment(); 37 | sum += R::dlnorm(i+j, 0.0, 1.0, 0); 38 | } 39 | } 40 | return sum + nb; 41 | } 42 | 43 | long_computation(10) 44 | ``` 45 | 46 | What you get is a basic and useful console visualization that looks like this: 47 | 48 | ``` 49 | 0% 10 20 30 40 50 60 70 80 90 100% 50 | [----|----|----|----|----|----|----|----|----|----| 51 | ****************************** 52 | ``` 53 | 54 | That's the default, platform independent display in RcppProgress defined in 55 | [SimpleProgressBar.hpp](https://github.com/kforner/rcpp_progress/blob/master/inst/include/simple_progress_bar.hpp). 56 | It's OK for most purposes to give you an idea how much work is done and it also allows 57 | you to make a very intuitive estimation about how long it's going to take to finish. 58 | But of course that's not everything a progress bar *could* show you. A progress bar 59 | could give you information about the running progress or about performance parameters 60 | of your system. It could contain calculated estimates of passed and remaining time. 61 | After all it could just look much more fancy to impress your colleagues. 62 | 63 | RcppProgress makes it now easy to create your own implementation of a progress bar class. 64 | Your own class has to be derived from the abstract class `ProgressBar` that defines some 65 | basic virtual methods: 66 | 67 | ```cpp {.numberLines} 68 | class ProgressBar { 69 | public: 70 | virtual ~ProgressBar() = 0; 71 | virtual void display() = 0; 72 | virtual void update(float progress) = 0; 73 | virtual void end_display() = 0; 74 | }; 75 | ``` 76 | 77 | `display()` starts the display that will be updated by subsequent calls of 78 | `update()`. `end_display` finalizes it. 
Your progress bar implementation should 79 | not rely on the destructor to finalize the display. 80 | 81 | A very **minimal setup** could look something like this: 82 | 83 | ```cpp {.numberLines} 84 | #include 85 | 86 | // [[Rcpp::depends(RcppProgress)]] 87 | #include 88 | #include "progress_bar.hpp" 89 | 90 | class MinimalProgressBar: public ProgressBar{ 91 | public: 92 | MinimalProgressBar() { 93 | _finalized = false; 94 | } 95 | 96 | ~MinimalProgressBar() {} 97 | 98 | void display() { 99 | REprintf("Progress: "); 100 | } 101 | 102 | void update(float progress) { 103 | if (_finalized) return; 104 | REprintf("+"); 105 | } 106 | 107 | void end_display() { 108 | if (_finalized) return; 109 | REprintf("\n"); 110 | _finalized = true; 111 | } 112 | 113 | private: 114 | 115 | bool _finalized; 116 | 117 | }; 118 | 119 | // [[Rcpp::export]] 120 | double long_computation_minimal(int nb, bool display_progress=true) { 121 | MinimalProgressBar pb; 122 | double sum = 0; 123 | Progress p(nb, display_progress, pb); 124 | for (int i = 0; i < nb; ++i) { 125 | if (Progress::check_abort() ) 126 | return -1.0; 127 | for (int j = 0; j < nb; ++j) { 128 | sum += R::dlnorm(i+j, 0.0, 1.0, 0); 129 | } 130 | p.increment(); 131 | } 132 | return sum + nb; 133 | } 134 | 135 | long_computation_minimal(10) 136 | ``` 137 | 138 | The `display()` method in this example does nothing more than printing the word 139 | `Progress`. `update()` concatenates a `+` symbol every time `Progress::increment()` is 140 | called. The result looks like this: 141 | 142 | ``` 143 | Progress: ++++++++++ 144 | ``` 145 | 146 | In comparison to the example of the default progress bar above, I moved the 147 | call to `increment()` out of the second level and into the first level loop to keep 148 | the amount of console output at bay. `update()` also checks if the display is `_finalized`. 149 | `end_display` triggers the finalization. 150 | 151 | ## Remaining time estimation 152 | 153 | Based on the minimal setup above, you can implement more sophisticated 154 | progress bars. Here's an example of one that looks exactly like the default 155 | `SimpleProgressBar`, but adds an estimation of the remaining time for the process to finish. 156 | You can find a complete package setup with the code for this ETAProgressBar 157 | [here](https://github.com/kforner/rcpp_progress/tree/master/inst/examples/RcppProgressETA). 158 | In this article we only highlight some crucial aspects of the implementation. 159 | 160 | We use the [Rinterface.h](https://stat.ethz.ch/R-manual/R-devel/RHOME/doc/manual/R-exts.html#Setting-R-callbacks) header to update the display dynamically. Unfortunately this [header is only available for Unix-like systems](https://stackoverflow.com/questions/47623478/creating-a-progress-update-by-replacing-output-in-the-r-console-from-c-c/47624175?noredirect=1#comment82228757_47624175). 161 | A less cool, old version of an ETA progress bar that also works on windows can be 162 | found [here](https://github.com/kforner/rcpp_progress/blob/5b0ec0d672c7758cf4c4134e97dfa9921ac668e0/inst/examples/RcppProgressETA/src/eta_progress_bar.hpp). 163 | The following preprocessor statements load Rinterface.h if the code is compiled 164 | on a non-windows computer. 165 | 166 | ```cpp {.numberLines} 167 | #if !defined(WIN32) && !defined(__WIN32) && !defined(__WIN32__) 168 | #include 169 | #endif 170 | ``` 171 | 172 | The class `ETAProgressBar` inherits from the abstract class `ProgressBar`. 
173 | It has an integer variable `_max_ticks` that controls the amount of individual 174 | tick symbols necessary to reach the 100% mark of the progress bar. That depends 175 | on the display you want to craft. `ETAProgressBar` also has a boolean flag variable 176 | `_timer_flag` that acts as a switch to separate the initial starting turn where 177 | the time measurement starts and the following turns where the time is picked off. 178 | The measured time values are stored in two variables `start` and `end` of class 179 | `time_t` (from [ctime](http://www.cplusplus.com/reference/ctime/)). 180 | 181 | ```cpp {.numberLines} 182 | class ETAProgressBar: public ProgressBar{ 183 | 184 | // ... 185 | 186 | private: 187 | int _max_ticks; 188 | bool _finalized; 189 | bool _timer_flag; 190 | time_t start,end; 191 | 192 | // ... 193 | 194 | } 195 | ``` 196 | 197 | The `display()` function initializes the progress bar visualization. The first two lines 198 | are hard coded ASCII art. 199 | 200 | ```cpp {.numberLines} 201 | void display() { 202 | REprintf("0%% 10 20 30 40 50 60 70 80 90 100%%\n"); 203 | REprintf("[----|----|----|----|----|----|----|----|----|----|\n"); 204 | flush_console(); 205 | } 206 | ``` 207 | 208 | `update()` is the most important function for the progress bar mechanism. The if clause 209 | allows to separate the initial call of `update()` from the following ones to start the time 210 | counter. Afterwards the time passed is calculated and transformed to a human readable string 211 | by the custom function `_time_to_string()`. `_current_ticks_display()` is another custom 212 | function to transform the progress information to a string with the correct amount of `*` 213 | symbols and filling whitespaces. The progress string and the time string are concatenated 214 | to create the additional third line below the initial two lines drawn by `display()`. 215 | A string with sufficient whitespaces is also added to ensure that this dynamically updated 216 | line is overwritten completely from turn to turn. `REprintf("\r");` triggers a carriage return 217 | to make this continuous overwriting possible. 
218 | 219 | ```cpp {.numberLines} 220 | void update(float progress) { 221 | 222 | // stop if already finalized 223 | if (_finalized) return; 224 | 225 | // start time measurement when update() is called the first time 226 | if (_timer_flag) { 227 | _timer_flag = false; 228 | // measure start time 229 | time(&start); 230 | } else { 231 | 232 | // measure current time 233 | time(&end); 234 | 235 | // calculate passed time and remaining time (in seconds) 236 | double pas_time = std::difftime(end, start); 237 | double rem_time = (pas_time / progress) * (1 - progress); 238 | 239 | // convert seconds to time string 240 | std::string time_string = _time_to_string(rem_time); 241 | 242 | // create progress bar string 243 | std::string progress_bar_string = _current_ticks_display(progress); 244 | 245 | // ensure overwriting of old time info 246 | int empty_length = time_string.length(); 247 | std::string empty_space = std::string(empty_length, ' '); 248 | 249 | // merge progress bar and time string 250 | std::stringstream strs; 251 | strs << "|" << progress_bar_string << "| " << time_string << empty_space; 252 | std::string temp_str = strs.str(); 253 | char const* char_type = temp_str.c_str(); 254 | 255 | // print: remove old display and replace with new 256 | REprintf("\r"); 257 | REprintf("%s", char_type); 258 | 259 | // finalize display when ready 260 | if(progress == 1) { 261 | _finalize_display(); 262 | } 263 | } 264 | } 265 | ``` 266 | 267 | `_time_to_string()` parses time information given in form of a floating point number of 268 | seconds to a human-readable string. The basic algorithm is based on an example from 269 | [programmingnotes.org](http://www.programmingnotes.org/?p=2062). 270 | 271 | ```cpp {.numberLines} 272 | std::string _time_to_string(double seconds) { 273 | 274 | int time = (int) seconds; 275 | 276 | int hour = 0; 277 | int min = 0; 278 | int sec = 0; 279 | 280 | hour = time / 3600; 281 | time = time % 3600; 282 | min = time / 60; 283 | time = time % 60; 284 | sec = time; 285 | 286 | std::stringstream time_strs; 287 | if (hour != 0) time_strs << hour << "h "; 288 | if (min != 0) time_strs << min << "min "; 289 | if (sec != 0) time_strs << sec << "s "; 290 | std::string time_str = time_strs.str(); 291 | 292 | return time_str; 293 | } 294 | ``` 295 | 296 | `_current_ticks_display()` relies on `_compute_nb_ticks()` to first of all transform 297 | the progress information (floating point number between 0 and 1) to a natural number 298 | that expresses the correct fraction of `_max_ticks`. `_construct_ticks_display_string()` 299 | takes this value and parses a string with `*` symbols and whitespaces that can be plotted 300 | as a visual progress indication. 
301 | 302 | ```cpp {.numberLines} 303 | std::string _current_ticks_display(float progress) { 304 | int nb_ticks = _compute_nb_ticks(progress); 305 | std::string cur_display = _construct_ticks_display_string(nb_ticks); 306 | return cur_display; 307 | } 308 | 309 | int _compute_nb_ticks(float progress) { 310 | return int(progress * _max_ticks); 311 | } 312 | 313 | std::string _construct_ticks_display_string(int nb) { 314 | std::stringstream ticks_strs; 315 | for (int i = 0; i < (_max_ticks - 1); ++i) { 316 | if (i < nb) { 317 | ticks_strs << "*"; 318 | } else { 319 | ticks_strs << " "; 320 | } 321 | } 322 | std::string tick_space_string = ticks_strs.str(); 323 | return tick_space_string; 324 | } 325 | ``` 326 | 327 | `flush_console()` is a wrapper around [`R_FlushConsole()`](https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Setting-R-callbacks) which is called to flush any 328 | pending output to the system console. It's necessary to do this when the display is started 329 | in `display()` and when it's closed in `_finalize_display()`. 330 | 331 | ```cpp {.numberLines} 332 | void flush_console() { 333 | #if !defined(WIN32) && !defined(__WIN32) && !defined(__WIN32__) 334 | R_FlushConsole(); 335 | #endif 336 | } 337 | ``` 338 | 339 | The output of an `ETAProgressBar` looks like this: 340 | 341 | ``` 342 | 0% 10 20 30 40 50 60 70 80 90 100% 343 | [----|----|----|----|----|----|----|----|----|----| 344 | |******* | 49s 345 | ``` 346 | -------------------------------------------------------------------------------- /posts/2023-12-31-poseidon-end-of-year-2023.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Poseidon end-of-year review 2023 3 | author: Clemens Schmid 4 | origin: https://blog.poseidon-adna.org/posts/endofyear2023.html 5 | --- 6 | 7 | It's late December and the time of the year when work slows down in my part of the world. For many of us an opportunity to take a break and to look back, contemplating the achievements of the year. I decided to do so as well and write a bit about Poseidon. 8 | 9 | What follows is a subjective account of the events in and around the framework in 2023 - each of my colleagues in the core team (Stephan Schiffels, Ayshin Ghalichi, Thiseas C. Lamnidis, Dhananjaya B. A. Mudiyanselage, Wolfgang Haak and I, Clemens Schmid) would probably emphasise different developments in such a write-up. That is in itself an achievement, because it shows how much the tech-stack, domains and services in our little ecosystem have grown this year: beyond the understanding of each of us individually. 10 | 11 | ## The Poseidon schema 12 | 13 | Let's start simple with the two new releases of the Poseidon schema we published this year: v2.7.0 and v2.7.1. They were published in short succession in March and May, the latter only slightly improving the sequencing source files (.ssf) added in the first. See the changelog [here](https://www.poseidon-adna.org/#/changelog) for more details, but the addition of the [.ssf file](https://www.poseidon-adna.org/#/ssf_details) is indeed their most remarkable contribution to the schema. With it we addressed a major desideratum and unresolved question in previous versions of Poseidon: How should genotype data be linked to the raw sequencing data on the [European Nucleotide Archive (ENA)](https://www.ebi.ac.uk/ena/browser/home) and other archives of the [International Nucleotide Sequence Database Collaboration (INSDC)](https://www.insdc.org/)? 
14 | 15 | The .ssf file is, I would argue, a smart solution for this question. It specifies the same variables already used in the ENA database, allows for an extremely flexible, yet not arbitrary n:m connection between the entities in a Poseidon package and the raw data products, and it can be [generated semi-automatically](https://github.com/poseidon-framework/scripts/blob/main/get_ena_table.py) for most of the data in our public archives. With some tweaking it can also be used to organize local data repositories independent of any online databases. The .ssf file is finally the very foundation on top of which the amazing Minotaur workflow is built (see below). 16 | 17 | Generally, both the fact that only two Poseidon releases were necessary this year and that we could treat them as non-breaking changes indicate that we reached a certain level of maturity and stability in the schema. Of course we still have ideas on how to extend it further in the future, but at the moment I'm optimistic that we can maintain long-term backwards compatibility. The process in which we discussed, specified and later improved the .ssf file definition, and then saw Minotaur erected on top of it, was a very satisfying professional experience for me personally. 18 | 19 | ## The Minotaur workflow 20 | 21 | The Minotaur workflow is a semi-automatic workflow to reproducibly process published sequencing data into Poseidon packages. Developing this entirely new branch of the Poseidon ecosystem became possible because Thiseas joined the Poseidon core team in 2023. He came up with a sophisticated, yet open and transparent implementation of this process, in which authors and the community as a whole retain control over the data and the data processing parameters. A full write-up for the website is [in progress](https://github.com/poseidon-framework/poseidon-framework.github.io/pull/54). Here is the summary Thiseas prepared for [our poster at the ISBA conference](https://blog.poseidon-adna.org/posts/isba2023poster.html): 22 | 23 | Community members can request new packages to be processed through the Minotaur workflow by submitting a build recipe as a pull request against a dedicated GitHub repository. This recipe is created from a sequencing source file (.ssf), describing the sequencing data for the package and where it can be downloaded. Using the recipe, the sequencing data gets processed via [nf-core/eager](https://nf-co.re/eager) on computational infrastructure of MPI-EVA, using a standardised, yet flexible, set of parameters. The generated genotypes, together with descriptive statistics of the sequencing data (Endogenous, Damage, Nr_SNPs, Contamination), are compiled into a Poseidon package, and made available to users in the minotaur-archive. 24 | 25 | The Minotaur workflow is a timely addition to the Poseidon framework, providing a flexible solution to wrap legacy and new data in uniformly processed packages. Homogeneous data processing puts us closer to our great comparandum, [the AADR dataset](https://reich.hms.harvard.edu/allen-ancient-dna-resource-aadr-downloadable-genotypes-present-day-and-ancient-dna-data). It also helped us to finalize the structure of our public archives, which emerged from long discussions about the kind of data we think the aDNA community requires for derived analyses.
26 | 27 | Right now the Minotaur workflow is still in a final development and testing phase, where we focus on the processes around it, that is the submission of recipes, their review and the forwarding of results to the minotaur-archive. One particularly tricky question is how context information in the .janno file should be passed from the community-archive to the new packages in the minotaur-archive. [One of the last pull requests](https://github.com/poseidon-framework/poseidon-hs/pull/282) for our software tool trident in 2023 aims to introduce a reliable mechanism to merge .janno files to address this issue. 28 | 29 | ## The public archives 30 | 31 | In 2023 we finally came to a conclusion on how to organize our public data archives. What emerged is a threefold division into what we call the community-archive, the minotaur-archive and the aadr-archive. The archives are described in more detail on the [website](https://www.poseidon-adna.org/#/archive_overview), but here's the gist of it: 32 | 33 | The [community-archive](https://github.com/poseidon-framework/community-archive) emerged from our old public-archive. It includes the legacy data we originally copied from the AADR. We now decided to use this archive for author-submitted publication-wise packages to collect the exact genotype data analysed in the respective papers. The idea is twofold: With the author-submitted genotype data the results in a given paper can be reproduced exactly. And the publication authors are generally the most trustworthy authority for the context data we collect in the .janno files, e.g. the spatiotemporal origin of the individual samples. Ayshin and I recently wrote about the submission process for the community-archive [here](https://mpi-eva-archaeogenetics.github.io/comp_human_adna_book/poseidon.html#contributing-to-the-community-archive). 34 | 35 | The [minotaur-archive](https://github.com/poseidon-framework/minotaur-archive) mirrors the community-archive in that it features publication-wise packages, usually even the very same as in the community-archive. To distinguish them clearly, package titles and sample-wise Poseidon_IDs in the minotaur-archive carry the suffix `_MNT`. As explained above, the packages in this archive include consistently reprocessed genotype data, run through the Minotaur workflow. 36 | 37 | The [aadr-archive](https://github.com/poseidon-framework/aadr-archive) is conceptually the simplest archive. It features “poseidonized” versions of releases of the AADR dataset, currently only the latest AADR v54.1.p1. We documented the code and decisions for the cleaning and packaging process [here](https://github.com/poseidon-framework/aadr2poseidon). 38 | 39 | 2023 not only saw the planning and setup of these three archives, but also a lot of work to fill them with life. For the community-archive that meant plenty of data cleaning by all of us, most notably Dhananjaya. And it also meant providing guidance for authors to submit their data. Thanks to the hard work of Ayshin a total of eleven author-submitted packages are available in the archive now. Number twelve was [submitted shortly before Christmas](https://github.com/poseidon-framework/community-archive/pull/151) and is awaiting review. The minotaur-archive is still functionally empty, but three packages [are pending](https://github.com/poseidon-framework/minotaur-archive/pulls) thanks to Thiseas and will hopefully soon be merged.
Preparing the latest version of the AADR dataset for the aadr-archive was one of the projects I tackled this year. 40 | 41 | ## The software tools 42 | 43 | The Poseidon software tools grew significantly more powerful this year. From a user perspective, 2023 brought various new features, changes to the command line interfaces and breaking updates in the Web-API. To keep track of the releases and the Poseidon schema versions they support, I created a [version overview table](https://www.poseidon-adna.org/#/version_table) on the website. 44 | 45 | With qjanno I added an entirely new tool to the set. It is a command line tool to run SQL queries on .janno (and arbitrary .csv and .tsv) files. I created it by forking the [qhs package](https://github.com/itchyny/qhs) and then adjusting it heavily for use on Poseidon packages. Just like trident, it is written in Haskell and openly available with precompiled executables [here](https://www.poseidon-adna.org/#/qjanno). 46 | 47 | Stephan invested a good amount of effort into consolidating the data analysis features in xerxes. He wrote a [whitepaper](https://github.com/poseidon-framework/poseidon-analysis-hs/blob/main/docs/xerxes_whitepaper.pdf) to explain and justify the reasoning behind the implemented logic for f-statistics, and another [blog post](https://blog.poseidon-adna.org/posts/xerxes_10.html) on how to run it. Even more approachable and comprehensive is a write-up he shared [here](https://mpi-eva-archaeogenetics.github.io/comp_human_adna_book/fstats.html). Together we worked on integrating the many changes to trident and its underlying poseidon-hs Haskell library into xerxes. 48 | 49 | Our main workhorse, trident, saw an astonishing number of new releases: from `v1.1.6.0` on January 8 to `v1.4.0.3` on October 30. I quickly went through the [extended changelogs](https://github.com/poseidon-framework/poseidon-hs/releases) published with each release to summarize the user-facing highlights of what trident supports now: 50 | 51 | - Arbitrary columns in the .janno file beyond the columns specified in the Poseidon schema (v1.1.6.0) 52 | - Specification of individuals with identical names from different source packages in the `trident forge` selection language (v1.1.7.0) 53 | - Validation of the entire genotype data in a package with `--fullGeno` in `trident validate` (v1.1.10.2) 54 | - Poseidon schema version v2.7.1 with validation of the .ssf file (v1.1.12.0) 55 | - A highly improved Poseidon [Web-API](https://www.poseidon-adna.org/#/web_api) that allows requesting individual (old) package versions (v1.2.0.0) 56 | - Reworked versions of `trident update`, now called `trident rectify`, and `trident validate`, which now allows validating not just entire packages, but also individual files (v1.3.0.4) 57 | - Selecting packages by version in the forge selection language and generally handling multiple package versions (v1.4.0.2, Stephan shared yet [another blog post](https://blog.poseidon-adna.org/posts/trident_14.html) about this release) 58 | 59 | As always I enjoyed the work on the software tools tremendously, especially in two cases: if one of our users reports an issue and we can address a concrete need with a release, and if the Haskell programming language allows for a particularly elegant solution for a given problem.
A [currently pending pull request](https://github.com/poseidon-framework/poseidon-hs/pull/283) combines both: Ayshin made me aware of some validation failure cases that require better error messages, and I found a neat way to provide just that with a custom-tailored monadic stack. 60 | 61 | ## Outreach 62 | 63 | The last domain where we made good progress in 2023 is public outreach. Naturally we invested hours in writing and updating documentation on the project website (<https://www.poseidon-adna.org>), but we also pursued a number of special projects beyond the basic, technical description of software and workflows. 64 | 65 | The first one of these was possible thanks to the effort of Dhananjaya, Stephan and me: We built [a page on the website](https://www.poseidon-adna.org/#/archive_explorer) where the data in the public archives can be easily explored. It makes use of our Web-API to access the data and display it with a sub-page for each package. Dhananjaya recently wrote [a blog post](https://blog.poseidon-adna.org/posts/Archive_explorer_Blogpost.html) about this. 66 | 67 | I already mentioned this blog multiple times above. It is indeed another great addition of 2023. Stephan created a separate website at <https://blog.poseidon-adna.org> to share news and short tutorials. Our wish has always been to gather an active and engaged community of users around Poseidon, and we hope to establish this blog as one of its central communication hubs: a major medium for longer write-ups beyond the technical documentation already available on the website. 68 | 69 | To announce our blog posts, software releases and other news, we fully switched from Twitter (now X) to the Fediverse in 2023. You can follow us here: . The switch came naturally, given the state of affairs at X. Submitting posts automatically is easier with Mastodon than it was with Twitter, and I made sure that this process works reliably for our software releases on GitHub. 70 | 71 | Beyond these technical novelties and online communication we also presented Poseidon at two in-person conferences in 2023: [ISBA10 in Tartu, Estonia](https://isba10.ut.ee) and the [NFDI4Objects community meeting in Berlin, Germany](https://www.nfdi4objects.net/index.php/en/get-informed/community-meeting-and-general-assembly). The poster we presented at both of these occasions was already mentioned above and is available [here](https://blog.poseidon-adna.org/posts/isba2023poster.html). And the slides for the talk Thiseas prepared for the latter should soon be made available by the NFDI4Objects team. 72 | 73 | ## Conclusion 74 | 75 | Much has happened for Poseidon in 2023 and I'm sure I'm not doing all of it due justice in this little summary. But I consider what is here already an impressive list that bears witness to the effort we put into the framework. And it seems to pay off: The user base is growing. More users help us in turn to find and address remaining issues and make Poseidon better for all of us. This will once more be one of my main aspirations in the coming year 2024.
-------------------------------------------------------------------------------- /posts/2020-04-02-covid-19.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "COVID-19: Estimates of true infections, case fatality and growth rates in Germany" 3 | author: Clemens Schmid and Stephan Schiffels 4 | origin: https://medium.com/stephan-schiffels/covid-19-estimates-of-true-infections-case-fatality-and-growth-rates-in-germany-383285f99966 5 | --- 6 | 7 | ***Acknowledgements**: We got some valuable input and corrections from Martin Lange and Johannes Boog (both Helmholtz Centre for Environmental Research Leipzig)* 8 | 9 | ***Disclaimer**: We have no epidemiological training and share these results without warranty of any kind. They should not be used as a basis for decision making and we refer to the respected authorities (e.g. for Germany the [Robert Koch Institute](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/nCoV.html)) for reliable information and models. This post is only an interesting exercise in data analysis.* 10 | 11 | ***Note**: Analyses in this post are from April 2nd, 2020, and naturally include only data from before that date.* 12 | 13 | The COVID-19 pandemic has taken its toll all around the world and caused (so far) hundreds of deaths in Germany. In this post we present current data and model estimations for multiple relevant parameters (e.g. the current number of real infections and the number of future deaths) for Germany. 14 | 15 | In the context of the [#WirvsVirus hackathon](https://www.bundesregierung.de/breg-de/themen/coronavirus/hackathon-der-bundesregierung-1733632) we started to work on the R package [covid19germany](https://github.com/nevrome/covid19germany) that allows downloading and visualizing the current numbers of confirmed cases and deaths by administrative units. We use this package to access the data for this post. The code for this post can be found [here](https://github.com/nevrome/covid19germany/tree/master/blog_post). Furthermore the package comes with a [webapp](https://nevrome.shinyapps.io/covid19germany) that allows exploring some of the following data and analyses in further detail — not just for the whole of Germany, but also for smaller administrative units as well as gender and age classes. 16 | 17 | ## Quick overview of COVID-19 in Germany (2020-04-01) 18 | 19 | The number of confirmed COVID-19 cases in Germany is rising daily, but it is unclear to which degree new infections are taking place or testing is simply catching up with past infection events. Germany may be one of the countries where testing covers a higher proportion of infected cases as its testing capacities are [comparatively good](https://time.com/5812555/germany-coronavirus-deaths). As testing will always lag behind the actual number of infected, it is still an unreliable estimator of the true dimensions of this pandemic. The number of deaths caused by COVID-19 is a more trustworthy indicator — though with a significant temporal delay. More about this later. 20 | 21 | ![+](/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp){width=100%} 22 | 23 | ![Evolution of new daily and cumulative cases in Germany by federated state (Bundesland)](/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp){width=100%} 24 | 25 | The increase in infections and deaths follows an expected acceleration trend due to exponential disease expansion with a growing number of spreaders.
Dips on the weekends, especially in the number of positive tests, might be an effect of reduced working hours and reduced information transmission in and by health care authorities. At first glance, it is not entirely clear from this data if the social distancing rules imposed by the federal and local governments during the last two weeks have had a significant effect on the spreading of COVID-19, but the recent decline in the number of daily deaths raises hope. 26 | 27 | ![+](/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp){width=100%} 28 | 29 | ![Maps of cumulative and relative deaths and confirmed cases in Germany by county (Landkreis)](/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp){width=100%} 30 | 31 | Western and Southern Germany have so far been more affected than Eastern Germany, with some individual counties (Landkreise) at the border to France, Czechia and Austria especially hard-hit. North Rhine-Westphalia, Bavaria and Baden-Württemberg — and therefore the federated states (Bundesländer) with the most inhabitants — have the most test-confirmed cases as well as deaths. A [dashboard](https://experience.arcgis.com/experience/478220a4c454480e823b17327b2bf1d4) provided by the RKI, the GeoHealth Center at Bonn University and ESRI gives a good overview of the official numbers, which are published on a daily basis. The RKI also releases a [daily report](https://www.rki.de/DE/Content/InfAZ/N/Neuartiges_Coronavirus/Situationsberichte/Gesamt.html?nn=13490888) with relevant information. 32 | 33 | ## Simple estimation based on systematic death lag 34 | 35 | It is generally a difficult task to estimate the true number of infected people during an epidemic outbreak. However, we learned about two methods to do so in [this](https://medium.com/@tomaspueyo/coronavirus-act-today-or-people-will-die-f4d3d9cd99ca) excellent post by Tomas Pueyo. 36 | 37 | One way is to focus on the current number of deaths. If we know the mean time it takes for an individual from infection to death (in case of death!) and the lethality (the general probability to die from COVID-19), then we can calculate an estimate of the number of infected people in the past. We have some information about these two parameters from [early scientific studies](https://github.com/midas-network/COVID-19/tree/master/parameter_estimates/2019_novel_coronavirus) about COVID-19. We will use a fixed value of 17 days for the time to death and two different values for the lethality: 1% and 5%. 38 | 39 | In the figure below, the estimate of the true number of infections for Germany is plotted with a line each for the two lethality scenarios. It can only be calculated for the past **before** the mean death time, which is indicated in the plot by a black vertical line. 40 | 41 | ![Estimated true number of infected based on the registered number of deaths (for constant death probabilities 1% and 5% and a mean time from infection to death of 17 days). The red line indicates the officially registered number of infected; blue vertical line indicates the last day for which we currently have data (yesterday); black vertical line demarks the time to which the true number of infected can be estimated (yesterday minus 17 days).
Data between black and blue vertical lines are predictions based on exponential growth](/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp){width=100%} 42 | 43 | The lower the lethality of COVID-19, the higher the number of actually infected people in the past must have been, given the number of deaths that occurred later. We highlight that this estimated statistic is at least one order of magnitude higher than the measured observation of confirmed cases shown with the red line in the plot. The sudden uptick of the latter at the end of February, which is well reflected in the estimated statistic, is very interesting. Keep in mind: The estimation is based on deaths, not on test results! This correlation is therefore a good indicator that the estimate reflects some truth and that the number assumed for the mean time from infection to death (17 days) is not totally off. 44 | 45 | Nevertheless this estimator by definition only provides information about the distant past (before the black vertical line). To extrapolate this statistic until yesterday (**after** the black and before the blue vertical line) we need another set of assumptions. In the simplest possible growth model the disease tends to spread in an exponential fashion with a certain time window until the number of infected doubles: the doubling time. We can take the last value **I₀** in our first statistic and extend it with a time series of exponential growth with 46 | 47 | **Iₜ** = **I₀** × **2**^(**t**/**d**) 48 | 49 | where **Iₜ** is the true number of infected individuals after the time **t**. **t** is counted in days from yesterday minus the mean number of days from infection to death. **d** is the aforementioned doubling time in days. 50 | 51 | The plot above shows three doubling time scenarios (3, 7 or 12 days) for each death probability scenario between the black and the blue vertical line (six scenarios in total). Some of them can already be ruled out considering the real-life testing data: They fall below the red curve. Others remain entirely possible. An increase of the doubling time is in all cases the desirable scenario, and the following weeks will reveal (with their death count) if the social distancing measures prove effective in achieving this. Nevertheless it is very likely that far more people are infected right now than testing is able to confirm. 52 | 53 | In a last step we can use the estimated infection counts to extrapolate the number of expected deaths in the near future (yesterday plus the mean number of days from infection to death) for the different doubling time scenarios. The lethality is not relevant for this particular approximation, because it already influenced the preceding calculation and therefore cancels out of the equation. 54 | 55 | ![Current number of deaths (red line) and predicted number of future deaths (black lines) based on an exponential growth model for the number of past infected](/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp){width=100%} 56 | 57 | If the number of cases that require intensive care rose above a certain threshold, the capacities of hospitals would inevitably run out and the lethality would increase further beyond these projections. This dire possibility became a grim reality in Northern Italy.
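To make the two-step logic above more tangible, here is a minimal R sketch that first estimates the past number of infections from the registered deaths and then extends it with the exponential growth formula. This is not the actual covid19germany code; the function name, the example death count and the default parameter values are made up purely for illustration.

```r {.numberLines}
# Minimal sketch of the two-step estimation (illustrative only)
estimate_true_infections <- function(cumulative_deaths,     # deaths registered today
                                     lethality     = 0.01,  # assumed probability to die from an infection
                                     days_to_death = 17,    # assumed mean lag from infection to death
                                     doubling_time = 7) {   # assumed doubling time d in days
  # step 1: the deaths registered today imply this many infections `days_to_death` days ago
  I0 <- cumulative_deaths / lethality
  # step 2: extrapolate from that day until today with I_t = I_0 * 2^(t/d)
  t <- 0:days_to_death
  data.frame(
    days_before_today  = rev(t),
    estimated_infected = I0 * 2^(t / doubling_time)
  )
}

# example with a made-up death count of 1000
estimate_true_infections(cumulative_deaths = 1000)
```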
58 | 59 | ## Estimation via Bayesian growth models 60 | 61 | To complement the analyses above and to make a more educated guess about the parameters visualized so far, we set up a Bayesian model to estimate the true number of infected people through time from both the reported deaths and the reported cases. This model was based on a slightly more complex notion of exponential growth with a built-in slow-down and includes the following assumptions: 62 | 63 | * A death rate of exactly 1% (we discuss deviations from this below) 64 | * A lag of 17 days between infection and death 65 | * A lag of 7 days between infection and confirmatory test 66 | * Exponential growth with a linear decrease of the growth rate due to the imposed social distancing measures 67 | 68 | Given these assumptions, we can estimate the true number of infections, as well as the reported number of test cases and deaths. A complete definition and analysis of this model can be found [here](https://rpubs.com/stschiff/bayesian_covid19_model). 69 | 70 | ![Model results for true (green) and confirmed cases (blue), as well as deaths (red). All three curves come from the same underlying Bayesian model and are estimated from the data (points)](/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp){width=100%} 71 | 72 | The model predictions (the colored “ribbons”) are shown together with the true reported cases (points). Because this is Bayesian inference, all model predictions are given with quantified uncertainty. Note that we have incorporated only data points between February 23 and April 1 in this analysis. Before that time, Germany had not yet experienced exponential growth. 73 | 74 | As already shown above, the true number of infections (dark green) based on a death rate of 1% far exceeds the number of confirmed cases. We highlight that this is due to two effects: First, the reported cases and deaths lag behind the true infections, and so under exponential growth we expect the true infections of today to be much higher than the reported ones, which reflect the infections of seven days ago. Second, it is clearly expected that not all people with an infection get tested, for example because they don’t show symptoms. 75 | 76 | One of the nice features of our model is that we get an explicit estimate of this miss-rate, but it depends linearly on the death-rate. In this case, we have assumed a death rate of 1%, and this yields — shockingly — a probability of getting tested between 12% and 24% only. That would mean that 76–88% of true infected cases are not tested. With a death rate of 3%, for example, the miss-rate would “only” be about 40–60%. So this is hard to estimate, but it’s clear we’re missing a lot! 77 | 78 | A significant complication in this regard is introduced by the age structure of the population, because we know that elderly people die with much higher probability from COVID-19 than young people. An important next step for this kind of modelling would be to incorporate more realistic death rates, possibly age-stratified. 79 | 80 | The specific growth model with linear slow-down seems to work OK for the data we have, although not perfectly. In particular, the slow-down in recent days seems to be stronger than modeled. This is somewhat expected, since the measures against the spread of the virus haven’t been “linear” in any way. Nevertheless, a linear slow-down is the first approximation to this process.
Based on this, we can again — and this time in a more sophisticated way — try to predict how many cases we will have in the coming weeks. This is of course highly speculative and depends on assumptions in the model. In fact, the uncertainty increases the further you predict into the future, which is visible in the widening of the model bands in the figure. For example, the number of reported cases on April 15 is predicted to be anywhere between 60,000 and 150,000 (though not with uniform probability) according to this model and its uncertainty today. The reported number of deaths by that time is predicted to be between 2700 and 6000 in Germany. These wide intervals simply reflect the limited power of the data to accurately estimate the parameters of the growth model. 81 | 82 | A popular choice to illustrate the speed of an exponential growth model is the doubling time in days, which we already employed as a static parameter in the simple model above. Our Bayesian inference now allows us to estimate this parameter as a dynamic property of the underlying growth model. Here it is over the course of the last few weeks with a short outlook into the next week: 83 | 84 | ![Estimate of the doubling time in days. The visible slow-down (seen as an increase in the doubling time) is estimated from the data](/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp){width=100%} 85 | 86 | So there definitely is some indication of a slow-down, with a doubling time of just around 2.5 days at the end of February and now a rate of around 5 days (the black line indicates the time of this writing), and a future prediction between 7 and 16 days in a week from now. This is interesting in light of [comments from officials](https://www.tagesspiegel.de/politik/merkels-coronavirus-vorgabe-zehn-tage-verdopplungszeit-wie-weit-ist-deutschland-davon-entfernt/25656826.html) that a doubling time of 10 days or more should be reached in order to not overwhelm the healthcare system. 87 | 88 | ## Conclusion and Outlook 89 | 90 | We highlight three main conclusions from our modelling: 91 | 92 | 1. The miss-rate, that is the probability for an infected person to not get tested, is one of the big unknowns in all countries currently. We can only estimate this number if strong assumptions on the death rate are made. Conversely, if the miss-rate were known better, this would allow a more accurate estimate of the death rate. One possibility to estimate the true prevalence would be representative random sampling from the population, which in fact is [planned](https://www.deutschlandfunk.de/covid-19-rki-will-dunkelziffer-der-coronavirus-infektionen.2850.de.html?drn%3Anews_id=1114345). 93 | 2. “Predicting” the epidemiological dynamics into the future remains highly speculative. With Bayesian analyses, the degree of the resulting uncertainty is at least partly “built into” the model. In our case, we showed that even with an arguably under-complex growth model with linear slow-down, the uncertainty on the number of infections in the future is very large, with predicted numbers varying by a factor of 10 or more. 94 | 3. One key, and perhaps simplifying, assumption in both our modelling attempts was the “lag” between infection and test or death, respectively. One way to make these models more correct is by incorporating more realistic data for the course of individual infections.
In reality, there is arguably a wide distribution of lag-times until symptoms, until test results, until death, while currently we assume these lag times to be fixed time periods. 95 | 96 | We hope that our work may trigger some feedback and motivation for others. It is very easy to get started on working with the data, for example by using our ready-to-use [R package](https://github.com/nevrome/covid19germany). A lot more analyses are possible when taking into account other data, some of which is provided in this package, including county-based information about population numbers, the number of hospital beds, and age structure. -------------------------------------------------------------------------------- /posts/2021-05-06-lambdar.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Haskell in R? An experiment with the R package lambda.r 3 | author: Clemens Schmid 4 | origin: https://nevrome.medium.com/haskell-in-r-an-experiment-with-the-r-package-lambda-r-78f21c0f9fe6 5 | --- 6 | 7 | *TL;DR: Feel free to directly jump to The lambda.r implementation if you only want to see that. The full code is posted at the end of the article.* 8 | 9 | Haskell and R are quite different programming languages. One is purely functional, statically typed and prominently features some of the most obscure abstractions in Computer Science. The other one lives at a particularly weird spot at the crossroads of the object-oriented, imperative and functional paradigms, has a ductile and dynamic type system and is optimized for the pragmatic needs of data analysis. 10 | 11 | But still these two languages share some interesting features. For example both can be run interactively in an interpreter environment. And both consider functions first-class citizens -- thus offering higher-order functions -- and allow the definition of custom infix operators. And that's why something like lambda.r is possible in the first place. 12 | 13 | [lambda.r](https://github.com/zatonovo/lambda.r) (here v.1.2.4) is an R package that provides syntax extensions to write functional, Haskell-like code in R. It implements an astonishing number of features including type and function definition, pattern matching, guard statements and even monads! True functional programming available at your fingertips in R. All while maintaining a surprisingly Haskell-like syntax and incorporating powerful bonus features from R. Even a custom debugging system is part of the package. 14 | 15 | The author Brian Lee Yung Rowe did an incredible job and also maintained the package over a commendable time span -- the first commit on GitHub is from 2012 and the last change was pushed in 2019. 16 | 17 | Of course the package has some known limitations and rough edges. In my opinion it's an extremely clever proof of concept and I enjoyed playing with it very much, but I'm not sure if I would recommend it for use in production. I'll leave that to you and instead show you what I managed to build with it. 18 | 19 | ## The experiment 20 | 21 | Recently I wanted to implement a simple but specific logic in a bioinformatics context — so this is a real-world example. But it would be tedious to explain the background, so I'll instead replace the entities with something more digestible: Apples. 22 | 23 | Let's say we have two sets of apple varieties and then a number of other fruit variety sets (varieties of pears, plums, strawberries, …).
The first apple collection is large and covers all sorts of types: Ambrosia, Granny Smith, Red Delicious, Jonagold, Rome, Honeycrisp and many more. The second apple collection is much smaller, but a strict subset of the first one. It only includes the three varieties Granny Smith, Red Delicious and Honeycrisp. We don't really care about the other fruits. 24 | 25 | ### Merging fruit variety sets in Haskell 26 | 27 | How could we model these sets in Haskell? We don't need to consider the individual varieties here. Only the variety collections. So we could create the type `FruitSet` with three data constructors for the three different relevant sets. For the sake of simplicity let's shorten their names to 28 | 29 | - LAS = Large Apple Set 30 | - SAS = Small Apple Subset 31 | - OFS = Other Fruit Set 32 | 33 | ```Haskell {.numberLines} 34 | data FruitSet = 35 | LAS 36 | | SAS 37 | | OFS 38 | deriving (Eq, Show) 39 | ``` 40 | 41 | Now about the issue we have to solve for these sets: We need a function that merges a list of fruit sets according to a very specific logic into only one output fruit set. This has to adhere to the following pair-wise (and undirected) merging rules: 42 | 43 | - If we merge two identical sets then the output should just be that set. That makes sense: Consider for example two Large Apple Sets. All the Ambrosia, Rome, Red Delicious and so forth apple varieties are present in both of the input sets in a pair-wise comparison. 44 | - If we merge any set with one of the Other Fruit Sets then the output should always be an Other Fruit Set. Of course: we have a weird mixture of species and fruit varieties afterwards. 45 | 46 | For the final two rules, we also have to consider two different kinds of merges: a union merge and an intersect merge. 47 | 48 | - If we merge a Large Apple Set and a Small Apple Subset with a union merge, then a Large Apple Set should be returned. That makes sense: The varieties in the small subset — Granny Smith, Red Delicious and Honeycrisp — are already part of the large superset. 49 | - If we merge a Large Apple Set and a Small Apple Subset with an intersect merge, then we should get a Small Apple Subset. That just follows the same logic as in the previous rule. 50 | 51 | I think these rules are an excellent application for pattern matching in Haskell. We could implement them in a function like this: 52 | 53 | ```Haskell {.numberLines} 54 | fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet 55 | fSMerge LAS LAS _ = LAS 56 | fSMerge SAS SAS _ = SAS 57 | fSMerge OFS _ _ = OFS 58 | fSMerge _ OFS _ = OFS 59 | fSMerge LAS SAS True = SAS 60 | fSMerge SAS LAS True = SAS 61 | fSMerge LAS SAS False = LAS 62 | fSMerge SAS LAS False = LAS 63 | ``` 64 | 65 | Even if you're not familiar with Haskell you may appreciate how the different pair-wise comparison cases are expressed here. The function takes two `FruitSet`s and a boolean to distinguish union (`False`) and intersect (`True`) merges. For many of these rules it does not even matter which kind of merge is applied. Here we can replace the pattern with the wildcard symbol "`_`". 66 | 67 | Now that we have these rules, we can also implement the function that applies them to an arbitrary list of `FruitSet`s to determine the appropriate superset. 68 | 69 | ```Haskell {.numberLines} 70 | fSMergeList :: [FruitSet] -> Bool -> FruitSet 71 | fSMergeList (x:xs) intersect = 72 | foldr (\a b -> fSMerge a b intersect) x xs 73 | ``` 74 | 75 | It uses a fold to combine the list elements into one.
[Folds](https://wiki.haskell.org/Fold) are operations that look at two elements of a list, apply some binary function to them, take the result and apply the same function again to that and a new list element, until only one result remains and the list is exhausted. Folds usually need a starting value that also serves as an "accumulator" to track the list-condensing result along the fold's way through the list. 76 | 77 | Here I used Haskell's clever pattern matching on lists (`x:xs`) to separate the input list's head and tail. That makes it straightforward to set the head element as the starting value for the fold. We will see below that lambda.r is less elegant here. 78 | 79 | Finally we can test our code: 80 | 81 | ```haskell {.numberLines} 82 | fSMergeList [LAS] True 83 | -- LAS 84 | fSMergeList [LAS, LAS] True 85 | -- LAS 86 | fSMergeList [LAS, LAS, SAS] True 87 | -- SAS 88 | fSMergeList [LAS, LAS, SAS] False 89 | -- LAS 90 | fSMergeList [LAS, LAS, OFS] False 91 | -- OFS 92 | ``` 93 | 94 | Works like a charm! Let's compare that with lambda.r now. 95 | 96 | ## The lambda.r implementation 97 | 98 | lambda.r provides some functions, mostly clever infix operators, to enable a Haskell-like logic and syntax in R. To access them we have to install and load the package first. 99 | 100 | ```r {.numberLines} 101 | install.packages("lambda.r") 102 | library(lambda.r) 103 | ``` 104 | 105 | Just as in the Haskell code above we have to find a way to represent fruit sets. With lambda.r, types are defined by their constructor functions. Each function has a name and input arguments separated from a return value or operation with the `%as%` infix operator. 106 | 107 | ```r {.numberLines} 108 | FruitSet("LAS") %as% "LAS" 109 | FruitSet("SAS") %as% "SAS" 110 | FruitSet("OFS") %as% "OFS" 111 | ``` 112 | 113 | A distinction of type and data constructor as in Haskell does not exist to my knowledge. Nor are there nullary data constructors ("constants"). So I decided to be creative and use pattern matching on strings to simulate a data type for different fruit sets. lambda.r understands this syntax perfectly fine and prints the resulting type as follows: 114 | 115 | ```r {.numberLines} 116 | FruitSet 117 | ``` 118 | 119 | ``` 120 | [[1]] 121 | FruitSet("LAS") %:=% ... 122 | [[2]] 123 | FruitSet("SAS") %:=% ... 124 | [[3]] 125 | FruitSet("OFS") %:=% ... 126 | ``` 127 | 128 | With that data type we can define the pair-wise merging logic as laid out above. 129 | 130 | ```r {.numberLines} 131 | fsMerge(a, b, intersect) %::% FruitSet : FruitSet : logical : FruitSet 132 | fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS") 133 | fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS") 134 | fsMerge("OFS", b, intersect) %as% FruitSet("OFS") 135 | fsMerge(a, "OFS", intersect) %as% FruitSet("OFS") 136 | fsMerge("LAS", "SAS", TRUE ) %as% FruitSet("SAS") 137 | fsMerge("SAS", "LAS", TRUE ) %as% FruitSet("SAS") 138 | fsMerge("LAS", "SAS", FALSE ) %as% FruitSet("LAS") 139 | fsMerge("SAS", "LAS", FALSE ) %as% FruitSet("LAS") 140 | ``` 141 | 142 | Note how extremely similar this syntax is to Haskell. The type interface definition follows exactly the same principle, short of some minor deviations: `::` becomes `%::%` in R and `->` is replaced by `:`. R has [some limitations](https://stackoverflow.com/questions/24697248/is-it-possible-to-define-operator-without/24698311#24698311) regarding infix operators. 143 | 144 | **One key take-away is that this function will not run with input that is not exactly as specified.
lambda.r thus introduces a static type system into R.** 145 | 146 | The pattern matching in the function definition is just as in Haskell, except of course for a number of syntactic details like the parentheses, commas, string-based values and lack of explicit wildcards. It's another language after all! 147 | 148 | With this function implemented, we only lack the last component: The function to apply the pair-wise comparisons with a fold on a list of `FruitSet`s. And here things start to become a bit more tricky, unfortunately. Let's start with the result: 149 | 150 | ```r {.numberLines} 151 | fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet 152 | fsMergeList(xs, intersect) %as% 153 | Reduce( 154 | function(a, b) { fsMerge(a, b, intersect) }, 155 | xs[tail(seq_along(xs), n = -1)], 156 | init = xs[[1]] 157 | ) 158 | ``` 159 | 160 | The general structure is again very Haskell-like. For the folding we use the `Reduce` function from the R base package (which is something like the [Prelude](https://hackage.haskell.org/package/base-4.15.0.0/docs/Prelude.html) in Haskell). One major difference between lambda.r and Haskell, though, is that lambda.r lacks a good default way to handle lists. Maybe I just missed the relevant documentation or overlooked something else, but I struggled a bit with that. 161 | 162 | In the end I decided to come up with my own list type. 163 | 164 | ```r {.numberLines} 165 | FruitSetList(...) %::% FruitSet... : FruitSetList 166 | FruitSetList(...) %as% asFruitSetList(list(...))
asFruitSetList(xs) %::% list : FruitSetList 167 | asFruitSetList(xs) %as% { 168 | class(xs) <- c("FruitSetList") 169 | xs 170 | } 171 | ``` 172 | 173 | This constructor makes use of the [Ellipsis type](https://github.com/zatonovo/lambda.r#the-ellipsis-type) "`...`", a weird feature of R, well integrated into lambda.r: a single input argument that can represent a set of multiple arguments. In lambda.r it can be combined with a type constraint to make sure that the function takes an arbitrary number of arguments, but only of this type. So here, of type `FruitSet`. 174 | 175 | That allows for a pretty cool constructor syntax: 176 | 177 | ```r {.numberLines} 178 | FruitSetList(FruitSet("LAS"), FruitSet("SAS"), FruitSet("OFS")) 179 | ``` 180 | 181 | ``` 182 | [[1]]
[1] "LAS" 183 | attr(,"class") 184 | [1] "FruitSet" "character" 185 | [[2]] 186 | [1] "SAS" 187 | attr(,"class") 188 | [1] "FruitSet" "character" 189 | [[3]] 190 | [1] "OFS" 191 | attr(,"class") 192 | [1] "FruitSet" "character"
attr(,"class") 193 | [1] "FruitSetList" 194 | ``` 195 | 196 | Unfortunately I found no direct way to catch the ellipsis and make it a `FruitSetList`. With `list(...)` I could indeed transform it to a list, but that's only half the job. I resorted to the rather ugly `asFruitSetList` that "manually" adds the "FruitSetList" label to the class attribute of the output object. That works because lambda.r utilizes [R S3 classes](http://adv-r.had.co.nz/S3.html) for its magic. 197 | 198 | With that out of the way there was still one issue to address. I could not use Haskell's pattern matching on lists to separate the head and tail elements for the `Reduce` input. It's easy to get the first element of a list in R, but the tail requires some more advanced indexing: 199 | 200 | ```r {.numberLines} 201 | xs[tail(seq_along(xs), n = -1)] 202 | ``` 203 | 204 | All issues should be solved now.
It's time for a final test run of our code: 205 | 206 | ```r {.numberLines} 207 | fsMergeList(FruitSetList(FruitSet("LAS")), TRUE) 208 | # [1] "LAS" 209 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS")), TRUE) 210 | # [1] "LAS" 211 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), TRUE) 212 | # [1] "SAS" 213 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), FALSE) 214 | # [1] "LAS" 215 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("OFS")), FALSE) 216 | # [1] "OFS" 217 | ``` 218 | 219 | Excellent! The syntax is more verbose than the one in Haskell, but the results are the same. 220 | 221 | ## Recap 222 | 223 | - Haskell and R are both versatile languages with large communities that regularly suggest and discuss new abstractions. Haskell is a real innovation machine and carries many functional programming concepts into other languages. 224 | - lambda.r is a syntax extension to make some of the power of Haskell (or similar functional programming languages) available in R. 225 | - lambda.r works and is extremely fun to play with, but it's pretty verbose and lacks (at least to my understanding) a good list implementation. I also suspect it is not optimized for performance — probably quite the opposite. 226 | 227 | I personally would love to see some of the concepts demonstrated with lambda.r find their way into regular base R. Especially a way to switch on static typing! That could avoid a lot of unexpected behavior. R interfaces often feel flimsy and not as rock solid as comparable code in Haskell. The approach lambda.r took here -- e.g. with the [Don't-Care Type](https://github.com/zatonovo/lambda.r#the-dont-care-type) `.`, which I did not introduce -- could be a way to combine dynamic and static typing. Ideally we want more sturdy interfaces without sacrificing R's great flexibility for rapid prototyping. 228 | 229 | *Acknowledgements: I got some valuable feedback from my colleague James Fellows Yates (@jfy133) for this post.* 230 | 231 | *** 232 | 233 | Haskell: 234 | 235 | ```haskell {.numberLines} 236 | data FruitSet = 237 | LAS 238 | | SAS 239 | | OFS 240 | deriving (Eq, Show) 241 | 242 | fSMergeList :: [FruitSet] -> Bool -> FruitSet 243 | fSMergeList (x:xs) intersect = foldr (\a b -> fSMerge a b intersect) x xs 244 | 245 | fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet 246 | fSMerge LAS LAS _ = LAS 247 | fSMerge SAS SAS _ = SAS 248 | fSMerge OFS _ _ = OFS 249 | fSMerge _ OFS _ = OFS 250 | fSMerge LAS SAS True = SAS 251 | fSMerge SAS LAS True = SAS 252 | fSMerge LAS SAS False = LAS 253 | fSMerge SAS LAS False = LAS 254 | ``` 255 | 256 | R: 257 | 258 | ```r {.numberLines} 259 | library(lambda.r) 260 | 261 | FruitSet("LAS") %as% "LAS" 262 | FruitSet("SAS") %as% "SAS" 263 | FruitSet("OFS") %as% "OFS" 264 | 265 | FruitSetList(...) %::% FruitSet... : FruitSetList 266 | FruitSetList(...)
%as% asFruitSetList(list(...)) 267 | 268 | asFruitSetList(xs) %::% list : FruitSetList 269 | asFruitSetList(xs) %as% { 270 | class(xs) <- c("FruitSetList") 271 | xs 272 | } 273 | 274 | fsMerge(a, b, intersect) %::% FruitSet : FruitSet : logical : FruitSet 275 | fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS") 276 | fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS") 277 | fsMerge("OFS", b, intersect) %as% FruitSet("OFS") 278 | fsMerge(a, "OFS", intersect) %as% FruitSet("OFS") 279 | fsMerge("LAS", "SAS", TRUE ) %as% FruitSet("SAS") 280 | fsMerge("SAS", "LAS", TRUE ) %as% FruitSet("SAS") 281 | fsMerge("LAS", "SAS", FALSE ) %as% FruitSet("LAS") 282 | fsMerge("SAS", "LAS", FALSE ) %as% FruitSet("LAS") 283 | 284 | fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet 285 | fsMergeList(xs, intersect) %as% 286 | Reduce( 287 | function(a, b) { fsMerge(a, b, intersect) }, 288 | xs[tail(seq_along(xs), n = -1)], 289 | init = xs[[1]] 290 | ) 291 | ```
13 | 19 | 20 | 29 |
30 | 31 |
32 |

Poseidon end-of-year review 2023

33 |
34 |
35 | Posted 36 | 37 | originally here 38 | 39 | on December 31, 2023 40 | 41 | by Clemens Schmid 42 | 43 |
44 |
45 |

It’s late December and the time of the year when work slows down in my part of the world. For many of us an opportunity to take a break and to look back, contemplating the achievements of the year. I decided to do so as well and write a bit about Poseidon.

46 |

What follows is a subjective account of the events in and around the framework in 2023 - each of my colleagues in the core team (Stephan Schiffels, Ayshin Ghalichi, Thiseas C. Lamnidis, Dhananjaya B. A. Mudiyanselage, Wolfgang Haak and I, Clemens Schmid) would probably emphasise different developments in such a write-up. That is in itself an achievement, because it shows how much the tech-stack, domains and services in our little ecosystem have grown this year: beyond the understanding of each of us individually.

47 |

The Poseidon schema

48 |

Let’s start simple with the two new releases of the Poseidon schema we published this year: v2.7.0 and v2.7.1. They were published in short succession in March and May, the latter only slightly improving the sequencing source files (.ssf) added in the first. See the changelog here for more details, but the addition of the .ssf file is indeed their most remarkable contribution to the schema. With it we addressed a major desideratum and unresolved question in previous versions of Poseidon: How should genotype data be linked to the raw sequencing data on the European Nucleotide Archive (ENA) and other archives of the International Nucleotide Sequence Database Collaboration (INSDC)?

49 |

The .ssf file is, I would argue, a smart solution for this question. It specifies the same variables already used in the ENA database, allows for an extremely flexible, yet not arbitrary n:m connection between the entities in a Poseidon package and the raw data products and it can be generated semi-automatically for most of the data in our public archives. With some tweaking it can also be used to organize local data repositories independent of any online databases. The .ssf file is finally the very foundation on top of which the amazing Minotaur workflow is built (see below).

50 |

Generally, both the fact that only two Poseidon releases were necessary this year and that we could treat them as non-breaking changes indicate that we reached a certain level of maturity and stability in the schema. Of course we still have ideas how to extend it further in the future, but at the moment I’m optimistic that we can maintain long-term backwards compatibility. The process in which we discussed, specified and later improved the .ssf file definition to then see Minotaur be erected on top of it was a very satisfying professional experience for me personally.

51 |

The Minotaur workflow

52 |

The Minotaur workflow is a semi-automatic workflow to reproducibly process published sequencing data into Poseidon packages. Developing this entirely new branch of the Poseidon ecosystem became possible because Thiseas joined the Poseidon core team in 2023. He came up with a sophisticated, yet open and transparent implementation of this process, in which authors and the community as a whole retain control over the data and the data processing parameters. A full write-up for the website is in progress. Here is the summary Thiseas prepared for our poster at the ISBA conference:

53 |

Community members can request new packages to be processed through the Minotaur workflow by submitting a build recipe as a pull request against a dedicated GitHub repository. This recipe is created from a sequencing source file (.ssf), describing the sequencing data for the package and where it can be downloaded. Using the recipe, the sequencing data gets processed via nf-core/eager on computational infrastructure of MPI-EVA, using a standardised, yet flexible, set of parameters. The generated genotypes, together with descriptive statistics of the sequencing data (Endogenous, Damage, Nr_SNPs, Contamination), are compiled into a Poseidon package, and made available to users in the minotaur-archive.

54 |

The Minotaur workflow is a timely addition to the Poseidon framework, providing a flexible solution to wrap legacy and new data in uniformly processed packages. Homogeneous data processing puts us closer to our great comparadum, the AADR dataset. It also helped us to finalize the structure of our public archives, which emerged from long discussions about the kind of data we think the aDNA community requires for derived analyses.

55 |

Right now the Minotaur workflow is still in a final development and testing phase, where we focus on the processes around it, so the submission of recipes, their review and the forwarding of results to the minotaur-archive. One particular tricky question is how context information in the .janno file should be passed from the community-archive to the new packages in the minotaur-archive. One of the last pull requests for our software tool trident in 2023 aims to introduce a reliable mechanism to merge .janno files to address this issue.

56 |

The public archives

57 |

In 2023 we finally came to a conclusion on how to organize our public data archives. What emerged is a threefold division into what we call the community-archive, the minotaur-archive and the aadr-archive. The archives are described in more detail on the website, but here’s the gist of it:

58 |

The community-archive emerged from our old public-archive. It includes the legacy data we originally copied from the AADR. We have now decided to use this archive for author-submitted, publication-wise packages that collect the exact genotype data analysed in the respective papers. The idea is twofold: with the author-submitted genotype data, the results in a given paper can be reproduced exactly; and the publication authors are generally the most trustworthy authority for the context data we collect in the .janno files, e.g. the spatiotemporal origin of the individual samples. Ayshin and I recently wrote about the submission process for the community-archive here.

59 |

The minotaur-archive mirrors the community-archive in that it features publication-wise packages, usually even the very same ones as in the community-archive. To distinguish them clearly, package titles and sample-wise Poseidon_IDs in the minotaur-archive carry the suffix _MNT. As explained above, the packages in this archive include consistently reprocessed genotype data, run through the Minotaur workflow.

60 |

The aadr-archive is the conceptually simplest archive. It features “poseidonized” versions of releases of the AADR dataset, currently only the latest AADR v54.1.p1. We documented the code and decisions for the cleaning and packaging process here.

61 |

2023 not only saw the planning and setup of these three archives, but also a lot of work to fill them with life. For the community-archive that meant plenty of data cleaning by all of us, most notably Dhananjaya. And it also meant providing guidance for authors to submit their data. Thanks to the hard work of Ayshin, a total of eleven author-submitted packages are available in the archive now. Number twelve was submitted shortly before Christmas and is awaiting review. The minotaur-archive is still functionally empty, but three packages are pending thanks to Thiseas and will hopefully soon be merged. Preparing the latest version of the AADR dataset for the aadr-archive was one of the projects I tackled this year.

62 |

The software tools

63 |

The Poseidon software tools grew significantly more powerful this year. From a user perspective, 2023 brought various new features, changes to the command line interfaces and breaking updates in the Web-API. To keep track of the releases and the Poseidon schema versions they support, I created a version overview table on the website.

64 |

With qjanno I added an entirely new tool to the set. It is a command line tool to run SQL queries on .janno (and arbitrary .csv and .tsv) files. I created it by forking the qsh package and then adjusting it heavily for use on Poseidon packages. Like trident, it is written in Haskell and openly available with precompiled executables here.
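Purely for illustration, a query could look roughly like the following; the exact invocation and file-reference syntax should be checked in the qjanno documentation, and the file name is made up:

```bash
qjanno "SELECT Poseidon_ID, Country FROM 2018_ExamplePaper.janno WHERE Country = 'Germany'"
```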

65 |

Stephan invested a good amount of effort into consolidating the data analysis features in xerxes. He wrote a whitepaper to explain and justify the reasoning behind the implemented logic for f-statistics, and another blog post on how to run it. Even more approachable and comprehensive is a write-up he shared here. Together we worked on integrating the many changes to trident and its underlying poseidon-hs Haskell library into xerxes.
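For orientation, a typical f-statistics run with xerxes looks roughly like the sketch below; I am reproducing the interface from memory, so the flag names and the statistic notation are assumptions that should be verified against the whitepaper and the xerxes documentation:

```bash
xerxes fstats -d path/to/poseidon/packages --stat "F3(French, Han, Mbuti)"
```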

66 |

Our main workhorse, trident, saw an astonishing number of new releases: from v1.1.6.0 on January 8 to v1.4.0.3 on October 30. I quickly went through the extended changelogs published with each release to summarize the user-facing highlights of what trident supports now (a brief usage sketch follows after the list):

67 |
    68 |
  • Arbitrary columns in the .janno file beyond the columns specified in the Poseidon schema (v1.1.6.0)
  • 69 |
  • Specification of individuals with identical names from different source packages in the trident forge selection language (v1.1.7.0)
  • 70 |
  • Validation of the entire genotype data in a package with --fullGeno in trident validate (v1.1.10.2)
  • 71 |
  • Poseidon schema version v2.7.1 with validation of the .ssf file (v1.1.12.0)
  • 72 |
  • A highly improved Poseidon Web-API that allows requesting individual (old) package versions (v1.2.0.0)
  • 73 |
  • Reworked versions of trident update, now called trident rectify, and trident validate, which now allows validating not just entire packages, but also individual files (v1.3.0.4)
  • 74 |
  • Selecting packages by version in the forge selection language and generally handling multiple package versions (v1.4.0.2, Stephan shared yet another blog post about this release)
  • 75 |
76 |
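As a rough usage sketch of some of these features (the --fullGeno flag is taken from the list above; the remaining flags and the selection string follow my memory of the trident interface and should be double-checked against the documentation; package and individual names are placeholders):

```bash
# validate packages below a directory, including the complete genotype data
trident validate -d path/to/packages --fullGeno

# forge a new package from a package/individual selection
trident forge -d path/to/packages -f "*2018_ExamplePackage*,<Individual1>" -o path/to/newPackage
```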

As always I enjoyed the work on the software tools tremendously, especially in two cases: when one of our users reports an issue and we can address a concrete need with a release, and when the Haskell programming language allows for a particularly elegant solution to a given problem. A currently pending pull request combines both: Ayshin made me aware of some validation failure cases that require better error messages, and I found a neat way to provide just that with a custom-tailored monadic stack.
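Just to illustrate what a custom-tailored stack for this kind of task could look like (a sketch of mine, not the actual poseidon-hs code): one option is ExceptT for aborting on fatal validation errors, layered over WriterT for accumulating human-readable log messages.

```haskell
import Control.Monad.Trans.Class  (lift)
import Control.Monad.Trans.Except (ExceptT, runExceptT, throwE)
import Control.Monad.Trans.Writer (WriterT, runWriterT, tell)

-- Hypothetical error type, just for this illustration.
newtype ValidationError = MissingColumn String deriving Show

type Validator a = ExceptT ValidationError (WriterT [String] IO) a

-- A toy check: log what we do and fail with a descriptive error if a
-- required column is missing.
checkJannoColumns :: [String] -> Validator ()
checkJannoColumns cols = do
  lift $ tell ["checking " ++ show (length cols) ++ " .janno columns"]
  if "Poseidon_ID" `elem` cols
    then return ()
    else throwE (MissingColumn "Poseidon_ID")

runValidator :: Validator a -> IO (Either ValidationError a, [String])
runValidator = runWriterT . runExceptT
```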

77 |

Outreach

78 |

The last domain where we made good progress in 2023 is public outreach. Naturally we invested hours in writing and updating documentation on the project website (https://www.poseidon-adna.org), but we also pursued a number of special projects beyond the basic, technical description of software and workflows.

79 |

The first of these was possible thanks to the efforts of Dhananjaya, Stephan and me: We built a page on the website where the data in the public archives can be easily explored. It makes use of our Web-API to access the data and displays it with a sub-page for each package. Dhananjaya recently wrote a blog post about this.

80 |

I already mentioned this blog multiple times above. It is indeed another great addition of 2023. Stephan created a separate website at https://blog.poseidon-adna.org to share news and short tutorials. Our wish has always been to gather an active and engaged community of users around Poseidon, and we hope to establish this blog as one of its central communication hubs: a medium for longer write-ups beyond the technical documentation already available on the website.

81 |

To announce our blog posts, software releases and other news we fully switched from Twitter (now X) to the Fediverse in 2023. You can follow us here: https://ecoevo.social/@poseidon. The switch came naturally, given the state of affairs at X. Submitting posts automatically is easier with Mastodon than it was with Twitter, and I made sure that this process works reliably for our software releases on GitHub.

82 |

Beyond these technical novelties and online communication we also presented Poseidon at two in-person conferences in 2023: ISBA10 in Tartu, Estonia, and the NFDI4Objects community meeting in Berlin, Germany. The poster we presented on both of these occasions was already mentioned above and is available here. And the slides for the talk Thiseas prepared for the latter should soon be made available by the NFDI4Objects team.

83 |

Conclusion

84 |

Much has happened for Poseidon in 2023 and I’m sure I’m not doing all of it due justice in this little summary. But I consider what is here already an impressive list that bears witness to the effort we put into the framework. And it seems to pay off: The user base is growing. More users, in turn, help us find and address remaining issues and make Poseidon better for all of us. This will once more be one of my main aspirations in the coming year 2024.

85 |
86 |
87 | 88 |
89 | 90 |
91 | Site generated by 92 | Hakyll 93 |
94 | 95 | 96 | -------------------------------------------------------------------------------- /posts/2021-12-05-shake-II.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Workflow management with Haskell Shake II: Showcase" 3 | author: Clemens Schmid 4 | origin: https://medium.com/@nevrome/my-workflow-automation-journey-discovering-shake-haskell-5c270b93ff2b 5 | --- 6 | 7 | *This is part II of a two part blog post. See [part I](/posts/2021-12-05-shake-I.html) for the story how I discovered Shake.* 8 | 9 | *GitHub repository with the code for this showcase: * 10 | 11 | ## Using Shake 12 | 13 | [Shake](https://shakebuild.com/) is a build system like [make](https://www.gnu.org/software/make/), so software to organize the compilation of large software projects. That’s why its [manual](https://shakebuild.com/manual) fully focuses on building C code. In my perception building software and managing a data analysis pipeline are very similar tasks, though: in the end you want to run every script necessary to get a certain product, and it does not matter much, if that product are crosscompiled executables or a set of plots. 14 | 15 | The Shake homepage [does a good](https://shakebuild.com/why) job in listing the advantages it has over its competitors. Here are three aspects I find particularly appealing about it: 16 | 17 | * **“Pull-based”**: Shake starts from the desired end product and figures out, which scripts it has to run to reach a certain result. If I modify a script, it only rebuilds everything that depends on it downstream. 18 | * **Fast and parallel**: Compiling and running the massive, 600 line Shakefile I need for my current main project feels fast and responsive. It’s incredibly satisfying to see Shake plow through independent scripts in parallel. 19 | * **Configurable**: Shake is a library with a simple interface, [extensive documentation](https://hackage.haskell.org/package/shake-0.19.6/docs/Development-Shake.html) and useful [configuration options](https://hackage.haskell.org/package/shake-0.19.6/docs/Development-Shake.html#g:5). It boils down to idiomatic Haskell code, fully adjustable to your needs. 20 | 21 | To illustrate how it works, I want to present a basic example in the following section ([Code on GitHub](https://github.com/nevrome/ShakeExperiment)). 22 | 23 | ### A simple Shakefile 24 | 25 | Let’s imagine a workflow like this: 26 | 27 | ``` 28 | raw_input.csv --> A.R - 29 | \ 30 | -> C.R --> 3D.png 31 | / 32 | B.R - 33 | ``` 34 | 35 | We have three _.R_ scripts: **A**, **B** and **C**. **A** requires an input _.csv_ file, **B** is independent of **A**, and **C** requires the intermediate output of **A** and **B** to produce our desired, final output _3D.png_. 36 | 37 | ![3D.png: Output of our example pipeline](/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp){width=100%} 38 | 39 | In our file system this looks like this: 40 | 41 | ``` 42 | . 43 | ├── input 44 | │ └── raw_input.csv 45 | └── scripts 46 | ├── A.R 47 | ├── B.R 48 | └── C.R 49 | ``` 50 | 51 | Now let’s add a “Shakefile”, so a script that expresses our tiny pipeline with Shake. This boils down to a Haskell script with a `main` method, which describes the interaction of these files in a way Shake can parse and understand. 52 | 53 | In my opinion the most easy way to run an independent Haskell script is via the [Stack script interpreter](https://docs.haskellstack.org/en/stable/GUIDE/#script-interpreter). 
So if we have stack installed on our system, we can create a new script file _Shakefile.hs_ and append these two lines to the top: 54 | 55 | ```haskell {.numberLines} 56 | #!/usr/bin/env stack 57 | -- stack --resolver lts-18.7 script --package shake 58 | ``` 59 | 60 | If we later run our script with _./Shakefile.hs_, stack will automatically download and prepare the necessary dependencies: the [Glasgow Haskell Compiler](https://www.haskell.org/ghc/) and the Shake package. That allows us to import modules with functions and data types from Shake. 61 | 62 | ```haskell {.numberLines} 63 | import Development.Shake 64 | import Development.Shake.Command 65 | import Development.Shake.FilePath 66 | ``` 67 | 68 | Finally we can define our main method like this: 69 | 70 | ```haskell {.numberLines} 71 | main :: IO () 72 | main = shake shakeOptions {shakeFiles = "_build"} $ do
want [ "output" </> "3D.png" ] 73 | 74 | "output" </> "3D.png" %> \out -> do 75 | let script = "scripts" </> "C.R" 76 | dataFiles = [ "intermediate" </> "dens_surface.RData", 77 | "intermediate" </> "colours.RData" 78 | ] 79 | need $ script : dataFiles 80 | cmd_ "Rscript" script
"intermediate" </> "dens_surface.RData" %> \out -> do 81 | let script = "scripts" </> "A.R" 82 | dataFiles = [ "input" </> "raw_input.csv" ] 83 | need $ script : dataFiles 84 | cmd_ "Rscript" script
"intermediate" </> "colours.RData" %> \out -> do 85 | let script = "scripts" </> "B.R" 86 | need [ script ] 87 | cmd_ "Rscript" script 88 | ``` 89 | 90 | I don’t want to get lost in the intricate details of Haskell and the Shake interface here, so it shall be enough to say that the function 91 | 92 | ```haskell {.numberLines} 93 | shake :: ShakeOptions -> Rules () -> IO () 94 | ``` 95 | 96 | called at the very beginning of the `main` method takes a configuration type `ShakeOptions` and a set of rules -- which can be written with the Monad instance and do-notation -- and evaluates them and the actions within them in a meaningful order. 97 | 98 | This is how one of these rules looks: 99 | 100 | ```haskell {.numberLines} 101 | "intermediate" </> "dens_surface.RData" %> \out -> do 102 | let script = "scripts" </> "A.R" 103 | dataFiles = [ "input" </> "raw_input.csv" ] 104 | need $ script : dataFiles 105 | cmd_ "Rscript" script 106 | ``` 107 | 108 | Each rule has output files (here: _dens_surface.RData_ in the directory _intermediate_) and requires input files (here: the script **A.R** and _input/raw_input.csv_). Finally, it also has some mechanism that connects input and output, for example a command to run a specific script that takes the input and yields the output (here: `cmd_ "Rscript" script`). 109 | 110 | In a Shakefile you write all rules necessary to fully represent your pipeline. The rest is pure magic: Shake runs all scripts in the right order, creates missing directories and carefully keeps track of the state of each input and output file. 111 | 112 | ```bash 113 | $ ./Shakefile1.hs 114 | # Rscript (for intermediate/colours.RData) 115 | # Rscript (for intermediate/dens_surface.RData) 116 | # Rscript (for output/3D.png) 117 | ``` 118 | 119 | After running our toy example, our directory will look like this, now full of output files: 120 | 121 | ``` 122 | .
123 | ├── _build 124 | ├── input 125 | │ └── raw_input.csv 126 | ├── intermediate 127 | │ ├── colours.RData 128 | │ └── dens_surface.RData 129 | ├── output 130 | │ └── 3D.png 131 | ├── scripts 132 | │ ├── A.R 133 | │ ├── B.R 134 | │ └── C.R 135 | └── Shakefile1.hs 136 | ``` 137 | 138 | _\_build_ is where Shake stores its knowledge and puts intermediate files for itself. You should certainly add it to your _.gitignore_ file if you work with Git, just like the _intermediate_ and _output_ directories, which are created by the pipeline. 139 | 140 | As a small experiment and to test Shake’s power, we can edit one of the scripts. **B**.R only produces a colour vector to be used in the plotting function in **C**.R, so it’s an easy target for modification. And indeed: If we edit one of the colours there and run our script again, it only runs **B** and **C**, producing a new, nifty _3D.png_. Brilliant! 141 | 142 | ```bash 143 | $ ./Shakefile1.hs 144 | # Rscript (for intermediate/colours.RData) 145 | # Rscript (for output/3D.png) 146 | ``` 147 | 148 | ![3D.png: Output of our example pipeline after a change in B.R](/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp){width=100%} 149 | 150 | ### Adjustments for my needs and convenience 151 | 152 | Our very simple Shake script is already fulfilling its basic purpose. The pipeline is fully defined and runs when we execute the Shakefile. 153 | 154 | But some more advanced elements I personally need for my actual workflows are missing (e.g. support for singularity and our in-house HPC system). Shake itself also has some neat configuration options to explore. And finally the versatility of Haskell should allow us to rewrite the core pipeline mechanics in shorter and clearer syntax. So: We have some room for improvement, and I wanted to dive deeper into that.
155 | 156 | Here’s a refactored version of the script above: 157 | 158 | ```haskell {.numberLines} 159 | #!/usr/bin/env stack 160 | -- stack --resolver lts-18.7 script --package shake 161 | 162 | import Development.Shake 163 | import Development.Shake.Command 164 | import Development.Shake.FilePath 165 | 166 | data Settings = Settings { 167 | singularityContainer :: FilePath 168 | , bindPath :: String 169 | , qsubCommand :: String 170 | } 171 | 172 | mpiEVAClusterSettings = Settings { 173 | singularityContainer = "singularity_experiment.sif" 174 | , bindPath = "--bind=/mnt/archgen/users/schmid" 175 | , qsubCommand = "qsub -sync y -b y -cwd -q archgen.q \ 176 | \-pe smp 1 -l h_vmem=10G -now n -V -j y \ 177 | \-o ~/log -N example" 178 | } 179 | 180 | relevantRunCommand :: Settings -> FilePath -> Action () 181 | relevantRunCommand (Settings singularityContainer bindPath qsubCommand) x 182 | | takeExtension x == ".R" = cmd_ qsubCommand 183 | "singularity" "exec" bindPath singularityContainer "Rscript" x 184 | | takeExtension x == ".sh" = cmd_ qsubCommand 185 | "singularity" "exec" bindPath singularityContainer x 186 | 187 | infixl 8 %$ 188 | (%$) :: FilePath -> ([FilePath], [FilePath]) -> Rules () 189 | (%$) script (inFiles, outFiles) = 190 | let settings = mpiEVAClusterSettings 191 | in outFiles &%> \out -> do 192 | need $ [script, singularityContainer settings] ++ inFiles 193 | relevantRunCommand settings script 194 | 195 | infixl 9 --> 196 | (-->) :: a -> b -> (a,b) 197 | (-->) x y = (x,y) 198 | 199 | input x = "input" </> x 200 | intermediate x = "intermediate" </> x 201 | scripts x = "scripts" </> x 202 | output x = "output" </> x 203 | 204 | main :: IO () 205 | main = shake shakeOptions { 206 | shakeFiles = "_build" 207 | , shakeThreads = 3 208 | , shakeChange = ChangeModtime 209 | , shakeProgress = progressSimple 210 | , shakeColor = True 211 | , shakeVerbosity = Verbose 212 | , shakeTimings = True 213 | } $ do 214 | want [output "3D.png"] 215 | scripts "A.R" %$ 216 | [input "raw_input.csv"] --> [intermediate "dens_surface.RData"] 217 | scripts "B.R" %$ 218 | [ ] --> [intermediate "colours.RData"] 219 | scripts "C.R" %$ 220 | map intermediate ["dens_surface.RData", "colours.RData"] --> 221 | [output "3D.png"] 222 | ``` 223 | 224 | There’s plenty to unpack here. So let’s pull it apart, starting with the new files I added to our simple setup above. 225 | 226 | ``` 227 | . 228 | ├── input 229 | │ └── raw_input.csv 230 | ├── scripts 231 | │ ├── A.R 232 | │ ├── B.R 233 | │ └── C.R 234 | ├── Shakefile2.hs 235 | ├── singularity_build_sif.sh 236 | ├── singularity_experiment.def 237 | └── singularity_experiment.sif 238 | ``` 239 | 240 | Specifically for [Singularity](https://sylabs.io/guides/2.6/user-guide/quick_start.html) I added three files: _singularity_build_sif.sh_ is a bash script to build the singularity image file _singularity_experiment.sif_ as defined in _singularity_experiment.def_: 241 | 242 | ```Dockerfile {.numberLines} 243 | Bootstrap: docker 244 | From: rocker/r-base:4.1.0
%post 245 | # install the necessary R packages 246 | R --slave -e 'install.packages("MASS")' 247 | ``` 248 | 249 | This simple configuration file describes a reproducible, self-sufficient computational environment with R v4.1.0 and only one additional R package (MASS). Singularity is very [well integrated with docker](https://sylabs.io/guides/2.6/user-guide/singularity_and_docker.html) -- here I build directly on top of a [rocker](https://www.rocker-project.org/) image.
As I don’t want to get lost in singularity here, I’ll leave it at that, and instead jump right into the new Shakefile. 250 | 251 | ### **Rules that don’t hurt the eyes** 252 | 253 | I think the build rule creation syntax in Shake is an eyesore -- as you can see in the first Shakefile above. For my new Shakefile I wrote a wrapper that expresses rules more clearly. 254 | 255 | Let’s start with the new operator `%$`, which encapsulates Shake’s `%>`: 256 | 257 | ```haskell {.numberLines} 258 | (%$) :: FilePath -> ([FilePath], [FilePath]) -> Rules () 259 | (%$) script (inFiles, outFiles) = 260 | let settings = mpiEVAClusterSettings 261 | in outFiles &%> \out -> do 262 | need $ [script, singularityContainer settings] ++ inFiles 263 | relevantRunCommand settings script 264 | ``` 265 | 266 | It allows us to write rules in an -- in my opinion -- much more idiomatic way: 267 | 268 | ```haskell {.numberLines} 269 | script %$ ([input files], [output files]) 270 | ``` 271 | 272 | The tuple `([],[])` to express input and output files in the second argument still feels a bit awkward, so I added an operator `-->` to express tuple creation more neatly. Using an arrow for that of course only makes sense in the pipeline context we’re covering here. To make sure that the two new operators are actually evaluated in the correct order, we have to manually set their [fixity](https://kowainik.github.io/posts/fixity#fixity-declaration). 273 | 274 | ```haskell {.numberLines} 275 | (-->) :: a -> b -> (a,b) 276 | (-->) x y = (x,y)
infixl 8 %$ 277 | infixl 9 --> 278 | ``` 279 | 280 | That boils rule creation down to some wonderful syntax: 281 | 282 | ```haskell {.numberLines} 283 | script %$ [input files] --> [output files] 284 | ``` 285 | 286 | The horrible 287 | 288 | ```haskell {.numberLines} 289 | "intermediate" </> "colours.RData" %> \out -> do 290 | let script = "scripts" </> "B.R" 291 | need [ script ] 292 | cmd_ "Rscript" script 293 | ``` 294 | 295 | becomes a much more pleasant 296 | 297 | ```haskell {.numberLines} 298 | scripts "B.R" %$ [ ] --> [intermediate "colours.RData"] 299 | ```
The example here is simplified, but true to the real setup I typically use: 321 | 322 | ```haskell {.numberLines} 323 | data Settings = Settings { 324 | singularityContainer :: FilePath 325 | , bindPath :: String 326 | , qsubCommand :: String 327 | }
mpiEVAClusterSettings = Settings { 328 | singularityContainer = "singularity_experiment.sif" 329 | , bindPath = "--bind=/mnt/archgen/users/schmid" 330 | , qsubCommand = "qsub -sync y -b y -cwd -q archgen.q \ 331 | \-pe smp 1 -l h_vmem=10G -now n -V -j y \ 332 | \-o ~/log -N example" 333 | } 334 | ``` 335 | 336 | For my real production code, the settings data type is a bit more complex and features additional elements -- for example different cluster submission commands for different computing power requirements. 337 | 338 | You see that the building of the singularity image itself is not part of the pipeline. Building it requires `sudo` permissions, and -- more fundamentally -- building it every time would undermine reproducibility: The recipe in the _.def_ file requires multiple different online servers to be available and to always provide specific versions of certain software dependencies. In a way, the singularity image should be considered a stable input data file, so nothing to be produced on the fly. 339 | 340 | This approach to environment management and configuration is bare-bones. I like the flexibility that comes with it, but I also see the appeal of a higher level of abstraction as provided by e.g. [nextflow’s executors](https://www.nextflow.io/docs/latest/executor.html). 341 | 342 | ### Shake options 343 | 344 | Shake itself comes with a number of easily configurable options for how it should run. They are set in the record type `shakeOptions`, as described [here](https://hackage.haskell.org/package/shake-0.19.6/docs/Development-Shake.html#g:5). These are the ones I modified for this example: 345 | 346 | ```haskell {.numberLines} 347 | shakeOptions { 348 | shakeFiles = "_build" 349 | , shakeThreads = 3 350 | , shakeChange = ChangeModtime 351 | , shakeProgress = progressSimple 352 | , shakeColor = True 353 | , shakeVerbosity = Verbose 354 | , shakeTimings = True 355 | } 356 | ``` 357 | 358 | * **shakeFiles**: The directory used for storing Shake metadata files. We already used that option above. 359 | * **shakeThreads**: The maximum number of rules to run in parallel. In our pipeline there are only three rules, and one depends on two others, so three is literally more than enough for maximum speed. 360 | * **shakeChange**: How should Shake determine if a file has changed? The [data type](https://hackage.haskell.org/package/shake-0.19.6/docs/Development-Shake.html#t:Change) `Change` has multiple constructors, including the default `ChangeModtime`, which causes Shake to invalidate files based on timestamps, or alternatively `ChangeDigest`, which does so via checksums. 361 | * **shakeProgress**: How progress should be reported when the pipeline is running. `progressSimple` is a basic default, but there is an [entire datatype](https://hackage.haskell.org/package/shake-0.19.6/docs/Development-Shake.html#t:Progress) `Progress` to specify configuration options. 362 | * **shakeColor**: Whether to colorize the command line output. 363 | * **shakeVerbosity**: How verbose the command line output should be. A [data type](https://hackage.haskell.org/package/shake-0.19.6/docs/Development-Shake.html#t:Verbosity) `Verbosity` controls the different possible levels. 364 | * **shakeTimings**: Print timing information for each stage at the end.
365 | 366 | There is more to discover among these options and beyond in the mechanisms Shake provides. Fortunately the library is [quite extensively documented](https://hackage.haskell.org/package/shake-0.19.6/docs/Development-Shake.html). 367 | 368 | ## Conclusion 369 | 370 | Thanks for bearing with me until here. I wrote this post partly to document my decision process in this matter, but also to bring across one major and two minor points: 371 | 372 | * **Workflow managers are useful** even for small projects. Check if a tool like nextflow, snakemake or target (or whatever you prefer!) can make your daily work easier, faster and more reproducible. I find it relieving if I can be sure, that all my plots represent the latest stage of work in every script. 373 | * **Shake is a powerful tool**, if you know some Haskell. It’s flexible, very well written and elaborately documented. 374 | * **Haskell is a beautiful language** to express logic in a concise, yet clear way. Its custom operators can reduce repetitive code to a minimum. 375 | 376 | ***Acknowledgements:** I got some valuable feedback by my colleague Alexander Hübner (@alexhbnr) for this post.* -------------------------------------------------------------------------------- /_site/posts/2020-04-02-covid-19.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Clemens' blog - COVID-19: Estimates of true infections, case fatality and growth rates in Germany 8 | 9 | 10 | 11 | 12 |
13 | 19 | 20 | 29 |
30 | 31 |
32 |

COVID-19: Estimates of true infections, case fatality and growth rates in Germany

33 |
34 |
35 | Posted 36 | 37 | originally here 38 | 39 | on April 2, 2020 40 | 41 | by Clemens Schmid and Stephan Schiffels 42 | 43 |
44 |
45 |

Acknowledgements: We got some valuable input and corrections from Martin Lange and Johannes Boog (both Helmholtz Centre for Environmental Research Leipzig)

46 |

Disclaimer: We have no epidemiological training and share these results without warranty of any kind. They should not be used as a basis for decision making and we refer to the respected authorities (e.g. for Germany the Robert Koch Institute) for reliable information and models. This post is only an interesting exercise in data analysis.

47 |

Note: Analyses in this post are from April 2nd, 2020, and naturally include only data from before that date.

48 |

The COVID-19 pandemic has taken its toll all around the world and caused (so far) hundreds of deaths in Germany. In this post we present current data and model estimations for multiple relevant parameters (e.g. current number of real infections and number of future deaths) for Germany.

49 |

In the context of the #WirvsVirus hackathon we started to work on the R package covid19germany, which allows downloading and visualizing the current numbers of confirmed cases and deaths by administrative unit. We use this package to access the data for this post. The code for this post can be found here. Furthermore the package comes with a webapp that allows exploring some of the following data and analyses in further detail — not just for the whole of Germany, but also for smaller administrative units as well as gender and age classes.

50 |

Quick overview about COVID-19 in Germany (2020–04-01)

51 |

The number of confirmed COVID-19 cases in Germany is rising daily, but it is unclear to what degree new infections are taking place or testing is simply catching up with past infection events. Germany may be one of the countries where testing covers a higher proportion of infected cases, as its testing capacities are comparatively good. But as testing will always lag behind the actual number of infected, it remains an unreliable estimator of the true dimensions of this pandemic. The number of deaths caused by COVID-19 is a more trustworthy indicator — though with a significant temporal delay. More about this later.

52 |
53 | + 54 | 55 |
56 |
57 | Evolution of new daily and cumulative cases in Germany by federated state (Bundesland) 58 | 59 |
60 |

The increase in infections and deaths follows an expected acceleration trend due to exponential disease expansion with a growing number of spreaders. Dips on the weekends, especially in the number of positive tests, might be an effect of reduced working hours and reduced information transmission in and by health care authorities. At first glance, it is not entirely clear from this data if the social distancing rules imposed by the federal and local governments during the last two weeks have had a significant effect on the spreading of COVID-19, but the recent decline in the number of daily deaths raises hope.

61 |
62 | + 63 | 64 |
65 |
66 | Maps of cumulative and relative deaths and confirmed cases in Germany by county (Landkreis) 67 | 68 |
69 |

Western and Southern Germany have so far been more affected than Eastern Germany, with some individual counties (Landkreise) at the border to France, Czechia and Austria especially compromised. North Rhine-Westphalia, Bavaria and Baden-Württemberg — and therefore the federated states (Bundesländer) with the most inhabitants — have the most test-confirmed cases as well as deaths. A dashboard provided by the RKI, the GeoHealth Center at Bonn University and ESRI gives a good overview of the official numbers, which are published on a daily basis. The RKI also releases a daily report with relevant information.

70 |

Simple estimation based on systematic death lag

71 |

It generally is a difficult task to estimate the true number of infected people during an epidemic outbreak. However, we learned about two methods to do so in this excellent post by Tomas Pueyo.

72 |

One way is to focus on the current number of deaths. If we know the mean time from infection to death for an individual (in case of death!) and the lethality (the general probability to die from COVID-19), then we can calculate an estimate of the number of infected people in the past. We have some information about these two parameters from early scientific studies about COVID-19. We will use a fixed value of 17 days for the time to death and two different values for the lethality: 1% and 5%.

73 |

In the figure below, the estimate of the true number of infections for Germany is plotted with a line each for the two lethality scenarios. It can only be calculated for the past before the mean death time, which is indicated in the plot by a black, vertical line.

74 |
75 | Estimated true number of infected based on the registered number of deaths (for constant death probabilities 1% and 5% and a mean time from infection to death of 17 days). The red line indicates the officially registered number of infected; blue vertical line indicates the last day for which we currently have data (yesterday); black vertical line demarks the time to which the true number of infected can be estimated (yesterday minus 17 days). Data between black and blue vertical lines are predictions based on exponential growth 76 | 77 |
78 |

The lower the lethality of COVID-19, the higher the number of actually infected people in the past must have been, given the number of deaths that occurred later. We highlight that this estimated statistic is at least one order of magnitude higher than the measured observation of confirmed cases shown with the red line in the plot. Very interesting is the sudden uptick of the latter at the end of February, which is well reflected in the estimated statistic. Keep in mind: The estimation is based on deaths, not on test results! This correlation is therefore a good indicator that the estimate reflects some truth and that the number assumed for the mean time from infection to death (17 days) is not totally off.

79 |

Nevertheless this estimator by definition only provides information about the distant past (before the black, vertical line). To extrapolate this statistic until yesterday (after the black and before the blue, vertical line) we need another set of assumptions. In the simplest possible growth model the disease tends to spread in an exponential fashion with a certain time window until the number of infected doubles: the doubling time. We can take the last value I₀ in our first statistic and extend it with a time series of exponential growth with

80 |

Iₜ = I₀ x 2^(t/d)

81 |

where Iₜ is the true number of infected individuals after the time t. t is counted in days from yesterday minus the mean number of days from infection to death. d is the aforementioned doubling time in days.
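As a minimal sketch of the two estimation steps in code (a toy illustration with invented numbers, not the code behind the figures, which is linked above):

```haskell
-- Estimated true infections 17 days ago, from today's cumulative deaths and
-- an assumed lethality (e.g. 0.01 or 0.05).
pastInfections :: Double -> Double -> Double
pastInfections deathsToday lethality = deathsToday / lethality

-- Extrapolate that estimate t days forward under exponential growth with
-- doubling time d (both in days): I_t = I_0 * 2^(t/d).
extrapolate :: Double -> Double -> Double -> Double
extrapolate i0 d t = i0 * 2 ** (t / d)

main :: IO ()
main = do
  -- e.g. 1000 deaths today and 1% lethality imply ~100000 infections 17 days ago
  let i0 = pastInfections 1000 0.01
  -- projected up to yesterday (17 days later) with a 7-day doubling time
  print (extrapolate i0 7 17)
```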

82 |

The plot above shows three doubling time scenarios (3, 7 or 12 days) for each death probability scenario between the black and the blue vertical line (six scenarios in total). Some of them can already be ruled out considering the real-life testing data: they fall below the red curve. Others remain entirely possible. An increase of the doubling time is in all cases the desirable scenario, and the following weeks will reveal (with their death count) if the social distancing measures prove effective in achieving this. Nevertheless it is very likely that far more people are infected right now than testing is able to confirm.

83 |

In a last step we can use the estimated infection counts to extrapolate the number of expected deaths in the near future (yesterday plus the mean number of days from infection to death) for the different doubling time scenarios. The lethality is not relevant for this particular approximation, because it already influenced the preceding calculation and is therefore removed from the equation.

84 |
85 | Current number of deaths (red line) and predicted number of future deaths (black lines) based on an exponential growth model for the number of past infected 86 | 87 |
88 |

If the number of cases that require intensive care rose above a certain threshold, the capacities of hospitals would inevitably run out and the lethality would increase further, beyond these projections. This dire possibility became a grim reality in Northern Italy.

89 |

Estimation via Bayesian growth models

90 |

To complement the analyses above and to make a more educated guess about the parameters visualized so far, we set up a Bayesian model to estimate the true number of infected people through time from both the reported deaths and the reported cases. This model was based on a slightly more complex notion of exponential growth with a built-in slow-down and includes the following assumptions:

91 |
    92 |
  • A death rate of exactly 1% (we discuss deviations from this below)
  • 93 |
  • A lag of 17 days between infection and death
  • 94 |
  • A lag of 7 days between infection and confirmatory test
  • 95 |
  • Exponential growth with a linear decrease of the growth rate due to the imposed social distancing measures
  • 96 |
97 |

Given these assumptions, we can estimate the true number of infections, as well as the reported number of test cases and deaths. A complete definition and analysis of this model can be found here.
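The growth assumption itself can be written down compactly. The following is my own minimal illustration of “exponential growth with a linearly decreasing growth rate”, not the actual Bayesian implementation linked above:

```haskell
-- Growth rate at day t, decreasing linearly from r0 with slope a:
-- r(t) = r0 - a * t
growthRate :: Double -> Double -> Double -> Double
growthRate r0 a t = r0 - a * t

-- Integrating r(t) gives the infection curve:
-- I(t) = I0 * exp(r0 * t - a * t^2 / 2)
infections :: Double -> Double -> Double -> Double -> Double
infections i0 r0 a t = i0 * exp (r0 * t - a * t * t / 2)

-- The corresponding instantaneous doubling time in days: log 2 / r(t)
doublingTime :: Double -> Double -> Double -> Double
doublingTime r0 a t = log 2 / growthRate r0 a t
```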

98 |
99 | Model results for true (green) and confirmed cases (blue), as well as deaths (red). All three curves come from the same underlying Bayesian model and are estimated from the data (points) 100 | 101 |
102 |

The model predictions (the colored “ribbons”) are shown together with the true reported cases (points). Because this is Bayesian inference, all model predictions are given with quantified uncertainty. Note that we have incorporated only data points between February 23 and April 1 in this analysis. Before that time, Germany did not experience exponential growth yet.

103 |

As already shown above, the true number of infections (dark green) based on a death rate of 1% far exceeds the number of confirmed cases. We highlight that this is due to two effects: First, the reported cases and deaths lag behind the true infections, and so under exponential growth we expect the true infections of today to be much higher than the reported ones, which reflect the infections of seven days ago. Second, it is clearly expected that not all people with an infection get tested, for example because they don’t show symptoms.

104 |

One of the nice features of our model is that we get an explicit estimate of this miss-rate, but it depends linearly on the death-rate. In this case, we have assumed a death rate of 1%, and this yields — shockingly — a probability of getting tested between 12% and 24% only. That would mean that 76–88% of true infected cases are not tested. With a death rate of 3%, for example, the miss-rate would “only” be about 40–60%. So this is hard to estimate, but it’s clear we’re missing a lot!

105 |

A significant complication in this regard is introduced by the age structure of the population, because we know that elderly people die with much higher probability from COVID-19 than young people. An important next step for this kind of modelling would be to incorporate more realistic death rates, possibly age-stratified.

106 |

The specific growth model with linear slow-down seems to work OK for the data we have, although not perfectly. In particular, the slow-down in recent days seems to be stronger than modeled. This is somewhat expected, since the measures against the spread of the virus haven’t been “linear” in any way. Nevertheless, a linear slow-down is the first approximation to this process. Based on this, we can again — and this time in a more sophisticated way — try to predict how many cases we will have in the coming weeks. This is of course highly speculative and depends on assumptions in the model. In fact, the uncertainty increases the further you predict into the future, which is visible in the widening of the model bands in the figure. For example, the number of reported cases on April 15 is predicted to be anywhere between 60,000 and 150,000 (though not with uniform probability) according to this model and its uncertainty today. The reported number of deaths by that time is predicted to be between 2700 and 6000 in Germany. These wide intervals simply reflect the limited power of the data to accurately estimate the parameters of the growth model.

107 |

A popular choice to illustrate the speed of an exponential growth model is the doubling time in days, which we already employed as a static parameter in the simple model above. Our Bayesian inference now allows us to estimate this parameter as a dynamic property of the underlying growth model. Here it is over the course of the last few weeks, with a short outlook into the next week:

108 |
109 | Estimate of the doubling time in days. The visible slow-down (seen as an increase in the doubling time) is estimated from the data 110 | 111 |
112 |

So there definitely is some indication of a slow-down, with a doubling time of just around 2.5 days at the end of February, a value of around 5 days now (the black line indicates the time of this writing), and a future prediction of between 7 and 16 days in a week from now. This is interesting in light of comments from officials that a doubling time of 10 days or more should be reached in order to not overwhelm the healthcare system.

113 |

Conclusion and Outlook

114 |

We highlight three main conclusions from our modelling:

115 |
    116 |
  1. The miss-rate, that is, the probability for an infected person to not get tested, is one of the big unknowns in all countries currently. We can only estimate this number if strong assumptions on the death rate are made. Conversely, if the miss-rate were known better, this would allow a more accurate estimate of the death rate. One possibility to estimate the true prevalence would be representative random sampling from the population, which in fact is planned.
  2. 117 |
  3. “Predicting” the epidemiological dynamics into the future remains highly speculative. With Bayesian analyses, the degree of the resulting uncertainty is at least partly “built into” the model. In our case, we showed that even with an arguably under-complex growth model with linear slow-down, the uncertainty on the number of infections in the future is very large, with predicted numbers varying by a factor of 10 or more.
  4. 118 |
  5. One key, and perhaps over-simplifying, assumption in both our modelling attempts was the “lag” between infection and test or death, respectively. One way to make these models more correct is by incorporating more realistic data for the course of individual infections. In reality, there is arguably a wide distribution of lag times until symptoms, until test results and until death, while currently we assume these lag times to be fixed time periods.
  6. 119 |
120 |

We hope that our work may trigger some feedback and motivation for others. It is very easy to get started on working with the data, for example by using our ready-to-use R package. A lot more analyses are possible when taking into account other data, some of which is provided in this package, including county-based information about population numbers, the number of hospital beds, and age structure.

121 |
122 |
123 | 124 |
125 | 126 |
127 | Site generated by 128 | Hakyll 129 |
130 | 131 | 132 | -------------------------------------------------------------------------------- /_site/posts/2021-05-06-lambdar.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Clemens' blog - Haskell in R? An experiment with the R package lambda.r 8 | 9 | 10 | 11 | 12 |
13 | 19 | 20 | 29 |
30 | 31 |
32 |

Haskell in R? An experiment with the R package lambda.r

33 |
34 |
35 | Posted 36 | 37 | originally here 38 | 39 | on May 6, 2021 40 | 41 | by Clemens Schmid 42 | 43 |
44 |
45 |

TL;DR: Feel free to directly jump to The lambda.r implementation if you only want to see that. The full code is posted at the end of the article.

46 |

Haskell and R are quite different programming languages. One is purely functional, statically typed and prominently features some of the most obscure abstractions in Computer Science. The other one lives at a particularly weird spot at the crossroad of the object-oriented, imperative and functional paradigms, has a ductile and dynamic type system and is optimized for the pragmatic needs of data analysis.

47 |

But still these two languages share some interesting features. For example both can be run interactively in an interpreter environment. And both consider functions first-class citizens – thus offering higher-order functions – and allow the definition of custom infix operators. And that’s why something like lambda.r is possible in the first place.

48 |

lambda.r (here v.1.2.4) is an R package that provides syntax extensions to write functional, Haskell-like code in R. It implements an astonishing number of features including type and function definition, pattern matching, guard statements and even monads! True functional programming available at your fingertips in R. All while maintaining a surprisingly Haskell-like syntax and incorporating powerful bonus features from R. Even a custom debugging system is part of the package.

49 |

The author Brian Lee Yung Rowe did an incredible job and also maintained the package over a commendable time span – the first commit on Github is from 2012 and the last change was pushed 2019.

50 |

Of course the package has some known limitations and rough edges. In my opinion it’s an extremely clever proof of concept and I enjoyed very much playing with it, but I’m not sure if I would recommend it for use in production. I’ll leave that to you and instead show you what I managed to build with it.

51 |

The experiment

52 |

Recently I wanted to implement a simple but specific logic in a bioinformatics context — so this is a real world example. But it would be tedious to explain the background, so I’ll instead replace the entities with something more digestible: Apples.

53 |

Let’s say we have two sets of apple varieties and then a number of other fruit variety sets (varieties of pears, plums, strawberries, …). The first apple collection is large and covers all sorts of types: Ambrosia, Granny Smith, Red Delicious, Jonagold, Rome, Honeycrisp and many more. The second apple collection is much smaller, but a strict subset of the first one. It only includes the three varieties Granny Smith, Red Delicious and Honeycrisp. We don’t really care about the other fruits.

54 |

Merging fruit variety sets in Haskell

55 |

How could we model these sets in Haskell? We don’t need to consider the individual varieties here. Only the variety collections. So we could create the type FruitSet with three data constructors for the three different relevant sets. For the sake of simplicity let’s shorten their names to

56 |
    57 |
  • LAS = Large Apple Set
  • 58 |
  • SAS = Small Apple Subset
  • 59 |
  • OFS = Other Fruit Set
  • 60 |
61 |
data FruitSet =
 62 |     LAS
 63 |   | SAS
 64 |   | OFS
 65 |   deriving (Eq, Show)
66 |

Now about the issue we have to solve for these sets: We need a function that merges a list of fruit sets according to a very specific logic into only one output fruit set. This has to adhere to the following pair-wise (and undirected) merging rules:

67 |
    68 |
  • If we merge two identical sets then the output should just be that set. That makes sense: Consider for example two Large Apple Sets. All the Ambrosia, Rome, Red Delicious and so forth apple varieties are present in both of the input sets in a pair-wise comparison.
  • 69 |
  • If we merge any set with one of the Other Fruit Sets then the output should always be an Other Fruit Set. Of course: we have a weird mixture of species and fruit varieties afterwards.
  • 70 |
71 |

For the final two rules, we also have to consider two different kinds of merges: A union merge and an intersect merge.

72 |
    73 |
  • If we merge a Large Apple Set and a Small Apple Subset with a union merge, then a Large Apple Set should be returned. That makes sense: The varieties in the small subset — Granny Smith, Red Delicious and Honeycrisp — are already part of the large superset.
  • 74 |
  • If we merge a Large Apple Set and a Small Apple Subset with an intersect merge, then we should get a Small Apple Subset. That just follows the same logic as in the previous rule.
  • 75 |
76 |

I think these rules are an excellent application for pattern matching in Haskell. We could implement them in a function like this:

77 |
fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet
 78 | fSMerge LAS LAS _     = LAS
 79 | fSMerge SAS SAS _     = SAS
 80 | fSMerge OFS _   _     = OFS
 81 | fSMerge _   OFS _     = OFS
 82 | fSMerge LAS SAS True  = SAS
 83 | fSMerge SAS LAS True  = SAS
 84 | fSMerge LAS SAS False = LAS
 85 | fSMerge SAS LAS False = LAS
86 |

Even if you’re not familiar with Haskell you may appreciate how the different pair-wise comparison cases are expressed here. The function takes two FruitSets and a Boolean to distinguish union (False) and intersect (True) merges. For many of these rules it does not even matter which kind of merge is applied. Here we can replace the pattern with the wildcard symbol “_”.

87 |

Now that we have these rules, we can also implement the function that applies them to an arbitrary list of FruitSets to determine the appropriate superset.

88 |
fSMergeList :: [FruitSet] -> Bool -> FruitSet
 89 | fSMergeList (x:xs) intersect = 
 90 |   foldr (\a b -> fSMerge a b intersect) x xs
91 |

It uses a fold to combine the list elements into one. Folds are operations that look at two elements of a list, apply some binary function to them, take the result and apply the same function again to that result and a new list element, until only one result remains and the list is gone. Folds usually need a starting value that also serves as an “accumulator” to track the list-condensing result along the fold’s way through the list.
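To see how this plays out with the functions defined above, here is the fold unrolled by hand for a small three-element input:

```haskell
-- fSMergeList [LAS, OFS, SAS] True
--   = foldr (\a b -> fSMerge a b True) LAS [OFS, SAS]
--   = fSMerge OFS (fSMerge SAS LAS True) True
--   = fSMerge OFS SAS True
--   = OFS
```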

92 |

Here I used Haskell’s clever pattern matching on lists (x:xs) to separate the input list’s head and tail. That makes it straightforward to set the head element as the starting value for the fold. We will see below that lambda.r is less elegant here.

93 |

Finally we can test our code:

94 |
fSMergeList [LAS] True
 95 | -- LAS
 96 | fSMergeList [LAS, LAS] True
 97 | -- LAS
 98 | fSMergeList [LAS, LAS, SAS] True
 99 | -- SAS
100 | fSMergeList [LAS, LAS, SAS] False
101 | -- LAS
102 | fSMergeList [LAS, LAS, OFS] False
103 | -- OFS
104 |

Works like a charm! Let’s compare that with lambda.r now.

105 |

The lambda.r implementation

106 |

lambda.r provides some functions, mostly clever infix operators, to enable a Haskell-like logic and syntax in R. To access them we have to install and load the package first.

107 |
install.packages("lambda.r")
108 | library(lambda.r)
109 |

Just as in the Haskell code above we have to find a way to represent fruit sets. With lambda.r, types are defined by their constructor functions. Each function has a name and input arguments separated from a return value or operation with the %as% infix operator.

110 |
FruitSet("LAS") %as% "LAS"
111 | FruitSet("SAS") %as% "SAS"
112 | FruitSet("OFS") %as% "OFS"
113 |

A distinction of type and data constructor as in Haskell does not exist to my knowledge. Also no nullary data constructor (“constants”). So I decided to be creative and use pattern matching on strings to simulate a data type for different fruit sets. lambda.r understands this syntax perfectly fine and prints the resulting type as follows:

114 |
<type constructor>
115 |
[[1]]
116 | FruitSet("LAS") %:=% ...
117 | [[2]]
118 | FruitSet("SAS") %:=% ...
119 | [[3]]
120 | FruitSet("OFS") %:=% ...
121 |

With that data type we can define the pair-wise merging logic as laid out above.

122 |
fsMerge(a,b,intersect) %::% FruitSet : FruitSet : logical : FruitSet
123 | fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS")
124 | fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS")
125 | fsMerge("OFS", b,     intersect) %as% FruitSet("OFS")
126 | fsMerge(a,     "OFS", intersect) %as% FruitSet("OFS")
127 | fsMerge("LAS", "SAS", TRUE     ) %as% FruitSet("SAS")
128 | fsMerge("SAS", "LAS", TRUE     ) %as% FruitSet("SAS")
129 | fsMerge("LAS", "SAS", FALSE    ) %as% FruitSet("LAS")
130 | fsMerge("SAS", "LAS", FALSE    ) %as% FruitSet("LAS")
131 |

Note how extremely similar this syntax is to Haskell. The type interface definition follows exactly the same principle, apart from some minor deviations: :: becomes %::% in R and -> is replaced by :. R has some limitations regarding infix operators.

132 |

One key take-away is that this function will not run with input that is not exactly as specified. lambda.r thus introduces a static type system into R.

133 |

The pattern matching in the function definition is just as in Haskell, except of course for a number of syntactic details like the parentheses, commas, string-based values and lack of explicit wildcards. It’s another language after all!

134 |

With this function implemented, we only lack the last component: The function to apply the pair-wise comparisons with a fold on a list of FruitSets. And here things start to become a bit more tricky, unfortunately. Let’s start with the result:

135 |
fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet
136 | fsMergeList(xs, intersect) %as% 
137 |   Reduce(
138 |     function(a, b) { fsMerge(a, b, intersect) }, 
139 |     xs[tail(seq_along(xs), n = -1)], 
140 |     init = xs[[1]]
141 |   )
142 |

The general structure is again very Haskell-like. For the folding we use the Reduce function from the R base package (which is something like the Prelude in Haskell). One major difference between lambda.r and Haskell, though, is that lambda.r lacks a good default way to handle lists. Maybe I just missed the relevant documentation or overlooked something else, but I struggled a bit with that.

143 |

In the end I decided to come up with my own list type.

144 |
FruitSetList(...) %::% FruitSet... : FruitSetList
 145 | FruitSetList(...) %as% asFruitSetList(list(...))
asFruitSetList(xs) %::% list : FruitSetList
 146 | asFruitSetList(xs) %as% { 
 147 |  class(xs) <- c("FruitSetList")
148 |  xs
149 | }
150 |

This constructor makes use of the Ellipsis type “...”, a weird feature of R, well integrated into lambda.r: a single input argument that can represent a set of multiple arguments. In lambda.r it can be combined with a type constraint to make sure that the function takes an arbitrary number of arguments, but only of this type. So here of type FruitSet.

151 |

That allows for a pretty cool constructor syntax:

152 |
FruitSetList(FruitSet("LAS"), FruitSet("SAS"), FruitSet("OFS"))
153 |
[[1]]
[1] "LAS"
154 | attr(,"class")
155 | [1] "FruitSet"  "character"
156 | [[2]]
157 | [1] "SAS"
158 | attr(,"class")
159 | [1] "FruitSet"  "character"
160 | [[3]]
161 | [1] "OFS"
162 | attr(,"class")
163 | [1] "FruitSet"  "character"
attr(,"class")
164 | [1] "FruitSetList"
165 |

Unfortunately I found no direct way to catch the ellipsis and make it a FruitSetList. With list(...) I could indeed transform it to a list, but that’s only half the job. I resorted to the rather ugly asFruitSetList that “manually” adds the “FruitSetList” label to the class attribute of the output object. That works because lambda.r utilizes R S3 classes for its magic.

166 |

With that out of the way there was still one issue to address. I could not use Haskell’s pattern matching on lists to separate the head and tail elements for the Reduce input. It’s easy to get the first element of a list in R, but the tail requires some more advanced indexing:

167 |
xs[tail(seq_along(xs), n = -1)]
168 |

All issues should be solved now. It’s time for a final test run of our code:

169 |
fsMergeList(FruitSetList(FruitSet("LAS")), TRUE)
170 | # [1] "LAS"
171 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS")), TRUE)
172 | # [1] "LAS"
173 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), TRUE)
174 | # [1] "SAS"
175 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), FALSE)
176 | # [1] "LAS"
177 | fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("OFS")), FALSE)
178 | # [1] "OFS"
179 |

Excellent! The syntax is more verbose than the one in Haskell, but the results are the same.

180 |

Recap

181 |
    182 |
  • Haskell and R are both versatile languages with large communities that regularly suggest and discuss new abstractions. Haskell is a real innovation machine and carries many functional programming concepts into other languages.
  • 183 |
  • lambda.r is a syntax extension to make some of the power of Haskell (or similar functional programming languages) available in R.
  • 184 |
  • lambda.r works and is extremely fun to play with, but it’s pretty verbose and lacks (at least to my understanding) a good list implementation. I also suspect that it is not optimized for performance — probably quite the opposite.
  • 185 |
186 |

I personally would love to see some of the concepts demonstrated with lambda.r find their way into regular, base R. Especially a way to switch on static typing! That could avoid a lot of unexpected behavior. R interfaces often feel flimsy and not as rock solid as comparable code in Haskell. The approach lambda.r took here – e.g. with the Don’t-Care Type ., which I did not introduce – could be a way to combine dynamic and static typing. Ideally we want more sturdy interfaces without sacrificing R’s great flexibility for rapid prototyping.

187 |

Acknowledgements: I got some valuable feedback from my colleague James Fellows Yates (@jfy133) for this post.

188 |
189 |

Haskell:

190 |
data FruitSet =
191 |         LAS
192 |     |   SAS
193 |     |   OFS
194 |     deriving (Eq, Show)
195 | 
196 | fSMergeList :: [FruitSet] -> Bool -> FruitSet
197 | fSMergeList (x:xs) intersect = foldr (\a b -> fSMerge a b intersect) x xs
198 | 
199 | fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet
200 | fSMerge LAS LAS _     = LAS
201 | fSMerge SAS SAS _     = SAS
202 | fSMerge OFS _   _     = OFS
203 | fSMerge _   OFS _     = OFS
204 | fSMerge LAS SAS True  = SAS
205 | fSMerge SAS LAS True  = SAS
206 | fSMerge LAS SAS False = LAS
207 | fSMerge SAS LAS False = LAS
208 |

R:

209 |
library(lambda.r)
210 | 
211 | FruitSet("LAS") %as% "LAS"
212 | FruitSet("SAS") %as% "SAS"
213 | FruitSet("OFS") %as% "OFS"
214 | 
215 | FruitSetList(...) %::% FruitSet... : FruitSetList
216 | FruitSetList(...) %as% asFruitSetList(list(...))
217 | 
218 | asFruitSetList(xs) %::% list : FruitSetList
219 | asFruitSetList(xs) %as% { 
220 |   class(xs) <- c("FruitSetList")
221 |   xs
222 | }
223 | 
224 | fsMerge(a, b, intersect) %::% FruitSet : FruitSet : logical : FruitSet
225 | fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS")
226 | fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS")
227 | fsMerge("OFS", b,     intersect) %as% FruitSet("OFS")
228 | fsMerge(a,     "OFS", intersect) %as% FruitSet("OFS")
229 | fsMerge("LAS", "SAS", TRUE     ) %as% FruitSet("SAS")
230 | fsMerge("SAS", "LAS", TRUE     ) %as% FruitSet("SAS")
231 | fsMerge("LAS", "SAS", FALSE    ) %as% FruitSet("LAS")
232 | fsMerge("SAS", "LAS", FALSE    ) %as% FruitSet("LAS")
233 | 
234 | fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet
235 | fsMergeList(xs, intersect) %as% 
236 |   Reduce(
237 |     function(a, b) { fsMerge(a, b, intersect) }, 
238 |     xs[tail(seq_along(xs), n = -1)], 
239 |     init = xs[[1]]
240 |   )
241 |
242 |
243 | 244 |
245 | 246 | 250 | 251 | 252 | --------------------------------------------------------------------------------