├── .gitignore
├── stack.yaml
├── favicon.ico
├── _site
│   ├── favicon.ico
│   ├── images
│   │   ├── ORCID-iD_icon_BW_16x16.png
│   │   ├── 2020-04-02-covid-19
│   │   │   ├── 0*0nXhv3wBHICs8oU9.webp
│   │   │   ├── 0*6GxdzZ8ff8750eUI.webp
│   │   │   ├── 1*ARAUUR6FfmsgiJu1ocjQ-A.webp
│   │   │   ├── 1*KM-S2Z7BJotlspqUr8Te5g.webp
│   │   │   ├── 1*STZnkSEKJRVMBzelagdi-A.webp
│   │   │   ├── 1*WAinSw5vnzOzm5aAjgIXCg.webp
│   │   │   ├── 1*bvu6XdbTRlk975p7bpVl2Q.webp
│   │   │   └── 1*pXhoiK8_kaJ38oawNTGwag.webp
│   │   └── 2021-12-05-shake-II
│   │       ├── 1_9JOrZ76udsvr1kKBippbYg.webp
│   │       └── 1_cFrMhLDcSVAt6zmR1BsCjg.webp
│   ├── css
│   │   ├── default.css
│   │   └── syntax.css
│   ├── index.html
│   └── posts
│       ├── 2021-12-05-shake-I.html
│       ├── 2025-02-21-poseidon-git-pr-editing.html
│       ├── 2023-12-31-poseidon-end-of-year-2023.html
│       ├── 2020-04-02-covid-19.html
│       └── 2021-05-06-lambdar.html
├── index.html
├── images
│   ├── ORCID-iD_icon_BW_16x16.png
│   ├── 2020-04-02-covid-19
│   │   ├── 0*0nXhv3wBHICs8oU9.webp
│   │   ├── 0*6GxdzZ8ff8750eUI.webp
│   │   ├── 1*ARAUUR6FfmsgiJu1ocjQ-A.webp
│   │   ├── 1*KM-S2Z7BJotlspqUr8Te5g.webp
│   │   ├── 1*STZnkSEKJRVMBzelagdi-A.webp
│   │   ├── 1*WAinSw5vnzOzm5aAjgIXCg.webp
│   │   ├── 1*bvu6XdbTRlk975p7bpVl2Q.webp
│   │   └── 1*pXhoiK8_kaJ38oawNTGwag.webp
│   └── 2021-12-05-shake-II
│       ├── 1_9JOrZ76udsvr1kKBippbYg.webp
│       └── 1_cFrMhLDcSVAt6zmR1BsCjg.webp
├── templates
│   ├── post-list.html
│   ├── post.html
│   └── default.html
├── blog.cabal
├── stack.yaml.lock
├── site.hs
├── css
│   └── default.css
└── posts
    ├── 2025-02-21-poseidon-git-pr-editing.md
    ├── 2021-12-05-shake-I.md
    ├── 2017-12-28-custom-bars-rcppprogress.markdown
    ├── 2023-12-31-poseidon-end-of-year-2023.md
    ├── 2020-04-02-covid-19.md
    ├── 2021-05-06-lambdar.md
    └── 2021-12-05-shake-II.md

/.gitignore:
_cache/
.stack-work/

/stack.yaml:
resolver: lts-22.43

packages:
- .
/favicon.ico: https://raw.githubusercontent.com/nevrome/nevrome.de/master/favicon.ico
/_site/favicon.ico: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/favicon.ico

/index.html:
---
title: Posts
---

$partial("templates/post-list.html")$

/images/ORCID-iD_icon_BW_16x16.png: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/ORCID-iD_icon_BW_16x16.png
/_site/images/ORCID-iD_icon_BW_16x16.png: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/ORCID-iD_icon_BW_16x16.png
/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp
/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp
/_site/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp
/_site/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp
/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp
/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp
/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp
/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp
/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp
/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp
/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp
/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp

/templates/post-list.html:
/posts/2021-12-05-shake-I.md:

This is part I of a two-part blog post. See part II for a little showcase of Shake.
Workflow management, i.e. software to organize and run data analysis scripts, is one of those fortunate domains where dozens of open source solutions compete for our attention. There's probably something for every taste (see e.g. the extensive list here), and many of these projects are actively maintained or at least comparatively easy to resurrect. This post is an attempt to describe my personal journey towards a tool that fits me, in the hope of motivating you to go searching as well.

My PhD research is located somewhere between Bioinformatics and Archaeoinformatics (yep — that's a thing) and I work with large and high-dimensional datasets. Not really big data, but big enough to require a high performance computing environment to run analyses in reasonable time. Space, time and (ancient) DNA meet in my data, so my code necessarily relies on a variety of software libraries from different domains. In the last two years I piled scripts on top of scripts and thus created a complex network of interlinked code for data preparation, analysis and visualization.

This is my personal user story. It eventually brought me to a point where I realized that I had to introduce a more sophisticated system for dependency management and workflow automation. The former is especially important for reproducibility, the latter to propagate changes, so as to always maintain an up-to-date version of derived data products and plots. I needed a system that defines, runs and monitors a pipeline of code across different interacting scripts.

As I share these challenges with a large number of people who work professionally with computers, there are many excellent solutions for them out there. I just had to pick what fits me, my tasks and my interests. So I decided to follow my gut feeling and ended up with the containerization solutions Docker and Singularity to encapsulate my development environment (which will only be mentioned in passing here), and the build system Shake to orchestrate my analysis pipeline.
The first options I considered for pipeline management were Nextflow and Snakemake. Both are very popular among my colleagues in bioinformatics. At our department there seems to be an even divide between strong fans of the former and the latter. I personally did not want to deal with either Groovy or Python, though, which Nextflow and Snakemake respectively use as their underlying configuration language. Ideally I wanted to write the pipeline definition in a language and framework I'm already familiar with. That's not (only) laziness. By working in either R or Haskell, with which I feel most comfortable, I could more easily leverage the power of these languages.

So then I gave some scrutiny to targets, an implementation of a pipelining tool in R. This might have worked for me, but it gave me the impression of being too focused on workflows within R. R is certainly an important component of my personal tech stack right now, but I wanted to be prepared for whatever the future might bring. I also — and that's very shallow — didn't like targets' syntax from what I saw in the example code, where every computation in a pipeline got crammed into a single list object.

At this point I realized I would really like to solve this in Haskell, as the language had become something of a personal passion anyway. A functional, strongly typed language should also — at least in theory — be a good fit to formalize build rules. I did some research and came across three Haskell tools that seem to offer workflow management: Funflow, Porcupine and Bioshake. Instead of diving into them one after the other, I took a step back and asked the excellent Haskell community on reddit for advice: Experiences with workflow managers implemented in Haskell (funflow, porcupine, bioshake, ?)

Fortunately Justin Bedő, the author of Bioshake, saw the post and gave me some insights about his implementation. At the time he had already moved one step further and had discontinued the development of Bioshake in favour of his new solution BioNix, which solves both (!) dependency and workflow management with the fascinating Nix infrastructure. As Nix is a big world of its own, I couldn't follow him there. So I instead gave the Bioshake documentation a good read. And there I realized that Bioshake relies heavily on Shake internally: understanding Shake seemed to be a prerequisite for figuring out Bioshake. And Shake alone already turned out to be powerful and flexible enough for my current needs!
I had reached the end of my software exploration journey.

Your journey towards a workflow management solution would certainly be different, and you would most likely reach different conclusions. But I encourage you to explore this realm if you think you share a user story similar to mine. You can keep reading here if you want to see how I configured Shake to help me with my challenges.
/posts/2025-02-21-poseidon-git-pr-editing.md:

At the time of writing, the Poseidon community archive has 14 open pull requests – most of which were opened by various community members to add new packages to the archive. What is certainly a pleasant development, because it indicates that the archive is being adopted, also comes with technical and administrative challenges. As an editor for the archive I recently had to step up my Git skills to address a particular issue I was facing.
Already multiple times I have found myself in the situation of needing to edit a submission pull request before merging. This arose, for example, when a package author prepared a package almost perfectly, but I still wanted to apply some additional minor changes before merging. Or when an author or reviewer had struggled with Git, manoeuvred themselves into a predicament, and needed my help to untangle the knot without starting from scratch. So here is what I came up with to do that efficiently.
GitHub's documentation includes a helpful tutorial on how to commit changes to a pull request branch created from a fork. It already covers the basic workflow for editing a fork. The article highlights a number of conditions for this to be possible:
> You can only make commits on pull request branches that:
>
> - Are opened in a repository that you have push access to and that were created from a fork of that repository
> - Are on a user-owned fork
> - Have permission granted from the pull request creator
> - Don't have branch restrictions that will prevent you from committing
All of these are met in my case. But two additional challenges complicate the matter: i) the community-archive uses Git LFS for the large data files, and ii) I need to do this so frequently that cloning every fork feels unnecessarily cumbersome. The following workflow considers this special situation.
    GIT_LFS_SKIP_SMUDGE=1 git clone git@github.com:USERNAME/FORK-OF-THE-REPOSITORY.git

Note that this workflow assumes that you have installed and configured Git LFS on your system. Cloning the repo with the GIT_LFS_SKIP_SMUDGE environment variable set prevents downloading the LFS-tracked files despite Git LFS being enabled. This saves bandwidth and costs for us on GitHub.
    git switch PR-BRANCH

This is only necessary if the PR branch is not the main/master branch.
    git lfs pull --include "PATH-TO-FILE"

To validate a package completely it can be necessary to also access the genotype data. But because we cloned above with GIT_LFS_SKIP_SMUDGE=1, this data is not in our clone now. Fortunately we can selectively download it. PATH-TO-FILE can also include wildcards.
Remember to commit the changes.
This should work with git push. But yet again, Git LFS complicates things, raising the following error message:

    error: Authentication error: Authentication required: You must have push access to verify locks
    error: failed to push some refs to 'github.com:USERNAME/FORK-OF-THE-REPOSITORY.git'

This is caused by a limitation of GitHub's Git LFS implementation. A long thread here discusses the issue: Authentication required: You must have push access to verify locks error. Multiple solutions are suggested there. One reliable workaround is to delete the Git hook .git/hooks/pre-push.
    rm .git/hooks/pre-push
    git push

This resolved the issue for me – specifically because I never had to edit any of the genotype data files when working on a PR fork. I don't know how this hack affects the handling of LFS-tracked files.
If the changes in a fork A are already merged into the master branch of the main archive repository, then a little trick allows us to switch to another fork B in the same clone.
    git remote -v
    git remote set-url origin git@github.com:poseidon-framework/community-archive.git
    git switch master
    git pull
    git remote set-url origin git@github.com:USERNAME/FORK-OF-THE-NEXT-REPOSITORY.git
    git pull

We set the remote URL to the main repository, switch to the master branch, and pull. The commits from A are already there, so we have a clean state again. From here we can set a new remote URL for a fork B and pull. This effectively saves us from creating a fresh clone (as described in the cloning step above).
/posts/2023-12-31-poseidon-end-of-year-2023.md:

It's late December and the time of the year when work slows down in my part of the world. For many of us this is an opportunity to take a break and to look back, contemplating the achievements of the year. I decided to do so as well and write a bit about Poseidon.
46 |What follows is a subjective account of the events in and around the framework in 2023 - each of my colleagues in the core team (Stephan Schiffels, Ayshin Ghalichi, Thiseas C. Lamnidis, Dhananjaya B. A. Mudiyanselage, Wolfgang Haak and I, Clemens Schmid) would probably emphasise different developments in such a write-up. That is in itself an achievement, because it shows how much the tech-stack, domains and services in our little ecosystem have grown this year: beyond the understanding of each of us individually.
47 |Let’s start simple with the two new releases of the Poseidon schema we published this year: v2.7.0 and v2.7.1. They were published in short succession in March and May, the latter only slightly improving the sequencing source files (.ssf) added in the first. See the changelog here for more details, but the addition of the .ssf file is indeed their most remarkable contribution to the schema. With it we addressed a major desideratum and unresolved question in previous versions of Poseidon: How should genotype data be linked to the raw sequencing data on the European Nucleotide Archive (ENA) and other archives of the International Nucleotide Sequence Database Collaboration (INSDC)?
49 |The .ssf file is, I would argue, a smart solution for this question. It specifies the same variables already used in the ENA database, allows for an extremely flexible, yet not arbitrary n:m connection between the entities in a Poseidon package and the raw data products and it can be generated semi-automatically for most of the data in our public archives. With some tweaking it can also be used to organize local data repositories independent of any online databases. The .ssf file is finally the very foundation on top of which the amazing Minotaur workflow is built (see below).
50 |Generally, both the fact that only two Poseidon releases were necessary this year and that we could treat them as non-breaking changes indicate that we reached a certain level of maturity and stability in the schema. Of course we still have ideas how to extend it further in the future, but at the moment I’m optimistic that we can maintain long-term backwards compatibility. The process in which we discussed, specified and later improved the .ssf file definition to then see Minotaur be erected on top of it was a very satisfying professional experience for me personally.
51 |The Minotaur workflow is a semi-automatic workflow to reproducibly process published sequencing data into Poseidon packages. Developing this entirely new branch of the Poseidon ecosystem became possible because Thiseas joined the Poseidon core team in 2023. He came up with a sophisticated, yet open and transparent implementation of this process, in which authors and the community as a whole retain control over the data and the data processing parameters. A full write-up for the website is in progress. Here is the summary Thiseas prepared for our poster at the ISBA conference:
53 |Community members can request new packages to be processed through the Minotaur workflow by submitting a build recipe as a pull request against a dedicated GitHub repository. This recipe is created from a sequencing source file (.ssf), describing the sequencing data for the package and where it can be downloaded. Using the recipe, the sequencing data gets processed via nf-core/eager on computational infrastructure of MPI-EVA, using a standardised, yet flexible, set of parameters. The generated genotypes, together with descriptive statistics of the sequencing data (Endogenous, Damage, Nr_SNPs, Contamination), are compiled into a Poseidon package, and made available to users in the minotaur-archive.
The Minotaur workflow is a timely addition to the Poseidon framework, providing a flexible solution to wrap legacy and new data in uniformly processed packages. Homogeneous data processing puts us closer to our great comparandum, the AADR dataset. It also helped us to finalize the structure of our public archives, which emerged from long discussions about the kind of data we think the aDNA community requires for derived analyses.
Right now the Minotaur workflow is still in a final development and testing phase, where we focus on the processes around it: the submission of recipes, their review and the forwarding of results to the minotaur-archive. One particularly tricky question is how context information in the .janno file should be passed from the community-archive to the new packages in the minotaur-archive. One of the last pull requests for our software tool trident in 2023 aims to introduce a reliable mechanism to merge .janno files to address this issue.
56 |In 2023 we finally came to a conclusion on how to organize our public data archives. What emerged is a threefold division into what we call the community-archive, the minotaur-archive and the aadr-archive. The archives are described in more detail on the website, but here’s the gist of it:
58 |The community-archive emerged from our old public-archive. It includes the legacy data we originally copied from the AADR. We now decided to use this archive for author-submitted publication-wise packages to collect the exact genotype data analysed in the respective papers. The idea is twofold: With the author-submitted genotype data the results in a given paper can be reproduced exactly. And the publication authors are generally the most trustworthy authority for the context data we collect in the .janno files, e.g. the spatiotemporal origin of the individual samples. Ayshin and I recently wrote about the submission process for the community-archive here.
59 |The minotaur-archive mirrors the community-archive in that it features publication-wise packages, usually even the very same as in the community-archive. To distinguish them clearly, package titles and sample-wise Poseidon_IDs in the minotaur-archive carry the suffix _MNT. As explained above the packages in this archive include consistently reprocessed genotype data, run through the Minotaur workflow.
The aadr-archive is the conceptually simplest archive. It features "poseidonized" versions of releases of the AADR dataset, currently only the latest AADR v54.1.p1. We documented the code and decisions for the cleaning and packaging process here.
2023 not only saw the planning and setup of these three archives, but also a lot of work to fill them with life. For the community-archive that meant plenty of data cleaning by all of us, most notably Dhananjaya. And it also meant providing guidance for authors to submit their data. Thanks to the hard work of Ayshin a total of eleven author-submitted packages are available in the archive now. Number twelve was submitted shortly before Christmas and is awaiting review. The minotaur-archive is still functionally empty, but three packages are pending thanks to Thiseas and will hopefully soon be merged. Preparing the latest version of the AADR dataset for the aadr-archive was one of the projects I tackled this year.
62 |The Poseidon software tools grew significantly more powerful this year. From a user-perspective 2023 brought various new features, changes to the command line interfaces and breaking updates in the Web-API. To keep track of the releases and the Poseidon schema versions they support I created a version overview table on the website.
With qjanno I added an entirely new tool to the set. It is a command line tool to run SQL queries on .janno (and arbitrary .csv and .tsv) files. I created it by forking the qsh package and then adjusting it heavily for use on Poseidon packages. Just like trident it is written in Haskell and openly available with precompiled executables here.
65 |Stephan invested a good amount of effort into consolidating the data analysis features in xerxes. He wrote a whitepaper to explain and justify the reasoning behind the implemented logic for f-statistics, and another blog post on how to run it. Even more approachable and comprehensive is a write-up he shared here. Together we worked on integrating the many changes to trident and its underlying poseidon-hs Haskell library into xerxes.
66 |Our main workhorse, trident, saw an astonishing number of new releases: v1.1.6.0 on January 8 to v1.4.0.3 on October 30. I quickly went through the extended changelogs published with each release to summarize the user-facing highlights of what trident supports now:
- the trident forge selection language (v1.1.7.0)
- --fullGeno in trident validate (v1.1.10.2)
- trident update, now called trident rectify, and trident validate, which now allows validating not just entire packages, but also individual files (v1.3.0.4)

As always I enjoyed the work on the software tools tremendously, especially in two cases: if one of our users reports an issue and we can address a concrete need with a release, and if the Haskell programming language allows for a particularly elegant solution for a given problem. A currently pending pull request combines both: Ayshin made me aware of some validation failure cases that require better error messages and I found a neat way to provide just that with a custom-tailored monadic stack.
77 |The last domain where we made good progress in 2023 is public outreach. Naturally we invested hours in writing and updating documentation on the project website (https://www.poseidon-adna.org), but we also pursued a number of special projects beyond the basic, technical description of software and workflows.
79 |The first one of these was possible thanks to the effort of Dhananjaya, Stephan and me: We built a page on the website where the data in the public archives can be easily explored. It makes use of our Web-API to access the data and display it with a sub-page for each package. Dhananjaya wrote a blog post about this, recently.
80 |I already mentioned this blog multiple times above. It is indeed another great addition of 2023. Stephan created a separate website at https://blog.poseidon-adna.org to share news and short tutorials. Our wish has always been to gather an active and engaged community of users around Poseidon, and we hope to establish this blog as one of its central communication hubs. A major medium for longer write-ups beyond the technical documentation already available on the website.
To announce our blog posts, software releases and other news we fully switched from Twitter (now X) to the Fediverse in 2023. You can follow us here: https://ecoevo.social/@poseidon. The switch came naturally, given the state of affairs at X. Submitting posts automatically is easier with Mastodon than with Twitter, and I made sure that this process works reliably for our software releases on GitHub.
82 |Beyond these technical novelties and online communication we also presented Poseidon at two in-person conferences in 2023: ISBA10 in Tartu, Estonia and the NFDI4Objects community meeting in Berlin, Germany. The poster we presented at both of these occasions was already mentioned above and is available here. And the slides for the talk Thiseas prepared for the latter should soon be made available by the NFDI4Objects team.
83 |Much has happened for Poseidon in 2023 and I’m sure I’m not doing all of it due justice in this little summary. But I consider what is here already an impressive list that stands witness for the effort we put into the framework. And it seems to pay off: The user base is growing. More users help us in turn to find and address remaining issues and make Poseidon better for all of us. This will once more be one of my main aspirations in the coming year 2024.
/posts/2020-04-02-covid-19.md:

Acknowledgements: We got some valuable input and corrections from Martin Lange and Johannes Boog (both Helmholtz Centre for Environmental Research Leipzig).
46 |Disclaimer: We have no epidemiological training and share these results without warranty of any kind. They should not be used as a basis for decision making and we refer to the respected authorities (e.g. for Germany the Robert Koch Institute) for reliable information and models. This post is only an interesting exercise in data analysis.
47 |Note: Analyses in this post are from April 2nd, 2020, and naturally include only data from before that date.
48 |The COVID-19 pandemic has taken its toll all around the world and caused (so far) hundreds of deaths in Germany. In this post we present current data and model estimations for multiple relevant parameters (e.g. current number of real infections and number of future deaths) for Germany.
In the context of the #WirvsVirus hackathon we started to work on the R package covid19germany, which allows downloading and visualizing the current numbers of confirmed cases and deaths by administrative unit. We use this package to access the data for this post. The code for this post can be found here. Furthermore the package comes with a webapp that allows exploring some of the following data and analyses in more detail — not just for the whole of Germany, but also for smaller administrative units as well as gender and age classes.
The number of confirmed COVID-19 cases in Germany is rising daily, but it is unclear to which degree new infections are taking place or testing is simply catching up with past infection events. Germany may be one of the countries where testing covers a higher proportion of infected cases, as the testing abilities are comparatively good. As testing will always lag behind the actual number of infections, it is still an unreliable estimator of the true dimensions of this pandemic. The number of deaths caused by COVID-19 is a more trustworthy indicator — though with a significant temporal delay. More about this later.
[Figure: confirmed COVID-19 cases and deaths in Germany through time]

The increase of infections and deaths follows an expected acceleration trend due to exponential disease expansion with a growing number of spreaders. Dips on the weekends, especially in the number of positive tests, might be an effect of reduced working hours and reduced information transmission in and by health care authorities. At first glance, it is not entirely clear from this data if the social distancing rules imposed by the federal and local governments during the last two weeks have had a significant effect on the spreading of COVID-19, but the recent decline in the number of daily deaths raises hope.
[Figure: maps of confirmed cases and deaths by county (Landkreis) and federated state (Bundesland)]

Western and Southern Germany have so far been more affected than Eastern Germany, with some individual counties (Landkreise) at the border to France, Czechia and Austria especially compromised. North Rhine-Westphalia, Bavaria and Baden-Württemberg — and therefore the federated states (Bundesländer) with the most inhabitants — have the most test-confirmed cases as well as deaths. A dashboard provided by the RKI, the GeoHealth Center at Bonn University and ESRI gives a good overview of the official numbers, which are published on a daily basis. The RKI also releases a daily report with relevant information.
70 |It generally is a difficult task to estimate the true number of infected people during an epidemic outbreak. However, we learned about two methods to do so in this excellent post by Tomas Pueyo.
One way is to focus on the current number of deaths. If we know the mean time it takes an individual from infection to death (in case of death!) and the lethality (the general probability to die from COVID-19), then we can calculate an estimate of the number of infected people in the past. We have some information about these two parameters from early scientific studies about COVID-19. We will use a fixed value of 17 days for the time to death and two different values for the lethality: 1% and 5%.
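A minimal R sketch of this back-calculation (the numbers below are made up for illustration and are not the actual counts):

```r
time_to_death <- 17            # assumed mean number of days from infection to death
lethality <- c(0.01, 0.05)     # the two lethality scenarios: 1% and 5%

# deaths observed on a given day imply infections time_to_death days earlier:
deaths_observed <- 1100        # hypothetical cumulative death count
estimated_infected_back_then <- deaths_observed / lethality
estimated_infected_back_then   # 110000 (1% scenario) and 22000 (5% scenario)
```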
In the figure below, the estimate of the true number of infections for Germany is plotted with one line for each of the two lethality scenarios. It can only be calculated for the past before the mean death time, which is indicated in the plot by a black, vertical line.
[Figure: estimated true number of infections under the two lethality scenarios, compared to the confirmed cases]

The lower the lethality of COVID-19, the higher the number of actually infected people in the past must have been, given the number of deaths that occurred later. We highlight that this estimated statistic is at least one order of magnitude higher than the measured observation of confirmed cases shown with the red line in the plot. Very interesting is the sudden uptick of the latter at the end of February, which is well reflected in the estimated statistic. Keep in mind: The estimation is based on deaths, not on test results! This correlation is therefore a good indicator that the estimate reflects some truth and that the number assumed for the mean time from infection to death (17 days) is not totally off.
Nevertheless this estimator by definition only provides information about the distant past (before the black, vertical line). To extrapolate this statistic until yesterday (after the black and before the blue, vertical line) we need another set of assumptions. In the simplest possible growth model the disease tends to spread in an exponential fashion with a certain time window until the number of infected doubles: the doubling time. We can take the last value I₀ of our first statistic and extend it with a time series of exponential growth following

Iₜ = I₀ x 2^(t/d)

where Iₜ is the true number of infected individuals after the time t. t is counted in days from yesterday minus the mean number of days from infection to death. d is the aforementioned doubling time in days.
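A minimal R sketch of this extrapolation; the starting value I0 below is a made-up placeholder, only the doubling time scenarios match the ones discussed here:

```r
I0 <- 100000                   # hypothetical estimated true infections at the last estimable day
doubling_times <- c(3, 7, 12)  # the three doubling time scenarios in days
t <- 1:17                      # days to extrapolate, up to yesterday

# I_t = I_0 * 2^(t/d), one column per doubling time scenario
extrapolated <- sapply(doubling_times, function(d) I0 * 2^(t / d))
colnames(extrapolated) <- paste0("doubling time: ", doubling_times, " days")
round(extrapolated)
```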
82 |The plot above shows three doubling time scenarios (3, 7 or 12 days) for each death probability scenario between the black and the blue vertical line (six scenarios in total). Some of them can already be ruled out considering the real-life testing data: They fall below the red curve. Others remain well possible. An increase of the doubling time is in all cases the desirable scenario and the following weeks will reveal (with their death count) if the social distancing measures prove to be effective to achieve this. Nevertheless it is very likely that far more people are infected right now than testing is able to confirm.
In a last step we can use the estimated infection counts to extrapolate the number of expected deaths in the near future (yesterday plus the mean number of days from infection to death) for the different doubling time scenarios. The lethality is not relevant for this particular approximation, because it already influenced the preceding calculation and therefore cancels out of the equation.
[Figure: extrapolated number of expected deaths for the different doubling time scenarios]

If the number of cases that require intensive care rises above a certain threshold, the capacities of hospitals would inevitably run out and the lethality would further increase beyond these projections. This dire possibility became a grim reality in Northern Italy.
89 |To complement the analyses above and to make a more educated guess about the parameters visualized so far, we set up a Bayesian model to estimate the true number of infected people through time from both the reported deaths and the reported cases. This model was based on a slightly more complex notion of exponential growth with a built-in slow-down and includes the following assumptions:
91 |Given these assumptions, we can estimate the true number of infections, as well as the reported number of test cases and deaths. A complete definition and analysis of this model can be found here.
[Figure: Bayesian model predictions (colored ribbons) together with the reported cases and deaths (points)]

The model predictions (the colored "ribbons") are shown together with the true reported cases (points). Because this is Bayesian inference, all model predictions are given with quantified uncertainty. Note that we have incorporated only data points between February 23 and April 1 in this analysis. Before that time, Germany did not experience exponential growth yet.
As already shown above, the true number of infections (dark green) based on a death rate of 1% far exceeds the number of confirmed cases. We highlight that this is due to two effects: First, the reported cases and deaths lag behind the true infections, so under exponential growth we expect the true infections of today to be much higher than the reported ones, which reflect the infections of about seven days ago. Second, it is clearly expected that not all people with an infection get tested, for example because they don't show symptoms.
104 |One of the nice features of our model is that we get an explicit estimate of this miss-rate, but it depends linearly on the death-rate. In this case, we have assumed a death rate of 1%, and this yields — shockingly — a probability of getting tested between 12% and 24% only. That would mean that 76–88% of true infected cases are not tested. With a death rate of 3%, for example, the miss-rate would “only” be about 40–60%. So this is hard to estimate, but it’s clear we’re missing a lot!
A significant complication in this regard is introduced by the age structure of the population, because we know that elderly people die with much higher probability from COVID-19 than young people. An important next step for this kind of modelling would be to incorporate more realistic death rates, possibly age-stratified.
106 |The specific growth model with linear slow-down seems to work OK for the data we have, although not perfectly. In particular, the slow down in recent days seems to be stronger than modeled. This is somewhat expected, since the measures against spread of the virus haven’t been “linear” in any way. Nevertheless, a linear slow-down is the first approximation to this process. Based on this, we can again — and this time in a more sophisticated way — try to predict how many cases we will have in the coming weeks. This is of course highly speculative and depends on assumptions in the model. In fact, the uncertainty increases the further you predict into the future, which is visible by the widening of the model bands in the figure. For example, the number of reported cases on April 15 is predicted to be anywhere between 60,000 and 150,000 (though not with uniform probability) according to this model and its uncertainty today. The reported number of deaths by that time are predicted to be between 2700 and 6000 in Germany. These wide intervals simply reflect the limited power of the data to accurately estimate the parameters of the growth model.
A popular choice to illustrate the speed of an exponential growth model is the doubling time in days, which we already employed as a static parameter in the simple model above. Our Bayesian inference now allows us to estimate this parameter as a dynamic property of the underlying growth model. Here it is over the course of the last few weeks with a short outlook into the next week:
[Figure: estimated doubling time through time, with a prediction for the coming week]

So there definitely is some indication of a slow-down, with a doubling time just around 2.5 days around the end of February and now a rate around 5 days (the black line indicates the time of this writing), and a future prediction between 7 and 16 days in a week from now. This is interesting in light of comments from officials that a doubling time of 10 days or more should be reached in order to not overwhelm the healthcare system.
113 |We highlight three main conclusions from our modelling:
We hope that our work may trigger some feedback and motivation for others. It is very easy to get started on working with the data, for example by using our ready-to-use R package. A lot more analyses are possible when taking into account other data, some of which is provided in this package, including county-based information about population numbers, the number of hospital beds, and age structure.
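Getting started could look roughly like the sketch below. The repository path and the function name are quoted from memory, so treat them as assumptions and check the package README:

```r
# install from GitHub (assumed repository path)
# remotes::install_github("nevrome/covid19germany")
library(covid19germany)

# download the daily RKI reporting data (assumed function name - see the README)
rki <- get_RKI_timeseries()
head(rki)
```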
/posts/2021-05-06-lambdar.md:

TL;DR: Feel free to directly jump to The lambda.r implementation if you only want to see that. The full code is posted at the end of the article.
46 |Haskell and R are quite different programming languages. One is purely functional, statically typed and prominently features some of the most obscure abstractions in Computer Science. The other one lives at a particularly weird spot at the crossroad of the object-oriented, imperative and functional paradigms, has a ductile and dynamic type system and is optimized for the pragmatic needs of data analysis.
47 |But still these two languages share some interesting features. For example both can be run interactively in an interpreter environment. And both consider functions first-class citizens – thus offering higher-order functions – and allow the definition of custom infix operators. And that’s why something like lambda.r is possible in the first place.
48 |lambda.r (here v.1.2.4) is an R package that provides syntax extensions to write functional, Haskell-like code in R. It implements an astonishing number of features including type and function definition, pattern matching, guard statements and even monads! True functional programming available at your fingertips in R. All while maintaining a surprisingly Haskell-like syntax and incorporating powerful bonus features from R. Even a custom debugging system is part of the package.
The author Brian Lee Yung Rowe did an incredible job and also maintained the package over a commendable time span – the first commit on GitHub is from 2012 and the last change was pushed in 2019.
50 |Of course the package has some known limitations and rough edges. In my opinion it’s an extremely clever proof of concept and I enjoyed very much playing with it, but I’m not sure if I would recommend it for use in production. I’ll leave that to you and instead show you what I managed to build with it.
51 |Recently I wanted to implement a simple but specific logic in a bioinformatics context — so this is a real world example. But it would be tedious to explain the background, so I’ll instead replace the entities with something more digestible: Apples.
53 |Let’s say we have two sets of apple varieties and then a number of other fruit variety sets (varieties of pears, plums, strawberries, …). The first apple collection is large and covers all sorts of types: Ambrosia, Granny Smith, Red Delicious, Jonagold, Rome, Honeycrisp and many more. The second apple collection is much smaller, but a strict subset of the first one. It only includes the three varieties Granny Smith, Red Delicious and Honeycrisp. We don’t really care about the other fruits.
How could we model these sets in Haskell? We don't need to consider the individual varieties here, only the variety collections. So we could create the type FruitSet with three data constructors for the three different relevant sets. For the sake of simplicity let's shorten their names to LAS (the large apple set), SAS (the small apple set) and OFS (the other fruit sets); the corresponding data declaration is part of the full code listing at the end of the post.
Now about the issue we have to solve for these sets: We need a function that merges a list of fruit sets according to a very specific logic into only one output fruit set. This has to adhere to the following pair-wise (and undirected) merging rules:

- LAS merged with LAS yields LAS
- SAS merged with SAS yields SAS
- Any set merged with OFS yields OFS
- LAS merged with SAS yields SAS (in an intersect merge)
- LAS merged with SAS yields LAS (in a union merge)

For the final two rules we have to distinguish two different kinds of merges: a union merge and an intersect merge.
72 |I think these rules are an excellent application for pattern matching in Haskell. We could implement them in a function like this:
    fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet
    fSMerge LAS LAS _ = LAS
    fSMerge SAS SAS _ = SAS
    fSMerge OFS _ _ = OFS
    fSMerge _ OFS _ = OFS
    fSMerge LAS SAS True = SAS
    fSMerge SAS LAS True = SAS
    fSMerge LAS SAS False = LAS
    fSMerge SAS LAS False = LAS

Even if you're not familiar with Haskell you may appreciate how the different pair-wise comparison cases are expressed here. The function takes two FruitSets and a logical to distinguish union (False) and intersect (True) merges. For many of these rules it does not even matter which kind of merge is applied. Here we can replace the pattern with the wildcard symbol "_".
Now that we have these rules, we can also implement the function that applies them to an arbitrary list of FruitSets to determine the appropriate superset.
    fSMergeList :: [FruitSet] -> Bool -> FruitSet
    fSMergeList (x:xs) intersect =
      foldr (\a b -> fSMerge a b intersect) x xs

It uses a fold to combine the list elements into one. Folds are operations that look at two elements of a list, apply some binary function to them, take the result and apply the same function again to that and a new list element, until only one result remains and the list is exhausted. Folds usually need a starting value that also serves as an "accumulator" to track the list-condensing result along the fold's way through the list.
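For readers more at home in R: base R's Reduce plays the same role, and it is what the lambda.r version further down will rely on. A minimal sketch:

```r
# Reduce() folds a binary function over a vector or list,
# with an explicit starting value that acts as the accumulator.
# (Reduce folds from the left by default, Haskell's foldr from the right.)
Reduce(function(acc, x) acc + x, c(2, 3, 4), init = 1)
# [1] 10
```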
Here I used Haskell's clever pattern matching on lists (x:xs) to separate the input list's head and tail. That makes it straightforward to set the head element as the starting value for the fold. We will see below that lambda.r is less elegant here.
Finally we can test our code:
    fSMergeList [LAS] True
    -- LAS
    fSMergeList [LAS, LAS] True
    -- LAS
    fSMergeList [LAS, LAS, SAS] True
    -- SAS
    fSMergeList [LAS, LAS, SAS] False
    -- LAS
    fSMergeList [LAS, LAS, OFS] False
    -- OFS

Works like a charm! Let's compare that with lambda.r now.
lambda.r provides some functions, mostly clever infix operators, to enable a Haskell-like logic and syntax in R. To access them we have to install and load the package first.
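Assuming the package is (still) available from CRAN, that is the usual two-liner:

```r
install.packages("lambda.r")  # one-time installation
library(lambda.r)             # load the package
```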
Just as in the Haskell code above we have to find a way to represent fruit sets. With lambda.r, types are defined by their constructor functions. Each function has a name and input arguments, separated from a return value or operation by the %as% infix operator.
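For the three fruit sets the constructor definitions look like this (they reappear in the full code listing at the end of the post):

```r
FruitSet("LAS") %as% "LAS"
FruitSet("SAS") %as% "SAS"
FruitSet("OFS") %as% "OFS"
```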
A distinction between type and data constructors, as in Haskell, does not exist to my knowledge, and there are no nullary data constructors ("constants") either. So I decided to be creative and use pattern matching on strings to simulate a data type for the different fruit sets. lambda.r understands this syntax perfectly fine and prints the resulting type as follows:
    [[1]]
    FruitSet("LAS") %:=% ...
    [[2]]
    FruitSet("SAS") %:=% ...
    [[3]]
    FruitSet("OFS") %:=% ...

With that data type we can define the pair-wise merging logic as laid out above.
    fsMerge(a, b, intersect) %::% FruitSet : FruitSet : logical : FruitSet
    fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS")
    fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS")
    fsMerge("OFS", b, intersect) %as% FruitSet("OFS")
    fsMerge(a, "OFS", intersect) %as% FruitSet("OFS")
    fsMerge("LAS", "SAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("SAS", "LAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("LAS", "SAS", FALSE ) %as% FruitSet("LAS")
    fsMerge("SAS", "LAS", FALSE ) %as% FruitSet("LAS")

Note how extremely similar this syntax is to Haskell. The type interface definition follows exactly the same principle, short of some minor deviations when :: became %::% in R and -> is replaced by :. R has some limitations regarding infix operators.
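The relevant limitation is that user-defined infix operators in R have to be wrapped in percent signs, which is presumably why lambda.r ended up with %::% and %as%. A quick illustration:

```r
# user-defined infix operators must take the form %...%
`%then%` <- function(a, b) paste(a, b)
"first" %then% "second"
# [1] "first second"
```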
One key take-away is that this function will not run with input that is not exactly as specified. lambda.r thus introduces a static type system into R.
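A quick sketch of what that means in practice, assuming the definitions above are loaded (the exact error wording comes from lambda.r's dispatch mechanism):

```r
fsMerge(FruitSet("LAS"), FruitSet("SAS"), TRUE)   # matches a rule and returns FruitSet "SAS"
fsMerge(FruitSet("LAS"), FruitSet("SAS"), "yes")  # "yes" is not a logical, so dispatch fails with an error
```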
133 |The pattern matching in the function definition is just as in Haskell, except of course for a number of syntactic details like the parentheses, commas, string-based values and lack of explicit wildcards. It’s another language after all!
134 |With this function implemented, we only lack the last component: The function to apply the pair-wise comparisons with a fold on a list of FruitSets. And here things start to become a bit more tricky, unfortunately. Let’s start with the result:
    fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet
    fsMergeList(xs, intersect) %as%
      Reduce(
        function(a, b) { fsMerge(a, b, intersect) },
        xs[tail(seq_along(xs), n = -1)],
        init = xs[[1]]
      )

The general structure is again very Haskell-like. For the folding we use the Reduce function from the R base package (which is something like the Prelude in Haskell). One major difference between lambda.r and Haskell, though, is that lambda.r lacks a good default way to handle lists. Maybe I just missed the relevant documentation or overlooked something else, but I struggled a bit with that.
In the end I decided to come up with my own list type.
    FruitSetList(...) %::% FruitSet... : FruitSetList
    FruitSetList(...) %as% asFruitSetList(list(...))

    asFruitSetList(xs) %::% list : FruitSetList
    asFruitSetList(xs) %as% {
      class(xs) <- c("FruitSetList")
      xs
    }

This constructor makes use of the Ellipsis type "...", a weird feature of R that is well integrated into lambda.r: a single input argument that can represent a set of multiple arguments. In lambda.r it can be combined with a type constraint to make sure that the function takes an arbitrary number of arguments, but only of this type. So here of type FruitSet.
That allows for a pretty cool constructor syntax:
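A call along these lines (matching the printed result below) produces such a typed list:

```r
FruitSetList(FruitSet("LAS"), FruitSet("SAS"), FruitSet("OFS"))
```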
    [[1]]
    [1] "LAS"
    attr(,"class")
    [1] "FruitSet" "character"
    [[2]]
    [1] "SAS"
    attr(,"class")
    [1] "FruitSet" "character"
    [[3]]
    [1] "OFS"
    attr(,"class")
    [1] "FruitSet" "character"
    attr(,"class")
    [1] "FruitSetList"
Unfortunately I found no direct way to catch the ellipsis and make it a FruitSetList. With list(...) I could indeed transform it to a list, but that's only half the job. I resorted to the rather ugly asFruitSetList that "manually" adds the "FruitSetList" label to the class attribute of the output object. That works because lambda.r utilizes R S3 classes for its magic.
With that out of the way there was still one issue to address. I could not use Haskell’s pattern matching on lists to separate the head and tail elements for the Reduce input. It’s easy to get the first element of a list in R, but the tail requires some more advanced indexing:
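Namely the expression used in fsMergeList above; a minimal illustration:

```r
xs <- list("a", "b", "c")
xs[[1]]                          # the head: "a"
xs[tail(seq_along(xs), n = -1)]  # the tail: a list with "b" and "c"
```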
All issues should be solved now. It’s time for a final test run of our code:
    fsMergeList(FruitSetList(FruitSet("LAS")), TRUE)
    # [1] "LAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS")), TRUE)
    # [1] "LAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), TRUE)
    # [1] "SAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), FALSE)
    # [1] "LAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("OFS")), FALSE)
    # [1] "OFS"

Excellent! The syntax is more verbose than the one in Haskell, but the results are the same.
I personally would love to see some of the concepts demonstrated with lambda.r find their way into regular base R. Especially a way to switch on static typing! That could avoid a lot of unexpected behavior. R interfaces often feel flimsy and not as rock solid as comparable code in Haskell. The approach lambda.r took here – e.g. with the Don't-Care Type ., which I did not introduce – could be a way to combine dynamic and static typing. Ideally we want more sturdy interfaces without sacrificing R's great flexibility for rapid prototyping.
Acknowledgements: I got some valuable feedback from my colleague James Fellows Yates (@jfy133) for this post.
Haskell:

    data FruitSet =
      LAS
      | SAS
      | OFS
      deriving (Eq, Show)

    fSMergeList :: [FruitSet] -> Bool -> FruitSet
    fSMergeList (x:xs) intersect = foldr (\a b -> fSMerge a b intersect) x xs

    fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet
    fSMerge LAS LAS _ = LAS
    fSMerge SAS SAS _ = SAS
    fSMerge OFS _ _ = OFS
    fSMerge _ OFS _ = OFS
    fSMerge LAS SAS True = SAS
    fSMerge SAS LAS True = SAS
    fSMerge LAS SAS False = LAS
    fSMerge SAS LAS False = LAS

R:
    library(lambda.r)

    FruitSet("LAS") %as% "LAS"
    FruitSet("SAS") %as% "SAS"
    FruitSet("OFS") %as% "OFS"

    FruitSetList(...) %::% FruitSet... : FruitSetList
    FruitSetList(...) %as% asFruitSetList(list(...))

    asFruitSetList(xs) %::% list : FruitSetList
    asFruitSetList(xs) %as% {
      class(xs) <- c("FruitSetList")
      xs
    }

    fsMerge(a, b, intersect) %::% FruitSet : FruitSet : logical : FruitSet
    fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS")
    fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS")
    fsMerge("OFS", b, intersect) %as% FruitSet("OFS")
    fsMerge(a, "OFS", intersect) %as% FruitSet("OFS")
    fsMerge("LAS", "SAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("SAS", "LAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("LAS", "SAS", FALSE ) %as% FruitSet("LAS")
    fsMerge("SAS", "LAS", FALSE ) %as% FruitSet("LAS")

    fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet
    fsMergeList(xs, intersect) %as%
      Reduce(
        function(a, b) { fsMerge(a, b, intersect) },
        xs[tail(seq_along(xs), n = -1)],
        init = xs[[1]]
      )