├── .gitignore
├── stack.yaml
├── favicon.ico
├── _site
│   ├── favicon.ico
│   ├── images
│   │   ├── ORCID-iD_icon_BW_16x16.png
│   │   ├── 2020-04-02-covid-19
│   │   │   ├── 0*0nXhv3wBHICs8oU9.webp
│   │   │   ├── 0*6GxdzZ8ff8750eUI.webp
│   │   │   ├── 1*ARAUUR6FfmsgiJu1ocjQ-A.webp
│   │   │   ├── 1*KM-S2Z7BJotlspqUr8Te5g.webp
│   │   │   ├── 1*STZnkSEKJRVMBzelagdi-A.webp
│   │   │   ├── 1*WAinSw5vnzOzm5aAjgIXCg.webp
│   │   │   ├── 1*bvu6XdbTRlk975p7bpVl2Q.webp
│   │   │   └── 1*pXhoiK8_kaJ38oawNTGwag.webp
│   │   └── 2021-12-05-shake-II
│   │       ├── 1_9JOrZ76udsvr1kKBippbYg.webp
│   │       └── 1_cFrMhLDcSVAt6zmR1BsCjg.webp
│   ├── css
│   │   ├── default.css
│   │   └── syntax.css
│   ├── index.html
│   └── posts
│       ├── 2021-12-05-shake-I.html
│       ├── 2025-02-21-poseidon-git-pr-editing.html
│       ├── 2023-12-31-poseidon-end-of-year-2023.html
│       ├── 2020-04-02-covid-19.html
│       └── 2021-05-06-lambdar.html
├── index.html
├── images
│   ├── ORCID-iD_icon_BW_16x16.png
│   ├── 2020-04-02-covid-19
│   │   ├── 0*0nXhv3wBHICs8oU9.webp
│   │   ├── 0*6GxdzZ8ff8750eUI.webp
│   │   ├── 1*ARAUUR6FfmsgiJu1ocjQ-A.webp
│   │   ├── 1*KM-S2Z7BJotlspqUr8Te5g.webp
│   │   ├── 1*STZnkSEKJRVMBzelagdi-A.webp
│   │   ├── 1*WAinSw5vnzOzm5aAjgIXCg.webp
│   │   ├── 1*bvu6XdbTRlk975p7bpVl2Q.webp
│   │   └── 1*pXhoiK8_kaJ38oawNTGwag.webp
│   └── 2021-12-05-shake-II
│       ├── 1_9JOrZ76udsvr1kKBippbYg.webp
│       └── 1_cFrMhLDcSVAt6zmR1BsCjg.webp
├── templates
│   ├── post-list.html
│   ├── post.html
│   └── default.html
├── blog.cabal
├── stack.yaml.lock
├── site.hs
├── css
│   └── default.css
└── posts
    ├── 2025-02-21-poseidon-git-pr-editing.md
    ├── 2021-12-05-shake-I.md
    ├── 2017-12-28-custom-bars-rcppprogress.markdown
    ├── 2023-12-31-poseidon-end-of-year-2023.md
    ├── 2020-04-02-covid-19.md
    ├── 2021-05-06-lambdar.md
    └── 2021-12-05-shake-II.md

/.gitignore:
_cache/
.stack-work/

/stack.yaml:
resolver: lts-22.43

packages:
- .
/favicon.ico: https://raw.githubusercontent.com/nevrome/nevrome.de/master/favicon.ico
/_site/favicon.ico: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/favicon.ico

/index.html:
---
title: Posts
---

$partial("templates/post-list.html")$

/images/ORCID-iD_icon_BW_16x16.png: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/ORCID-iD_icon_BW_16x16.png
/_site/images/ORCID-iD_icon_BW_16x16.png: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/ORCID-iD_icon_BW_16x16.png
/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp
/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp
/_site/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/0*0nXhv3wBHICs8oU9.webp
/_site/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/_site/images/2020-04-02-covid-19/0*6GxdzZ8ff8750eUI.webp
/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*ARAUUR6FfmsgiJu1ocjQ-A.webp
/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*KM-S2Z7BJotlspqUr8Te5g.webp
/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*STZnkSEKJRVMBzelagdi-A.webp
/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*WAinSw5vnzOzm5aAjgIXCg.webp
/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*bvu6XdbTRlk975p7bpVl2Q.webp
/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2020-04-02-covid-19/1*pXhoiK8_kaJ38oawNTGwag.webp
/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2021-12-05-shake-II/1_9JOrZ76udsvr1kKBippbYg.webp
/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp: https://raw.githubusercontent.com/nevrome/nevrome.de/master/images/2021-12-05-shake-II/1_cFrMhLDcSVAt6zmR1BsCjg.webp

/templates/post-list.html:
/posts/2021-12-05-shake-I.md:

This is part I of a two-part blog post. See part II for a little showcase of Shake.
Workflow management, i.e. software to organize and run data analysis scripts, is one of those fortunate domains where dozens of open source solutions compete for our attention. There's probably something for every taste (see e.g. the extensive list here), and many of these projects are actively maintained or at least comparatively easy to resurrect. This post is an attempt to describe my personal journey towards a tool that fits me, in the hope of motivating you to go searching as well.

My PhD research is located somewhere between Bioinformatics and Archaeoinformatics (yep — that's a thing) and I work with large and high-dimensional datasets. Not really big data, but big enough to require a high performance computing environment to run analyses in reasonable time. Space, time and (ancient) DNA meet in my data, so my code necessarily relies on a variety of software libraries from different domains. In the last two years I piled scripts on top of scripts and thus created a complex network of interlinked code for data preparation, analysis and visualization.

This is my personal user story. It eventually brought me to a point where I realized that I had to introduce a more sophisticated system for dependency management and workflow automation. The former is especially important for reproducibility, the latter to propagate changes, so as to always maintain an up-to-date version of derived data products and plots. I needed a system that defines, runs and monitors a pipeline of code across different interacting scripts.

As I share these challenges with a large number of people who work professionally with computers, there are many excellent solutions for them out there. I just had to pick what fits me, my tasks and my interests. So I decided to follow my gut feeling and ended up with the containerization solutions Docker and Singularity to encapsulate my development environment (which will only be mentioned in passing here), and the build system Shake to orchestrate my analysis pipeline.
The first options I considered for pipeline management were Nextflow and Snakemake. Both are very popular among my colleagues in bioinformatics. At our department there seems to be an even divide between strong fans of the former and the latter. I personally did not want to deal with either Groovy or Python, though, which Nextflow and Snakemake respectively use as their underlying configuration language. Ideally I wanted to write the pipeline definition in a language and framework I'm already familiar with. That's not (only) laziness. By working in either R or Haskell, with which I feel most comfortable, I could more easily leverage the power of these languages.

So then I gave some scrutiny to targets, an implementation of a pipelining tool in R. This might have worked for me, but it gave me the impression of being too focused on workflows within R. R is certainly an important component of my personal tech stack right now, but I wanted to be prepared for whatever the future might bring. I also — and that's very shallow — didn't like targets' syntax from what I saw in the example code, where every computation in a pipeline got crammed into a single list object.

At this point I realized I would really like to solve this in Haskell, as the language had become something of a personal passion anyway. A functional, strongly typed language should also — at least in theory — be a good fit to formalize build rules. I did some research and came across three Haskell tools that seem to offer workflow management: Funflow, Porcupine and Bioshake. Instead of diving into them one after the other, I took a step back and asked the excellent Haskell community on reddit for advice: Experiences with workflow managers implemented in Haskell (funflow, porcupine, bioshake, ?)

Fortunately Justin Bedő, the author of Bioshake, saw the post and gave me some insights about his implementation. At the time he had already moved one step further and had discontinued the development of Bioshake in favour of his new solution BioNix, which solves both (!) dependency and workflow management with the fascinating Nix infrastructure. As Nix is a big world of its own, I couldn't follow him there. So I instead gave the Bioshake documentation a good read. And there I realized that Bioshake relies heavily on Shake internally: understanding Shake seemed to be a prerequisite for figuring out Bioshake. And Shake alone already turned out to be powerful and flexible enough for my current needs!
I had reached the end of my software exploration journey.

Your journey towards a workflow management solution would certainly be different, and you would most likely reach different conclusions. But I encourage you to explore this realm if you think you share a user story similar to mine. You can keep reading here if you want to see how I configured Shake to help me with my challenges.
/posts/2025-02-21-poseidon-git-pr-editing.md:

At the time of writing, the Poseidon community archive has 14 open pull requests – most of which were opened by various community members to add new packages to the archive. What is certainly a pleasant development, because it indicates that the archive is being adopted, also comes with technical and administrative challenges. As an editor for the archive I recently had to step up my Git skills to address a particular issue I was facing.
Already multiple times I have found myself in the situation of needing to edit a submission pull request before merging. This arose, for example, when a package author prepared a package almost perfectly, but I still wanted to apply some additional minor changes before merging. Or when an author or reviewer had struggled with Git, manoeuvred themselves into a predicament, and needed my help to untangle the knot without starting from scratch. So here is what I came up with to do that efficiently.
GitHub's documentation includes a helpful tutorial on how to commit changes to a pull request branch created from a fork. It already covers the basic workflow for editing a fork. The article highlights a number of conditions for this to be possible:
> You can only make commits on pull request branches that:
>
> - Are opened in a repository that you have push access to and that were created from a fork of that repository
> - Are on a user-owned fork
> - Have permission granted from the pull request creator
> - Don't have branch restrictions that will prevent you from committing
All of these are met in my case. But two additional challenges complicate the matter: i) the community-archive uses Git LFS for the large data files, and ii) I need to do this so frequently that cloning every fork feels unnecessarily cumbersome. The following workflow considers this special situation.
    GIT_LFS_SKIP_SMUDGE=1 git clone git@github.com:USERNAME/FORK-OF-THE-REPOSITORY.git

Note that this workflow assumes that you have installed and configured Git LFS on your system. Cloning the repo with the GIT_LFS_SKIP_SMUDGE environment variable set prevents downloading the LFS-tracked files despite Git LFS being enabled. This saves bandwidth and costs for us on GitHub.
    git switch PR-BRANCH

This is only necessary if the PR branch is not the main/master branch.
    git lfs pull --include "PATH-TO-FILE"

To validate a package completely it can be necessary to also access the genotype data. But because we cloned above with GIT_LFS_SKIP_SMUDGE=1, this data is not in our clone now. Fortunately we can selectively download it. PATH-TO-FILE can also include wildcards.
Remember to commit the changes.
This should work with git push. But yet again, Git LFS complicates things, raising the following error message:

    error: Authentication error: Authentication required: You must have push access to verify locks
    error: failed to push some refs to 'github.com:USERNAME/FORK-OF-THE-REPOSITORY.git'

This is caused by a limitation of GitHub's Git LFS implementation. A long thread here discusses the issue: Authentication required: You must have push access to verify locks error. Multiple solutions are suggested there. One reliable workaround is to delete the Git hook .git/hooks/pre-push.
    rm .git/hooks/pre-push
    git push

This resolved the issue for me – specifically because I never had to edit any of the genotype data files when working on a PR fork. I don't know how this hack affects the handling of LFS-tracked files.
If the changes in a fork A are already merged into the master branch of the main archive repository, then a little trick allows us to switch to another fork B in the same clone.
    git remote -v
    git remote set-url origin git@github.com:poseidon-framework/community-archive.git
    git switch master
    git pull
    git remote set-url origin git@github.com:USERNAME/FORK-OF-THE-NEXT-REPOSITORY.git
    git pull

We set the remote URL to the main repository, switch to the master branch, and pull. The commits from A are already there, so we have a clean state again. From here we can set a new remote URL for a fork B and pull. This effectively saves us from creating a fresh clone (as described in the cloning step above).
/posts/2023-12-31-poseidon-end-of-year-2023.md:

It's late December and the time of the year when work slows down in my part of the world. For many of us this is an opportunity to take a break and to look back, contemplating the achievements of the year. I decided to do so as well and write a bit about Poseidon.
46 |What follows is a subjective account of the events in and around the framework in 2023 - each of my colleagues in the core team (Stephan Schiffels, Ayshin Ghalichi, Thiseas C. Lamnidis, Dhananjaya B. A. Mudiyanselage, Wolfgang Haak and I, Clemens Schmid) would probably emphasise different developments in such a write-up. That is in itself an achievement, because it shows how much the tech-stack, domains and services in our little ecosystem have grown this year: beyond the understanding of each of us individually.
47 |Let’s start simple with the two new releases of the Poseidon schema we published this year: v2.7.0 and v2.7.1. They were published in short succession in March and May, the latter only slightly improving the sequencing source files (.ssf) added in the first. See the changelog here for more details, but the addition of the .ssf file is indeed their most remarkable contribution to the schema. With it we addressed a major desideratum and unresolved question in previous versions of Poseidon: How should genotype data be linked to the raw sequencing data on the European Nucleotide Archive (ENA) and other archives of the International Nucleotide Sequence Database Collaboration (INSDC)?
49 |The .ssf file is, I would argue, a smart solution for this question. It specifies the same variables already used in the ENA database, allows for an extremely flexible, yet not arbitrary n:m connection between the entities in a Poseidon package and the raw data products and it can be generated semi-automatically for most of the data in our public archives. With some tweaking it can also be used to organize local data repositories independent of any online databases. The .ssf file is finally the very foundation on top of which the amazing Minotaur workflow is built (see below).
50 |Generally, both the fact that only two Poseidon releases were necessary this year and that we could treat them as non-breaking changes indicate that we reached a certain level of maturity and stability in the schema. Of course we still have ideas how to extend it further in the future, but at the moment I’m optimistic that we can maintain long-term backwards compatibility. The process in which we discussed, specified and later improved the .ssf file definition to then see Minotaur be erected on top of it was a very satisfying professional experience for me personally.
51 |The Minotaur workflow is a semi-automatic workflow to reproducibly process published sequencing data into Poseidon packages. Developing this entirely new branch of the Poseidon ecosystem became possible because Thiseas joined the Poseidon core team in 2023. He came up with a sophisticated, yet open and transparent implementation of this process, in which authors and the community as a whole retain control over the data and the data processing parameters. A full write-up for the website is in progress. Here is the summary Thiseas prepared for our poster at the ISBA conference:
53 |Community members can request new packages to be processed through the Minotaur workflow by submitting a build recipe as a pull request against a dedicated GitHub repository. This recipe is created from a sequencing source file (.ssf), describing the sequencing data for the package and where it can be downloaded. Using the recipe, the sequencing data gets processed via nf-core/eager on computational infrastructure of MPI-EVA, using a standardised, yet flexible, set of parameters. The generated genotypes, together with descriptive statistics of the sequencing data (Endogenous, Damage, Nr_SNPs, Contamination), are compiled into a Poseidon package, and made available to users in the minotaur-archive.
The Minotaur workflow is a timely addition to the Poseidon framework, providing a flexible solution to wrap legacy and new data in uniformly processed packages. Homogeneous data processing puts us closer to our great comparandum, the AADR dataset. It also helped us to finalize the structure of our public archives, which emerged from long discussions about the kind of data we think the aDNA community requires for derived analyses.
Right now the Minotaur workflow is still in a final development and testing phase, where we focus on the processes around it: the submission of recipes, their review and the forwarding of results to the minotaur-archive. One particularly tricky question is how context information in the .janno file should be passed from the community-archive to the new packages in the minotaur-archive. One of the last pull requests for our software tool trident in 2023 aims to introduce a reliable mechanism to merge .janno files to address this issue.
56 |In 2023 we finally came to a conclusion on how to organize our public data archives. What emerged is a threefold division into what we call the community-archive, the minotaur-archive and the aadr-archive. The archives are described in more detail on the website, but here’s the gist of it:
58 |The community-archive emerged from our old public-archive. It includes the legacy data we originally copied from the AADR. We now decided to use this archive for author-submitted publication-wise packages to collect the exact genotype data analysed in the respective papers. The idea is twofold: With the author-submitted genotype data the results in a given paper can be reproduced exactly. And the publication authors are generally the most trustworthy authority for the context data we collect in the .janno files, e.g. the spatiotemporal origin of the individual samples. Ayshin and I recently wrote about the submission process for the community-archive here.
59 |The minotaur-archive mirrors the community-archive in that it features publication-wise packages, usually even the very same as in the community-archive. To distinguish them clearly, package titles and sample-wise Poseidon_IDs in the minotaur-archive carry the suffix _MNT. As explained above the packages in this archive include consistently reprocessed genotype data, run through the Minotaur workflow.
The aadr-archive is the conceptually simplest archive. It features "poseidonized" versions of releases of the AADR dataset, currently only the latest AADR v54.1.p1. We documented the code and decisions for the cleaning and packaging process here.
2023 not only saw the planning and setup of these three archives, but also a lot of work to fill them with life. For the community-archive that meant plenty of data cleaning by all of us, most notably Dhananjaya. And it also meant providing guidance for authors to submit their data. Thanks to the hard work of Ayshin a total of eleven author-submitted packages are available in the archive now. Number twelve was submitted shortly before Christmas and is awaiting review. The minotaur-archive is still functionally empty, but three packages are pending thanks to Thiseas and will hopefully soon be merged. Preparing the latest version of the AADR dataset for the aadr-archive was one of the projects I tackled this year.
62 |The Poseidon software tools grew significantly more powerful this year. From a user-perspective 2023 brought various new features, changes to the command line interfaces and breaking updates in the Web-API. To keep track of the releases and the Poseidon schema versions they support I created a version overview table on the website.
With qjanno I added an entirely new tool to the set. It is a command line tool to run SQL queries on .janno (and arbitrary .csv and .tsv) files. I created it by forking the qsh package and then adjusting it heavily for use on Poseidon packages. Just like trident it is written in Haskell and openly available with precompiled executables here.
65 |Stephan invested a good amount of effort into consolidating the data analysis features in xerxes. He wrote a whitepaper to explain and justify the reasoning behind the implemented logic for f-statistics, and another blog post on how to run it. Even more approachable and comprehensive is a write-up he shared here. Together we worked on integrating the many changes to trident and its underlying poseidon-hs Haskell library into xerxes.
66 |Our main workhorse, trident, saw an astonishing number of new releases: v1.1.6.0 on January 8 to v1.4.0.3 on October 30. I quickly went through the extended changelogs published with each release to summarize the user-facing highlights of what trident supports now:
- the trident forge selection language (v1.1.7.0)
- --fullGeno in trident validate (v1.1.10.2)
- trident update, now called trident rectify, and trident validate, which now allows validating not just entire packages, but also individual files (v1.3.0.4)

As always I enjoyed the work on the software tools tremendously, especially in two cases: if one of our users reports an issue and we can address a concrete need with a release, and if the Haskell programming language allows for a particularly elegant solution for a given problem. A currently pending pull request combines both: Ayshin made me aware of some validation failure cases that require better error messages and I found a neat way to provide just that with a custom-tailored monadic stack.
77 |The last domain where we made good progress in 2023 is public outreach. Naturally we invested hours in writing and updating documentation on the project website (https://www.poseidon-adna.org), but we also pursued a number of special projects beyond the basic, technical description of software and workflows.
79 |The first one of these was possible thanks to the effort of Dhananjaya, Stephan and me: We built a page on the website where the data in the public archives can be easily explored. It makes use of our Web-API to access the data and display it with a sub-page for each package. Dhananjaya wrote a blog post about this, recently.
80 |I already mentioned this blog multiple times above. It is indeed another great addition of 2023. Stephan created a separate website at https://blog.poseidon-adna.org to share news and short tutorials. Our wish has always been to gather an active and engaged community of users around Poseidon, and we hope to establish this blog as one of its central communication hubs. A major medium for longer write-ups beyond the technical documentation already available on the website.
To announce our blog posts, software releases and other news we fully switched from Twitter (now X) to the Fediverse in 2023. You can follow us here: https://ecoevo.social/@poseidon. The switch came naturally, given the state of affairs at X. Submitting posts automatically is easier with Mastodon than with Twitter, and I made sure that this process works reliably for our software releases on GitHub.
82 |Beyond these technical novelties and online communication we also presented Poseidon at two in-person conferences in 2023: ISBA10 in Tartu, Estonia and the NFDI4Objects community meeting in Berlin, Germany. The poster we presented at both of these occasions was already mentioned above and is available here. And the slides for the talk Thiseas prepared for the latter should soon be made available by the NFDI4Objects team.
83 |Much has happened for Poseidon in 2023 and I’m sure I’m not doing all of it due justice in this little summary. But I consider what is here already an impressive list that stands witness for the effort we put into the framework. And it seems to pay off: The user base is growing. More users help us in turn to find and address remaining issues and make Poseidon better for all of us. This will once more be one of my main aspirations in the coming year 2024.
/posts/2020-04-02-covid-19.md:

Acknowledgements: We got some valuable input and corrections from Martin Lange and Johannes Boog (both Helmholtz Centre for Environmental Research Leipzig).
46 |Disclaimer: We have no epidemiological training and share these results without warranty of any kind. They should not be used as a basis for decision making and we refer to the respected authorities (e.g. for Germany the Robert Koch Institute) for reliable information and models. This post is only an interesting exercise in data analysis.
47 |Note: Analyses in this post are from April 2nd, 2020, and naturally include only data from before that date.
48 |The COVID-19 pandemic has taken its toll all around the world and caused (so far) hundreds of deaths in Germany. In this post we present current data and model estimations for multiple relevant parameters (e.g. current number of real infections and number of future deaths) for Germany.
In the context of the #WirvsVirus hackathon we started to work on the R package covid19germany, which allows downloading and visualizing the current numbers of confirmed cases and deaths by administrative unit. We use this package to access the data for this post. The code for this post can be found here. Furthermore the package comes with a webapp that allows exploring some of the following data and analyses in more detail — not just for the whole of Germany, but also for smaller administrative units as well as gender and age classes.
The number of confirmed COVID-19 cases in Germany is rising daily, but it is unclear to which degree new infections are taking place or testing is simply catching up with past infection events. Germany may be one of the countries where testing covers a higher proportion of infected cases, as the testing abilities are comparatively good. As testing will always lag behind the actual number of infections, it is still an unreliable estimator of the true dimensions of this pandemic. The number of deaths caused by COVID-19 is a more trustworthy indicator — though with a significant temporal delay. More about this later.
[Figure: confirmed COVID-19 cases and deaths in Germany through time]

The increase of infections and deaths follows an expected acceleration trend due to exponential disease expansion with a growing number of spreaders. Dips on the weekends, especially in the number of positive tests, might be an effect of reduced working hours and reduced information transmission in and by health care authorities. At first glance, it is not entirely clear from this data if the social distancing rules imposed by the federal and local governments during the last two weeks have had a significant effect on the spreading of COVID-19, but the recent decline in the number of daily deaths raises hope.
[Figure: maps of confirmed cases and deaths by county (Landkreis) and federated state (Bundesland)]

Western and Southern Germany have so far been more affected than Eastern Germany, with some individual counties (Landkreise) at the border to France, Czechia and Austria especially compromised. North Rhine-Westphalia, Bavaria and Baden-Württemberg — and therefore the federated states (Bundesländer) with the most inhabitants — have the most test-confirmed cases as well as deaths. A dashboard provided by the RKI, the GeoHealth Center at Bonn University and ESRI gives a good overview of the official numbers, which are published on a daily basis. The RKI also releases a daily report with relevant information.
70 |It generally is a difficult task to estimate the true number of infected people during an epidemic outbreak. However, we learned about two methods to do so in this excellent post by Tomas Pueyo.
One way is to focus on the current number of deaths. If we know the mean time it takes an individual from infection to death (in case of death!) and the lethality (the general probability to die from COVID-19), then we can calculate an estimate of the number of infected people in the past. We have some information about these two parameters from early scientific studies about COVID-19. We will use a fixed value of 17 days for the time to death and two different values for the lethality: 1% and 5%.
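A minimal R sketch of this back-calculation (the numbers below are made up for illustration and are not the actual counts):

```r
time_to_death <- 17            # assumed mean number of days from infection to death
lethality <- c(0.01, 0.05)     # the two lethality scenarios: 1% and 5%

# deaths observed on a given day imply infections time_to_death days earlier:
deaths_observed <- 1100        # hypothetical cumulative death count
estimated_infected_back_then <- deaths_observed / lethality
estimated_infected_back_then   # 110000 (1% scenario) and 22000 (5% scenario)
```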
In the figure below, the estimate of the true number of infections for Germany is plotted with one line for each of the two lethality scenarios. It can only be calculated for the past before the mean death time, which is indicated in the plot by a black, vertical line.
[Figure: estimated true number of infections under the two lethality scenarios, compared to the confirmed cases]

The lower the lethality of COVID-19, the higher the number of actually infected people in the past must have been, given the number of deaths that occurred later. We highlight that this estimated statistic is at least one order of magnitude higher than the measured observation of confirmed cases shown with the red line in the plot. Very interesting is the sudden uptick of the latter at the end of February, which is well reflected in the estimated statistic. Keep in mind: The estimation is based on deaths, not on test results! This correlation is therefore a good indicator that the estimate reflects some truth and that the number assumed for the mean time from infection to death (17 days) is not totally off.
Nevertheless this estimator by definition only provides information about the distant past (before the black, vertical line). To extrapolate this statistic until yesterday (after the black and before the blue, vertical line) we need another set of assumptions. In the simplest possible growth model the disease tends to spread in an exponential fashion with a certain time window until the number of infected doubles: the doubling time. We can take the last value I₀ of our first statistic and extend it with a time series of exponential growth following

Iₜ = I₀ x 2^(t/d)

where Iₜ is the true number of infected individuals after the time t. t is counted in days from yesterday minus the mean number of days from infection to death. d is the aforementioned doubling time in days.
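A minimal R sketch of this extrapolation; the starting value I0 below is a made-up placeholder, only the doubling time scenarios match the ones discussed here:

```r
I0 <- 100000                   # hypothetical estimated true infections at the last estimable day
doubling_times <- c(3, 7, 12)  # the three doubling time scenarios in days
t <- 1:17                      # days to extrapolate, up to yesterday

# I_t = I_0 * 2^(t/d), one column per doubling time scenario
extrapolated <- sapply(doubling_times, function(d) I0 * 2^(t / d))
colnames(extrapolated) <- paste0("doubling time: ", doubling_times, " days")
round(extrapolated)
```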
82 |The plot above shows three doubling time scenarios (3, 7 or 12 days) for each death probability scenario between the black and the blue vertical line (six scenarios in total). Some of them can already be ruled out considering the real-life testing data: They fall below the red curve. Others remain well possible. An increase of the doubling time is in all cases the desirable scenario and the following weeks will reveal (with their death count) if the social distancing measures prove to be effective to achieve this. Nevertheless it is very likely that far more people are infected right now than testing is able to confirm.
In a last step we can use the estimated infection counts to extrapolate the number of expected deaths in the near future (yesterday plus the mean number of days from infection to death) for the different doubling time scenarios. The lethality is not relevant for this particular approximation, because it already influenced the preceding calculation and therefore cancels out of the equation.
[Figure: extrapolated number of expected deaths for the different doubling time scenarios]

If the number of cases that require intensive care rises above a certain threshold, the capacities of hospitals would inevitably run out and the lethality would further increase beyond these projections. This dire possibility became a grim reality in Northern Italy.
89 |To complement the analyses above and to make a more educated guess about the parameters visualized so far, we set up a Bayesian model to estimate the true number of infected people through time from both the reported deaths and the reported cases. This model was based on a slightly more complex notion of exponential growth with a built-in slow-down and includes the following assumptions:
91 |Given these assumptions, we can estimate the true number of infections, as well as the reported number of test cases and deaths. A complete definition and analysis of this model can be found here.
[Figure: Bayesian model predictions (colored ribbons) together with the reported cases and deaths (points)]

The model predictions (the colored "ribbons") are shown together with the true reported cases (points). Because this is Bayesian inference, all model predictions are given with quantified uncertainty. Note that we have incorporated only data points between February 23 and April 1 in this analysis. Before that time, Germany did not experience exponential growth yet.
As already shown above, the true number of infections (dark green) based on a death rate of 1% far exceeds the number of confirmed cases. We highlight that this is due to two effects: First, the reported cases and deaths lag behind the true infections, so under exponential growth we expect the true infections of today to be much higher than the reported ones, which reflect the infections of about seven days ago. Second, it is clearly expected that not all people with an infection get tested, for example because they don't show symptoms.
104 |One of the nice features of our model is that we get an explicit estimate of this miss-rate, but it depends linearly on the death-rate. In this case, we have assumed a death rate of 1%, and this yields — shockingly — a probability of getting tested between 12% and 24% only. That would mean that 76–88% of true infected cases are not tested. With a death rate of 3%, for example, the miss-rate would “only” be about 40–60%. So this is hard to estimate, but it’s clear we’re missing a lot!
A significant complication in this regard is introduced by the age structure of the population, because we know that elderly people die with much higher probability from COVID-19 than young people. An important next step for this kind of modelling would be to incorporate more realistic death rates, possibly age-stratified.
106 |The specific growth model with linear slow-down seems to work OK for the data we have, although not perfectly. In particular, the slow down in recent days seems to be stronger than modeled. This is somewhat expected, since the measures against spread of the virus haven’t been “linear” in any way. Nevertheless, a linear slow-down is the first approximation to this process. Based on this, we can again — and this time in a more sophisticated way — try to predict how many cases we will have in the coming weeks. This is of course highly speculative and depends on assumptions in the model. In fact, the uncertainty increases the further you predict into the future, which is visible by the widening of the model bands in the figure. For example, the number of reported cases on April 15 is predicted to be anywhere between 60,000 and 150,000 (though not with uniform probability) according to this model and its uncertainty today. The reported number of deaths by that time are predicted to be between 2700 and 6000 in Germany. These wide intervals simply reflect the limited power of the data to accurately estimate the parameters of the growth model.
A popular choice to illustrate the speed of an exponential growth model is the doubling time in days, which we already employed as a static parameter in the simple model above. Our Bayesian inference now allows us to estimate this parameter as a dynamic property of the underlying growth model. Here it is over the course of the last few weeks with a short outlook into the next week:
[Figure: estimated doubling time through time, with a prediction for the coming week]

So there definitely is some indication of a slow-down, with a doubling time just around 2.5 days around the end of February and now a rate around 5 days (the black line indicates the time of this writing), and a future prediction between 7 and 16 days in a week from now. This is interesting in light of comments from officials that a doubling time of 10 days or more should be reached in order to not overwhelm the healthcare system.
113 |We highlight three main conclusions from our modelling:
We hope that our work may trigger some feedback and motivation for others. It is very easy to get started on working with the data, for example by using our ready-to-use R package. A lot more analyses are possible when taking into account other data, some of which is provided in this package, including county-based information about population numbers, the number of hospital beds, and age structure.
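Getting started could look roughly like the sketch below. The repository path and the function name are quoted from memory, so treat them as assumptions and check the package README:

```r
# install from GitHub (assumed repository path)
# remotes::install_github("nevrome/covid19germany")
library(covid19germany)

# download the daily RKI reporting data (assumed function name - see the README)
rki <- get_RKI_timeseries()
head(rki)
```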
/posts/2021-05-06-lambdar.md:

TL;DR: Feel free to directly jump to The lambda.r implementation if you only want to see that. The full code is posted at the end of the article.
46 |Haskell and R are quite different programming languages. One is purely functional, statically typed and prominently features some of the most obscure abstractions in Computer Science. The other one lives at a particularly weird spot at the crossroad of the object-oriented, imperative and functional paradigms, has a ductile and dynamic type system and is optimized for the pragmatic needs of data analysis.
47 |But still these two languages share some interesting features. For example both can be run interactively in an interpreter environment. And both consider functions first-class citizens – thus offering higher-order functions – and allow the definition of custom infix operators. And that’s why something like lambda.r is possible in the first place.
48 |lambda.r (here v.1.2.4) is an R package that provides syntax extensions to write functional, Haskell-like code in R. It implements an astonishing number of features including type and function definition, pattern matching, guard statements and even monads! True functional programming available at your fingertips in R. All while maintaining a surprisingly Haskell-like syntax and incorporating powerful bonus features from R. Even a custom debugging system is part of the package.
The author Brian Lee Yung Rowe did an incredible job and also maintained the package over a commendable time span – the first commit on GitHub is from 2012 and the last change was pushed in 2019.
50 |Of course the package has some known limitations and rough edges. In my opinion it’s an extremely clever proof of concept and I enjoyed very much playing with it, but I’m not sure if I would recommend it for use in production. I’ll leave that to you and instead show you what I managed to build with it.
51 |Recently I wanted to implement a simple but specific logic in a bioinformatics context — so this is a real world example. But it would be tedious to explain the background, so I’ll instead replace the entities with something more digestible: Apples.
53 |Let’s say we have two sets of apple varieties and then a number of other fruit variety sets (varieties of pears, plums, strawberries, …). The first apple collection is large and covers all sorts of types: Ambrosia, Granny Smith, Red Delicious, Jonagold, Rome, Honeycrisp and many more. The second apple collection is much smaller, but a strict subset of the first one. It only includes the three varieties Granny Smith, Red Delicious and Honeycrisp. We don’t really care about the other fruits.
How could we model these sets in Haskell? We don't need to consider the individual varieties here, only the variety collections. So we could create the type FruitSet with three data constructors for the three different relevant sets. For the sake of simplicity let's shorten their names to LAS (the large apple set), SAS (the small apple set) and OFS (the other fruit sets); the corresponding data declaration is part of the full code listing at the end of the post.
Now about the issue we have to solve for these sets: We need a function that merges a list of fruit sets according to a very specific logic into only one output fruit set. This has to adhere to the following pair-wise (and undirected) merging rules:

- LAS merged with LAS yields LAS
- SAS merged with SAS yields SAS
- Any set merged with OFS yields OFS
- LAS merged with SAS yields SAS (in an intersect merge)
- LAS merged with SAS yields LAS (in a union merge)

For the final two rules we have to distinguish two different kinds of merges: a union merge and an intersect merge.
72 |I think these rules are an excellent application for pattern matching in Haskell. We could implement them in a function like this:
    fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet
    fSMerge LAS LAS _ = LAS
    fSMerge SAS SAS _ = SAS
    fSMerge OFS _ _ = OFS
    fSMerge _ OFS _ = OFS
    fSMerge LAS SAS True = SAS
    fSMerge SAS LAS True = SAS
    fSMerge LAS SAS False = LAS
    fSMerge SAS LAS False = LAS

Even if you're not familiar with Haskell you may appreciate how the different pair-wise comparison cases are expressed here. The function takes two FruitSets and a logical to distinguish union (False) and intersect (True) merges. For many of these rules it does not even matter which kind of merge is applied. Here we can replace the pattern with the wildcard symbol "_".
Now that we have these rules, we can also implement the function that applies them to an arbitrary list of FruitSets to determine the appropriate superset.
    fSMergeList :: [FruitSet] -> Bool -> FruitSet
    fSMergeList (x:xs) intersect =
      foldr (\a b -> fSMerge a b intersect) x xs

It uses a fold to combine the list elements into one. Folds are operations that look at two elements of a list, apply some binary function to them, take the result and apply the same function again to that and a new list element, until only one result remains and the list is exhausted. Folds usually need a starting value that also serves as an "accumulator" to track the list-condensing result along the fold's way through the list.
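For readers more at home in R: base R's Reduce plays the same role, and it is what the lambda.r version further down will rely on. A minimal sketch:

```r
# Reduce() folds a binary function over a vector or list,
# with an explicit starting value that acts as the accumulator.
# (Reduce folds from the left by default, Haskell's foldr from the right.)
Reduce(function(acc, x) acc + x, c(2, 3, 4), init = 1)
# [1] 10
```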
Here I used Haskell's clever pattern matching on lists (x:xs) to separate the input list's head and tail. That makes it straightforward to set the head element as the starting value for the fold. We will see below that lambda.r is less elegant here.
Finally we can test our code:
    fSMergeList [LAS] True
    -- LAS
    fSMergeList [LAS, LAS] True
    -- LAS
    fSMergeList [LAS, LAS, SAS] True
    -- SAS
    fSMergeList [LAS, LAS, SAS] False
    -- LAS
    fSMergeList [LAS, LAS, OFS] False
    -- OFS

Works like a charm! Let's compare that with lambda.r now.
lambda.r provides some functions, mostly clever infix operators, to enable a Haskell-like logic and syntax in R. To access them we have to install and load the package first.
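Assuming the package is (still) available from CRAN, that is the usual two-liner:

```r
install.packages("lambda.r")  # one-time installation
library(lambda.r)             # load the package
```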
Just as in the Haskell code above we have to find a way to represent fruit sets. With lambda.r, types are defined by their constructor functions. Each function has a name and input arguments, separated from a return value or operation by the %as% infix operator.
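For the three fruit sets the constructor definitions look like this (they reappear in the full code listing at the end of the post):

```r
FruitSet("LAS") %as% "LAS"
FruitSet("SAS") %as% "SAS"
FruitSet("OFS") %as% "OFS"
```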
A distinction between type and data constructors, as in Haskell, does not exist to my knowledge, and there are no nullary data constructors ("constants") either. So I decided to be creative and use pattern matching on strings to simulate a data type for the different fruit sets. lambda.r understands this syntax perfectly fine and prints the resulting type as follows:
    [[1]]
    FruitSet("LAS") %:=% ...
    [[2]]
    FruitSet("SAS") %:=% ...
    [[3]]
    FruitSet("OFS") %:=% ...

With that data type we can define the pair-wise merging logic as laid out above.
    fsMerge(a, b, intersect) %::% FruitSet : FruitSet : logical : FruitSet
    fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS")
    fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS")
    fsMerge("OFS", b, intersect) %as% FruitSet("OFS")
    fsMerge(a, "OFS", intersect) %as% FruitSet("OFS")
    fsMerge("LAS", "SAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("SAS", "LAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("LAS", "SAS", FALSE ) %as% FruitSet("LAS")
    fsMerge("SAS", "LAS", FALSE ) %as% FruitSet("LAS")

Note how extremely similar this syntax is to Haskell. The type interface definition follows exactly the same principle, short of some minor deviations when :: became %::% in R and -> is replaced by :. R has some limitations regarding infix operators.
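The relevant limitation is that user-defined infix operators in R have to be wrapped in percent signs, which is presumably why lambda.r ended up with %::% and %as%. A quick illustration:

```r
# user-defined infix operators must take the form %...%
`%then%` <- function(a, b) paste(a, b)
"first" %then% "second"
# [1] "first second"
```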
One key take-away is that this function will not run with input that is not exactly as specified. lambda.r thus introduces a static type system into R.
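A quick sketch of what that means in practice, assuming the definitions above are loaded (the exact error wording comes from lambda.r's dispatch mechanism):

```r
fsMerge(FruitSet("LAS"), FruitSet("SAS"), TRUE)   # matches a rule and returns FruitSet "SAS"
fsMerge(FruitSet("LAS"), FruitSet("SAS"), "yes")  # "yes" is not a logical, so dispatch fails with an error
```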
133 |The pattern matching in the function definition is just as in Haskell, except of course for a number of syntactic details like the parentheses, commas, string-based values and lack of explicit wildcards. It’s another language after all!
134 |With this function implemented, we only lack the last component: The function to apply the pair-wise comparisons with a fold on a list of FruitSets. And here things start to become a bit more tricky, unfortunately. Let’s start with the result:
    fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet
    fsMergeList(xs, intersect) %as%
      Reduce(
        function(a, b) { fsMerge(a, b, intersect) },
        xs[tail(seq_along(xs), n = -1)],
        init = xs[[1]]
      )

The general structure is again very Haskell-like. For the folding we use the Reduce function from the R base package (which is something like the Prelude in Haskell). One major difference between lambda.r and Haskell, though, is that lambda.r lacks a good default way to handle lists. Maybe I just missed the relevant documentation or overlooked something else, but I struggled a bit with that.
In the end I decided to come up with my own list type.
    FruitSetList(...) %::% FruitSet... : FruitSetList
    FruitSetList(...) %as% asFruitSetList(list(...))

    asFruitSetList(xs) %::% list : FruitSetList
    asFruitSetList(xs) %as% {
      class(xs) <- c("FruitSetList")
      xs
    }

This constructor makes use of the Ellipsis type "...", a weird feature of R that is well integrated into lambda.r: a single input argument that can represent a set of multiple arguments. In lambda.r it can be combined with a type constraint to make sure that the function takes an arbitrary number of arguments, but only of this type. So here of type FruitSet.
That allows for a pretty cool constructor syntax:
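A call along these lines (matching the printed result below) produces such a typed list:

```r
FruitSetList(FruitSet("LAS"), FruitSet("SAS"), FruitSet("OFS"))
```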
    [[1]]
    [1] "LAS"
    attr(,"class")
    [1] "FruitSet" "character"
    [[2]]
    [1] "SAS"
    attr(,"class")
    [1] "FruitSet" "character"
    [[3]]
    [1] "OFS"
    attr(,"class")
    [1] "FruitSet" "character"
    attr(,"class")
    [1] "FruitSetList"
Unfortunately I found no direct way to catch the ellipsis and make it a FruitSetList. With list(...) I could indeed transform it to a list, but that's only half the job. I resorted to the rather ugly asFruitSetList that "manually" adds the "FruitSetList" label to the class attribute of the output object. That works because lambda.r utilizes R S3 classes for its magic.
With that out of the way there was still one issue to address. I could not use Haskell’s pattern matching on lists to separate the head and tail elements for the Reduce input. It’s easy to get the first element of a list in R, but the tail requires some more advanced indexing:
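Namely the expression used in fsMergeList above; a minimal illustration:

```r
xs <- list("a", "b", "c")
xs[[1]]                          # the head: "a"
xs[tail(seq_along(xs), n = -1)]  # the tail: a list with "b" and "c"
```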
All issues should be solved now. It’s time for a final test run of our code:
    fsMergeList(FruitSetList(FruitSet("LAS")), TRUE)
    # [1] "LAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS")), TRUE)
    # [1] "LAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), TRUE)
    # [1] "SAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("SAS")), FALSE)
    # [1] "LAS"
    fsMergeList(FruitSetList(FruitSet("LAS"), FruitSet("LAS"), FruitSet("OFS")), FALSE)
    # [1] "OFS"

Excellent! The syntax is more verbose than the one in Haskell, but the results are the same.
I personally would love to see some of the concepts demonstrated with lambda.r find their way into regular base R. Especially a way to switch on static typing! That could avoid a lot of unexpected behavior. R interfaces often feel flimsy and not as rock solid as comparable code in Haskell. The approach lambda.r took here – e.g. with the Don't-Care Type ., which I did not introduce – could be a way to combine dynamic and static typing. Ideally we want more sturdy interfaces without sacrificing R's great flexibility for rapid prototyping.
Acknowledgements: I got some valuable feedback from my colleague James Fellows Yates (@jfy133) for this post.
Haskell:

    data FruitSet =
      LAS
      | SAS
      | OFS
      deriving (Eq, Show)

    fSMergeList :: [FruitSet] -> Bool -> FruitSet
    fSMergeList (x:xs) intersect = foldr (\a b -> fSMerge a b intersect) x xs

    fSMerge :: FruitSet -> FruitSet -> Bool -> FruitSet
    fSMerge LAS LAS _ = LAS
    fSMerge SAS SAS _ = SAS
    fSMerge OFS _ _ = OFS
    fSMerge _ OFS _ = OFS
    fSMerge LAS SAS True = SAS
    fSMerge SAS LAS True = SAS
    fSMerge LAS SAS False = LAS
    fSMerge SAS LAS False = LAS

R:
    library(lambda.r)

    FruitSet("LAS") %as% "LAS"
    FruitSet("SAS") %as% "SAS"
    FruitSet("OFS") %as% "OFS"

    FruitSetList(...) %::% FruitSet... : FruitSetList
    FruitSetList(...) %as% asFruitSetList(list(...))

    asFruitSetList(xs) %::% list : FruitSetList
    asFruitSetList(xs) %as% {
      class(xs) <- c("FruitSetList")
      xs
    }

    fsMerge(a, b, intersect) %::% FruitSet : FruitSet : logical : FruitSet
    fsMerge("LAS", "LAS", intersect) %as% FruitSet("LAS")
    fsMerge("SAS", "SAS", intersect) %as% FruitSet("SAS")
    fsMerge("OFS", b, intersect) %as% FruitSet("OFS")
    fsMerge(a, "OFS", intersect) %as% FruitSet("OFS")
    fsMerge("LAS", "SAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("SAS", "LAS", TRUE ) %as% FruitSet("SAS")
    fsMerge("LAS", "SAS", FALSE ) %as% FruitSet("LAS")
    fsMerge("SAS", "LAS", FALSE ) %as% FruitSet("LAS")

    fsMergeList(xs, intersect) %::% FruitSetList : logical : FruitSet
    fsMergeList(xs, intersect) %as%
      Reduce(
        function(a, b) { fsMerge(a, b, intersect) },
        xs[tail(seq_along(xs), n = -1)],
        init = xs[[1]]
      )