├── tests ├── .gitignore ├── local-inputs-compressed │ ├── data │ │ ├── .gitignore │ │ ├── asia_metadata.tsv.xz │ │ ├── asia_sequences.fasta.xz │ │ ├── europe_aligned.fasta.xz │ │ ├── europe_metadata.tsv.xz │ │ ├── oceania_masked.fasta.xz │ │ ├── oceania_metadata.tsv.xz │ │ ├── americas_filtered.fasta.xz │ │ └── americas_metadata.tsv.xz │ ├── config.yaml │ └── builds.yaml ├── local-inputs-uncompressed │ ├── data │ │ └── .gitignore │ ├── config.yaml │ └── builds.yaml ├── unsanitized_metadata.tar.gz ├── unsanitized_metadata.tsv ├── remote-inputs-compressed │ ├── config.yaml │ └── builds.yaml ├── remote-inputs-uncompressed │ ├── config.yaml │ └── builds.yaml ├── check_auspice_json.py └── different-inputs.t ├── defaults ├── include.txt ├── sites_ignored_for_tree_topology.txt ├── description.md ├── distance_maps │ └── VoC.json ├── annotation.gff ├── clade_hierarchy.tsv ├── clade_display_names.yml ├── clade_emergence_dates.tsv ├── clades_who.tsv ├── auspice_config.json └── population_weights.tsv ├── nextstrain_profiles ├── nextstrain-gisaid-21L │ ├── include.txt │ ├── config.yaml │ ├── exclude-clades.tsv │ └── prefilter.smk ├── nextstrain-ci │ ├── config.yaml │ └── builds.yaml ├── nextstrain-open │ └── config.yaml ├── nextstrain-gisaid │ ├── config.yaml │ └── legacy_clades.tsv └── 100k │ ├── config-open.yaml │ ├── README.md │ └── config-gisaid.yaml ├── data ├── example_sequences.fasta.gz ├── example_metadata_aus.tsv.xz ├── example_multiple_inputs.tar.xz ├── example_sequences_aus.fasta.xz ├── example_metadata_worldwide.tsv.xz ├── example_sequences_worldwide.fasta.xz └── references_metadata.tsv ├── docs ├── src │ ├── images │ │ ├── gisaid-login.png │ │ ├── gisaid-homepage.png │ │ ├── terra-datatable.png │ │ ├── dataset-custom-data.png │ │ ├── dataset-example-data.png │ │ ├── getting-started-tree.png │ │ ├── gisaid-epicov-search.png │ │ ├── gisaid-navigation-bar.png │ │ ├── gisaid-search-results.png │ │ ├── multiple_inputs_dag.png │ │ ├── basic_nextstrain_build.png │ │ ├── gisaid-downloads-window.png │ │ ├── dataset-genomic-surveillance.png │ │ ├── gisaid-epicov-navigation-bar.png │ │ ├── dataset-custom-data-highlighted.png │ │ ├── gisaid-augur-pipeline-download.png │ │ ├── gisaid-download-packages-window.png │ │ ├── gisaid-initial-search-interface.png │ │ ├── gisaid-search-download-window.png │ │ ├── gisaid-nextregions-download-window.png │ │ ├── gisaid-search-download-window-metadata.png │ │ ├── gisaid-select-sequences-10-highlighted.png │ │ ├── gisaid-search-download-window-sequences.png │ │ ├── gisaid-select-sequences-idaho-highlighted.png │ │ ├── gisaid-epicov-navigation-bar-with-downloads.png │ │ └── gisaid-nextregions-download-terms-and-conditions.png │ ├── tutorial │ │ ├── intro.rst │ │ ├── videos.rst │ │ ├── setup.rst │ │ ├── next-steps.rst │ │ └── example-data.rst │ ├── guides │ │ ├── data-prep │ │ │ └── index.rst │ │ ├── update-workflow.rst │ │ ├── customizing-visualization.rst │ │ └── run-analysis-on-terra.rst │ ├── visualization │ │ ├── narratives.rst │ │ └── interpretation.rst │ ├── _static │ │ └── css │ │ │ └── configuration-reference.css │ ├── reference │ │ ├── glossary.rst │ │ ├── nextstrain-overview.rst │ │ ├── naming_clades.rst │ │ ├── troubleshoot.rst │ │ ├── data_submitter_faq.rst │ │ └── files.rst │ └── index.rst ├── conda.yml ├── glossary.md ├── make.bat ├── README.md ├── Makefile ├── translation_docs.md └── redirects.yaml ├── workflow ├── wdl │ ├── genbank_ingest.json │ ├── ncov_workflow.json │ ├── gisaid_ingest.json │ ├── genbank_ingest.wdl │ ├── gisaid_ingest.wdl │ └── 
ncov_workflow.wdl ├── envs │ └── nextstrain.yaml └── schemas │ └── config.schema.yaml ├── .gitattributes ├── scripts ├── curate_metadata │ ├── requirements.txt │ ├── config_curate_metadata │ │ ├── internationalExceptions.txt │ │ ├── country_ordering │ │ │ ├── Austria_variants.txt │ │ │ └── Slovakia_variants.txt │ │ └── acceptedExposureAdditions.txt │ └── config_files_additional_info │ │ ├── variants.txt │ │ ├── location_pattern.txt │ │ ├── purpose_of_sequencing.txt │ │ └── info_ignore.txt ├── sha256sum ├── generate-scientific-credits.py ├── narrative-pdf-screens.sh ├── normalize_gisaid_fasta.sh ├── expand-clade-definitions ├── annotate_metadata_with_index.py ├── construct-recency-from-submission-date.py ├── rename_clades.py ├── upload-to-s3 ├── add_priorities_to_meta.py ├── adjust_regional_meta.py ├── calculate_epiweek.py ├── mask-alignment.py ├── priorities.py ├── include_prefix.py ├── add_labels.py ├── check_missing_locations.py ├── revert ├── tsv-cast-header ├── explicit_translation.py ├── find_clusters.py ├── fetch_mlr_lineage_fitness.py ├── fix-colorings.py ├── deprecated │ └── parse_mutational_fitness_tsv_into_distance_map.py ├── combine_metadata.py └── developer_scripts │ └── get_population_weights ├── readthedocs.yml ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── dependabot.yml ├── workflows │ ├── sync-redirects.yaml │ ├── ci.yaml │ ├── rebuild-100k.yml │ ├── revert.yml │ ├── rebuild-gisaid.yml │ ├── rebuild-gisaid-21L.yml │ └── rebuild-open.yml └── pull_request_template.md ├── my_profiles └── README.md ├── .dockstore.yml ├── LICENSE ├── .gitignore ├── narratives └── ncov_template_narrative.md └── README.md /tests/.gitignore: -------------------------------------------------------------------------------- 1 | *.err 2 | /output -------------------------------------------------------------------------------- /defaults/include.txt: -------------------------------------------------------------------------------- 1 | Wuhan/Hu-1/2019 2 | Wuhan-Hu-1/2019 3 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/include.txt: -------------------------------------------------------------------------------- 1 | 21L 2 | -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.fasta 2 | *.tsv -------------------------------------------------------------------------------- /tests/local-inputs-uncompressed/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.fasta 2 | *.tsv -------------------------------------------------------------------------------- /data/example_sequences.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_sequences.fasta.gz -------------------------------------------------------------------------------- /data/example_metadata_aus.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_metadata_aus.tsv.xz -------------------------------------------------------------------------------- /defaults/sites_ignored_for_tree_topology.txt: -------------------------------------------------------------------------------- 1 | 21846 2 | 21987 3 | 22992 4 | 23012 5 | 23063 6 | 
23604 7 | 24410 8 | -------------------------------------------------------------------------------- /docs/src/images/gisaid-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-login.png -------------------------------------------------------------------------------- /tests/unsanitized_metadata.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/unsanitized_metadata.tar.gz -------------------------------------------------------------------------------- /data/example_multiple_inputs.tar.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_multiple_inputs.tar.xz -------------------------------------------------------------------------------- /data/example_sequences_aus.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_sequences_aus.fasta.xz -------------------------------------------------------------------------------- /docs/src/images/gisaid-homepage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-homepage.png -------------------------------------------------------------------------------- /docs/src/images/terra-datatable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/terra-datatable.png -------------------------------------------------------------------------------- /workflow/wdl/genbank_ingest.json: -------------------------------------------------------------------------------- 1 | { 2 | "GENBANK_INGEST.cache_nextclade_old":"${workspace.genbank_nextclade_tsv}" 3 | } -------------------------------------------------------------------------------- /data/example_metadata_worldwide.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_metadata_worldwide.tsv.xz -------------------------------------------------------------------------------- /data/example_sequences_worldwide.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_sequences_worldwide.fasta.xz -------------------------------------------------------------------------------- /docs/src/images/dataset-custom-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-custom-data.png -------------------------------------------------------------------------------- /docs/src/images/dataset-example-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-example-data.png -------------------------------------------------------------------------------- /docs/src/images/getting-started-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/getting-started-tree.png 
-------------------------------------------------------------------------------- /docs/src/images/gisaid-epicov-search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-epicov-search.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-navigation-bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-navigation-bar.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-results.png -------------------------------------------------------------------------------- /docs/src/images/multiple_inputs_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/multiple_inputs_dag.png -------------------------------------------------------------------------------- /docs/src/images/basic_nextstrain_build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/basic_nextstrain_build.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-downloads-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-downloads-window.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Allow Git to decide if file is text or binary 2 | # Always use LF line endings even on Windows. 
3 | * text=auto eol=lf 4 | -------------------------------------------------------------------------------- /docs/src/images/dataset-genomic-surveillance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-genomic-surveillance.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-epicov-navigation-bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-epicov-navigation-bar.png -------------------------------------------------------------------------------- /docs/src/images/dataset-custom-data-highlighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-custom-data-highlighted.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-augur-pipeline-download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-augur-pipeline-download.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-download-packages-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-download-packages-window.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-initial-search-interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-initial-search-interface.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-download-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-download-window.png -------------------------------------------------------------------------------- /workflow/wdl/ncov_workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "Nextstrain_WRKFLW.metadata_tsv":"${this.metadata}", 3 | "Nextstrain_WRKFLW.sequence_fasta":"${this.sequences}" 4 | } -------------------------------------------------------------------------------- /docs/src/images/gisaid-nextregions-download-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-nextregions-download-window.png -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/asia_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/asia_metadata.tsv.xz -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-download-window-metadata.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-download-window-metadata.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-select-sequences-10-highlighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-select-sequences-10-highlighted.png -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/asia_sequences.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/asia_sequences.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/europe_aligned.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/europe_aligned.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/europe_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/europe_metadata.tsv.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/oceania_masked.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/oceania_masked.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/oceania_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/oceania_metadata.tsv.xz -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-download-window-sequences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-download-window-sequences.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-select-sequences-idaho-highlighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-select-sequences-idaho-highlighted.png -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/americas_filtered.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/americas_filtered.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/americas_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/americas_metadata.tsv.xz 
-------------------------------------------------------------------------------- /docs/src/images/gisaid-epicov-navigation-bar-with-downloads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-epicov-navigation-bar-with-downloads.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-nextregions-download-terms-and-conditions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-nextregions-download-terms-and-conditions.png -------------------------------------------------------------------------------- /scripts/curate_metadata/requirements.txt: -------------------------------------------------------------------------------- 1 | # python dependencies for developer scripts in this directory; install with pip install -r scripts/developer_scripts/requirements.txt 2 | geopy 3 | xlrd 4 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-ci/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-gisaid/builds.yaml 4 | - nextstrain_profiles/nextstrain-ci/builds.yaml 5 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/internationalExceptions.txt: -------------------------------------------------------------------------------- 1 | Europe/Austria/Italian cruise ship/ Europe/Italy/Italian cruise ship/ 2 | Asia/Japan/Diamond Princess/ North America/USA/Diamond Princess/ -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | 4 | build: 5 | os: "ubuntu-22.04" 6 | tools: 7 | python: "mambaforge-23.11" 8 | 9 | conda: 10 | environment: docs/conda.yml 11 | 12 | sphinx: 13 | configuration: docs/src/conf.py 14 | -------------------------------------------------------------------------------- /workflow/envs/nextstrain.yaml: -------------------------------------------------------------------------------- 1 | name: nextstrain 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - augur=22.4.0 8 | - epiweeks=2.1.2 9 | - iqtree=2.2.0.3 10 | - nextclade=3.9.0 11 | - python>=3.8* 12 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-open/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-open/builds.yaml 4 | 5 | cores: 8 6 | keep-going: False 7 | printshellcmds: True 8 | show-failed-logs: True 9 | restart-times: 2 10 | set-threads: tree=4 11 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-gisaid/builds.yaml 4 | 5 | cores: 8 6 | keep-going: False 7 | printshellcmds: True 8 | show-failed-logs: True 9 | restart-times: 2 10 | set-threads: tree=4 11 | 
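The `nextstrain-ci`, `nextstrain-open`, and `nextstrain-gisaid` files above are Snakemake profile configs: each lists the workflow config files to load under `configfile:` and bundles command-line options such as `cores`, `restart-times`, and `set-threads`. A minimal sketch of how such a profile is typically passed to the workflow, assuming a checkout of this repository and either the Nextstrain CLI or a local Snakemake installation (this mirrors the `--profile` usage in `.github/workflows/ci.yaml` later in this listing and is not mandated by the profile files themselves):

```bash
# Via the Nextstrain CLI (assumption: nextstrain-cli and a runtime are installed);
# extra arguments after "." are handed through to Snakemake.
nextstrain build . --profile nextstrain_profiles/nextstrain-open

# Equivalent direct invocation (assumption: snakemake and augur are installed locally).
snakemake --profile nextstrain_profiles/nextstrain-open
```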
-------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml 4 | 5 | cores: 8 6 | keep-going: False 7 | printshellcmds: True 8 | show-failed-logs: True 9 | restart-times: 2 10 | set-threads: tree=4 11 | -------------------------------------------------------------------------------- /defaults/description.md: -------------------------------------------------------------------------------- 1 | Hi! This is the default description, written in [Markdown](https://www.markdownguide.org/getting-started/). You can change this by creating another Markdown file and referencing it in the workflow config file: 2 | 3 | ```yaml 4 | files: 5 | description: path/to/description.md 6 | ``` 7 | -------------------------------------------------------------------------------- /scripts/sha256sum: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Portable sha256sum utility. 4 | """ 5 | from hashlib import sha256 6 | from sys import stdin 7 | 8 | chunk_size = 5 * 1024**2 # 5 MiB 9 | 10 | h = sha256() 11 | 12 | for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): 13 | h.update(chunk) 14 | 15 | print(h.hexdigest()) 16 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/country_ordering/Austria_variants.txt: -------------------------------------------------------------------------------- 1 | Gofis Göfis 2 | St. Pölten Sankt Pölten 3 | Krems An Der Dou Krems an der Donau 4 | Südoststeiermark Feldbach 5 | St.Valentin St. Valentin 6 | Nussdorfaa Nussdorf am Attersee 7 | Aurachah Aurach am Hongar 8 | Sankt Lorenz St. Lorenz 9 | Rudolfsheim Funfhaus Rudolfsheim-Fünfhaus -------------------------------------------------------------------------------- /tests/unsanitized_metadata.tsv: -------------------------------------------------------------------------------- 1 | Virus name gender date gisaid_epi_isl 2 | hCoV-19/OneVirus/1/2020 male 2020-10-01 EPI_ISL_1 3 | hCoV-19/OneVirus/1/2020 male 2020-10-01 EPI_ISL_2 4 | SARS-CoV-2/AnotherVirus/1/2021 female 2021-01-01 EPI_ISL_3 5 | hCoV-19/LocalVirus/2/2021 ? 2021-12-01 ? 6 | hCoV-19/LocalVirus/2/2021 ? 2021-12-01 ? 7 | hCoV-19/LocalVirus/3/2021 ? ? ? 
8 | -------------------------------------------------------------------------------- /defaults/distance_maps/VoC.json: -------------------------------------------------------------------------------- 1 | { 2 | "default": 0, 3 | "map": { 4 | "S": { 5 | "18": 1, 6 | "69": 1, 7 | "144": 1, 8 | "242": 1, 9 | "417": 1, 10 | "452": 1, 11 | "477": 1, 12 | "484": 1, 13 | "501": 1, 14 | "613": 1, 15 | "614": 1, 16 | "681": 1 17 | } 18 | }, 19 | "name": "MoC_count" 20 | } 21 | -------------------------------------------------------------------------------- /docs/conda.yml: -------------------------------------------------------------------------------- 1 | name: ncov-docs 2 | channels: 3 | - defaults 4 | dependencies: 5 | - make 6 | - pip 7 | - pip: 8 | - nextstrain-sphinx-theme>=2022.5 9 | - recommonmark 10 | - requests 11 | - sphinx 12 | - docutils 13 | - sphinx-argparse 14 | - sphinx-autobuild 15 | - sphinx-copybutton 16 | - sphinx-markdown-tables 17 | - sphinx-tabs 18 | -------------------------------------------------------------------------------- /workflow/wdl/gisaid_ingest.json: -------------------------------------------------------------------------------- 1 | { 2 | "GISAID_INGEST.GISAID_API_ENDPOINT":"${workspace.GISAID_API_ENDPOINT}", 3 | "GISAID_INGEST.GISAID_USERNAME_AND_PASSWORD":"${workspace.GISAID_USERNAME_AND_PASSWORD}", 4 | "GISAID_INGEST.cache_nextclade_old":"${workspace.gisaid_nextclade_tsv}", 5 | "GISAID_INGEST.ingest.giturl":"https://github.com/nextstrain/ncov-ingest/archive/refs/heads/master.zip" 6 | } -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv: -------------------------------------------------------------------------------- 1 | clade 2 | 19A 3 | 19B 4 | 20A 5 | 20B 6 | 20C 7 | 20D 8 | 20E (EU1) 9 | 20F 10 | 20G 11 | 20H (Beta, V2) 12 | 20I (Alpha, V1) 13 | 20J (Gamma, V3) 14 | 21A (Delta) 15 | 21B (Kappa) 16 | 21C (Epsilon) 17 | 21D (Eta) 18 | 21E (Theta) 19 | 21F (Iota) 20 | 21G (Lambda) 21 | 21H (Mu) 22 | 21I (Delta) 23 | 21J (Delta) 24 | 21K (Omicron) 25 | 21M (Omicron) 26 | -------------------------------------------------------------------------------- /tests/local-inputs-compressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/local-inputs-compressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /tests/remote-inputs-compressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/remote-inputs-compressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 
9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /tests/local-inputs-uncompressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/local-inputs-uncompressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /tests/remote-inputs-uncompressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/remote-inputs-uncompressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Want us to add a feature to Nextstrain? 4 | title: "" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Context** 11 | How would this feature help you? What would it enable you to do? 12 | 13 | **Description** 14 | A clear and concise description of what you want to happen 15 | 16 | **Examples** 17 | 18 | **Possible solution** 19 | (Optional) 20 | -------------------------------------------------------------------------------- /my_profiles/README.md: -------------------------------------------------------------------------------- 1 | Previously, we recommended using Snakemake profiles under a `my_profiles/` analysis directory. We now recommend using Snakemake config files directly via the `--configfile` parameter. You can still use existing profiles via `--configfile my_profiles//builds.yaml`. 2 | 3 | See [this guide](https://docs.nextstrain.org/projects/ncov/en/latest/tutorial/next-steps.html#create-analysis-directory) to create your own analysis directory. 
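A concrete sketch of the `--configfile` usage described above, run from the top of this repository; `example` is a hypothetical profile directory name standing in for an existing `my_profiles/` analysis directory:

```bash
# Assumption: my_profiles/example/ already exists and contains a builds.yaml.
nextstrain build . --configfile my_profiles/example/builds.yaml
```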
4 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid/legacy_clades.tsv: -------------------------------------------------------------------------------- 1 | clade gene site alt 2 | 3 | 4 | A1a ORF3a 251 V 5 | A1a ORF1a 3606 F 6 | 7 | A2 S 614 G 8 | A2a ORF1b 314 L 9 | 10 | A3 ORF1a 378 I 11 | A3 ORF1a 3606 F 12 | 13 | A6 nuc 514 C 14 | 15 | 16 | A7 ORF1a 3220 V 17 | 18 | 19 | 20 | 21 | B ORF8 84 S 22 | 23 | B1 ORF8 84 S 24 | B1 nuc 18060 T 25 | 26 | B2 ORF8 84 S 27 | B2 nuc 29095 T 28 | 29 | B4 ORF8 84 S 30 | B4 N 202 N 31 | B4 N 202 N 32 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/variants.txt: -------------------------------------------------------------------------------- 1 | UK United Kingdom 2 | The United Kingdom United Kingdom 3 | US USA 4 | Valencia Valencia ES 5 | Granada Granada ES 6 | UAE United Arab Emirates 7 | United States USA 8 | Czechia Czech Republic 9 | Pamplona Pamplona ES 10 | Tshwane City of Tshwane 11 | Srilanka Sri Lanka 12 | United Arab Emirate United Arab Emirates 13 | Yucatán Yucatan 14 | México Mexico 15 | United States of America USA 16 | Viet Nam Vietnam 17 | Côte d’Ivoire Côte d'Ivoire 18 | Zurich Zürich -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Dependabot configuration file 2 | # 3 | # 4 | # Each ecosystem is checked on a scheduled interval defined below. To trigger 5 | # a check manually, go to 6 | # 7 | # https://github.com/nextstrain/ncov/network/updates 8 | # 9 | # and look for a "Check for updates" button. You may need to click around a 10 | # bit first. 
11 | --- 12 | version: 2 13 | updates: 14 | - package-ecosystem: "github-actions" 15 | directory: "/" 16 | schedule: 17 | interval: "weekly" 18 | -------------------------------------------------------------------------------- /scripts/generate-scientific-credits.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | data = pd.read_csv('../data/metadata.tsv', sep='\t') 4 | credits = data.groupby('originating_lab')['strain'].apply(list).to_dict() 5 | 6 | detailed_ofile = open('../data/detailed_credits.md', 'a') 7 | ofile = open('../data/credits.md', 'a') 8 | 9 | for institution in sorted(list(credits.keys())): 10 | if institution == 'unknown': 11 | continue 12 | 13 | ofile.write('* '+institution+'\n') 14 | 15 | strains = sorted(credits[institution]) 16 | detailed_ofile.write('* '+institution+'\n') 17 | for s in strains: 18 | detailed_ofile.write('\t* '+s+'\n') 19 | detailed_ofile.write('\n') 20 | 21 | detailed_ofile.close() 22 | ofile.close() 23 | -------------------------------------------------------------------------------- /.dockstore.yml: -------------------------------------------------------------------------------- 1 | version: 1.2 2 | workflows: 3 | - subclass: WDL 4 | primaryDescriptorPath: /workflow/wdl/ncov_workflow.wdl 5 | testParameterFiles: 6 | - /workflow/wdl/ncov_workflow.json 7 | name: ncov 8 | authors: 9 | - name: Nextstrain 10 | - subclass: WDL 11 | primaryDescriptorPath: /workflow/wdl/gisaid_ingest.wdl 12 | testParameterFiles: 13 | - /workflow/wdl/gisaid_ingest.json 14 | name: gisaid_ingest 15 | authors: 16 | - name: Nextstrain 17 | - subclass: WDL 18 | primaryDescriptorPath: /workflow/wdl/genbank_ingest.wdl 19 | testParameterFiles: 20 | - /workflow/wdl/genbank_ingest.json 21 | name: genbank_ingest 22 | authors: 23 | - name: Nextstrain 24 | -------------------------------------------------------------------------------- /docs/src/tutorial/intro.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | These tutorials will walk you through the process of running a basic genomic surveillance workflow using SARS-CoV-2 data. 5 | We've created these resources with the goal of enabling Departments of Public Health to start using Nextstrain to understand their SARS-CoV-2 genomic data within 1-2 hours. 6 | 7 | At the end, you will be able to: 8 | 9 | - create phylogenetic trees of SARS-CoV-2 genomes from different sources including GISAID and Nextstrain-curated GenBank data 10 | - visualize the resulting trees in :term:`docs.nextstrain.org:Auspice` 11 | - define subsampling logic for your own genomic epidemiological analysis 12 | 13 | If you prefer to learn about the workflow through videos, see the :doc:`demo videos `. 14 | -------------------------------------------------------------------------------- /docs/src/guides/data-prep/index.rst: -------------------------------------------------------------------------------- 1 | ********************** 2 | Data preparation guide 3 | ********************** 4 | 5 | To use Nextstrain to analyze your own data, you'll need to prepare two files: 6 | 7 | 1. A FASTA file with viral genomic sequences 8 | 2. A corresponding TSV file with metadata describing each sequence 9 | 10 | We describe the following ways to prepare data for a SARS-CoV-2 analysis: 11 | 12 | .. 
toctree:: 13 | :maxdepth: 1 14 | :titlesonly: 15 | 16 | local-data 17 | gisaid-search 18 | gisaid-full 19 | 20 | Alternatively, use pre-curated data files: 21 | 22 | 1. :ref:`Nextstrain remote inputs ` 23 | 2. `CDC: US State and Territory subsample datasets and example builds `__ 24 | -------------------------------------------------------------------------------- /.github/workflows/sync-redirects.yaml: -------------------------------------------------------------------------------- 1 | name: Sync RTD redirects 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - docs/redirects.yaml 9 | - .github/workflows/sync-redirects.yaml 10 | 11 | pull_request: 12 | paths: 13 | - docs/redirects.yaml 14 | - .github/workflows/sync-redirects.yaml 15 | 16 | # Manually triggered using GitHub's UI 17 | workflow_dispatch: 18 | 19 | jobs: 20 | sync: 21 | # Prevent this job from running on forks. 22 | if: github.repository_owner == 'nextstrain' 23 | name: rtd redirects 24 | uses: nextstrain/.github/.github/workflows/sync-rtd-redirects.yaml@master 25 | with: 26 | project: nextstrain-ncov 27 | file: docs/redirects.yaml 28 | secrets: 29 | RTD_TOKEN: ${{ secrets.RTD_TOKEN }} 30 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | test-build: 12 | uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@v0 13 | with: 14 | build-args: all_regions -j 2 --profile nextstrain_profiles/nextstrain-ci 15 | 16 | test-cram: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v6 20 | - uses: actions/setup-python@v6 21 | with: 22 | python-version: "3.10" 23 | - run: pip install cram nextstrain-augur 24 | - run: cram --shell=/bin/bash tests/sanitize-metadata.t 25 | 26 | docs: 27 | uses: nextstrain/.github/.github/workflows/docs-ci.yaml@master 28 | with: 29 | docs-directory: docs/ 30 | environment-file: docs/conda.yml 31 | -------------------------------------------------------------------------------- /docs/glossary.md: -------------------------------------------------------------------------------- 1 | # Glossary 2 | 3 | #### Alignment 4 | 5 | #### Ancestral trait (reconstruction) 6 | 7 | #### Augur 8 | 9 | #### Auspice 10 | 11 | #### Bases 12 | 13 | #### Branch 14 | 15 | #### Build 16 | 17 | #### Config 18 | 19 | #### Division 20 | 21 | #### Filtering 22 | 23 | #### Genome 24 | 25 | #### Genomic epidemiology 26 | 27 | #### GISAID 28 | 29 | #### Location 30 | 31 | #### Metadata 32 | 33 | #### Narrative 34 | 35 | #### Node 36 | 37 | #### Phylogeny 38 | 39 | #### Reference genome 40 | 41 | #### Region 42 | 43 | #### Sample 44 | 45 | #### Sequence 46 | 47 | #### Snakemake 48 | 49 | #### Strain 50 | 51 | #### Subsampling 52 | 53 | #### Tip (leaf) 54 | 55 | #### TSV 56 | 57 | #### Trait 58 | 59 | #### Transmission 60 | 61 | #### Tree 62 | 63 | #### Workflow manager 64 | -------------------------------------------------------------------------------- /docs/src/tutorial/videos.rst: -------------------------------------------------------------------------------- 1 | ************************** 2 | Video tutorial walkthrough 3 | ************************** 4 | 5 | If you prefer to learn about the workflow through videos, see the following: 6 | 7 | Running the analysis 8 | -------------------- 9 | 10 | .. 
raw:: html 11 | 12 | 13 | 14 | Visualizing the results 15 | ----------------------- 16 | 17 | .. raw:: html 18 | 19 | 20 | -------------------------------------------------------------------------------- /scripts/narrative-pdf-screens.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | ADDRESS=$1 4 | PARTIAL_FNAME=$2 5 | 6 | echo "Make sure that you have auspice / nextstrain running at so that ${ADDRESS} is valid!" 7 | echo "(e.g. from the 'ncov' directory run 'auspice view --datasetDir auspice --narrativeDir narratives'" 8 | echo "This script will save PDFs starting with the prefix ${PARTIAL_FNAME}" 9 | echo "" 10 | 11 | # https://gs.statcounter.com/screen-resolution-stats/desktop/worldwide 12 | RESOLUTIONS=(3200x1350 1920x1080 1600x900 366x768 ) 13 | # james' iphone+ iphone 14 | 15 | for RES in ${RESOLUTIONS[@]}; do 16 | F="${PARTIAL_FNAME}.${RES}.pdf" 17 | echo "" 18 | echo "-------------------------------------" 19 | echo "Making ${F}" 20 | echo "" 21 | decktape generic --load-pause 3000 --key ArrowDown --size ${RES} ${ADDRESS} ${F} 22 | done 23 | 24 | exit 0 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Did something not work as expected? 4 | title: "" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Current Behavior** 11 | A clear and concise description of what is happening / what the bug is. 12 | 13 | **Expected behavior** 14 | A clear and concise description of what you expected to happen instead. 15 | 16 | **How to reproduce** 17 | Steps to reproduce the current behavior: 18 | 1. Open / run ... 19 | 2. 20 | 3. 21 | 4. See error 22 | 23 | **Possible solution** 24 | (optional) 25 | 26 | **Your environment: if browsing Nextstrain online** 27 | - Operating system: 28 | - Browser: 29 | 30 | **Your environment: if running Nextstrain locally** 31 | - Operating system: 32 | - Browser: 33 | - Version (e.g. `auspice 2.7.0`): 34 | 35 | **Additional context** 36 | Add any other context about the problem here. 37 | -------------------------------------------------------------------------------- /defaults/annotation.gff: -------------------------------------------------------------------------------- 1 | # Gene map (genome annotation) of SARS-CoV-2 in GFF format. 2 | # For gene map purpses we only need some of the columns. We substitute unused values with "." as per GFF spec. 3 | # See GFF format reference at https://www.ensembl.org/info/website/upload/gff.html 4 | # seqname source feature start end score strand frame attribute 5 | . . gene 26245 26472 . + . gene_name=E 6 | . . gene 26523 27191 . + . gene_name=M 7 | . . gene 28274 29533 . + . gene_name=N 8 | . . gene 266 13468 . + . gene_name=ORF1a 9 | . . gene 13468 21555 . + . gene_name=ORF1b 10 | . . gene 25393 26220 . + . gene_name=ORF3a 11 | . . gene 27202 27387 . + . gene_name=ORF6 12 | . . gene 27394 27759 . + . gene_name=ORF7a 13 | . . gene 27756 27887 . + . gene_name=ORF7b 14 | . . gene 27894 28259 . + . gene_name=ORF8 15 | . . gene 28284 28577 . + . gene_name=ORF9b 16 | . . gene 21563 25384 . + . 
gene_name=S 17 | -------------------------------------------------------------------------------- /tests/remote-inputs-compressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | - name: test-remote-compressed-asia-sequences 3 | metadata: s3://nextstrain-data/files/ncov/test-data/asia_metadata.tsv.xz 4 | sequences: s3://nextstrain-data/files/ncov/test-data/asia_sequences.fasta.xz 5 | - name: test-remote-compressed-europe-aligned 6 | metadata: s3://nextstrain-data/files/ncov/test-data/europe_metadata.tsv.xz 7 | aligned: s3://nextstrain-data/files/ncov/test-data/europe_aligned.fasta.xz 8 | - name: test-remote-compressed-americas-filtered 9 | metadata: s3://nextstrain-data/files/ncov/test-data/americas_metadata.tsv.xz 10 | filtered: s3://nextstrain-data/files/ncov/test-data/americas_filtered.fasta.xz 11 | 12 | builds: 13 | test-remote-compressed: 14 | subsampling_scheme: small 15 | 16 | subsampling: 17 | small: 18 | small-sample: 19 | group_by: "region" 20 | max_sequences: 100 21 | -------------------------------------------------------------------------------- /tests/local-inputs-compressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | # Note: paths are relative to the --directory handed to snakemake 3 | - name: test-local-compressed-asia-sequences 4 | metadata: ../local-inputs-compressed/data/asia_metadata.tsv.xz 5 | sequences: ../local-inputs-compressed/data/asia_sequences.fasta.xz 6 | - name: test-local-compressed-europe-aligned 7 | metadata: ../local-inputs-compressed/data/europe_metadata.tsv.xz 8 | aligned: ../local-inputs-compressed/data/europe_aligned.fasta.xz 9 | - name: test-local-compressed-americas-filtered 10 | metadata: ../local-inputs-compressed/data/americas_metadata.tsv.xz 11 | filtered: ../local-inputs-compressed/data/americas_filtered.fasta.xz 12 | 13 | builds: 14 | test-local-compressed: 15 | subsampling_scheme: small 16 | 17 | subsampling: 18 | small: 19 | small-sample: 20 | group_by: "region" 21 | max_sequences: 100 22 | -------------------------------------------------------------------------------- /tests/local-inputs-uncompressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | # Note: paths are relative to the --directory handed to snakemake 3 | - name: test-local-uncompressed-asia-sequences 4 | metadata: ../local-inputs-uncompressed/data/asia_metadata.tsv 5 | sequences: ../local-inputs-uncompressed/data/asia_sequences.fasta 6 | - name: test-local-uncompressed-europe-aligned 7 | metadata: ../local-inputs-uncompressed/data/europe_metadata.tsv 8 | aligned: ../local-inputs-uncompressed/data/europe_aligned.fasta 9 | - name: test-local-uncompressed-americas-filtered 10 | metadata: ../local-inputs-uncompressed/data/americas_metadata.tsv 11 | filtered: ../local-inputs-uncompressed/data/americas_filtered.fasta 12 | 13 | builds: 14 | test-local-uncompressed: 15 | subsampling_scheme: small 16 | 17 | subsampling: 18 | small: 19 | small-sample: 20 | group_by: "region" 21 | max_sequences: 100 22 | -------------------------------------------------------------------------------- /workflow/wdl/genbank_ingest.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "tasks/ncov_ingest.wdl" as ncov_ingest 4 | 5 | workflow GENBANK_INGEST { 6 | input { 7 | # Optionals 8 | File? cache_nextclade_old 9 | String? filter # e.g. 
"region:Africa" passed to tsv-filters 10 | 11 | Int? cpu 12 | Int? memory # in GiB 13 | Int? disk_size 14 | } 15 | 16 | call ncov_ingest.genbank_ingest as ingest { 17 | input: 18 | # optionals 19 | cache_nextclade_old = cache_nextclade_old, 20 | filter = filter, 21 | 22 | cpu = cpu, 23 | memory = memory, 24 | disk_size = disk_size 25 | } 26 | 27 | output { 28 | # ncov-ingest output either gisaid or genbank 29 | File sequences_fasta = ingest.sequences_fasta 30 | File metadata_tsv = ingest.metadata_tsv 31 | 32 | File nextclade_tsv = ingest.nextclade_tsv 33 | String last_run = ingest.last_run 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-100k.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild 100k sample 2 | 3 | on: 4 | # cron job once a week on Mondays at 12:42 UTC 5 | schedule: 6 | - cron: '42 12 * * 1' 7 | # Manually triggered using GitHub's UI 8 | workflow_dispatch: 9 | 10 | jobs: 11 | open: 12 | permissions: 13 | id-token: write 14 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 15 | secrets: inherit 16 | with: 17 | runtime: aws-batch 18 | run: | 19 | set -x 20 | 21 | declare -a config 22 | config+=(slack_token=$SLACK_TOKEN) 23 | 24 | nextstrain build \ 25 | --detach \ 26 | --cpus 16 \ 27 | --memory 31GiB \ 28 | . \ 29 | upload \ 30 | --configfile nextstrain_profiles/100k/config-open.yaml \ 31 | --config "${config[@]}" \ 32 | --set-threads tree=8 33 | artifact-name: open-build-output 34 | -------------------------------------------------------------------------------- /docs/src/visualization/narratives.rst: -------------------------------------------------------------------------------- 1 | Nextstrain Narratives 2 | ===================== 3 | 4 | Nextstrain Narratives allow you to pair a specific view of a dataset with text and images to generate scrollable, interactive reports. 5 | 6 | For examples, `see our weekly Situation Reports `__ from the first several months of the pandemic. 7 | 8 | You can `read more about narratives `__ or `watch our Nextstrain narratives tutorial videos `_. We've also `provided a template narrative file `__ for you to edit. 9 | 10 | You can preview the template narrative by navigating to https://nextstrain.org/community/narratives/nextstrain/ncov/template/narrative. 11 | 12 | If you get stuck, don't hesitate to `ask for help `__. 13 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | if "%BUILDDIR%" == "" ( 11 | set BUILDDIR=build 12 | ) 13 | set SOURCEDIR=src 14 | 15 | if "%1" == "" goto help 16 | 17 | %SPHINXBUILD% >NUL 2>NUL 18 | if errorlevel 9009 ( 19 | echo. 20 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 21 | echo.installed, then set the SPHINXBUILD environment variable to point 22 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 23 | echo.may add the Sphinx directory to PATH. 24 | echo. 
25 | echo.If you don't have Sphinx installed, grab it from 26 | echo.http://sphinx-doc.org/ 27 | exit /b 1 28 | ) 29 | 30 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 31 | goto end 32 | 33 | :help 34 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 35 | 36 | :end 37 | popd 38 | -------------------------------------------------------------------------------- /defaults/clade_hierarchy.tsv: -------------------------------------------------------------------------------- 1 | clade parent WHO 2 | 19B 19A 3 | 20A 19A 4 | 20B 20A 5 | 20C 20A 6 | 20D 20B 7 | 20E 20A 8 | 20F 20B 9 | 20G 20C 10 | 20H 20C Beta 11 | 20I 20B Alpha 12 | 20J 20B Gamma 13 | 21A 20A Delta 14 | 21B 20A Kappa 15 | 21C 20C Epsilon 16 | 21D 20A Eta 17 | 21E 20B Theta 18 | 21F 20C Iota 19 | 21G 20D Lambda 20 | 21H 20A Mu 21 | 21I 21A Delta 22 | 21J 21A Delta 23 | 21K 21M Omicron 24 | 21L 21M Omicron 25 | 21M 20B Omicron 26 | 22A 21L Omicron 27 | 22B 21L Omicron 28 | 22C 21L Omicron 29 | 22D 21L Omicron 30 | 22E 22B Omicron 31 | 22F 21L Omicron 32 | 23A 22F Omicron 33 | 23B 22F Omicron 34 | 23C 22D Omicron 35 | 23D 22F Omicron 36 | 23E 22F Omicron 37 | 23F 23D Omicron 38 | 23G 23A Omicron 39 | 23H 23F Omicron 40 | 23I 21L Omicron 41 | 24A 23I Omicron 42 | 24B 24A Omicron 43 | 24C 24B Omicron 44 | 24D 21L Omicron 45 | 24E 24C Omicron 46 | 24F 24A Omicron 47 | 24G 24B Omicron 48 | 24H 24A Omicron 49 | 24I 24A Omicron 50 | 25A 24B Omicron 51 | 25B 24D Omicron 52 | 25C 24A Omicron 53 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ncov 'Read The Docs' Documentation. 2 | 3 | 4 | ## Building the docs 5 | 6 | Build dependencies are managed with [Conda](https://conda.io). 7 | Install them 8 | into an isolated environment named `ncov-docs` with: 9 | 10 | conda env create -f=conda.yml 11 | 12 | Enter the environment with: 13 | 14 | conda activate ncov-docs 15 | 16 | You can now build the documentation with: 17 | 18 | make html 19 | 20 | which invokes Sphinx to build static HTML pages in `build/html/`. 21 | You can view them by running: 22 | 23 | open build/html/index.html 24 | 25 | 26 | To monitor the source files for changes and automatically rebuild as necessary, 27 | run: 28 | 29 | make livehtml 30 | 31 | and then open . Pages open in the browser will 32 | automatically refresh when they're rebuilt. 33 | 34 | You can clean the build directory for a fresh start with: 35 | 36 | make clean 37 | 38 | Leave the environment with: 39 | 40 | conda deactivate 41 | -------------------------------------------------------------------------------- /docs/src/tutorial/setup.rst: -------------------------------------------------------------------------------- 1 | Setup and installation 2 | ====================== 3 | 4 | The following steps will prepare you to run complete analyses of SARS-CoV-2 data by installing required software and running a simple example workflow. 5 | 6 | .. contents:: Table of Contents 7 | :local: 8 | 9 | Register for a GISAID account 10 | ----------------------------- 11 | 12 | Some tutorials rely on data downloaded from `GISAID `_. 13 | If you do not already have one, `register for a GISAID account `_ now. 14 | Registration may take a few days. 15 | 16 | Install Nextstrain components 17 | -------------------------------- 18 | 19 | :doc:`Follow instructions to install Nextstrain components `. 
20 | 21 | Download the ncov workflow 22 | ----------------------------- 23 | 24 | Use Git to download a copy of the ncov repository containing the workflow and this tutorial. 25 | 26 | .. code:: bash 27 | 28 | git clone https://github.com/nextstrain/ncov.git 29 | -------------------------------------------------------------------------------- /defaults/clade_display_names.yml: -------------------------------------------------------------------------------- 1 | 19A: 19A 2 | 19B: 19B 3 | 20A: 20A 4 | 20B: 20B 5 | 20C: 20C 6 | 20D: 20D 7 | 20E: 20E 8 | 20F: 20F 9 | 20G: 20G 10 | 20I: 20I (Alpha) 11 | 20H: 20H (Beta) 12 | 20J: 20J (Gamma) 13 | 21A: 21A (Delta) 14 | 21B: 21B (Kappa) 15 | 21C: 21C (Epsilon) 16 | 21D: 21D (Eta) 17 | 21E: 21E (Theta) 18 | 21F: 21F (Iota) 19 | 21G: 21G (Lambda) 20 | 21I: 21I (Delta) 21 | 21H: 21H (Mu) 22 | 21J: 21J (Delta) 23 | 21K: 21K (BA.1) 24 | 21L: 21L (BA.2) 25 | 21M: 21M (Omicron) 26 | 22A: 22A (BA.4) 27 | 22B: 22B (BA.5) 28 | 22C: 22C (BA.2.12.1) 29 | 22D: 22D (BA.2.75) 30 | 22E: 22E (BQ.1) 31 | 22F: 22F (XBB) 32 | 23A: 23A (XBB.1.5) 33 | 23B: 23B (XBB.1.16) 34 | 23C: 23C (CH.1.1) 35 | 23D: 23D (XBB.1.9) 36 | 23E: 23E (XBB.2.3) 37 | 23F: 23F (EG.5.1) 38 | 23G: 23G (XBB.1.5.70) 39 | 23H: 23H (HK.3) 40 | 23I: 23I (BA.2.86) 41 | 24A: 24A (JN.1) 42 | 24B: 24B (JN.1.11.1) 43 | 24C: 24C (KP.3) 44 | 24D: 24D (XDV.1) 45 | 24E: 24E (KP.3.1.1) 46 | 24F: 24F (XEC) 47 | 24G: 24G (KP.2.3) 48 | 24H: 24H (LF.7) 49 | 24I: 24I (MV.1) 50 | 25A: 25A (LP.8.1) 51 | 25B: 25B (NB.1.8.1) 52 | 25C: 25C (XFG) 53 | -------------------------------------------------------------------------------- /defaults/clade_emergence_dates.tsv: -------------------------------------------------------------------------------- 1 | Nextstrain_clade first_sequence 2 | 19A 2019-12-01 3 | 19B 2019-12-01 4 | 20A 2020-01-20 5 | 20A.EU2 2020-02-15 6 | 20B 2020-02-14 7 | 20C 2020-02-25 8 | 20D 2020-03-12 9 | 20E 2020-05-27 10 | 20F 2020-05-24 11 | 20G 2020-06-11 12 | 20H 2020-08-10 13 | 20I 2020-09-20 14 | 20J 2020-10-29 15 | 21A 2020-10-30 16 | 21B 2020-10-30 17 | 21C 2020-08-03 18 | 21D 2020-11-21 19 | 21E 2021-01-10 20 | 21F 2020-11-20 21 | 21G 2021-01-05 22 | 21H 2021-01-05 23 | 21I 2020-10-30 24 | 21J 2020-10-30 25 | 21K 2021-09-01 26 | 21L 2021-09-01 27 | 21M 2021-09-01 28 | 22A 2021-12-01 29 | 22B 2021-12-01 30 | 22C 2021-12-01 31 | 22D 2022-04-01 32 | 22E 2022-07-10 33 | 22F 2022-07-01 34 | 23A 2022-10-01 35 | 23B 2022-11-01 36 | 23C 2022-06-01 37 | 23D 2022-08-01 38 | 23E 2022-10-01 39 | 23F 2023-01-01 40 | 23G 2023-01-01 41 | 23H 2023-03-01 42 | 23I 2023-04-01 43 | 24A 2023-07-01 44 | 24B 2023-11-01 45 | 24C 2023-12-01 46 | 24D 2024-01-01 47 | 24E 2023-12-01 48 | 24F 2024-05-01 49 | 24G 2024-01-01 50 | 24H 2024-05-01 51 | 24I 2024-05-01 52 | 25A 2024-07-01 53 | 25B 2025-01-01 54 | 25C 2025-01-01 55 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/acceptedExposureAdditions.txt: -------------------------------------------------------------------------------- 1 | Following combinations allowed: division (country, region) or country (region) 2 | Italian cruise ship (Italy, Europe) 3 | Nile River Cruise (Egypt, Africa) 4 | Diamond Princess (Japan, Asia) 5 | Grand Princess 2nd cruise (USA, North America) 6 | Guayas (Ecuador, South America) 7 | Grand Canary Islands (Spain, Europe) 8 | Tyrol (Italy, Europe) 9 | Trentino (Italy, Europe) 10 | Faroe Islands (Denmark, Europe) 11 | Guadalajara (Mexico, North America) 12 | Asia 
(Asia) 13 | Asia (Asia, Asia) 14 | Piedmont (Italy, Europe) 15 | Obwalden (Switzerland, Europe) 16 | Brazilian Cruise (Brazil, South America) 17 | Maldives (Asia, Maldives) 18 | Conakry (Guinea, Africa) 19 | Kabul (Afghanistan, Asia) 20 | Tajikistan (Asia) 21 | Sabah (Malaysia, Asia) 22 | Yemen (Asia) 23 | South Aegean Region (Greece, Europe) 24 | Arusha (Tanzania, Africa) 25 | Kalinga (Philippines, Asia) 26 | Lanao del Norte (Philippines, Asia) 27 | Sinai (Egypt, Africa) 28 | Sharjah (United Arab Emirates, Asia) 29 | Agusan del Norte (Philippines, Asia) 30 | Bohol (Philippines, Asia) 31 | Mekka (Saudi Arabia, Asia) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nextstrain 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/location_pattern.txt: -------------------------------------------------------------------------------- 1 | Patient residence: XXX 2 | Civil Hospital, XXX 3 | Industrial Enterprises in XXX 4 | Alzheimer home, XXX 4 5 | Airport of XXX 6 | XXX (interpreted as patient residence 7 | Clinical Hospital of XXX 8 | Hospital das Clínicas de XXX 9 | Hospital das Clinicas de XXX 10 | Zip code: XXX 11 | Hospital General de XXX 12 | Patient residence:XXX 13 | XXX Health Department 14 | XXX Health department 15 | Patient origin XXX 16 | zip code: XXX 17 | zip code: XXX (interpreted as patient residence) (interpreted as patient residence) (interpreted as patient residence) (interpreted as patient residence) (interpreted as patient residence) 18 | CS XXX 19 | HG de XXX 20 | CS de XXX 21 | HG da XXX 22 | Lives in XXX 23 | Resident in XXX 24 | Patient resident in XXX 25 | Patient resident in XXX. 
26 | Lives in the XXX 27 | Patient resides in XXX 28 | Patient from XXX transferred to Rio Grande do Sul State to receive hospital care 29 | Patient from XXX relocated to Rondonia State 30 | Residence XXX 31 | residence XXX 32 | Residence: XXX 33 | Patient From XXX (interpreted as patient residence) 34 | Patient from XXX 35 | -------------------------------------------------------------------------------- /workflow/wdl/gisaid_ingest.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "tasks/ncov_ingest.wdl" as ncov_ingest 4 | 5 | workflow GISAID_INGEST { 6 | input { 7 | # ncov ingest 8 | String GISAID_API_ENDPOINT 9 | String GISAID_USERNAME_AND_PASSWORD 10 | 11 | # Optionals 12 | File? cache_nextclade_old 13 | String? filter # e.g. "region:Africa" passed to tsv-filters 14 | 15 | Int? cpu 16 | Int? memory # in GiB 17 | Int? disk_size 18 | } 19 | 20 | call ncov_ingest.gisaid_ingest as ingest { 21 | input: 22 | GISAID_API_ENDPOINT = GISAID_API_ENDPOINT, 23 | GISAID_USERNAME_AND_PASSWORD = GISAID_USERNAME_AND_PASSWORD, 24 | 25 | # optionals 26 | cache_nextclade_old = cache_nextclade_old, 27 | filter = filter, 28 | 29 | cpu = cpu, 30 | memory = memory, 31 | disk_size = disk_size 32 | } 33 | 34 | output { 35 | # ncov-ingest output either gisaid or genbank 36 | File sequences_fasta = ingest.sequences_fasta 37 | File metadata_tsv = ingest.metadata_tsv 38 | 39 | File nextclade_tsv = ingest.nextclade_tsv 40 | String last_run = ingest.last_run 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /nextstrain_profiles/100k/config-open.yaml: -------------------------------------------------------------------------------- 1 | # This file is largely duplicated from `config-gisaid.yaml` - please 2 | # see that file for comments 3 | S3_DST_BUCKET: "nextstrain-data/files/ncov/open/100k" # TODO XXX 4 | S3_DST_ORIGINS: [needed-for-workflow-but-unused] 5 | deploy_url: needed_for_workflow_but_unused 6 | custom_rules: 7 | - workflow/snakemake_rules/export_for_nextstrain.smk 8 | inputs: 9 | - name: open 10 | metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.zst" 11 | aligned: "s3://nextstrain-data/files/ncov/open/sequences.fasta.zst" 12 | skip_sanitize_metadata: true 13 | deduplicated: true 14 | builds: 15 | 100k: 16 | subsampling_scheme: 100k_scheme 17 | upload: 18 | metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz 19 | sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz 20 | filter: 21 | exclude_where: "division='USA'" 22 | subsampling: 23 | 100k_scheme: 24 | 50k_early: 25 | group_by: "year month country" 26 | max_sequences: 50000 27 | max_date: "--max-date 1Y" 28 | 50k_late: 29 | group_by: "year month country" 30 | max_sequences: 50000 31 | min_date: "--min-date 1Y" 32 | -------------------------------------------------------------------------------- /data/references_metadata.tsv: -------------------------------------------------------------------------------- 1 | strain virus gisaid_epi_isl genbank_accession date region country division location region_exposure country_exposure division_exposure segment length host age sex Nextstrain_clade pango_lineage GISAID_clade originating_lab submitting_lab authors url title paper_url date_submitted sampling_strategy missing_data divergence nonACGTN rare_mutations snp_clusters QC_missing_data QC_mixed_sites QC_rare_mutations QC_snp_clusters clock_deviation 2 | Wuhan/Hu-1/2019 ncov EPI_ISL_402125 MN908947.3 2019-12-26 Asia 
China Hubei Wuhan Asia China Hubei genome 29903 Human ? ? 19A B L National Institute for Communicable Disease Control and Prevention (ICDC) Chinese Center for Disease Control and Prevention (China CDC) National Institute for Communicable Disease Control and Prevention (ICDC) Chinese Center for Disease Control and Prevention (China CDC) Zhang et al https://www.gisaid.org A new coronavirus associated with human respiratory disease in China https://dx.doi.org/10.1038/s41586-020-2008-3 2020-01-12 0.0 0.0 0.0 0.0 0.0 good good good good -0.4611005157393094 3 | 21L ncov ? ? 2021-11-01 ? ? ? ? ? ? ? genome 29903 Human ? ? 21L (Omicron) BA.2 ? ? ? ? ? ? ? 2021-11-01 0.0 0.0 0.0 0.0 0.0 good good good good -0.4611005157393094 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first three. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | BUILDDIR ?= build 9 | SOURCEDIR = src 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help help-docker Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | livehtml: 23 | sphinx-autobuild -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | 25 | .ONESHELL: 26 | docker-html: 27 | set -euox 28 | docker build -t nextstrain-docs-builder --network=host . 
29 | docker run -it --rm \ 30 | --name=nextstrain-docs-builder-$(shell date +%s) \ 31 | --init \ 32 | --user=$(shell id -u):$(shell id -g) \ 33 | --volume=$(shell pwd):/home/user/src \ 34 | --workdir=/home/user/src \ 35 | --env 'TERM=xterm-256colors' \ 36 | nextstrain-docs-builder 37 | -------------------------------------------------------------------------------- /tests/check_auspice_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser( 7 | description="Ensure certain values are present for a given node trait", 8 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 9 | ) 10 | parser.add_argument('--json', type=str, metavar="JSON", required=True, help="Auspice JSON") 11 | parser.add_argument('--attr', type=str, metavar="KEY", required=True, help="node attr to collect") 12 | parser.add_argument('--values', type=str, nargs="+", metavar="VALUE", required=True, help="values to check") 13 | args = parser.parse_args() 14 | 15 | values_seen = set() 16 | 17 | def collect(node): 18 | v = node.get("node_attrs", {}).get(args.attr, {}).get("value", "") 19 | if v: 20 | values_seen.add(v) 21 | for child in node.get("children", []): 22 | collect(child) 23 | 24 | with open(args.json, "r") as f: 25 | input_json = json.load(f) 26 | 27 | collect(input_json["tree"]) 28 | 29 | if not values_seen >= set(args.values): 30 | print("Following values missing from JSON:", set(args.values)-values_seen) 31 | sys.exit(1) 32 | -------------------------------------------------------------------------------- /defaults/clades_who.tsv: -------------------------------------------------------------------------------- 1 | clade gene site alt 2 | 3 | Alpha nuc 14676 T 4 | Alpha nuc 15279 T 5 | Alpha nuc 23063 T 6 | 7 | Beta nuc 23012 A 8 | Beta nuc 23063 T 9 | Beta nuc 23403 G 10 | Beta nuc 26456 T 11 | 12 | Gamma nuc 733 C 13 | Gamma nuc 2749 T 14 | Gamma nuc 3828 T 15 | Gamma nuc 5648 C 16 | Gamma nuc 12778 T 17 | Gamma nuc 13860 T 18 | 19 | Delta nuc 21618 G 20 | Delta nuc 26767 C 21 | Delta nuc 28461 G 22 | 23 | Epsilon nuc 17014 T 24 | Epsilon nuc 21600 T 25 | Epsilon nuc 22018 T 26 | Epsilon nuc 22917 G 27 | 28 | Eta nuc 14407 T 29 | Eta nuc 20724 G 30 | Eta nuc 23593 C 31 | Eta nuc 24224 C 32 | Eta nuc 24748 T 33 | 34 | Theta nuc 12049 T 35 | Theta nuc 23341 C 36 | Theta nuc 23604 A 37 | Theta nuc 24187 A 38 | Theta nuc 24836 A 39 | 40 | Iota nuc 16500 C 41 | Iota nuc 20262 G 42 | Iota nuc 21575 T 43 | Iota nuc 22320 G 44 | 45 | Kappa nuc 17523 T 46 | Kappa nuc 22917 G 47 | Kappa nuc 23012 C 48 | Kappa nuc 27638 C 49 | Kappa nuc 28881 T 50 | Kappa nuc 29402 T 51 | 52 | Lambda nuc 21786 T 53 | Lambda nuc 21789 T 54 | Lambda nuc 22917 A 55 | Lambda nuc 23031 C 56 | 57 | Mu nuc 3428 G 58 | Mu nuc 4878 T 59 | Mu nuc 11451 G 60 | Mu nuc 13057 T 61 | Mu nuc 17491 T 62 | Mu nuc 27925 A 63 | 64 | Omicron nuc 18163 G 65 | Omicron nuc 23599 G 66 | -------------------------------------------------------------------------------- /scripts/normalize_gisaid_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | GISAID_SARSCOV2_IN=$1 4 | GISAID_SARSCOV2_OUT=$2 5 | MIN_LENGTH=$3 6 | 7 | if [[ ! 
-r "$GISAID_SARSCOV2_IN" ]] 8 | then 9 | echo "$0: input $GISAID_SARSCOV2_IN not found" 10 | exit 1 11 | fi 12 | 13 | if [[ -z "$MIN_LENGTH" ]] 14 | then 15 | echo "Using default minimum length of 25000" 16 | MIN_LENGTH=25000 17 | fi 18 | 19 | echo "Normalizing GISAID file $GISAID_SARSCOV2_IN to $GISAID_SARSCOV2_OUT (min length $MIN_LENGTH)" 20 | 21 | # Remove leading virus name prefix from sequence names 22 | # Remove embedded spaces in sequence names (Hong Kong sequences) 23 | # Remove trailing |EPI_ISL_id|datestamp from sequence names 24 | # Remove sequences shorter than minimum length 25 | # Eliminate duplicate sequences (keep only the first seen) 26 | 27 | #cat $GISAID_SARSCOV2_IN | 28 | sed 's/^>[hn]Co[Vv]-19\//>/g' $GISAID_SARSCOV2_IN | # remove leading prefix 29 | sed 's/ //g' | # remove embedded spaces 30 | sed 's/|.*$//' | # remove trailing metadata 31 | awk "BEGIN{RS=\">\";FS=\"\n\"}length>$MIN_LENGTH{print \">\"\$0}" | # remove short seqs 32 | awk 'BEGIN{RS=">";FS="\n"}!x[$1]++{print ">"$0}' | # remove duplicates 33 | grep -v '^>*$' > $GISAID_SARSCOV2_OUT 34 | 35 | exit 0 36 | -------------------------------------------------------------------------------- /scripts/expand-clade-definitions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | usage: expand-clade-definitions 4 | 5 | Reads in a clade definitions file suitable for `augur clades` and expands any 6 | hierarchically-defined clades (i.e. clade definitions that inherit from earlier 7 | clade definitions). 8 | 9 | This should probably become a part of Augur in the future, as it's useful for 10 | programmatic manipulation of clade definitions files. 11 | """ 12 | import csv 13 | from augur.clades import read_in_clade_definitions 14 | from sys import stdout 15 | 16 | 17 | def write_out_clade_definitions(file, defs): 18 | out = csv.writer(file, dialect = "excel-tab", lineterminator = "\n") 19 | out.writerow(("clade", "gene", "site", "alt")) 20 | out.writerows( 21 | (clade, gene, site + 1, alt) 22 | for clade, muts in defs.items() 23 | for gene, site, alt in muts) 24 | 25 | 26 | if __name__ == "__main__": 27 | from argparse import ArgumentParser 28 | 29 | cli = ArgumentParser(description = __doc__.strip().split("\n\n", 1)[1]) 30 | cli.add_argument("clades", metavar = "") 31 | 32 | args = cli.parse_args() 33 | 34 | write_out_clade_definitions(stdout, read_in_clade_definitions(args.clades)) 35 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/country_ordering/Slovakia_variants.txt: -------------------------------------------------------------------------------- 1 | Dubnica Nd Váhom Dubnica nad Vahom 2 | Mala N, Hronom Malá nad Hronom 3 | Turciansketeplice Turcianske Teplice 4 | Sucha Hora Suchá Hora 5 | Dolny Kubin Dolný Kubín 6 | Mala N, Hronom Malá nad Hronom 7 | Oravska Jasenic Oravska Jasenica 8 | Dvory N, Zitavou Dvory nad Žitavou 9 | Vranov nad Toplou Vranov nad Toplov 10 | Turcianska Stavnicka Turčianska Štiavnička 11 | Tvrodsin Tvrdosin 12 | Stropkov Region Stropkov 13 | Zilina Region Zilina 14 | Dolny Kubin Region Dolný Kubín 15 | Rimavka Sobota Rimavská Sobota 16 | Cierne Pri Cadci Čierne 17 | Diviaky Diviaky nad Nitricou 18 | Zlat Moravce Zlaté Moravce 19 | Komano Komarno 20 | Bardejov Region Bardejov 21 | Humenne Region Humenne 22 | Kosice Region Kosice 23 | Senica Region Senica 24 | Senec Region Senec 25 | Banovce nad Bebravou Region Banovce nad Bebravou 26 | Rimavske 
Sobota Rimavska Sobota 27 | Šaľa Region Sala 28 | Medzilaborce Region Medzilaborce 29 | Leopolodov Leopoldov 30 | Sobrance Region Sobrance 31 | Hlohovec Region Hlohovec 32 | Michalovce Region Michalovce 33 | Košice Region Kosice 34 | Humenné Region Humenné 35 | Nižné Raslavice Raslavice 36 | Rožňava Region Rožňava 37 | Trebišov Region Trebisov 38 | Nitra Region Nitra 39 | 40 | -------------------------------------------------------------------------------- /tests/remote-inputs-uncompressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | - name: test-remote-uncompressed-asia-sequences 3 | metadata: s3://nextstrain-data/files/ncov/test-data/asia_metadata.tsv 4 | sequences: s3://nextstrain-data/files/ncov/test-data/asia_sequences.fasta 5 | - name: test-remote-uncompressed-europe-aligned 6 | metadata: s3://nextstrain-data/files/ncov/test-data/europe_metadata.tsv 7 | aligned: s3://nextstrain-data/files/ncov/test-data/europe_aligned.fasta 8 | - name: test-remote-uncompressed-americas-filtered 9 | metadata: s3://nextstrain-data/files/ncov/test-data/americas_metadata.tsv 10 | filtered: s3://nextstrain-data/files/ncov/test-data/americas_filtered.fasta 11 | - name: references 12 | metadata: data/references_metadata.tsv 13 | sequences: data/references_sequences.fasta 14 | 15 | # As we are not including the test data from Asia (see above), this build will 16 | # be missing the default root sequence. We instead use 17 | # `data/references_sequences.fasta` that contains Wuhan/Hu-1/2019 18 | refine: 19 | root: "Wuhan/Hu-1/2019" 20 | 21 | builds: 22 | test-remote-uncompressed: 23 | subsampling_scheme: small 24 | 25 | subsampling: 26 | small: 27 | small-sample: 28 | group_by: "region" 29 | max_sequences: 100 30 | -------------------------------------------------------------------------------- /docs/src/visualization/interpretation.rst: -------------------------------------------------------------------------------- 1 | Guidance for interpretation 2 | =========================== 3 | 4 | Introductory resources 5 | ---------------------- 6 | 7 | - Visual explanation of how viral mutations and spread are related: https://www.nytimes.com/interactive/2020/04/30/science/coronavirus-mutations.html 8 | 9 | - Introduction to interpreting phylogenetic trees: https://nextstrain.org/narratives/trees-background/ 10 | 11 | - How to interact with Auspice (the engine for viewing trees): https://neherlab.org/201901_krisp_auspice.html 12 | 13 | - Overview of genomic epidemiology (older, but still relevant and clear): https://www.nature.com/articles/nrg2583 14 | 15 | Case Studies 16 | ------------ 17 | 18 | - UCSF-led analysis of genomic epi in California: https://science.sciencemag.org/content/early/2020/06/05/science.abb9263 19 | 20 | - UK analysis of hospital-acquired infections: https://www.medrxiv.org/content/10.1101/2020.05.08.20095687v1 21 | 22 | - UK's analysis of coronavirus introductions: https://virological.org/t/preliminary-analysis-of-sars-cov-2-importation-establishment-of-uk-transmission-lineages/507 23 | 24 | - Australia cluster detection: https://www.medrxiv.org/content/10.1101/2020.05.12.20099929v1 25 | 26 | - Nextstrain situation reports: https://nextstrain.org/ncov-sit-reps/ 27 | -------------------------------------------------------------------------------- /nextstrain_profiles/100k/README.md: -------------------------------------------------------------------------------- 1 | ## Aim 2 | 3 | To build a representative 100k dataset which is 
available for testing / developing builds locally. 4 | This is intended to run weekly via a GitHub action (which triggers jobs to be run on AWS). 5 | It will upload these files: 6 | 7 | * `s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz` 8 | * `s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz` 9 | * `s3://nextstrain-ncov-private/100k/metadata.tsv.xz` 10 | * `s3://nextstrain-ncov-private/100k/sequences.fasta.xz` 11 | 12 | While this profile is not recommended to be run locally, you can see what rules would be run via: 13 | 14 | ``` 15 | snakemake --cores 1 --configfile nextstrain_profiles/100k/config-gisaid.yaml -npf upload --dag | dot -Tpdf > dag-100k-gisaid.pdf 16 | snakemake --cores 1 --configfile nextstrain_profiles/100k/config-open.yaml -npf upload --dag | dot -Tpdf > dag-100k-open.pdf 17 | ``` 18 | 19 | To run manually you can trigger the GitHub action (recommended) or run the jobs locally via: 20 | ``` 21 | nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \ 22 | --configfile nextstrain_profiles/100k/config-gisaid.yaml \ 23 | -f upload 24 | nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \ 25 | --configfile nextstrain_profiles/100k/config-open.yaml \ 26 | -f upload 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/src/_static/css/configuration-reference.css: -------------------------------------------------------------------------------- 1 | /* Custom CSS to be applied to the reference/workflow-config-file.rst 2 | page. That page defines a custom class of .configuration-reference */ 3 | 4 | 5 | /* We detail a lot of nested (snakemake) configuration entries in the 6 | page. The parent key (top-level config key) is

<h3> and sub-keys are <h4>. 7 | The default nextstrain-theme renders <h3> and <h4> extremely similarly. 8 | The following style changes are intended to convey that certain config 9 | entries are children of a higher-level config, rather than being top-level 10 | config parameters themselves */ 11 | 12 | .configuration-reference h4 { 13 | font-size: 100%; 14 | } 15 | 16 | /* Pad lists generated by a (local) contents directive showing sub-keys */ 17 | .configuration-reference section > section > div.contents.local.topic { 18 | margin-left: 24px; /* same as a nested <li>
  • */ 19 | margin-top: -20px; /* CSS can't select previous sibling, FYI */ 20 | } 21 | .configuration-reference section > section > div.contents.local.topic > ul > li { 22 | list-style: circle; 23 | } 24 | /* pad out their siblings (which come _after_ the list) so that they 25 | are in line with the start of text in the preceding
  • element */ 26 | .configuration-reference section > section > div.contents.local.topic ~ * { 27 | margin-left: 48px; 28 | } 29 | 30 | -------------------------------------------------------------------------------- /scripts/annotate_metadata_with_index.py: -------------------------------------------------------------------------------- 1 | """Annotate a metadata file with the given sequence index. 2 | """ 3 | import argparse 4 | from augur.io import read_metadata 5 | import pandas as pd 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 10 | parser.add_argument("--metadata", required=True, help="metadata to annotate") 11 | parser.add_argument("--sequence-index", required=True, help="sequence index from augur index") 12 | parser.add_argument("--output", required=True, help="metadata annotated with sequence index columns including a 'length' column based on the number of A, C, G, and T bases.") 13 | 14 | args = parser.parse_args() 15 | 16 | metadata = read_metadata(args.metadata) 17 | 18 | index = pd.read_csv( 19 | args.sequence_index, 20 | sep="\t", 21 | ).drop( 22 | columns=["length"], 23 | ) 24 | index["length"] = index.loc[:, ["A", "C", "G", "T"]].sum(axis=1) 25 | new_columns = { 26 | column: f"_{column}" 27 | for column in index.columns 28 | if column != "strain" 29 | } 30 | index = index.rename(columns=new_columns) 31 | 32 | metadata.merge( 33 | index, 34 | on="strain", 35 | ).to_csv( 36 | args.output, 37 | sep="\t", 38 | index=False, 39 | ) 40 | -------------------------------------------------------------------------------- /docs/src/guides/update-workflow.rst: -------------------------------------------------------------------------------- 1 | Update the workflow 2 | =================== 3 | 4 | We update the official workflow regularly with: 5 | 6 | - curated metadata including latitudes/longitudes, clade annotations, and low quality sequences 7 | - bug fixes 8 | - :doc:`new features <../reference/change_log>` 9 | 10 | Update your local copy of the workflow, to benefit from these changes. 11 | 12 | .. code:: bash 13 | 14 | # Download and apply changes from the Nextstrain team. 15 | # This only works if there is no conflict with your local repository. 16 | git pull --ff-only origin master 17 | 18 | # OR: 19 | 20 | # Alternately, download and apply changes from the Nextstrain team 21 | # and then replay your local changes on top of those incoming changes. 22 | git pull --rebase origin master 23 | 24 | Alternately, download a specific version of the workflow that you know works for you. We create new `releases of the workflow `__ any time we introduce breaking changes, so you can choose when to update based on :doc:`what has changed <../reference/change_log>`. 25 | 26 | .. code:: bash 27 | 28 | # Download version 7 (v7) of the workflow. 29 | curl -OL https://github.com/nextstrain/ncov/archive/refs/tags/v7.zip 30 | 31 | # Uncompress the workflow. 32 | unzip v7.zip 33 | 34 | # Change into the workflow's directory. 35 | cd ncov-7/ 36 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description of proposed changes 2 | 3 | What is the goal of this pull request? What does this pull request change? 4 | 5 | ## Related issue(s) 6 | 7 | 8 | Fixes # 9 | Related to # 10 | 11 | ## Testing 12 | 13 | What steps should be taken to test the changes you've proposed? 
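For changes to the workflow itself, one possible local smoke test — a sketch rather than an official requirement, assuming Snakemake is available and reusing this repository's CI profile (`nextstrain_profiles/nextstrain-ci`) — is a dry run that lists the jobs the CI build would execute:

```bash
# Dry-run (-np) the CI profile: print the rules and commands that would run, without executing them.
snakemake --cores 4 -np --profile nextstrain_profiles/nextstrain-ci
```
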
14 | If you added or changed behavior in the codebase, did you update the tests, or do you need help with this? 15 | 16 | ## Release checklist 17 | 18 | If this pull request introduces backward incompatible changes, complete the following steps for a new release of the workflow: 19 | 20 | - [ ] Determine the version number for the new release by incrementing [the most recent release](https://github.com/nextstrain/ncov/releases) (e.g., "v2" from "v1"). 21 | - [ ] Update `docs/src/reference/change_log.md` in this pull request to document these changes and the new version number. 22 | - [ ] After merging, [create a new GitHub release](https://github.com/nextstrain/ncov/releases/new) with the new version number as the tag and release title. 23 | 24 | If this pull request introduces new features, complete the following steps: 25 | 26 | - [ ] Update `docs/src/reference/change_log.md` in this pull request to document these changes by the date they were added. 27 | 28 | 29 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-ci/builds.yaml: -------------------------------------------------------------------------------- 1 | # Only use one build for CI. 2 | active_builds: europe 3 | 4 | # Override full GISAID data with example data for a faster build. 5 | inputs: 6 | - name: gisaid 7 | metadata: "data/example_metadata.tsv" 8 | sequences: "data/example_sequences.fasta.gz" 9 | 10 | warning: "Test warning" 11 | 12 | builds: 13 | # Override the default Nextstrain European build's subsampling scheme for more 14 | # stable subsampling of a fixed dataset in continuous integration tests. 15 | europe: 16 | subsampling_scheme: nextstrain_ci_sampling 17 | region: Europe 18 | 19 | subsampling: 20 | # Custom subsampling logic for CI tests. 21 | nextstrain_ci_sampling: 22 | # Focal samples for region 23 | region: 24 | group_by: "division year month" 25 | max_sequences: 20 26 | sampling_scheme: "--no-probabilistic-sampling" 27 | exclude: "--exclude-where 'region!={region}'" 28 | # Contextual samples for region from the rest of the world 29 | global: 30 | group_by: "year month" 31 | max_sequences: 10 32 | sampling_scheme: "--no-probabilistic-sampling" 33 | exclude: "--exclude-where 'region={region}'" 34 | priorities: 35 | type: "proximity" 36 | focus: "region" 37 | 38 | # Override default frequency settings, so we can estimate frequencies from older 39 | # data with a fixed time range. 
40 | frequencies: 41 | min_date: 2020-01-01 42 | max_date: 2020-05-10 43 | -------------------------------------------------------------------------------- /workflow/schemas/config.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://json-schema.org/draft/2020-12/schema 2 | 3 | description: snakemake configuration file 4 | 5 | type: object 6 | 7 | properties: 8 | inputs: 9 | type: 10 | - array 11 | minItems: 1 12 | items: 13 | type: object 14 | required: 15 | - name 16 | - metadata 17 | properties: 18 | name: 19 | type: string 20 | minLength: 1 21 | metadata: 22 | type: string 23 | minLength: 1 24 | sequences: 25 | type: string 26 | minLength: 1 27 | aligned: 28 | type: string 29 | minLength: 1 30 | skip_sanitize_metadata: 31 | type: boolean 32 | deduplicated: 33 | type: boolean 34 | additionalProperties: false 35 | 36 | builds: 37 | type: object 38 | minProperties: 1 39 | propertyNames: 40 | # Allow build names to contain alphanumeric characters, underscores, and hyphens 41 | # but not special strings used for Nextstrain builds. Also used in the 42 | # workflow's wildcard_constraints. 43 | pattern: "^(?:[-a-zA-Z0-9_](?!tip-frequencies|root-sequence))+$" 44 | 45 | S3_DST_COMPRESSION: 46 | type: string 47 | enum: 48 | - gz 49 | - xz 50 | 51 | S3_DST_ORIGINS: 52 | type: array 53 | minItems: 1 54 | items: 55 | type: string 56 | # A similar pattern is used in the workflow's wildcard constraints. 57 | pattern: "^[a-zA-Z0-9-]+$" 58 | 59 | -------------------------------------------------------------------------------- /docs/translation_docs.md: -------------------------------------------------------------------------------- 1 | ## Translating Nextstrain Situation Reports 2 | 3 | We welcome translations of the situation reports (narratives) into languages other than English (in particular to languages commonly spoken in areas affected by the outbreak). We're incredibly grateful for and impressed by the contributions provided already! 4 | 5 | ### Getting started 6 | 7 | 1. Check to see if the situation report is already available in your language on [the Nextstrain homepage](https://nextstrain.org). If the date is the same for the English version and the version in your language, then it's already up to date! :) 8 | 9 | 2. Find your language\* on the [translation project board](https://github.com/nextstrain/ncov/projects/1), and comment on the issue so we know you're working on it. 10 | 11 | 3. Follow the instructions in the issue to submit your translation. 12 | 13 | 4. **When you're done, please remember to move the issue to the "ready for review" column [in the project board](https://github.com/nextstrain/ncov/projects/1).** This helps us keep everything moving smoothly. 14 | 15 | 5. When your translation has been reviewed and approved by a second translator, we'll publish it and put it on the Nextstrain homepage! 16 | 17 | ### \*If your language isn't listed on the project board 18 | 19 | We'd love to add even more languages! [Please open an issue here](https://github.com/nextstrain/ncov/issues/new?assignees=cassiawag&labels=&template=translation--community-request-.md&title=%5BLanguage+translation+request%5D); we'll get back to you right away! 
20 | -------------------------------------------------------------------------------- /scripts/construct-recency-from-submission-date.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime 3 | from augur.io import read_metadata 4 | import json 5 | 6 | def get_recency(date_str, ref_date): 7 | date_submitted = datetime.strptime(date_str, '%Y-%m-%d').toordinal() 8 | ref_day = ref_date.toordinal() 9 | 10 | delta_days = ref_day - date_submitted 11 | if delta_days<=0: 12 | return 'New' 13 | elif delta_days<3: 14 | return '1-2 days ago' 15 | elif delta_days<8: 16 | return '3-7 days ago' 17 | elif delta_days<15: 18 | return 'One week ago' 19 | elif delta_days<31: 20 | return 'One month ago' 21 | elif delta_days>=31: 22 | return 'Older' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser( 26 | description="Assign each sequence a field that specifies when it was added", 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 28 | ) 29 | 30 | parser.add_argument('--metadata', type=str, required=True, help="metadata file") 31 | parser.add_argument('--output', type=str, required=True, help="output json") 32 | args = parser.parse_args() 33 | 34 | meta = read_metadata(args.metadata) 35 | 36 | node_data = {'nodes':{}} 37 | ref_date = datetime.now() 38 | 39 | for strain, d in meta.iterrows(): 40 | if 'date_submitted' in d and d['date_submitted'] and d['date_submitted'] != "undefined": 41 | node_data['nodes'][strain] = {'recency': get_recency(d['date_submitted'], ref_date)} 42 | 43 | with open(args.output, 'wt') as fh: 44 | json.dump(node_data, fh) 45 | -------------------------------------------------------------------------------- /.github/workflows/revert.yml: -------------------------------------------------------------------------------- 1 | name: Revert nextstrain.org/ncov/gisaid or nextstrain.org/ncov/open 2 | 3 | on: 4 | # Manually triggered using GitHub's UI 5 | workflow_dispatch: 6 | inputs: 7 | data_source_name: 8 | description: Name of data source corresponding to the datasets on nextstrain.org/ncov to reset. Options are "gisaid" or "open". 9 | required: true 10 | build_region_name: 11 | description: A single regional dataset on nextstrain.org/ncov/{data_source_name} to reset. Options are "global", "africa", "asia", "europe", "north-america", "oceania", "south-america". If not specified, reverts all. If you'd like to revert multiple regions but not all, run the action multiple times, specifying one region each time. 12 | required: true 13 | default: all 14 | date: 15 | description: Date to revert to. A corresponding set of date-stamped datasets must exist on the s3 bucket. Format is YYYY-MM-DD. 
16 | required: true 17 | 18 | env: 19 | DATA_SOURCE_NAME: ${{ github.event.inputs.data_source_name }} 20 | BUILD_REGION_NAME: ${{ github.event.inputs.build_region_name }} 21 | DATE: ${{ github.event.inputs.date }} 22 | 23 | jobs: 24 | revert: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v6 28 | 29 | - uses: nextstrain/.github/actions/setup-nextstrain-cli@master 30 | 31 | - name: Revert build 32 | run: | 33 | ./scripts/revert "$DATA_SOURCE_NAME" "$BUILD_REGION_NAME" "$DATE" 34 | env: 35 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 36 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files created by the pipeline, which we want to keep out of git 2 | # (or at least out of _this_ git repo). 3 | benchmarks/ 4 | logs/ 5 | results/ 6 | build/ 7 | auspice/ 8 | data/* 9 | defaults/colors.tsv 10 | defaults/colors_*.tsv 11 | # stats.json was removed in Snakemake v8 12 | stats.json 13 | 14 | # common analysis directory names 15 | ncov-tutorial/ 16 | my-ncov-analyses/ 17 | 18 | # old analysis directory 19 | my_profiles/* 20 | !my_profiles/README.md 21 | 22 | # Downloaded remote files from sources we expect 23 | /nextstrain-ncov-private/ 24 | /nextstrain-data/ 25 | /data.nextstrain.org/ 26 | 27 | # Sensitive environment variables 28 | environment* 29 | 30 | # Snakemake state dir 31 | /.snakemake 32 | snakemake_log 33 | 34 | # Local config overrides 35 | /config_local.yaml 36 | 37 | # For Python # 38 | ############## 39 | *.pyc 40 | .tox/ 41 | .cache/ 42 | 43 | # Compiled source # 44 | ################### 45 | *.com 46 | *.class 47 | *.dll 48 | *.exe 49 | *.o 50 | *.so 51 | 52 | # OS generated files # 53 | ###################### 54 | .DS_Store 55 | .DS_Store? 56 | ._* 57 | .Spotlight-V100 58 | .Trashes 59 | Icon? 60 | ehthumbs.db 61 | Thumbs.db 62 | *~ 63 | 64 | # IDE generated files # 65 | ###################### 66 | .vscode/ 67 | 68 | narratives/*pdf 69 | 70 | 71 | # metadata / new seqs scripts 72 | scripts/curate_metadata/inputs_new_sequences/ 73 | scripts/curate_metadata/output_curate_metadata/ 74 | scripts/curate_metadata/outputs_new_sequences/ 75 | scripts/curate_metadata/config_curate_metadata/geoLocationRules.txt 76 | scripts/curate_metadata/config_curate_metadata/manualAnnotationRules.txt 77 | 78 | 79 | # SLURM log files 80 | slurm-*.out 81 | -------------------------------------------------------------------------------- /docs/src/reference/glossary.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Glossary 3 | ======== 4 | 5 | .. glossary:: 6 | 7 | analysis directory 8 | 9 | The folder within ``ncov/`` where :term:`customization files ` live. Previously this was ``my_profiles/`` but we now allow any name of choice, and provide `ncov-tutorial `__ as a starter template. 10 | 11 | Auspice config file 12 | also ``auspice_config.json`` 13 | 14 | A JSON file used to configure visualization in :term:`docs.nextstrain.org:Auspice`. 15 | 16 | config file 17 | also *workflow config file*, *workflow configuration file*, ``builds.yaml`` 18 | 19 | A YAML file used to `configure `__ the :term:`Snakemake` workflow (via the ``--configfile`` option). Appends to and overrides default configuration in ``defaults/parameters.yaml``. For the :term:`ncov workflow`, this file must follow a :doc:`specific format `. 
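      A minimal sketch of such a file (illustrative only — the input paths, build name, and subsampling scheme below are placeholders modelled on the test configs elsewhere in this repository):

      .. code:: yaml

         inputs:
           - name: example-data
             metadata: data/example_metadata.tsv
             sequences: data/example_sequences.fasta.gz

         builds:
           example-build:
             subsampling_scheme: small

         subsampling:
           small:
             small-sample:
               group_by: "region"
               max_sequences: 100
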
20 | 21 | customization file 22 | 23 | A file used to customize the :term:`ncov workflow`. 24 | 25 | Examples: :term:`Auspice config file`, :term:`workflow config file`, :term:`default files` 26 | 27 | default files 28 | 29 | Default :term:`customization files ` provided in ``ncov/defaults/``. 30 | 31 | ncov workflow 32 | also *SARS-CoV-2 workflow* 33 | 34 | The workflow used to automate execution of :term:`builds`. Implemented in :term:`Snakemake`. 35 | 36 | Snakemake 37 | 38 | The workflow manager used in the :term:`ncov workflow`. 39 | -------------------------------------------------------------------------------- /scripts/rename_clades.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser( 6 | description="Rename clades in clades.tsv", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | parser.add_argument('--input-clade-files', type=str, nargs='+', required=True, help="input clade files") 11 | parser.add_argument('--name-mapping', type=str, required=False, help="YAML mapping between Nextstrain clades and display names") 12 | parser.add_argument('--output-clades', type=str, required=True, help="renamed clade file") 13 | args = parser.parse_args() 14 | 15 | # read name mapping from input yaml file 16 | if args.name_mapping: 17 | with open(args.name_mapping) as fh: 18 | name_mapping = yaml.load(fh, Loader=yaml.FullLoader) 19 | else: 20 | name_mapping = {} 21 | 22 | 23 | # write output into one consolidated file 24 | out_clades = open(args.output_clades, "w") 25 | 26 | # loop over input file and replace clade names were appropriate line by line 27 | for fname in args.input_clade_files: 28 | with open(fname) as fh: 29 | for line in fh: 30 | fields = line.strip('\n').split('\t') 31 | if len(fields) < 3: 32 | continue 33 | fields[0] = name_mapping.get(fields[0], fields[0]) 34 | # if clade definition is based on other clade, replace name 35 | if fields[1]=='clade': 36 | fields[2] = name_mapping.get(fields[2], fields[2]) 37 | out_clades.write('\t'.join(fields)+'\n') 38 | 39 | out_clades.close() 40 | -------------------------------------------------------------------------------- /docs/redirects.yaml: -------------------------------------------------------------------------------- 1 | # Authoritative list of redirects we have configured in RTD. See the 2 | # docs.nextstrain.org repo's README.md¹ for more information on maintaining 3 | # redirects. 
4 | # 5 | # ¹ https://github.com/nextstrain/docs.nextstrain.org#configuring-redirects 6 | --- 7 | - type: page 8 | from_url: /analysis/ 9 | to_url: /tutorial/index.html 10 | 11 | - type: page 12 | from_url: /analysis/index.html 13 | to_url: /tutorial/index.html 14 | 15 | - type: page 16 | from_url: /analysis/customizing-analysis.html 17 | to_url: /guides/workflow-config-file.html 18 | 19 | - type: page 20 | from_url: /analysis/customizing-visualization.html 21 | to_url: /guides/customizing-visualization.html 22 | 23 | - type: page 24 | from_url: /analysis/data-prep.html 25 | to_url: /guides/data-prep/index.html 26 | 27 | - type: page 28 | from_url: /analysis/orientation-files.html 29 | to_url: /reference/files.html 30 | 31 | - type: page 32 | from_url: /analysis/orientation-workflow.html 33 | to_url: /reference/nextstrain-overview.html 34 | 35 | - type: page 36 | from_url: /analysis/running.html 37 | to_url: /reference/troubleshoot.html 38 | 39 | - type: page 40 | from_url: /analysis/setup.html 41 | to_url: /tutorial/setup.html 42 | 43 | - type: page 44 | from_url: /videos.html 45 | to_url: /tutorial/videos.html 46 | 47 | - type: page 48 | from_url: /reference/configuration.html 49 | to_url: /reference/workflow-config-file.html 50 | 51 | - type: page 52 | from_url: /reference/multiple_inputs.html 53 | to_url: / 54 | 55 | - type: page 56 | from_url: /visualization/index.html 57 | to_url: /visualization/sharing.html 58 | 59 | - type: page 60 | from_url: /guides/index.html 61 | to_url: /guides/run-analysis-on-terra.html 62 | -------------------------------------------------------------------------------- /scripts/upload-to-s3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | bin="$(dirname "$0")" 5 | 6 | main() { 7 | local quiet=0 8 | 9 | for arg; do 10 | case "$arg" in 11 | --quiet) 12 | quiet=1 13 | shift;; 14 | *) 15 | break;; 16 | esac 17 | done 18 | 19 | local src="${1:?A source file is required as the first argument.}" 20 | local dst="${2:?A destination s3:// URL is required as the second argument.}" 21 | 22 | local s3path="${dst#s3://}" 23 | local bucket="${s3path%%/*}" 24 | local key="${s3path#*/}" 25 | 26 | local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 27 | src_hash="$("$bin/sha256sum" < "$src")" 28 | dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text || echo "$no_hash")" 29 | 30 | echo "$src_hash $src" 31 | echo "$dst_hash $dst" 32 | 33 | if [[ $src_hash != "$dst_hash" ]]; then 34 | echo "Uploading $src → $dst" 35 | aws s3 cp --no-progress "$src" "$dst" --metadata sha256sum="$src_hash" "$(content-type "$dst")" 36 | else 37 | echo "Files are identical, skipping upload" 38 | fi 39 | } 40 | 41 | content-type() { 42 | case "$1" in 43 | *.tsv) echo --content-type=text/tab-separated-values;; 44 | *.csv) echo --content-type=text/comma-separated-values;; 45 | *.ndjson) echo --content-type=application/x-ndjson;; 46 | *.json) echo --content-type=application/json;; 47 | *.gz) echo --content-type=application/gzip;; 48 | *.xz) echo --content-type=application/x-xz;; 49 | *) echo --content-type=text/plain;; 50 | esac 51 | } 52 | 53 | main "$@" 54 | -------------------------------------------------------------------------------- /scripts/add_priorities_to_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add column to metadata with the priorities of 'context' 
sequences 3 | relative to the 'focal' samples 4 | """ 5 | 6 | import argparse 7 | import pandas as pd 8 | import csv 9 | import json 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser( 13 | description="Add columns for priorities of sequences relative to diff focal regions", 14 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 15 | ) 16 | parser.add_argument("--metadata", type = str, required=True, help="metadata") 17 | parser.add_argument("--priorities", type=str, nargs="+", required=True, help="priorities files") 18 | parser.add_argument("--config", type=str, help="config file to modify") 19 | parser.add_argument("--output-meta", type=str, required=True, help="adjusted metadata") 20 | parser.add_argument("--output-config", type=str, help="modified config") 21 | args = parser.parse_args() 22 | 23 | metadata = pd.read_csv(args.metadata, sep='\t') 24 | with open(args.config) as fh: 25 | input_json = json.load(fh) 26 | 27 | for priority_file in args.priorities: 28 | p_f = priority_file.replace(".tsv", "") 29 | region = p_f.split("_")[2] 30 | column_name = "".join(["priorities_",region]) 31 | 32 | with open(priority_file, 'r') as f: 33 | reader = csv.reader(f, delimiter='\t') 34 | priors = {r[0]: r[1] for r in reader if len(r)>1} 35 | 36 | assign_priors = [priors[st] if st in priors else "" for st in metadata.strain] 37 | 38 | metadata.insert(11, column_name, assign_priors) 39 | input_json['colorings'].append({'key': column_name, 'type': 'continuous'}) 40 | 41 | metadata.to_csv(args.output_meta, index=False, sep="\t") 42 | 43 | with open(args.output_config, 'w') as fh: 44 | json.dump(input_json, fh, indent=2) 45 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-gisaid.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild GISAID phylogenetic datasets 2 | 3 | on: 4 | # This workflow can be triggered from repository_dispatch events, 5 | # for instance, after the appropriate preprocessing actions have completed 6 | repository_dispatch: 7 | types: 8 | - rebuild 9 | - gisaid/rebuild 10 | # Manually triggered using GitHub's UI 11 | workflow_dispatch: 12 | inputs: 13 | trial_name: 14 | description: "Short name for this trial build, for prefixing the uploaded data and results files. WARNING: without this we will overwrite files in s3://nextstrain-ncov-private and the trees on nextstrain.org/ncov/gisaid..." 15 | required: false 16 | image: 17 | description: 'Specific container image to use for build (will override the default of "nextstrain build")' 18 | required: false 19 | 20 | jobs: 21 | gisaid: 22 | permissions: 23 | id-token: write 24 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 25 | secrets: inherit 26 | with: 27 | runtime: aws-batch 28 | env: | 29 | TRIAL_NAME: ${{ github.event.inputs.trial_name }} 30 | NEXTSTRAIN_DOCKER_IMAGE: ${{ github.event.inputs.image }} 31 | run: | 32 | set -x 33 | 34 | declare -a config 35 | config+=(build_date=\'$(date +'%Y-%m-%d')\') 36 | if [[ "$TRIAL_NAME" ]]; then 37 | config+=( 38 | S3_DST_BUCKET=nextstrain-ncov-private/trial/"$TRIAL_NAME" 39 | deploy_url=s3://nextstrain-staging/ 40 | auspice_json_prefix=ncov_gisaid_trial_"$TRIAL_NAME" 41 | ) 42 | else 43 | config+=(slack_token=$SLACK_TOKEN) 44 | fi 45 | 46 | nextstrain build \ 47 | --detach \ 48 | --cpus 72 \ 49 | --memory 140GiB \ 50 | . 
\ 51 | deploy \ 52 | upload \ 53 | --config "${config[@]}" \ 54 | --profile nextstrain_profiles/nextstrain-gisaid \ 55 | --set-threads tree=8 56 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-gisaid-21L.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild GISAID 21L phylogenetic datasets 2 | 3 | on: 4 | # This workflow can be triggered from repository_dispatch events, 5 | # for instance, after the appropriate preprocessing actions have completed 6 | repository_dispatch: 7 | types: 8 | - rebuild 9 | - gisaid/rebuild 10 | # Manually triggered using GitHub's UI 11 | workflow_dispatch: 12 | inputs: 13 | trial_name: 14 | description: "Short name for this trial build, for prefixing the uploaded data and results files. WARNING: without this we will overwrite files in s3://nextstrain-ncov-private and the trees on nextstrain.org/ncov/gisaid..." 15 | required: false 16 | image: 17 | description: 'Specific container image to use for build (will override the default of "nextstrain build")' 18 | required: false 19 | 20 | jobs: 21 | gisaid-21L: 22 | permissions: 23 | id-token: write 24 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 25 | secrets: inherit 26 | with: 27 | runtime: aws-batch 28 | env: | 29 | TRIAL_NAME: ${{ github.event.inputs.trial_name }} 30 | NEXTSTRAIN_DOCKER_IMAGE: ${{ github.event.inputs.image }} 31 | run: | 32 | set -x 33 | 34 | declare -a config 35 | config+=(build_date=\'$(date +'%Y-%m-%d')\') 36 | if [[ "$TRIAL_NAME" ]]; then 37 | config+=( 38 | S3_DST_BUCKET=nextstrain-ncov-private/trial/"$TRIAL_NAME" 39 | deploy_url=s3://nextstrain-staging/ 40 | auspice_json_prefix=ncov_gisaid_21L_trial_"$TRIAL_NAME" 41 | ) 42 | else 43 | config+=(slack_token=$SLACK_TOKEN) 44 | fi 45 | 46 | nextstrain build \ 47 | --detach \ 48 | --cpus 72 \ 49 | --memory 140GiB \ 50 | . \ 51 | deploy \ 52 | upload \ 53 | --config "${config[@]}" \ 54 | --profile nextstrain_profiles/nextstrain-gisaid-21L \ 55 | --set-threads tree=8 56 | -------------------------------------------------------------------------------- /workflow/wdl/ncov_workflow.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "tasks/nextstrain.wdl" as nextstrain 4 | 5 | workflow Nextstrain_WRKFLW { 6 | input { 7 | # ncov 8 | # Option 1: Pass in a sequence and metadata files, create a configfile_yaml 9 | File? sequence_fasta 10 | File? metadata_tsv 11 | File? context_targz #<= optional contextual seqs in a tarball 12 | String? build_name 13 | 14 | # Option 2: Use a custom config file (e.g. builds.yaml) with https or s3 sequence or metadata files 15 | File? configfile_yaml 16 | File? custom_zip # optional modifier: add a my_profiles.zip folder for my_auspice_config.json 17 | String? active_builds # optional modifier: specify "Wisconsin,Minnesota,Iowa" 18 | 19 | # Option 3? GISAID augur zip? 20 | # File? gisaid_zip # tarball 21 | 22 | # Optional Keys for deployment 23 | String? s3deploy 24 | String? AWS_ACCESS_KEY_ID 25 | String? AWS_SECRET_ACCESS_KEY 26 | 27 | # By default, run the ncov workflow (can swap it for zika or something else) 28 | String pathogen_giturl = "https://github.com/nextstrain/ncov/archive/refs/heads/master.zip" 29 | Int? cpu 30 | Int? memory # in GiB 31 | Int? 
disk_size 32 | } 33 | 34 | call nextstrain.nextstrain_build as build { 35 | input: 36 | # Option 1 37 | sequence_fasta = sequence_fasta, 38 | metadata_tsv = metadata_tsv, 39 | context_targz = context_targz, 40 | build_name = build_name, 41 | 42 | # Option 2 43 | configfile_yaml = configfile_yaml, 44 | custom_zip = custom_zip, 45 | active_builds = active_builds, 46 | 47 | # Optional deploy to s3 site 48 | s3deploy = s3deploy, 49 | AWS_ACCESS_KEY_ID = AWS_ACCESS_KEY_ID, 50 | AWS_SECRET_ACCESS_KEY = AWS_SECRET_ACCESS_KEY, 51 | 52 | pathogen_giturl = pathogen_giturl, 53 | cpu = cpu, 54 | memory = memory, 55 | disk_size = disk_size 56 | } 57 | 58 | output { 59 | File auspice_zip = build.auspice_zip 60 | File results_zip = build.results_zip 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-open.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild open (GenBank) phylogenetic datasets 2 | 3 | on: 4 | # This workflow can be triggered from repository_dispatch events, 5 | # for instance, after the appropriate preprocessing actions have completed 6 | repository_dispatch: 7 | types: 8 | - rebuild 9 | - open/rebuild 10 | - genbank/rebuild 11 | # Manually triggered using GitHub's UI 12 | workflow_dispatch: 13 | inputs: 14 | trial_name: 15 | description: "Short name for this trial build, for prefixing the uploaded data and results files. WARNING: without this we will overwrite files in s3://nextstrain-data/files/ncov/open/ and the trees on nextstrain.org/ncov/open/..." 16 | required: false 17 | image: 18 | description: 'Specific container image to use for build (will override the default of "nextstrain build")' 19 | required: false 20 | 21 | jobs: 22 | open: 23 | permissions: 24 | id-token: write 25 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 26 | secrets: inherit 27 | with: 28 | runtime: aws-batch 29 | env: | 30 | TRIAL_NAME: ${{ github.event.inputs.trial_name }} 31 | NEXTSTRAIN_DOCKER_IMAGE: ${{ github.event.inputs.image }} 32 | run: | 33 | set -x 34 | 35 | declare -a config 36 | config+=(build_date=\'$(date +'%Y-%m-%d')\') 37 | if [[ "$TRIAL_NAME" ]]; then 38 | config+=( 39 | S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/"$TRIAL_NAME" 40 | deploy_url=s3://nextstrain-staging/ 41 | auspice_json_prefix=ncov_open_trial_"$TRIAL_NAME" 42 | ) 43 | else 44 | config+=(slack_token=$SLACK_TOKEN) 45 | fi 46 | 47 | nextstrain build \ 48 | --detach \ 49 | --cpus 72 \ 50 | --memory 140GiB \ 51 | . 
\ 52 | deploy \ 53 | upload \ 54 | --config "${config[@]}" \ 55 | --profile nextstrain_profiles/nextstrain-open \ 56 | --set-threads tree=8 57 | -------------------------------------------------------------------------------- /nextstrain_profiles/100k/config-gisaid.yaml: -------------------------------------------------------------------------------- 1 | 2 | S3_DST_BUCKET: "nextstrain-ncov-private/100k" 3 | S3_DST_ORIGINS: [needed-for-workflow-but-unused] 4 | deploy_url: needed_for_workflow_but_unused 5 | 6 | custom_rules: 7 | - workflow/snakemake_rules/export_for_nextstrain.smk 8 | 9 | # Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment 10 | inputs: 11 | - name: gisaid 12 | metadata: "s3://nextstrain-ncov-private/metadata.tsv.zst" 13 | aligned: "s3://nextstrain-ncov-private/sequences.fasta.zst" 14 | skip_sanitize_metadata: true 15 | deduplicated: true 16 | 17 | builds: 18 | 100k: 19 | subsampling_scheme: 100k_scheme 20 | 21 | # mapping of remote: local files to be uploaded under S3_DST_BUCKET 22 | upload: 23 | metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz 24 | sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz 25 | 26 | # remove sequences without division label in US 27 | filter: 28 | exclude_where: "division='USA'" 29 | 30 | # We wish to subsample 50k in the previous 12 months and 50k prior to that. 31 | # Note 1: both --max-date and --min-date are inclusive of the boundary date, 32 | # so sequences from that date will be available to both sub-samples 33 | # Note 2: As we group by (year,month) the boundary month will be included 34 | # in both sub-samples and thus this month will be oversampled. 35 | # For instance, if the boundary is March 7th then `50k_early` will sample 36 | # the same number of genomes from the first week of March as each of the 37 | # preceding months; similarly `50k_late` will sample as many genomes from 38 | # the final ~3 weeks of March as each of the following full months. 
39 | # (see https://github.com/nextstrain/ncov/pull/1032#discussion_r1034087312) 40 | subsampling: 41 | 100k_scheme: 42 | 50k_early: 43 | group_by: "year month country" 44 | max_sequences: 50000 45 | max_date: "--max-date 1Y" 46 | 50k_late: 47 | group_by: "year month country" 48 | max_sequences: 50000 49 | min_date: "--min-date 1Y" 50 | -------------------------------------------------------------------------------- /scripts/adjust_regional_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add column to metadata to denote 'focal' samples based on supplied region 3 | Rewrite location, division and country for non-focal samples to be region 4 | """ 5 | 6 | import argparse 7 | import pandas as pd 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser( 11 | description="Add column to metadata to denote 'focal' samples based on supplied region", 12 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 13 | ) 14 | parser.add_argument("--metadata", type = str, required=True, help="metadata") 15 | parser.add_argument("--region", type=str, required=False, help="focal region") 16 | parser.add_argument("--country", type=str, required=False, help="focal country") 17 | parser.add_argument("--division", type=str, required=False, help="focal division") 18 | parser.add_argument("--location", type=str, required=False, help="focal location") 19 | parser.add_argument("--composite", type=str, required=False, help="composite sampling") 20 | parser.add_argument("--output", type=str, required=True, help="adjusted metadata") 21 | args = parser.parse_args() 22 | 23 | region_list = ["Asia", "Africa", "Europe", "North America", "Oceania", "South America"] 24 | 25 | metadata = pd.read_csv(args.metadata, delimiter='\t') 26 | 27 | # if in region list, then do the fixing 28 | if args.region in region_list: 29 | focal_region = args.region 30 | else: # otherwise just write out metadata as is, and proceed 31 | metadata.to_csv(args.output, index=False, sep="\t") 32 | exit() 33 | 34 | print("Adjusting metadata for focal region", args.region) 35 | 36 | 37 | metadata.insert(12, 'focal', True) 38 | 39 | metadata.loc[metadata.region != focal_region, 'focal'] = False 40 | metadata.loc[metadata.region != focal_region, 'location'] = "" 41 | metadata.loc[metadata.region != focal_region, 'division'] = metadata.region 42 | metadata.loc[metadata.region != focal_region, 'country'] = metadata.region 43 | 44 | metadata.to_csv(args.output, index=False, sep="\t") 45 | -------------------------------------------------------------------------------- /scripts/calculate_epiweek.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from augur.io import read_metadata 4 | from augur.utils import write_json 5 | import epiweeks 6 | import pandas as pd 7 | import re 8 | 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser( 12 | usage="Calculate epiweeks for dates in the given metadata", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 14 | ) 15 | parser.add_argument("--metadata", required=True, help="metadata with a 'date' column") 16 | parser.add_argument("--metadata-id-columns", default=["strain", "name", "Virus name"], nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'") 17 | parser.add_argument("--attribute-name", default="epiweek", help="name to store annotations of epiweeks in JSON output") 18 | 
parser.add_argument("--output-node-data", required=True, help="node data JSON with epiweek annotations") 19 | 20 | args = parser.parse_args() 21 | 22 | metadata = read_metadata( 23 | args.metadata, 24 | id_columns=args.metadata_id_columns, 25 | ) 26 | 27 | # Find records with unambiguous dates. These must be complete date-like 28 | # records in YYYY-MM-DD format. 29 | date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}$") 30 | has_complete_date = metadata["date"].astype(str).apply(lambda date: date_pattern.match(date) is not None) 31 | metadata_with_dates = metadata.loc[has_complete_date, ["date"]].copy() 32 | 33 | # Convert date strings to timestamps. 34 | metadata_with_dates["date"] = pd.to_datetime(metadata_with_dates["date"]) 35 | 36 | # Calculate epiweeks from date objects as a new annotation. 37 | metadata_with_dates["epiweek"] = metadata_with_dates["date"].apply(lambda date: epiweeks.Week.fromdate(date).cdcformat()) 38 | 39 | # Create a node data object with epiweeks. 40 | node_data = {} 41 | for index, record in metadata_with_dates.iterrows(): 42 | node_data[index] = { 43 | args.attribute_name: record["epiweek"], 44 | } 45 | 46 | # Save node data. 47 | write_json({"nodes": node_data}, args.output_node_data) 48 | -------------------------------------------------------------------------------- /docs/src/index.rst: -------------------------------------------------------------------------------- 1 | ***************************************************************** 2 | A Getting Started Guide to the Genomic Epidemiology of SARS-CoV-2 3 | ***************************************************************** 4 | 5 | This is the documentation for the SARS-CoV-2 workflow maintained and actively used by the Nextstrain core team. 6 | 7 | In addition to the phylogenetic analysis described here, you can use `Nextclade `_ our "drag-and-drop" tool for clade assignment, mutation calling, and sequence quality control at `clades.nextstrain.org `_. 8 | 9 | If something in this documentation is broken or unclear, `open an issue `_ so we can improve it for everyone. 10 | 11 | If you have a specific question, `post a note on the discussion board `_ -- we're happy to help! 12 | 13 | .. toctree:: 14 | :maxdepth: 1 15 | :titlesonly: 16 | :caption: Tutorials 17 | :hidden: 18 | 19 | tutorial/intro 20 | tutorial/setup 21 | tutorial/example-data 22 | tutorial/custom-data 23 | tutorial/genomic-surveillance 24 | tutorial/next-steps 25 | tutorial/videos 26 | 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :titlesonly: 31 | :caption: Visualization & Interpretation 32 | :hidden: 33 | 34 | visualization/sharing 35 | visualization/interpretation 36 | visualization/narratives 37 | 38 | .. toctree:: 39 | :maxdepth: 1 40 | :titlesonly: 41 | :caption: Guides 42 | :hidden: 43 | 44 | guides/update-workflow 45 | guides/data-prep/index 46 | guides/workflow-config-file 47 | guides/customizing-visualization 48 | guides/run-analysis-on-terra 49 | 50 | .. toctree:: 51 | :maxdepth: 1 52 | :caption: Reference 53 | :hidden: 54 | 55 | reference/nextstrain-overview 56 | reference/files 57 | reference/workflow-config-file 58 | reference/remote_inputs 59 | reference/metadata-fields 60 | reference/naming_clades 61 | reference/data_submitter_faq 62 | reference/troubleshoot 63 | reference/change_log 64 | reference/glossary 65 | 66 | .. toctree:: 67 | :maxdepth: 1 68 | :titlesonly: 69 | :hidden: 70 | 71 | Stuck? Ask us on the discussion board. We're happy to help! 
72 | -------------------------------------------------------------------------------- /scripts/mask-alignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mask initial bases from alignment FASTA 3 | """ 4 | import argparse 5 | from augur.io import open_file, read_sequences, write_sequences 6 | import Bio 7 | import Bio.SeqIO 8 | from Bio.Seq import Seq 9 | 10 | def mask_terminal_gaps(seq): 11 | L = len(seq) 12 | seq_trimmed = seq.lstrip('-') 13 | left_gaps = L - len(seq_trimmed) 14 | seq_trimmed = seq_trimmed.rstrip('-') 15 | right_gaps = L - len(seq_trimmed) - left_gaps 16 | return "N"*left_gaps + seq_trimmed + "N"*right_gaps 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser( 21 | description="Mask initial bases from alignment FASTA", 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 23 | ) 24 | parser.add_argument("--alignment", required=True, help="FASTA file of alignment") 25 | parser.add_argument("--mask-terminal-gaps", action='store_true', help="fill all terminal gaps with N as they likely represent missing data") 26 | parser.add_argument("--mask-from-beginning", type = int, required=True, help="number of bases to mask from start") 27 | parser.add_argument("--mask-from-end", type = int, help="number of bases to mask from end") 28 | parser.add_argument("--mask-sites", nargs='+', type = int, help="list of sites to mask") 29 | parser.add_argument("--output", required=True, help="FASTA file of output alignment") 30 | args = parser.parse_args() 31 | 32 | begin_length = 0 33 | if args.mask_from_beginning: 34 | begin_length = args.mask_from_beginning 35 | end_length = 0 36 | if args.mask_from_end: 37 | end_length = args.mask_from_end 38 | 39 | with open_file(args.output, 'w') as outfile: 40 | for record in read_sequences(args.alignment): 41 | seq = str(record.seq) 42 | if args.mask_terminal_gaps: 43 | seq = mask_terminal_gaps(seq) 44 | 45 | start = "N" * begin_length 46 | middle = seq[begin_length:-end_length] 47 | end = "N" * end_length 48 | seq_list = list(start + middle + end) 49 | if args.mask_sites: 50 | for site in args.mask_sites: 51 | if seq_list[site-1]!='-': 52 | seq_list[site-1] = "N" 53 | record.seq = Seq("".join(seq_list)) 54 | write_sequences(record, outfile) 55 | -------------------------------------------------------------------------------- /scripts/priorities.py: -------------------------------------------------------------------------------- 1 | """ 2 | calculate priorties from index and proximities 3 | """ 4 | import argparse 5 | from random import shuffle 6 | from collections import defaultdict 7 | import numpy as np 8 | import pandas as pd 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser( 12 | description="generate priorities files based on genetic proximity to focal sample", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 14 | ) 15 | parser.add_argument("--sequence-index", type=str, required=True, help="sequence index file") 16 | parser.add_argument("--proximities", type = str, required=True, help="tsv file with proximities") 17 | parser.add_argument("--Nweight", type = float, default=0.003, required=False, help="parameterizes de-prioritization of incomplete sequences") 18 | parser.add_argument("--crowding-penalty", type = float, default=0.1, required=False, help="parameterizes how priorities decrease when there is many very similar sequences") 19 | parser.add_argument("--output", type=str, required=True, help="tsv file with the priorities") 
20 | args = parser.parse_args() 21 | 22 | proximities = pd.read_csv(args.proximities, sep='\t', index_col=0) 23 | index = pd.read_csv(args.sequence_index, sep='\t', index_col=0) 24 | combined = pd.concat([proximities, index], axis=1) 25 | 26 | closest_matches = combined.groupby('closest strain') 27 | candidates = {} 28 | for focal_seq, seqs in closest_matches.groups.items(): 29 | tmp = combined.loc[seqs, ["distance", "N"]] 30 | # penalize larger distances and more undetermined sites. 1/args.Nweight are 'as bad' as one extra mutation 31 | tmp["priority"] = -tmp.distance - tmp.N*args.Nweight 32 | name_prior = [(name, d.priority) for name, d in tmp.iterrows()] 33 | shuffle(name_prior) 34 | candidates[focal_seq] = sorted(name_prior, key=lambda x:x[1], reverse=True) 35 | 36 | # export priorities 37 | crowding = args.crowding_penalty 38 | with open(args.output, 'w') as fh: 39 | # loop over lists of sequences that are closest to particular focal sequences 40 | for cs in candidates.values(): 41 | # these sets have been sorted by priorities after shuffling -- reduce priorities in this shuffled/sorted order 42 | for i, (name, pr) in enumerate(cs): 43 | fh.write(f"{name}\t{pr-i*crowding:1.2f}\n") 44 | -------------------------------------------------------------------------------- /scripts/include_prefix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser( 6 | description="Rename strains to include specified prefix", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | parser.add_argument('--input-auspice', type=str, metavar="JSON", required=True, help="input Auspice JSON") 11 | parser.add_argument('--input-tip-frequencies', type=str, metavar="JSON", required=True, help="input tip frequencies JSON") 12 | parser.add_argument("--prefix", type=str, nargs='?', const='', help="prefix to add to strains") 13 | parser.add_argument('--output-auspice', type=str, metavar="JSON", required=True, help="output Auspice JSON") 14 | parser.add_argument('--output-tip-frequencies', type=str, metavar="JSON", required=True, help="output tip frequencies JSON") 15 | args = parser.parse_args() 16 | 17 | # update Auspice JSON 18 | with open(args.input_auspice, "r") as f: 19 | auspice_json = json.load(f) 20 | 21 | if args.prefix: 22 | def update_strain_names(n): # closure 23 | if "NODE_" not in n["name"] and args.prefix not in n["name"]: 24 | n["name"] = args.prefix + n["name"] 25 | 26 | if "children" in n: 27 | for c in n["children"]: 28 | update_strain_names(c) 29 | update_strain_names(auspice_json["tree"]) 30 | 31 | with open(args.output_auspice, 'w') as f: 32 | json.dump(auspice_json, f, indent=2) 33 | 34 | # update tip frequencies JSON 35 | with open(args.input_tip_frequencies, "r") as f: 36 | tip_frequencies_json = json.load(f) 37 | 38 | if args.prefix: 39 | modified_tip_frequencies_json = {} 40 | for key in tip_frequencies_json: 41 | if key != "generated_by" and key != "pivots": 42 | if "NODE_" not in key and args.prefix not in key: 43 | modified_tip_frequencies_json[args.prefix + key] = tip_frequencies_json[key] 44 | else: 45 | modified_tip_frequencies_json[key] = tip_frequencies_json[key] 46 | else: 47 | modified_tip_frequencies_json[key] = tip_frequencies_json[key] 48 | else: 49 | modified_tip_frequencies_json = tip_frequencies_json 50 | 51 | with open(args.output_tip_frequencies, 'w') as f: 52 | json.dump(modified_tip_frequencies_json, f, indent=2) 53 | 
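A minimal usage sketch for the script above (the option names mirror its argparse definitions; the file names and the "hCoV-19/" prefix are purely illustrative):

# Prepend a prefix to every tip name in an Auspice dataset and its
# tip-frequencies sidecar, leaving internal NODE_* names untouched.
python3 scripts/include_prefix.py \
    --input-auspice auspice/ncov_example.json \
    --input-tip-frequencies auspice/ncov_example_tip-frequencies.json \
    --prefix "hCoV-19/" \
    --output-auspice auspice/ncov_example_renamed.json \
    --output-tip-frequencies auspice/ncov_example_renamed_tip-frequencies.json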
-------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/purpose_of_sequencing.txt: -------------------------------------------------------------------------------- 1 | screened by S dropout S dropout 2 | Not screened by S dropout Not S dropout 3 | not screened by S dropout Not S dropout 4 | Screened by S Dropout S dropout 5 | Not screen by S dropout Not S dropout 6 | Not S dropout Not S dropout 7 | not screened by S drop out Not S dropout 8 | Screened by S dropout S dropout 9 | screened by S drop out S dropout 10 | screened for S dropout S dropout 11 | Screened by S drop out S dropout 12 | Screened for S gene Dropout S dropout 13 | Screened for S gene dropout S dropout 14 | Screened for S Gene Dropout S dropout 15 | Screened for S Gene Drop out S dropout 16 | Screened for S gene dropout - No S gene dropout Not S dropout 17 | Not Screened for S Drop out Not S dropout 18 | Not Screened for S Drop out - RT Not S dropout 19 | Not Screened for S Gene Dropout Not S dropout 20 | Screened for S gene Dropout - Negative for S gene dropout S dropout 21 | Screened for S gene dropout - Negative for S gene dropout S dropout 22 | Screened for S Gene Dropout - Negative for S Gene Dropout S dropout 23 | Screened for S Gene Dropout - Negative for S gene dropout S dropout 24 | Screened for S gene dropout - Negative for S dropout S dropout 25 | Screened for S Gene Dropout - Negative for S Dropout S dropout 26 | Screened for S gene dropout - Negative for S gene has a 69/70 deletion S dropout 27 | Screened for S dropout S dropout 28 | Screened for Spike gene dropout S dropout 29 | Not Screened for S Dropout Not S dropout 30 | Not screened for S gene dropout Not S dropout 31 | Not Screened for S gene dropout Not S dropout 32 | Not Screened for S Gene dropout Not S dropout 33 | Not Screened for the S Gene Dropout Not S dropout 34 | SGTF screening S dropout 35 | Screened for Variants of Concern (VoC) with C19-SPAR-Seq S dropout 36 | S gene screened S dropout 37 | Returning traveller with S-gene dropout S dropout 38 | S-gene dropout S dropout 39 | Not S dropout Not S dropout 40 | Baseline Surveillance Not S dropout 41 | Baseline surveillance Not S dropout 42 | SGTF Screening S dropout 43 | S-dropout S dropout 44 | SGTF screen S dropout 45 | SGTF S dropout 46 | Active surveillance, N-gene dropout N dropout 47 | Active Surveillance, N-gene dropout N dropout 48 | Baseline surveillance (random sampling) Not S dropout 49 | ICU patient, baseline surveillance Not S dropout 50 | Baseline Not S dropout 51 | Non-random selection: SGTF S dropout 52 | Non-random selection: Non-SGTF Not S dropout 53 | screened for SGTF S dropout 54 | S drop out S dropout 55 | -------------------------------------------------------------------------------- /scripts/add_labels.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from Bio import Phylo 4 | from collections import defaultdict 5 | 6 | def attach_labels(d, labeled_nodes): 7 | if "children" in d: 8 | for c in d["children"]: 9 | if c["name"] in labeled_nodes: 10 | if "labels" not in c["branch_attrs"]: 11 | c["branch_attrs"]["labels"] = {} 12 | c['branch_attrs']['labels']['mlabel'] = labeled_nodes[c["name"]][0] 13 | print(c['branch_attrs']['labels']) 14 | attach_labels(c, labeled_nodes) 15 | 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser( 19 | description="Remove extraneous colorings", 20 | 
formatter_class=argparse.ArgumentDefaultsHelpFormatter 21 | ) 22 | 23 | parser.add_argument('--input', type=str, metavar="JSON", required=True, help="input Auspice JSON") 24 | parser.add_argument('--tree', type=str, required=True, help="tree file") 25 | parser.add_argument('--clades', type=str, required=True, help="clades") 26 | parser.add_argument('--mutations', type=str, required=True, help="mutations") 27 | parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") 28 | args = parser.parse_args() 29 | 30 | T = Phylo.read(args.tree, 'newick') 31 | 32 | with open(args.mutations, "r") as f: 33 | mutation_json = json.load(f)['nodes'] 34 | 35 | with open(args.clades, "r") as f: 36 | clades_json = json.load(f)['nodes'] 37 | 38 | with open(args.input, "r") as f: 39 | input_json = json.load(f) 40 | 41 | nodes = {} 42 | for n in T.find_clades(order='postorder'): 43 | if n.is_terminal(): 44 | n.tip_count=1 45 | else: 46 | n.tip_count = sum([c.tip_count for c in n]) 47 | nodes[n.name] = {'tip_count':n.tip_count} 48 | 49 | labels = defaultdict(list) 50 | for node in nodes: 51 | for m in mutation_json[node]['muts']: 52 | if m[0] in 'ACGT' and m[-1] in 'ACGT': 53 | clade = clades_json[node]['clade_membership'] 54 | tmp_label = (clade, m) 55 | labels[tmp_label].append((node, nodes[node]['tip_count'])) 56 | 57 | labeled_nodes = defaultdict(list) 58 | for label in labels: 59 | node = sorted(labels[label], key=lambda x:-x[1])[0] 60 | labeled_nodes[node[0]].append('/'.join(label)) 61 | 62 | attach_labels(input_json["tree"], labeled_nodes) 63 | 64 | with open(args.output, 'w') as f: 65 | json.dump(input_json, f, indent=2) 66 | -------------------------------------------------------------------------------- /scripts/check_missing_locations.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser( 6 | description="Check for missing colors & locations", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | parser.add_argument('--metadata', type=str, nargs='+', required=True, help="input region adjusted metadata") 11 | parser.add_argument('--colors', type=str, nargs='+', required=True, help="input region specific color file") 12 | parser.add_argument('--latlong', type=str, required=True, help="input lat-long file") 13 | args = parser.parse_args() 14 | 15 | things_to_exclude_orig = ['Africa', 'Asia', 'South America', 'Europe', 16 | 'North America', 'Oceania', 'Grand princess cruise ship', 17 | 'diamond princess'] 18 | things_to_exclude = [x.lower() for x in things_to_exclude_orig] 19 | 20 | all_metadatas = [pd.read_csv(met, delimiter='\t') for met in args.metadata] 21 | metadata = pd.concat(all_metadatas, sort=False) 22 | all_colors = [pd.read_csv(col, delimiter='\t', header=None) for col in args.colors] 23 | colors = pd.concat(all_colors, sort=False) 24 | 25 | latlong = pd.read_csv(args.latlong, delimiter='\t', header=None) 26 | 27 | for geo_value in ['location', 'division', 'country']: 28 | locs_w_color_orig = colors.loc[colors[0]==geo_value,1].values 29 | locs_w_color = [x.lower() for x in locs_w_color_orig] 30 | locs_w_latlong_orig = latlong.loc[latlong[0]==geo_value,1].values 31 | locs_w_latlong = [x.lower() for x in locs_w_latlong_orig] 32 | locs_in_meta_orig = [x for x in metadata[geo_value].unique() if not pd.isna(x)] 33 | locs_in_meta = [x.lower() for x in locs_in_meta_orig] 34 | 35 | 
missing_color_locs = [loc for loc in locs_in_meta if loc not in locs_w_color and loc not in things_to_exclude] 36 | if missing_color_locs: 37 | print("The following {} are missing colors:".format(geo_value)) 38 | print(missing_color_locs) 39 | print("\n") 40 | 41 | if geo_value != 'country': 42 | missing_latlong_locs = [loc for loc in locs_in_meta if loc not in locs_w_latlong and loc not in things_to_exclude] 43 | if missing_latlong_locs: 44 | print("The following {} are missing lat-long values:".format(geo_value)) 45 | print(missing_latlong_locs) 46 | print("\n") 47 | 48 | print("Please remember this does *not* check lat-longs for countries!!") 49 | -------------------------------------------------------------------------------- /scripts/revert: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is intended to be run from the GitHub Action `revert.yml` but may also be run locally. 4 | # To run locally, you need three command line arguments (see below) as well as having AWS credentials appropriately set. 5 | 6 | set -x 7 | 8 | # data source name corresponding to builds to revert. "gisaid" or "open". 9 | data_source_name=$1 10 | 11 | # An individual regional dataset to revert. Options are "global", "africa", "asia", "europe", "north-america", "oceania", "south-america". If not specified, reverts all. 12 | # If you'd like to revert multiple regions but not all, run the script multiple times, specifying one region each time. 13 | build_region_name=$2 14 | 15 | # date to revert to e.g. yesterday in "+%Y-%m-%d" format. 16 | date=$3 17 | 18 | if [[ "$build_region_name" == "all" ]] 19 | then 20 | regions=" \ 21 | global \ 22 | africa \ 23 | asia \ 24 | europe \ 25 | north-america \ 26 | oceania \ 27 | south-america \ 28 | " 29 | else 30 | regions=$build_region_name 31 | fi 32 | 33 | missing_count=0 34 | for region in $regions; do 35 | if curl -fsLI "https://data.nextstrain.org/ncov_${data_source_name}_${region}_${date}.json" >/dev/null; then 36 | ## Download the date-stamped (auspice) JSONs, renaming them to the non-datestamped version. 37 | ## Note that if the tip-frequencies don't exist for some reason then the command (& action) 38 | ## will fail, but this is preferable to leaving a dataset in a mixed state. 39 | ## We skip the root-sequence JSON as this doesn't change day-to-day 40 | nextstrain remote download \ 41 | "s3://nextstrain-data/ncov_${data_source_name}_${region}_${date}.json" \ 42 | "ncov_${data_source_name}_${region}.json" 43 | nextstrain remote download \ 44 | "s3://nextstrain-data/ncov_${data_source_name}_${region}_${date}_tip-frequencies.json" \ 45 | "ncov_${data_source_name}_${region}_tip-frequencies.json" 46 | ## Upload these, overwriting the canonical (non-datestamped) datasets 47 | ## Note that we use the nextstrain-cli here as it performs cloudfront invalidation 48 | nextstrain remote upload \ 49 | "s3://nextstrain-data" \ 50 | "ncov_${data_source_name}_${region}.json" "ncov_${data_source_name}_${region}_tip-frequencies.json" 51 | else 52 | echo "WARNING: The requested dataset for ${data_source_name}_${region}_${date} doesn't exist and thus we can't revert to it." 
53 | ((missing_count++)) 54 | fi 55 | done 56 | 57 | exit $missing_count 58 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/info_ignore.txt: -------------------------------------------------------------------------------- 1 | Hospital 2 | Local 3 | imported 4 | Local from unknow 5 | Local from health 6 | Local from social 7 | Local from police 8 | Local from airport 9 | Hospital Delfina Torres Concha 10 | Dermatologist - contact with an infected pacient 11 | OKD Darkov 12 | 15/7 - 19/7 accommodation in a private cottage, contact with a positive 13 | Distorted ability to smell 14 | Outbreak in Ministry of Health 15 | unknown 16 | Patient with severe combined immunodeficiency (SCID) 17 | Nursing home establishments for aged and dependent individuals 18 | Migrants ship 19 | Unknown 20 | same patient as for sequence EPI_ISL_583466 21 | same patient as for sequence EPI_ISL_583469 22 | same patient as for sequence EPI_ISL_583472 23 | same patient as for sequence EPI_ISL_583475 24 | same patient as for sequence EPI_ISL_583427 25 | same patient as for sequence EPI_ISL_583479 26 | same patient as for sequence EPI_ISL_583428 27 | same patient as for sequence EPI_ISL_583431 28 | same patient as for sequence EPI_ISL_583439 29 | same patient as for sequence EPI_ISL_583444 30 | same patient as for sequence EPI_ISL_583447 31 | same patient as for sequence EPI_ISL_583456 32 | same patient as for sequence EPI_ISL_583461 33 | Airport Quarantine Station in Japan 34 | e.g. Patient infected while traveling in …. 35 | Jackson Memorial Hospital 36 | TRANSP 37 | QPS-MIA 38 | Sunset Pediatric 39 | Ausilio Mutuo 40 | Distorted ability to smell and taste 41 | Obesity 42 | Diarrhoea 43 | Imported case 44 | Symptomatic 45 | Asymptomatic 46 | General discomfort, headache, cough, myalgia, fever 47 | General discomfort, headache, cough, odynophagia 48 | Headache, cough, odynophagia 49 | General discomfort, headache, cough, myalgia, dysgeusia, fever 50 | Patient infected while traveling 51 | Traveled 52 | Airplane 53 | Airport 54 | Patient with travel history 55 | International traveler 56 | Patient seeking testing in a local hospital 57 | Hotel 58 | Hospital 59 | There is no abroad travel history in the last month. 60 | COVID-19 triage center 61 | Contact with a patient who lives in 62 | Imported 63 | Seaport Quarantine Station in Japan 64 | Unbiased surveillance 65 | unbiased surveillance 66 | Worker 67 | e.g. Cruise Ship, Convention, Live animal market 68 | Surveillance 69 | International Traveller 70 | - 71 | NA 72 | Traveller 73 | surveillance 74 | unknwon 75 | Local case 76 | Travel 77 | Contact with a patient with travel history 78 | -- 79 | None 80 | None 81 | With History of Travel: YES 82 | Community Sample 83 | #VALUE! 84 | infection 85 | Traveler 86 | Overseas Case 87 | Infection 88 | Other: local case 89 | Other: local 90 | Other: Patient Sample 91 | Out of State 92 | contact 93 | Other: 94 | other 95 | Overseas case 96 | Travelling 97 | -------------------------------------------------------------------------------- /scripts/tsv-cast-header: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | usage: tsv-cast-header 4 | 5 | Casts a into the header of . 6 | 7 | Fields are reordered, dropped, and added as necessary. Added fields will have 8 | blank values. 9 | 10 | No output will be emitted if has no rows. must have 11 | at least a header line. 
12 | 13 | All conversion is performed in a memory efficient manner, and inputs do not 14 | need to be seekable. 15 | 16 | --- 17 | 18 | This program exists because both `tsv-append` (from tsv-utils) and `csvtk 19 | concat` are by themselves unsuitable for this task. 20 | 21 | `tsv-append` is header line aware, but not field aware: it assumes all inputs 22 | have the exact same header line and does no re-ordering, adding, or dropping of 23 | fields. It will happily mismatch fields between inputs and produce data lines 24 | with too few or too many fields. 25 | 26 | `csvtk concat` is field aware and DTRT, but it buffers each input completely 27 | into memory, making it a non-starter for large dataset sizes. 28 | """ 29 | import csv 30 | from argparse import ArgumentParser, RawDescriptionHelpFormatter 31 | from sys import stdin, stdout, stderr, exit 32 | 33 | 34 | cli = ArgumentParser( 35 | description = __doc__.strip().split("\n---\n", 1)[0].split("\n\n", 1)[1], 36 | epilog = __doc__.strip().split("\n---\n", 1)[1], 37 | formatter_class = RawDescriptionHelpFormatter) 38 | 39 | cli.add_argument("target", metavar = "") 40 | cli.add_argument("source", metavar = "") 41 | 42 | args = cli.parse_args() 43 | 44 | 45 | # Read header line of 46 | with open(args.target, "r", encoding = "utf-8", newline = "") as target: 47 | lines = csv.reader(target, dialect = "excel-tab") 48 | try: 49 | header = next(lines) 50 | except StopIteration: 51 | print(f"{cli.prog}: error: {args.target!r} (the target) appears to empty; it must contain at least a header line", file = stderr) 52 | exit(1) 53 | 54 | 55 | # Set up output for casting from one dict to another 56 | output = csv.DictWriter( 57 | stdout, 58 | header, 59 | restval = "", 60 | extrasaction = "ignore", 61 | dialect = "excel-tab", 62 | lineterminator = "\n") 63 | 64 | 65 | # Cast 66 | with open(args.source, "r", encoding = "utf-8", newline = "") as source: 67 | input = csv.DictReader(source, dialect = "excel-tab") 68 | 69 | for i, row in enumerate(input): 70 | if i == 0: 71 | if not set(input.fieldnames) & set(output.fieldnames): 72 | print(f"{cli.prog}: error: {args.target!r} (the target) and {args.source!r} (the source) share no fields; they must share at least one", file = stderr) 73 | exit(1) 74 | 75 | output.writeheader() 76 | 77 | output.writerow(row) 78 | -------------------------------------------------------------------------------- /narratives/ncov_template_narrative.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Genomic analysis of COVID-19 spread. 3 | authors: 4 | - Name 1 5 | - Name 2 6 | 7 | authorLinks: 8 | - https://author1.org 9 | - https://author2.io 10 | affiliations: "Fred Hutch, Seattle, USA; Biozentrum, Basel, Switzerland; CZI, CA, USA" 11 | 12 | license: "CC-BY" 13 | licenseLink: "https://creativecommons.org/licenses/by/4.0/" 14 | dataset: "https://nextstrain.org/ncov/gisaid/global/6m?legend=closed" # must be accessible to the auspice server running the narrative 15 | 16 | abstract: "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
17 | --- 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | # [SLIDE 1 TITLE](https://nextstrain.org/ncov/gisaid/global/6m?c=country) 27 | 28 | 29 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 30 | 31 | 32 | 33 | 34 | 35 | 36 | # [SLIDE 2 TITLE](https://nextstrain.org/ncov/gisaid/global/6m?c=region) 37 | 38 | 39 | 40 | [Including a link as an example; always end the line with a period or other 'letter' character](google.com)! 41 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 42 | 43 | 44 | ```auspiceMainDisplayMarkdown 45 | # Example using markdown for the right side 46 | We can also replace the right side view with whatever markdown contents we choose, including links, images, etc. 47 | ``` 48 | -------------------------------------------------------------------------------- /tests/different-inputs.t: -------------------------------------------------------------------------------- 1 | Integration tests for nCoV pipeline. 2 | 3 | Note that running these tests requires setup steps, and that each test can only 4 | run one-at-a-time due to the shared use of the test environment as otherwise 5 | snakemake may use intermediate files from previous runs, thus producing 6 | inconsistent test results. 7 | 8 | Cram should be run in an environment which can run the pipeline via 9 | `cram --preserve-env tests/different-inputs.t` or similar. 10 | 11 | Set-up test environment. We could set up the correct data inside $TMP for each test 12 | if we prefer. For simplicity, we create a directory "output". 13 | 14 | $ pushd "$TESTDIR" > /dev/null 15 | $ basename $( pwd ) 16 | tests 17 | $ rm -rf output && mkdir output && cd output 18 | $ cp -r ../../defaults . && cp -r ../../scripts . && mkdir data/ && cp ../../data/references* data/ 19 | $ cd ../.. 
20 | $ basename $( pwd ) 21 | ncov 22 | 23 | Test various input starting points, all from local (.xz) compressed files 24 | 25 | $ snakemake --directory tests/output --profile tests/local-inputs-compressed \ 26 | > auspice/ncov_test-local-compressed.json >tests/output/local-inputs-compressed.cram.log.txt 2>&1 27 | 28 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-local-compressed.json \ 29 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 30 | 31 | $ rm -rf tests/output/results 32 | 33 | Test various input starting points, all from remote (.xz) compressed files 34 | 35 | $ snakemake --directory tests/output --profile tests/remote-inputs-compressed \ 36 | > auspice/ncov_test-remote-compressed.json >tests/output/remote-inputs-compressed.cram.log.txt 2>&1 37 | 38 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-remote-compressed.json \ 39 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 40 | 41 | $ rm -rf tests/output/results tests/output/data/downloaded_test*compressed* 42 | 43 | Test various input starting points, all from local uncompressed files 44 | 45 | $ cp tests/local-inputs-compressed/data/*xz tests/local-inputs-uncompressed/data/ 46 | 47 | $ for i in tests/local-inputs-uncompressed/data/*.xz; do xz -d $i; done 48 | 49 | $ snakemake --directory tests/output --profile tests/local-inputs-uncompressed \ 50 | > auspice/ncov_test-local-uncompressed.json >tests/output/local-inputs-uncompressed.cram.log.txt 2>&1 51 | 52 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-local-uncompressed.json \ 53 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 54 | 55 | $ rm -rf tests/output/results tests/local-inputs-uncompressed/data/*.fasta tests/local-inputs-uncompressed/data/*.tsv 56 | 57 | Test various input starting points which support remote uncompressed files (this is a subset of available inputs) 58 | 59 | $ snakemake --directory tests/output --profile tests/remote-inputs-uncompressed \ 60 | > auspice/ncov_test-remote-uncompressed.json >tests/output/remote-inputs-uncompressed.cram.log.txt 2>&1 61 | 62 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-remote-uncompressed.json \ 63 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 64 | 65 | $ rm -rf tests/output/results data/downloaded_test*uncompressed* -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk: -------------------------------------------------------------------------------- 1 | rule clades_21L: 2 | input: 3 | clades = "defaults/clades.tsv", 4 | exclude_clades = "nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv", 5 | output: 6 | clades = "results/clades_21L.tsv", 7 | log: "logs/clades_21L.txt" 8 | benchmark: "benchmarks/clades_21L.txt" 9 | conda: config["conda_environment"] 10 | shell: 11 | r""" 12 | exec 2> {log:q} 13 | 14 | ./scripts/expand-clade-definitions {input.clades:q} \ 15 | | tsv-join \ 16 | --header \ 17 | --exclude \ 18 | --filter-file {input.exclude_clades:q} \ 19 | --key-fields clade \ 20 | > {output.clades:q} 21 | """ 22 | 23 | 24 | rule gisaid_21L_metadata: 25 | input: 26 | references = "data/references_metadata.tsv", 27 | metadata = path_or_url("s3://nextstrain-ncov-private/metadata.tsv.zst", keep_local=True), 28 | exclude_clades = "nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv", 29 | output: 30 | metadata = 
"results/gisaid_21L_metadata.tsv.zst", 31 | log: "logs/gisaid_21L_metadata.txt" 32 | benchmark: "benchmarks/gisaid_21L_metadata.txt" 33 | conda: config["conda_environment"] 34 | threads: 8 35 | shell: 36 | r""" 37 | exec 2> {log:q} 38 | 39 | ./scripts/tsv-cast-header \ 40 | <(unzstd < {input.metadata:q}) \ 41 | {input.references:q} \ 42 | | zstd \ 43 | > {output.metadata:q} 44 | 45 | < {input.metadata:q} \ 46 | unzstd \ 47 | | tsv-join \ 48 | --header \ 49 | --exclude \ 50 | --filter-file {input.exclude_clades:q} \ 51 | --key-fields clade \ 52 | --data-fields Nextstrain_clade \ 53 | | sed 1d \ 54 | | zstd -T$(({threads} - 2)) \ 55 | >> {output.metadata:q} 56 | """ 57 | 58 | 59 | rule gisaid_21L_strains: 60 | input: 61 | metadata = "results/gisaid_21L_metadata.tsv.zst", 62 | output: 63 | strains = "results/gisaid_21L_strains.txt", 64 | log: "logs/gisaid_21L_strains.txt" 65 | benchmark: "benchmarks/gisaid_21L_strains.txt" 66 | conda: config["conda_environment"] 67 | shell: 68 | r""" 69 | exec 2> {log:q} 70 | 71 | < {input.metadata:q} \ 72 | unzstd \ 73 | | tsv-select --header -f strain \ 74 | | sed 1d \ 75 | > {output.strains:q} 76 | """ 77 | 78 | 79 | rule gisaid_21L_aligned: 80 | input: 81 | references = "data/references_sequences.fasta", 82 | aligned = path_or_url("s3://nextstrain-ncov-private/aligned.fasta.zst", keep_local=True), 83 | strains = "results/gisaid_21L_strains.txt", 84 | output: 85 | aligned = "results/gisaid_21L_aligned.fasta.zst", 86 | log: "logs/gisaid_21L_aligned.txt" 87 | benchmark: "benchmarks/gisaid_21L_aligned.txt" 88 | conda: config["conda_environment"] 89 | threads: 8 90 | shell: 91 | r""" 92 | exec 2> {log:q} 93 | 94 | < {input.references:q} \ 95 | seqkit grep --by-name --pattern 21L \ 96 | | zstd \ 97 | > {output.aligned} 98 | 99 | < {input.aligned:q} \ 100 | unzstd \ 101 | | seqkit grep --by-name -f {input.strains:q} \ 102 | | zstd -T$(({threads} - 2)) \ 103 | >> {output.aligned:q} 104 | """ 105 | -------------------------------------------------------------------------------- /docs/src/reference/nextstrain-overview.rst: -------------------------------------------------------------------------------- 1 | Nextstrain overview 2 | =================== 3 | 4 | Nextstrain has two main parts: 5 | 6 | - :term:`docs.nextstrain.org:Augur` **performs the bioinformatic analyses** required to produce a tree, map, and other inferences from your input data. 7 | - The outputs of Augur form the input for :term:`docs.nextstrain.org:Auspice`, **which provides the visualizations** you see on Nextstrain.org 8 | 9 | You can find more information about how these tools fit together :doc:`here `. We'll come back to Auspice when we get to the :doc:`visualization <../visualization/sharing>` section. 10 | 11 | First, let's take a look at how Augur works. 12 | 13 | How bioinformatic analyses are managed 14 | -------------------------------------- 15 | 16 | At its core, Augur is a collection of Python scripts, each of which handles one step in the bioinformatic analyses necessary for visualization with Auspice. 17 | 18 | As you might imagine, keeping track of the input and output files from each step individually can get very confusing, very quickly. So, **to manage all of these steps, we use a workflow manager called Snakemake**. 19 | 20 | .. note:: 21 | 22 | There are many other workflow managers out there, such as Nextflow. While we fully encourage you to use whichever workflow tools you prefer, we only provide support and maintenance for Snakemake. 
23 | 24 | Snakemake is an incredibly powerful workflow manager with many complex features. For our purposes, though, we only need to understand a few things: 25 | 26 | - **Each step in a workflow is called a "rule."** The inputs, outputs, and shell commands for each step/rule are defined in a ``.smk`` file. 27 | - Each rule has a number of **parameters, which are specified in a ``.yaml`` file**. 28 | - Each rule produces **output (called a "dependency") which may be used as input to other rules**. 29 | 30 | Overview of a Nextstrain build 31 | ------------------------------ 32 | 33 | Below is an illustration of each step in a standard :term:`Nextstrain build `. Dependencies (output files from one step that act as input to the next) are indicated by grey arrows. Input files which must be provided are indicated with red outlines. As you can see in yellow, the final output is a JSON file for visualization in Auspice. 34 | 35 | Required input files (e.g. the sequence data generated in the `data preparation section <../guides/data-prep>`__, or other files which are part of this repo) are indicated with red outlines. We'll walk through each of these in detail in the next section. 36 | 37 | .. figure:: ../images/basic_nextstrain_build.png 38 | :alt: nextstrain_build 39 | 40 | Running multiple builds 41 | ----------------------- 42 | 43 | It is common practice to run several related builds. For example, to run one analysis on just your data and another analysis that incorporates background / contextual sequences, you could configure two different builds. 44 | 45 | The ncov workflow facilitates this through the ``builds`` section in a :term:`workflow config file `. This is covered in more detail in the :doc:`genomic surveillance tutorial <../tutorial/genomic-surveillance>`. 46 | 47 | We encourage you to take a look at `main_workflow.smk `__ to see what each rule is doing in more detail. 48 | 49 | .. note:: 50 | 51 | Not all of the rules included are essential, or may even be desirable for your analysis. Your workflow may be able to be made a lot simpler, depending on your goals. 52 | -------------------------------------------------------------------------------- /docs/src/guides/customizing-visualization.rst: -------------------------------------------------------------------------------- 1 | Customizing visualization 2 | ========================= 3 | 4 | Visualization options can be configured in either a :term:`workflow config file` or a :term:`Auspice config file`, depending on the option. 5 | 6 | .. contents:: Table of Contents 7 | :local: 8 | 9 | Options in the workflow config file 10 | ----------------------------------- 11 | 12 | These options can be coded into the workflow config file directly without requiring a custom Auspice config file. 13 | 14 | Custom color schemes 15 | ~~~~~~~~~~~~~~~~~~~~ 16 | 17 | To specify a custom color scale: 18 | 19 | 1. Add a ``colors.tsv`` file, where each line is a tab-delimited list of a metadata column name; a metadata value; and a corresponding hex code. Example: 20 | 21 | :: 22 | 23 | country Russia #5E1D9D 24 | country Serbia #4D22AD 25 | country Europe #4530BB 26 | ... 27 | 28 | 2. Update your workflow config file with a reference: 29 | 30 | .. code:: yaml 31 | 32 | files: 33 | colors: "my-ncov-analyses/colors.tsv" 34 | 35 | Changing the dataset description 36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | 38 | The dataset description, which appears below the visualizations, is read from a file which is specified in the workflow config file. 
Per-build description can be set by specifying them in the workflow config file: 39 | 40 | .. code:: yaml 41 | 42 | builds: 43 | north-america: # name of the build 44 | description: my-ncov-analyses/north-america-description.md 45 | 46 | If that is not provided, then a per-run description is used, also specified in the workflow config file: 47 | 48 | .. code:: yaml 49 | 50 | files: 51 | description: my-ncov-analyses/my_description.md 52 | 53 | Options in the Auspice config file 54 | ---------------------------------- 55 | 56 | These options require creating an Auspice config file, used to configure :term:`docs.nextstrain.org:Auspice`. It is specified in the workflow config file using the ``auspice_config`` entry. Example: 57 | 58 | .. code:: yaml 59 | 60 | auspice_config: ncov-tutorial/auspice-config-custom-data.json 61 | 62 | This overrides the default Auspice config file, ``defaults/auspice_config.json``. 63 | 64 | Adding custom metadata fields to color by 65 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 66 | 67 | 1. Add a :doc:`valid metadata column <./data-prep/local-data>` to your ``metadata.tsv`` 68 | 2. Add an entry to the ``colorings`` block of the Auspice config file: 69 | 70 | .. code:: json 71 | 72 | "colorings": [ 73 | { 74 | "key": "location", 75 | "title": "Location", 76 | "type": "categorical" 77 | }, 78 | { 79 | "key": "metadata_column_name", 80 | "title": "Display name for interface", 81 | "type": "" 82 | } 83 | ] 84 | 85 | Choosing defaults 86 | ~~~~~~~~~~~~~~~~~ 87 | 88 | You can specify the default view in the ``display_defaults`` block of an Auspice config file: 89 | 90 | .. code:: json 91 | 92 | "display_defaults": { 93 | "color_by": "division", 94 | "distance_measure": "num_date", 95 | "geo_resolution": "division", 96 | "map_triplicate": true, 97 | "branch_label": "none" 98 | }, 99 | 100 | Choosing panels to display 101 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 102 | 103 | Similarly, you can choose which panels to enable in the ``panels`` block: 104 | 105 | .. code:: json 106 | 107 | "panels": [ 108 | "tree", 109 | "map", 110 | "entropy" 111 | ] 112 | -------------------------------------------------------------------------------- /docs/src/reference/naming_clades.rst: -------------------------------------------------------------------------------- 1 | Clade Naming & Definitions 2 | ========================== 3 | 4 | The nomenclature used by Nextstrain to designate clades for SARS-CoV-2 is driven by the following objectives: 5 | 6 | - label genetically well defined clades that have reached significant frequency and geographic spread, 7 | - allow for transient clade designations that are elevated to major clades if they persist and rise in frequency, 8 | - provide memorable but informative names, 9 | - gracefully handle clade naming in the upcoming years as SARS-CoV-2 becomes a seasonal virus. 10 | 11 | .. contents:: Table of Contents 12 | :local: 13 | 14 | Major clades 15 | ------------ 16 | 17 | Definition 18 | ~~~~~~~~~~ 19 | 20 | We name a new major clade when it reaches a frequency of 20% globally at any time point. When calculating these frequencies, care has to be taken to achieve approximately even sampling of sequences in time and space since sequencing effort varies strongly between countries. A clade name consists of the year it emerged and the next available letter in the alphabet. A new clade should be at least 2 mutations away from its parent major clade. 
21 | 22 | Naming 23 | ~~~~~~ 24 | 25 | We name major clades by the year they are estimated to have emerged and a letter, e.g. 19A, 19B, 20A. The yearly reset of letters will ensure that we don't progress too far into the alphabet, while the year-prefix provides immediate context on the origin of the clade, which will become increasingly important going forward. These are meant as major genetic groupings and are not intended to completely resolve genetic diversity. 26 | 27 | The hierarchical structure of clades is sometimes of interest. Here, the "derivation" of a major clade can be labeled with the familiar "." notation, as in 19A.20A.20C for the major clade 20C. 28 | 29 | Subclades 30 | --------- 31 | 32 | Within these major clades, we name subclades, which we label by their parent clade and the nucleotide mutation(s) that define them (e.g. 19A/28688C). It should be noted, however, that these mutations are only meaningful in that they define the clade. Once a subclade meets the (soft) criteria on frequency, spread, and genetic distinctiveness, it will be renamed to a major clade (hypothetically, 19A/28688C to 20D). 33 | 34 | Current Clades 35 | -------------- 36 | 37 | You can view the current clades on the `GISAID reference dataset `__ or `open reference dataset `__. 38 | 39 | Identifying Nextstrain Clades 40 | ----------------------------- 41 | 42 | To make it easy for users to identify the Nextstrain clade of their own sequences, we provide a clade assignment tool at `clades.nextstrain.org `__. In addition to assigning clades, this tool calls mutations in your sequences relative to the reference and performs some basic QC. 43 | 44 | You can also use the `simple python script `__ to assign appropriate clades to sequences in a FASTA file. This script is part of the ``ncov`` GitHub repository, but it does not require running any other part of the workflow. However, ``augur`` :doc:`must be installed ` to run the script. 45 | 46 | Note that when running this script you can supply ``--sequences`` if your sequences still need to be aligned. If you have already aligned your sequences to the ``ncov`` repository reference (for example, from running this workflow), you can supply ``--alignment``. If you supply sequences that are not aligned to the ``ncov`` reference, you may get bad results!
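For instance, a minimal invocation might look like the sketch below. The script path is an assumption about where the clade assignment script lives in this repository, and the FASTA name is only illustrative; the ``--sequences`` and ``--alignment`` options are the ones described above.

.. code:: bash

   # Hypothetical paths/filenames; adjust to your checkout and data.
   # Use --alignment instead of --sequences if your FASTA is already
   # aligned to the ncov reference.
   python3 scripts/assign_clades.py --sequences my_sequences.fasta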
47 | -------------------------------------------------------------------------------- /scripts/explicit_translation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from Bio import Phylo, SeqIO 4 | from Bio.Align import MultipleSeqAlignment 5 | from treetime import TreeAnc 6 | from augur.utils import load_features 7 | 8 | 9 | def annotation_json(features, reference): 10 | annotations = {} 11 | for fname, feat in features.items(): 12 | annotations[fname] = {'seqid':reference.id, 13 | 'type':feat.type, 14 | 'start':int(feat.location.start)+1, 15 | 'end':int(feat.location.end), 16 | 'strand': '+' if feat.location.strand else '-'} 17 | annotations['nuc'] = {'seqid':reference.id, 18 | 'type':'source', 19 | 'start': 1, 20 | 'end': len(reference), 21 | 'strand': '+'} 22 | return annotations 23 | 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser( 27 | description="Add translations", 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 29 | ) 30 | 31 | parser.add_argument('--tree', type=str, required=True, help="input tree") 32 | parser.add_argument('--reference', type=str, required=True, help="reference genbank sequence") 33 | parser.add_argument('--translations', type=str, nargs='+', required=True, help="amino acid alignment") 34 | parser.add_argument('--genes', type=str, nargs='+', required=True, help="amino acid alignment") 35 | parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") 36 | args = parser.parse_args() 37 | 38 | genes = args.genes if type(args.genes)==list else [args.genes] 39 | translations = args.translations if type(args.translations)==list else [args.translations] 40 | ref = SeqIO.read(args.reference, format='genbank') 41 | features = load_features(args.reference) 42 | 43 | if not set(features.keys())==set(args.genes): 44 | print("WARNING: supplied genes don't match the annotation") 45 | print("the following features are in the annotation by not supplied as genes:", set(features.keys()).difference(args.genes)) 46 | print("the following features are in the supplied as genes but not the annotation:", set(args.genes).difference(features.keys())) 47 | 48 | T = Phylo.read(args.tree, 'newick') 49 | leafs = {n.name for n in T.get_terminals()} 50 | 51 | node_data = {} 52 | root_sequence_translations = {} 53 | for gene, translation in zip(genes, translations): 54 | seqs = [] 55 | for s in SeqIO.parse(translation, 'fasta'): 56 | if s.id in leafs: 57 | seqs.append(s) 58 | 59 | 60 | tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa') 61 | 62 | tt.infer_ancestral_sequences(reconstruct_tip_states=True) 63 | root_sequence_translations[gene] = tt.sequence(tt.tree.root, as_string=True, reconstructed=True) 64 | 65 | with open(translation.replace('.fasta', '_withInternalNodes.fasta'), 'w') as fh: 66 | for n in tt.tree.find_clades(): 67 | if n.name not in node_data: 68 | node_data[n.name] = {"aa_muts":{}} 69 | if len(n.mutations): 70 | node_data[n.name]["aa_muts"][gene] = [f"{a}{p+1}{d}" for a,p,d in n.mutations] 71 | fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n") 72 | 73 | annotations = annotation_json(features, ref) 74 | with open(args.output, 'w') as fh: 75 | json.dump({"nodes":node_data, "annotations":annotations, "reference":root_sequence_translations}, fh) 76 | -------------------------------------------------------------------------------- /docs/src/reference/troubleshoot.rst: 
-------------------------------------------------------------------------------- 1 | Troubleshoot common issues 2 | ========================== 3 | 4 | If you have a question that is not addressed here, please don't hesitate to `ask for help `__. 5 | 6 | My country / division does not show up on the map 7 | ------------------------------------------------- 8 | 9 | This is most often a result of the country / division not being present in `the file defining the latitude & longitude of each deme `__. Adding it to that file (and rerunning the Snakemake rules downstream of this) should fix the problem. 10 | 11 | My trait (e.g. division) is grey instead of colored 12 | --------------------------------------------------- 13 | 14 | We generate the colors from the ``colors`` rule in the Snakefile, which uses the `ordering TSV `__ to generate them. See :doc:`../guides/workflow-config-file` for more info. 15 | 16 | *A note about locations and colors:* Unless you want to specifically override the colors generated, it's usually easier to *add* information to the default ``ncov`` files, so that you can benefit from all the information already in those files. 17 | 18 | My genomes aren't included in the analysis 19 | ------------------------------------------ 20 | 21 | There are a few steps where sequences can be removed: 22 | 23 | - During the ``filter`` step: 24 | 25 | - Samples that are included in `the exclude file `__ are removed. 26 | - Samples that fail the filtering criteria, as defined in your :ref:`filter config `, are removed. 27 | 28 | - If you do not have any custom filtering criteria, the default filters in the `parameters.yaml `__ are applied. 29 | 30 | - Check the ``results/{build_name}/filtered_log.tsv`` file to see why each sequence was filtered. 31 | 32 | - Samples may be randomly removed during subsampling; see :doc:`../guides/workflow-config-file` for more info. 33 | - During the ``refine`` step, Augur can drop samples that deviate from the expected clock rate. Inspect the log file named like ``logs/refine_{build_name}.txt`` to look for samples filtered by this step. :ref:`See the refine configuration guide ` for details on the clock rate filter. 34 | 35 | Sequencing and alignment errors 36 | ------------------------------- 37 | 38 | Genome sequencing, bioinformatic processing of the raw data, and alignment of the sequences are all steps where errors can slip in. Such errors can distort the phylogenetic analysis. To keep sequences with known problems from distorting the analysis, we keep a list of problematic sequences in ``config/exclude.txt`` and filter them out. To facilitate spotting such problematic sequences, we added an additional quality control step that produces the file ``results/excluded_by_diagnostics.txt``. 39 | 40 | This file is the output of ``scripts/diagnostic.py`` and is produced by rule ``diagnostic``. It contains only those sequences with diagnostics exceeding thresholds and mirrors the format of ``config/exclude.txt``. These names could be added to ``config/exclude.txt`` for permanent exclusion. Note, however, that some sequences might look problematic due to alignment issues rather than intrinsic problems with the sequence. The flagged sequences will be excluded from the current run. 41 | 42 | To run only the sequence diagnostic, you can specify any of the three files above as a target, or use the ``diagnostic`` target: 43 | 44 | .. code:: bash 45 | 46 | nextstrain build ...
diagnostic 47 | -------------------------------------------------------------------------------- /scripts/find_clusters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from augur.io import read_metadata 4 | from augur.utils import read_tree, read_node_data 5 | from collections import Counter 6 | import csv 7 | import hashlib 8 | 9 | MAX_HASH_LENGTH = 7 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser( 14 | description="Find polytomies in a given tree that all belong to the same metadata group", 15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 16 | ) 17 | parser.add_argument("--tree", required=True, help="Newick tree") 18 | parser.add_argument("--metadata", required=True, help="metadata") 19 | parser.add_argument("--mutations", required=True, help="mutations node data JSON") 20 | parser.add_argument("--attribute-name", default="cluster_id", help="name of attribute to store in output JSON") 21 | parser.add_argument("--group-by", default="division", help="identify polytomies where all tips are in the same group") 22 | parser.add_argument("--min-tips", type=int, default=3, help="minimum tips per polytomy to be consider as a cluster") 23 | parser.add_argument("--output", required=True, help="tab-delimited file with strain, cluster id, and group value for each strain") 24 | 25 | args = parser.parse_args() 26 | 27 | tree = read_tree(args.tree) 28 | tree.collapse_all(lambda c: c.branch_length < 1e-5) 29 | 30 | metadata = read_metadata(args.metadata) 31 | muts = read_node_data(args.mutations) 32 | attribute_name = args.attribute_name 33 | group_by = args.group_by 34 | 35 | polytomies = [] 36 | for node in tree.find_clades(terminal=False): 37 | if node == tree.root: 38 | continue 39 | 40 | count_by_group = Counter() 41 | polytomy_sequence_id = None 42 | for child in node.clades: 43 | if child.is_terminal() and child.name: 44 | child_muts_data = muts["nodes"].get(child.name, {}) 45 | any_muts = (len(child_muts_data.get("muts", [])) > 0) 46 | if not any_muts: 47 | count_by_group[metadata.loc[child.name, group_by]] += 1 48 | 49 | if polytomy_sequence_id is None and "sequence" in child_muts_data: 50 | polytomy_sequence_id = hashlib.sha256(child_muts_data["sequence"].encode()).hexdigest()[:MAX_HASH_LENGTH] 51 | 52 | if any(count >= args.min_tips for count in count_by_group.values()): 53 | polytomies.append({"node": node, "name": polytomy_sequence_id}) 54 | 55 | with open(args.output, "w") as oh: 56 | writer = csv.DictWriter( 57 | oh, 58 | fieldnames=( 59 | "strain", 60 | args.attribute_name, 61 | group_by 62 | ), 63 | delimiter="\t", 64 | lineterminator="\n" 65 | ) 66 | writer.writeheader() 67 | clusters = 0 68 | for polytomy_data in polytomies: 69 | polytomy = polytomy_data["node"] 70 | polytomy_sequence_id = polytomy_data["name"] 71 | 72 | if polytomy.name: 73 | writer.writerow({ 74 | "strain": polytomy.name, 75 | args.attribute_name: polytomy_sequence_id, 76 | group_by: metadata.loc[polytomy.name, group_by] 77 | }) 78 | 79 | for child in polytomy.clades: 80 | if child.is_terminal(): 81 | writer.writerow({ 82 | "strain": child.name, 83 | args.attribute_name: polytomy_sequence_id, 84 | group_by: metadata.loc[child.name, group_by] 85 | }) 86 | 87 | clusters += 1 88 | -------------------------------------------------------------------------------- /scripts/fetch_mlr_lineage_fitness.py: -------------------------------------------------------------------------------- 1 | import 
argparse 2 | import json 3 | import pandas as pd 4 | import requests 5 | import sys 6 | from augur.io import read_metadata 7 | from augur.utils import write_json 8 | 9 | # This script currently assumes a match on lineage fitness. It uses 10 | # https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json 11 | # that backs the live estimates on https://nextstrain.org/sars-cov-2/forecasts 12 | # This uses the "Nextclade_pango" metadata label to derive sequence counts from 13 | # GISAID data and estimate relative growth advantages across collapsed Pango 14 | # lineages. It will be most relevant for 1m, 2m and 6m builds, but is not at all 15 | # broken for the all-time builds. It would be possible to swap this to key on 16 | # clade instead, but I think the greater detail of lineages is better in this case. 17 | 18 | def fetch_growth_advantages(mlr_url): 19 | try: 20 | response = requests.get(mlr_url) 21 | response.raise_for_status() # Raise an exception for HTTP errors 22 | json_data = response.json() # Parse the JSON content 23 | data = json_data["data"] 24 | 25 | growth_advantages = {} 26 | for entry in data: 27 | if all(key in entry for key in ["location", "site", "variant", "value", "ps"]): 28 | if entry["location"] == "hierarchical" and entry["site"] == "ga" and entry["ps"] == "median": 29 | growth_advantages[entry["variant"]] = entry["value"] 30 | return growth_advantages 31 | except Exception as e: 32 | print(f"Error fetching the JSON file: {e}", file=sys.stderr) 33 | return None 34 | 35 | def main(): 36 | # Set up argument parser 37 | parser = argparse.ArgumentParser(description="Fetch MLR lineage fitness and match to strain-level metadata") 38 | parser.add_argument("--metadata", required=True, help="Path to the metadata TSV") 39 | parser.add_argument("--metadata-id-columns", default=["strain", "name", "Virus name"], nargs="+", help="List of columns to use as identifiers in the metadata file") 40 | parser.add_argument("--metadata-clade-attribute", default="Nextclade_pango", help="Metadata attribute to match against MLR variants") 41 | parser.add_argument("--mlr-url", default="https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json", help="URL to fetch the forecasts JSON data.") 42 | parser.add_argument("--output-node-data", required=True, help="Path to save the output JSON node data.") 43 | 44 | args = parser.parse_args() 45 | 46 | # Fetch the remote growth advantages 47 | growth_advantages = fetch_growth_advantages(args.mlr_url) 48 | 49 | # Load the local metadata 50 | try: 51 | metadata = read_metadata( 52 | args.metadata, 53 | id_columns=args.metadata_id_columns 54 | ) 55 | except FileNotFoundError as e: 56 | sys.exit(f"Error reading metadata file: {e}") 57 | 58 | # Match Nextclade_pango entries in metadata to the fetched growth advantages 59 | if growth_advantages: 60 | metadata[args.metadata_clade_attribute] = metadata[args.metadata_clade_attribute].map(growth_advantages) 61 | else: 62 | metadata[args.metadata_clade_attribute] = None 63 | 64 | # Create a node data object with growth advantages 65 | node_data = {} 66 | for index, record in metadata.iterrows(): 67 | node_data[index] = { 68 | "mlr_lineage_fitness": record[args.metadata_clade_attribute] if pd.notna(record[args.metadata_clade_attribute]) else None 69 | } 70 | 71 | # Save node data 72 | write_json({"nodes": node_data}, args.output_node_data) 73 | 74 | if __name__ == '__main__': 75 | try: 76 | main() 77 | except
Exception as e: 78 | print(f"An unexpected error occurred: {e}", file=sys.stderr) 79 | -------------------------------------------------------------------------------- /docs/src/tutorial/next-steps.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Next steps 3 | ========== 4 | 5 | Congratulations! You have completed all of the tutorials for the ncov workflow. Read on for some next steps. 6 | 7 | .. contents:: Table of Contents 8 | :local: 9 | 10 | .. _create-analysis-directory: 11 | 12 | Create your own analysis directory 13 | ================================== 14 | 15 | On a web browser: 16 | 17 | 1. `Sign up for a GitHub account `__ if you do not already have one. 18 | 2. Create a repository from the ``ncov-tutorial`` template repository: 19 | 20 | 1. Go to https://github.com/nextstrain/ncov-tutorial. 21 | 2. Click **Use this template**. 22 | 3. Give your repository a name. We recommend ``my-ncov-analyses`` and will use that name in the following steps. 23 | 4. Click **Create repository from template**. 24 | 25 | In a command prompt: 26 | 27 | 1. Go to the ``ncov/`` directory. 28 | 2. Clone your new repository, replacing ```` with your own username: 29 | 30 | .. code:: text 31 | 32 | git clone https://github.com//my-ncov-analyses 33 | 34 | 3. Read the next section to learn how to modify ``genomic-surveillance.yaml``. 35 | 36 | Modify the genomic surveillance workflow configuration 37 | ====================================================== 38 | 39 | Instead of an Idaho-focused workflow config, you can provide your own data for the ``custom_data`` input. Follow the same steps in the tutorial for GISAID download but select your own set of sequences and rename your ``metadata.tsv`` and ``sequences.fasta`` files accordingly. 40 | 41 | .. note:: 42 | 43 | Workflow run time increases with the number of sequences, and the GISAID web interface has a maximum of 5,000 sequences per download. 44 | 45 | Then, use the following steps to customize names, titles, and context: 46 | 47 | 1. Change the ``custom_data`` input filenames from ``idaho.metadata.tsv`` and ``idaho.sequences.fasta`` to your own. 48 | 2. Change the regional input dataset from North America to an appropriate region for your custom focal data. :doc:`See the complete list of available URLs <../reference/remote_inputs>`. 49 | 3. Rename the output dataset from ``idaho`` to your own. Note the name restrictions. 50 | 4. Reword the output dataset title to your own. 51 | 5. Rename the subsampling scheme from ``idaho_scheme`` to your own. Note the name restrictions. 52 | 6. For each sample, increase the ``max_sequences`` to your own. 53 | 7. Rename the ``usa_context`` sample and update the ``query`` accordingly. 54 | 55 | .. warning:: 56 | 57 | File paths in the :term:`config files ` must start with the :term:`analysis directory`. For example, in the tutorial: 58 | 59 | .. code:: yaml 60 | 61 | auspice_config: ncov-tutorial/auspice-config-custom-data.json 62 | 63 | Now that you have created your own analysis directory, this must be modified, e.g. 64 | 65 | .. 
code:: yaml 66 | 67 | auspice_config: my-ncov-analyses/auspice-config-custom-data.json 68 | 69 | Additional resources 70 | ==================== 71 | 72 | - Learn more about genomic epidemiology: 73 | 74 | - `An applied genomic epidemiological handbook `__ by Allison Black and Gytis Dudas 75 | - `Genomic Epidemiology Seminar Series `__ by Chan Zuckerberg Initiative Genomic Epidemiology (CZ GEN EPI) 76 | - `COVID-19 Genomic Epidemiology Toolkit `__ by Centers for Disease Control and Prevention (CDC) 77 | 78 | - :doc:`Review all possible options to configure your SARS-CoV-2 analyses with Nextstrain <../reference/workflow-config-file>`. 79 | - Watch `this 1-hour video overview `__ by Heather Blankenship on how to deploy Nextstrain for a Public Health lab. 80 | -------------------------------------------------------------------------------- /docs/src/reference/data_submitter_faq.rst: -------------------------------------------------------------------------------- 1 | Data Submitter's FAQ 2 | ==================== 3 | 4 | We often receive questions from data submitters about why their data is not visible on the `Nextstrain SARS-CoV-2 runs `__. This short FAQ highlights some of the main reasons why data may not be showing up on Nextstrain. 5 | 6 | Sequence Length & Number of N's 7 | ------------------------------- 8 | 9 | We currently only use full-genome sequences which are at least 27,000 bases in length. They also cannot have more than 3,000 bases that are 'N'. 10 | 11 | Subsampling 12 | ----------- 13 | 14 | Nextstrain runs can be subsampled considerably. There are over >30,000 whole-genome sequences available on GISAID currently, but we typically include <5,000 in each of our runs. If the division your samples are from contains more than about 100 samples per month, they are likely to be downsampled. Be sure to check the appropriate regional build - these are sampled more heavily from the focal region, so there's a higher chance a sequence will be included in the run. We have regional builds for `North America `__, `South America `__, `Asia `__, `Africa `__, `Europe `__, and `Oceania `__. 15 | 16 | Missing Dates 17 | ------------- 18 | 19 | We currently only include samples that have an **exact sampling date** (day, month, year). This is because we cannot accurately estimate the sample dates from the sequences at the moment, given the short duration of the pandemic so far, and the mutation rate. 20 | 21 | If your sample has only year or only month and year as a sampling date, it will be automatically excluded from runs. If you have privacy/data sharing concerns, it's ok to slightly change the collection date randomly by +/- 1 or 2 days. Please do *not* use the sequencing or processing date, as these can negatively influence our runs. 22 | 23 | If you wish to add a corrected date to your samples, simply updating the sampling date in GISAID will automatically update our system, and the sequence will be included in the next run! 24 | 25 | Many Samples with the Same Date 26 | ------------------------------- 27 | 28 | If we receive many samples that have identical dates as sample dates, we may exclude these manually. This is because this often indicates that the 'sample date' given is not actually the sample date, but the sequencing, processing, or uploading date. We try to email submitters when we do this to check whether the dates are truly the collection dates. 
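Before uploading, you can check your own metadata for these date issues with a small script. The sketch below is illustrative only: it assumes pandas is installed and that your metadata is a tab-delimited file with a ``date`` column (adjust the filename and column name to match your data).

.. code:: python

   # Illustrative only: flag incomplete dates and days with unusually many samples.
   import pandas as pd

   metadata = pd.read_csv("metadata.tsv", sep="\t", dtype=str)

   # Dates like "2021" or "2021-03" are incomplete and would be excluded from runs.
   complete = metadata["date"].str.match(r"^\d{4}-\d{2}-\d{2}$", na=False)
   print(f"{(~complete).sum()} records lack an exact YYYY-MM-DD collection date")

   # Many samples sharing one collection date may be worth double-checking.
   print(metadata.loc[complete, "date"].value_counts().head())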
29 | 30 | If you are genuinely submitting many sequences with identical dates, you can avoid us temporarily excluding them by emailing hello@nextstrain.org to let us know about the sequences and why they have the same date (ex: collected during investigation of a long-term care center). 31 | 32 | Missing USA State 33 | ----------------- 34 | 35 | We currently exclude samples from the USA which do not have a 'division' attribute (this is the USA state or territory where they were sampled). Adding a state/territory/division to your sample on GISAID will automatically update this on our system, and the sequence will appear in our next run. 36 | 37 | Divergence Issues 38 | ----------------- 39 | 40 | For quality control, we use a combination of automated and manual checks to ensure that included sequences appear to be free of sequencing and/or assembly errors. If a sequence is deemed to be far too divergent (has more mutations than we expect given the sampling date), or far too under-diverged (has far fewer mutations than we expect given the sampling date), it may be excluded. We cannot offer direct help in these cases, but suggest you revisit the raw sequence files with the aid of someone with experience using your sequencing pipeline, in order to correct any sequencing and assembly errors. 41 | -------------------------------------------------------------------------------- /docs/src/reference/files.rst: -------------------------------------------------------------------------------- 1 | Files overview 2 | ============== 3 | 4 | This page gives an overview of the files in your local ``ncov/`` directory. 5 | 6 | .. contents:: 7 | :local: 8 | 9 | User files 10 | ---------- 11 | 12 | User files are not tracked by version control, meaning they are either provided by the user or generated by the workflow. 13 | 14 | Analysis directory 15 | ~~~~~~~~~~~~~~~~~~ 16 | 17 | An :term:`analysis directory` is a non-tracked directory which contains user-defined :term:`customization files `. 18 | 19 | In the :doc:`tutorials <../tutorial/intro>`, the analysis directory is ``ncov-tutorial/``. Follow :ref:`these steps ` to create your own analysis directory. 20 | 21 | .. hint:: 22 | 23 | Previously, we recommended using Snakemake profiles under a ``my_profiles/`` analysis directory. We now recommend using Snakemake config files directly via the ``--configfile`` parameter. You can still use existing profiles via ``--configfile my_profiles//builds.yaml``. 24 | 25 | Input files 26 | ~~~~~~~~~~~ 27 | 28 | Learn how to prepare input files with :doc:`../guides/data-prep/index`. 29 | 30 | .. note:: 31 | 32 | A few example input files are provided when you clone ``ncov/`` locally, under ``data/``. 33 | 34 | - Metadata file (e.g. ``data/example_metadata.tsv``): tab-delimited description of strain (i.e., sample) attributes 35 | - Sequences file (e.g. ``data/example_sequences.fasta.gz``): genomic sequences whose ids must match the ``strain`` column in the metadata file. 36 | 37 | Output files and directories 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | These are generated by the workflow. 41 | 42 | - ``auspice/.json``: output file for visualization in Auspice where ```` is the name of your output dataset in the workflow configuration file used by ``--configfile``. 43 | - ``results/aligned.fasta``, etc.: raw results files (dependencies) that are shared across all datasets. 44 | - ``results//``: raw results files (dependencies) that are specific to a single dataset.
45 | - ``logs/``: Log files with error messages and other information about the run. 46 | - ``benchmarks/``: Run-times (and memory usage on Linux systems) for each rule in the workflow. 47 | 48 | Internal files 49 | -------------- 50 | 51 | These files are not intended for modification. See :doc:`../guides/workflow-config-file` on how to configure workflow behavior. 52 | 53 | Default workflow customization files 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | - ``defaults/parameters.yaml``: default :term:`config file`. Override these settings using ``--configfile your-config.yaml``. 57 | - ``defaults/auspice_config.json``: default :term:`Auspice config file`. Override these settings using ``auspice_config``. 58 | - ``defaults/include.txt``: default strain names to *include* during subsampling and filtering. 59 | - ``defaults/exclude.txt``: default strain names to *exclude* during subsampling and filtering. 60 | 61 | Workflow definition files 62 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 63 | 64 | - ``Snakefile``: entry point for Snakemake commands that also validates inputs. 65 | - ``workflow/snakemake_rules/main_workflow.smk``: defines rules for running each step in the analysis. Modify your workflow config file, rather than hardcode changes into the snakemake file itself. 66 | - ``workflow/envs/nextstrain.yaml``: specifies computing environment needed to run workflow with the ``--use-conda`` flag. 67 | - ``workflow/schemas/config.schema.yaml``: defines format (e.g., required fields and types) for workflow config files. 68 | - ``scripts/``: helper scripts for common tasks. 69 | 70 | Documentation 71 | ~~~~~~~~~~~~~ 72 | 73 | These files are used to generate the `workflow documentation `__. 74 | 75 | Nextstrain user files 76 | ~~~~~~~~~~~~~~~~~~~~~ 77 | 78 | The Nextstrain team maintains user files in the ``ncov/`` repo, under ``nextstrain_profiles/``. 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub release (latest by date)](https://img.shields.io/github/v/release/nextstrain/ncov)](https://github.com/nextstrain/ncov/releases) 2 | [![See recent changes](https://img.shields.io/badge/changelog-See%20recent%20changes-blue)](https://docs.nextstrain.org/projects/ncov/en/latest/reference/change_log.html) 3 | 4 | # About 5 | 6 | This repository analyzes viral genomes using [Nextstrain](https://nextstrain.org) to understand how SARS-CoV-2, the virus that is responsible for the COVID-19 pandemic, evolves and spreads. 7 | 8 | We maintain a number of publicly-available builds, visible at [nextstrain.org/ncov](https://nextstrain.org/ncov). 9 | 10 | [See our change log for details about backwards-incompatible or breaking changes to the workflow](https://docs.nextstrain.org/projects/ncov/en/latest/reference/change_log.html). 11 | 12 | Visit [the workflow documentation](https://docs.nextstrain.org/projects/ncov) for tutorials and reference material. 13 | 14 | ## Download formatted datasets 15 | 16 | The hCoV-19 / SARS-CoV-2 genomes were generously shared via GISAID. We gratefully acknowledge the Authors, Originating and Submitting laboratories of the genetic sequence and metadata made available through GISAID on which this research is based. 17 | 18 | In order to download the GISAID data to run the analysis yourself, please see [this guide](https://docs.nextstrain.org/projects/ncov/en/latest/analysis/data-prep.html). 
19 | > Please note that `data/metadata.tsv` is no longer included as part of this repo. However, we provide continually-updated, pre-formatted metadata & fasta files for download through GISAID. 20 | 21 | ## Read previous Situation Reports 22 | 23 | We issued weekly Situation Reports for the first ~5 months of the pandemic. You can find the Reports and their translations [here](https://nextstrain.org/ncov-sit-reps). 24 | 25 | ## FAQs 26 | 27 | - Can't find your sequences in Nextstrain? Check [here](./docs/data_faq.md) for common reasons why your sequences may not be appearing. 28 | You can also use [clades.nextstrain.org](https://clades.nextstrain.org/) to perform some basic quality control on your sequences. If they are flagged by this tool, they will likely be excluded by our pipeline. 29 | - For information about how clades are defined, and the currently named clades, please see [here](./docs/naming_clades.md). To assign clades to your own sequences, you can use our clade assignment tool at [clades.nextstrain.org](https://clades.nextstrain.org/). 30 | 31 | ## Bioinformatics notes 32 | 33 | Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of [8 × 10^-4 subs per site per year](http://virological.org/t/phylodynamic-analysis-176-genomes-6-mar-2020/356). There were SNPs present in the nCoV samples in the first and last few bases of the alignment that were masked as likely sequencing artifacts. 34 | 35 | # Contributing 36 | 37 | We welcome contributions from the community! Please note that we strictly adhere to the [Contributor Covenant Code of Conduct](https://github.com/nextstrain/.github/blob/master/CODE_OF_CONDUCT.md). 38 | 39 | ### Contributing to software or documentation 40 | 41 | Please see our [Contributor Guide](https://github.com/nextstrain/.github/blob/master/CONTRIBUTING.md) to get started! 42 | 43 | ### Contributing data 44 | 45 | **Please note that we automatically pick up any SARS-CoV-2 data that is submitted to GISAID.** 46 | 47 | If you're a lab and you'd like to get started sequencing, please see: 48 | * [Protocols from the ARTIC network](https://www.protocols.io/groups/artic/publications) 49 | * [Funding opportunities for sequencing efforts](https://twitter.com/firefoxx66/status/1242147905768751106) 50 | * Or, if these don't meet your needs, [get in touch](mailto:hello@nextstrain.org) 51 | 52 | --- 53 | 54 | # Get in touch 55 | 56 | To report a bug, error, or feature request, please [open an issue](https://github.com/nextstrain/ncov/issues). 57 | 58 | For questions, head over to the [discussion board](https://discussion.nextstrain.org/); we're happy to help! 59 | -------------------------------------------------------------------------------- /docs/src/guides/run-analysis-on-terra.rst: -------------------------------------------------------------------------------- 1 | ************************* 2 | Run the workflow on Terra 3 | ************************* 4 | 5 | Import ``ncov`` WDL workflow from Dockstore 6 | =========================================== 7 | 8 | 1. `Setup a Terra account `_ 9 | #. Navigate to Dockstore: `ncov:master`_ 10 | #. Top right corner, under **Launch with**, click on **Terra** 11 | #. Under "Workflow Name" set a name, can also leave default ``ncov``, and select your **Destination Workspace** in the drop down menu. 12 | #. Click button **IMPORT** 13 | #. 
In your workspace, click on the **WORKFLOWS** tab and verify that the imported workflow is shown as a card 14 | 15 | .. _`ncov:master`: https://dockstore.org/workflows/github.com/nextstrain/ncov:master?tab=info 16 | 17 | Upload your data files into Terra 18 | ================================= 19 | 20 | 1. Navigate to: `https://app.terra.bio/#upload`_. 21 | 22 | #. Select your workspace 23 | #. At the top, hit the **+** button to "create a collection" 24 | #. Within the collection, at bottom right, click the **+** button to upload a file, or drag and drop files to upload them. 25 | #. Go back to your Terra Dashboard 26 | #. Click on the **DATA** tab 27 | #. On the left, under **OTHER DATA**, click **Files** and there should be an "uploads/" folder shown to the right 28 | #. Click on "uploads/" to view your collection and verify that your files have been uploaded 29 | 30 | .. _`https://app.terra.bio/#upload`: https://app.terra.bio/#upload 31 | 32 | Connect your data files to the WDL workflow 33 | =========================================== 34 | 35 | 1. On the **DATA** tab, click on **+** next to the **TABLES** section to create a Data Table 36 | #. Download the "sample_template.tsv" file 37 | #. Create a tab-delimited file similar to the one below: 38 | 39 | :: 40 | 41 | entity:ncov_examples_id metadata sequences configfile_yaml 42 | example gs://COPY_PATH_HERE/example_metadata.tsv gs://COPY_PATH_HERE/example_datasets/example_sequences.fasta.gz 43 | example_build gs://COPY_PATH_HERE/example-build.yaml 44 | 45 | 4. Upload to **Tables** and you should get something like: 46 | 47 | .. image:: ../images/terra-datatable.png 48 | 49 | 5. Navigate back to the **Workflow** tab, and click on your imported "ncov" workflow 50 | #. Click on the radio button "Run workflow(s) with inputs defined by data table" 51 | #. Under **Step 1**, select your root entity type **ncov_examples** from the drop-down menu. 52 | #. Click on **SELECT DATA** to select all rows 53 | #. Most of the values will be blank, but fill in the values below: 54 | 55 | +-----------------+------------------+-------+----------------------+ 56 | |Task name | Variable | Type | Attribute | 57 | +=================+==================+=======+======================+ 58 | |Nextstrain_WRKFLW| build_name | String| this.ncov_example.id | 59 | +-----------------+------------------+-------+----------------------+ 60 | |Nextstrain_WRKFLW| configfile_yaml | File | this.configfile_yaml | 61 | +-----------------+------------------+-------+----------------------+ 62 | |Nextstrain_WRKFLW| metadata_tsv | File | this.metadata | 63 | +-----------------+------------------+-------+----------------------+ 64 | |Nextstrain_WRKFLW| sequence_fasta | File | this.sequences | 65 | +-----------------+------------------+-------+----------------------+ 66 | 67 | 10. Click on the **OUTPUTS** tab 68 | 11. Connect your generated output back to the data table by filling in these values: 69 | 70 | +-----------------+-----------------+-------+----------------------+ 71 | |Task name | Variable | Type | Attribute | 72 | +=================+=================+=======+======================+ 73 | |Nextstrain_WRKFLW| auspice_zip | File | this.auspice_zip | 74 | +-----------------+-----------------+-------+----------------------+ 75 | |Nextstrain_WRKFLW| results_zip | File | this.results_zip | 76 | +-----------------+-----------------+-------+----------------------+ 77 | 78 | 12. Click on **Save**, then click on **Run Analysis** 79 | #. Under the tab **JOB HISTORY**, verify that your job is running.
80 | #. When run is complete, check the **DATA** / **TABLES** / **ncov_examples** tab and download "auspice.zip" file 81 | -------------------------------------------------------------------------------- /defaults/auspice_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Genomic epidemiology of novel coronavirus", 3 | "build_url": "https://github.com/nextstrain/ncov", 4 | "maintainers": [ 5 | {"name": "the Nextstrain team", "url": "https://nextstrain.org/"} 6 | ], 7 | "colorings": [ 8 | { 9 | "key": "emerging_lineage", 10 | "title": "Emerging Lineage", 11 | "type": "categorical" 12 | }, 13 | { 14 | "key": "immune_escape", 15 | "title": "Immune Escape vs BA.2", 16 | "type": "continuous" 17 | }, 18 | { 19 | "key": "ace2_binding", 20 | "title": "ACE2 binding vs BA.2", 21 | "type": "continuous" 22 | }, 23 | { 24 | "key": "pango_lineage", 25 | "title": "PANGO Lineage", 26 | "type": "categorical" 27 | }, 28 | { 29 | "key": "GISAID_clade", 30 | "title": "GISAID Clade", 31 | "type": "categorical" 32 | }, 33 | { 34 | "key": "S1_mutations", 35 | "title": "S1 mutations", 36 | "type": "continuous" 37 | }, 38 | { 39 | "key": "mlr_lineage_fitness", 40 | "title": "MLR lineage fitness", 41 | "type": "continuous" 42 | }, 43 | { 44 | "key": "location", 45 | "title": "Location", 46 | "type": "categorical" 47 | }, 48 | { 49 | "key": "division", 50 | "title": "Admin Division", 51 | "type": "categorical" 52 | }, 53 | { 54 | "key": "country", 55 | "title": "Country", 56 | "type": "categorical" 57 | }, 58 | { 59 | "key": "region", 60 | "title": "Region", 61 | "type": "categorical" 62 | }, 63 | { 64 | "key": "host", 65 | "title": "Host", 66 | "type": "categorical" 67 | }, 68 | { 69 | "key": "age", 70 | "title": "Age", 71 | "type": "continuous" 72 | }, 73 | { 74 | "key": "sex", 75 | "title": "Sex", 76 | "type": "categorical" 77 | }, 78 | { 79 | "key": "author", 80 | "title": "Authors", 81 | "type": "categorical" 82 | }, 83 | { 84 | "key": "originating_lab", 85 | "title": "Originating Lab", 86 | "type": "categorical" 87 | }, 88 | { 89 | "key": "submitting_lab", 90 | "title": "Submitting Lab", 91 | "type": "categorical" 92 | }, 93 | { 94 | "key": "recency", 95 | "title": "Submission Date", 96 | "type": "categorical" 97 | }, 98 | { 99 | "key": "gisaid_epi_isl", 100 | "type": "categorical" 101 | }, 102 | { 103 | "key": "genbank_accession", 104 | "type": "categorical" 105 | }, 106 | { 107 | "key": "epiweek", 108 | "title": "Epiweek (CDC)", 109 | "type": "continuous" 110 | }, 111 | { 112 | "key": "QC_overall_score", 113 | "title": "Nextclade QC score", 114 | "type": "continuous" 115 | }, 116 | { 117 | "key": "QC_overall_status", 118 | "title": "Nextclade QC status", 119 | "type": "categorical" 120 | }, 121 | { 122 | "key": "reversion_mutations", 123 | "title": "Reversion mutations", 124 | "type": "continuous" 125 | }, 126 | { 127 | "key": "potential_contaminants", 128 | "title": "Potential contaminants", 129 | "type": "continuous" 130 | }, 131 | { 132 | "key": "rare_mutations", 133 | "title": "Rare mutations", 134 | "type": "continuous" 135 | } 136 | ], 137 | "geo_resolutions": [ 138 | "location", 139 | "division", 140 | "country", 141 | "region" 142 | ], 143 | "display_defaults": { 144 | "color_by": "clade_membership", 145 | "distance_measure": "num_date", 146 | "geo_resolution": "country", 147 | "map_triplicate": true, 148 | "branch_label": "clade" 149 | }, 150 | "filters": [ 151 | "recency", 152 | "clade_membership", 153 | "emerging_lineage", 154 | 
"region", 155 | "country", 156 | "division", 157 | "location", 158 | "host", 159 | "epiweek", 160 | "QC_overall_status" 161 | ], 162 | "panels": [ 163 | "tree", 164 | "map", 165 | "entropy", 166 | "frequencies" 167 | ] 168 | } 169 | -------------------------------------------------------------------------------- /docs/src/tutorial/example-data.rst: -------------------------------------------------------------------------------- 1 | Run using example data 2 | ====================== 3 | 4 | This first tutorial introduces our SARS-CoV-2 workflow. 5 | You will run the workflow using a small set of reference data that we provide. 6 | Subsequent tutorials present more complex scenarios that build on this approach. 7 | 8 | .. contents:: Table of Contents 9 | :local: 10 | 11 | Prerequisites 12 | ------------- 13 | 14 | 1. :doc:`setup`. These instructions will install all of the software you need to complete this tutorial and others. 15 | 16 | Setup 17 | ----- 18 | 19 | 1. Change directory to the ``ncov`` directory: 20 | 21 | .. code:: text 22 | 23 | cd ncov 24 | 25 | 2. Download the example tutorial repository into a new subdirectory of ``ncov/`` called ``ncov-tutorial/``: 26 | 27 | .. code:: text 28 | 29 | git clone https://github.com/nextstrain/ncov-tutorial 30 | 31 | Run the workflow 32 | ---------------- 33 | 34 | From within the ``ncov/`` directory, run the workflow using a :term:`configuration file ` provided in the tutorial directory: 35 | 36 | .. code:: text 37 | 38 | nextstrain build . --configfile ncov-tutorial/example-data.yaml 39 | 40 | Break down the command 41 | ~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | The workflow can take several minutes to run. While it is running, you can learn about the parts of this command: 44 | 45 | - ``nextstrain build .`` 46 | - This tells the :term:`docs.nextstrain.org:Nextstrain CLI` to :term:`build ` the workflow from ``.``, the current directory. All subsequent command-line arguments are passed to the workflow manager, Snakemake. 47 | - ``--configfile ncov-tutorial/example-data.yaml`` 48 | - ``--configfile`` is a Snakemake option used to `configure `__ the ncov workflow. It takes a file path as the value. 49 | - ``ncov-tutorial/example-data.yaml`` is the value given to ``--configfile``. It is a :term:`config file` that provides custom workflow configuration including inputs and outputs. The contents of this file with comments excluded are: 50 | 51 | .. code-block:: yaml 52 | 53 | inputs: 54 | - name: reference_data 55 | metadata: https://data.nextstrain.org/files/ncov/open/reference/metadata.tsv.xz 56 | sequences: https://data.nextstrain.org/files/ncov/open/reference/sequences.fasta.xz 57 | 58 | refine: 59 | root: "Wuhan-Hu-1/2019" 60 | 61 | The ``inputs`` entry provides the workflow with one input named ``reference_data``. The metadata and sequence files refer to a sample of approximately 300 sequences maintained by the Nextstrain team that represent all Nextstrain clades annotated for SARS-CoV-2. The workflow downloads these files directly from the associated URLs. :doc:`See the complete list of SARS-CoV-2 datasets we provide through data.nextstrain.org <../reference/remote_inputs>`. 62 | 63 | The ``refine`` entry specifies the root sequence for the example GenBank data. 64 | 65 | For more information, :doc:`see the workflow configuration file reference <../reference/workflow-config-file>`. 
66 | 67 | Running the workflow produces two new directories: 68 | 69 | - ``auspice/`` contains a few files that represent a Nextstrain :term:`docs.nextstrain.org:dataset` to be visualized in the following section. 70 | - ``results/`` contains intermediate files generated during workflow execution. 71 | 72 | Visualize the results 73 | --------------------- 74 | 75 | Run this command to start the :term:`docs.nextstrain.org:Auspice` server, providing ``auspice/`` as the directory containing output dataset files: 76 | 77 | .. code:: text 78 | 79 | nextstrain view auspice/ 80 | 81 | Navigate to http://127.0.0.1:4000/ncov/default-build. The resulting :term:`docs.nextstrain.org:dataset` should show a phylogeny of ~200 sequences: 82 | 83 | .. figure:: ../images/dataset-example-data.png 84 | :alt: Phylogenetic tree from the "example data" tutorial as visualized in Auspice 85 | 86 | To stop the server, press :kbd:`Control-C` on your keyboard. 87 | 88 | .. note:: 89 | 90 | You can also view the results by dragging the dataset files all at once onto `auspice.us `__: 91 | 92 | - ``auspice/ncov_default-build.json`` 93 | - ``auspice/ncov_default-build_root-sequence.json`` 94 | - ``auspice/ncov_default-build_tip-frequencies.json`` 95 | -------------------------------------------------------------------------------- /scripts/fix-colorings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import re 4 | from numpy import linspace 5 | from math import floor 6 | 7 | def adjust_coloring_for_epiweeks(dataset): 8 | """ 9 | If an auspice JSON specifies a colouring with the key "epiweek" (case sensitive) then we create a categorical 10 | colorscale which evenly spaces the canonical nextstrain rainbow across the observed time window. 11 | 12 | NOTE: epiweek must be in CDC format ("YYYYMM") but this may be relaxed to include ISO format in the future. 
13 | """ 14 | EPIKEY="epiweek" 15 | try: 16 | (cidx, coloring) = [(i, c) for i, c in enumerate(dataset['meta'].get("colorings", [])) if c['key']==EPIKEY][0] 17 | except IndexError: # coloring doesn't define an epiweek 18 | return 19 | 20 | # remove any duplicate coloring entries in the JSON to ensure the entry we edit is the one used by Auspice 21 | # (NOTE: this is augur bug https://github.com/nextstrain/augur/issues/719) 22 | dataset['meta']['colorings'] = [c for i,c in enumerate(dataset['meta']['colorings']) if not (c['key']==EPIKEY and i!=cidx)] 23 | 24 | # delay import to support older setups not using epiweeks package 25 | from epiweeks import Year, Week 26 | 27 | observed_values = set() 28 | def recurse(node): 29 | value = node.get("node_attrs", {}).get(EPIKEY, {}).get("value", False) 30 | if value: 31 | # we validate using both the epiweeks package and a regex (epiweeks will perform coercion of non-valid data into valid data) 32 | if not re.match(r'^(\d{4})(\d{2})$', value): 33 | raise(ValueError(f"Epiweek value {value} was not in format YYYYMM.")) 34 | week = Week.fromstring(value, system="cdc") # raises ValueError if not valid 35 | observed_values.add(week) 36 | for child in node.get("children", []): 37 | recurse(child) 38 | try: 39 | recurse(dataset["tree"]) 40 | except ValueError as e: 41 | print(str(e)) 42 | print("Skipping color scale creation for epiweek.") 43 | return 44 | observed_values = sorted(list(observed_values)) 45 | 46 | ## generate epiweeks across the entire observed range for color generation 47 | epiweeks = [ observed_values[0] ] 48 | while epiweeks[-1] < observed_values[-1]: 49 | epiweeks.append(epiweeks[-1]+1) 50 | ## generate rainbow colour scale across epiweeks. 51 | ## Since a "default" augur install does not include matplotlib, rather than interpolating between values in the scale 52 | ## we reuse them. This only applies when n(epiweeks)>30, where distinguising between colors is problematic anyway. 53 | rainbow = ["#511EA8", "#482BB6", "#4039C3", "#3F4ACA", "#3E5CD0", "#416CCE", "#447CCD", "#4989C4", "#4E96BC", "#559FB0", "#5DA8A4", "#66AE96", "#6FB388", "#7AB77C", "#85BA6F", "#91BC64", "#9DBE5A", "#AABD53", "#B6BD4B", "#C2BA46", "#CDB642", "#D6B03F", "#DDA83C", "#E29D39", "#E69036", "#E67F33", "#E56D30", "#E2592C", "#DF4428", "#DC2F24"] 54 | color_indicies = [floor(x) for x in linspace(0, len(rainbow), endpoint=False, num=len(epiweeks))] 55 | coloring['scale'] = [ 56 | [epiweek.cdcformat(), rainbow[color_indicies[i]]] 57 | for i,epiweek in enumerate(epiweeks) 58 | if epiweek in observed_values 59 | ] 60 | ## auspice will order the legend according to the provided color scale, so there is no need to set 61 | ## `coloring['legend']` unless we want to restrict this for some reason. 
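    ## Worked example (illustrative, not from a real run): if the only observed epiweeks are
    ## 202001 and 202010, `epiweeks` spans weeks 1-10 and the resulting scale is
    ## [["202001", "#511EA8"], ["202010", "#E2592C"]]. Unobserved weeks in between get no scale
    ## entry, but they still occupy colour positions so the gradient reflects elapsed time.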
62 | coloring['type'] = 'categorical' # force the scale type to be categorical 63 | 64 | if __name__ == '__main__': 65 | parser = argparse.ArgumentParser( 66 | description="Remove extraneous colorings", 67 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 68 | ) 69 | 70 | parser.add_argument('--input', type=str, metavar="JSON", required=True, help="input Auspice JSON") 71 | parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") 72 | args = parser.parse_args() 73 | 74 | with open(args.input, "r") as f: 75 | input_json = json.load(f) 76 | 77 | keys_to_remove = ["genbank_accession", "gisaid_epi_isl"] 78 | 79 | fixed_colorings = [] 80 | for coloring in input_json["meta"]["colorings"]: 81 | if coloring['key'] not in keys_to_remove: 82 | fixed_colorings.append(coloring) 83 | 84 | input_json["meta"]["colorings"] = fixed_colorings 85 | 86 | adjust_coloring_for_epiweeks(input_json) 87 | 88 | with open(args.output, 'w') as f: 89 | json.dump(input_json, f, indent=2) 90 | -------------------------------------------------------------------------------- /scripts/deprecated/parse_mutational_fitness_tsv_into_distance_map.py: -------------------------------------------------------------------------------- 1 | """ 2 | Supplementary data S2 from Obermeyer et al (https://www.science.org/doi/10.1126/science.abm1208) 3 | is a TSV table that maps mutations such as "S:D614G" to an estimate of "Δ log R" and is available via GitHub. 4 | Here, we convert this TSV table into a JSON compatable with the augur distance command. 5 | To update model run: 6 | 7 | python scripts/developer_scripts/parse_mutational_fitness_tsv_into_distance_map.py 8 | 9 | and the version the resulting changes to defaults/mutational_fitness_distance_map.json 10 | 11 | Updated model outputs are available at https://github.com/bkotzen/sars-cov2-modeling following: 12 | 13 | https://raw.githubusercontent.com/bkotzen/sars-cov2-modeling/main/2024-07-22/PyR0/mutations.tsv 14 | 15 | -------------------------------------------------------------- 16 | 17 | This analysis was removed from the workflow on 2025-01-23 18 | This was drawn from results at https://github.com/bkotzen/sars-cov2-modeling 19 | But this repo hasn't been updated since 2024-07-22 20 | If these results become updated more frequently, we should restore this analysis 21 | 22 | This was used in the workflow following: 23 | 24 | rule mutational_fitness: 25 | input: 26 | tree = "results/{build_name}/tree.nwk", 27 | alignments = lambda w: rules.translate.output.translations, 28 | distance_map = config["files"]["mutational_fitness_distance_map"] 29 | output: 30 | node_data = "results/{build_name}/mutational_fitness.json" 31 | benchmark: 32 | "benchmarks/mutational_fitness_{build_name}.txt" 33 | log: 34 | "logs/mutational_fitness_{build_name}.txt" 35 | params: 36 | genes = ' '.join(config.get('genes', ['S'])), 37 | compare_to = "root", 38 | attribute_name = "mutational_fitness" 39 | conda: 40 | config["conda_environment"], 41 | resources: 42 | mem_mb=2000 43 | shell: 44 | augur distance \ 45 | --tree {input.tree} \ 46 | --alignment {input.alignments} \ 47 | --gene-names {params.genes} \ 48 | --compare-to {params.compare_to} \ 49 | --attribute-name {params.attribute_name} \ 50 | --map {input.distance_map} \ 51 | --output {output} 2>&1 | tee {log} 52 | """ 53 | 54 | import argparse 55 | import pandas as pd 56 | import json 57 | 58 | if __name__ == '__main__': 59 | parser = argparse.ArgumentParser( 60 | description="Convert mutational fitness 
values to an Augur distance map", 61 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 62 | ) 63 | parser.add_argument("--input", default="https://raw.githubusercontent.com/bkotzen/sars-cov2-modeling/main/2024-07-22/PyR0/mutations.tsv", help="TSV file of mutational effects") 64 | parser.add_argument("--output", default="defaults/mutational_fitness_distance_map.json", help="JSON file for augur distance") 65 | args = parser.parse_args() 66 | 67 | # collect simple string mapping from TSV, ie 68 | # 'S:A522V': -0.00661378 69 | # 'ORF1a:T4304I': 0.00353199 70 | string_mapping = {} 71 | if args.input: 72 | df = pd.read_csv(args.input, delimiter='\t') 73 | for index, row in df.iterrows(): 74 | string_mapping[row["mutation"]] = float(row["Δ log R"]) 75 | 76 | # convert simple string mapping into structured mapping required by augur distance, ie 77 | # "map": { 78 | # "S": { 79 | # "522": [ 80 | # { 81 | # "from": "A", 82 | # "to": "V", 83 | # "weight": -0.00661378 84 | # }, 85 | structured_mapping = {} 86 | for mutation, delta_log_R in string_mapping.items(): 87 | gene, aa_change = mutation.split(":") 88 | if "STOP" not in aa_change: 89 | from_aa = aa_change[0] 90 | to_aa = aa_change[-1] 91 | pos_aa = aa_change[1:-1] 92 | if gene not in structured_mapping: 93 | structured_mapping[gene] = {} 94 | if pos_aa not in structured_mapping[gene]: 95 | structured_mapping[gene][pos_aa] = [] 96 | entry = {"from": from_aa, "to": to_aa, "weight": round(delta_log_R, 10)} 97 | structured_mapping[gene][pos_aa].append(entry) 98 | 99 | # output this mapping as an augur distance compatable JSON 100 | # include very slightly negative default to prevent heavily diverged artifactual genomes from 101 | # appearing as high fitness 102 | json_output = {"default": -0.003} 103 | json_output["map"] = structured_mapping 104 | 105 | print("writing mutational_fitness_distance_map.json to defaults/") 106 | with open(args.output, 'w') as f: 107 | json.dump(json_output, f, indent=2) 108 | -------------------------------------------------------------------------------- /scripts/combine_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from augur.io import open_file, read_metadata 3 | from Bio import SeqIO 4 | import csv 5 | import sys 6 | 7 | EMPTY = '' 8 | 9 | # This script was written in preparation for a future augur where commands 10 | # may take multiple metadata files, thus making this script unnecessary! 11 | # 12 | # Merging logic: 13 | # - Order of supplied TSVs matters 14 | # - All columns are included (i.e. union of all columns present) 15 | # - The last non-empty value read (from different TSVs) is used. I.e. values are overwritten. 16 | # - Missing data is represented by an empty string 17 | # 18 | # We use one-hot encoding to specify which origin(s) a piece of metadata came from 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | Custom script to combine metadata files from different origins. 24 | In the case where metadata files specify different values, the latter provided file will take priority. 25 | Columns will be added for each origin with values "yes" or "no" to identify the input source (origin) of each sample. 
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 28 | ) 29 | parser.add_argument('--metadata', required=True, nargs='+', metavar="TSV", help="Metadata files") 30 | parser.add_argument('--origins', required=True, nargs='+', metavar="STR", help="Names of origins (order should match provided metadata)") 31 | parser.add_argument('--output', required=True, metavar="TSV", help="Output (merged) metadata") 32 | args = parser.parse_args() 33 | return args 34 | 35 | if __name__ == '__main__': 36 | args = parse_args() 37 | try: 38 | assert(len(args.metadata)==len(args.origins)) 39 | assert(len(args.origins)>1) 40 | except AssertionError: 41 | print("Error. Please check your inputs - there must be the same number of metadata files as origins provided, and there must be more than one of each!") 42 | sys.exit(2) 43 | 44 | # READ IN METADATA FILES 45 | metadata = [] 46 | for (origin, fname) in zip(args.origins, args.metadata): 47 | data = read_metadata(fname) 48 | data.insert(0, "strain", data.index.values) 49 | columns = data.columns 50 | data = data.to_dict(orient="index") 51 | 52 | metadata.append({'origin': origin, "fname": fname, 'data': data, 'columns': columns, 'strains': {s for s in data.keys()}}) 53 | 54 | # SUMMARISE INPUT METADATA 55 | print(f"Parsed {len(metadata)} metadata TSVs") 56 | for m in metadata: 57 | print(f"\t{m['origin']} ({m['fname']}): {len(m['data'].keys())} strains x {len(m['columns'])} columns") 58 | 59 | # BUILD UP COLUMN NAMES FROM MULTIPLE INPUTS TO PRESERVE ORDER 60 | combined_columns = [] 61 | for m in metadata: 62 | combined_columns.extend([c for c in m['columns'] if c not in combined_columns]) 63 | combined_columns.extend(list(args.origins)) 64 | 65 | # ADD IN VALUES ONE BY ONE, OVERWRITING AS NECESSARY 66 | combined_data = metadata[0]['data'] 67 | for strain in combined_data: 68 | for column in combined_columns: 69 | if column not in combined_data[strain]: 70 | combined_data[strain][column] = EMPTY 71 | 72 | for idx in range(1, len(metadata)): 73 | for strain, row in metadata[idx]['data'].items(): 74 | if strain not in combined_data: 75 | combined_data[strain] = {c:EMPTY for c in combined_columns} 76 | for column in combined_columns: 77 | if column in row: 78 | existing_value = combined_data[strain][column] 79 | new_value = row[column] 80 | # overwrite _ANY_ existing value if the overwriting value is non empty (and different)! 
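                    # Illustrative example: if an earlier TSV has country="" and a later TSV has
                    # country="USA", the merged row keeps "USA". A later empty value never erases an
                    # earlier one, and a later non-empty value replaces a differing earlier value
                    # (printing a notice below when it overwrites non-empty data).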
81 | if new_value != EMPTY and new_value != existing_value: 82 | if existing_value != EMPTY: 83 | print(f"[{strain}::{column}] Overwriting {combined_data[strain][column]} with {new_value}") 84 | combined_data[strain][column] = new_value 85 | 86 | # one-hot encoding for origin 87 | # note that we use "yes" / "no" here as Booleans are problematic for `augur filter` 88 | for metadata_entry in metadata: 89 | origin = metadata_entry['origin'] 90 | for strain in combined_data: 91 | combined_data[strain][origin] = "yes" if strain in metadata_entry['strains'] else "no" 92 | 93 | print(f"Combined metadata: {len(combined_data.keys())} strains x {len(combined_columns)} columns") 94 | 95 | with open_file(args.output, 'w') as fh: 96 | tsv_writer = csv.writer(fh, delimiter='\t') 97 | tsv_writer.writerow(combined_columns) 98 | for row in combined_data.values(): 99 | tsv_writer.writerow([row[column] for column in combined_columns]) 100 | -------------------------------------------------------------------------------- /scripts/developer_scripts/get_population_weights: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # This script generates a TSV file containing country names and population sizes. 4 | 5 | import argparse 6 | import os 7 | from pathlib import Path 8 | import pandas as pd 9 | import ssl 10 | import urllib.request 11 | 12 | 13 | def download_source_data(path): 14 | # This is the link for "1950-2100, all scenarios"¹ on the UN population CSV 15 | # download page: 16 | url = 'https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_TotalPopulationBySex.csv.gz' 17 | 18 | # As of 2024-08-07, the URL requires a workaround to download programmatically: 19 | # 20 | # 21 | ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) 22 | ctx.options |= 0x4 # OP_LEGACY_SERVER_CONNECT 23 | response = urllib.request.urlopen(url, context=ctx) 24 | 25 | os.makedirs(os.path.dirname(path), exist_ok=True) 26 | with open(path, 'wb') as f: 27 | f.write(response.read()) 28 | 29 | 30 | def export_population_weights(output): 31 | csv = Path(os.path.dirname(__file__)) / "data/WPP2024_TotalPopulationBySex.csv.gz" 32 | source = 'the United Nations World Population Prospects' 33 | if os.path.exists(output): 34 | print(f'Source data already exists: {str(csv)!r}') 35 | print(f'... skipping download.') 36 | else: 37 | print(f'Downloading source data to {str(csv)!r}...') 38 | download_source_data(csv) 39 | 40 | print('Formatting data for output...') 41 | df = pd.read_csv(csv, usecols=['Location', 'LocTypeName', 'Time', 'PopTotal'], dtype='str') 42 | 43 | # Drop rows that represent aggregate regions/subregions/etc. 
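    # (For example, aggregate rows such as "World" or whole continents are dropped here,
    # leaving only individual countries and areas.)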
44 | df = df[df['LocTypeName'] == 'Country/Area'] 45 | 46 | # Use data from the latest non-forecast year 47 | year = '2023' 48 | df = df[df['Time'] == year] 49 | 50 | # Rename columns to match names in metadata 51 | column_name_map = { 52 | 'Location': 'country', 53 | 'PopTotal': 'weight', 54 | } 55 | df = df.rename(columns=column_name_map) 56 | 57 | # Keep only the columns used above 58 | df = df[column_name_map.values()] 59 | 60 | # Set country as index and sort alphabetically 61 | df = df.set_index('country') 62 | df = df.sort_index() 63 | 64 | # Rename countries to match values in metadata 65 | country_name_map = { 66 | "Bolivia (Plurinational State of)": "Bolivia", 67 | "Bonaire, Sint Eustatius and Saba": "Bonaire", 68 | "Brunei Darussalam": "Brunei", 69 | "China, Hong Kong SAR": "Hong Kong", 70 | "China, Macao SAR": "Macao", 71 | "China, Taiwan Province of China": "Taiwan", 72 | "Comoros": "Union of the Comoros", 73 | "Congo": "Republic of the Congo", 74 | "Curaçao": "Curacao", 75 | "Czechia": "Czech Republic", 76 | "Iran (Islamic Republic of)": "Iran", 77 | "Kosovo (under UNSC res. 1244)": "Kosovo", 78 | "Lao People's Democratic Republic": "Laos", 79 | "Micronesia (Fed. States of)": "Micronesia", 80 | "Republic of Korea": "South Korea", 81 | "Republic of Moldova": "Moldova", 82 | "Russian Federation": "Russia", 83 | "Saint Martin (French part)": "Saint Martin", 84 | "Sint Maarten (Dutch part)": "Sint Maarten", 85 | "State of Palestine": "Palestine", 86 | "Syrian Arab Republic": "Syria", 87 | "Türkiye": "Turkey", 88 | "United Republic of Tanzania": "Tanzania", 89 | "United States of America": "USA", 90 | "Venezuela (Bolivarian Republic of)": "Venezuela", 91 | "Viet Nam": "Vietnam", 92 | } 93 | df = df.rename(index=country_name_map) 94 | 95 | # Ensure int weights are written without decimals 96 | df['weight'] = pd.to_numeric(df['weight']).astype(int) 97 | 98 | print(f'Writing data to {str(output)!r}...') 99 | 100 | # Delete output file if it already exists 101 | if os.path.exists(output): 102 | os.remove(output) 103 | 104 | # Export 105 | with open(output, 'a') as f: 106 | print("# [DO NOT EDIT] This file was generated by scripts/developer_scripts/get_population_weights", file=f) 107 | print(f"# Based on {year} population estimates from {source}", file=f) 108 | df.to_csv(f, index=True, sep='\t') 109 | 110 | print('Done.') 111 | 112 | 113 | if __name__ == '__main__': 114 | parser = argparse.ArgumentParser( 115 | description="Create population sizes file", 116 | ) 117 | 118 | parser.add_argument('--output', type=str, metavar="FILE", 119 | default="defaults/population_weights.tsv", 120 | help="Path to output population sizes file", 121 | ) 122 | args = parser.parse_args() 123 | 124 | export_population_weights(args.output) 125 | -------------------------------------------------------------------------------- /defaults/population_weights.tsv: -------------------------------------------------------------------------------- 1 | # [DO NOT EDIT] This file was generated by scripts/developer_scripts/get_population_weights 2 | # Based on 2023 population estimates from the United Nations World Population Prospects 3 | country weight 4 | Afghanistan 41454 5 | Albania 2811 6 | Algeria 46164 7 | American Samoa 47 8 | Andorra 80 9 | Angola 36749 10 | Anguilla 14 11 | Antigua and Barbuda 93 12 | Argentina 45538 13 | Armenia 2943 14 | Aruba 107 15 | Australia 26451 16 | Austria 9130 17 | Azerbaijan 10318 18 | Bahamas 399 19 | Bahrain 1569 20 | Bangladesh 171466 21 | Barbados 282 22 | Belarus 9115 23 | 
Belgium 11712 24 | Belize 411 25 | Benin 14111 26 | Bermuda 64 27 | Bhutan 786 28 | Bolivia 12244 29 | Bonaire 29 30 | Bosnia and Herzegovina 3185 31 | Botswana 2480 32 | Brazil 211140 33 | British Virgin Islands 38 34 | Brunei 458 35 | Bulgaria 6795 36 | Burkina Faso 23025 37 | Burundi 13689 38 | Cabo Verde 522 39 | Cambodia 17423 40 | Cameroon 28372 41 | Canada 39299 42 | Cayman Islands 73 43 | Central African Republic 5152 44 | Chad 19319 45 | Chile 19658 46 | China 1422584 47 | Hong Kong 7442 48 | Macao 713 49 | Taiwan 23317 50 | Colombia 52321 51 | Union of the Comoros 850 52 | Republic of the Congo 6182 53 | Cook Islands 14 54 | Costa Rica 5105 55 | Croatia 3896 56 | Cuba 11019 57 | Curacao 185 58 | Cyprus 1344 59 | Czech Republic 10809 60 | Côte d'Ivoire 31165 61 | Dem. People's Republic of Korea 26418 62 | Democratic Republic of the Congo 105789 63 | Denmark 5948 64 | Djibouti 1152 65 | Dominica 66 66 | Dominican Republic 11331 67 | Ecuador 17980 68 | Egypt 114535 69 | El Salvador 6309 70 | Equatorial Guinea 1847 71 | Eritrea 3470 72 | Estonia 1367 73 | Eswatini 1230 74 | Ethiopia 128691 75 | Falkland Islands (Malvinas) 3 76 | Faroe Islands 54 77 | Fiji 924 78 | Finland 5601 79 | France 66438 80 | French Guiana 303 81 | French Polynesia 281 82 | Gabon 2484 83 | Gambia 2697 84 | Georgia 3807 85 | Germany 84548 86 | Ghana 33787 87 | Gibraltar 38 88 | Greece 10242 89 | Greenland 55 90 | Grenada 117 91 | Guadeloupe 376 92 | Guam 166 93 | Guatemala 18124 94 | Guernsey 64 95 | Guinea 14405 96 | Guinea-Bissau 2153 97 | Guyana 826 98 | Haiti 11637 99 | Holy See 0 100 | Honduras 10644 101 | Hungary 9686 102 | Iceland 387 103 | India 1438069 104 | Indonesia 281190 105 | Iran 90608 106 | Iraq 45074 107 | Ireland 5196 108 | Isle of Man 84 109 | Israel 9256 110 | Italy 59499 111 | Jamaica 2839 112 | Japan 124370 113 | Jersey 103 114 | Jordan 11439 115 | Kazakhstan 20330 116 | Kenya 55339 117 | Kiribati 132 118 | Kosovo 1700 119 | Kuwait 4838 120 | Kyrgyzstan 7073 121 | Laos 7664 122 | Latvia 1882 123 | Lebanon 5773 124 | Lesotho 2311 125 | Liberia 5493 126 | Libya 7305 127 | Liechtenstein 39 128 | Lithuania 2854 129 | Luxembourg 665 130 | Madagascar 31195 131 | Malawi 21104 132 | Malaysia 35126 133 | Maldives 525 134 | Mali 23769 135 | Malta 532 136 | Marshall Islands 38 137 | Martinique 346 138 | Mauritania 5022 139 | Mauritius 1273 140 | Mayotte 316 141 | Mexico 129739 142 | Micronesia 112 143 | Monaco 38 144 | Mongolia 3431 145 | Montenegro 633 146 | Montserrat 4 147 | Morocco 37712 148 | Mozambique 33635 149 | Myanmar 54133 150 | Namibia 2963 151 | Nauru 11 152 | Nepal 29694 153 | Netherlands 18092 154 | New Caledonia 289 155 | New Zealand 5172 156 | Nicaragua 6823 157 | Niger 26159 158 | Nigeria 227882 159 | Niue 1 160 | North Macedonia 1831 161 | Northern Mariana Islands 45 162 | Norway 5519 163 | Oman 5049 164 | Pakistan 247504 165 | Palau 17 166 | Panama 4458 167 | Papua New Guinea 10389 168 | Paraguay 6844 169 | Peru 33845 170 | Philippines 114891 171 | Poland 38762 172 | Portugal 10430 173 | Puerto Rico 3242 174 | Qatar 2979 175 | South Korea 51748 176 | Moldova 3067 177 | Romania 19118 178 | Russia 145440 179 | Rwanda 13954 180 | Réunion 874 181 | Saint Barthélemy 11 182 | Saint Helena 5 183 | Saint Kitts and Nevis 46 184 | Saint Lucia 179 185 | Saint Martin 27 186 | Saint Pierre and Miquelon 5 187 | Saint Vincent and the Grenadines 101 188 | Samoa 216 189 | San Marino 33 190 | Sao Tome and Principe 230 191 | Saudi Arabia 33264 192 | Senegal 18077 193 | Serbia 6773 194 | Seychelles 
127 195 | Sierra Leone 8460 196 | Singapore 5789 197 | Sint Maarten 42 198 | Slovakia 5518 199 | Slovenia 2118 200 | Solomon Islands 800 201 | Somalia 18358 202 | South Africa 63212 203 | South Sudan 11483 204 | Spain 47911 205 | Sri Lanka 22971 206 | Palestine 5409 207 | Sudan 50042 208 | Suriname 628 209 | Sweden 10551 210 | Switzerland 8870 211 | Syria 23594 212 | Tajikistan 10389 213 | Thailand 71702 214 | Timor-Leste 1384 215 | Togo 9304 216 | Tokelau 2 217 | Tonga 104 218 | Trinidad and Tobago 1502 219 | Tunisia 12200 220 | Turkmenistan 7364 221 | Turks and Caicos Islands 46 222 | Tuvalu 9 223 | Turkey 87270 224 | Uganda 48656 225 | Ukraine 37732 226 | United Arab Emirates 10642 227 | United Kingdom 68682 228 | Tanzania 66617 229 | United States Virgin Islands 85 230 | USA 343477 231 | Uruguay 3388 232 | Uzbekistan 35652 233 | Vanuatu 320 234 | Venezuela 28300 235 | Vietnam 100352 236 | Wallis and Futuna Islands 11 237 | Western Sahara 579 238 | Yemen 39390 239 | Zambia 20723 240 | Zimbabwe 16340 241 | --------------------------------------------------------------------------------