├── tests ├── .gitignore ├── local-inputs-compressed │ ├── data │ │ ├── .gitignore │ │ ├── asia_metadata.tsv.xz │ │ ├── asia_sequences.fasta.xz │ │ ├── europe_aligned.fasta.xz │ │ ├── europe_metadata.tsv.xz │ │ ├── oceania_masked.fasta.xz │ │ ├── oceania_metadata.tsv.xz │ │ ├── americas_filtered.fasta.xz │ │ └── americas_metadata.tsv.xz │ ├── config.yaml │ └── builds.yaml ├── local-inputs-uncompressed │ ├── data │ │ └── .gitignore │ ├── config.yaml │ └── builds.yaml ├── unsanitized_metadata.tar.gz ├── unsanitized_metadata.tsv ├── remote-inputs-compressed │ ├── config.yaml │ └── builds.yaml ├── remote-inputs-uncompressed │ ├── config.yaml │ └── builds.yaml ├── check_auspice_json.py └── different-inputs.t ├── defaults ├── include.txt ├── sites_ignored_for_tree_topology.txt ├── description.md ├── distance_maps │ └── VoC.json ├── annotation.gff ├── clade_hierarchy.tsv ├── clade_display_names.yml ├── clade_emergence_dates.tsv ├── clades_who.tsv ├── auspice_config.json └── population_weights.tsv ├── nextstrain_profiles ├── nextstrain-gisaid-21L │ ├── include.txt │ ├── config.yaml │ ├── exclude-clades.tsv │ └── prefilter.smk ├── nextstrain-ci │ ├── config.yaml │ └── builds.yaml ├── nextstrain-open │ └── config.yaml ├── nextstrain-gisaid │ ├── config.yaml │ └── legacy_clades.tsv └── 100k │ ├── config-open.yaml │ ├── README.md │ └── config-gisaid.yaml ├── data ├── example_sequences.fasta.gz ├── example_metadata_aus.tsv.xz ├── example_multiple_inputs.tar.xz ├── example_sequences_aus.fasta.xz ├── example_metadata_worldwide.tsv.xz ├── example_sequences_worldwide.fasta.xz └── references_metadata.tsv ├── docs ├── src │ ├── images │ │ ├── gisaid-login.png │ │ ├── gisaid-homepage.png │ │ ├── terra-datatable.png │ │ ├── dataset-custom-data.png │ │ ├── dataset-example-data.png │ │ ├── getting-started-tree.png │ │ ├── gisaid-epicov-search.png │ │ ├── gisaid-navigation-bar.png │ │ ├── gisaid-search-results.png │ │ ├── multiple_inputs_dag.png │ │ ├── basic_nextstrain_build.png │ │ ├── gisaid-downloads-window.png │ │ ├── dataset-genomic-surveillance.png │ │ ├── gisaid-epicov-navigation-bar.png │ │ ├── dataset-custom-data-highlighted.png │ │ ├── gisaid-augur-pipeline-download.png │ │ ├── gisaid-download-packages-window.png │ │ ├── gisaid-initial-search-interface.png │ │ ├── gisaid-search-download-window.png │ │ ├── gisaid-nextregions-download-window.png │ │ ├── gisaid-search-download-window-metadata.png │ │ ├── gisaid-select-sequences-10-highlighted.png │ │ ├── gisaid-search-download-window-sequences.png │ │ ├── gisaid-select-sequences-idaho-highlighted.png │ │ ├── gisaid-epicov-navigation-bar-with-downloads.png │ │ └── gisaid-nextregions-download-terms-and-conditions.png │ ├── tutorial │ │ ├── intro.rst │ │ ├── videos.rst │ │ ├── setup.rst │ │ ├── next-steps.rst │ │ └── example-data.rst │ ├── guides │ │ ├── data-prep │ │ │ └── index.rst │ │ ├── update-workflow.rst │ │ ├── customizing-visualization.rst │ │ └── run-analysis-on-terra.rst │ ├── visualization │ │ ├── narratives.rst │ │ └── interpretation.rst │ ├── _static │ │ └── css │ │ │ └── configuration-reference.css │ ├── reference │ │ ├── glossary.rst │ │ ├── nextstrain-overview.rst │ │ ├── naming_clades.rst │ │ ├── troubleshoot.rst │ │ ├── data_submitter_faq.rst │ │ └── files.rst │ └── index.rst ├── conda.yml ├── glossary.md ├── make.bat ├── README.md ├── Makefile ├── translation_docs.md └── redirects.yaml ├── workflow ├── wdl │ ├── genbank_ingest.json │ ├── ncov_workflow.json │ ├── gisaid_ingest.json │ ├── genbank_ingest.wdl │ ├── gisaid_ingest.wdl │ └── 
ncov_workflow.wdl ├── envs │ └── nextstrain.yaml └── schemas │ └── config.schema.yaml ├── .gitattributes ├── scripts ├── curate_metadata │ ├── requirements.txt │ ├── config_curate_metadata │ │ ├── internationalExceptions.txt │ │ ├── country_ordering │ │ │ ├── Austria_variants.txt │ │ │ └── Slovakia_variants.txt │ │ └── acceptedExposureAdditions.txt │ └── config_files_additional_info │ │ ├── variants.txt │ │ ├── location_pattern.txt │ │ ├── purpose_of_sequencing.txt │ │ └── info_ignore.txt ├── sha256sum ├── generate-scientific-credits.py ├── narrative-pdf-screens.sh ├── normalize_gisaid_fasta.sh ├── expand-clade-definitions ├── annotate_metadata_with_index.py ├── construct-recency-from-submission-date.py ├── rename_clades.py ├── upload-to-s3 ├── add_priorities_to_meta.py ├── adjust_regional_meta.py ├── calculate_epiweek.py ├── mask-alignment.py ├── priorities.py ├── include_prefix.py ├── add_labels.py ├── check_missing_locations.py ├── revert ├── tsv-cast-header ├── explicit_translation.py ├── find_clusters.py ├── fetch_mlr_lineage_fitness.py ├── fix-colorings.py ├── deprecated │ └── parse_mutational_fitness_tsv_into_distance_map.py ├── combine_metadata.py └── developer_scripts │ └── get_population_weights ├── readthedocs.yml ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── dependabot.yml ├── workflows │ ├── sync-redirects.yaml │ ├── ci.yaml │ ├── rebuild-100k.yml │ ├── revert.yml │ ├── rebuild-gisaid.yml │ ├── rebuild-gisaid-21L.yml │ └── rebuild-open.yml └── pull_request_template.md ├── my_profiles └── README.md ├── .dockstore.yml ├── LICENSE ├── .gitignore ├── narratives └── ncov_template_narrative.md └── README.md /tests/.gitignore: -------------------------------------------------------------------------------- 1 | *.err 2 | /output -------------------------------------------------------------------------------- /defaults/include.txt: -------------------------------------------------------------------------------- 1 | Wuhan/Hu-1/2019 2 | Wuhan-Hu-1/2019 3 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/include.txt: -------------------------------------------------------------------------------- 1 | 21L 2 | -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.fasta 2 | *.tsv -------------------------------------------------------------------------------- /tests/local-inputs-uncompressed/data/.gitignore: -------------------------------------------------------------------------------- 1 | *.fasta 2 | *.tsv -------------------------------------------------------------------------------- /data/example_sequences.fasta.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_sequences.fasta.gz -------------------------------------------------------------------------------- /data/example_metadata_aus.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_metadata_aus.tsv.xz -------------------------------------------------------------------------------- /defaults/sites_ignored_for_tree_topology.txt: -------------------------------------------------------------------------------- 1 | 21846 2 | 21987 3 | 22992 4 | 23012 5 | 23063 6 | 
23604 7 | 24410 8 | -------------------------------------------------------------------------------- /docs/src/images/gisaid-login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-login.png -------------------------------------------------------------------------------- /tests/unsanitized_metadata.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/unsanitized_metadata.tar.gz -------------------------------------------------------------------------------- /data/example_multiple_inputs.tar.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_multiple_inputs.tar.xz -------------------------------------------------------------------------------- /data/example_sequences_aus.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_sequences_aus.fasta.xz -------------------------------------------------------------------------------- /docs/src/images/gisaid-homepage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-homepage.png -------------------------------------------------------------------------------- /docs/src/images/terra-datatable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/terra-datatable.png -------------------------------------------------------------------------------- /workflow/wdl/genbank_ingest.json: -------------------------------------------------------------------------------- 1 | { 2 | "GENBANK_INGEST.cache_nextclade_old":"${workspace.genbank_nextclade_tsv}" 3 | } -------------------------------------------------------------------------------- /data/example_metadata_worldwide.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_metadata_worldwide.tsv.xz -------------------------------------------------------------------------------- /data/example_sequences_worldwide.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/data/example_sequences_worldwide.fasta.xz -------------------------------------------------------------------------------- /docs/src/images/dataset-custom-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-custom-data.png -------------------------------------------------------------------------------- /docs/src/images/dataset-example-data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-example-data.png -------------------------------------------------------------------------------- /docs/src/images/getting-started-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/getting-started-tree.png 
-------------------------------------------------------------------------------- /docs/src/images/gisaid-epicov-search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-epicov-search.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-navigation-bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-navigation-bar.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-results.png -------------------------------------------------------------------------------- /docs/src/images/multiple_inputs_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/multiple_inputs_dag.png -------------------------------------------------------------------------------- /docs/src/images/basic_nextstrain_build.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/basic_nextstrain_build.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-downloads-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-downloads-window.png -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Allow Git to decide if file is text or binary 2 | # Always use LF line endings even on Windows. 
3 | * text=auto eol=lf 4 | -------------------------------------------------------------------------------- /docs/src/images/dataset-genomic-surveillance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-genomic-surveillance.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-epicov-navigation-bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-epicov-navigation-bar.png -------------------------------------------------------------------------------- /docs/src/images/dataset-custom-data-highlighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/dataset-custom-data-highlighted.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-augur-pipeline-download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-augur-pipeline-download.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-download-packages-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-download-packages-window.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-initial-search-interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-initial-search-interface.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-download-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-download-window.png -------------------------------------------------------------------------------- /workflow/wdl/ncov_workflow.json: -------------------------------------------------------------------------------- 1 | { 2 | "Nextstrain_WRKFLW.metadata_tsv":"${this.metadata}", 3 | "Nextstrain_WRKFLW.sequence_fasta":"${this.sequences}" 4 | } -------------------------------------------------------------------------------- /docs/src/images/gisaid-nextregions-download-window.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-nextregions-download-window.png -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/asia_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/asia_metadata.tsv.xz -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-download-window-metadata.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-download-window-metadata.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-select-sequences-10-highlighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-select-sequences-10-highlighted.png -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/asia_sequences.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/asia_sequences.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/europe_aligned.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/europe_aligned.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/europe_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/europe_metadata.tsv.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/oceania_masked.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/oceania_masked.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/oceania_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/oceania_metadata.tsv.xz -------------------------------------------------------------------------------- /docs/src/images/gisaid-search-download-window-sequences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-search-download-window-sequences.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-select-sequences-idaho-highlighted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-select-sequences-idaho-highlighted.png -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/americas_filtered.fasta.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/americas_filtered.fasta.xz -------------------------------------------------------------------------------- /tests/local-inputs-compressed/data/americas_metadata.tsv.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/tests/local-inputs-compressed/data/americas_metadata.tsv.xz 
-------------------------------------------------------------------------------- /docs/src/images/gisaid-epicov-navigation-bar-with-downloads.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-epicov-navigation-bar-with-downloads.png -------------------------------------------------------------------------------- /docs/src/images/gisaid-nextregions-download-terms-and-conditions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nextstrain/ncov/HEAD/docs/src/images/gisaid-nextregions-download-terms-and-conditions.png -------------------------------------------------------------------------------- /scripts/curate_metadata/requirements.txt: -------------------------------------------------------------------------------- 1 | # python dependencies for developer scripts in this directory; install with pip install -r scripts/developer_scripts/requirements.txt 2 | geopy 3 | xlrd 4 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-ci/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-gisaid/builds.yaml 4 | - nextstrain_profiles/nextstrain-ci/builds.yaml 5 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/internationalExceptions.txt: -------------------------------------------------------------------------------- 1 | Europe/Austria/Italian cruise ship/ Europe/Italy/Italian cruise ship/ 2 | Asia/Japan/Diamond Princess/ North America/USA/Diamond Princess/ -------------------------------------------------------------------------------- /readthedocs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | 4 | build: 5 | os: "ubuntu-22.04" 6 | tools: 7 | python: "mambaforge-23.11" 8 | 9 | conda: 10 | environment: docs/conda.yml 11 | 12 | sphinx: 13 | configuration: docs/src/conf.py 14 | -------------------------------------------------------------------------------- /workflow/envs/nextstrain.yaml: -------------------------------------------------------------------------------- 1 | name: nextstrain 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - augur=22.4.0 8 | - epiweeks=2.1.2 9 | - iqtree=2.2.0.3 10 | - nextclade=3.9.0 11 | - python>=3.8* 12 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-open/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-open/builds.yaml 4 | 5 | cores: 8 6 | keep-going: False 7 | printshellcmds: True 8 | show-failed-logs: True 9 | restart-times: 2 10 | set-threads: tree=4 11 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-gisaid/builds.yaml 4 | 5 | cores: 8 6 | keep-going: False 7 | printshellcmds: True 8 | show-failed-logs: True 9 | restart-times: 2 10 | set-threads: tree=4 11 | 
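The `nextstrain-ci`, `nextstrain-open`, and `nextstrain-gisaid` files above are Snakemake profile configs: each lists the workflow config files to load under `configfile:` and bundles command-line options such as `cores`, `restart-times`, and `set-threads`. A minimal sketch of how such a profile is typically passed to the workflow, assuming a checkout of this repository and either the Nextstrain CLI or a local Snakemake installation (this mirrors the `--profile` usage in `.github/workflows/ci.yaml` later in this listing and is not mandated by the profile files themselves):

```bash
# Via the Nextstrain CLI (assumption: nextstrain-cli and a runtime are installed);
# extra arguments after "." are handed through to Snakemake.
nextstrain build . --profile nextstrain_profiles/nextstrain-open

# Equivalent direct invocation (assumption: snakemake and augur are installed locally).
snakemake --profile nextstrain_profiles/nextstrain-open
```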
-------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml 4 | 5 | cores: 8 6 | keep-going: False 7 | printshellcmds: True 8 | show-failed-logs: True 9 | restart-times: 2 10 | set-threads: tree=4 11 | -------------------------------------------------------------------------------- /defaults/description.md: -------------------------------------------------------------------------------- 1 | Hi! This is the default description, written in [Markdown](https://www.markdownguide.org/getting-started/). You can change this by creating another Markdown file and referencing it in the workflow config file: 2 | 3 | ```yaml 4 | files: 5 | description: path/to/description.md 6 | ``` 7 | -------------------------------------------------------------------------------- /scripts/sha256sum: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Portable sha256sum utility. 4 | """ 5 | from hashlib import sha256 6 | from sys import stdin 7 | 8 | chunk_size = 5 * 1024**2 # 5 MiB 9 | 10 | h = sha256() 11 | 12 | for chunk in iter(lambda: stdin.buffer.read(chunk_size), b""): 13 | h.update(chunk) 14 | 15 | print(h.hexdigest()) 16 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/country_ordering/Austria_variants.txt: -------------------------------------------------------------------------------- 1 | Gofis Göfis 2 | St. Pölten Sankt Pölten 3 | Krems An Der Dou Krems an der Donau 4 | Südoststeiermark Feldbach 5 | St.Valentin St. Valentin 6 | Nussdorfaa Nussdorf am Attersee 7 | Aurachah Aurach am Hongar 8 | Sankt Lorenz St. Lorenz 9 | Rudolfsheim Funfhaus Rudolfsheim-Fünfhaus -------------------------------------------------------------------------------- /tests/unsanitized_metadata.tsv: -------------------------------------------------------------------------------- 1 | Virus name gender date gisaid_epi_isl 2 | hCoV-19/OneVirus/1/2020 male 2020-10-01 EPI_ISL_1 3 | hCoV-19/OneVirus/1/2020 male 2020-10-01 EPI_ISL_2 4 | SARS-CoV-2/AnotherVirus/1/2021 female 2021-01-01 EPI_ISL_3 5 | hCoV-19/LocalVirus/2/2021 ? 2021-12-01 ? 6 | hCoV-19/LocalVirus/2/2021 ? 2021-12-01 ? 7 | hCoV-19/LocalVirus/3/2021 ? ? ? 
8 | -------------------------------------------------------------------------------- /defaults/distance_maps/VoC.json: -------------------------------------------------------------------------------- 1 | { 2 | "default": 0, 3 | "map": { 4 | "S": { 5 | "18": 1, 6 | "69": 1, 7 | "144": 1, 8 | "242": 1, 9 | "417": 1, 10 | "452": 1, 11 | "477": 1, 12 | "484": 1, 13 | "501": 1, 14 | "613": 1, 15 | "614": 1, 16 | "681": 1 17 | } 18 | }, 19 | "name": "MoC_count" 20 | } 21 | -------------------------------------------------------------------------------- /docs/conda.yml: -------------------------------------------------------------------------------- 1 | name: ncov-docs 2 | channels: 3 | - defaults 4 | dependencies: 5 | - make 6 | - pip 7 | - pip: 8 | - nextstrain-sphinx-theme>=2022.5 9 | - recommonmark 10 | - requests 11 | - sphinx 12 | - docutils 13 | - sphinx-argparse 14 | - sphinx-autobuild 15 | - sphinx-copybutton 16 | - sphinx-markdown-tables 17 | - sphinx-tabs 18 | -------------------------------------------------------------------------------- /workflow/wdl/gisaid_ingest.json: -------------------------------------------------------------------------------- 1 | { 2 | "GISAID_INGEST.GISAID_API_ENDPOINT":"${workspace.GISAID_API_ENDPOINT}", 3 | "GISAID_INGEST.GISAID_USERNAME_AND_PASSWORD":"${workspace.GISAID_USERNAME_AND_PASSWORD}", 4 | "GISAID_INGEST.cache_nextclade_old":"${workspace.gisaid_nextclade_tsv}", 5 | "GISAID_INGEST.ingest.giturl":"https://github.com/nextstrain/ncov-ingest/archive/refs/heads/master.zip" 6 | } -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv: -------------------------------------------------------------------------------- 1 | clade 2 | 19A 3 | 19B 4 | 20A 5 | 20B 6 | 20C 7 | 20D 8 | 20E (EU1) 9 | 20F 10 | 20G 11 | 20H (Beta, V2) 12 | 20I (Alpha, V1) 13 | 20J (Gamma, V3) 14 | 21A (Delta) 15 | 21B (Kappa) 16 | 21C (Epsilon) 17 | 21D (Eta) 18 | 21E (Theta) 19 | 21F (Iota) 20 | 21G (Lambda) 21 | 21H (Mu) 22 | 21I (Delta) 23 | 21J (Delta) 24 | 21K (Omicron) 25 | 21M (Omicron) 26 | -------------------------------------------------------------------------------- /tests/local-inputs-compressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/local-inputs-compressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /tests/remote-inputs-compressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/remote-inputs-compressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 
9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /tests/local-inputs-uncompressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/local-inputs-uncompressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /tests/remote-inputs-uncompressed/config.yaml: -------------------------------------------------------------------------------- 1 | configfile: 2 | - defaults/parameters.yaml 3 | - tests/remote-inputs-uncompressed/builds.yaml 4 | 5 | # Set the maximum number of cores you want Snakemake to use for this pipeline. 6 | cores: 2 7 | 8 | # Always print the commands that will be run to the screen for debugging. 9 | printshellcmds: True 10 | 11 | # Print log files of failed jobs 12 | show-failed-logs: True 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Want us to add a feature to Nextstrain? 4 | title: "" 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Context** 11 | How would this feature help you? What would it enable you to do? 12 | 13 | **Description** 14 | A clear and concise description of what you want to happen 15 | 16 | **Examples** 17 | 18 | **Possible solution** 19 | (Optional) 20 | -------------------------------------------------------------------------------- /my_profiles/README.md: -------------------------------------------------------------------------------- 1 | Previously, we recommended using Snakemake profiles under a `my_profiles/` analysis directory. We now recommend using Snakemake config files directly via the `--configfile` parameter. You can still use existing profiles via `--configfile my_profiles//builds.yaml`. 2 | 3 | See [this guide](https://docs.nextstrain.org/projects/ncov/en/latest/tutorial/next-steps.html#create-analysis-directory) to create your own analysis directory. 
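A concrete sketch of the `--configfile` usage described above, run from the top of this repository; `example` is a hypothetical profile directory name standing in for an existing `my_profiles/` analysis directory:

```bash
# Assumption: my_profiles/example/ already exists and contains a builds.yaml.
nextstrain build . --configfile my_profiles/example/builds.yaml
```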
4 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid/legacy_clades.tsv: -------------------------------------------------------------------------------- 1 | clade gene site alt 2 | 3 | 4 | A1a ORF3a 251 V 5 | A1a ORF1a 3606 F 6 | 7 | A2 S 614 G 8 | A2a ORF1b 314 L 9 | 10 | A3 ORF1a 378 I 11 | A3 ORF1a 3606 F 12 | 13 | A6 nuc 514 C 14 | 15 | 16 | A7 ORF1a 3220 V 17 | 18 | 19 | 20 | 21 | B ORF8 84 S 22 | 23 | B1 ORF8 84 S 24 | B1 nuc 18060 T 25 | 26 | B2 ORF8 84 S 27 | B2 nuc 29095 T 28 | 29 | B4 ORF8 84 S 30 | B4 N 202 N 31 | B4 N 202 N 32 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/variants.txt: -------------------------------------------------------------------------------- 1 | UK United Kingdom 2 | The United Kingdom United Kingdom 3 | US USA 4 | Valencia Valencia ES 5 | Granada Granada ES 6 | UAE United Arab Emirates 7 | United States USA 8 | Czechia Czech Republic 9 | Pamplona Pamplona ES 10 | Tshwane City of Tshwane 11 | Srilanka Sri Lanka 12 | United Arab Emirate United Arab Emirates 13 | Yucatán Yucatan 14 | México Mexico 15 | United States of America USA 16 | Viet Nam Vietnam 17 | Côte d’Ivoire Côte d'Ivoire 18 | Zurich Zürich -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Dependabot configuration file 2 | # 3 | # 4 | # Each ecosystem is checked on a scheduled interval defined below. To trigger 5 | # a check manually, go to 6 | # 7 | # https://github.com/nextstrain/ncov/network/updates 8 | # 9 | # and look for a "Check for updates" button. You may need to click around a 10 | # bit first. 
11 | --- 12 | version: 2 13 | updates: 14 | - package-ecosystem: "github-actions" 15 | directory: "/" 16 | schedule: 17 | interval: "weekly" 18 | -------------------------------------------------------------------------------- /scripts/generate-scientific-credits.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | data = pd.read_csv('../data/metadata.tsv', sep='\t') 4 | credits = data.groupby('originating_lab')['strain'].apply(list).to_dict() 5 | 6 | detailed_ofile = open('../data/detailed_credits.md', 'a') 7 | ofile = open('../data/credits.md', 'a') 8 | 9 | for institution in sorted(list(credits.keys())): 10 | if institution == 'unknown': 11 | continue 12 | 13 | ofile.write('* '+institution+'\n') 14 | 15 | strains = sorted(credits[institution]) 16 | detailed_ofile.write('* '+institution+'\n') 17 | for s in strains: 18 | detailed_ofile.write('\t* '+s+'\n') 19 | detailed_ofile.write('\n') 20 | 21 | detailed_ofile.close() 22 | ofile.close() 23 | -------------------------------------------------------------------------------- /.dockstore.yml: -------------------------------------------------------------------------------- 1 | version: 1.2 2 | workflows: 3 | - subclass: WDL 4 | primaryDescriptorPath: /workflow/wdl/ncov_workflow.wdl 5 | testParameterFiles: 6 | - /workflow/wdl/ncov_workflow.json 7 | name: ncov 8 | authors: 9 | - name: Nextstrain 10 | - subclass: WDL 11 | primaryDescriptorPath: /workflow/wdl/gisaid_ingest.wdl 12 | testParameterFiles: 13 | - /workflow/wdl/gisaid_ingest.json 14 | name: gisaid_ingest 15 | authors: 16 | - name: Nextstrain 17 | - subclass: WDL 18 | primaryDescriptorPath: /workflow/wdl/genbank_ingest.wdl 19 | testParameterFiles: 20 | - /workflow/wdl/genbank_ingest.json 21 | name: genbank_ingest 22 | authors: 23 | - name: Nextstrain 24 | -------------------------------------------------------------------------------- /docs/src/tutorial/intro.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | These tutorials will walk you through the process of running a basic genomic surveillance workflow using SARS-CoV-2 data. 5 | We've created these resources with the goal of enabling Departments of Public Health to start using Nextstrain to understand their SARS-CoV-2 genomic data within 1-2 hours. 6 | 7 | At the end, you will be able to: 8 | 9 | - create phylogenetic trees of SARS-CoV-2 genomes from different sources including GISAID and Nextstrain-curated GenBank data 10 | - visualize the resulting trees in :term:`docs.nextstrain.org:Auspice` 11 | - define subsampling logic for your own genomic epidemiological analysis 12 | 13 | If you prefer to learn about the workflow through videos, see the :doc:`demo videos `. 14 | -------------------------------------------------------------------------------- /docs/src/guides/data-prep/index.rst: -------------------------------------------------------------------------------- 1 | ********************** 2 | Data preparation guide 3 | ********************** 4 | 5 | To use Nextstrain to analyze your own data, you'll need to prepare two files: 6 | 7 | 1. A FASTA file with viral genomic sequences 8 | 2. A corresponding TSV file with metadata describing each sequence 9 | 10 | We describe the following ways to prepare data for a SARS-CoV-2 analysis: 11 | 12 | .. 
toctree:: 13 | :maxdepth: 1 14 | :titlesonly: 15 | 16 | local-data 17 | gisaid-search 18 | gisaid-full 19 | 20 | Alternatively, use pre-curated data files: 21 | 22 | 1. :ref:`Nextstrain remote inputs ` 23 | 2. `CDC: US State and Territory subsample datasets and example builds `__ 24 | -------------------------------------------------------------------------------- /.github/workflows/sync-redirects.yaml: -------------------------------------------------------------------------------- 1 | name: Sync RTD redirects 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - docs/redirects.yaml 9 | - .github/workflows/sync-redirects.yaml 10 | 11 | pull_request: 12 | paths: 13 | - docs/redirects.yaml 14 | - .github/workflows/sync-redirects.yaml 15 | 16 | # Manually triggered using GitHub's UI 17 | workflow_dispatch: 18 | 19 | jobs: 20 | sync: 21 | # Prevent this job from running on forks. 22 | if: github.repository_owner == 'nextstrain' 23 | name: rtd redirects 24 | uses: nextstrain/.github/.github/workflows/sync-rtd-redirects.yaml@master 25 | with: 26 | project: nextstrain-ncov 27 | file: docs/redirects.yaml 28 | secrets: 29 | RTD_TOKEN: ${{ secrets.RTD_TOKEN }} 30 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | test-build: 12 | uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@v0 13 | with: 14 | build-args: all_regions -j 2 --profile nextstrain_profiles/nextstrain-ci 15 | 16 | test-cram: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - uses: actions/checkout@v6 20 | - uses: actions/setup-python@v6 21 | with: 22 | python-version: "3.10" 23 | - run: pip install cram nextstrain-augur 24 | - run: cram --shell=/bin/bash tests/sanitize-metadata.t 25 | 26 | docs: 27 | uses: nextstrain/.github/.github/workflows/docs-ci.yaml@master 28 | with: 29 | docs-directory: docs/ 30 | environment-file: docs/conda.yml 31 | -------------------------------------------------------------------------------- /docs/glossary.md: -------------------------------------------------------------------------------- 1 | # Glossary 2 | 3 | #### Alignment 4 | 5 | #### Ancestral trait (reconstruction) 6 | 7 | #### Augur 8 | 9 | #### Auspice 10 | 11 | #### Bases 12 | 13 | #### Branch 14 | 15 | #### Build 16 | 17 | #### Config 18 | 19 | #### Division 20 | 21 | #### Filtering 22 | 23 | #### Genome 24 | 25 | #### Genomic epidemiology 26 | 27 | #### GISAID 28 | 29 | #### Location 30 | 31 | #### Metadata 32 | 33 | #### Narrative 34 | 35 | #### Node 36 | 37 | #### Phylogeny 38 | 39 | #### Reference genome 40 | 41 | #### Region 42 | 43 | #### Sample 44 | 45 | #### Sequence 46 | 47 | #### Snakemake 48 | 49 | #### Strain 50 | 51 | #### Subsampling 52 | 53 | #### Tip (leaf) 54 | 55 | #### TSV 56 | 57 | #### Trait 58 | 59 | #### Transmission 60 | 61 | #### Tree 62 | 63 | #### Workflow manager 64 | -------------------------------------------------------------------------------- /docs/src/tutorial/videos.rst: -------------------------------------------------------------------------------- 1 | ************************** 2 | Video tutorial walkthrough 3 | ************************** 4 | 5 | If you prefer to learn about the workflow through videos, see the following: 6 | 7 | Running the analysis 8 | -------------------- 9 | 10 | .. 
raw:: html 11 | 12 | 13 | 14 | Visualizing the results 15 | ----------------------- 16 | 17 | .. raw:: html 18 | 19 | 20 | -------------------------------------------------------------------------------- /scripts/narrative-pdf-screens.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | ADDRESS=$1 4 | PARTIAL_FNAME=$2 5 | 6 | echo "Make sure that you have auspice / nextstrain running at so that ${ADDRESS} is valid!" 7 | echo "(e.g. from the 'ncov' directory run 'auspice view --datasetDir auspice --narrativeDir narratives'" 8 | echo "This script will save PDFs starting with the prefix ${PARTIAL_FNAME}" 9 | echo "" 10 | 11 | # https://gs.statcounter.com/screen-resolution-stats/desktop/worldwide 12 | RESOLUTIONS=(3200x1350 1920x1080 1600x900 366x768 ) 13 | # james' iphone+ iphone 14 | 15 | for RES in ${RESOLUTIONS[@]}; do 16 | F="${PARTIAL_FNAME}.${RES}.pdf" 17 | echo "" 18 | echo "-------------------------------------" 19 | echo "Making ${F}" 20 | echo "" 21 | decktape generic --load-pause 3000 --key ArrowDown --size ${RES} ${ADDRESS} ${F} 22 | done 23 | 24 | exit 0 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Did something not work as expected? 4 | title: "" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Current Behavior** 11 | A clear and concise description of what is happening / what the bug is. 12 | 13 | **Expected behavior** 14 | A clear and concise description of what you expected to happen instead. 15 | 16 | **How to reproduce** 17 | Steps to reproduce the current behavior: 18 | 1. Open / run ... 19 | 2. 20 | 3. 21 | 4. See error 22 | 23 | **Possible solution** 24 | (optional) 25 | 26 | **Your environment: if browsing Nextstrain online** 27 | - Operating system: 28 | - Browser: 29 | 30 | **Your environment: if running Nextstrain locally** 31 | - Operating system: 32 | - Browser: 33 | - Version (e.g. `auspice 2.7.0`): 34 | 35 | **Additional context** 36 | Add any other context about the problem here. 37 | -------------------------------------------------------------------------------- /defaults/annotation.gff: -------------------------------------------------------------------------------- 1 | # Gene map (genome annotation) of SARS-CoV-2 in GFF format. 2 | # For gene map purpses we only need some of the columns. We substitute unused values with "." as per GFF spec. 3 | # See GFF format reference at https://www.ensembl.org/info/website/upload/gff.html 4 | # seqname source feature start end score strand frame attribute 5 | . . gene 26245 26472 . + . gene_name=E 6 | . . gene 26523 27191 . + . gene_name=M 7 | . . gene 28274 29533 . + . gene_name=N 8 | . . gene 266 13468 . + . gene_name=ORF1a 9 | . . gene 13468 21555 . + . gene_name=ORF1b 10 | . . gene 25393 26220 . + . gene_name=ORF3a 11 | . . gene 27202 27387 . + . gene_name=ORF6 12 | . . gene 27394 27759 . + . gene_name=ORF7a 13 | . . gene 27756 27887 . + . gene_name=ORF7b 14 | . . gene 27894 28259 . + . gene_name=ORF8 15 | . . gene 28284 28577 . + . gene_name=ORF9b 16 | . . gene 21563 25384 . + . 
gene_name=S 17 | -------------------------------------------------------------------------------- /tests/remote-inputs-compressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | - name: test-remote-compressed-asia-sequences 3 | metadata: s3://nextstrain-data/files/ncov/test-data/asia_metadata.tsv.xz 4 | sequences: s3://nextstrain-data/files/ncov/test-data/asia_sequences.fasta.xz 5 | - name: test-remote-compressed-europe-aligned 6 | metadata: s3://nextstrain-data/files/ncov/test-data/europe_metadata.tsv.xz 7 | aligned: s3://nextstrain-data/files/ncov/test-data/europe_aligned.fasta.xz 8 | - name: test-remote-compressed-americas-filtered 9 | metadata: s3://nextstrain-data/files/ncov/test-data/americas_metadata.tsv.xz 10 | filtered: s3://nextstrain-data/files/ncov/test-data/americas_filtered.fasta.xz 11 | 12 | builds: 13 | test-remote-compressed: 14 | subsampling_scheme: small 15 | 16 | subsampling: 17 | small: 18 | small-sample: 19 | group_by: "region" 20 | max_sequences: 100 21 | -------------------------------------------------------------------------------- /tests/local-inputs-compressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | # Note: paths are relative to the --directory handed to snakemake 3 | - name: test-local-compressed-asia-sequences 4 | metadata: ../local-inputs-compressed/data/asia_metadata.tsv.xz 5 | sequences: ../local-inputs-compressed/data/asia_sequences.fasta.xz 6 | - name: test-local-compressed-europe-aligned 7 | metadata: ../local-inputs-compressed/data/europe_metadata.tsv.xz 8 | aligned: ../local-inputs-compressed/data/europe_aligned.fasta.xz 9 | - name: test-local-compressed-americas-filtered 10 | metadata: ../local-inputs-compressed/data/americas_metadata.tsv.xz 11 | filtered: ../local-inputs-compressed/data/americas_filtered.fasta.xz 12 | 13 | builds: 14 | test-local-compressed: 15 | subsampling_scheme: small 16 | 17 | subsampling: 18 | small: 19 | small-sample: 20 | group_by: "region" 21 | max_sequences: 100 22 | -------------------------------------------------------------------------------- /tests/local-inputs-uncompressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | # Note: paths are relative to the --directory handed to snakemake 3 | - name: test-local-uncompressed-asia-sequences 4 | metadata: ../local-inputs-uncompressed/data/asia_metadata.tsv 5 | sequences: ../local-inputs-uncompressed/data/asia_sequences.fasta 6 | - name: test-local-uncompressed-europe-aligned 7 | metadata: ../local-inputs-uncompressed/data/europe_metadata.tsv 8 | aligned: ../local-inputs-uncompressed/data/europe_aligned.fasta 9 | - name: test-local-uncompressed-americas-filtered 10 | metadata: ../local-inputs-uncompressed/data/americas_metadata.tsv 11 | filtered: ../local-inputs-uncompressed/data/americas_filtered.fasta 12 | 13 | builds: 14 | test-local-uncompressed: 15 | subsampling_scheme: small 16 | 17 | subsampling: 18 | small: 19 | small-sample: 20 | group_by: "region" 21 | max_sequences: 100 22 | -------------------------------------------------------------------------------- /workflow/wdl/genbank_ingest.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "tasks/ncov_ingest.wdl" as ncov_ingest 4 | 5 | workflow GENBANK_INGEST { 6 | input { 7 | # Optionals 8 | File? cache_nextclade_old 9 | String? filter # e.g. 
"region:Africa" passed to tsv-filters 10 | 11 | Int? cpu 12 | Int? memory # in GiB 13 | Int? disk_size 14 | } 15 | 16 | call ncov_ingest.genbank_ingest as ingest { 17 | input: 18 | # optionals 19 | cache_nextclade_old = cache_nextclade_old, 20 | filter = filter, 21 | 22 | cpu = cpu, 23 | memory = memory, 24 | disk_size = disk_size 25 | } 26 | 27 | output { 28 | # ncov-ingest output either gisaid or genbank 29 | File sequences_fasta = ingest.sequences_fasta 30 | File metadata_tsv = ingest.metadata_tsv 31 | 32 | File nextclade_tsv = ingest.nextclade_tsv 33 | String last_run = ingest.last_run 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-100k.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild 100k sample 2 | 3 | on: 4 | # cron job once a week on Mondays at 12:42 UTC 5 | schedule: 6 | - cron: '42 12 * * 1' 7 | # Manually triggered using GitHub's UI 8 | workflow_dispatch: 9 | 10 | jobs: 11 | open: 12 | permissions: 13 | id-token: write 14 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 15 | secrets: inherit 16 | with: 17 | runtime: aws-batch 18 | run: | 19 | set -x 20 | 21 | declare -a config 22 | config+=(slack_token=$SLACK_TOKEN) 23 | 24 | nextstrain build \ 25 | --detach \ 26 | --cpus 16 \ 27 | --memory 31GiB \ 28 | . \ 29 | upload \ 30 | --configfile nextstrain_profiles/100k/config-open.yaml \ 31 | --config "${config[@]}" \ 32 | --set-threads tree=8 33 | artifact-name: open-build-output 34 | -------------------------------------------------------------------------------- /docs/src/visualization/narratives.rst: -------------------------------------------------------------------------------- 1 | Nextstrain Narratives 2 | ===================== 3 | 4 | Nextstrain Narratives allow you to pair a specific view of a dataset with text and images to generate scrollable, interactive reports. 5 | 6 | For examples, `see our weekly Situation Reports `__ from the first several months of the pandemic. 7 | 8 | You can `read more about narratives `__ or `watch our Nextstrain narratives tutorial videos `_. We've also `provided a template narrative file `__ for you to edit. 9 | 10 | You can preview the template narrative by navigating to https://nextstrain.org/community/narratives/nextstrain/ncov/template/narrative. 11 | 12 | If you get stuck, don't hesitate to `ask for help `__. 13 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | if "%BUILDDIR%" == "" ( 11 | set BUILDDIR=build 12 | ) 13 | set SOURCEDIR=src 14 | 15 | if "%1" == "" goto help 16 | 17 | %SPHINXBUILD% >NUL 2>NUL 18 | if errorlevel 9009 ( 19 | echo. 20 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 21 | echo.installed, then set the SPHINXBUILD environment variable to point 22 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 23 | echo.may add the Sphinx directory to PATH. 24 | echo. 
25 | echo.If you don't have Sphinx installed, grab it from 26 | echo.http://sphinx-doc.org/ 27 | exit /b 1 28 | ) 29 | 30 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 31 | goto end 32 | 33 | :help 34 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 35 | 36 | :end 37 | popd 38 | -------------------------------------------------------------------------------- /defaults/clade_hierarchy.tsv: -------------------------------------------------------------------------------- 1 | clade parent WHO 2 | 19B 19A 3 | 20A 19A 4 | 20B 20A 5 | 20C 20A 6 | 20D 20B 7 | 20E 20A 8 | 20F 20B 9 | 20G 20C 10 | 20H 20C Beta 11 | 20I 20B Alpha 12 | 20J 20B Gamma 13 | 21A 20A Delta 14 | 21B 20A Kappa 15 | 21C 20C Epsilon 16 | 21D 20A Eta 17 | 21E 20B Theta 18 | 21F 20C Iota 19 | 21G 20D Lambda 20 | 21H 20A Mu 21 | 21I 21A Delta 22 | 21J 21A Delta 23 | 21K 21M Omicron 24 | 21L 21M Omicron 25 | 21M 20B Omicron 26 | 22A 21L Omicron 27 | 22B 21L Omicron 28 | 22C 21L Omicron 29 | 22D 21L Omicron 30 | 22E 22B Omicron 31 | 22F 21L Omicron 32 | 23A 22F Omicron 33 | 23B 22F Omicron 34 | 23C 22D Omicron 35 | 23D 22F Omicron 36 | 23E 22F Omicron 37 | 23F 23D Omicron 38 | 23G 23A Omicron 39 | 23H 23F Omicron 40 | 23I 21L Omicron 41 | 24A 23I Omicron 42 | 24B 24A Omicron 43 | 24C 24B Omicron 44 | 24D 21L Omicron 45 | 24E 24C Omicron 46 | 24F 24A Omicron 47 | 24G 24B Omicron 48 | 24H 24A Omicron 49 | 24I 24A Omicron 50 | 25A 24B Omicron 51 | 25B 24D Omicron 52 | 25C 24A Omicron 53 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ncov 'Read The Docs' Documentation. 2 | 3 | 4 | ## Building the docs 5 | 6 | Build dependencies are managed with [Conda](https://conda.io). 7 | Install them 8 | into an isolated environment named `ncov-docs` with: 9 | 10 | conda env create -f=conda.yml 11 | 12 | Enter the environment with: 13 | 14 | conda activate ncov-docs 15 | 16 | You can now build the documentation with: 17 | 18 | make html 19 | 20 | which invokes Sphinx to build static HTML pages in `build/html/`. 21 | You can view them by running: 22 | 23 | open build/html/index.html 24 | 25 | 26 | To monitor the source files for changes and automatically rebuild as necessary, 27 | run: 28 | 29 | make livehtml 30 | 31 | and then open . Pages open in the browser will 32 | automatically refresh when they're rebuilt. 33 | 34 | You can clean the build directory for a fresh start with: 35 | 36 | make clean 37 | 38 | Leave the environment with: 39 | 40 | conda deactivate 41 | -------------------------------------------------------------------------------- /docs/src/tutorial/setup.rst: -------------------------------------------------------------------------------- 1 | Setup and installation 2 | ====================== 3 | 4 | The following steps will prepare you to run complete analyses of SARS-CoV-2 data by installing required software and running a simple example workflow. 5 | 6 | .. contents:: Table of Contents 7 | :local: 8 | 9 | Register for a GISAID account 10 | ----------------------------- 11 | 12 | Some tutorials rely on data downloaded from `GISAID `_. 13 | If you do not already have one, `register for a GISAID account `_ now. 14 | Registration may take a few days. 15 | 16 | Install Nextstrain components 17 | -------------------------------- 18 | 19 | :doc:`Follow instructions to install Nextstrain components `. 
20 | 21 | Download the ncov workflow 22 | ----------------------------- 23 | 24 | Use Git to download a copy of the ncov repository containing the workflow and this tutorial. 25 | 26 | .. code:: bash 27 | 28 | git clone https://github.com/nextstrain/ncov.git 29 | -------------------------------------------------------------------------------- /defaults/clade_display_names.yml: -------------------------------------------------------------------------------- 1 | 19A: 19A 2 | 19B: 19B 3 | 20A: 20A 4 | 20B: 20B 5 | 20C: 20C 6 | 20D: 20D 7 | 20E: 20E 8 | 20F: 20F 9 | 20G: 20G 10 | 20I: 20I (Alpha) 11 | 20H: 20H (Beta) 12 | 20J: 20J (Gamma) 13 | 21A: 21A (Delta) 14 | 21B: 21B (Kappa) 15 | 21C: 21C (Epsilon) 16 | 21D: 21D (Eta) 17 | 21E: 21E (Theta) 18 | 21F: 21F (Iota) 19 | 21G: 21G (Lambda) 20 | 21I: 21I (Delta) 21 | 21H: 21H (Mu) 22 | 21J: 21J (Delta) 23 | 21K: 21K (BA.1) 24 | 21L: 21L (BA.2) 25 | 21M: 21M (Omicron) 26 | 22A: 22A (BA.4) 27 | 22B: 22B (BA.5) 28 | 22C: 22C (BA.2.12.1) 29 | 22D: 22D (BA.2.75) 30 | 22E: 22E (BQ.1) 31 | 22F: 22F (XBB) 32 | 23A: 23A (XBB.1.5) 33 | 23B: 23B (XBB.1.16) 34 | 23C: 23C (CH.1.1) 35 | 23D: 23D (XBB.1.9) 36 | 23E: 23E (XBB.2.3) 37 | 23F: 23F (EG.5.1) 38 | 23G: 23G (XBB.1.5.70) 39 | 23H: 23H (HK.3) 40 | 23I: 23I (BA.2.86) 41 | 24A: 24A (JN.1) 42 | 24B: 24B (JN.1.11.1) 43 | 24C: 24C (KP.3) 44 | 24D: 24D (XDV.1) 45 | 24E: 24E (KP.3.1.1) 46 | 24F: 24F (XEC) 47 | 24G: 24G (KP.2.3) 48 | 24H: 24H (LF.7) 49 | 24I: 24I (MV.1) 50 | 25A: 25A (LP.8.1) 51 | 25B: 25B (NB.1.8.1) 52 | 25C: 25C (XFG) 53 | -------------------------------------------------------------------------------- /defaults/clade_emergence_dates.tsv: -------------------------------------------------------------------------------- 1 | Nextstrain_clade first_sequence 2 | 19A 2019-12-01 3 | 19B 2019-12-01 4 | 20A 2020-01-20 5 | 20A.EU2 2020-02-15 6 | 20B 2020-02-14 7 | 20C 2020-02-25 8 | 20D 2020-03-12 9 | 20E 2020-05-27 10 | 20F 2020-05-24 11 | 20G 2020-06-11 12 | 20H 2020-08-10 13 | 20I 2020-09-20 14 | 20J 2020-10-29 15 | 21A 2020-10-30 16 | 21B 2020-10-30 17 | 21C 2020-08-03 18 | 21D 2020-11-21 19 | 21E 2021-01-10 20 | 21F 2020-11-20 21 | 21G 2021-01-05 22 | 21H 2021-01-05 23 | 21I 2020-10-30 24 | 21J 2020-10-30 25 | 21K 2021-09-01 26 | 21L 2021-09-01 27 | 21M 2021-09-01 28 | 22A 2021-12-01 29 | 22B 2021-12-01 30 | 22C 2021-12-01 31 | 22D 2022-04-01 32 | 22E 2022-07-10 33 | 22F 2022-07-01 34 | 23A 2022-10-01 35 | 23B 2022-11-01 36 | 23C 2022-06-01 37 | 23D 2022-08-01 38 | 23E 2022-10-01 39 | 23F 2023-01-01 40 | 23G 2023-01-01 41 | 23H 2023-03-01 42 | 23I 2023-04-01 43 | 24A 2023-07-01 44 | 24B 2023-11-01 45 | 24C 2023-12-01 46 | 24D 2024-01-01 47 | 24E 2023-12-01 48 | 24F 2024-05-01 49 | 24G 2024-01-01 50 | 24H 2024-05-01 51 | 24I 2024-05-01 52 | 25A 2024-07-01 53 | 25B 2025-01-01 54 | 25C 2025-01-01 55 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/acceptedExposureAdditions.txt: -------------------------------------------------------------------------------- 1 | Following combinations allowed: division (country, region) or country (region) 2 | Italian cruise ship (Italy, Europe) 3 | Nile River Cruise (Egypt, Africa) 4 | Diamond Princess (Japan, Asia) 5 | Grand Princess 2nd cruise (USA, North America) 6 | Guayas (Ecuador, South America) 7 | Grand Canary Islands (Spain, Europe) 8 | Tyrol (Italy, Europe) 9 | Trentino (Italy, Europe) 10 | Faroe Islands (Denmark, Europe) 11 | Guadalajara (Mexico, North America) 12 | Asia 
(Asia) 13 | Asia (Asia, Asia) 14 | Piedmont (Italy, Europe) 15 | Obwalden (Switzerland, Europe) 16 | Brazilian Cruise (Brazil, South America) 17 | Maldives (Asia, Maldives) 18 | Conakry (Guinea, Africa) 19 | Kabul (Afghanistan, Asia) 20 | Tajikistan (Asia) 21 | Sabah (Malaysia, Asia) 22 | Yemen (Asia) 23 | South Aegean Region (Greece, Europe) 24 | Arusha (Tanzania, Africa) 25 | Kalinga (Philippines, Asia) 26 | Lanao del Norte (Philippines, Asia) 27 | Sinai (Egypt, Africa) 28 | Sharjah (United Arab Emirates, Asia) 29 | Agusan del Norte (Philippines, Asia) 30 | Bohol (Philippines, Asia) 31 | Mekka (Saudi Arabia, Asia) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Nextstrain 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/location_pattern.txt: -------------------------------------------------------------------------------- 1 | Patient residence: XXX 2 | Civil Hospital, XXX 3 | Industrial Enterprises in XXX 4 | Alzheimer home, XXX 4 5 | Airport of XXX 6 | XXX (interpreted as patient residence 7 | Clinical Hospital of XXX 8 | Hospital das Clínicas de XXX 9 | Hospital das Clinicas de XXX 10 | Zip code: XXX 11 | Hospital General de XXX 12 | Patient residence:XXX 13 | XXX Health Department 14 | XXX Health department 15 | Patient origin XXX 16 | zip code: XXX 17 | zip code: XXX (interpreted as patient residence) (interpreted as patient residence) (interpreted as patient residence) (interpreted as patient residence) (interpreted as patient residence) 18 | CS XXX 19 | HG de XXX 20 | CS de XXX 21 | HG da XXX 22 | Lives in XXX 23 | Resident in XXX 24 | Patient resident in XXX 25 | Patient resident in XXX. 
26 | Lives in the XXX 27 | Patient resides in XXX 28 | Patient from XXX transferred to Rio Grande do Sul State to receive hospital care 29 | Patient from XXX relocated to Rondonia State 30 | Residence XXX 31 | residence XXX 32 | Residence: XXX 33 | Patient From XXX (interpreted as patient residence) 34 | Patient from XXX 35 | -------------------------------------------------------------------------------- /workflow/wdl/gisaid_ingest.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "tasks/ncov_ingest.wdl" as ncov_ingest 4 | 5 | workflow GISAID_INGEST { 6 | input { 7 | # ncov ingest 8 | String GISAID_API_ENDPOINT 9 | String GISAID_USERNAME_AND_PASSWORD 10 | 11 | # Optionals 12 | File? cache_nextclade_old 13 | String? filter # e.g. "region:Africa" passed to tsv-filters 14 | 15 | Int? cpu 16 | Int? memory # in GiB 17 | Int? disk_size 18 | } 19 | 20 | call ncov_ingest.gisaid_ingest as ingest { 21 | input: 22 | GISAID_API_ENDPOINT = GISAID_API_ENDPOINT, 23 | GISAID_USERNAME_AND_PASSWORD = GISAID_USERNAME_AND_PASSWORD, 24 | 25 | # optionals 26 | cache_nextclade_old = cache_nextclade_old, 27 | filter = filter, 28 | 29 | cpu = cpu, 30 | memory = memory, 31 | disk_size = disk_size 32 | } 33 | 34 | output { 35 | # ncov-ingest output either gisaid or genbank 36 | File sequences_fasta = ingest.sequences_fasta 37 | File metadata_tsv = ingest.metadata_tsv 38 | 39 | File nextclade_tsv = ingest.nextclade_tsv 40 | String last_run = ingest.last_run 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /nextstrain_profiles/100k/config-open.yaml: -------------------------------------------------------------------------------- 1 | # This file is largely duplicated from `config-gisaid.yaml` - please 2 | # see that file for comments 3 | S3_DST_BUCKET: "nextstrain-data/files/ncov/open/100k" # TODO XXX 4 | S3_DST_ORIGINS: [needed-for-workflow-but-unused] 5 | deploy_url: needed_for_workflow_but_unused 6 | custom_rules: 7 | - workflow/snakemake_rules/export_for_nextstrain.smk 8 | inputs: 9 | - name: open 10 | metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.zst" 11 | aligned: "s3://nextstrain-data/files/ncov/open/sequences.fasta.zst" 12 | skip_sanitize_metadata: true 13 | deduplicated: true 14 | builds: 15 | 100k: 16 | subsampling_scheme: 100k_scheme 17 | upload: 18 | metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz 19 | sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz 20 | filter: 21 | exclude_where: "division='USA'" 22 | subsampling: 23 | 100k_scheme: 24 | 50k_early: 25 | group_by: "year month country" 26 | max_sequences: 50000 27 | max_date: "--max-date 1Y" 28 | 50k_late: 29 | group_by: "year month country" 30 | max_sequences: 50000 31 | min_date: "--min-date 1Y" 32 | -------------------------------------------------------------------------------- /data/references_metadata.tsv: -------------------------------------------------------------------------------- 1 | strain virus gisaid_epi_isl genbank_accession date region country division location region_exposure country_exposure division_exposure segment length host age sex Nextstrain_clade pango_lineage GISAID_clade originating_lab submitting_lab authors url title paper_url date_submitted sampling_strategy missing_data divergence nonACGTN rare_mutations snp_clusters QC_missing_data QC_mixed_sites QC_rare_mutations QC_snp_clusters clock_deviation 2 | Wuhan/Hu-1/2019 ncov EPI_ISL_402125 MN908947.3 2019-12-26 Asia 
China Hubei Wuhan Asia China Hubei genome 29903 Human ? ? 19A B L National Institute for Communicable Disease Control and Prevention (ICDC) Chinese Center for Disease Control and Prevention (China CDC) National Institute for Communicable Disease Control and Prevention (ICDC) Chinese Center for Disease Control and Prevention (China CDC) Zhang et al https://www.gisaid.org A new coronavirus associated with human respiratory disease in China https://dx.doi.org/10.1038/s41586-020-2008-3 2020-01-12 0.0 0.0 0.0 0.0 0.0 good good good good -0.4611005157393094 3 | 21L ncov ? ? 2021-11-01 ? ? ? ? ? ? ? genome 29903 Human ? ? 21L (Omicron) BA.2 ? ? ? ? ? ? ? 2021-11-01 0.0 0.0 0.0 0.0 0.0 good good good good -0.4611005157393094 4 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first three. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | BUILDDIR ?= build 9 | SOURCEDIR = src 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help help-docker Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | livehtml: 23 | sphinx-autobuild -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | 25 | .ONESHELL: 26 | docker-html: 27 | set -euox 28 | docker build -t nextstrain-docs-builder --network=host . 
29 | docker run -it --rm \ 30 | --name=nextstrain-docs-builder-$(shell date +%s) \ 31 | --init \ 32 | --user=$(shell id -u):$(shell id -g) \ 33 | --volume=$(shell pwd):/home/user/src \ 34 | --workdir=/home/user/src \ 35 | --env 'TERM=xterm-256colors' \ 36 | nextstrain-docs-builder 37 | -------------------------------------------------------------------------------- /tests/check_auspice_json.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import sys 4 | 5 | if __name__ == '__main__': 6 | parser = argparse.ArgumentParser( 7 | description="Ensure certain values are present for a given node trait", 8 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 9 | ) 10 | parser.add_argument('--json', type=str, metavar="JSON", required=True, help="Auspice JSON") 11 | parser.add_argument('--attr', type=str, metavar="KEY", required=True, help="node attr to collect") 12 | parser.add_argument('--values', type=str, nargs="+", metavar="VALUE", required=True, help="values to check") 13 | args = parser.parse_args() 14 | 15 | values_seen = set() 16 | 17 | def collect(node): 18 | v = node.get("node_attrs", {}).get(args.attr, {}).get("value", "") 19 | if v: 20 | values_seen.add(v) 21 | for child in node.get("children", []): 22 | collect(child) 23 | 24 | with open(args.json, "r") as f: 25 | input_json = json.load(f) 26 | 27 | collect(input_json["tree"]) 28 | 29 | if not values_seen >= set(args.values): 30 | print("Following values missing from JSON:", set(args.values)-values_seen) 31 | sys.exit(1) 32 | -------------------------------------------------------------------------------- /defaults/clades_who.tsv: -------------------------------------------------------------------------------- 1 | clade gene site alt 2 | 3 | Alpha nuc 14676 T 4 | Alpha nuc 15279 T 5 | Alpha nuc 23063 T 6 | 7 | Beta nuc 23012 A 8 | Beta nuc 23063 T 9 | Beta nuc 23403 G 10 | Beta nuc 26456 T 11 | 12 | Gamma nuc 733 C 13 | Gamma nuc 2749 T 14 | Gamma nuc 3828 T 15 | Gamma nuc 5648 C 16 | Gamma nuc 12778 T 17 | Gamma nuc 13860 T 18 | 19 | Delta nuc 21618 G 20 | Delta nuc 26767 C 21 | Delta nuc 28461 G 22 | 23 | Epsilon nuc 17014 T 24 | Epsilon nuc 21600 T 25 | Epsilon nuc 22018 T 26 | Epsilon nuc 22917 G 27 | 28 | Eta nuc 14407 T 29 | Eta nuc 20724 G 30 | Eta nuc 23593 C 31 | Eta nuc 24224 C 32 | Eta nuc 24748 T 33 | 34 | Theta nuc 12049 T 35 | Theta nuc 23341 C 36 | Theta nuc 23604 A 37 | Theta nuc 24187 A 38 | Theta nuc 24836 A 39 | 40 | Iota nuc 16500 C 41 | Iota nuc 20262 G 42 | Iota nuc 21575 T 43 | Iota nuc 22320 G 44 | 45 | Kappa nuc 17523 T 46 | Kappa nuc 22917 G 47 | Kappa nuc 23012 C 48 | Kappa nuc 27638 C 49 | Kappa nuc 28881 T 50 | Kappa nuc 29402 T 51 | 52 | Lambda nuc 21786 T 53 | Lambda nuc 21789 T 54 | Lambda nuc 22917 A 55 | Lambda nuc 23031 C 56 | 57 | Mu nuc 3428 G 58 | Mu nuc 4878 T 59 | Mu nuc 11451 G 60 | Mu nuc 13057 T 61 | Mu nuc 17491 T 62 | Mu nuc 27925 A 63 | 64 | Omicron nuc 18163 G 65 | Omicron nuc 23599 G 66 | -------------------------------------------------------------------------------- /scripts/normalize_gisaid_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | GISAID_SARSCOV2_IN=$1 4 | GISAID_SARSCOV2_OUT=$2 5 | MIN_LENGTH=$3 6 | 7 | if [[ ! 
-r "$GISAID_SARSCOV2_IN" ]] 8 | then 9 | echo "$0: input $GISAID_SARSCOV2_IN not found" 10 | exit 1 11 | fi 12 | 13 | if [[ -z "$MIN_LENGTH" ]] 14 | then 15 | echo "Using default minimum length of 25000" 16 | MIN_LENGTH=25000 17 | fi 18 | 19 | echo "Normalizing GISAID file $GISAID_SARSCOV2_IN to $GISAID_SARSCOV2_OUT (min length $MIN_LENGTH)" 20 | 21 | # Remove leading virus name prefix from sequence names 22 | # Remove embedded spaces in sequence names (Hong Kong sequences) 23 | # Remove trailing |EPI_ISL_id|datestamp from sequence names 24 | # Remove sequences shorter than minimum length 25 | # Eliminate duplicate sequences (keep only the first seen) 26 | 27 | #cat $GISAID_SARSCOV2_IN | 28 | sed 's/^>[hn]Co[Vv]-19\//>/g' $GISAID_SARSCOV2_IN | # remove leading prefix 29 | sed 's/ //g' | # remove embedded spaces 30 | sed 's/|.*$//' | # remove trailing metadata 31 | awk "BEGIN{RS=\">\";FS=\"\n\"}length>$MIN_LENGTH{print \">\"\$0}" | # remove short seqs 32 | awk 'BEGIN{RS=">";FS="\n"}!x[$1]++{print ">"$0}' | # remove duplicates 33 | grep -v '^>*$' > $GISAID_SARSCOV2_OUT 34 | 35 | exit 0 36 | -------------------------------------------------------------------------------- /scripts/expand-clade-definitions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | usage: expand-clade-definitions 4 | 5 | Reads in a clade definitions file suitable for `augur clades` and expands any 6 | hierarchically-defined clades (i.e. clade definitions that inherit from earlier 7 | clade definitions). 8 | 9 | This should probably become a part of Augur in the future, as it's useful for 10 | programmatic manipulation of clade definitions files. 11 | """ 12 | import csv 13 | from augur.clades import read_in_clade_definitions 14 | from sys import stdout 15 | 16 | 17 | def write_out_clade_definitions(file, defs): 18 | out = csv.writer(file, dialect = "excel-tab", lineterminator = "\n") 19 | out.writerow(("clade", "gene", "site", "alt")) 20 | out.writerows( 21 | (clade, gene, site + 1, alt) 22 | for clade, muts in defs.items() 23 | for gene, site, alt in muts) 24 | 25 | 26 | if __name__ == "__main__": 27 | from argparse import ArgumentParser 28 | 29 | cli = ArgumentParser(description = __doc__.strip().split("\n\n", 1)[1]) 30 | cli.add_argument("clades", metavar = "") 31 | 32 | args = cli.parse_args() 33 | 34 | write_out_clade_definitions(stdout, read_in_clade_definitions(args.clades)) 35 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_curate_metadata/country_ordering/Slovakia_variants.txt: -------------------------------------------------------------------------------- 1 | Dubnica Nd Váhom Dubnica nad Vahom 2 | Mala N, Hronom Malá nad Hronom 3 | Turciansketeplice Turcianske Teplice 4 | Sucha Hora Suchá Hora 5 | Dolny Kubin Dolný Kubín 6 | Mala N, Hronom Malá nad Hronom 7 | Oravska Jasenic Oravska Jasenica 8 | Dvory N, Zitavou Dvory nad Žitavou 9 | Vranov nad Toplou Vranov nad Toplov 10 | Turcianska Stavnicka Turčianska Štiavnička 11 | Tvrodsin Tvrdosin 12 | Stropkov Region Stropkov 13 | Zilina Region Zilina 14 | Dolny Kubin Region Dolný Kubín 15 | Rimavka Sobota Rimavská Sobota 16 | Cierne Pri Cadci Čierne 17 | Diviaky Diviaky nad Nitricou 18 | Zlat Moravce Zlaté Moravce 19 | Komano Komarno 20 | Bardejov Region Bardejov 21 | Humenne Region Humenne 22 | Kosice Region Kosice 23 | Senica Region Senica 24 | Senec Region Senec 25 | Banovce nad Bebravou Region Banovce nad Bebravou 26 | Rimavske 
Sobota Rimavska Sobota 27 | Šaľa Region Sala 28 | Medzilaborce Region Medzilaborce 29 | Leopolodov Leopoldov 30 | Sobrance Region Sobrance 31 | Hlohovec Region Hlohovec 32 | Michalovce Region Michalovce 33 | Košice Region Kosice 34 | Humenné Region Humenné 35 | Nižné Raslavice Raslavice 36 | Rožňava Region Rožňava 37 | Trebišov Region Trebisov 38 | Nitra Region Nitra 39 | 40 | -------------------------------------------------------------------------------- /tests/remote-inputs-uncompressed/builds.yaml: -------------------------------------------------------------------------------- 1 | inputs: 2 | - name: test-remote-uncompressed-asia-sequences 3 | metadata: s3://nextstrain-data/files/ncov/test-data/asia_metadata.tsv 4 | sequences: s3://nextstrain-data/files/ncov/test-data/asia_sequences.fasta 5 | - name: test-remote-uncompressed-europe-aligned 6 | metadata: s3://nextstrain-data/files/ncov/test-data/europe_metadata.tsv 7 | aligned: s3://nextstrain-data/files/ncov/test-data/europe_aligned.fasta 8 | - name: test-remote-uncompressed-americas-filtered 9 | metadata: s3://nextstrain-data/files/ncov/test-data/americas_metadata.tsv 10 | filtered: s3://nextstrain-data/files/ncov/test-data/americas_filtered.fasta 11 | - name: references 12 | metadata: data/references_metadata.tsv 13 | sequences: data/references_sequences.fasta 14 | 15 | # As we are not including the test data from Asia (see above), this build will 16 | # be missing the default root sequence. We instead use 17 | # `data/references_sequences.fasta` that contains Wuhan/Hu-1/2019 18 | refine: 19 | root: "Wuhan/Hu-1/2019" 20 | 21 | builds: 22 | test-remote-uncompressed: 23 | subsampling_scheme: small 24 | 25 | subsampling: 26 | small: 27 | small-sample: 28 | group_by: "region" 29 | max_sequences: 100 30 | -------------------------------------------------------------------------------- /docs/src/visualization/interpretation.rst: -------------------------------------------------------------------------------- 1 | Guidance for interpretation 2 | =========================== 3 | 4 | Introductory resources 5 | ---------------------- 6 | 7 | - Visual explanation of how viral mutations and spread are related: https://www.nytimes.com/interactive/2020/04/30/science/coronavirus-mutations.html 8 | 9 | - Introduction to interpreting phylogenetic trees: https://nextstrain.org/narratives/trees-background/ 10 | 11 | - How to interact with Auspice (the engine for viewing trees): https://neherlab.org/201901_krisp_auspice.html 12 | 13 | - Overview of genomic epidemiology (older, but still relevant and clear): https://www.nature.com/articles/nrg2583 14 | 15 | Case Studies 16 | ------------ 17 | 18 | - UCSF-led analysis of genomic epi in California: https://science.sciencemag.org/content/early/2020/06/05/science.abb9263 19 | 20 | - UK analysis of hospital-acquired infections: https://www.medrxiv.org/content/10.1101/2020.05.08.20095687v1 21 | 22 | - UK's analysis of coronavirus introductions: https://virological.org/t/preliminary-analysis-of-sars-cov-2-importation-establishment-of-uk-transmission-lineages/507 23 | 24 | - Australia cluster detection: https://www.medrxiv.org/content/10.1101/2020.05.12.20099929v1 25 | 26 | - Nextstrain situation reports: https://nextstrain.org/ncov-sit-reps/ 27 | -------------------------------------------------------------------------------- /nextstrain_profiles/100k/README.md: -------------------------------------------------------------------------------- 1 | ## Aim 2 | 3 | To build a representative 100k dataset which is 
available for testing / developing builds locally. 4 | This is intended to run weekly via a GitHub action (which triggers jobs to be run on AWS). 5 | It will upload these files: 6 | 7 | * `s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz` 8 | * `s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz` 9 | * `s3://nextstrain-ncov-private/100k/metadata.tsv.xz` 10 | * `s3://nextstrain-ncov-private/100k/sequences.fasta.xz` 11 | 12 | While this profile is not recommended to be run locally, you can see what rules would be run via: 13 | 14 | ``` 15 | snakemake --cores 1 --configfile nextstrain_profiles/100k/config-gisaid.yaml -npf upload --dag | dot -Tpdf > dag-100k-gisaid.pdf 16 | snakemake --cores 1 --configfile nextstrain_profiles/100k/config-open.yaml -npf upload --dag | dot -Tpdf > dag-100k-open.pdf 17 | ``` 18 | 19 | To run manually you can trigger the GitHub action (recommended) or run the jobs locally via: 20 | ``` 21 | nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \ 22 | --configfile nextstrain_profiles/100k/config-gisaid.yaml \ 23 | -f upload 24 | nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \ 25 | --configfile nextstrain_profiles/100k/config-open.yaml \ 26 | -f upload 27 | ``` 28 | -------------------------------------------------------------------------------- /docs/src/_static/css/configuration-reference.css: -------------------------------------------------------------------------------- 1 | /* Custom CSS to be applied to the reference/workflow-config-file.rst 2 | page. That page defines a custom class of .configuration-reference */ 3 | 4 | 5 | /* We detail a lot of nested (snakemake) configuration entries in the 6 | page. The parent key (top-level config key) is

<h3> and sub-keys are <h4>. 7 | The default nextstrain-theme renders <h3> and <h4> extremely similarly. 8 | The following style changes are intended to convey that certain config 9 | entries are children of a higher-level config, rather than being top-level 10 | config parameters themselves */ 11 | 12 | .configuration-reference h4 { 13 | font-size: 100%; 14 | } 15 | 16 | /* Pad lists generated by a (local) contents directive showing sub-keys */ 17 | .configuration-reference section > section > div.contents.local.topic { 18 | margin-left: 24px; /* same as a nested <li>
  • */ 19 | margin-top: -20px; /* CSS can't select previous sibling, FYI */ 20 | } 21 | .configuration-reference section > section > div.contents.local.topic > ul > li { 22 | list-style: circle; 23 | } 24 | /* pad out their siblings (which come _after_ the list) so that they 25 | are in line with the start of text in the preceding
  • element */ 26 | .configuration-reference section > section > div.contents.local.topic ~ * { 27 | margin-left: 48px; 28 | } 29 | 30 | -------------------------------------------------------------------------------- /scripts/annotate_metadata_with_index.py: -------------------------------------------------------------------------------- 1 | """Annotate a metadata file with the given sequence index. 2 | """ 3 | import argparse 4 | from augur.io import read_metadata 5 | import pandas as pd 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 10 | parser.add_argument("--metadata", required=True, help="metadata to annotate") 11 | parser.add_argument("--sequence-index", required=True, help="sequence index from augur index") 12 | parser.add_argument("--output", required=True, help="metadata annotated with sequence index columns including a 'length' column based on the number of A, C, G, and T bases.") 13 | 14 | args = parser.parse_args() 15 | 16 | metadata = read_metadata(args.metadata) 17 | 18 | index = pd.read_csv( 19 | args.sequence_index, 20 | sep="\t", 21 | ).drop( 22 | columns=["length"], 23 | ) 24 | index["length"] = index.loc[:, ["A", "C", "G", "T"]].sum(axis=1) 25 | new_columns = { 26 | column: f"_{column}" 27 | for column in index.columns 28 | if column != "strain" 29 | } 30 | index = index.rename(columns=new_columns) 31 | 32 | metadata.merge( 33 | index, 34 | on="strain", 35 | ).to_csv( 36 | args.output, 37 | sep="\t", 38 | index=False, 39 | ) 40 | -------------------------------------------------------------------------------- /docs/src/guides/update-workflow.rst: -------------------------------------------------------------------------------- 1 | Update the workflow 2 | =================== 3 | 4 | We update the official workflow regularly with: 5 | 6 | - curated metadata including latitudes/longitudes, clade annotations, and low quality sequences 7 | - bug fixes 8 | - :doc:`new features <../reference/change_log>` 9 | 10 | Update your local copy of the workflow, to benefit from these changes. 11 | 12 | .. code:: bash 13 | 14 | # Download and apply changes from the Nextstrain team. 15 | # This only works if there is no conflict with your local repository. 16 | git pull --ff-only origin master 17 | 18 | # OR: 19 | 20 | # Alternately, download and apply changes from the Nextstrain team 21 | # and then replay your local changes on top of those incoming changes. 22 | git pull --rebase origin master 23 | 24 | Alternately, download a specific version of the workflow that you know works for you. We create new `releases of the workflow `__ any time we introduce breaking changes, so you can choose when to update based on :doc:`what has changed <../reference/change_log>`. 25 | 26 | .. code:: bash 27 | 28 | # Download version 7 (v7) of the workflow. 29 | curl -OL https://github.com/nextstrain/ncov/archive/refs/tags/v7.zip 30 | 31 | # Uncompress the workflow. 32 | unzip v7.zip 33 | 34 | # Change into the workflow's directory. 35 | cd ncov-7/ 36 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## Description of proposed changes 2 | 3 | What is the goal of this pull request? What does this pull request change? 4 | 5 | ## Related issue(s) 6 | 7 | 8 | Fixes # 9 | Related to # 10 | 11 | ## Testing 12 | 13 | What steps should be taken to test the changes you've proposed? 
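For changes to the workflow itself, one possible local smoke test — a sketch rather than an official requirement, assuming Snakemake is available and reusing this repository's CI profile (`nextstrain_profiles/nextstrain-ci`) — is a dry run that lists the jobs the CI build would execute:

```bash
# Dry-run (-np) the CI profile: print the rules and commands that would run, without executing them.
snakemake --cores 4 -np --profile nextstrain_profiles/nextstrain-ci
```
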
14 | If you added or changed behavior in the codebase, did you update the tests, or do you need help with this? 15 | 16 | ## Release checklist 17 | 18 | If this pull request introduces backward incompatible changes, complete the following steps for a new release of the workflow: 19 | 20 | - [ ] Determine the version number for the new release by incrementing [the most recent release](https://github.com/nextstrain/ncov/releases) (e.g., "v2" from "v1"). 21 | - [ ] Update `docs/src/reference/change_log.md` in this pull request to document these changes and the new version number. 22 | - [ ] After merging, [create a new GitHub release](https://github.com/nextstrain/ncov/releases/new) with the new version number as the tag and release title. 23 | 24 | If this pull request introduces new features, complete the following steps: 25 | 26 | - [ ] Update `docs/src/reference/change_log.md` in this pull request to document these changes by the date they were added. 27 | 28 | 29 | -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-ci/builds.yaml: -------------------------------------------------------------------------------- 1 | # Only use one build for CI. 2 | active_builds: europe 3 | 4 | # Override full GISAID data with example data for a faster build. 5 | inputs: 6 | - name: gisaid 7 | metadata: "data/example_metadata.tsv" 8 | sequences: "data/example_sequences.fasta.gz" 9 | 10 | warning: "Test warning" 11 | 12 | builds: 13 | # Override the default Nextstrain European build's subsampling scheme for more 14 | # stable subsampling of a fixed dataset in continuous integration tests. 15 | europe: 16 | subsampling_scheme: nextstrain_ci_sampling 17 | region: Europe 18 | 19 | subsampling: 20 | # Custom subsampling logic for CI tests. 21 | nextstrain_ci_sampling: 22 | # Focal samples for region 23 | region: 24 | group_by: "division year month" 25 | max_sequences: 20 26 | sampling_scheme: "--no-probabilistic-sampling" 27 | exclude: "--exclude-where 'region!={region}'" 28 | # Contextual samples for region from the rest of the world 29 | global: 30 | group_by: "year month" 31 | max_sequences: 10 32 | sampling_scheme: "--no-probabilistic-sampling" 33 | exclude: "--exclude-where 'region={region}'" 34 | priorities: 35 | type: "proximity" 36 | focus: "region" 37 | 38 | # Override default frequency settings, so we can estimate frequencies from older 39 | # data with a fixed time range. 
40 | frequencies: 41 | min_date: 2020-01-01 42 | max_date: 2020-05-10 43 | -------------------------------------------------------------------------------- /workflow/schemas/config.schema.yaml: -------------------------------------------------------------------------------- 1 | $schema: https://json-schema.org/draft/2020-12/schema 2 | 3 | description: snakemake configuration file 4 | 5 | type: object 6 | 7 | properties: 8 | inputs: 9 | type: 10 | - array 11 | minItems: 1 12 | items: 13 | type: object 14 | required: 15 | - name 16 | - metadata 17 | properties: 18 | name: 19 | type: string 20 | minLength: 1 21 | metadata: 22 | type: string 23 | minLength: 1 24 | sequences: 25 | type: string 26 | minLength: 1 27 | aligned: 28 | type: string 29 | minLength: 1 30 | skip_sanitize_metadata: 31 | type: boolean 32 | deduplicated: 33 | type: boolean 34 | additionalProperties: false 35 | 36 | builds: 37 | type: object 38 | minProperties: 1 39 | propertyNames: 40 | # Allow build names to contain alphanumeric characters, underscores, and hyphens 41 | # but not special strings used for Nextstrain builds. Also used in the 42 | # workflow's wildcard_constraints. 43 | pattern: "^(?:[-a-zA-Z0-9_](?!tip-frequencies|root-sequence))+$" 44 | 45 | S3_DST_COMPRESSION: 46 | type: string 47 | enum: 48 | - gz 49 | - xz 50 | 51 | S3_DST_ORIGINS: 52 | type: array 53 | minItems: 1 54 | items: 55 | type: string 56 | # A similar pattern is used in the workflow's wildcard constraints. 57 | pattern: "^[a-zA-Z0-9-]+$" 58 | 59 | -------------------------------------------------------------------------------- /docs/translation_docs.md: -------------------------------------------------------------------------------- 1 | ## Translating Nextstrain Situation Reports 2 | 3 | We welcome translations of the situation reports (narratives) into languages other than English (in particular to languages commonly spoken in areas affected by the outbreak). We're incredibly grateful for and impressed by the contributions provided already! 4 | 5 | ### Getting started 6 | 7 | 1. Check to see if the situation report is already available in your language on [the Nextstrain homepage](https://nextstrain.org). If the date is the same for the English version and the version in your language, then it's already up to date! :) 8 | 9 | 2. Find your language\* on the [translation project board](https://github.com/nextstrain/ncov/projects/1), and comment on the issue so we know you're working on it. 10 | 11 | 3. Follow the instructions in the issue to submit your translation. 12 | 13 | 4. **When you're done, please remember to move the issue to the "ready for review" column [in the project board](https://github.com/nextstrain/ncov/projects/1).** This helps us keep everything moving smoothly. 14 | 15 | 5. When your translation has been reviewed and approved by a second translator, we'll publish it and put it on the Nextstrain homepage! 16 | 17 | ### \*If your language isn't listed on the project board 18 | 19 | We'd love to add even more languages! [Please open an issue here](https://github.com/nextstrain/ncov/issues/new?assignees=cassiawag&labels=&template=translation--community-request-.md&title=%5BLanguage+translation+request%5D); we'll get back to you right away! 
20 | -------------------------------------------------------------------------------- /scripts/construct-recency-from-submission-date.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from datetime import datetime 3 | from augur.io import read_metadata 4 | import json 5 | 6 | def get_recency(date_str, ref_date): 7 | date_submitted = datetime.strptime(date_str, '%Y-%m-%d').toordinal() 8 | ref_day = ref_date.toordinal() 9 | 10 | delta_days = ref_day - date_submitted 11 | if delta_days<=0: 12 | return 'New' 13 | elif delta_days<3: 14 | return '1-2 days ago' 15 | elif delta_days<8: 16 | return '3-7 days ago' 17 | elif delta_days<15: 18 | return 'One week ago' 19 | elif delta_days<31: 20 | return 'One month ago' 21 | elif delta_days>=31: 22 | return 'Older' 23 | 24 | if __name__ == '__main__': 25 | parser = argparse.ArgumentParser( 26 | description="Assign each sequence a field that specifies when it was added", 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 28 | ) 29 | 30 | parser.add_argument('--metadata', type=str, required=True, help="metadata file") 31 | parser.add_argument('--output', type=str, required=True, help="output json") 32 | args = parser.parse_args() 33 | 34 | meta = read_metadata(args.metadata) 35 | 36 | node_data = {'nodes':{}} 37 | ref_date = datetime.now() 38 | 39 | for strain, d in meta.iterrows(): 40 | if 'date_submitted' in d and d['date_submitted'] and d['date_submitted'] != "undefined": 41 | node_data['nodes'][strain] = {'recency': get_recency(d['date_submitted'], ref_date)} 42 | 43 | with open(args.output, 'wt') as fh: 44 | json.dump(node_data, fh) 45 | -------------------------------------------------------------------------------- /.github/workflows/revert.yml: -------------------------------------------------------------------------------- 1 | name: Revert nextstrain.org/ncov/gisaid or nextstrain.org/ncov/open 2 | 3 | on: 4 | # Manually triggered using GitHub's UI 5 | workflow_dispatch: 6 | inputs: 7 | data_source_name: 8 | description: Name of data source corresponding to the datasets on nextstrain.org/ncov to reset. Options are "gisaid" or "open". 9 | required: true 10 | build_region_name: 11 | description: A single regional dataset on nextstrain.org/ncov/{data_source_name} to reset. Options are "global", "africa", "asia", "europe", "north-america", "oceania", "south-america". If not specified, reverts all. If you'd like to revert multiple regions but not all, run the action multiple times, specifying one region each time. 12 | required: true 13 | default: all 14 | date: 15 | description: Date to revert to. A corresponding set of date-stamped datasets must exist on the s3 bucket. Format is YYYY-MM-DD. 
16 | required: true 17 | 18 | env: 19 | DATA_SOURCE_NAME: ${{ github.event.inputs.data_source_name }} 20 | BUILD_REGION_NAME: ${{ github.event.inputs.build_region_name }} 21 | DATE: ${{ github.event.inputs.date }} 22 | 23 | jobs: 24 | revert: 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@v6 28 | 29 | - uses: nextstrain/.github/actions/setup-nextstrain-cli@master 30 | 31 | - name: Revert build 32 | run: | 33 | ./scripts/revert "$DATA_SOURCE_NAME" "$BUILD_REGION_NAME" "$DATE" 34 | env: 35 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 36 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Files created by the pipeline, which we want to keep out of git 2 | # (or at least out of _this_ git repo). 3 | benchmarks/ 4 | logs/ 5 | results/ 6 | build/ 7 | auspice/ 8 | data/* 9 | defaults/colors.tsv 10 | defaults/colors_*.tsv 11 | # stats.json was removed in Snakemake v8 12 | stats.json 13 | 14 | # common analysis directory names 15 | ncov-tutorial/ 16 | my-ncov-analyses/ 17 | 18 | # old analysis directory 19 | my_profiles/* 20 | !my_profiles/README.md 21 | 22 | # Downloaded remote files from sources we expect 23 | /nextstrain-ncov-private/ 24 | /nextstrain-data/ 25 | /data.nextstrain.org/ 26 | 27 | # Sensitive environment variables 28 | environment* 29 | 30 | # Snakemake state dir 31 | /.snakemake 32 | snakemake_log 33 | 34 | # Local config overrides 35 | /config_local.yaml 36 | 37 | # For Python # 38 | ############## 39 | *.pyc 40 | .tox/ 41 | .cache/ 42 | 43 | # Compiled source # 44 | ################### 45 | *.com 46 | *.class 47 | *.dll 48 | *.exe 49 | *.o 50 | *.so 51 | 52 | # OS generated files # 53 | ###################### 54 | .DS_Store 55 | .DS_Store? 56 | ._* 57 | .Spotlight-V100 58 | .Trashes 59 | Icon? 60 | ehthumbs.db 61 | Thumbs.db 62 | *~ 63 | 64 | # IDE generated files # 65 | ###################### 66 | .vscode/ 67 | 68 | narratives/*pdf 69 | 70 | 71 | # metadata / new seqs scripts 72 | scripts/curate_metadata/inputs_new_sequences/ 73 | scripts/curate_metadata/output_curate_metadata/ 74 | scripts/curate_metadata/outputs_new_sequences/ 75 | scripts/curate_metadata/config_curate_metadata/geoLocationRules.txt 76 | scripts/curate_metadata/config_curate_metadata/manualAnnotationRules.txt 77 | 78 | 79 | # SLURM log files 80 | slurm-*.out 81 | -------------------------------------------------------------------------------- /docs/src/reference/glossary.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Glossary 3 | ======== 4 | 5 | .. glossary:: 6 | 7 | analysis directory 8 | 9 | The folder within ``ncov/`` where :term:`customization files ` live. Previously this was ``my_profiles/`` but we now allow any name of choice, and provide `ncov-tutorial `__ as a starter template. 10 | 11 | Auspice config file 12 | also ``auspice_config.json`` 13 | 14 | A JSON file used to configure visualization in :term:`docs.nextstrain.org:Auspice`. 15 | 16 | config file 17 | also *workflow config file*, *workflow configuration file*, ``builds.yaml`` 18 | 19 | A YAML file used to `configure `__ the :term:`Snakemake` workflow (via the ``--configfile`` option). Appends to and overrides default configuration in ``defaults/parameters.yaml``. For the :term:`ncov workflow`, this file must follow a :doc:`specific format `. 
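      A minimal sketch of such a file (illustrative only — the input paths, build name, and subsampling scheme below are placeholders modelled on the test configs elsewhere in this repository):

      .. code:: yaml

         inputs:
           - name: example-data
             metadata: data/example_metadata.tsv
             sequences: data/example_sequences.fasta.gz

         builds:
           example-build:
             subsampling_scheme: small

         subsampling:
           small:
             small-sample:
               group_by: "region"
               max_sequences: 100
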
20 | 21 | customization file 22 | 23 | A file used to customize the :term:`ncov workflow`. 24 | 25 | Examples: :term:`Auspice config file`, :term:`workflow config file`, :term:`default files` 26 | 27 | default files 28 | 29 | Default :term:`customization files ` provided in ``ncov/defaults/``. 30 | 31 | ncov workflow 32 | also *SARS-CoV-2 workflow* 33 | 34 | The workflow used to automate execution of :term:`builds`. Implemented in :term:`Snakemake`. 35 | 36 | Snakemake 37 | 38 | The workflow manager used in the :term:`ncov workflow`. 39 | -------------------------------------------------------------------------------- /scripts/rename_clades.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser( 6 | description="Rename clades in clades.tsv", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | parser.add_argument('--input-clade-files', type=str, nargs='+', required=True, help="input clade files") 11 | parser.add_argument('--name-mapping', type=str, required=False, help="YAML mapping between Nextstrain clades and display names") 12 | parser.add_argument('--output-clades', type=str, required=True, help="renamed clade file") 13 | args = parser.parse_args() 14 | 15 | # read name mapping from input yaml file 16 | if args.name_mapping: 17 | with open(args.name_mapping) as fh: 18 | name_mapping = yaml.load(fh, Loader=yaml.FullLoader) 19 | else: 20 | name_mapping = {} 21 | 22 | 23 | # write output into one consolidated file 24 | out_clades = open(args.output_clades, "w") 25 | 26 | # loop over input file and replace clade names were appropriate line by line 27 | for fname in args.input_clade_files: 28 | with open(fname) as fh: 29 | for line in fh: 30 | fields = line.strip('\n').split('\t') 31 | if len(fields) < 3: 32 | continue 33 | fields[0] = name_mapping.get(fields[0], fields[0]) 34 | # if clade definition is based on other clade, replace name 35 | if fields[1]=='clade': 36 | fields[2] = name_mapping.get(fields[2], fields[2]) 37 | out_clades.write('\t'.join(fields)+'\n') 38 | 39 | out_clades.close() 40 | -------------------------------------------------------------------------------- /docs/redirects.yaml: -------------------------------------------------------------------------------- 1 | # Authoritative list of redirects we have configured in RTD. See the 2 | # docs.nextstrain.org repo's README.md¹ for more information on maintaining 3 | # redirects. 
4 | # 5 | # ¹ https://github.com/nextstrain/docs.nextstrain.org#configuring-redirects 6 | --- 7 | - type: page 8 | from_url: /analysis/ 9 | to_url: /tutorial/index.html 10 | 11 | - type: page 12 | from_url: /analysis/index.html 13 | to_url: /tutorial/index.html 14 | 15 | - type: page 16 | from_url: /analysis/customizing-analysis.html 17 | to_url: /guides/workflow-config-file.html 18 | 19 | - type: page 20 | from_url: /analysis/customizing-visualization.html 21 | to_url: /guides/customizing-visualization.html 22 | 23 | - type: page 24 | from_url: /analysis/data-prep.html 25 | to_url: /guides/data-prep/index.html 26 | 27 | - type: page 28 | from_url: /analysis/orientation-files.html 29 | to_url: /reference/files.html 30 | 31 | - type: page 32 | from_url: /analysis/orientation-workflow.html 33 | to_url: /reference/nextstrain-overview.html 34 | 35 | - type: page 36 | from_url: /analysis/running.html 37 | to_url: /reference/troubleshoot.html 38 | 39 | - type: page 40 | from_url: /analysis/setup.html 41 | to_url: /tutorial/setup.html 42 | 43 | - type: page 44 | from_url: /videos.html 45 | to_url: /tutorial/videos.html 46 | 47 | - type: page 48 | from_url: /reference/configuration.html 49 | to_url: /reference/workflow-config-file.html 50 | 51 | - type: page 52 | from_url: /reference/multiple_inputs.html 53 | to_url: / 54 | 55 | - type: page 56 | from_url: /visualization/index.html 57 | to_url: /visualization/sharing.html 58 | 59 | - type: page 60 | from_url: /guides/index.html 61 | to_url: /guides/run-analysis-on-terra.html 62 | -------------------------------------------------------------------------------- /scripts/upload-to-s3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | bin="$(dirname "$0")" 5 | 6 | main() { 7 | local quiet=0 8 | 9 | for arg; do 10 | case "$arg" in 11 | --quiet) 12 | quiet=1 13 | shift;; 14 | *) 15 | break;; 16 | esac 17 | done 18 | 19 | local src="${1:?A source file is required as the first argument.}" 20 | local dst="${2:?A destination s3:// URL is required as the second argument.}" 21 | 22 | local s3path="${dst#s3://}" 23 | local bucket="${s3path%%/*}" 24 | local key="${s3path#*/}" 25 | 26 | local src_hash dst_hash no_hash=0000000000000000000000000000000000000000000000000000000000000000 27 | src_hash="$("$bin/sha256sum" < "$src")" 28 | dst_hash="$(aws s3api head-object --bucket "$bucket" --key "$key" --query Metadata.sha256sum --output text || echo "$no_hash")" 29 | 30 | echo "$src_hash $src" 31 | echo "$dst_hash $dst" 32 | 33 | if [[ $src_hash != "$dst_hash" ]]; then 34 | echo "Uploading $src → $dst" 35 | aws s3 cp --no-progress "$src" "$dst" --metadata sha256sum="$src_hash" "$(content-type "$dst")" 36 | else 37 | echo "Files are identical, skipping upload" 38 | fi 39 | } 40 | 41 | content-type() { 42 | case "$1" in 43 | *.tsv) echo --content-type=text/tab-separated-values;; 44 | *.csv) echo --content-type=text/comma-separated-values;; 45 | *.ndjson) echo --content-type=application/x-ndjson;; 46 | *.json) echo --content-type=application/json;; 47 | *.gz) echo --content-type=application/gzip;; 48 | *.xz) echo --content-type=application/x-xz;; 49 | *) echo --content-type=text/plain;; 50 | esac 51 | } 52 | 53 | main "$@" 54 | -------------------------------------------------------------------------------- /scripts/add_priorities_to_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add column to metadata with the priorities of 'context' 
sequences 3 | relative to the 'focal' samples 4 | """ 5 | 6 | import argparse 7 | import pandas as pd 8 | import csv 9 | import json 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser( 13 | description="Add columns for priorities of sequences relative to diff focal regions", 14 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 15 | ) 16 | parser.add_argument("--metadata", type = str, required=True, help="metadata") 17 | parser.add_argument("--priorities", type=str, nargs="+", required=True, help="priorities files") 18 | parser.add_argument("--config", type=str, help="config file to modify") 19 | parser.add_argument("--output-meta", type=str, required=True, help="adjusted metadata") 20 | parser.add_argument("--output-config", type=str, help="modified config") 21 | args = parser.parse_args() 22 | 23 | metadata = pd.read_csv(args.metadata, sep='\t') 24 | with open(args.config) as fh: 25 | input_json = json.load(fh) 26 | 27 | for priority_file in args.priorities: 28 | p_f = priority_file.replace(".tsv", "") 29 | region = p_f.split("_")[2] 30 | column_name = "".join(["priorities_",region]) 31 | 32 | with open(priority_file, 'r') as f: 33 | reader = csv.reader(f, delimiter='\t') 34 | priors = {r[0]: r[1] for r in reader if len(r)>1} 35 | 36 | assign_priors = [priors[st] if st in priors else "" for st in metadata.strain] 37 | 38 | metadata.insert(11, column_name, assign_priors) 39 | input_json['colorings'].append({'key': column_name, 'type': 'continuous'}) 40 | 41 | metadata.to_csv(args.output_meta, index=False, sep="\t") 42 | 43 | with open(args.output_config, 'w') as fh: 44 | json.dump(input_json, fh, indent=2) 45 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-gisaid.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild GISAID phylogenetic datasets 2 | 3 | on: 4 | # This workflow can be triggered from repository_dispatch events, 5 | # for instance, after the appropriate preprocessing actions have completed 6 | repository_dispatch: 7 | types: 8 | - rebuild 9 | - gisaid/rebuild 10 | # Manually triggered using GitHub's UI 11 | workflow_dispatch: 12 | inputs: 13 | trial_name: 14 | description: "Short name for this trial build, for prefixing the uploaded data and results files. WARNING: without this we will overwrite files in s3://nextstrain-ncov-private and the trees on nextstrain.org/ncov/gisaid..." 15 | required: false 16 | image: 17 | description: 'Specific container image to use for build (will override the default of "nextstrain build")' 18 | required: false 19 | 20 | jobs: 21 | gisaid: 22 | permissions: 23 | id-token: write 24 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 25 | secrets: inherit 26 | with: 27 | runtime: aws-batch 28 | env: | 29 | TRIAL_NAME: ${{ github.event.inputs.trial_name }} 30 | NEXTSTRAIN_DOCKER_IMAGE: ${{ github.event.inputs.image }} 31 | run: | 32 | set -x 33 | 34 | declare -a config 35 | config+=(build_date=\'$(date +'%Y-%m-%d')\') 36 | if [[ "$TRIAL_NAME" ]]; then 37 | config+=( 38 | S3_DST_BUCKET=nextstrain-ncov-private/trial/"$TRIAL_NAME" 39 | deploy_url=s3://nextstrain-staging/ 40 | auspice_json_prefix=ncov_gisaid_trial_"$TRIAL_NAME" 41 | ) 42 | else 43 | config+=(slack_token=$SLACK_TOKEN) 44 | fi 45 | 46 | nextstrain build \ 47 | --detach \ 48 | --cpus 72 \ 49 | --memory 140GiB \ 50 | . 
\ 51 | deploy \ 52 | upload \ 53 | --config "${config[@]}" \ 54 | --profile nextstrain_profiles/nextstrain-gisaid \ 55 | --set-threads tree=8 56 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-gisaid-21L.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild GISAID 21L phylogenetic datasets 2 | 3 | on: 4 | # This workflow can be triggered from repository_dispatch events, 5 | # for instance, after the appropriate preprocessing actions have completed 6 | repository_dispatch: 7 | types: 8 | - rebuild 9 | - gisaid/rebuild 10 | # Manually triggered using GitHub's UI 11 | workflow_dispatch: 12 | inputs: 13 | trial_name: 14 | description: "Short name for this trial build, for prefixing the uploaded data and results files. WARNING: without this we will overwrite files in s3://nextstrain-ncov-private and the trees on nextstrain.org/ncov/gisaid..." 15 | required: false 16 | image: 17 | description: 'Specific container image to use for build (will override the default of "nextstrain build")' 18 | required: false 19 | 20 | jobs: 21 | gisaid-21L: 22 | permissions: 23 | id-token: write 24 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 25 | secrets: inherit 26 | with: 27 | runtime: aws-batch 28 | env: | 29 | TRIAL_NAME: ${{ github.event.inputs.trial_name }} 30 | NEXTSTRAIN_DOCKER_IMAGE: ${{ github.event.inputs.image }} 31 | run: | 32 | set -x 33 | 34 | declare -a config 35 | config+=(build_date=\'$(date +'%Y-%m-%d')\') 36 | if [[ "$TRIAL_NAME" ]]; then 37 | config+=( 38 | S3_DST_BUCKET=nextstrain-ncov-private/trial/"$TRIAL_NAME" 39 | deploy_url=s3://nextstrain-staging/ 40 | auspice_json_prefix=ncov_gisaid_21L_trial_"$TRIAL_NAME" 41 | ) 42 | else 43 | config+=(slack_token=$SLACK_TOKEN) 44 | fi 45 | 46 | nextstrain build \ 47 | --detach \ 48 | --cpus 72 \ 49 | --memory 140GiB \ 50 | . \ 51 | deploy \ 52 | upload \ 53 | --config "${config[@]}" \ 54 | --profile nextstrain_profiles/nextstrain-gisaid-21L \ 55 | --set-threads tree=8 56 | -------------------------------------------------------------------------------- /workflow/wdl/ncov_workflow.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "tasks/nextstrain.wdl" as nextstrain 4 | 5 | workflow Nextstrain_WRKFLW { 6 | input { 7 | # ncov 8 | # Option 1: Pass in a sequence and metadata files, create a configfile_yaml 9 | File? sequence_fasta 10 | File? metadata_tsv 11 | File? context_targz #<= optional contextual seqs in a tarball 12 | String? build_name 13 | 14 | # Option 2: Use a custom config file (e.g. builds.yaml) with https or s3 sequence or metadata files 15 | File? configfile_yaml 16 | File? custom_zip # optional modifier: add a my_profiles.zip folder for my_auspice_config.json 17 | String? active_builds # optional modifier: specify "Wisconsin,Minnesota,Iowa" 18 | 19 | # Option 3? GISAID augur zip? 20 | # File? gisaid_zip # tarball 21 | 22 | # Optional Keys for deployment 23 | String? s3deploy 24 | String? AWS_ACCESS_KEY_ID 25 | String? AWS_SECRET_ACCESS_KEY 26 | 27 | # By default, run the ncov workflow (can swap it for zika or something else) 28 | String pathogen_giturl = "https://github.com/nextstrain/ncov/archive/refs/heads/master.zip" 29 | Int? cpu 30 | Int? memory # in GiB 31 | Int? 
disk_size 32 | } 33 | 34 | call nextstrain.nextstrain_build as build { 35 | input: 36 | # Option 1 37 | sequence_fasta = sequence_fasta, 38 | metadata_tsv = metadata_tsv, 39 | context_targz = context_targz, 40 | build_name = build_name, 41 | 42 | # Option 2 43 | configfile_yaml = configfile_yaml, 44 | custom_zip = custom_zip, 45 | active_builds = active_builds, 46 | 47 | # Optional deploy to s3 site 48 | s3deploy = s3deploy, 49 | AWS_ACCESS_KEY_ID = AWS_ACCESS_KEY_ID, 50 | AWS_SECRET_ACCESS_KEY = AWS_SECRET_ACCESS_KEY, 51 | 52 | pathogen_giturl = pathogen_giturl, 53 | cpu = cpu, 54 | memory = memory, 55 | disk_size = disk_size 56 | } 57 | 58 | output { 59 | File auspice_zip = build.auspice_zip 60 | File results_zip = build.results_zip 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /.github/workflows/rebuild-open.yml: -------------------------------------------------------------------------------- 1 | name: Rebuild open (GenBank) phylogenetic datasets 2 | 3 | on: 4 | # This workflow can be triggered from repository_dispatch events, 5 | # for instance, after the appropriate preprocessing actions have completed 6 | repository_dispatch: 7 | types: 8 | - rebuild 9 | - open/rebuild 10 | - genbank/rebuild 11 | # Manually triggered using GitHub's UI 12 | workflow_dispatch: 13 | inputs: 14 | trial_name: 15 | description: "Short name for this trial build, for prefixing the uploaded data and results files. WARNING: without this we will overwrite files in s3://nextstrain-data/files/ncov/open/ and the trees on nextstrain.org/ncov/open/..." 16 | required: false 17 | image: 18 | description: 'Specific container image to use for build (will override the default of "nextstrain build")' 19 | required: false 20 | 21 | jobs: 22 | open: 23 | permissions: 24 | id-token: write 25 | uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master 26 | secrets: inherit 27 | with: 28 | runtime: aws-batch 29 | env: | 30 | TRIAL_NAME: ${{ github.event.inputs.trial_name }} 31 | NEXTSTRAIN_DOCKER_IMAGE: ${{ github.event.inputs.image }} 32 | run: | 33 | set -x 34 | 35 | declare -a config 36 | config+=(build_date=\'$(date +'%Y-%m-%d')\') 37 | if [[ "$TRIAL_NAME" ]]; then 38 | config+=( 39 | S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/"$TRIAL_NAME" 40 | deploy_url=s3://nextstrain-staging/ 41 | auspice_json_prefix=ncov_open_trial_"$TRIAL_NAME" 42 | ) 43 | else 44 | config+=(slack_token=$SLACK_TOKEN) 45 | fi 46 | 47 | nextstrain build \ 48 | --detach \ 49 | --cpus 72 \ 50 | --memory 140GiB \ 51 | . 
\ 52 | deploy \ 53 | upload \ 54 | --config "${config[@]}" \ 55 | --profile nextstrain_profiles/nextstrain-open \ 56 | --set-threads tree=8 57 | -------------------------------------------------------------------------------- /nextstrain_profiles/100k/config-gisaid.yaml: -------------------------------------------------------------------------------- 1 | 2 | S3_DST_BUCKET: "nextstrain-ncov-private/100k" 3 | S3_DST_ORIGINS: [needed-for-workflow-but-unused] 4 | deploy_url: needed_for_workflow_but_unused 5 | 6 | custom_rules: 7 | - workflow/snakemake_rules/export_for_nextstrain.smk 8 | 9 | # Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment 10 | inputs: 11 | - name: gisaid 12 | metadata: "s3://nextstrain-ncov-private/metadata.tsv.zst" 13 | aligned: "s3://nextstrain-ncov-private/sequences.fasta.zst" 14 | skip_sanitize_metadata: true 15 | deduplicated: true 16 | 17 | builds: 18 | 100k: 19 | subsampling_scheme: 100k_scheme 20 | 21 | # mapping of remote: local files to be uploaded under S3_DST_BUCKET 22 | upload: 23 | metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz 24 | sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz 25 | 26 | # remove sequences without division label in US 27 | filter: 28 | exclude_where: "division='USA'" 29 | 30 | # We wish to subsample 50k in the previous 12 months and 50k prior to that. 31 | # Note 1: both --max-date and --min-date are inclusive of the boundary date, 32 | # so sequences from that date will be available to both sub-samples 33 | # Note 2: As we group by (year,month) the boundary month will be included 34 | # in both sub-samples and thus this month will be oversampled. 35 | # For instance, if the boundary is March 7th then `50k_early` will sample 36 | # the same number of genomes from the first week of March as each of the 37 | # preceding months; similarly `50k_late` will sample as many genomes from 38 | # the final ~3 weeks of March as each of the following full months. 
39 | # (see https://github.com/nextstrain/ncov/pull/1032#discussion_r1034087312) 40 | subsampling: 41 | 100k_scheme: 42 | 50k_early: 43 | group_by: "year month country" 44 | max_sequences: 50000 45 | max_date: "--max-date 1Y" 46 | 50k_late: 47 | group_by: "year month country" 48 | max_sequences: 50000 49 | min_date: "--min-date 1Y" 50 | -------------------------------------------------------------------------------- /scripts/adjust_regional_meta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Add column to metadata to denote 'focal' samples based on supplied region 3 | Rewrite location, division and country for non-focal samples to be region 4 | """ 5 | 6 | import argparse 7 | import pandas as pd 8 | 9 | if __name__ == '__main__': 10 | parser = argparse.ArgumentParser( 11 | description="Add column to metadata to denote 'focal' samples based on supplied region", 12 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 13 | ) 14 | parser.add_argument("--metadata", type = str, required=True, help="metadata") 15 | parser.add_argument("--region", type=str, required=False, help="focal region") 16 | parser.add_argument("--country", type=str, required=False, help="focal country") 17 | parser.add_argument("--division", type=str, required=False, help="focal division") 18 | parser.add_argument("--location", type=str, required=False, help="focal location") 19 | parser.add_argument("--composite", type=str, required=False, help="composite sampling") 20 | parser.add_argument("--output", type=str, required=True, help="adjusted metadata") 21 | args = parser.parse_args() 22 | 23 | region_list = ["Asia", "Africa", "Europe", "North America", "Oceania", "South America"] 24 | 25 | metadata = pd.read_csv(args.metadata, delimiter='\t') 26 | 27 | # if in region list, then do the fixing 28 | if args.region in region_list: 29 | focal_region = args.region 30 | else: # otherwise just write out metadata as is, and proceed 31 | metadata.to_csv(args.output, index=False, sep="\t") 32 | exit() 33 | 34 | print("Adjusting metadata for focal region", args.region) 35 | 36 | 37 | metadata.insert(12, 'focal', True) 38 | 39 | metadata.loc[metadata.region != focal_region, 'focal'] = False 40 | metadata.loc[metadata.region != focal_region, 'location'] = "" 41 | metadata.loc[metadata.region != focal_region, 'division'] = metadata.region 42 | metadata.loc[metadata.region != focal_region, 'country'] = metadata.region 43 | 44 | metadata.to_csv(args.output, index=False, sep="\t") 45 | -------------------------------------------------------------------------------- /scripts/calculate_epiweek.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from augur.io import read_metadata 4 | from augur.utils import write_json 5 | import epiweeks 6 | import pandas as pd 7 | import re 8 | 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser( 12 | usage="Calculate epiweeks for dates in the given metadata", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 14 | ) 15 | parser.add_argument("--metadata", required=True, help="metadata with a 'date' column") 16 | parser.add_argument("--metadata-id-columns", default=["strain", "name", "Virus name"], nargs="+", help="names of valid metadata columns containing identifier information like 'strain' or 'name'") 17 | parser.add_argument("--attribute-name", default="epiweek", help="name to store annotations of epiweeks in JSON output") 18 | 
parser.add_argument("--output-node-data", required=True, help="node data JSON with epiweek annotations") 19 | 20 | args = parser.parse_args() 21 | 22 | metadata = read_metadata( 23 | args.metadata, 24 | id_columns=args.metadata_id_columns, 25 | ) 26 | 27 | # Find records with unambiguous dates. These must be complete date-like 28 | # records in YYYY-MM-DD format. 29 | date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}$") 30 | has_complete_date = metadata["date"].astype(str).apply(lambda date: date_pattern.match(date) is not None) 31 | metadata_with_dates = metadata.loc[has_complete_date, ["date"]].copy() 32 | 33 | # Convert date strings to timestamps. 34 | metadata_with_dates["date"] = pd.to_datetime(metadata_with_dates["date"]) 35 | 36 | # Calculate epiweeks from date objects as a new annotation. 37 | metadata_with_dates["epiweek"] = metadata_with_dates["date"].apply(lambda date: epiweeks.Week.fromdate(date).cdcformat()) 38 | 39 | # Create a node data object with epiweeks. 40 | node_data = {} 41 | for index, record in metadata_with_dates.iterrows(): 42 | node_data[index] = { 43 | args.attribute_name: record["epiweek"], 44 | } 45 | 46 | # Save node data. 47 | write_json({"nodes": node_data}, args.output_node_data) 48 | -------------------------------------------------------------------------------- /docs/src/index.rst: -------------------------------------------------------------------------------- 1 | ***************************************************************** 2 | A Getting Started Guide to the Genomic Epidemiology of SARS-CoV-2 3 | ***************************************************************** 4 | 5 | This is the documentation for the SARS-CoV-2 workflow maintained and actively used by the Nextstrain core team. 6 | 7 | In addition to the phylogenetic analysis described here, you can use `Nextclade `_ our "drag-and-drop" tool for clade assignment, mutation calling, and sequence quality control at `clades.nextstrain.org `_. 8 | 9 | If something in this documentation is broken or unclear, `open an issue `_ so we can improve it for everyone. 10 | 11 | If you have a specific question, `post a note on the discussion board `_ -- we're happy to help! 12 | 13 | .. toctree:: 14 | :maxdepth: 1 15 | :titlesonly: 16 | :caption: Tutorials 17 | :hidden: 18 | 19 | tutorial/intro 20 | tutorial/setup 21 | tutorial/example-data 22 | tutorial/custom-data 23 | tutorial/genomic-surveillance 24 | tutorial/next-steps 25 | tutorial/videos 26 | 27 | 28 | .. toctree:: 29 | :maxdepth: 1 30 | :titlesonly: 31 | :caption: Visualization & Interpretation 32 | :hidden: 33 | 34 | visualization/sharing 35 | visualization/interpretation 36 | visualization/narratives 37 | 38 | .. toctree:: 39 | :maxdepth: 1 40 | :titlesonly: 41 | :caption: Guides 42 | :hidden: 43 | 44 | guides/update-workflow 45 | guides/data-prep/index 46 | guides/workflow-config-file 47 | guides/customizing-visualization 48 | guides/run-analysis-on-terra 49 | 50 | .. toctree:: 51 | :maxdepth: 1 52 | :caption: Reference 53 | :hidden: 54 | 55 | reference/nextstrain-overview 56 | reference/files 57 | reference/workflow-config-file 58 | reference/remote_inputs 59 | reference/metadata-fields 60 | reference/naming_clades 61 | reference/data_submitter_faq 62 | reference/troubleshoot 63 | reference/change_log 64 | reference/glossary 65 | 66 | .. toctree:: 67 | :maxdepth: 1 68 | :titlesonly: 69 | :hidden: 70 | 71 | Stuck? Ask us on the discussion board. We're happy to help! 
72 | -------------------------------------------------------------------------------- /scripts/mask-alignment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mask initial bases from alignment FASTA 3 | """ 4 | import argparse 5 | from augur.io import open_file, read_sequences, write_sequences 6 | import Bio 7 | import Bio.SeqIO 8 | from Bio.Seq import Seq 9 | 10 | def mask_terminal_gaps(seq): 11 | L = len(seq) 12 | seq_trimmed = seq.lstrip('-') 13 | left_gaps = L - len(seq_trimmed) 14 | seq_trimmed = seq_trimmed.rstrip('-') 15 | right_gaps = L - len(seq_trimmed) - left_gaps 16 | return "N"*left_gaps + seq_trimmed + "N"*right_gaps 17 | 18 | 19 | if __name__ == '__main__': 20 | parser = argparse.ArgumentParser( 21 | description="Mask initial bases from alignment FASTA", 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 23 | ) 24 | parser.add_argument("--alignment", required=True, help="FASTA file of alignment") 25 | parser.add_argument("--mask-terminal-gaps", action='store_true', help="fill all terminal gaps with N as they likely represent missing data") 26 | parser.add_argument("--mask-from-beginning", type = int, required=True, help="number of bases to mask from start") 27 | parser.add_argument("--mask-from-end", type = int, help="number of bases to mask from end") 28 | parser.add_argument("--mask-sites", nargs='+', type = int, help="list of sites to mask") 29 | parser.add_argument("--output", required=True, help="FASTA file of output alignment") 30 | args = parser.parse_args() 31 | 32 | begin_length = 0 33 | if args.mask_from_beginning: 34 | begin_length = args.mask_from_beginning 35 | end_length = 0 36 | if args.mask_from_end: 37 | end_length = args.mask_from_end 38 | 39 | with open_file(args.output, 'w') as outfile: 40 | for record in read_sequences(args.alignment): 41 | seq = str(record.seq) 42 | if args.mask_terminal_gaps: 43 | seq = mask_terminal_gaps(seq) 44 | 45 | start = "N" * begin_length 46 | middle = seq[begin_length:-end_length] 47 | end = "N" * end_length 48 | seq_list = list(start + middle + end) 49 | if args.mask_sites: 50 | for site in args.mask_sites: 51 | if seq_list[site-1]!='-': 52 | seq_list[site-1] = "N" 53 | record.seq = Seq("".join(seq_list)) 54 | write_sequences(record, outfile) 55 | -------------------------------------------------------------------------------- /scripts/priorities.py: -------------------------------------------------------------------------------- 1 | """ 2 | calculate priorties from index and proximities 3 | """ 4 | import argparse 5 | from random import shuffle 6 | from collections import defaultdict 7 | import numpy as np 8 | import pandas as pd 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser( 12 | description="generate priorities files based on genetic proximity to focal sample", 13 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 14 | ) 15 | parser.add_argument("--sequence-index", type=str, required=True, help="sequence index file") 16 | parser.add_argument("--proximities", type = str, required=True, help="tsv file with proximities") 17 | parser.add_argument("--Nweight", type = float, default=0.003, required=False, help="parameterizes de-prioritization of incomplete sequences") 18 | parser.add_argument("--crowding-penalty", type = float, default=0.1, required=False, help="parameterizes how priorities decrease when there is many very similar sequences") 19 | parser.add_argument("--output", type=str, required=True, help="tsv file with the priorities") 
20 | args = parser.parse_args() 21 | 22 | proximities = pd.read_csv(args.proximities, sep='\t', index_col=0) 23 | index = pd.read_csv(args.sequence_index, sep='\t', index_col=0) 24 | combined = pd.concat([proximities, index], axis=1) 25 | 26 | closest_matches = combined.groupby('closest strain') 27 | candidates = {} 28 | for focal_seq, seqs in closest_matches.groups.items(): 29 | tmp = combined.loc[seqs, ["distance", "N"]] 30 | # penalize larger distances and more undetermined sites. 1/args.Nweight are 'as bad' as one extra mutation 31 | tmp["priority"] = -tmp.distance - tmp.N*args.Nweight 32 | name_prior = [(name, d.priority) for name, d in tmp.iterrows()] 33 | shuffle(name_prior) 34 | candidates[focal_seq] = sorted(name_prior, key=lambda x:x[1], reverse=True) 35 | 36 | # export priorities 37 | crowding = args.crowding_penalty 38 | with open(args.output, 'w') as fh: 39 | # loop over lists of sequences that are closest to particular focal sequences 40 | for cs in candidates.values(): 41 | # these sets have been sorted by priorities after shuffling -- reduce priorities in this shuffled/sorted order 42 | for i, (name, pr) in enumerate(cs): 43 | fh.write(f"{name}\t{pr-i*crowding:1.2f}\n") 44 | -------------------------------------------------------------------------------- /scripts/include_prefix.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser( 6 | description="Rename strains to include specified prefix", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | parser.add_argument('--input-auspice', type=str, metavar="JSON", required=True, help="input Auspice JSON") 11 | parser.add_argument('--input-tip-frequencies', type=str, metavar="JSON", required=True, help="input tip frequencies JSON") 12 | parser.add_argument("--prefix", type=str, nargs='?', const='', help="prefix to add to strains") 13 | parser.add_argument('--output-auspice', type=str, metavar="JSON", required=True, help="output Auspice JSON") 14 | parser.add_argument('--output-tip-frequencies', type=str, metavar="JSON", required=True, help="output tip frequencies JSON") 15 | args = parser.parse_args() 16 | 17 | # update Auspice JSON 18 | with open(args.input_auspice, "r") as f: 19 | auspice_json = json.load(f) 20 | 21 | if args.prefix: 22 | def update_strain_names(n): # closure 23 | if "NODE_" not in n["name"] and args.prefix not in n["name"]: 24 | n["name"] = args.prefix + n["name"] 25 | 26 | if "children" in n: 27 | for c in n["children"]: 28 | update_strain_names(c) 29 | update_strain_names(auspice_json["tree"]) 30 | 31 | with open(args.output_auspice, 'w') as f: 32 | json.dump(auspice_json, f, indent=2) 33 | 34 | # update tip frequencies JSON 35 | with open(args.input_tip_frequencies, "r") as f: 36 | tip_frequencies_json = json.load(f) 37 | 38 | if args.prefix: 39 | modified_tip_frequencies_json = {} 40 | for key in tip_frequencies_json: 41 | if key != "generated_by" and key != "pivots": 42 | if "NODE_" not in key and args.prefix not in key: 43 | modified_tip_frequencies_json[args.prefix + key] = tip_frequencies_json[key] 44 | else: 45 | modified_tip_frequencies_json[key] = tip_frequencies_json[key] 46 | else: 47 | modified_tip_frequencies_json[key] = tip_frequencies_json[key] 48 | else: 49 | modified_tip_frequencies_json = tip_frequencies_json 50 | 51 | with open(args.output_tip_frequencies, 'w') as f: 52 | json.dump(modified_tip_frequencies_json, f, indent=2) 53 | 
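A minimal usage sketch for the script above (the option names mirror its argparse definitions; the file names and the "hCoV-19/" prefix are purely illustrative):

# Prepend a prefix to every tip name in an Auspice dataset and its
# tip-frequencies sidecar, leaving internal NODE_* names untouched.
python3 scripts/include_prefix.py \
    --input-auspice auspice/ncov_example.json \
    --input-tip-frequencies auspice/ncov_example_tip-frequencies.json \
    --prefix "hCoV-19/" \
    --output-auspice auspice/ncov_example_renamed.json \
    --output-tip-frequencies auspice/ncov_example_renamed_tip-frequencies.json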
-------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/purpose_of_sequencing.txt: -------------------------------------------------------------------------------- 1 | screened by S dropout S dropout 2 | Not screened by S dropout Not S dropout 3 | not screened by S dropout Not S dropout 4 | Screened by S Dropout S dropout 5 | Not screen by S dropout Not S dropout 6 | Not S dropout Not S dropout 7 | not screened by S drop out Not S dropout 8 | Screened by S dropout S dropout 9 | screened by S drop out S dropout 10 | screened for S dropout S dropout 11 | Screened by S drop out S dropout 12 | Screened for S gene Dropout S dropout 13 | Screened for S gene dropout S dropout 14 | Screened for S Gene Dropout S dropout 15 | Screened for S Gene Drop out S dropout 16 | Screened for S gene dropout - No S gene dropout Not S dropout 17 | Not Screened for S Drop out Not S dropout 18 | Not Screened for S Drop out - RT Not S dropout 19 | Not Screened for S Gene Dropout Not S dropout 20 | Screened for S gene Dropout - Negative for S gene dropout S dropout 21 | Screened for S gene dropout - Negative for S gene dropout S dropout 22 | Screened for S Gene Dropout - Negative for S Gene Dropout S dropout 23 | Screened for S Gene Dropout - Negative for S gene dropout S dropout 24 | Screened for S gene dropout - Negative for S dropout S dropout 25 | Screened for S Gene Dropout - Negative for S Dropout S dropout 26 | Screened for S gene dropout - Negative for S gene has a 69/70 deletion S dropout 27 | Screened for S dropout S dropout 28 | Screened for Spike gene dropout S dropout 29 | Not Screened for S Dropout Not S dropout 30 | Not screened for S gene dropout Not S dropout 31 | Not Screened for S gene dropout Not S dropout 32 | Not Screened for S Gene dropout Not S dropout 33 | Not Screened for the S Gene Dropout Not S dropout 34 | SGTF screening S dropout 35 | Screened for Variants of Concern (VoC) with C19-SPAR-Seq S dropout 36 | S gene screened S dropout 37 | Returning traveller with S-gene dropout S dropout 38 | S-gene dropout S dropout 39 | Not S dropout Not S dropout 40 | Baseline Surveillance Not S dropout 41 | Baseline surveillance Not S dropout 42 | SGTF Screening S dropout 43 | S-dropout S dropout 44 | SGTF screen S dropout 45 | SGTF S dropout 46 | Active surveillance, N-gene dropout N dropout 47 | Active Surveillance, N-gene dropout N dropout 48 | Baseline surveillance (random sampling) Not S dropout 49 | ICU patient, baseline surveillance Not S dropout 50 | Baseline Not S dropout 51 | Non-random selection: SGTF S dropout 52 | Non-random selection: Non-SGTF Not S dropout 53 | screened for SGTF S dropout 54 | S drop out S dropout 55 | -------------------------------------------------------------------------------- /scripts/add_labels.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from Bio import Phylo 4 | from collections import defaultdict 5 | 6 | def attach_labels(d, labeled_nodes): 7 | if "children" in d: 8 | for c in d["children"]: 9 | if c["name"] in labeled_nodes: 10 | if "labels" not in c["branch_attrs"]: 11 | c["branch_attrs"]["labels"] = {} 12 | c['branch_attrs']['labels']['mlabel'] = labeled_nodes[c["name"]][0] 13 | print(c['branch_attrs']['labels']) 14 | attach_labels(c, labeled_nodes) 15 | 16 | 17 | if __name__ == '__main__': 18 | parser = argparse.ArgumentParser( 19 | description="Remove extraneous colorings", 20 | 
formatter_class=argparse.ArgumentDefaultsHelpFormatter 21 | ) 22 | 23 | parser.add_argument('--input', type=str, metavar="JSON", required=True, help="input Auspice JSON") 24 | parser.add_argument('--tree', type=str, required=True, help="tree file") 25 | parser.add_argument('--clades', type=str, required=True, help="clades") 26 | parser.add_argument('--mutations', type=str, required=True, help="mutations") 27 | parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") 28 | args = parser.parse_args() 29 | 30 | T = Phylo.read(args.tree, 'newick') 31 | 32 | with open(args.mutations, "r") as f: 33 | mutation_json = json.load(f)['nodes'] 34 | 35 | with open(args.clades, "r") as f: 36 | clades_json = json.load(f)['nodes'] 37 | 38 | with open(args.input, "r") as f: 39 | input_json = json.load(f) 40 | 41 | nodes = {} 42 | for n in T.find_clades(order='postorder'): 43 | if n.is_terminal(): 44 | n.tip_count=1 45 | else: 46 | n.tip_count = sum([c.tip_count for c in n]) 47 | nodes[n.name] = {'tip_count':n.tip_count} 48 | 49 | labels = defaultdict(list) 50 | for node in nodes: 51 | for m in mutation_json[node]['muts']: 52 | if m[0] in 'ACGT' and m[-1] in 'ACGT': 53 | clade = clades_json[node]['clade_membership'] 54 | tmp_label = (clade, m) 55 | labels[tmp_label].append((node, nodes[node]['tip_count'])) 56 | 57 | labeled_nodes = defaultdict(list) 58 | for label in labels: 59 | node = sorted(labels[label], key=lambda x:-x[1])[0] 60 | labeled_nodes[node[0]].append('/'.join(label)) 61 | 62 | attach_labels(input_json["tree"], labeled_nodes) 63 | 64 | with open(args.output, 'w') as f: 65 | json.dump(input_json, f, indent=2) 66 | -------------------------------------------------------------------------------- /scripts/check_missing_locations.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | 4 | if __name__ == '__main__': 5 | parser = argparse.ArgumentParser( 6 | description="Check for missing colors & locations", 7 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 8 | ) 9 | 10 | parser.add_argument('--metadata', type=str, nargs='+', required=True, help="input region adjusted metadata") 11 | parser.add_argument('--colors', type=str, nargs='+', required=True, help="input region specific color file") 12 | parser.add_argument('--latlong', type=str, required=True, help="input lat-long file") 13 | args = parser.parse_args() 14 | 15 | things_to_exclude_orig = ['Africa', 'Asia', 'South America', 'Europe', 16 | 'North America', 'Oceania', 'Grand princess cruise ship', 17 | 'diamond princess'] 18 | things_to_exclude = [x.lower() for x in things_to_exclude_orig] 19 | 20 | all_metadatas = [pd.read_csv(met, delimiter='\t') for met in args.metadata] 21 | metadata = pd.concat(all_metadatas, sort=False) 22 | all_colors = [pd.read_csv(col, delimiter='\t', header=None) for col in args.colors] 23 | colors = pd.concat(all_colors, sort=False) 24 | 25 | latlong = pd.read_csv(args.latlong, delimiter='\t', header=None) 26 | 27 | for geo_value in ['location', 'division', 'country']: 28 | locs_w_color_orig = colors.loc[colors[0]==geo_value,1].values 29 | locs_w_color = [x.lower() for x in locs_w_color_orig] 30 | locs_w_latlong_orig = latlong.loc[latlong[0]==geo_value,1].values 31 | locs_w_latlong = [x.lower() for x in locs_w_latlong_orig] 32 | locs_in_meta_orig = [x for x in metadata[geo_value].unique() if not pd.isna(x)] 33 | locs_in_meta = [x.lower() for x in locs_in_meta_orig] 34 | 35 | 
missing_color_locs = [loc for loc in locs_in_meta if loc not in locs_w_color and loc not in things_to_exclude] 36 | if missing_color_locs: 37 | print("The following {} are missing colors:".format(geo_value)) 38 | print(missing_color_locs) 39 | print("\n") 40 | 41 | if geo_value != 'country': 42 | missing_latlong_locs = [loc for loc in locs_in_meta if loc not in locs_w_latlong and loc not in things_to_exclude] 43 | if missing_latlong_locs: 44 | print("The following {} are missing lat-long values:".format(geo_value)) 45 | print(missing_latlong_locs) 46 | print("\n") 47 | 48 | print("Please remember this does *not* check lat-longs for countries!!") 49 | -------------------------------------------------------------------------------- /scripts/revert: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is intended to be run from the GitHub Action `revert.yml` but may also be run locally. 4 | # To run locally, you need three command line arguments (see below) as well as having AWS credentials appropriately set. 5 | 6 | set -x 7 | 8 | # data source name corresponding to builds to revert. "gisaid" or "open". 9 | data_source_name=$1 10 | 11 | # An individual regional dataset to revert. Options are "global", "africa", "asia", "europe", "north-america", "oceania", "south-america". If not specified, reverts all. 12 | # If you'd like to revert multiple regions but not all, run the script multiple times, specifying one region each time. 13 | build_region_name=$2 14 | 15 | # date to revert to e.g. yesterday in "+%Y-%m-%d" format. 16 | date=$3 17 | 18 | if [[ "$build_region_name" == "all" ]] 19 | then 20 | regions=" \ 21 | global \ 22 | africa \ 23 | asia \ 24 | europe \ 25 | north-america \ 26 | oceania \ 27 | south-america \ 28 | " 29 | else 30 | regions=$build_region_name 31 | fi 32 | 33 | missing_count=0 34 | for region in $regions; do 35 | if curl -fsLI "https://data.nextstrain.org/ncov_${data_source_name}_${region}_${date}.json" >/dev/null; then 36 | ## Download the date-stamped (auspice) JSONs, renaming them to the non-datestamped version. 37 | ## Note that if the tip-frequencies don't exist for some reason then the command (& action) 38 | ## will fail, but this is preferable to leaving a dataset in a mixed state. 39 | ## We skip the root-sequence JSON as this doesn't change day-to-day 40 | nextstrain remote download \ 41 | "s3://nextstrain-data/ncov_${data_source_name}_${region}_${date}.json" \ 42 | "ncov_${data_source_name}_${region}.json" 43 | nextstrain remote download \ 44 | "s3://nextstrain-data/ncov_${data_source_name}_${region}_${date}_tip-frequencies.json" \ 45 | "ncov_${data_source_name}_${region}_tip-frequencies.json" 46 | ## Upload these, overwriting the canonical (non-datestamped) datasets 47 | ## Note that we use the nextstrain-cli here as it performs cloudfront invalidation 48 | nextstrain remote upload \ 49 | "s3://nextstrain-data" \ 50 | "ncov_${data_source_name}_${region}.json" "ncov_${data_source_name}_${region}_tip-frequencies.json" 51 | else 52 | echo "WARNING: The requested dataset for ${data_source_name}_${region}_${date} doesn't exist and thus we can't revert to it." 
53 | ((missing_count++)) 54 | fi 55 | done 56 | 57 | exit $missing_count 58 | -------------------------------------------------------------------------------- /scripts/curate_metadata/config_files_additional_info/info_ignore.txt: -------------------------------------------------------------------------------- 1 | Hospital 2 | Local 3 | imported 4 | Local from unknow 5 | Local from health 6 | Local from social 7 | Local from police 8 | Local from airport 9 | Hospital Delfina Torres Concha 10 | Dermatologist - contact with an infected pacient 11 | OKD Darkov 12 | 15/7 - 19/7 accommodation in a private cottage, contact with a positive 13 | Distorted ability to smell 14 | Outbreak in Ministry of Health 15 | unknown 16 | Patient with severe combined immunodeficiency (SCID) 17 | Nursing home establishments for aged and dependent individuals 18 | Migrants ship 19 | Unknown 20 | same patient as for sequence EPI_ISL_583466 21 | same patient as for sequence EPI_ISL_583469 22 | same patient as for sequence EPI_ISL_583472 23 | same patient as for sequence EPI_ISL_583475 24 | same patient as for sequence EPI_ISL_583427 25 | same patient as for sequence EPI_ISL_583479 26 | same patient as for sequence EPI_ISL_583428 27 | same patient as for sequence EPI_ISL_583431 28 | same patient as for sequence EPI_ISL_583439 29 | same patient as for sequence EPI_ISL_583444 30 | same patient as for sequence EPI_ISL_583447 31 | same patient as for sequence EPI_ISL_583456 32 | same patient as for sequence EPI_ISL_583461 33 | Airport Quarantine Station in Japan 34 | e.g. Patient infected while traveling in …. 35 | Jackson Memorial Hospital 36 | TRANSP 37 | QPS-MIA 38 | Sunset Pediatric 39 | Ausilio Mutuo 40 | Distorted ability to smell and taste 41 | Obesity 42 | Diarrhoea 43 | Imported case 44 | Symptomatic 45 | Asymptomatic 46 | General discomfort, headache, cough, myalgia, fever 47 | General discomfort, headache, cough, odynophagia 48 | Headache, cough, odynophagia 49 | General discomfort, headache, cough, myalgia, dysgeusia, fever 50 | Patient infected while traveling 51 | Traveled 52 | Airplane 53 | Airport 54 | Patient with travel history 55 | International traveler 56 | Patient seeking testing in a local hospital 57 | Hotel 58 | Hospital 59 | There is no abroad travel history in the last month. 60 | COVID-19 triage center 61 | Contact with a patient who lives in 62 | Imported 63 | Seaport Quarantine Station in Japan 64 | Unbiased surveillance 65 | unbiased surveillance 66 | Worker 67 | e.g. Cruise Ship, Convention, Live animal market 68 | Surveillance 69 | International Traveller 70 | - 71 | NA 72 | Traveller 73 | surveillance 74 | unknwon 75 | Local case 76 | Travel 77 | Contact with a patient with travel history 78 | -- 79 | None 80 | None 81 | With History of Travel: YES 82 | Community Sample 83 | #VALUE! 84 | infection 85 | Traveler 86 | Overseas Case 87 | Infection 88 | Other: local case 89 | Other: local 90 | Other: Patient Sample 91 | Out of State 92 | contact 93 | Other: 94 | other 95 | Overseas case 96 | Travelling 97 | -------------------------------------------------------------------------------- /scripts/tsv-cast-header: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | usage: tsv-cast-header 4 | 5 | Casts a into the header of . 6 | 7 | Fields are reordered, dropped, and added as necessary. Added fields will have 8 | blank values. 9 | 10 | No output will be emitted if has no rows. must have 11 | at least a header line. 
12 | 13 | All conversion is performed in a memory efficient manner, and inputs do not 14 | need to be seekable. 15 | 16 | --- 17 | 18 | This program exists because both `tsv-append` (from tsv-utils) and `csvtk 19 | concat` are by themselves unsuitable for this task. 20 | 21 | `tsv-append` is header line aware, but not field aware: it assumes all inputs 22 | have the exact same header line and does no re-ordering, adding, or dropping of 23 | fields. It will happily mismatch fields between inputs and produce data lines 24 | with too few or too many fields. 25 | 26 | `csvtk concat` is field aware and DTRT, but it buffers each input completely 27 | into memory, making it a non-starter for large dataset sizes. 28 | """ 29 | import csv 30 | from argparse import ArgumentParser, RawDescriptionHelpFormatter 31 | from sys import stdin, stdout, stderr, exit 32 | 33 | 34 | cli = ArgumentParser( 35 | description = __doc__.strip().split("\n---\n", 1)[0].split("\n\n", 1)[1], 36 | epilog = __doc__.strip().split("\n---\n", 1)[1], 37 | formatter_class = RawDescriptionHelpFormatter) 38 | 39 | cli.add_argument("target", metavar = "") 40 | cli.add_argument("source", metavar = "") 41 | 42 | args = cli.parse_args() 43 | 44 | 45 | # Read header line of 46 | with open(args.target, "r", encoding = "utf-8", newline = "") as target: 47 | lines = csv.reader(target, dialect = "excel-tab") 48 | try: 49 | header = next(lines) 50 | except StopIteration: 51 | print(f"{cli.prog}: error: {args.target!r} (the target) appears to empty; it must contain at least a header line", file = stderr) 52 | exit(1) 53 | 54 | 55 | # Set up output for casting from one dict to another 56 | output = csv.DictWriter( 57 | stdout, 58 | header, 59 | restval = "", 60 | extrasaction = "ignore", 61 | dialect = "excel-tab", 62 | lineterminator = "\n") 63 | 64 | 65 | # Cast 66 | with open(args.source, "r", encoding = "utf-8", newline = "") as source: 67 | input = csv.DictReader(source, dialect = "excel-tab") 68 | 69 | for i, row in enumerate(input): 70 | if i == 0: 71 | if not set(input.fieldnames) & set(output.fieldnames): 72 | print(f"{cli.prog}: error: {args.target!r} (the target) and {args.source!r} (the source) share no fields; they must share at least one", file = stderr) 73 | exit(1) 74 | 75 | output.writeheader() 76 | 77 | output.writerow(row) 78 | -------------------------------------------------------------------------------- /narratives/ncov_template_narrative.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Genomic analysis of COVID-19 spread. 3 | authors: 4 | - Name 1 5 | - Name 2 6 | 7 | authorLinks: 8 | - https://author1.org 9 | - https://author2.io 10 | affiliations: "Fred Hutch, Seattle, USA; Biozentrum, Basel, Switzerland; CZI, CA, USA" 11 | 12 | license: "CC-BY" 13 | licenseLink: "https://creativecommons.org/licenses/by/4.0/" 14 | dataset: "https://nextstrain.org/ncov/gisaid/global/6m?legend=closed" # must be accessible to the auspice server running the narrative 15 | 16 | abstract: "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
17 | --- 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | # [SLIDE 1 TITLE](https://nextstrain.org/ncov/gisaid/global/6m?c=country) 27 | 28 | 29 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 30 | 31 | 32 | 33 | 34 | 35 | 36 | # [SLIDE 2 TITLE](https://nextstrain.org/ncov/gisaid/global/6m?c=region) 37 | 38 | 39 | 40 | [Including a link as an example; always end the line with a period or other 'letter' character](google.com)! 41 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 42 | 43 | 44 | ```auspiceMainDisplayMarkdown 45 | # Example using markdown for the right side 46 | We can also replace the right side view with whatever markdown contents we choose, including links, images, etc. 47 | ``` 48 | -------------------------------------------------------------------------------- /tests/different-inputs.t: -------------------------------------------------------------------------------- 1 | Integration tests for nCoV pipeline. 2 | 3 | Note that running these tests requires setup steps, and that each test can only 4 | run one-at-a-time due to the shared use of the test environment as otherwise 5 | snakemake may use intermediate files from previous runs, thus producing 6 | inconsistent test results. 7 | 8 | Cram should be run in an environment which can run the pipeline via 9 | `cram --preserve-env tests/different-inputs.t` or similar. 10 | 11 | Set-up test environment. We could set up the correct data inside $TMP for each test 12 | if we prefer. For simplicity, we create a directory "output". 13 | 14 | $ pushd "$TESTDIR" > /dev/null 15 | $ basename $( pwd ) 16 | tests 17 | $ rm -rf output && mkdir output && cd output 18 | $ cp -r ../../defaults . && cp -r ../../scripts . && mkdir data/ && cp ../../data/references* data/ 19 | $ cd ../.. 
20 | $ basename $( pwd ) 21 | ncov 22 | 23 | Test various input starting points, all from local (.xz) compressed files 24 | 25 | $ snakemake --directory tests/output --profile tests/local-inputs-compressed \ 26 | > auspice/ncov_test-local-compressed.json >tests/output/local-inputs-compressed.cram.log.txt 2>&1 27 | 28 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-local-compressed.json \ 29 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 30 | 31 | $ rm -rf tests/output/results 32 | 33 | Test various input starting points, all from remote (.xz) compressed files 34 | 35 | $ snakemake --directory tests/output --profile tests/remote-inputs-compressed \ 36 | > auspice/ncov_test-remote-compressed.json >tests/output/remote-inputs-compressed.cram.log.txt 2>&1 37 | 38 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-remote-compressed.json \ 39 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 40 | 41 | $ rm -rf tests/output/results tests/output/data/downloaded_test*compressed* 42 | 43 | Test various input starting points, all from local uncompressed files 44 | 45 | $ cp tests/local-inputs-compressed/data/*xz tests/local-inputs-uncompressed/data/ 46 | 47 | $ for i in tests/local-inputs-uncompressed/data/*.xz; do xz -d $i; done 48 | 49 | $ snakemake --directory tests/output --profile tests/local-inputs-uncompressed \ 50 | > auspice/ncov_test-local-uncompressed.json >tests/output/local-inputs-uncompressed.cram.log.txt 2>&1 51 | 52 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-local-uncompressed.json \ 53 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 54 | 55 | $ rm -rf tests/output/results tests/local-inputs-uncompressed/data/*.fasta tests/local-inputs-uncompressed/data/*.tsv 56 | 57 | Test various input starting points which support remote uncompressed files (this is a subset of available inputs) 58 | 59 | $ snakemake --directory tests/output --profile tests/remote-inputs-uncompressed \ 60 | > auspice/ncov_test-remote-uncompressed.json >tests/output/remote-inputs-uncompressed.cram.log.txt 2>&1 61 | 62 | $ python3 tests/check_auspice_json.py --json tests/output/auspice/ncov_test-remote-uncompressed.json \ 63 | > --attr region --values "North America" "Europe" "Asia" "Oceania" 64 | 65 | $ rm -rf tests/output/results data/downloaded_test*uncompressed* -------------------------------------------------------------------------------- /nextstrain_profiles/nextstrain-gisaid-21L/prefilter.smk: -------------------------------------------------------------------------------- 1 | rule clades_21L: 2 | input: 3 | clades = "defaults/clades.tsv", 4 | exclude_clades = "nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv", 5 | output: 6 | clades = "results/clades_21L.tsv", 7 | log: "logs/clades_21L.txt" 8 | benchmark: "benchmarks/clades_21L.txt" 9 | conda: config["conda_environment"] 10 | shell: 11 | r""" 12 | exec 2> {log:q} 13 | 14 | ./scripts/expand-clade-definitions {input.clades:q} \ 15 | | tsv-join \ 16 | --header \ 17 | --exclude \ 18 | --filter-file {input.exclude_clades:q} \ 19 | --key-fields clade \ 20 | > {output.clades:q} 21 | """ 22 | 23 | 24 | rule gisaid_21L_metadata: 25 | input: 26 | references = "data/references_metadata.tsv", 27 | metadata = path_or_url("s3://nextstrain-ncov-private/metadata.tsv.zst", keep_local=True), 28 | exclude_clades = "nextstrain_profiles/nextstrain-gisaid-21L/exclude-clades.tsv", 29 | output: 30 | metadata = 
"results/gisaid_21L_metadata.tsv.zst", 31 | log: "logs/gisaid_21L_metadata.txt" 32 | benchmark: "benchmarks/gisaid_21L_metadata.txt" 33 | conda: config["conda_environment"] 34 | threads: 8 35 | shell: 36 | r""" 37 | exec 2> {log:q} 38 | 39 | ./scripts/tsv-cast-header \ 40 | <(unzstd < {input.metadata:q}) \ 41 | {input.references:q} \ 42 | | zstd \ 43 | > {output.metadata:q} 44 | 45 | < {input.metadata:q} \ 46 | unzstd \ 47 | | tsv-join \ 48 | --header \ 49 | --exclude \ 50 | --filter-file {input.exclude_clades:q} \ 51 | --key-fields clade \ 52 | --data-fields Nextstrain_clade \ 53 | | sed 1d \ 54 | | zstd -T$(({threads} - 2)) \ 55 | >> {output.metadata:q} 56 | """ 57 | 58 | 59 | rule gisaid_21L_strains: 60 | input: 61 | metadata = "results/gisaid_21L_metadata.tsv.zst", 62 | output: 63 | strains = "results/gisaid_21L_strains.txt", 64 | log: "logs/gisaid_21L_strains.txt" 65 | benchmark: "benchmarks/gisaid_21L_strains.txt" 66 | conda: config["conda_environment"] 67 | shell: 68 | r""" 69 | exec 2> {log:q} 70 | 71 | < {input.metadata:q} \ 72 | unzstd \ 73 | | tsv-select --header -f strain \ 74 | | sed 1d \ 75 | > {output.strains:q} 76 | """ 77 | 78 | 79 | rule gisaid_21L_aligned: 80 | input: 81 | references = "data/references_sequences.fasta", 82 | aligned = path_or_url("s3://nextstrain-ncov-private/aligned.fasta.zst", keep_local=True), 83 | strains = "results/gisaid_21L_strains.txt", 84 | output: 85 | aligned = "results/gisaid_21L_aligned.fasta.zst", 86 | log: "logs/gisaid_21L_aligned.txt" 87 | benchmark: "benchmarks/gisaid_21L_aligned.txt" 88 | conda: config["conda_environment"] 89 | threads: 8 90 | shell: 91 | r""" 92 | exec 2> {log:q} 93 | 94 | < {input.references:q} \ 95 | seqkit grep --by-name --pattern 21L \ 96 | | zstd \ 97 | > {output.aligned} 98 | 99 | < {input.aligned:q} \ 100 | unzstd \ 101 | | seqkit grep --by-name -f {input.strains:q} \ 102 | | zstd -T$(({threads} - 2)) \ 103 | >> {output.aligned:q} 104 | """ 105 | -------------------------------------------------------------------------------- /docs/src/reference/nextstrain-overview.rst: -------------------------------------------------------------------------------- 1 | Nextstrain overview 2 | =================== 3 | 4 | Nextstrain has two main parts: 5 | 6 | - :term:`docs.nextstrain.org:Augur` **performs the bioinformatic analyses** required to produce a tree, map, and other inferences from your input data. 7 | - The outputs of Augur form the input for :term:`docs.nextstrain.org:Auspice`, **which provides the visualizations** you see on Nextstrain.org 8 | 9 | You can find more information about how these tools fit together :doc:`here `. We'll come back to Auspice when we get to the :doc:`visualization <../visualization/sharing>` section. 10 | 11 | First, let's take a look at how Augur works. 12 | 13 | How bioinformatic analyses are managed 14 | -------------------------------------- 15 | 16 | At its core, Augur is a collection of Python scripts, each of which handles one step in the bioinformatic analyses necessary for visualization with Auspice. 17 | 18 | As you might imagine, keeping track of the input and output files from each step individually can get very confusing, very quickly. So, **to manage all of these steps, we use a workflow manager called Snakemake**. 19 | 20 | .. note:: 21 | 22 | There are many other workflow managers out there, such as Nextflow. While we fully encourage you to use whichever workflow tools you prefer, we only provide support and maintenance for Snakemake. 
23 | 24 | Snakemake is an incredibly powerful workflow manager with many complex features. For our purposes, though, we only need to understand a few things: 25 | 26 | - **Each step in a workflow is called a "rule."** The inputs, outputs, and shell commands for each step/rule are defined in a ``.smk`` file. 27 | - Each rule has a number of **parameters, which are specified in a ``.yaml`` file**. 28 | - Each rule produces **output (called a "dependency") which may be used as input to other rules**. 29 | 30 | Overview of a Nextstrain build 31 | ------------------------------ 32 | 33 | Below is an illustration of each step in a standard :term:`Nextstrain build `. Dependencies (output files from one step that act as input to the next) are indicated by grey arrows. Input files which must be provided are indicated with red outlines. As you can see in yellow, the final output is a JSON file for visualization in Auspice. 34 | 35 | Required input files (e.g. the sequence data generated in the `data preparation section <../guides/data-prep>`__, or other files which are part of this repo) are indicated with red outlines. We'll walk through each of these in detail in the next section. 36 | 37 | .. figure:: ../images/basic_nextstrain_build.png 38 | :alt: nextstrain_build 39 | 40 | Running multiple builds 41 | ----------------------- 42 | 43 | It is common practice to run several related builds. For example, to run one analysis on just your data and another analysis that incorporates background / contextual sequences, you could configure two different builds. 44 | 45 | The ncov workflow facilitates this through the ``builds`` section in a :term:`workflow config file `. This is covered in more detail in the :doc:`genomic surveillance tutorial <../tutorial/genomic-surveillance>`. 46 | 47 | We encourage you to take a look at `main_workflow.smk `__ to see what each rule is doing in more detail. 48 | 49 | .. note:: 50 | 51 | Not all of the rules included are essential, or may even be desirable for your analysis. Your workflow may be able to be made a lot simpler, depending on your goals. 52 | -------------------------------------------------------------------------------- /docs/src/guides/customizing-visualization.rst: -------------------------------------------------------------------------------- 1 | Customizing visualization 2 | ========================= 3 | 4 | Visualization options can be configured in either a :term:`workflow config file` or a :term:`Auspice config file`, depending on the option. 5 | 6 | .. contents:: Table of Contents 7 | :local: 8 | 9 | Options in the workflow config file 10 | ----------------------------------- 11 | 12 | These options can be coded into the workflow config file directly without requiring a custom Auspice config file. 13 | 14 | Custom color schemes 15 | ~~~~~~~~~~~~~~~~~~~~ 16 | 17 | To specify a custom color scale: 18 | 19 | 1. Add a ``colors.tsv`` file, where each line is a tab-delimited list of a metadata column name; a metadata value; and a corresponding hex code. Example: 20 | 21 | :: 22 | 23 | country Russia #5E1D9D 24 | country Serbia #4D22AD 25 | country Europe #4530BB 26 | ... 27 | 28 | 2. Update your workflow config file with a reference: 29 | 30 | .. code:: yaml 31 | 32 | files: 33 | colors: "my-ncov-analyses/colors.tsv" 34 | 35 | Changing the dataset description 36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | 38 | The dataset description, which appears below the visualizations, is read from a file which is specified in the workflow config file. 
Per-build description can be set by specifying them in the workflow config file: 39 | 40 | .. code:: yaml 41 | 42 | builds: 43 | north-america: # name of the build 44 | description: my-ncov-analyses/north-america-description.md 45 | 46 | If that is not provided, then a per-run description is used, also specified in the workflow config file: 47 | 48 | .. code:: yaml 49 | 50 | files: 51 | description: my-ncov-analyses/my_description.md 52 | 53 | Options in the Auspice config file 54 | ---------------------------------- 55 | 56 | These options require creating an Auspice config file, used to configure :term:`docs.nextstrain.org:Auspice`. It is specified in the workflow config file using the ``auspice_config`` entry. Example: 57 | 58 | .. code:: yaml 59 | 60 | auspice_config: ncov-tutorial/auspice-config-custom-data.json 61 | 62 | This overrides the default Auspice config file, ``defaults/auspice_config.json``. 63 | 64 | Adding custom metadata fields to color by 65 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 66 | 67 | 1. Add a :doc:`valid metadata column <./data-prep/local-data>` to your ``metadata.tsv`` 68 | 2. Add an entry to the ``colorings`` block of the Auspice config file: 69 | 70 | .. code:: json 71 | 72 | "colorings": [ 73 | { 74 | "key": "location", 75 | "title": "Location", 76 | "type": "categorical" 77 | }, 78 | { 79 | "key": "metadata_column_name", 80 | "title": "Display name for interface", 81 | "type": "" 82 | } 83 | ] 84 | 85 | Choosing defaults 86 | ~~~~~~~~~~~~~~~~~ 87 | 88 | You can specify the default view in the ``display_defaults`` block of an Auspice config file: 89 | 90 | .. code:: json 91 | 92 | "display_defaults": { 93 | "color_by": "division", 94 | "distance_measure": "num_date", 95 | "geo_resolution": "division", 96 | "map_triplicate": true, 97 | "branch_label": "none" 98 | }, 99 | 100 | Choosing panels to display 101 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 102 | 103 | Similarly, you can choose which panels to enable in the ``panels`` block: 104 | 105 | .. code:: json 106 | 107 | "panels": [ 108 | "tree", 109 | "map", 110 | "entropy" 111 | ] 112 | -------------------------------------------------------------------------------- /docs/src/reference/naming_clades.rst: -------------------------------------------------------------------------------- 1 | Clade Naming & Definitions 2 | ========================== 3 | 4 | The nomenclature used by Nextstrain to designate clades for SARS-CoV-2 is driven by the following objectives: 5 | 6 | - label genetically well defined clades that have reached significant frequency and geographic spread, 7 | - allow for transient clade designations that are elevated to major clades if they persist and rise in frequency, 8 | - provide memorable but informative names, 9 | - gracefully handle clade naming in the upcoming years as SARS-CoV-2 becomes a seasonal virus. 10 | 11 | .. contents:: Table of Contents 12 | :local: 13 | 14 | Major clades 15 | ------------ 16 | 17 | Definition 18 | ~~~~~~~~~~ 19 | 20 | We name a new major clade when it reaches a frequency of 20% globally at any time point. When calculating these frequencies, care has to be taken to achieve approximately even sampling of sequences in time and space since sequencing effort varies strongly between countries. A clade name consists of the year it emerged and the next available letter in the alphabet. A new clade should be at least 2 mutations away from its parent major clade. 
21 | 22 | Naming 23 | ~~~~~~ 24 | 25 | We name major clades by the year they are estimated to have emerged and a letter, e.g. 19A, 19B, 20A. The yearly reset of letters will ensure that we don't progress too far into the alphabet, while the year-prefix provides immediate context on the origin of the clade, which will become increasingly important going forward. These are meant as major genetic groupings and are not intended to completely resolve genetic diversity. 26 | 27 | The hierarchical structure of clades is sometimes of interest. Here, the "derivation" of a major clade can be labeled with the familiar "." notation, as in 19A.20A.20C for the major clade 20C. 28 | 29 | Subclades 30 | --------- 31 | 32 | Within these major clades, we name subclades, which we label by their parent clade and the nucleotide mutation(s) that define them (e.g. 19A/28688C). It should be noted, however, that these mutations are only meaningful in that they define the clade. Once a subclade meets the (soft) criteria on frequency, spread, and genetic distinctiveness, it will be renamed to a major clade (hypothetically, 19A/28688C to 20D). 33 | 34 | Current Clades 35 | -------------- 36 | 37 | You can view the current clades on the `GISAID reference dataset `__ or `open reference dataset `__. 38 | 39 | Identifying Nextstrain Clades 40 | ----------------------------- 41 | 42 | To make it easy for users to identify the Nextstrain clade of their own sequences, we provide a clade assignment tool at `clades.nextstrain.org `__. In addition to assigning clades, this tool calls mutations in your sequences relative to the reference and performs some basic QC. 43 | 44 | You can also use the `simple python script `__ to assign appropriate clades to sequences in a FASTA file. This script is part of the ``ncov`` GitHub repository, but it does not require running any other part of the workflow. However, ``augur`` :doc:`must be installed ` to run the script. 45 | 46 | Note that when running this script you can supply ``--sequences`` if your sequences still need to be aligned. If you have already aligned your sequences to the ``ncov`` repository reference (for example, from running this workflow), you can supply ``--alignment``. If you supply sequences that are not aligned to the ``ncov`` reference, you may get bad results!
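For instance, a minimal invocation might look like the sketch below. The script path is an assumption about where the clade assignment script lives in this repository, and the FASTA name is only illustrative; the ``--sequences`` and ``--alignment`` options are the ones described above.

.. code:: bash

   # Hypothetical paths/filenames; adjust to your checkout and data.
   # Use --alignment instead of --sequences if your FASTA is already
   # aligned to the ncov reference.
   python3 scripts/assign_clades.py --sequences my_sequences.fasta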
47 | -------------------------------------------------------------------------------- /scripts/explicit_translation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from Bio import Phylo, SeqIO 4 | from Bio.Align import MultipleSeqAlignment 5 | from treetime import TreeAnc 6 | from augur.utils import load_features 7 | 8 | 9 | def annotation_json(features, reference): 10 | annotations = {} 11 | for fname, feat in features.items(): 12 | annotations[fname] = {'seqid':reference.id, 13 | 'type':feat.type, 14 | 'start':int(feat.location.start)+1, 15 | 'end':int(feat.location.end), 16 | 'strand': '+' if feat.location.strand else '-'} 17 | annotations['nuc'] = {'seqid':reference.id, 18 | 'type':'source', 19 | 'start': 1, 20 | 'end': len(reference), 21 | 'strand': '+'} 22 | return annotations 23 | 24 | 25 | if __name__ == '__main__': 26 | parser = argparse.ArgumentParser( 27 | description="Add translations", 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 29 | ) 30 | 31 | parser.add_argument('--tree', type=str, required=True, help="input tree") 32 | parser.add_argument('--reference', type=str, required=True, help="reference genbank sequence") 33 | parser.add_argument('--translations', type=str, nargs='+', required=True, help="amino acid alignment") 34 | parser.add_argument('--genes', type=str, nargs='+', required=True, help="amino acid alignment") 35 | parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") 36 | args = parser.parse_args() 37 | 38 | genes = args.genes if type(args.genes)==list else [args.genes] 39 | translations = args.translations if type(args.translations)==list else [args.translations] 40 | ref = SeqIO.read(args.reference, format='genbank') 41 | features = load_features(args.reference) 42 | 43 | if not set(features.keys())==set(args.genes): 44 | print("WARNING: supplied genes don't match the annotation") 45 | print("the following features are in the annotation by not supplied as genes:", set(features.keys()).difference(args.genes)) 46 | print("the following features are in the supplied as genes but not the annotation:", set(args.genes).difference(features.keys())) 47 | 48 | T = Phylo.read(args.tree, 'newick') 49 | leafs = {n.name for n in T.get_terminals()} 50 | 51 | node_data = {} 52 | root_sequence_translations = {} 53 | for gene, translation in zip(genes, translations): 54 | seqs = [] 55 | for s in SeqIO.parse(translation, 'fasta'): 56 | if s.id in leafs: 57 | seqs.append(s) 58 | 59 | 60 | tt = TreeAnc(tree=T, aln=MultipleSeqAlignment(seqs), alphabet='aa') 61 | 62 | tt.infer_ancestral_sequences(reconstruct_tip_states=True) 63 | root_sequence_translations[gene] = tt.sequence(tt.tree.root, as_string=True, reconstructed=True) 64 | 65 | with open(translation.replace('.fasta', '_withInternalNodes.fasta'), 'w') as fh: 66 | for n in tt.tree.find_clades(): 67 | if n.name not in node_data: 68 | node_data[n.name] = {"aa_muts":{}} 69 | if len(n.mutations): 70 | node_data[n.name]["aa_muts"][gene] = [f"{a}{p+1}{d}" for a,p,d in n.mutations] 71 | fh.write(f">{n.name}\n{tt.sequence(n, as_string=True, reconstructed=True)}\n") 72 | 73 | annotations = annotation_json(features, ref) 74 | with open(args.output, 'w') as fh: 75 | json.dump({"nodes":node_data, "annotations":annotations, "reference":root_sequence_translations}, fh) 76 | -------------------------------------------------------------------------------- /docs/src/reference/troubleshoot.rst: 
-------------------------------------------------------------------------------- 1 | Troubleshoot common issues 2 | ========================== 3 | 4 | If you have a question that is not addressed here, please don't hesitate to `ask for help `__. 5 | 6 | My country / division does not show up on the map 7 | ------------------------------------------------- 8 | 9 | This is most often a result of the country / division not being present in `the file defining the latitude & longitude of each deme `__. Adding it to that file (and rerunning the Snakemake rules downstream of this) should fix the problem. 10 | 11 | My trait (e.g. division) is grey instead of colored 12 | --------------------------------------------------- 13 | 14 | We generate the colors from the ``colors`` rule in the Snakefile, which uses the `ordering TSV `__ to generate them. See :doc:`../guides/workflow-config-file` for more info. 15 | 16 | *A note about locations and colors:* Unless you want to specifically override the colors generated, it's usually easier to *add* information to the default ``ncov`` files, so that you can benefit from all the information already in those files. 17 | 18 | My genomes aren't included in the analysis 19 | ------------------------------------------ 20 | 21 | There are a few steps where sequences can be removed: 22 | 23 | - During the ``filter`` step: 24 | 25 | - Samples that are included in `the exclude file `__ are removed. 26 | - Samples that fail the filtering criteria, as defined in your :ref:`filter config `, are removed. 27 | 28 | - If you do not have any custom filtering criteria, the default filters in the `parameters.yaml `__ are applied. 29 | 30 | - Check the ``results/{build_name}/filtered_log.tsv`` file to see why each sequence was filtered. 31 | 32 | - Samples may be randomly removed during subsampling; see :doc:`../guides/workflow-config-file` for more info. 33 | - During the ``refine`` step, Augur can drop samples that deviate from the expected clock rate. Inspect the log file named like ``logs/refine_{build_name}.txt`` to look for samples filtered by this step. :ref:`See the refine configuration guide ` for details on the clock rate filter. 34 | 35 | Sequencing and alignment errors 36 | ------------------------------- 37 | 38 | Genome sequencing, bioinformatic processing of the raw data, and alignment of the sequences are all steps where errors can slip in. Such errors can distort the phylogenetic analysis. To keep sequences with known problems from distorting the analysis, we keep a list of problematic sequences in ``config/exclude.txt`` and filter them out. To facilitate spotting such problematic sequences, we added an additional quality control step that produces the file ``results/excluded_by_diagnostics.txt``. 39 | 40 | This file is the output of ``scripts/diagnostic.py`` and is produced by rule ``diagnostic``. It contains only those sequences with diagnostics exceeding thresholds and mirrors the format of ``config/exclude.txt``. These names could be added to ``config/exclude.txt`` for permanent exclusion. Note, however, that some sequences might look problematic due to alignment issues rather than intrinsic problems with the sequence. The flagged sequences will be excluded from the current run. 41 | 42 | To run only the sequence diagnostic, you can specify any of the three files above as a target, or use the ``diagnostic`` target: 43 | 44 | .. code:: bash 45 | 46 | nextstrain build ...
diagnostic 47 | -------------------------------------------------------------------------------- /scripts/find_clusters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from augur.io import read_metadata 4 | from augur.utils import read_tree, read_node_data 5 | from collections import Counter 6 | import csv 7 | import hashlib 8 | 9 | MAX_HASH_LENGTH = 7 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser( 14 | description="Find polytomies in a given tree that all belong to the same metadata group", 15 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 16 | ) 17 | parser.add_argument("--tree", required=True, help="Newick tree") 18 | parser.add_argument("--metadata", required=True, help="metadata") 19 | parser.add_argument("--mutations", required=True, help="mutations node data JSON") 20 | parser.add_argument("--attribute-name", default="cluster_id", help="name of attribute to store in output JSON") 21 | parser.add_argument("--group-by", default="division", help="identify polytomies where all tips are in the same group") 22 | parser.add_argument("--min-tips", type=int, default=3, help="minimum tips per polytomy to be consider as a cluster") 23 | parser.add_argument("--output", required=True, help="tab-delimited file with strain, cluster id, and group value for each strain") 24 | 25 | args = parser.parse_args() 26 | 27 | tree = read_tree(args.tree) 28 | tree.collapse_all(lambda c: c.branch_length < 1e-5) 29 | 30 | metadata = read_metadata(args.metadata) 31 | muts = read_node_data(args.mutations) 32 | attribute_name = args.attribute_name 33 | group_by = args.group_by 34 | 35 | polytomies = [] 36 | for node in tree.find_clades(terminal=False): 37 | if node == tree.root: 38 | continue 39 | 40 | count_by_group = Counter() 41 | polytomy_sequence_id = None 42 | for child in node.clades: 43 | if child.is_terminal() and child.name: 44 | child_muts_data = muts["nodes"].get(child.name, {}) 45 | any_muts = (len(child_muts_data.get("muts", [])) > 0) 46 | if not any_muts: 47 | count_by_group[metadata.loc[child.name, group_by]] += 1 48 | 49 | if polytomy_sequence_id is None and "sequence" in child_muts_data: 50 | polytomy_sequence_id = hashlib.sha256(child_muts_data["sequence"].encode()).hexdigest()[:MAX_HASH_LENGTH] 51 | 52 | if any(count >= args.min_tips for count in count_by_group.values()): 53 | polytomies.append({"node": node, "name": polytomy_sequence_id}) 54 | 55 | with open(args.output, "w") as oh: 56 | writer = csv.DictWriter( 57 | oh, 58 | fieldnames=( 59 | "strain", 60 | args.attribute_name, 61 | group_by 62 | ), 63 | delimiter="\t", 64 | lineterminator="\n" 65 | ) 66 | writer.writeheader() 67 | clusters = 0 68 | for polytomy_data in polytomies: 69 | polytomy = polytomy_data["node"] 70 | polytomy_sequence_id = polytomy_data["name"] 71 | 72 | if polytomy.name: 73 | writer.writerow({ 74 | "strain": polytomy.name, 75 | args.attribute_name: polytomy_sequence_id, 76 | group_by: metadata.loc[polytomy.name, group_by] 77 | }) 78 | 79 | for child in polytomy.clades: 80 | if child.is_terminal(): 81 | writer.writerow({ 82 | "strain": child.name, 83 | args.attribute_name: polytomy_sequence_id, 84 | group_by: metadata.loc[child.name, group_by] 85 | }) 86 | 87 | clusters += 1 88 | -------------------------------------------------------------------------------- /scripts/fetch_mlr_lineage_fitness.py: -------------------------------------------------------------------------------- 1 | import 
argparse 2 | import json 3 | import pandas as pd 4 | import requests 5 | import sys 6 | from augur.io import read_metadata 7 | from augur.utils import write_json 8 | 9 | # This script currently assumes a match on lineage fitness. It uses 10 | # https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json 11 | # that backs the live estimates on https://nextstrain.org/sars-cov-2/forecasts 12 | # This uses the "Nextclade_pango" metadata label to derive sequence counts from 13 | # GISAID data and estimate relative growth advantages across collapsed Pango 14 | # lineages. It will be most relevant for 1m, 2m and 6m builds, but is not at all 15 | # broken for the all-time builds. It would be possible to swap this to key on 16 | # clade instead, but I think the greater detail of lineages is better in this case. 17 | 18 | def fetch_growth_advantages(mlr_url): 19 | try: 20 | response = requests.get(mlr_url) 21 | response.raise_for_status() # Raise an exception for HTTP errors 22 | json_data = response.json() # Parse the JSON content 23 | data = json_data["data"] 24 | 25 | growth_advantages = {} 26 | for entry in data: 27 | if all(key in entry for key in ["location", "site", "variant", "value", "ps"]): 28 | if entry["location"] == "hierarchical" and entry["site"] == "ga" and entry["ps"] == "median": 29 | growth_advantages[entry["variant"]] = entry["value"] 30 | return growth_advantages 31 | except Exception as e: 32 | print(f"Error fetching the JSON file: {e}", file=sys.stderr) 33 | return None 34 | 35 | def main(): 36 | # Set up argument parser 37 | parser = argparse.ArgumentParser(description="Fetch MLR lineage fitness and match to strain-level metadata") 38 | parser.add_argument("--metadata", required=True, help="Path to the metadata TSV") 39 | parser.add_argument("--metadata-id-columns", default=["strain", "name", "Virus name"], nargs="+", help="List of columns to use as identifiers in the metadata file") 40 | parser.add_argument("--metadata-clade-attribute", default="Nextclade_pango", help="Metadata attribute to match against MLR variants") 41 | parser.add_argument("--mlr-url", default="https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json", help="URL to fetch the forecasts JSON data.") 42 | parser.add_argument("--output-node-data", required=True, help="Path to save the output JSON node data.") 43 | 44 | args = parser.parse_args() 45 | 46 | # Fetch the remote growth advantages 47 | growth_advantages = fetch_growth_advantages(args.mlr_url) 48 | 49 | # Load the local metadata 50 | try: 51 | metadata = read_metadata( 52 | args.metadata, 53 | id_columns=args.metadata_id_columns 54 | ) 55 | except FileNotFoundError as e: 56 | sys.exit(f"Error reading metadata file: {e}") 57 | 58 | # Match Nextclade_pango entries in metadata to the fetched growth advantages 59 | if growth_advantages: 60 | metadata[args.metadata_clade_attribute] = metadata[args.metadata_clade_attribute].map(growth_advantages) 61 | else: 62 | metadata[args.metadata_clade_attribute] = None 63 | 64 | # Create a node data object with growth advantages 65 | node_data = {} 66 | for index, record in metadata.iterrows(): 67 | node_data[index] = { 68 | "mlr_lineage_fitness": record[args.metadata_clade_attribute] if pd.notna(record[args.metadata_clade_attribute]) else None 69 | } 70 | 71 | # Save node data 72 | write_json({"nodes": node_data}, args.output_node_data) 73 | 74 | if __name__ == '__main__': 75 | try: 76 | main() 77 | except
Exception as e: 78 | print(f"An unexpected error occurred: {e}", file=sys.stderr) 79 | -------------------------------------------------------------------------------- /docs/src/tutorial/next-steps.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Next steps 3 | ========== 4 | 5 | Congratulations! You have completed all of the tutorials for the ncov workflow. Read on for some next steps. 6 | 7 | .. contents:: Table of Contents 8 | :local: 9 | 10 | .. _create-analysis-directory: 11 | 12 | Create your own analysis directory 13 | ================================== 14 | 15 | On a web browser: 16 | 17 | 1. `Sign up for a GitHub account `__ if you do not already have one. 18 | 2. Create a repository from the ``ncov-tutorial`` template repository: 19 | 20 | 1. Go to https://github.com/nextstrain/ncov-tutorial. 21 | 2. Click **Use this template**. 22 | 3. Give your repository a name. We recommend ``my-ncov-analyses`` and will use that name in the following steps. 23 | 4. Click **Create repository from template**. 24 | 25 | In a command prompt: 26 | 27 | 1. Go to the ``ncov/`` directory. 28 | 2. Clone your new repository, replacing ```` with your own username: 29 | 30 | .. code:: text 31 | 32 | git clone https://github.com//my-ncov-analyses 33 | 34 | 3. Read the next section to learn how to modify ``genomic-surveillance.yaml``. 35 | 36 | Modify the genomic surveillance workflow configuration 37 | ====================================================== 38 | 39 | Instead of an Idaho-focused workflow config, you can provide your own data for the ``custom_data`` input. Follow the same steps in the tutorial for GISAID download but select your own set of sequences and rename your ``metadata.tsv`` and ``sequences.fasta`` files accordingly. 40 | 41 | .. note:: 42 | 43 | Workflow run time increases with the number of sequences, and the GISAID web interface has a maximum of 5,000 sequences per download. 44 | 45 | Then, use the following steps to customize names, titles, and context: 46 | 47 | 1. Change the ``custom_data`` input filenames from ``idaho.metadata.tsv`` and ``idaho.sequences.fasta`` to your own. 48 | 2. Change the regional input dataset from North America to an appropriate region for your custom focal data. :doc:`See the complete list of available URLs <../reference/remote_inputs>`. 49 | 3. Rename the output dataset from ``idaho`` to your own. Note the name restrictions. 50 | 4. Reword the output dataset title to your own. 51 | 5. Rename the subsampling scheme from ``idaho_scheme`` to your own. Note the name restrictions. 52 | 6. For each sample, increase the ``max_sequences`` to your own. 53 | 7. Rename the ``usa_context`` sample and update the ``query`` accordingly. 54 | 55 | .. warning:: 56 | 57 | File paths in the :term:`config files ` must start with the :term:`analysis directory`. For example, in the tutorial: 58 | 59 | .. code:: yaml 60 | 61 | auspice_config: ncov-tutorial/auspice-config-custom-data.json 62 | 63 | Now that you have created your own analysis directory, this must be modified, e.g. 64 | 65 | .. 
code:: yaml 66 | 67 | auspice_config: my-ncov-analyses/auspice-config-custom-data.json 68 | 69 | Additional resources 70 | ==================== 71 | 72 | - Learn more about genomic epidemiology: 73 | 74 | - `An applied genomic epidemiological handbook `__ by Allison Black and Gytis Dudas 75 | - `Genomic Epidemiology Seminar Series `__ by Chan Zuckerberg Initiative Genomic Epidemiology (CZ GEN EPI) 76 | - `COVID-19 Genomic Epidemiology Toolkit `__ by Centers for Disease Control and Prevention (CDC) 77 | 78 | - :doc:`Review all possible options to configure your SARS-CoV-2 analyses with Nextstrain <../reference/workflow-config-file>`. 79 | - Watch `this 1-hour video overview `__ by Heather Blankenship on how to deploy Nextstrain for a Public Health lab. 80 | -------------------------------------------------------------------------------- /docs/src/reference/data_submitter_faq.rst: -------------------------------------------------------------------------------- 1 | Data Submitter's FAQ 2 | ==================== 3 | 4 | We often receive questions from data submitters about why their data is not visible on the `Nextstrain SARS-CoV-2 runs `__. This short FAQ highlights some of the main reasons why data may not be showing up on Nextstrain. 5 | 6 | Sequence Length & Number of N's 7 | ------------------------------- 8 | 9 | We currently only use full-genome sequences which are at least 27,000 bases in length. They also cannot have more than 3,000 bases that are 'N'. 10 | 11 | Subsampling 12 | ----------- 13 | 14 | Nextstrain runs can be subsampled considerably. There are over >30,000 whole-genome sequences available on GISAID currently, but we typically include <5,000 in each of our runs. If the division your samples are from contains more than about 100 samples per month, they are likely to be downsampled. Be sure to check the appropriate regional build - these are sampled more heavily from the focal region, so there's a higher chance a sequence will be included in the run. We have regional builds for `North America `__, `South America `__, `Asia `__, `Africa `__, `Europe `__, and `Oceania `__. 15 | 16 | Missing Dates 17 | ------------- 18 | 19 | We currently only include samples that have an **exact sampling date** (day, month, year). This is because we cannot accurately estimate the sample dates from the sequences at the moment, given the short duration of the pandemic so far, and the mutation rate. 20 | 21 | If your sample has only year or only month and year as a sampling date, it will be automatically excluded from runs. If you have privacy/data sharing concerns, it's ok to slightly change the collection date randomly by +/- 1 or 2 days. Please do *not* use the sequencing or processing date, as these can negatively influence our runs. 22 | 23 | If you wish to add a corrected date to your samples, simply updating the sampling date in GISAID will automatically update our system, and the sequence will be included in the next run! 24 | 25 | Many Samples with the Same Date 26 | ------------------------------- 27 | 28 | If we receive many samples that have identical dates as sample dates, we may exclude these manually. This is because this often indicates that the 'sample date' given is not actually the sample date, but the sequencing, processing, or uploading date. We try to email submitters when we do this to check whether the dates are truly the collection dates. 
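Before uploading, you can check your own metadata for these date issues with a small script. The sketch below is illustrative only: it assumes pandas is installed and that your metadata is a tab-delimited file with a ``date`` column (adjust the filename and column name to match your data).

.. code:: python

   # Illustrative only: flag incomplete dates and days with unusually many samples.
   import pandas as pd

   metadata = pd.read_csv("metadata.tsv", sep="\t", dtype=str)

   # Dates like "2021" or "2021-03" are incomplete and would be excluded from runs.
   complete = metadata["date"].str.match(r"^\d{4}-\d{2}-\d{2}$", na=False)
   print(f"{(~complete).sum()} records lack an exact YYYY-MM-DD collection date")

   # Many samples sharing one collection date may be worth double-checking.
   print(metadata.loc[complete, "date"].value_counts().head())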
29 | 30 | If you are genuinely submitting many sequences with identical dates, you can avoid us temporarily excluding them by emailing hello@nextstrain.org to let us know about the sequences and why they have the same date (ex: collected during investigation of a long-term care center). 31 | 32 | Missing USA State 33 | ----------------- 34 | 35 | We currently exclude samples from the USA which do not have a 'division' attribute (this is the USA state or territory where they were sampled). Adding a state/territory/division to your sample on GISAID will automatically update this on our system, and the sequence will appear in our next run. 36 | 37 | Divergence Issues 38 | ----------------- 39 | 40 | For quality control, we use a combination of automated and manual checks to ensure that included sequences appear to be free of sequencing and/or assembly errors. If a sequence is deemed to be far too divergent (has more mutations than we expect given the sampling date), or far too under-diverged (has far fewer mutations than we expect given the sampling date), it may be excluded. We cannot offer direct help in these cases, but suggest you revisit the raw sequence files with the aid of someone with experience using your sequencing pipeline, in order to correct any sequencing and assembly errors. 41 | -------------------------------------------------------------------------------- /docs/src/reference/files.rst: -------------------------------------------------------------------------------- 1 | Files overview 2 | ============== 3 | 4 | This page gives an overview of the files in your local ``ncov/`` directory. 5 | 6 | .. contents:: 7 | :local: 8 | 9 | User files 10 | ---------- 11 | 12 | User files are not tracked by version control, meaning they are either provided by the user or generated by the workflow. 13 | 14 | Analysis directory 15 | ~~~~~~~~~~~~~~~~~~ 16 | 17 | An :term:`analysis directory` is a non-tracked directory which contains user-defined :term:`customization files `. 18 | 19 | In the :doc:`tutorials <../tutorial/intro>`, the analysis directory is ``ncov-tutorial/``. Follow :ref:`these steps ` to create your own analysis directory. 20 | 21 | .. hint:: 22 | 23 | Previously, we recommended using Snakemake profiles under a ``my_profiles/`` analysis directory. We now recommend using Snakemake config files directly via the ``--configfile`` parameter. You can still use existing profiles via ``--configfile my_profiles//builds.yaml``. 24 | 25 | Input files 26 | ~~~~~~~~~~~ 27 | 28 | Learn how to prepare input files with :doc:`../guides/data-prep/index`. 29 | 30 | .. note:: 31 | 32 | A few example input files are provided when you clone ``ncov/`` locally, under ``data/``. 33 | 34 | - Metadata file (e.g. ``data/example_metadata.tsv``): tab-delimited description of strain (i.e., sample) attributes 35 | - Sequences file (e.g. ``data/example_sequences.fasta.gz``): genomic sequences whose ids must match the ``strain`` column in the metadata file. 36 | 37 | Output files and directories 38 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | 40 | These are generated by the workflow. 41 | 42 | - ``auspice/.json``: output file for visualization in Auspice where ```` is the name of your output dataset in the workflow configuration file used by ``--configfile``. 43 | - ``results/aligned.fasta``, etc.: raw results files (dependencies) that are shared across all datasets. 44 | - ``results//``: raw results files (dependencies) that are specific to a single dataset.
45 | - ``logs/``: Log files with error messages and other information about the run. 46 | - ``benchmarks/``: Run-times (and memory usage on Linux systems) for each rule in the workflow. 47 | 48 | Internal files 49 | -------------- 50 | 51 | These files are not intended for modification. See :doc:`../guides/workflow-config-file` on how to configure workflow behavior. 52 | 53 | Default workflow customization files 54 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | 56 | - ``defaults/parameters.yaml``: default :term:`config file`. Override these settings using ``--configfile your-config.yaml``. 57 | - ``defaults/auspice_config.json``: default :term:`Auspice config file`. Override these settings using ``auspice_config``. 58 | - ``defaults/include.txt``: default strain names to *include* during subsampling and filtering. 59 | - ``defaults/exclude.txt``: default strain names to *exclude* during subsampling and filtering. 60 | 61 | Workflow definition files 62 | ~~~~~~~~~~~~~~~~~~~~~~~~~ 63 | 64 | - ``Snakefile``: entry point for Snakemake commands that also validates inputs. 65 | - ``workflow/snakemake_rules/main_workflow.smk``: defines rules for running each step in the analysis. Modify your workflow config file, rather than hardcode changes into the snakemake file itself. 66 | - ``workflow/envs/nextstrain.yaml``: specifies computing environment needed to run workflow with the ``--use-conda`` flag. 67 | - ``workflow/schemas/config.schema.yaml``: defines format (e.g., required fields and types) for workflow config files. 68 | - ``scripts/``: helper scripts for common tasks. 69 | 70 | Documentation 71 | ~~~~~~~~~~~~~ 72 | 73 | These files are used to generate the `workflow documentation `__. 74 | 75 | Nextstrain user files 76 | ~~~~~~~~~~~~~~~~~~~~~ 77 | 78 | The Nextstrain team maintains user files in the ``ncov/`` repo, under ``nextstrain_profiles/``. 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![GitHub release (latest by date)](https://img.shields.io/github/v/release/nextstrain/ncov)](https://github.com/nextstrain/ncov/releases) 2 | [![See recent changes](https://img.shields.io/badge/changelog-See%20recent%20changes-blue)](https://docs.nextstrain.org/projects/ncov/en/latest/reference/change_log.html) 3 | 4 | # About 5 | 6 | This repository analyzes viral genomes using [Nextstrain](https://nextstrain.org) to understand how SARS-CoV-2, the virus that is responsible for the COVID-19 pandemic, evolves and spreads. 7 | 8 | We maintain a number of publicly-available builds, visible at [nextstrain.org/ncov](https://nextstrain.org/ncov). 9 | 10 | [See our change log for details about backwards-incompatible or breaking changes to the workflow](https://docs.nextstrain.org/projects/ncov/en/latest/reference/change_log.html). 11 | 12 | Visit [the workflow documentation](https://docs.nextstrain.org/projects/ncov) for tutorials and reference material. 13 | 14 | ## Download formatted datasets 15 | 16 | The hCoV-19 / SARS-CoV-2 genomes were generously shared via GISAID. We gratefully acknowledge the Authors, Originating and Submitting laboratories of the genetic sequence and metadata made available through GISAID on which this research is based. 17 | 18 | In order to download the GISAID data to run the analysis yourself, please see [this guide](https://docs.nextstrain.org/projects/ncov/en/latest/analysis/data-prep.html). 
19 | > Please note that `data/metadata.tsv` is no longer included as part of this repo. However, we provide continually-updated, pre-formatted metadata & fasta files for download through GISAID. 20 | 21 | ## Read previous Situation Reports 22 | 23 | We issued weekly Situation Reports for the first ~5 months of the pandemic. You can find the Reports and their translations [here](https://nextstrain.org/ncov-sit-reps). 24 | 25 | ## FAQs 26 | 27 | - Can't find your sequences in Nextstrain? Check [here](./docs/data_faq.md) for common reasons why your sequences may not be appearing. 28 | You can also use [clades.nextstrain.org](https://clades.nextstrain.org/) to perform some basic quality control on your sequences. If they are flagged by this tool, they will likely be excluded by our pipeline. 29 | - For information about how clades are defined, and the currently named clades, please see [here](./docs/naming_clades.md). To assign clades to your own sequences, you can use our clade assignment tool at [clades.nextstrain.org](https://clades.nextstrain.org/). 30 | 31 | ## Bioinformatics notes 32 | 33 | Site numbering and genome structure uses [Wuhan-Hu-1/2019](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) as reference. The phylogeny is rooted relative to early samples from Wuhan. Temporal resolution assumes a nucleotide substitution rate of [8 × 10^-4 subs per site per year](http://virological.org/t/phylodynamic-analysis-176-genomes-6-mar-2020/356). There were SNPs present in the nCoV samples in the first and last few bases of the alignment that were masked as likely sequencing artifacts. 34 | 35 | # Contributing 36 | 37 | We welcome contributions from the community! Please note that we strictly adhere to the [Contributor Covenant Code of Conduct](https://github.com/nextstrain/.github/blob/master/CODE_OF_CONDUCT.md). 38 | 39 | ### Contributing to software or documentation 40 | 41 | Please see our [Contributor Guide](https://github.com/nextstrain/.github/blob/master/CONTRIBUTING.md) to get started! 42 | 43 | ### Contributing data 44 | 45 | **Please note that we automatically pick up any SARS-CoV-2 data that is submitted to GISAID.** 46 | 47 | If you're a lab and you'd like to get started sequencing, please see: 48 | * [Protocols from the ARTIC network](https://www.protocols.io/groups/artic/publications) 49 | * [Funding opportunities for sequencing efforts](https://twitter.com/firefoxx66/status/1242147905768751106) 50 | * Or, if these don't meet your needs, [get in touch](mailto:hello@nextstrain.org) 51 | 52 | --- 53 | 54 | # Get in touch 55 | 56 | To report a bug, error, or feature request, please [open an issue](https://github.com/nextstrain/ncov/issues). 57 | 58 | For questions, head over to the [discussion board](https://discussion.nextstrain.org/); we're happy to help! 59 | -------------------------------------------------------------------------------- /docs/src/guides/run-analysis-on-terra.rst: -------------------------------------------------------------------------------- 1 | ************************* 2 | Run the workflow on Terra 3 | ************************* 4 | 5 | Import ``ncov`` WDL workflow from Dockstore 6 | =========================================== 7 | 8 | 1. `Setup a Terra account `_ 9 | #. Navigate to Dockstore: `ncov:master`_ 10 | #. Top right corner, under **Launch with**, click on **Terra** 11 | #. Under "Workflow Name" set a name, can also leave default ``ncov``, and select your **Destination Workspace** in the drop down menu. 12 | #. Click button **IMPORT** 13 | #. 
In your workspace, click on the **WORKFLOWS** tab and verify that the imported workflow is shown as a card 14 | 15 | .. _`ncov:master`: https://dockstore.org/workflows/github.com/nextstrain/ncov:master?tab=info 16 | 17 | Upload your data files into Terra 18 | ================================= 19 | 20 | 1. Navigate to: `https://app.terra.bio/#upload`_. 21 | 22 | #. Select your workspace 23 | #. At the top, hit the **+** button to "create a collection" 24 | #. Within the collection, at bottom right, click the **+** button to upload a file, or drag and drop files to upload them. 25 | #. Go back to your Terra Dashboard 26 | #. Click on the **DATA** tab 27 | #. On the left, under **OTHER DATA**, click **Files** and there should be an "uploads/" folder shown to the right 28 | #. Click on "uploads/" to view your collection and verify that your files have been uploaded 29 | 30 | .. _`https://app.terra.bio/#upload`: https://app.terra.bio/#upload 31 | 32 | Connect your data files to the WDL workflow 33 | =========================================== 34 | 35 | 1. On the **DATA** tab, click on **+** next to the **TABLES** section to create a Data Table 36 | #. Download the "sample_template.tsv" file 37 | #. Create a tab-delimited file similar to the one below: 38 | 39 | :: 40 | 41 | entity:ncov_examples_id metadata sequences configfile_yaml 42 | example gs://COPY_PATH_HERE/example_metadata.tsv gs://COPY_PATH_HERE/example_datasets/example_sequences.fasta.gz 43 | example_build gs://COPY_PATH_HERE/example-build.yaml 44 | 45 | 4. Upload to **Tables** and you should get something like: 46 | 47 | .. image:: ../images/terra-datatable.png 48 | 49 | 5. Navigate back to the **Workflow** tab, and click on your imported "ncov" workflow 50 | #. Click on the radio button "Run workflow(s) with inputs defined by data table" 51 | #. Under **Step 1**, select your root entity type **ncov_examples** from the drop-down menu. 52 | #. Click on **SELECT DATA** to select all rows 53 | #. Most of the values will be blank, but fill in the values below: 54 | 55 | +-----------------+------------------+-------+----------------------+ 56 | |Task name | Variable | Type | Attribute | 57 | +=================+==================+=======+======================+ 58 | |Nextstrain_WRKFLW| build_name | String| this.ncov_example.id | 59 | +-----------------+------------------+-------+----------------------+ 60 | |Nextstrain_WRKFLW| configfile_yaml | File | this.configfile_yaml | 61 | +-----------------+------------------+-------+----------------------+ 62 | |Nextstrain_WRKFLW| metadata_tsv | File | this.metadata | 63 | +-----------------+------------------+-------+----------------------+ 64 | |Nextstrain_WRKFLW| sequence_fasta | File | this.sequences | 65 | +-----------------+------------------+-------+----------------------+ 66 | 67 | 10. Click on the **OUTPUTS** tab 68 | 11. Connect your generated output back to the data table by filling in these values: 69 | 70 | +-----------------+-----------------+-------+----------------------+ 71 | |Task name | Variable | Type | Attribute | 72 | +=================+=================+=======+======================+ 73 | |Nextstrain_WRKFLW| auspice_zip | File | this.auspice_zip | 74 | +-----------------+-----------------+-------+----------------------+ 75 | |Nextstrain_WRKFLW| results_zip | File | this.results_zip | 76 | +-----------------+-----------------+-------+----------------------+ 77 | 78 | 12. Click on **Save**, then click on **Run Analysis** 79 | #. Under the tab **JOB HISTORY**, verify that your job is running.
80 | #. When run is complete, check the **DATA** / **TABLES** / **ncov_examples** tab and download "auspice.zip" file 81 | -------------------------------------------------------------------------------- /defaults/auspice_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Genomic epidemiology of novel coronavirus", 3 | "build_url": "https://github.com/nextstrain/ncov", 4 | "maintainers": [ 5 | {"name": "the Nextstrain team", "url": "https://nextstrain.org/"} 6 | ], 7 | "colorings": [ 8 | { 9 | "key": "emerging_lineage", 10 | "title": "Emerging Lineage", 11 | "type": "categorical" 12 | }, 13 | { 14 | "key": "immune_escape", 15 | "title": "Immune Escape vs BA.2", 16 | "type": "continuous" 17 | }, 18 | { 19 | "key": "ace2_binding", 20 | "title": "ACE2 binding vs BA.2", 21 | "type": "continuous" 22 | }, 23 | { 24 | "key": "pango_lineage", 25 | "title": "PANGO Lineage", 26 | "type": "categorical" 27 | }, 28 | { 29 | "key": "GISAID_clade", 30 | "title": "GISAID Clade", 31 | "type": "categorical" 32 | }, 33 | { 34 | "key": "S1_mutations", 35 | "title": "S1 mutations", 36 | "type": "continuous" 37 | }, 38 | { 39 | "key": "mlr_lineage_fitness", 40 | "title": "MLR lineage fitness", 41 | "type": "continuous" 42 | }, 43 | { 44 | "key": "location", 45 | "title": "Location", 46 | "type": "categorical" 47 | }, 48 | { 49 | "key": "division", 50 | "title": "Admin Division", 51 | "type": "categorical" 52 | }, 53 | { 54 | "key": "country", 55 | "title": "Country", 56 | "type": "categorical" 57 | }, 58 | { 59 | "key": "region", 60 | "title": "Region", 61 | "type": "categorical" 62 | }, 63 | { 64 | "key": "host", 65 | "title": "Host", 66 | "type": "categorical" 67 | }, 68 | { 69 | "key": "age", 70 | "title": "Age", 71 | "type": "continuous" 72 | }, 73 | { 74 | "key": "sex", 75 | "title": "Sex", 76 | "type": "categorical" 77 | }, 78 | { 79 | "key": "author", 80 | "title": "Authors", 81 | "type": "categorical" 82 | }, 83 | { 84 | "key": "originating_lab", 85 | "title": "Originating Lab", 86 | "type": "categorical" 87 | }, 88 | { 89 | "key": "submitting_lab", 90 | "title": "Submitting Lab", 91 | "type": "categorical" 92 | }, 93 | { 94 | "key": "recency", 95 | "title": "Submission Date", 96 | "type": "categorical" 97 | }, 98 | { 99 | "key": "gisaid_epi_isl", 100 | "type": "categorical" 101 | }, 102 | { 103 | "key": "genbank_accession", 104 | "type": "categorical" 105 | }, 106 | { 107 | "key": "epiweek", 108 | "title": "Epiweek (CDC)", 109 | "type": "continuous" 110 | }, 111 | { 112 | "key": "QC_overall_score", 113 | "title": "Nextclade QC score", 114 | "type": "continuous" 115 | }, 116 | { 117 | "key": "QC_overall_status", 118 | "title": "Nextclade QC status", 119 | "type": "categorical" 120 | }, 121 | { 122 | "key": "reversion_mutations", 123 | "title": "Reversion mutations", 124 | "type": "continuous" 125 | }, 126 | { 127 | "key": "potential_contaminants", 128 | "title": "Potential contaminants", 129 | "type": "continuous" 130 | }, 131 | { 132 | "key": "rare_mutations", 133 | "title": "Rare mutations", 134 | "type": "continuous" 135 | } 136 | ], 137 | "geo_resolutions": [ 138 | "location", 139 | "division", 140 | "country", 141 | "region" 142 | ], 143 | "display_defaults": { 144 | "color_by": "clade_membership", 145 | "distance_measure": "num_date", 146 | "geo_resolution": "country", 147 | "map_triplicate": true, 148 | "branch_label": "clade" 149 | }, 150 | "filters": [ 151 | "recency", 152 | "clade_membership", 153 | "emerging_lineage", 154 | 
"region", 155 | "country", 156 | "division", 157 | "location", 158 | "host", 159 | "epiweek", 160 | "QC_overall_status" 161 | ], 162 | "panels": [ 163 | "tree", 164 | "map", 165 | "entropy", 166 | "frequencies" 167 | ] 168 | } 169 | -------------------------------------------------------------------------------- /docs/src/tutorial/example-data.rst: -------------------------------------------------------------------------------- 1 | Run using example data 2 | ====================== 3 | 4 | This first tutorial introduces our SARS-CoV-2 workflow. 5 | You will run the workflow using a small set of reference data that we provide. 6 | Subsequent tutorials present more complex scenarios that build on this approach. 7 | 8 | .. contents:: Table of Contents 9 | :local: 10 | 11 | Prerequisites 12 | ------------- 13 | 14 | 1. :doc:`setup`. These instructions will install all of the software you need to complete this tutorial and others. 15 | 16 | Setup 17 | ----- 18 | 19 | 1. Change directory to the ``ncov`` directory: 20 | 21 | .. code:: text 22 | 23 | cd ncov 24 | 25 | 2. Download the example tutorial repository into a new subdirectory of ``ncov/`` called ``ncov-tutorial/``: 26 | 27 | .. code:: text 28 | 29 | git clone https://github.com/nextstrain/ncov-tutorial 30 | 31 | Run the workflow 32 | ---------------- 33 | 34 | From within the ``ncov/`` directory, run the workflow using a :term:`configuration file ` provided in the tutorial directory: 35 | 36 | .. code:: text 37 | 38 | nextstrain build . --configfile ncov-tutorial/example-data.yaml 39 | 40 | Break down the command 41 | ~~~~~~~~~~~~~~~~~~~~~~ 42 | 43 | The workflow can take several minutes to run. While it is running, you can learn about the parts of this command: 44 | 45 | - ``nextstrain build .`` 46 | - This tells the :term:`docs.nextstrain.org:Nextstrain CLI` to :term:`build ` the workflow from ``.``, the current directory. All subsequent command-line arguments are passed to the workflow manager, Snakemake. 47 | - ``--configfile ncov-tutorial/example-data.yaml`` 48 | - ``--configfile`` is a Snakemake option used to `configure `__ the ncov workflow. It takes a file path as the value. 49 | - ``ncov-tutorial/example-data.yaml`` is the value given to ``--configfile``. It is a :term:`config file` that provides custom workflow configuration including inputs and outputs. The contents of this file with comments excluded are: 50 | 51 | .. code-block:: yaml 52 | 53 | inputs: 54 | - name: reference_data 55 | metadata: https://data.nextstrain.org/files/ncov/open/reference/metadata.tsv.xz 56 | sequences: https://data.nextstrain.org/files/ncov/open/reference/sequences.fasta.xz 57 | 58 | refine: 59 | root: "Wuhan-Hu-1/2019" 60 | 61 | The ``inputs`` entry provides the workflow with one input named ``reference_data``. The metadata and sequence files refer to a sample of approximately 300 sequences maintained by the Nextstrain team that represent all Nextstrain clades annotated for SARS-CoV-2. The workflow downloads these files directly from the associated URLs. :doc:`See the complete list of SARS-CoV-2 datasets we provide through data.nextstrain.org <../reference/remote_inputs>`. 62 | 63 | The ``refine`` entry specifies the root sequence for the example GenBank data. 64 | 65 | For more information, :doc:`see the workflow configuration file reference <../reference/workflow-config-file>`. 
66 | 67 | Running the workflow produces two new directories: 68 | 69 | - ``auspice/`` contains a few files that represent a Nextstrain :term:`docs.nextstrain.org:dataset` to be visualized in the following section. 70 | - ``results/`` contains intermediate files generated during workflow execution. 71 | 72 | Visualize the results 73 | --------------------- 74 | 75 | Run this command to start the :term:`docs.nextstrain.org:Auspice` server, providing ``auspice/`` as the directory containing output dataset files: 76 | 77 | .. code:: text 78 | 79 | nextstrain view auspice/ 80 | 81 | Navigate to http://127.0.0.1:4000/ncov/default-build. The resulting :term:`docs.nextstrain.org:dataset` should show a phylogeny of ~200 sequences: 82 | 83 | .. figure:: ../images/dataset-example-data.png 84 | :alt: Phylogenetic tree from the "example data" tutorial as visualized in Auspice 85 | 86 | To stop the server, press :kbd:`Control-C` on your keyboard. 87 | 88 | .. note:: 89 | 90 | You can also view the results by dragging the dataset files all at once onto `auspice.us `__: 91 | 92 | - ``auspice/ncov_default-build.json`` 93 | - ``auspice/ncov_default-build_root-sequence.json`` 94 | - ``auspice/ncov_default-build_tip-frequencies.json`` 95 | -------------------------------------------------------------------------------- /scripts/fix-colorings.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import re 4 | from numpy import linspace 5 | from math import floor 6 | 7 | def adjust_coloring_for_epiweeks(dataset): 8 | """ 9 | If an auspice JSON specifies a colouring with the key "epiweek" (case sensitive) then we create a categorical 10 | colorscale which evenly spaces the canonical nextstrain rainbow across the observed time window. 11 | 12 | NOTE: epiweek must be in CDC format ("YYYYMM") but this may be relaxed to include ISO format in the future. 
13 | """ 14 | EPIKEY="epiweek" 15 | try: 16 | (cidx, coloring) = [(i, c) for i, c in enumerate(dataset['meta'].get("colorings", [])) if c['key']==EPIKEY][0] 17 | except IndexError: # coloring doesn't define an epiweek 18 | return 19 | 20 | # remove any duplicate coloring entries in the JSON to ensure the entry we edit is the one used by Auspice 21 | # (NOTE: this is augur bug https://github.com/nextstrain/augur/issues/719) 22 | dataset['meta']['colorings'] = [c for i,c in enumerate(dataset['meta']['colorings']) if not (c['key']==EPIKEY and i!=cidx)] 23 | 24 | # delay import to support older setups not using epiweeks package 25 | from epiweeks import Year, Week 26 | 27 | observed_values = set() 28 | def recurse(node): 29 | value = node.get("node_attrs", {}).get(EPIKEY, {}).get("value", False) 30 | if value: 31 | # we validate using both the epiweeks package and a regex (epiweeks will perform coercion of non-valid data into valid data) 32 | if not re.match(r'^(\d{4})(\d{2})$', value): 33 | raise(ValueError(f"Epiweek value {value} was not in format YYYYMM.")) 34 | week = Week.fromstring(value, system="cdc") # raises ValueError if not valid 35 | observed_values.add(week) 36 | for child in node.get("children", []): 37 | recurse(child) 38 | try: 39 | recurse(dataset["tree"]) 40 | except ValueError as e: 41 | print(str(e)) 42 | print("Skipping color scale creation for epiweek.") 43 | return 44 | observed_values = sorted(list(observed_values)) 45 | 46 | ## generate epiweeks across the entire observed range for color generation 47 | epiweeks = [ observed_values[0] ] 48 | while epiweeks[-1] < observed_values[-1]: 49 | epiweeks.append(epiweeks[-1]+1) 50 | ## generate rainbow colour scale across epiweeks. 51 | ## Since a "default" augur install does not include matplotlib, rather than interpolating between values in the scale 52 | ## we reuse them. This only applies when n(epiweeks)>30, where distinguising between colors is problematic anyway. 53 | rainbow = ["#511EA8", "#482BB6", "#4039C3", "#3F4ACA", "#3E5CD0", "#416CCE", "#447CCD", "#4989C4", "#4E96BC", "#559FB0", "#5DA8A4", "#66AE96", "#6FB388", "#7AB77C", "#85BA6F", "#91BC64", "#9DBE5A", "#AABD53", "#B6BD4B", "#C2BA46", "#CDB642", "#D6B03F", "#DDA83C", "#E29D39", "#E69036", "#E67F33", "#E56D30", "#E2592C", "#DF4428", "#DC2F24"] 54 | color_indicies = [floor(x) for x in linspace(0, len(rainbow), endpoint=False, num=len(epiweeks))] 55 | coloring['scale'] = [ 56 | [epiweek.cdcformat(), rainbow[color_indicies[i]]] 57 | for i,epiweek in enumerate(epiweeks) 58 | if epiweek in observed_values 59 | ] 60 | ## auspice will order the legend according to the provided color scale, so there is no need to set 61 | ## `coloring['legend']` unless we want to restrict this for some reason. 
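    ## Worked example (illustrative, not from a real run): if the only observed epiweeks are
    ## 202001 and 202010, `epiweeks` spans weeks 1-10 and the resulting scale is
    ## [["202001", "#511EA8"], ["202010", "#E2592C"]]. Unobserved weeks in between get no scale
    ## entry, but they still occupy colour positions so the gradient reflects elapsed time.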
62 | coloring['type'] = 'categorical' # force the scale type to be categorical 63 | 64 | if __name__ == '__main__': 65 | parser = argparse.ArgumentParser( 66 | description="Remove extraneous colorings", 67 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 68 | ) 69 | 70 | parser.add_argument('--input', type=str, metavar="JSON", required=True, help="input Auspice JSON") 71 | parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON") 72 | args = parser.parse_args() 73 | 74 | with open(args.input, "r") as f: 75 | input_json = json.load(f) 76 | 77 | keys_to_remove = ["genbank_accession", "gisaid_epi_isl"] 78 | 79 | fixed_colorings = [] 80 | for coloring in input_json["meta"]["colorings"]: 81 | if coloring['key'] not in keys_to_remove: 82 | fixed_colorings.append(coloring) 83 | 84 | input_json["meta"]["colorings"] = fixed_colorings 85 | 86 | adjust_coloring_for_epiweeks(input_json) 87 | 88 | with open(args.output, 'w') as f: 89 | json.dump(input_json, f, indent=2) 90 | -------------------------------------------------------------------------------- /scripts/deprecated/parse_mutational_fitness_tsv_into_distance_map.py: -------------------------------------------------------------------------------- 1 | """ 2 | Supplementary data S2 from Obermeyer et al (https://www.science.org/doi/10.1126/science.abm1208) 3 | is a TSV table that maps mutations such as "S:D614G" to an estimate of "Δ log R" and is available via GitHub. 4 | Here, we convert this TSV table into a JSON compatable with the augur distance command. 5 | To update model run: 6 | 7 | python scripts/developer_scripts/parse_mutational_fitness_tsv_into_distance_map.py 8 | 9 | and the version the resulting changes to defaults/mutational_fitness_distance_map.json 10 | 11 | Updated model outputs are available at https://github.com/bkotzen/sars-cov2-modeling following: 12 | 13 | https://raw.githubusercontent.com/bkotzen/sars-cov2-modeling/main/2024-07-22/PyR0/mutations.tsv 14 | 15 | -------------------------------------------------------------- 16 | 17 | This analysis was removed from the workflow on 2025-01-23 18 | This was drawn from results at https://github.com/bkotzen/sars-cov2-modeling 19 | But this repo hasn't been updated since 2024-07-22 20 | If these results become updated more frequently, we should restore this analysis 21 | 22 | This was used in the workflow following: 23 | 24 | rule mutational_fitness: 25 | input: 26 | tree = "results/{build_name}/tree.nwk", 27 | alignments = lambda w: rules.translate.output.translations, 28 | distance_map = config["files"]["mutational_fitness_distance_map"] 29 | output: 30 | node_data = "results/{build_name}/mutational_fitness.json" 31 | benchmark: 32 | "benchmarks/mutational_fitness_{build_name}.txt" 33 | log: 34 | "logs/mutational_fitness_{build_name}.txt" 35 | params: 36 | genes = ' '.join(config.get('genes', ['S'])), 37 | compare_to = "root", 38 | attribute_name = "mutational_fitness" 39 | conda: 40 | config["conda_environment"], 41 | resources: 42 | mem_mb=2000 43 | shell: 44 | augur distance \ 45 | --tree {input.tree} \ 46 | --alignment {input.alignments} \ 47 | --gene-names {params.genes} \ 48 | --compare-to {params.compare_to} \ 49 | --attribute-name {params.attribute_name} \ 50 | --map {input.distance_map} \ 51 | --output {output} 2>&1 | tee {log} 52 | """ 53 | 54 | import argparse 55 | import pandas as pd 56 | import json 57 | 58 | if __name__ == '__main__': 59 | parser = argparse.ArgumentParser( 60 | description="Convert mutational fitness 
values to an Augur distance map", 61 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 62 | ) 63 | parser.add_argument("--input", default="https://raw.githubusercontent.com/bkotzen/sars-cov2-modeling/main/2024-07-22/PyR0/mutations.tsv", help="TSV file of mutational effects") 64 | parser.add_argument("--output", default="defaults/mutational_fitness_distance_map.json", help="JSON file for augur distance") 65 | args = parser.parse_args() 66 | 67 | # collect simple string mapping from TSV, ie 68 | # 'S:A522V': -0.00661378 69 | # 'ORF1a:T4304I': 0.00353199 70 | string_mapping = {} 71 | if args.input: 72 | df = pd.read_csv(args.input, delimiter='\t') 73 | for index, row in df.iterrows(): 74 | string_mapping[row["mutation"]] = float(row["Δ log R"]) 75 | 76 | # convert simple string mapping into structured mapping required by augur distance, ie 77 | # "map": { 78 | # "S": { 79 | # "522": [ 80 | # { 81 | # "from": "A", 82 | # "to": "V", 83 | # "weight": -0.00661378 84 | # }, 85 | structured_mapping = {} 86 | for mutation, delta_log_R in string_mapping.items(): 87 | gene, aa_change = mutation.split(":") 88 | if "STOP" not in aa_change: 89 | from_aa = aa_change[0] 90 | to_aa = aa_change[-1] 91 | pos_aa = aa_change[1:-1] 92 | if gene not in structured_mapping: 93 | structured_mapping[gene] = {} 94 | if pos_aa not in structured_mapping[gene]: 95 | structured_mapping[gene][pos_aa] = [] 96 | entry = {"from": from_aa, "to": to_aa, "weight": round(delta_log_R, 10)} 97 | structured_mapping[gene][pos_aa].append(entry) 98 | 99 | # output this mapping as an augur distance compatable JSON 100 | # include very slightly negative default to prevent heavily diverged artifactual genomes from 101 | # appearing as high fitness 102 | json_output = {"default": -0.003} 103 | json_output["map"] = structured_mapping 104 | 105 | print("writing mutational_fitness_distance_map.json to defaults/") 106 | with open(args.output, 'w') as f: 107 | json.dump(json_output, f, indent=2) 108 | -------------------------------------------------------------------------------- /scripts/combine_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from augur.io import open_file, read_metadata 3 | from Bio import SeqIO 4 | import csv 5 | import sys 6 | 7 | EMPTY = '' 8 | 9 | # This script was written in preparation for a future augur where commands 10 | # may take multiple metadata files, thus making this script unnecessary! 11 | # 12 | # Merging logic: 13 | # - Order of supplied TSVs matters 14 | # - All columns are included (i.e. union of all columns present) 15 | # - The last non-empty value read (from different TSVs) is used. I.e. values are overwritten. 16 | # - Missing data is represented by an empty string 17 | # 18 | # We use one-hot encoding to specify which origin(s) a piece of metadata came from 19 | 20 | def parse_args(): 21 | parser = argparse.ArgumentParser( 22 | description=""" 23 | Custom script to combine metadata files from different origins. 24 | In the case where metadata files specify different values, the latter provided file will take priority. 25 | Columns will be added for each origin with values "yes" or "no" to identify the input source (origin) of each sample. 
26 | """, 27 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 28 | ) 29 | parser.add_argument('--metadata', required=True, nargs='+', metavar="TSV", help="Metadata files") 30 | parser.add_argument('--origins', required=True, nargs='+', metavar="STR", help="Names of origins (order should match provided metadata)") 31 | parser.add_argument('--output', required=True, metavar="TSV", help="Output (merged) metadata") 32 | args = parser.parse_args() 33 | return args 34 | 35 | if __name__ == '__main__': 36 | args = parse_args() 37 | try: 38 | assert(len(args.metadata)==len(args.origins)) 39 | assert(len(args.origins)>1) 40 | except AssertionError: 41 | print("Error. Please check your inputs - there must be the same number of metadata files as origins provided, and there must be more than one of each!") 42 | sys.exit(2) 43 | 44 | # READ IN METADATA FILES 45 | metadata = [] 46 | for (origin, fname) in zip(args.origins, args.metadata): 47 | data = read_metadata(fname) 48 | data.insert(0, "strain", data.index.values) 49 | columns = data.columns 50 | data = data.to_dict(orient="index") 51 | 52 | metadata.append({'origin': origin, "fname": fname, 'data': data, 'columns': columns, 'strains': {s for s in data.keys()}}) 53 | 54 | # SUMMARISE INPUT METADATA 55 | print(f"Parsed {len(metadata)} metadata TSVs") 56 | for m in metadata: 57 | print(f"\t{m['origin']} ({m['fname']}): {len(m['data'].keys())} strains x {len(m['columns'])} columns") 58 | 59 | # BUILD UP COLUMN NAMES FROM MULTIPLE INPUTS TO PRESERVE ORDER 60 | combined_columns = [] 61 | for m in metadata: 62 | combined_columns.extend([c for c in m['columns'] if c not in combined_columns]) 63 | combined_columns.extend(list(args.origins)) 64 | 65 | # ADD IN VALUES ONE BY ONE, OVERWRITING AS NECESSARY 66 | combined_data = metadata[0]['data'] 67 | for strain in combined_data: 68 | for column in combined_columns: 69 | if column not in combined_data[strain]: 70 | combined_data[strain][column] = EMPTY 71 | 72 | for idx in range(1, len(metadata)): 73 | for strain, row in metadata[idx]['data'].items(): 74 | if strain not in combined_data: 75 | combined_data[strain] = {c:EMPTY for c in combined_columns} 76 | for column in combined_columns: 77 | if column in row: 78 | existing_value = combined_data[strain][column] 79 | new_value = row[column] 80 | # overwrite _ANY_ existing value if the overwriting value is non empty (and different)! 
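                    # Illustrative example: if an earlier TSV has country="" and a later TSV has
                    # country="USA", the merged row keeps "USA". A later empty value never erases an
                    # earlier one, and a later non-empty value replaces a differing earlier value
                    # (printing a notice below when it overwrites non-empty data).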
81 | if new_value != EMPTY and new_value != existing_value: 82 | if existing_value != EMPTY: 83 | print(f"[{strain}::{column}] Overwriting {combined_data[strain][column]} with {new_value}") 84 | combined_data[strain][column] = new_value 85 | 86 | # one-hot encoding for origin 87 | # note that we use "yes" / "no" here as Booleans are problematic for `augur filter` 88 | for metadata_entry in metadata: 89 | origin = metadata_entry['origin'] 90 | for strain in combined_data: 91 | combined_data[strain][origin] = "yes" if strain in metadata_entry['strains'] else "no" 92 | 93 | print(f"Combined metadata: {len(combined_data.keys())} strains x {len(combined_columns)} columns") 94 | 95 | with open_file(args.output, 'w') as fh: 96 | tsv_writer = csv.writer(fh, delimiter='\t') 97 | tsv_writer.writerow(combined_columns) 98 | for row in combined_data.values(): 99 | tsv_writer.writerow([row[column] for column in combined_columns]) 100 | -------------------------------------------------------------------------------- /scripts/developer_scripts/get_population_weights: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # This script generates a TSV file containing country names and population sizes. 4 | 5 | import argparse 6 | import os 7 | from pathlib import Path 8 | import pandas as pd 9 | import ssl 10 | import urllib.request 11 | 12 | 13 | def download_source_data(path): 14 | # This is the link for "1950-2100, all scenarios"¹ on the UN population CSV 15 | # download page: 16 | url = 'https://population.un.org/wpp/Download/Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_TotalPopulationBySex.csv.gz' 17 | 18 | # As of 2024-08-07, the URL requires a workaround to download programmatically: 19 | # 20 | # 21 | ctx = ssl.create_default_context(ssl.Purpose.SERVER_AUTH) 22 | ctx.options |= 0x4 # OP_LEGACY_SERVER_CONNECT 23 | response = urllib.request.urlopen(url, context=ctx) 24 | 25 | os.makedirs(os.path.dirname(path), exist_ok=True) 26 | with open(path, 'wb') as f: 27 | f.write(response.read()) 28 | 29 | 30 | def export_population_weights(output): 31 | csv = Path(os.path.dirname(__file__)) / "data/WPP2024_TotalPopulationBySex.csv.gz" 32 | source = 'the United Nations World Population Prospects' 33 | if os.path.exists(output): 34 | print(f'Source data already exists: {str(csv)!r}') 35 | print(f'... skipping download.') 36 | else: 37 | print(f'Downloading source data to {str(csv)!r}...') 38 | download_source_data(csv) 39 | 40 | print('Formatting data for output...') 41 | df = pd.read_csv(csv, usecols=['Location', 'LocTypeName', 'Time', 'PopTotal'], dtype='str') 42 | 43 | # Drop rows that represent aggregate regions/subregions/etc. 
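    # (For example, aggregate rows such as "World" or whole continents are dropped here,
    # leaving only individual countries and areas.)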
44 | df = df[df['LocTypeName'] == 'Country/Area'] 45 | 46 | # Use data from the latest non-forecast year 47 | year = '2023' 48 | df = df[df['Time'] == year] 49 | 50 | # Rename columns to match names in metadata 51 | column_name_map = { 52 | 'Location': 'country', 53 | 'PopTotal': 'weight', 54 | } 55 | df = df.rename(columns=column_name_map) 56 | 57 | # Keep only the columns used above 58 | df = df[column_name_map.values()] 59 | 60 | # Set country as index and sort alphabetically 61 | df = df.set_index('country') 62 | df = df.sort_index() 63 | 64 | # Rename countries to match values in metadata 65 | country_name_map = { 66 | "Bolivia (Plurinational State of)": "Bolivia", 67 | "Bonaire, Sint Eustatius and Saba": "Bonaire", 68 | "Brunei Darussalam": "Brunei", 69 | "China, Hong Kong SAR": "Hong Kong", 70 | "China, Macao SAR": "Macao", 71 | "China, Taiwan Province of China": "Taiwan", 72 | "Comoros": "Union of the Comoros", 73 | "Congo": "Republic of the Congo", 74 | "Curaçao": "Curacao", 75 | "Czechia": "Czech Republic", 76 | "Iran (Islamic Republic of)": "Iran", 77 | "Kosovo (under UNSC res. 1244)": "Kosovo", 78 | "Lao People's Democratic Republic": "Laos", 79 | "Micronesia (Fed. States of)": "Micronesia", 80 | "Republic of Korea": "South Korea", 81 | "Republic of Moldova": "Moldova", 82 | "Russian Federation": "Russia", 83 | "Saint Martin (French part)": "Saint Martin", 84 | "Sint Maarten (Dutch part)": "Sint Maarten", 85 | "State of Palestine": "Palestine", 86 | "Syrian Arab Republic": "Syria", 87 | "Türkiye": "Turkey", 88 | "United Republic of Tanzania": "Tanzania", 89 | "United States of America": "USA", 90 | "Venezuela (Bolivarian Republic of)": "Venezuela", 91 | "Viet Nam": "Vietnam", 92 | } 93 | df = df.rename(index=country_name_map) 94 | 95 | # Ensure int weights are written without decimals 96 | df['weight'] = pd.to_numeric(df['weight']).astype(int) 97 | 98 | print(f'Writing data to {str(output)!r}...') 99 | 100 | # Delete output file if it already exists 101 | if os.path.exists(output): 102 | os.remove(output) 103 | 104 | # Export 105 | with open(output, 'a') as f: 106 | print("# [DO NOT EDIT] This file was generated by scripts/developer_scripts/get_population_weights", file=f) 107 | print(f"# Based on {year} population estimates from {source}", file=f) 108 | df.to_csv(f, index=True, sep='\t') 109 | 110 | print('Done.') 111 | 112 | 113 | if __name__ == '__main__': 114 | parser = argparse.ArgumentParser( 115 | description="Create population sizes file", 116 | ) 117 | 118 | parser.add_argument('--output', type=str, metavar="FILE", 119 | default="defaults/population_weights.tsv", 120 | help="Path to output population sizes file", 121 | ) 122 | args = parser.parse_args() 123 | 124 | export_population_weights(args.output) 125 | -------------------------------------------------------------------------------- /defaults/population_weights.tsv: -------------------------------------------------------------------------------- 1 | # [DO NOT EDIT] This file was generated by scripts/developer_scripts/get_population_weights 2 | # Based on 2023 population estimates from the United Nations World Population Prospects 3 | country weight 4 | Afghanistan 41454 5 | Albania 2811 6 | Algeria 46164 7 | American Samoa 47 8 | Andorra 80 9 | Angola 36749 10 | Anguilla 14 11 | Antigua and Barbuda 93 12 | Argentina 45538 13 | Armenia 2943 14 | Aruba 107 15 | Australia 26451 16 | Austria 9130 17 | Azerbaijan 10318 18 | Bahamas 399 19 | Bahrain 1569 20 | Bangladesh 171466 21 | Barbados 282 22 | Belarus 9115 23 | 
Belgium 11712 24 | Belize 411 25 | Benin 14111 26 | Bermuda 64 27 | Bhutan 786 28 | Bolivia 12244 29 | Bonaire 29 30 | Bosnia and Herzegovina 3185 31 | Botswana 2480 32 | Brazil 211140 33 | British Virgin Islands 38 34 | Brunei 458 35 | Bulgaria 6795 36 | Burkina Faso 23025 37 | Burundi 13689 38 | Cabo Verde 522 39 | Cambodia 17423 40 | Cameroon 28372 41 | Canada 39299 42 | Cayman Islands 73 43 | Central African Republic 5152 44 | Chad 19319 45 | Chile 19658 46 | China 1422584 47 | Hong Kong 7442 48 | Macao 713 49 | Taiwan 23317 50 | Colombia 52321 51 | Union of the Comoros 850 52 | Republic of the Congo 6182 53 | Cook Islands 14 54 | Costa Rica 5105 55 | Croatia 3896 56 | Cuba 11019 57 | Curacao 185 58 | Cyprus 1344 59 | Czech Republic 10809 60 | Côte d'Ivoire 31165 61 | Dem. People's Republic of Korea 26418 62 | Democratic Republic of the Congo 105789 63 | Denmark 5948 64 | Djibouti 1152 65 | Dominica 66 66 | Dominican Republic 11331 67 | Ecuador 17980 68 | Egypt 114535 69 | El Salvador 6309 70 | Equatorial Guinea 1847 71 | Eritrea 3470 72 | Estonia 1367 73 | Eswatini 1230 74 | Ethiopia 128691 75 | Falkland Islands (Malvinas) 3 76 | Faroe Islands 54 77 | Fiji 924 78 | Finland 5601 79 | France 66438 80 | French Guiana 303 81 | French Polynesia 281 82 | Gabon 2484 83 | Gambia 2697 84 | Georgia 3807 85 | Germany 84548 86 | Ghana 33787 87 | Gibraltar 38 88 | Greece 10242 89 | Greenland 55 90 | Grenada 117 91 | Guadeloupe 376 92 | Guam 166 93 | Guatemala 18124 94 | Guernsey 64 95 | Guinea 14405 96 | Guinea-Bissau 2153 97 | Guyana 826 98 | Haiti 11637 99 | Holy See 0 100 | Honduras 10644 101 | Hungary 9686 102 | Iceland 387 103 | India 1438069 104 | Indonesia 281190 105 | Iran 90608 106 | Iraq 45074 107 | Ireland 5196 108 | Isle of Man 84 109 | Israel 9256 110 | Italy 59499 111 | Jamaica 2839 112 | Japan 124370 113 | Jersey 103 114 | Jordan 11439 115 | Kazakhstan 20330 116 | Kenya 55339 117 | Kiribati 132 118 | Kosovo 1700 119 | Kuwait 4838 120 | Kyrgyzstan 7073 121 | Laos 7664 122 | Latvia 1882 123 | Lebanon 5773 124 | Lesotho 2311 125 | Liberia 5493 126 | Libya 7305 127 | Liechtenstein 39 128 | Lithuania 2854 129 | Luxembourg 665 130 | Madagascar 31195 131 | Malawi 21104 132 | Malaysia 35126 133 | Maldives 525 134 | Mali 23769 135 | Malta 532 136 | Marshall Islands 38 137 | Martinique 346 138 | Mauritania 5022 139 | Mauritius 1273 140 | Mayotte 316 141 | Mexico 129739 142 | Micronesia 112 143 | Monaco 38 144 | Mongolia 3431 145 | Montenegro 633 146 | Montserrat 4 147 | Morocco 37712 148 | Mozambique 33635 149 | Myanmar 54133 150 | Namibia 2963 151 | Nauru 11 152 | Nepal 29694 153 | Netherlands 18092 154 | New Caledonia 289 155 | New Zealand 5172 156 | Nicaragua 6823 157 | Niger 26159 158 | Nigeria 227882 159 | Niue 1 160 | North Macedonia 1831 161 | Northern Mariana Islands 45 162 | Norway 5519 163 | Oman 5049 164 | Pakistan 247504 165 | Palau 17 166 | Panama 4458 167 | Papua New Guinea 10389 168 | Paraguay 6844 169 | Peru 33845 170 | Philippines 114891 171 | Poland 38762 172 | Portugal 10430 173 | Puerto Rico 3242 174 | Qatar 2979 175 | South Korea 51748 176 | Moldova 3067 177 | Romania 19118 178 | Russia 145440 179 | Rwanda 13954 180 | Réunion 874 181 | Saint Barthélemy 11 182 | Saint Helena 5 183 | Saint Kitts and Nevis 46 184 | Saint Lucia 179 185 | Saint Martin 27 186 | Saint Pierre and Miquelon 5 187 | Saint Vincent and the Grenadines 101 188 | Samoa 216 189 | San Marino 33 190 | Sao Tome and Principe 230 191 | Saudi Arabia 33264 192 | Senegal 18077 193 | Serbia 6773 194 | Seychelles 
127 195 | Sierra Leone 8460 196 | Singapore 5789 197 | Sint Maarten 42 198 | Slovakia 5518 199 | Slovenia 2118 200 | Solomon Islands 800 201 | Somalia 18358 202 | South Africa 63212 203 | South Sudan 11483 204 | Spain 47911 205 | Sri Lanka 22971 206 | Palestine 5409 207 | Sudan 50042 208 | Suriname 628 209 | Sweden 10551 210 | Switzerland 8870 211 | Syria 23594 212 | Tajikistan 10389 213 | Thailand 71702 214 | Timor-Leste 1384 215 | Togo 9304 216 | Tokelau 2 217 | Tonga 104 218 | Trinidad and Tobago 1502 219 | Tunisia 12200 220 | Turkmenistan 7364 221 | Turks and Caicos Islands 46 222 | Tuvalu 9 223 | Turkey 87270 224 | Uganda 48656 225 | Ukraine 37732 226 | United Arab Emirates 10642 227 | United Kingdom 68682 228 | Tanzania 66617 229 | United States Virgin Islands 85 230 | USA 343477 231 | Uruguay 3388 232 | Uzbekistan 35652 233 | Vanuatu 320 234 | Venezuela 28300 235 | Vietnam 100352 236 | Wallis and Futuna Islands 11 237 | Western Sahara 579 238 | Yemen 39390 239 | Zambia 20723 240 | Zimbabwe 16340 241 | --------------------------------------------------------------------------------