├── .dockstore.yml
├── .gitignore
├── .gitmodules
├── .guix-deploy
├── .guix-run
├── .guix-test
├── Dockerfile
├── LICENSE
├── README.md
├── bh20seqanalyzer
├── __init__.py
└── main.py
├── bh20sequploader
├── SARS-CoV-2-reference.fasta
├── __init__.py
├── bh20seq-options.yml
├── bh20seq-schema.yml
├── bh20seq-shex.rdf
├── main.py
├── qc_fasta.py
├── qc_metadata.py
└── validation
│ ├── Makefile
│ ├── formats
│ └── formats.mgc
├── bh20simplewebuploader
├── __init__.py
├── api.py
├── main.py
├── static
│ ├── blog.css
│ ├── image
│ │ ├── AWS-Logo.png
│ │ ├── AWS.jpg
│ │ ├── BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.pdf
│ │ ├── BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.png
│ │ ├── CWL-Logo-Header.png
│ │ ├── CWL.png
│ │ ├── ESR.png
│ │ ├── REDCap.png
│ │ ├── UTHSC-primary-stacked-logo-4c.png
│ │ ├── arvados-logo.png
│ │ ├── arvados-workflow-output.png
│ │ ├── coronasmallcomp.gif
│ │ ├── covid19biohackathon.png
│ │ ├── curii.logo.ai.png
│ │ ├── curii.logo.ai.svg
│ │ ├── edit.png
│ │ ├── oxford-nanopore.jpg
│ │ ├── oxford-nanopore2.jpg
│ │ ├── pubseq-aln.png
│ │ ├── redcap_logo_high_res_white_on_black.svg
│ │ └── redcap_logo_high_res_white_on_black.svg.png
│ ├── main.css
│ ├── main.js
│ └── map.js
└── templates
│ ├── about.html
│ ├── banner.html
│ ├── blog.html
│ ├── blurb.html
│ ├── demo.html
│ ├── download.html
│ ├── ebi-sample.xml
│ ├── error.html
│ ├── export.html
│ ├── footer.html
│ ├── form.html
│ ├── header.html
│ ├── home.html
│ ├── list.html
│ ├── mapheader.html
│ ├── menu.html
│ ├── org-header.html
│ ├── permalink.html
│ ├── resource.html
│ ├── search.html
│ ├── status.html
│ ├── success.html
│ └── validated.html
├── data
└── original_semantic_enrichment
│ ├── cases_per_country.txt
│ ├── countries.ttl
│ ├── death_per_country.txt
│ └── labels.ttl
├── doc
├── DEVELOPMENT.md
├── INSTALL.md
├── blog
│ ├── covid19-pubseq-location-data.html
│ ├── covid19-pubseq-location-data.org
│ ├── covid19-pubseq-update-rdf.org
│ ├── using-covid-19-pubseq-part1.html
│ ├── using-covid-19-pubseq-part1.org
│ ├── using-covid-19-pubseq-part2.html
│ ├── using-covid-19-pubseq-part2.org
│ ├── using-covid-19-pubseq-part3.html
│ ├── using-covid-19-pubseq-part3.org
│ ├── using-covid-19-pubseq-part4.html
│ ├── using-covid-19-pubseq-part4.org
│ ├── using-covid-19-pubseq-part5.html
│ ├── using-covid-19-pubseq-part5.org
│ ├── using-covid-19-pubseq-part6.html
│ └── using-covid-19-pubseq-part6.org
├── talks
│ └── Utrecht-20210510
│ │ └── presentation.org
└── web
│ ├── about.html
│ ├── about.org
│ ├── contact.html
│ ├── contact.org
│ ├── download.html
│ ├── download.org
│ ├── export.html
│ └── export.org
├── etc
└── virtuoso-ose
│ └── virtuoso.ini
├── example
├── esr_example.yaml
├── maximum_metadata_example.yaml
├── minimal_metadata_example.yaml
├── sequence.fasta
└── uthsc_example.yaml
├── gittaggers.py
├── image
└── homepage.png
├── lib
└── ruby
│ └── VERSION
├── paper
├── paper.bib
└── paper.md
├── scripts
├── README.md
├── cleanup.py
├── create_sra_metadata
│ ├── SraExperimentPackage.2020.07.09.xml.gz
│ └── create_sra_metadata.py
├── db_enrichment
│ ├── .gitignore
│ ├── country_enrichment.py
│ ├── input_location.csv
│ ├── readme.md
│ └── update
│ │ └── README.org
├── delete_entries_on_arvados.py
├── dict_ontology_standardization
│ ├── ncbi_countries.csv
│ ├── ncbi_host_health_status.csv
│ ├── ncbi_host_species.csv
│ ├── ncbi_sequencing_technology.csv
│ └── ncbi_speciesman_source.csv
├── docker
│ └── Dockerfile
├── esr_samples
│ ├── Pathogen.cl.1.0.xlsx
│ ├── esr_samples.py
│ ├── jetson
│ │ ├── 21JETSONTEST001.consensus.yaml
│ │ └── 21JETSONTEST001.fasta
│ └── template.yaml
├── fasta2vcf
│ ├── MZ026486.1.fasta
│ ├── README.md
│ ├── alignment2vcf.py
│ ├── fasta2vcf.sh
│ ├── resources
│ │ ├── MN908947.3.fasta
│ │ ├── NC_045512.2.fasta
│ │ ├── NC_045512.2.fasta.fai
│ │ ├── README.md
│ │ └── ensembl-export.csv
│ └── simpleVcfAnnotation.py
├── fetch_from_genbank.cwl
├── foreach.sh
├── gen_docs
│ └── org2html.sh
├── import.cwl
├── import_from_genbank.cwl
├── import_to_arvados.py
├── split_into_arrays.cwl
├── submit_ebi
│ └── example
│ │ ├── project-submission.xml
│ │ ├── project.xml
│ │ ├── sample-submission.xml
│ │ └── sample.xml
├── update_virtuoso
│ └── check_for_updates.py
├── upload.cwl
├── uthsc_samples
│ ├── .gitignore
│ ├── template.yaml
│ └── uthsc_samples.py
└── utils.py
├── setup.py
├── test
├── data
│ ├── 10_samples.fa
│ ├── 10_samples.xlsx
│ ├── input
│ │ ├── TN_UT2.fa
│ │ └── TN_UT2.yaml
│ ├── regression
│ │ └── TN_UT2.rdf
│ └── test.ttl
├── rest-api.html
├── rest-api.org
├── runner.py
├── test_shex.py
└── test_sparql.py
└── workflows
├── fastq2fasta
├── bam2fasta.cwl
├── bcftools-concat.cwl
├── bcftools-consensus.cwl
├── bcftools-index.cwl
├── bcftools-norm.cwl
├── bcftools-view-exclude-ref.cwl
├── bcftools-view-qc.cwl
├── bcftools-view.cwl
├── bwa-index.cwl
├── bwa-mem.cwl
├── fastq2fasta-create-bwaindex.cwl
├── fastq2fasta.cwl
├── freebayes.cwl
├── samtools-faidx.cwl
├── samtools-sort.cwl
└── samtools-view.cwl
├── pangenome-generate
├── abpoa.cwl
├── arv-main.cwl
├── arvados-and-samtools-dockerfile
│ ├── 1078ECD7.key
│ └── Dockerfile
├── collect-seqs.cwl
├── collect-seqs.py
├── dups2metadata.cwl
├── dups2metadata.py
├── from_sparql.cwl
├── from_sparql.py
├── merge-metadata.cwl
├── merge-metadata.py
├── odgi-build-from-xpoa-gfa.cwl
├── odgi-build.cwl
├── odgi_to_rdf.cwl
├── pangenome-generate.cwl
├── pangenome-generate_abpoa.cwl
├── pangenome-generate_spoa.cwl
├── query-to-gfa.cwl
├── relabel-seqs.cwl
├── relabel-seqs.py
├── seqwish.cwl
├── sort_fasta_by_quality_and_len.cwl
├── sort_fasta_by_quality_and_len.py
├── spoa.cwl
└── testjob.yml
├── phylogeny
├── README.md
├── align.cwl
├── augur.cwl
├── awk-coverage.cwl
├── clado-job.yml
├── coverage.cwl
├── metadata.cwl
├── newick.cwl
└── phylogeny.cwl
├── pubseq
├── generate-rdf.rb
├── normalize-step1.py
├── normalize-step2.rb
├── normalize
│ ├── README.md
│ ├── __init__.py
│ └── mapping.py
├── pubseq-fetch-data.py
├── pubseq-fetch-ids
├── validate.rb
└── wikidata
│ ├── README.org
│ ├── fetch-places.sh
│ ├── fetch-regions.sh
│ └── wikidata-fetch-places.rb
├── pull-data
└── genbank
│ ├── .gitignore
│ ├── .guix-run
│ ├── README.md
│ ├── genbank-fetch-ids.py
│ ├── genbank.py
│ ├── transform-genbank-xml2yamlfa.py
│ ├── update-from-genbank.py
│ └── utils.py
└── update-workflows.sh
/.dockstore.yml:
--------------------------------------------------------------------------------
1 | version: 1.2
2 | workflows:
3 | - name: Pangenome Generator
4 | subclass: CWL
5 | primaryDescriptorPath: /workflows/pangenome-generate/pangenome-generate.cwl
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py~
2 |
3 | # Distribution / packaging
4 | build/
5 | cache.txt
6 | metadata.ttl
7 | __pycache__/
8 | eggs/
9 | .eggs/
10 | *.egg-info/
11 | *.egg
12 |
13 | # Temp files
14 | metadata.ttl
15 | metadata*
16 | cache.txt
17 | data/wikidata
18 |
19 | # Environments
20 | .env
21 | .venv
22 | env/
23 | venv/
24 | ENV/
25 | env.bak/
26 | venv.bak/
27 |
28 | relabeledSeqs*
29 |
30 | # Generated dirs/files
31 | metadata_from_nuccore/
32 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "workflows/tools"]
2 | path = workflows/tools
3 | url = https://github.com/common-workflow-library/bio-cwl-tools.git
4 |
--------------------------------------------------------------------------------
/.guix-deploy:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | #
3 | # This script runs the web server in a Guix container
4 |
5 | GUIX_PROFILE=/home/wrk/.config/guix/current
6 | export GUILE_LOAD_PATH=$GUIX_PROFILE/share/guile/site/3.0/
7 | export GUILE_LOAD_COMPILED_PATH=$GUIX_PROFILE/share/guile/site/3.0/
8 |
9 | ls $GUILE_LOAD_PATH
10 |
11 | env GUIX_PACKAGE_PATH=/home/wrk/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-redis python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_RUN_PORT=5067 FLASK_APP=bh20simplewebuploader/main.py flask run
12 |
13 |
--------------------------------------------------------------------------------
/.guix-run:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | #
3 | # Set up a container to run the scripts
4 |
5 | GUIX_PROFILE=/home/wrk/.config/guix/current
6 | export GUILE_LOAD_PATH=$GUIX_PROFILE/share/guile/site/3.0/
7 | export GUILE_LOAD_COMPILED_PATH=$GUIX_PROFILE/share/guile/site/3.0/
8 |
9 | ls $GUILE_LOAD_PATH
10 |
11 | env GUIX_PACKAGE_PATH=/home/wrk/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl minimap2 python-schema-salad --share=/export/tmp
12 |
13 |
--------------------------------------------------------------------------------
/.guix-test:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | #
3 | # This script runs the tests in a Guix container
4 |
5 | GUIX_PROFILE=~/.config/guix/current
6 | export GUILE_LOAD_PATH=$GUIX_PROFILE/share/guile/site/3.0/
7 | export GUILE_LOAD_COMPILED_PATH=$GUIX_PROFILE/share/guile/site/3.0/
8 |
9 | ls $GUILE_LOAD_PATH
10 |
11 | env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client -- python3 test/runner.py
12 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Dockerfile for containerizing the web interface
2 | FROM python:3.6-jessie
3 | WORKDIR /app
4 |
5 | RUN pip3 install gunicorn
6 |
7 | ADD LICENSE /app/
8 | ADD gittaggers.py /app/
9 | ADD setup.py /app/
10 | ADD README.md /app/
11 | ADD example /app/example
12 | ADD bh20seqanalyzer /app/bh20seqanalyzer
13 | ADD bh20sequploader /app/bh20sequploader
14 | ADD bh20simplewebuploader /app/bh20simplewebuploader
15 |
16 | RUN pip3 install -e .[web]
17 |
18 | ENV PORT 8080
19 | CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:8080", "bh20simplewebuploader.main:app"]
20 |
--------------------------------------------------------------------------------
/bh20seqanalyzer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20seqanalyzer/__init__.py
--------------------------------------------------------------------------------
/bh20sequploader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20sequploader/__init__.py
--------------------------------------------------------------------------------
/bh20sequploader/bh20seq-options.yml:
--------------------------------------------------------------------------------
1 | # Contains suggested human-readable field values and their corresponding IRIs.
2 | # Keyed on the field names in the types in the schema. Relies on field names
3 | # being unique or at least using the same options in different containing
4 | # types.
5 |
6 | license_type:
7 | CC0 Public Domain Dedication: http://creativecommons.org/publicdomain/zero/1.0/
8 | CC-BY-4.0 Attribution 4.0 International: http://creativecommons.org/licenses/by/4.0/
9 |
10 | host_age_unit:
11 | Years: http://purl.obolibrary.org/obo/UO_0000036
12 | Months: http://purl.obolibrary.org/obo/UO_0000035
13 | Weeks: http://purl.obolibrary.org/obo/UO_0000034
14 | Days: http://purl.obolibrary.org/obo/UO_0000033
15 | Hours: http://purl.obolibrary.org/obo/UO_0000032
16 |
17 | host_sex:
18 | Male: http://purl.obolibrary.org/obo/PATO_0000384
19 | Female: http://purl.obolibrary.org/obo/PATO_0000383
20 | Intersex: http://purl.obolibrary.org/obo/PATO_0001340
21 |
22 | host_health_status:
23 | healthy: http://purl.obolibrary.org/obo/NCIT_C115935
24 | asymptomatic: http://purl.obolibrary.org/obo/NCIT_C3833
25 | symptomatic: http://purl.obolibrary.org/obo/NCIT_C25269
26 | admitted to hospital: http://purl.obolibrary.org/obo/GENEPIO_0002020
27 | discharged from hospital: http://purl.obolibrary.org/obo/GENEPIO_0001849
28 | dead: http://purl.obolibrary.org/obo/NCIT_C28554
29 | alive: http://purl.obolibrary.org/obo/NCIT_C37987
30 |
31 | sample_sequencing_technology:
32 | Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173
33 | Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566
34 | Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567
35 | Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205
36 | Illumina: http://purl.obolibrary.org/obo/OBI_0000759
37 | IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894
38 | Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632
39 | Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818
40 | Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641
41 |
42 | specimen_source:
43 | nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831
44 | oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835
45 | sputum: http://purl.obolibrary.org/obo/NCIT_C13278
46 | bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195
47 | saliva: http://purl.obolibrary.org/obo/NCIT_C13275
48 | aspirate: http://purl.obolibrary.org/obo/NCIT_C13347
49 |
--------------------------------------------------------------------------------
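These label-to-IRI pairs presumably back the choices offered by the upload form, so that a human-readable selection is stored as its IRI in the generated metadata. A minimal lookup sketch, assuming PyYAML is available and the file is read from the installed bh20sequploader package (the helper name is hypothetical, not part of the codebase):

import pkg_resources
import yaml

def option_iri(field, label):
    """Resolve a human-readable option label to its IRI (hypothetical helper)."""
    stream = pkg_resources.resource_stream("bh20sequploader", "bh20seq-options.yml")
    options = yaml.safe_load(stream)
    return options.get(field, {}).get(label)

print(option_iri("host_sex", "Male"))
# http://purl.obolibrary.org/obo/PATO_0000384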
/bh20sequploader/qc_fasta.py:
--------------------------------------------------------------------------------
1 | import pkg_resources
2 | import tempfile
3 | import magic
4 | import subprocess
5 | import tempfile
6 | import logging
7 | import re
8 | import io
9 | import gzip
10 |
11 | log = logging.getLogger(__name__ )
12 |
13 | def read_fasta(sequence):
14 | entries = 0
15 | bases = []
16 | label = None
17 | for line in sequence:
18 | if line.startswith(">"):
19 | label = line
20 | entries += 1
21 | else:
22 | bases.append(line)
23 | if entries > 1:
24 | log.debug("FASTA file contains multiple entries")
25 | raise ValueError("FASTA file contains multiple entries")
26 | return label, bases
27 |
28 | def qc_fasta(arg_sequence, check_with_mimimap2=True):
29 | log.debug("Starting qc_fasta")
30 | schema_resource = pkg_resources.resource_stream(__name__, "validation/formats")
31 | with tempfile.NamedTemporaryFile() as tmp:
32 | tmp.write(schema_resource.read())
33 | tmp.flush()
34 | val = magic.Magic(magic_file=tmp.name,
35 | uncompress=False, mime=True)
36 |
37 | gz = ""
38 | if arg_sequence.name.endswith(".gz"):
39 | sequence = gzip.GzipFile(fileobj=arg_sequence, mode='rb')
40 | gz = ".gz"
41 | else:
42 | sequence = arg_sequence
43 |
44 | sequence = io.TextIOWrapper(sequence)
45 | r = sequence.read(4096)
46 | sequence.seek(0)
47 |
48 | seqlabel = r[1:r.index("\n")]
49 | seq_type = val.from_buffer(r).lower()
50 |
51 | if seq_type == "text/fasta":
52 | # ensure that contains only one entry
53 | submitlabel, submitseq = read_fasta(sequence)
54 | sequence.seek(0)
55 | sequence.detach()
56 |
57 | if check_with_mimimap2:
58 | with tempfile.NamedTemporaryFile() as tmp1:
59 | with tempfile.NamedTemporaryFile() as tmp2:
60 | refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta")
61 | tmp1.write(refstring)
62 | tmp1.flush()
63 | tmp2.write(submitlabel.encode("utf8"))
64 | tmp2.write(("".join(submitseq)).encode("utf8"))
65 | tmp2.flush()
66 |
67 | similarity = 0
68 | try:
69 | log.debug("Trying to run minimap2")
70 | cmd = ["minimap2", "-c", "-x", "asm20", tmp1.name, tmp2.name]
71 | logging.info("QC checking similarity to reference")
72 | logging.info(" ".join(cmd))
73 | result = subprocess.run(cmd, stdout=subprocess.PIPE)
74 | result.check_returncode()
75 | res = result.stdout.decode("utf-8")
76 | mm = res.split("\t")
77 | if len(mm) >= 10:
78 | # divide Number of matching bases in the mapping / Target sequence length
79 | similarity = (float(mm[9]) / float(mm[6])) * 100.0
80 | else:
81 | similarity = 0
82 | except Exception as e:
83 | logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e)
84 |
85 | if similarity < 70.0:
86 | raise ValueError(
87 | f"QC fail for {seqlabel}: alignment to reference was less than 70% (was {similarity})")
88 |
89 | return "sequence.fasta" + gz, seqlabel, seq_type
90 | elif seq_type == "text/fastq":
91 | sequence.seek(0)
92 | sequence.detach()
93 | return "reads.fastq" + gz, seqlabel, seq_type
94 | else:
95 | log.debug(seqlabel)
96 | log.debug(seq_type)
97 | raise ValueError("Sequence file ({}) does not look like a DNA FASTA or FASTQ".format(arg_sequence))
98 |
--------------------------------------------------------------------------------
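A usage sketch for the checker above, run against the bundled example FASTA. The file must be opened in binary mode; the reference alignment can be skipped when minimap2 is not on the PATH (the keyword argument keeps the spelling used in the source):

from bh20sequploader.qc_fasta import qc_fasta

# Skip the minimap2 similarity check so this runs without minimap2 installed.
with open("example/sequence.fasta", "rb") as f:
    target_name, label, mime = qc_fasta(f, check_with_mimimap2=False)

print(target_name, mime)  # e.g. "sequence.fasta text/fasta"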
/bh20sequploader/qc_metadata.py:
--------------------------------------------------------------------------------
1 | import schema_salad.schema
2 | import schema_salad.ref_resolver
3 | import schema_salad.jsonld_context
4 | import logging
5 | import pkg_resources
6 | import logging
7 | import traceback
8 | from rdflib import Graph, Namespace
9 | from pyshex.evaluate import evaluate
10 |
11 | metadata_schema = None
12 |
13 | def qc_metadata(metadatafile):
14 | global metadata_schema
15 | log = logging.getLogger(__name__ )
16 | if metadata_schema is None:
17 | schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml")
18 | cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")}
19 | metadata_schema = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache)
20 |
21 | (document_loader,
22 | avsc_names,
23 | schema_metadata,
24 | metaschema_loader) = metadata_schema
25 |
26 | shex = pkg_resources.resource_stream(__name__, "bh20seq-shex.rdf").read().decode("utf-8")
27 |
28 | if not isinstance(avsc_names, schema_salad.avro.schema.Names):
29 | raise Exception(avsc_names)
30 |
31 | doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True)
32 | g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx)
33 | rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape")
34 |
35 | # As part of QC make sure serialization works too, this will raise
36 | # an exception if there are invalid URIs.
37 | g.serialize(format="ntriples")
38 |
39 | if not rslt:
40 | raise Exception(reason)
41 |
42 | return metadata['sample']['sample_id']
43 |
--------------------------------------------------------------------------------
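A usage sketch for the metadata checker, run against one of the bundled examples; on success it returns the sample_id, while schema or ShEx violations raise an exception:

from bh20sequploader.qc_metadata import qc_metadata

sample_id = qc_metadata("example/minimal_metadata_example.yaml")
print(sample_id)  # "XX" for the minimal example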
/bh20sequploader/validation/Makefile:
--------------------------------------------------------------------------------
1 | compile: formats.mgc
2 |
3 | formats.mgc :
4 | file -C -m formats
5 |
--------------------------------------------------------------------------------
/bh20sequploader/validation/formats:
--------------------------------------------------------------------------------
1 | 0 regex \^\>.+\r?\n([A-Za-z]+\r?\n)*[A-Za-z]+(\r?\n)?$ FASTA
2 | !:mime text/fasta
3 | 0 regex \^@.+\r?\n[A-Za-z]*\n\\+.*\n[!-i]*(\r\n)? FASTQ
4 | !:mime text/fastq
--------------------------------------------------------------------------------
/bh20sequploader/validation/formats.mgc:
--------------------------------------------------------------------------------
(binary file: compiled libmagic database generated from `formats` via `file -C -m formats`; binary contents omitted)
--------------------------------------------------------------------------------
/bh20simplewebuploader/__init__.py:
--------------------------------------------------------------------------------
1 | import bh20simplewebuploader.api
2 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/blog.css:
--------------------------------------------------------------------------------
1 | .title { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace }
2 | .table-of-contents { font-family: monospace; color: red; }
3 | /* .text-table-of-contents { font-family: monospace; color: black; font-size:80%; } */
4 | .timestamp { font-family: monospace; color: darkgreen; }
5 |
6 | h1,h2 { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace; color:black;background-color:white; }
7 | h2 { color: black; }
8 | h3,h4 { color: black; margin:0; }
9 | code { color: darkblue; }
10 | body {font-family: Palatino, 'Palatino Linotype', serif; color:black; background-color:white; font-size: large; padding: 10px; }
11 |
12 | div.verbatim { margin: 30px; color: black; background-color: white; border-style:outset;
13 | font-family: palatino font, monospace; font-size:80%; font-weight:bold; }
14 | div.quote { font-family: palatino font, monospace; font-size:80%; }
15 | div.quotation { font-family: palatino font, monospace; font-size:80%; }
16 | pre.example { margin: 30px; font-family: prestige, monospace; color:black; font-size:70%; background-color: lightyellow; }
17 | pre.src { margin: 30px; font-family: prestige, monospace; font-weight: bold; color:white; font-size:80%; background-color: black; }
18 |
19 | div[id="text-table-of-contents"]{
20 | font-family: palatino font, monospace; background-color:white;
21 | border-style: dotted;
22 | border-color: #98bf21;
23 | border-width: 1px;
24 | }
25 | div[class^="outline-text"] {
26 | margin: 10px;
27 | // background-color:white;
28 | // border-style: dotted;
29 | // border-color: #98bf21;
30 | // border-width: 1px;
31 | font-family: Palatino, 'Palatino Linotype', serif; color:black; font-size: large
32 | }
33 | span[class="todo TESTING"] {
34 | color:purple;
35 | }
36 | span[class="todo IN_PROGRESS"] {
37 | color:brown;
38 | }
39 | span[class^="section-number"] {
40 | color:grey;
41 | }
42 | span[class="journal"] {
43 | color:darkblue;
44 | }
45 | span[class="year"] {
46 | color:darkred;
47 | }
48 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/AWS-Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/AWS-Logo.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/AWS.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/AWS.jpg
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.pdf
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/CWL-Logo-Header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/CWL-Logo-Header.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/CWL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/CWL.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/ESR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/ESR.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/REDCap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/REDCap.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/UTHSC-primary-stacked-logo-4c.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/UTHSC-primary-stacked-logo-4c.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/arvados-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/arvados-logo.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/arvados-workflow-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/arvados-workflow-output.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/coronasmallcomp.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/coronasmallcomp.gif
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/covid19biohackathon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/covid19biohackathon.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/curii.logo.ai.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/curii.logo.ai.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/edit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/edit.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/oxford-nanopore.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/oxford-nanopore.jpg
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/oxford-nanopore2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/oxford-nanopore2.jpg
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/pubseq-aln.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/pubseq-aln.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/image/redcap_logo_high_res_white_on_black.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/redcap_logo_high_res_white_on_black.svg.png
--------------------------------------------------------------------------------
/bh20simplewebuploader/static/map.js:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | Draws the map using Leaflet and OpenStreetmap
4 |
5 | drawMap() is the main function.
6 |
7 | */
8 | var map = L.map( 'mapid', {
9 | center: [51.505, -0.09], // Default view (London); zoomed out on load
10 | minZoom: 2,
11 | zoom: 0
12 | });
13 |
14 | L.tileLayer( 'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
15 | attribution: '© OpenStreetMap | COVID-19 PubSeq ',
16 | subdomains: ['a','b','c']
17 | }).addTo(map);
18 |
19 | /*
20 | * When a page gets rendered this function draws the map
21 | */
22 |
23 | function drawMap(){
24 | var mymap = map;
25 |
26 | // ---- fetch all counts
27 | fetch(scriptRoot + "api/getCountByGPS")
28 | .then(response => {
29 | console.log(response)
30 | return response.json();
31 | })
32 | .then(data => {
33 | buildMapMarkers(data);
34 |
35 | });
36 | document.getElementById("map_view").classList.remove("invisible");
37 | map.invalidateSize();
38 | }
39 |
40 |
41 | /*
42 | * Register a marker with special attribute track # sequences
43 | */
44 |
45 | seqMarker = L.Marker.extend({
46 | options: {
47 | seqMarkerLocation: "Loc",
48 | contributors: "countContrib",
49 | sequences: "countSeq"
50 | }
51 | });
52 |
53 | /*
54 | * Builds markers on the map. We use cluster groups to allow
55 | * counts at different zoom levels. This function is called
56 | * once on page loading. markerClusterGroup just handles it.
57 | * Note the display is handled in CSS (main.css) as .my-custom-icon*
58 | */
59 |
60 | function buildMapMarkers(data) {
61 | let markers = L.markerClusterGroup({
62 | singleMarkerMode: true,
63 | iconCreateFunction: function (cluster) {
64 | // ---- add marker
65 | // array of each marker in the cluster:
66 | var theseMarkers = cluster.getAllChildMarkers();
67 |
68 | // --- compute zoom level and set style
69 |
70 | sumCount = 0;
71 | for (var i = 0; i < theseMarkers.length; i++) {
72 | sumCount += theseMarkers[i].options.sequences;
73 | }
74 |
75 | if (theseMarkers.length < 2) {
76 | return L.divIcon({
77 | html: sumCount,
78 | className: 'my-custom-icon my-custom-icon-0',
79 | })
80 | } else {
81 | var digits = (sumCount + '').length;
82 | return L.divIcon({
83 | html: sumCount,
84 | className: 'my-custom-icon my-custom-icon-'+digits,
85 | });
86 | }}});
87 | // ---- Build the marker list
88 | for (let i = 0; i < data.length; i++) {
89 | let {"count": fastaCount, GPS, Location: location, LocationLabel: label } = data[i];
90 | let countSeq = Number(fastaCount);
91 |
92 | let coordinates = GPS.split(" ");
93 | if (!(coordinates == null)) {
94 | let lat, lon;
95 | [lon, lat] = coordinates.map(parseFloat);
96 | let point = L.point()
97 | marker = new seqMarker([lat, lon],markerOptions={title: fastaCount+" sequences",sequences: countSeq});
98 | marker.bindPopup("" + label + " " + "SARS-CoV-2 sequences: " + fastaCount + " ");
99 | markers.addLayer(marker);
100 | }
101 | }
102 | map.addLayer(markers);
103 | }
104 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/about.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'org-header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 | {{ embed|safe }}
9 |
10 | {% include 'footer.html' %}
11 |
12 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/banner.html:
--------------------------------------------------------------------------------
1 |
9 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/blog.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 | {% if embed %}
9 | {{ embed|safe }}
10 |
11 |
12 | Other documents
13 |
14 | {% else %}
15 |
16 | Documents:
17 | {% endif %}
18 |
19 |
20 |
21 |
22 |
23 |
26 |
27 | We fetch sequence data and metadata. We query
28 | the metadata in multiple ways using SPARQL and ontologies
29 |
30 |
31 |
32 |
35 |
36 | We submit a sequence to the database. In this BLOG we fetch
37 | a sequence from GenBank and add it to the database.
38 |
39 |
40 |
41 |
44 |
45 | We modify a workflow to get new output
46 |
47 |
48 |
49 |
52 |
53 | We modify metadata for all to use! In this BLOG we add a field
54 | for a creative commons license.
55 |
56 |
57 |
58 |
61 |
62 | Dealing with PubSeq localisation data
63 |
64 |
65 |
66 |
69 |
70 | We explore the Arvados command line and API
71 |
72 |
73 |
74 |
77 |
78 | Generate the files needed for uploading to EBI/ENA
79 |
80 |
81 |
82 |
85 |
86 | Documentation for PubSeq REST API
87 |
88 |
89 |
90 |
91 |
92 |
93 | {% include 'footer.html' %}
94 |
95 |
98 |
99 |
100 |
101 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/blurb.html:
--------------------------------------------------------------------------------
1 |
2 | COVID-19 PubSeq is a free and open online bioinformatics public
3 | sequence resource with federated data using unique identifiers and
4 | with unique metadata, such as disambiguated
5 |
6 | Geo localisation. PubSeq comes with on-the-fly analysis of
7 | sequenced SARS-CoV-2 samples that allows for a quick turnaround in
8 | identification of new virus strains. PubSeq allows anyone to upload
9 | sequence material in the form of FASTA or FASTQ files with
10 | accompanying metadata through a web interface or REST API.
11 |
12 |
13 | PubSeq is not owned by anyone. There is no central authority and
14 | there is no (single) company that owns that data or workflows. Our
15 | goal is simply to help map the viral variants. Early identification
16 | of variants helps with testing and treatments! COVID-19 PubSeq
17 | accepts sequence material from all sources. In addition, PubSeq has
18 | specific workflows for Oxford Nanopore analysis in FAST5 and FASTQ
19 | format. If you have an Oxford Nanopore and need (free) help
20 | analysing SARS-CoV-2 FAST5 or FASTQ data, feel free
21 | to contact us!
22 |
23 |
24 | COVID-19 PubSeq is also a repository for sequences with a low
25 | barrier to entry for uploading sequence data using best practices,
26 | including FAIR
27 | data . Data are published with metadata using state-of-the art
28 | standards and, perhaps most importantly, providing standardised
29 | workflows that get triggered on upload, so that results are
30 | immediately available in standardised data formats. Note that, in
31 | general, there is no conflict in also uploading your data to other
32 | repositories, including EBI/ENA and GISAID.
33 |
34 |
35 | Your uploaded sequence will automatically be processed and
36 | incorporated into the public pangenome with metadata using workflows
37 | from the High Performance Open Biology Lab
38 | defined here . Importantly, all
39 | data is published under
40 | a Creative
41 | Commons license (CC0 or CC-BY-4.0). Anyone can take the
42 | published (GFA/RDF/FASTA) data and use it for
43 | further processing.
44 |
45 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/demo.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 | The Virtuoso database contains public sequences! The examples here should provide a starting point to explore our data in our public SPARQL endpoint or via the SIB COVID-19 Integrated Knowledgebase. See also our documentation here for more information!
9 |
20 |
21 |
49 |
50 |
51 |
52 |
53 |
56 |
57 | {% include 'footer.html' %}
58 |
59 |
74 |
75 |
76 |
77 |
78 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/download.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'org-header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 | {{ embed|safe }}
9 |
10 | {% include 'footer.html' %}
11 |
12 |
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/ebi-sample.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | COVID-19 PubSeq Sample
5 |
6 | 2697049
7 | Severe acute respiratory syndrome coronavirus 2
8 | SARS-CoV-2
9 |
10 |
11 |
12 | investigation type
13 | {{ investigation_type }}
14 |
15 |
16 | sequencing method
17 | {{ sequencer }}
18 |
19 |
20 | collection date
21 | {{ date }}
22 |
23 |
24 | geographic location (latitude)
25 | {{ latidude }}
26 | DD
27 |
28 |
29 | geographic location (longitude)
30 | {{ longitude }}
31 | DD
32 |
33 |
34 | geographic location (country and/or sea)
35 | {{ country }}
36 |
37 |
38 | geographic location (region and locality)
39 | {{ locality }}
40 |
41 |
42 | environment (material)
43 | {{ specimen }}
44 |
45 |
46 | ENA-CHECKLIST
47 | ERC000011
48 |
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/error.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Upload Failed
7 |
8 |
9 | Upload Failed
10 |
11 |
12 | Your upload has failed.
13 |
14 | {{error_message|safe}}
15 |
16 |
17 |
18 | Click here to try again.
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/export.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 |
9 | {% if embed %}
10 | {{ embed|safe }}
11 | {% endif %}
12 |
13 |
14 |
15 |
16 |
17 | Search
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 | {% include 'footer.html' %}
27 |
28 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/footer.html:
--------------------------------------------------------------------------------
1 |
2 |
62 | {% if load_map %}
63 |
64 | {% endif %}
65 |
66 |
67 |
80 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/header.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | COVID-19 PubSeq: Public SARS-CoV-2 Sequence Resource
9 | {% if blog %}
10 |
11 | {% endif %}
12 |
13 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/list.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 | {% for row in l: %}
16 |
17 |
18 | {% for col in h: %}
19 |
20 | {% if col == 'name': %}
21 |
{{ row[col]['value'] }}
22 | {% else %}
23 | {{ row[col]['value'] }}
24 | {% endif %}
25 |
26 | {% endfor %}
27 |
28 |
29 | {% endfor %}
30 |
31 |
32 |
33 |
34 |
35 | {% include 'footer.html' %}
36 |
37 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/mapheader.html:
--------------------------------------------------------------------------------
1 |
4 |
7 |
10 |
11 |
14 |
17 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/menu.html:
--------------------------------------------------------------------------------
1 |
16 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/permalink.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 |
9 |
10 | {{id}}
11 |
12 |
13 | This page represents a permanent COVID-19 PubSeq SARS-CoV-2 sequence resource
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 | Identifier
23 |
24 |
27 |
28 |
29 |
30 |
31 |
32 | Permanent link
33 |
34 |
37 |
38 |
39 |
40 |
41 |
42 | Location
43 |
44 |
47 |
48 |
49 |
50 |
51 |
52 | Sampling date
53 |
54 |
55 | {{ date }}
56 |
57 |
58 |
59 |
60 |
61 |
62 | Institute
63 |
64 |
65 | {{ institute }}
66 |
67 |
68 |
69 |
70 |
71 |
72 | Sample type
73 |
74 |
77 |
78 |
79 |
80 |
81 |
82 | Sequence
83 |
84 |
87 |
88 |
89 |
90 |
91 |
92 | Metadata
93 |
94 |
97 |
98 |
99 |
100 |
101 |
102 | Source
103 |
104 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 | {% include 'footer.html' %}
115 |
116 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/resource.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 |
23 |
24 | {% include 'footer.html' %}
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/search.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/templates/search.html
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/status.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 | Sequence upload processing status
9 |
10 |
11 |
12 | {{ table }}
13 |
14 |
15 | {% include 'footer.html' %}
16 |
17 |
18 |
19 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/success.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Upload Successful
7 |
8 |
9 | Upload Successful
10 |
11 |
12 | Your files have been uploaded. You can track their QC status; once validated, they will be part of the Public SARS-CoV-2 Sequence Resource.
13 |
14 |
15 | The upload log was:
16 |
17 | {{log}}
18 |
19 |
20 | Click here to upload more files.
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/bh20simplewebuploader/templates/validated.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {% include 'header.html' %}
4 |
5 | {% include 'banner.html' %}
6 | {% include 'menu.html' %}
7 |
8 | Validated sequences
9 |
10 |
11 | {{ table }}
12 |
13 |
14 | {% include 'footer.html' %}
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/doc/DEVELOPMENT.md:
--------------------------------------------------------------------------------
1 | # Development
2 |
3 | ## Upload resume
4 |
5 | When data files get large we may want to implement resume,
6 | like put does. See
7 | [/sdk/python/arvados/commands/put.py](https://dev.arvados.org/projects/arvados/repository/revisions/master/entry/sdk/python/arvados/commands/put.py)
8 |
--------------------------------------------------------------------------------
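A rough sketch of the resume idea, independent of the Arvados client (the names and the state-file format here are hypothetical): persist the number of bytes already confirmed and continue from that offset on the next attempt.

import json
import os

STATE_FILE = "upload.state.json"  # hypothetical bookkeeping file

def resume_offset():
    """Return the byte offset confirmed by a previous attempt, or 0."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f).get("bytes_done", 0)
    return 0

def upload(path, send_chunk, chunk_size=1 << 20):
    """Send chunks starting from the last confirmed offset; send_chunk is the caller's transport."""
    offset = resume_offset()
    with open(path, "rb") as f:
        f.seek(offset)
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            send_chunk(offset, chunk)
            offset += len(chunk)
            with open(STATE_FILE, "w") as s:
                json.dump({"bytes_done": offset}, s)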
/doc/blog/using-covid-19-pubseq-part4.org:
--------------------------------------------------------------------------------
1 | #+TITLE: COVID-19 PubSeq (part 4)
2 | #+AUTHOR: Pjotr Prins
3 | # C-c C-e h h publish
4 | # C-c ! insert date (use . for active agenda, C-u C-c ! for date, C-u C-c . for time)
5 | # C-c C-t task rotate
6 | # RSS_IMAGE_URL: http://xxxx.xxxx.free.fr/rss_icon.png
7 |
8 | #+HTML_HEAD:
9 |
10 |
11 | * Table of Contents :TOC:noexport:
12 | - [[#what-does-this-mean][What does this mean?]]
13 | - [[#where-can-i-find-the-workflows][Where can I find the workflows?]]
14 | - [[#modify-workflow][Modify Workflow]]
15 |
16 | * What does this mean?
17 |
18 | When someone uploads a SARS-CoV-2 sequence using one
19 | of our tools (CLI or web-based) they add a sequence and some metadata
20 | which triggers a rerun of our workflows.
21 |
22 | * Where can I find the workflows?
23 |
24 | Workflows are written in the Common Workflow Language (CWL) and listed
25 | on [[https://github.com/arvados/bh20-seq-resource/tree/master/workflows][github]]. PubSeq being an open project, these workflows can be studied
26 | and modified!
27 |
28 | * Modify Workflow
29 |
30 | /Work in progress!/
31 |
--------------------------------------------------------------------------------
/doc/web/contact.org:
--------------------------------------------------------------------------------
1 | #+TITLE: CONTACT
2 | #+AUTHOR: Pjotr Prins
3 |
4 | * CONTACT and SUPPORT
5 |
6 | COVID-19 PubSeq is run by a community of [[https://github.com/arvados/bh20-seq-resource/graphs/contributors][bioinformaticians]] and
7 | software developers working at leading institutes (see sponsors below)
8 | with the goal of making online analysis available to everyone. You can
9 | talk with us directly in the [[https://matrix.to/#/!kyAxaAAAOgUKAMmXRz:matrix.org?via=matrix.org][matrix PubSeq room]]. We are open to
10 | improving tools, workflows and analysis.
11 |
12 | ** Oxford Nanopore Analysis
13 |
14 | @@html: @@
15 |
16 | We run [[https://en.wikipedia.org/wiki/Oxford_Nanopore_Technologies][Oxford Nanopore]] ourselves. It is an exciting technology because
17 | it gives us an accurate SARS-CoV-2 sequencer for a few thousand
18 | dollars which can be run in a living room! With PubSeq we aim to make
19 | it easy to analyse Nanopore material using our *free* Cloud
20 | infrastructure and the [[https://github.com/pubseq/jetson_nanopore_sequencing][NVIDIA Jetson computer]]. If you need help in
21 | using the online workflows don't hesitate to contact us.
22 |
23 | @@html: @@
24 |
25 | ** Data from other sequencers
26 |
27 | We accept FASTA sequences of SARS-CoV-2. Simply upload them using the
28 | web form and/or REST API. No sign-up required! If you have large scale
29 | short read data and require support we can discuss that. We also run
30 | Illumina sequencing in several places.
31 |
32 | ** Professional support
33 |
34 | To use COVID-19 PubSeq solutions for professional purposes you can
35 | contact Boston based [[mailto:info@curii.com][Curii]], the creators of [[https://arvados.org/][Arvados]], directly.
36 |
37 | COVID-19 PubSeq is built on Arvados using CWL workflows.
38 |
39 | ** E-mail
40 |
41 | For questions feel free to write directly to [[mailto:pjotr.public821@thebird.nl][Pjotr Prins]].
42 |
--------------------------------------------------------------------------------
/doc/web/export.org:
--------------------------------------------------------------------------------
1 | #+TITLE: About/FAQ
2 | #+AUTHOR: Pjotr Prins
3 |
4 | * Table of Contents :TOC:noexport:
5 | - [[#export-data][Export data]]
6 | - [[#sparql-api][SPARQL API]]
7 | - [[#rest-api][REST API]]
8 | - [[#export-ebiena-forms][Export EBI/ENA Forms]]
9 |
10 | * Export data
11 |
12 | Apart from straight file [[http://covid19.genenetwork.org/download][downloads]], COVID-19 PubSeq allows for
13 | exporting forms and data for other services.
14 |
15 | * SPARQL API
16 |
17 |
18 | First of all, PubSeq exports a SPARQL endpoint [[http://sparql.genenetwork.org/sparql/][here]] that allows you to
19 | do any query on the data. See this [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1][document]] for examples.
20 |
21 | * REST API
22 |
23 | In addition to the above flexible SPARQL endpoint - which essentially
24 | is a query REST API - PubSeq exports its own
25 | [[http://covid19.genenetwork.org/apidoc][REST API]].
26 |
27 | * Export EBI/ENA Forms
28 |
29 | Uploading data to EBI/ENA with PubSeq is described [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part6][here]].
30 |
31 | To export, first search for an uploaded entry through its identifier:
32 |
--------------------------------------------------------------------------------
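As export.org above notes, PubSeq exposes a public SPARQL endpoint at sparql.genenetwork.org. A minimal sketch for querying it programmatically with the requests library, using a deliberately generic query rather than anything specific to the PubSeq graph layout:

import requests

ENDPOINT = "http://sparql.genenetwork.org/sparql/"
QUERY = "SELECT ?s ?p ?o WHERE { ?s ?p ?o } LIMIT 10"

resp = requests.get(ENDPOINT,
                    params={"query": QUERY, "format": "application/json"},
                    timeout=30)
resp.raise_for_status()
for row in resp.json()["results"]["bindings"]:
    print(row["s"]["value"], row["p"]["value"], row["o"]["value"])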
/example/esr_example.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 | license:
4 | license_type: http://creativecommons.org/licenses/by/4.0/
5 | title: "SARS-CoV-2 New Zealand"
6 | attribution_name: "ESR"
7 | attribution_url: https://www.esr.cri.nz/
8 |
9 | host:
10 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
11 | additional_host_information: Optional free text field for additional information
12 |
13 | sample:
14 | sample_id: "20VR0174"
15 | collection_date: "2020-02-26"
16 | collection_location: https://www.wikidata.org/wiki/Q37100
17 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
18 | source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
19 | additional_collection_information: Optional free text field for additional information
20 |
21 | virus:
22 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
23 | virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
24 |
25 | technology:
26 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION
27 | alignment_protocol: https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics
28 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
29 | additional_technology_information: "Artic V3 workflow"
30 |
31 | submitter:
32 | authors: [Jemma L Geoghegan, Xiaoyun Ren, Matthew Storey, James Hadfield, Lauren Jelley, Sarah Jefferies, Jill Sherwood, Shevaun Paine, Sue Huang, Jordan Douglas, Fabio K Mendes, Andrew Sporle, Michael G Baker, David R Murdoch, Nigel French, Colin R Simpson, David Welch, Alexei J Drummond, Edward C Holmes, Sebastian Duchene, Joep de Ligt]
33 | submitter_name: [Joep de Ligt]
34 | submitter_address: "PO Box 50348, Porirua 5240, New Zealand"
35 | originating_lab: ESR
36 | submitter_sample_id: "PRJNA648792"
37 | submitted_to: https://www.ncbi.nlm.nih.gov/biosample
38 | publication: https://doi.org/10.1101/2020.08.05.20168930
39 | public_date: "2020-08-20"
40 | submitter_orcid: [https://orcid.org/0000-0003-0970-0153]
41 | additional_submitter_information: Optional free text field for additional information
42 |
--------------------------------------------------------------------------------
/example/maximum_metadata_example.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 | license:
4 | license_type: http://creativecommons.org/licenses/by/4.0/
5 | title: "Sample"
6 | attribution_name: "John doe, Joe Boe, Jonny Oe"
7 | attribution_url: http://covid19.genenetwork.org/id
8 |
9 | host:
10 | host_id: XX1
11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
12 | host_sex: http://purl.obolibrary.org/obo/PATO_0000384
13 | host_age: 20
14 | host_age_unit: http://purl.obolibrary.org/obo/UO_0000036
15 | host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269
16 | host_treatment: Process in which the act is intended to modify or alter host status (Compounds)
17 | host_vaccination: [vaccines1,vaccine2]
18 | ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010
19 | additional_host_information: Optional free text field for additional information
20 |
21 | sample:
22 | sample_id: Id of the sample as defined by the submitter
23 | collector_name: Name of the person that took the sample
24 | collecting_institution: Institute that was responsible of sampling
25 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835]
26 | collection_date: "2020-01-01"
27 | collection_location: http://www.wikidata.org/entity/Q148
28 | sample_storage_conditions: frozen specimen
29 | source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence]
30 | additional_collection_information: Optional free text field for additional information
31 |
32 | virus:
33 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
34 | virus_strain: SARS-CoV-2/human/CHN/HS_8/2020
35 |
36 | technology:
37 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173]
38 | alignment_protocol: Protocol used for assembly
39 | sequencing_coverage: [70.0, 100.0]
40 | additional_technology_information: Optional free text field for additional information
41 |
42 | submitter:
43 | authors: [John Doe, Joe Boe, Jonny Oe]
44 | submitter_name: [John Doe]
45 | submitter_address: John Doe's address
46 | originating_lab: John Doe kitchen
47 | lab_address: John Doe's address
48 | provider: XXX1
49 | submitter_sample_id: XXX2
50 | publication: PMID00001113
51 | submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
52 | additional_submitter_information: Optional free text field for additional information
53 |
--------------------------------------------------------------------------------
/example/minimal_metadata_example.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 |
4 | license:
5 | license_type: http://creativecommons.org/licenses/by/4.0/
6 |
7 | host:
8 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
9 |
10 | sample:
11 | sample_id: XX
12 | collection_date: "2020-01-01"
13 | collection_location: http://www.wikidata.org/entity/Q148
14 |
15 | virus:
16 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
17 |
18 | technology:
19 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
20 |
21 | submitter:
22 | authors: [John Doe]
23 |
--------------------------------------------------------------------------------
/example/uthsc_example.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 | license:
4 | license_type: https://creativecommons.org/licenses/by/4.0/
5 | title: "Sample"
6 | attribution_name: "Mariah Taylor, Colleen Jonsson"
7 | attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
8 |
9 | host:
10 | host_id: TN_UT2
11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
12 | additional_host_information: Optional free text field for additional information
13 |
14 | sample:
15 | sample_id: TN_UT2
16 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
17 | collection_date: "2020-04-26"
18 | collection_location: https://www.wikidata.org/wiki/Q3289517
19 | additional_collection_information: Optional free text field for additional information
20 |
21 | virus:
22 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
23 | virus_strain: SARS-CoV-2/human/USA/AL_UT14/2020
24 |
25 | technology:
26 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION
27 | alignment_protocol: guppy
28 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
29 | additional_technology_information: Optional free text field for additional information
30 |
31 | submitter:
32 | authors: [Mariah Taylor, Colleen Jonsson]
33 | submitter_name: [Mariah Taylor]
34 | submitter_address: UTHSC, Memphis, Tennessee 38163, USA
35 | originating_lab: Regional Biocontainment Laboratory
36 | provider: XXX1
37 | submitter_sample_id: XXX2
38 | publication: PMID00001113
39 | submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001]
40 | additional_submitter_information: Optional free text field for additional information
41 |
--------------------------------------------------------------------------------
/gittaggers.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import time
3 | import pkg_resources
4 | from setuptools.command.egg_info import egg_info
5 |
6 | SETUPTOOLS_VER = pkg_resources.get_distribution(
7 | "setuptools").version.split('.')
8 |
9 | RECENT_SETUPTOOLS = int(SETUPTOOLS_VER[0]) > 40 or (
10 | int(SETUPTOOLS_VER[0]) == 40 and int(SETUPTOOLS_VER[1]) > 0) or (
11 | int(SETUPTOOLS_VER[0]) == 40 and int(SETUPTOOLS_VER[1]) == 0 and
12 | int(SETUPTOOLS_VER[2]) > 0)
13 |
14 | class EggInfoFromGit(egg_info):
15 | """Tag the build with git commit timestamp.
16 |
17 | If a build tag has already been set (e.g., "egg_info -b", building
18 | from source package), leave it alone.
19 | """
20 |
21 | def git_timestamp_tag(self):
22 | gitinfo = subprocess.check_output(
23 | ['git', 'log', '--first-parent', '--max-count=1',
24 | '--format=format:%ct', '.']).strip()
25 | return time.strftime('.%Y%m%d%H%M%S', time.gmtime(int(gitinfo)))
26 |
27 | def tags(self):
28 | if self.tag_build is None:
29 | try:
30 | self.tag_build = self.git_timestamp_tag()
31 | except subprocess.CalledProcessError:
32 | pass
33 | return egg_info.tags(self)
34 |
35 | if RECENT_SETUPTOOLS:
36 | vtags = property(tags)
37 |
--------------------------------------------------------------------------------
/image/homepage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/image/homepage.png
--------------------------------------------------------------------------------
/lib/ruby/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.1
2 |
--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
1 | @book{CWL,
2 | title = "Common Workflow Language, v1.0",
3 | abstract = "The Common Workflow Language (CWL) is an informal, multi-vendor working group consisting of various organizations and individuals that have an interest in portability of data analysis workflows. Our goal is to create specifications that enable data scientists to describe analysis tools and workflows that are powerful, easy to use, portable, and support reproducibility.CWL builds on technologies such as JSON-LD and Avro for data modeling and Docker for portable runtime environments. CWL is designed to express workflows for data-intensive science, such as Bioinformatics, Medical Imaging, Chemistry, Physics, and Astronomy.This is v1.0 of the CWL tool and workflow specification, released on 2016-07-08",
4 | keywords = "cwl, workflow, specification",
5 | author = "Brad Chapman and John Chilton and Michael Heuer and Andrey Kartashov and Dan Leehr and Herv{\'e} M{\'e}nager and Maya Nedeljkovich and Matt Scales and Stian Soiland-Reyes and Luka Stojanovic",
6 | editor = "Peter Amstutz and Crusoe, {Michael R.} and Nebojša Tijanić",
7 | note = "Specification, product of the Common Workflow Language working group. http://www.commonwl.org/v1.0/",
8 | year = "2016",
9 | month = "7",
10 | day = "8",
11 | doi = "10.6084/m9.figshare.3115156.v2",
12 | language = "English",
13 | publisher = "figshare",
14 | address = "United States",
15 |
16 | }
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | ### Instructions to download and/or prepare the data and/or the metadata
2 |
3 | Just go into the `download_genbank_data` or `download_sra_data` directory and execute the python3 script inside.
4 |
5 | - `download_genbank_data/from_genbank_to_fasta_and_yaml.py` downloads the data and the metadata, preparing the paired FASTA and YAML files (see the sketch below);
6 | - `download_sra_data/download_sra_data.py` creates the metadata in the form of YAML files from the SraExperimentPackage.XXX.xml.gz file in the same directory.
7 |
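As a quick sanity check after running the GenBank script, here is a minimal sketch (not part of the repository) that verifies every FASTA written to `fasta_and_yaml/` has a matching YAML metadata file, the pairing that `foreach.sh` and `import_to_arvados.py` rely on:

```python
# Minimal sketch: confirm each FASTA in fasta_and_yaml/ has a paired YAML
# metadata file. Directory and naming convention taken from foreach.sh and
# import_to_arvados.py in this repository.
import glob
import os

for fasta in sorted(glob.glob("fasta_and_yaml/*.fasta")):
    yaml_path = fasta[:-len(".fasta")] + ".yaml"
    if not os.path.exists(yaml_path):
        print("missing metadata for", fasta)
```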
--------------------------------------------------------------------------------
/scripts/cleanup.py:
--------------------------------------------------------------------------------
1 | import arvados
2 | import arvados.util
3 | import arvados.keep
4 | import ruamel.yaml
5 |
6 | api = arvados.api()
7 | keepclient = arvados.keep.KeepClient(api_client=api)
8 |
9 | UPLOADER_PROJECT = 'lugli-j7d0g-n5clictpuvwk8aa'
10 | VALIDATED_PROJECT = 'lugli-j7d0g-5ct8p1i1wrgyjvp'
11 |
12 | delete_patterns = [
13 | "%missing%`collection_location`%",
14 | "%missing%`technology`%",
15 | "%missing%`host_species`%",
16 | "%QC fail: alignment%",
17 | "%does not look like a valid URI%",
18 | "%Duplicate of%",
19 | "%No matching triples found for predicate obo:NCIT_C42781%",
20 | "%does not look like a valid URI%"
21 | ]
22 |
23 | revalidate_patterns = [
24 | "%missing%`license`%",
25 | "%QC fail%"
26 | ]
27 |
28 | for p in delete_patterns:
29 | c = arvados.util.list_all(api.collections().list, filters=[
30 | ["owner_uuid", "=", UPLOADER_PROJECT],
31 | ["properties.errors", "like", p]])
32 | for i in c:
33 | print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
34 | api.collections().delete(uuid=i["uuid"]).execute()
35 |
36 | for p in revalidate_patterns:
37 | c = arvados.util.list_all(api.collections().list, filters=[
38 | ["owner_uuid", "=", UPLOADER_PROJECT],
39 | ["properties.errors", "like", p]])
40 | for i in c:
41 | print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label")))
42 | pr = i["properties"]
43 | if "status" in pr:
44 | del pr["status"]
45 | if "errors" in pr:
46 | del pr["errors"]
47 | api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute()
48 |
49 | c = arvados.util.list_all(api.collections().list, filters=[
50 | ["owner_uuid", "=", VALIDATED_PROJECT],
51 | ["properties.sequence_label", "exists", False]])
52 | for i in c:
53 | col = arvados.collection.Collection(i["uuid"], api_client=api, keep_client=keepclient)
54 | with col.open("metadata.yaml") as md:
55 | metadata_content = ruamel.yaml.round_trip_load(md)
56 | colprop = col.get_properties()
57 | colprop["sequence_label"] = metadata_content["sample"]["sample_id"]
58 |
59 | print("fixing sequence label %s %s" % (i["uuid"], colprop.get("sequence_label")))
60 | api.collections().update(uuid=i["uuid"], body={"properties": colprop}).execute()
61 |
--------------------------------------------------------------------------------
/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz
--------------------------------------------------------------------------------
/scripts/db_enrichment/.gitignore:
--------------------------------------------------------------------------------
1 | enriched_output.txt
2 |
--------------------------------------------------------------------------------
/scripts/db_enrichment/input_location.csv:
--------------------------------------------------------------------------------
1 | http://www.wikidata.org/entity/Q7960498
2 | http://www.wikidata.org/entity/Q692895
3 | http://www.wikidata.org/entity/Q2722074
4 | http://www.wikidata.org/entity/Q25622187
5 | http://www.wikidata.org/entity/Q27684996
6 | http://www.wikidata.org/entity/Q2757125
7 |
--------------------------------------------------------------------------------
/scripts/db_enrichment/readme.md:
--------------------------------------------------------------------------------
1 | We have two files in the folder *semantic_enrichment* that are used to enrich the identifiers in our triple store with additional information, e.g. human-readable labels and semantics (e.g. *which countries are summarized as a continent*). This document describes how to update these two files.
2 |
3 | ### semantic_enrichment/labels.ttl
4 | Static labels for the ontology vocabulary terms we use. This file has to be updated manually. Use OLS or BioPortal to find more information about an ontology term in use.
5 |
6 | ### semantic_enrichment/countries.ttl
7 | File containing information about the countries in our database. Additional information about a country includes, for example, its label and GPS coordinates. We enrich the country identifiers via Wikidata. Please ensure that the .ttl file is valid, e.g. by using this online validator (http://ttl.summerofcode.be/).
8 |
9 | #### Update process
10 | - Which countries (= Wikidata identifiers) do we have to enrich?
11 | This SPARQL query (run against http://sparql.genenetwork.org/sparql/) retrieves all country identifiers from our database that do not have a label yet:
12 |
13 |
14 | ```sparql
15 | SELECT DISTINCT ?geoLocation WHERE
16 | {
17 | ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] .
18 | FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label}
19 | }
20 | ```
21 |
22 | [Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+)
23 |
24 | - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. The script creates a temporary .ttl file in this folder (a rough sketch of this lookup step is shown below).
25 | - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: improve the script output so that manual intervention is no longer needed; currently there are "double entries" for continents in the output).
26 |
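For reference, a rough Python sketch of the kind of lookup *country_enrichment.py* performs is shown below. It is not the script itself, and the emitted predicates (rdfs:label, wdt:P625) are illustrative assumptions rather than the exact vocabulary used in countries.ttl:

```python
# Hedged sketch: fetch an English label and coordinates for one Wikidata
# entity and print Turtle-like output. The predicates are illustrative;
# check countries.ttl for the vocabulary actually in use.
import requests

WDQS = "https://query.wikidata.org/sparql"

def enrich(entity_uri):
    qid = entity_uri.rsplit("/", 1)[-1]
    query = f"""
    SELECT ?label ?coord WHERE {{
      wd:{qid} rdfs:label ?label ;
               wdt:P625 ?coord .
      FILTER(LANG(?label) = "en")
    }} LIMIT 1
    """
    r = requests.get(WDQS, params={"query": query, "format": "json"},
                     headers={"User-Agent": "pubseq-enrichment-sketch"})
    r.raise_for_status()
    binding = r.json()["results"]["bindings"][0]
    print(f'<{entity_uri}> rdfs:label "{binding["label"]["value"]}"@en ;')
    print(f'    wdt:P625 "{binding["coord"]["value"]}" .')

# First identifier from input_location.csv
enrich("http://www.wikidata.org/entity/Q7960498")
```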
--------------------------------------------------------------------------------
/scripts/db_enrichment/update/README.org:
--------------------------------------------------------------------------------
1 | select distinct ?item ?country ?place ?official ?countryname ?loc where {
2 | ?item wdt:P17 ?country ;
3 | wdt:P1705 ?place ;
4 | wdt:P625 ?loc .
5 | ?country wdt:P1448 ?countryname .
6 | FILTER(LANG(?countryname) = "en")
7 | FILTER(LANG(?place) = "en")
8 | optional { ?item wdt:P1448 ?official } .
9 | SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
10 | }
11 |
12 |
13 | https://query.wikidata.org/#%23%20Find%20place%20and%20location%20coordinates%0Aselect%20distinct%20%3Fitem%20%3Fcountry%20%3Fplace%20%3Fofficial%20%3Fcountryname%20%3Floc%20where%20%7B%0A%20%20%20%20%3Fitem%20%20wdt%3AP17%20%3Fcountry%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20wdt%3AP1705%20%3Fplace%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20wdt%3AP625%20%3Floc%20.%0A%20%20%20%20%3Fcountry%20wdt%3AP1448%20%3Fcountryname%20.%0A%20%20%20%20FILTER%28LANG%28%3Fcountryname%29%20%3D%20%22en%22%29%0A%20%20%20%20FILTER%28LANG%28%3Fplace%29%20%3D%20%22en%22%29%0A%20%20%20%20optional%20%7B%20%3Fitem%20%20wdt%3AP1448%20%3Fofficial%20%7D%20.%0A%20%20%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20%7D%0A%7D%0A
14 |
15 | Fetches a TSV:
16 |
17 | | item | country | place | official | countryname | loc |
18 | |------+---------+-------+----------+-------------+-----|
19 | | http://www.wikidata.org/entity/Q1297 | http://www.wikidata.org/entity/Q30 | Chicago | City of Chicago | the United States of America | Point(-87.627777777 41.881944444) |
20 | | http://www.wikidata.org/entity/Q1297 | http://www.wikidata.org/entity/Q30 | Chicago | City of Chicago | United States | Point(-87.627777777 41.881944444) |
21 | | http://www.wikidata.org/entity/Q686 | http://www.wikidata.org/entity/Q686 | Republic of Vanuatu | Ripablik blong Vanuatu | Republic of Vanuatu | Point(168.016669444 -16.633330555) |
22 | | http://www.wikidata.org/entity/Q686 | http://www.wikidata.org/entity/Q686 | Republic of Vanuatu | Vanuatu | Republic of Vanuatu | Point(168.016669444 -16.633330555) |
23 | | http://www.wikidata.org/entity/Q686 | http://www.wikidata.org/entity/Q686 | Republic of Vanuatu | Republic of Vanuatu | Republic of Vanuatu | Point(168.016669444 -16.633330555) |
28 |
29 |
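A minimal Python sketch (not part of this repository; it assumes the requests library and an arbitrary output filename) that runs the same query against the public Wikidata query service and saves the returned TSV:

#+begin_src python
# Sketch: fetch the query above as TSV from the Wikidata query service.
import requests

query = """
select distinct ?item ?country ?place ?official ?countryname ?loc where {
    ?item  wdt:P17 ?country ;
           wdt:P1705 ?place ;
           wdt:P625 ?loc .
    ?country wdt:P1448 ?countryname .
    FILTER(LANG(?countryname) = "en")
    FILTER(LANG(?place) = "en")
    optional { ?item wdt:P1448 ?official } .
    SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

r = requests.get(
    "https://query.wikidata.org/sparql",
    params={"query": query},
    headers={"Accept": "text/tab-separated-values",
             "User-Agent": "pubseq-location-sketch"},
)
r.raise_for_status()
with open("places.tsv", "w") as f:
    f.write(r.text)
#+end_src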
--------------------------------------------------------------------------------
/scripts/delete_entries_on_arvados.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import arvados
3 | import arvados.collection, arvados.keep, arvados.util
4 |
5 | from datetime import datetime
6 |
7 | date_time_str = '2020-08-20'
8 | date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d')
9 |
10 | api = arvados.api()
11 | keepclient = arvados.keep.KeepClient(api_client=api)
12 |
13 | validated = arvados.util.list_all(api.collections().list, filters=[
14 | ["owner_uuid", "=", sys.argv[1]],
15 | # ["properties.status", "=", "validated"]
16 | ])
17 |
18 | # validated.sort(key=lambda v: v["portable_data_hash"])
19 |
20 | num_sample_deleted = 0
21 | for item in validated:
22 | sequence_label = item['properties']["sequence_label"]
23 |
24 | # The SRA samples start with SRR or ERR
25 | if not sequence_label.startswith('SRR') and not sequence_label.startswith('ERR'):
26 | created_at_obj = datetime.strptime(item["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ')
27 | # print(item, created_at_obj)
28 |
29 | if created_at_obj < date_time_obj:
30 | api.collections().delete(uuid=item['current_version_uuid']).execute()
31 | num_sample_deleted += 1
32 | print(sequence_label)
33 |
34 | print('num_sample_deleted: {}'.format(num_sample_deleted))
35 |
--------------------------------------------------------------------------------
/scripts/dict_ontology_standardization/ncbi_host_health_status.csv:
--------------------------------------------------------------------------------
1 | healthy,http://purl.obolibrary.org/obo/NCIT_C115935
2 | asymptomatic,http://purl.obolibrary.org/obo/NCIT_C3833
3 | symptomatic,http://purl.obolibrary.org/obo/NCIT_C25269
4 | admitted to hospital,http://purl.obolibrary.org/obo/GENEPIO_0002020
5 | hospitalized patient,http://purl.obolibrary.org/obo/GENEPIO_0002020
6 | discharged from hospital,http://purl.obolibrary.org/obo/GENEPIO_0001849
7 | dead,http://purl.obolibrary.org/obo/NCIT_C28554
8 | alive,http://purl.obolibrary.org/obo/NCIT_C37987
9 |
--------------------------------------------------------------------------------
/scripts/dict_ontology_standardization/ncbi_host_species.csv:
--------------------------------------------------------------------------------
1 | Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
2 | human,http://purl.obolibrary.org/obo/NCBITaxon_9606
3 | Human,http://purl.obolibrary.org/obo/NCBITaxon_9606
4 | sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
5 | homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
6 | howo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606
7 | Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666
8 | Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974
9 | Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685
10 | Felis catus; Domestic Shorthair,http://purl.obolibrary.org/obo/NCBITaxon_9685
11 | Panthera tigris jacksoni,http://purl.obolibrary.org/obo/NCBITaxon_419130
12 | Canis lupus familiaris,http://purl.obolibrary.org/obo/NCBITaxon_9615
13 | Neovison vison,http://purl.obolibrary.org/obo/NCBITaxon_452646
14 |
--------------------------------------------------------------------------------
/scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv:
--------------------------------------------------------------------------------
1 | Illumina HiSeq 1000,http://www.ebi.ac.uk/efo/EFO_0004204
2 | Illumina HiSeq 2000,http://www.ebi.ac.uk/efo/EFO_0004203
3 | Illumina HiSeq 2500,http://www.ebi.ac.uk/efo/EFO_0008565
4 | Illumina HiSeq 3000,http://www.ebi.ac.uk/efo/EFO_0008564
5 | Illumina HiSeq 4000,http://www.ebi.ac.uk/efo/EFO_0008563
6 | Illumina iSeq 100,http://www.ebi.ac.uk/efo/EFO_0008635
7 | Illumian NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
8 | Illumina NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
9 | NextSeq500,http://www.ebi.ac.uk/efo/EFO_0009173
10 | NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173
11 | Illumian NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566
12 | Illumina NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566
13 | NextSeq550,http://www.ebi.ac.uk/efo/EFO_0008566
14 | NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566
15 | Illumina MiniSeq,http://www.ebi.ac.uk/efo/EFO_0008636
16 | Illumina NovaSeq,http://www.ebi.ac.uk/efo/EFO_0008637
17 | Illumina NovaSeq 6000,http://www.ebi.ac.uk/efo/EFO_0008637
18 | Nanopore MinION,http://www.ebi.ac.uk/efo/EFO_0008632
19 | Oxford Nanopore MinION,http://www.ebi.ac.uk/efo/EFO_0008632
20 | ONT (Oxford Nanopore Technologies),http://purl.obolibrary.org/obo/NCIT_C146818
21 | Oxford Nanopore Technology,http://purl.obolibrary.org/obo/NCIT_C146818
22 | Oxford Nanopore technologies MinION,http://www.ebi.ac.uk/efo/EFO_0008632
23 | Oxford Nanopore Sequencing,http://purl.obolibrary.org/obo/NCIT_C146818
24 | MinION Oxford Nanopore,http://www.ebi.ac.uk/efo/EFO_0008632
25 | MinION,http://www.ebi.ac.uk/efo/EFO_0008632
26 | Nanopore,http://purl.obolibrary.org/obo/NCIT_C146818
27 | Illumina MiSeq,http://www.ebi.ac.uk/efo/EFO_0004205
28 | Illumina,http://purl.obolibrary.org/obo/OBI_0000759
29 | Oxford Nanopore technology,http://purl.obolibrary.org/obo/NCIT_C146818
30 | Oxford Nanopore Technologies,http://purl.obolibrary.org/obo/NCIT_C146818
31 | Oxford Nanopore,http://purl.obolibrary.org/obo/NCIT_C146818
32 | IonTorrent,http://purl.obolibrary.org/obo/NCIT_C125894
33 | Ion Torrent X5Plus,http://purl.obolibrary.org/obo/NCIT_C125894
34 | ThermoFisher S5Plus,http://purl.obolibrary.org/obo/NCIT_C125894
35 | Sanger dideoxy sequencing,http://purl.obolibrary.org/obo/NCIT_C19641
36 | MGISEQ 2000,http://virtual-bh/MGISEQ2000
37 | MGISEQ2000,http://virtual-bh/MGISEQ2000
38 | Illumina HiSeq X,http://www.ebi.ac.uk/efo/EFO_0008567
39 | ONT GridION X5,http://www.ebi.ac.uk/efo/EFO_0008633
40 | ONT PremethION,http://www.ebi.ac.uk/efo/EFO_0008634
41 | PromethION,http://www.ebi.ac.uk/efo/EFO_0008634
42 | PacBio RS II,http://www.ebi.ac.uk/efo/EFO_0008631
43 | PacBio Sequel System,http://www.ebi.ac.uk/efo/EFO_0008630
44 | Illumina Genome Analyzer,http://www.ebi.ac.uk/efo/EFO_0004200
45 | Illumina Genome Analyzer II,http://www.ebi.ac.uk/efo/EFO_0004201
46 | Illumina Genome Analyzer IIx,http://www.ebi.ac.uk/efo/EFO_0004202
47 | 454 GS 20 sequencer,http://www.ebi.ac.uk/efo/EFO_0004206
48 | 454 GS FLX Titanium sequencer,http://www.ebi.ac.uk/efo/EFO_0004433
49 | 454 GS FLX sequencer,http://www.ebi.ac.uk/efo/EFO_0004432
50 | 454 GS Junior sequencer,http://www.ebi.ac.uk/efo/EFO_0004434
51 | 454 GS sequencer,http://www.ebi.ac.uk/efo/EFO_0004431
52 | AB SOLiD 4 System,http://www.ebi.ac.uk/efo/EFO_0004438
53 | AB SOLiD 4hq System,http://www.ebi.ac.uk/efo/EFO_0004441
54 | AB SOLiD 5500,http://www.ebi.ac.uk/efo/EFO_0004440
55 | AB SOLiD 5500xl,http://www.ebi.ac.uk/efo/EFO_0004436
56 | AB SOLiD PI System,http://www.ebi.ac.uk/efo/EFO_0004437
57 | AB SOLiD System,http://www.ebi.ac.uk/efo/EFO_0004435
58 | AB SOLiD System 2.0,http://www.ebi.ac.uk/efo/EFO_0004442
59 | AB SOLiD System 3.0,http://www.ebi.ac.uk/efo/EFO_0004439
60 |
--------------------------------------------------------------------------------
/scripts/dict_ontology_standardization/ncbi_speciesman_source.csv:
--------------------------------------------------------------------------------
1 | nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
2 | Nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
3 | NPS,http://purl.obolibrary.org/obo/NCIT_C155831
4 | NasopharyngealSwab,http://purl.obolibrary.org/obo/NCIT_C155831
5 | Naso-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
6 | nasopharingeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
7 | Nasopharyngeal (NP) Swab,http://purl.obolibrary.org/obo/NCIT_C155831
8 | nasopharyngeal swabs,http://purl.obolibrary.org/obo/NCIT_C155831
9 | nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
10 | nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831
11 | Nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831
12 | respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831
13 | naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831
14 | nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831
15 | nasal swab specimen,http://purl.obolibrary.org/obo/NCIT_C155831
16 | nasal swal,http://purl.obolibrary.org/obo/NCIT_C155831
17 | pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831
18 | respiratory secretion,http://purl.obolibrary.org/obo/NCIT_C155831
19 | mid-nasal swab,http://purl.obolibrary.org/obo/NCIT_C155831
20 | Mid-nasal swab,http://purl.obolibrary.org/obo/NCIT_C155831
21 | nasopharyngeal (throat) washings,http://purl.obolibrary.org/obo/NCIT_C155831
22 | oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
23 | throat swab,http://purl.obolibrary.org/obo/NCIT_C155835
24 | oro-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
25 | Oropharyngal,http://purl.obolibrary.org/obo/NCIT_C155835
26 | oralpharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
27 | Oral-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
28 | oral-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835
29 | oro-pharngyl swab,http://purl.obolibrary.org/obo/NCIT_C155835
30 | Oro-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
31 | oro-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
32 | Oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
33 | oro pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835
34 | buccal swab,http://purl.obolibrary.org/obo/NCIT_C155835
35 | throat washing,http://purl.obolibrary.org/obo/NCIT_C155835
36 | Throat Swab,http://purl.obolibrary.org/obo/NCIT_C155835
37 | throat (oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835
38 | Throat (Oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835
39 | bronchoalveolar lavage fluid,http://purl.obolibrary.org/obo/NCIT_C13195
40 | swab,http://purl.obolibrary.org/obo/NCIT_C13195
41 | oral swab,http://purl.obolibrary.org/obo/NCIT_C13195
42 | bronchoalveolar lavage,http://purl.obolibrary.org/obo/NCIT_C13195
43 | sputum,http://purl.obolibrary.org/obo/NCIT_C13278
44 | aspirate,http://purl.obolibrary.org/obo/NCIT_C13347
45 | stool,http://purl.obolibrary.org/obo/NCIT_C13234
46 | serum,http://purl.obolibrary.org/obo/NCIT_C13325
47 | saliva,http://purl.obolibrary.org/obo/NCIT_C13275
48 | Deep throat saliva,http://purl.obolibrary.org/obo/NCIT_C13275
49 | nasal swab,http://purl.obolibrary.org/obo/NCIT_C132119
50 |
--------------------------------------------------------------------------------
/scripts/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian:10
2 |
3 | RUN apt-get update && \
4 | apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \
5 | python3 python3-pip python3-setuptools python3-dev python-pycurl \
6 | minimap2 python3-biopython libcurl4-openssl-dev build-essential \
7 | libssl-dev libmagic-dev python3-magic && \
8 | apt-get clean
9 |
10 | RUN pip3 install bh20-seq-uploader py-dateutil
11 |
--------------------------------------------------------------------------------
/scripts/esr_samples/Pathogen.cl.1.0.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/scripts/esr_samples/Pathogen.cl.1.0.xlsx
--------------------------------------------------------------------------------
/scripts/esr_samples/esr_samples.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from string import Template
4 | from dateutil.parser import parse
5 |
6 | import sys
7 |
8 | sys.path.append('../')
9 | from utils import check_and_get_ontology_dictionaries
10 |
11 | # Metadata in tabular format
12 | path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx'
13 |
14 | path_template_yaml = 'template.yaml'
15 | # Removed from the template (for now)
16 | # license:
17 | # license_type: "http://creativecommons.org/licenses/by/4.0/"
18 | # title: "SARS-CoV-2 New Zealand"
19 | # attribution_name: "ESR"
20 | # attribution_url: "https://www.esr.cri.nz/"
21 |
22 |
23 | # Read the dictionaries for the ontology
24 | dir_dict_ontology_standardization = '../dict_ontology_standardization/'
25 | field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization)
26 |
27 | dir_output = 'yaml'
28 | suffix = '.consensus'
29 |
30 | if not os.path.exists(dir_output):
31 | os.makedirs(dir_output)
32 |
33 | metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12)
34 |
35 | # Maybe not the best pandas-way to do this
36 | for index, row in metadata_df.iterrows():
37 | # print(row['*sample_name'])
38 |
39 | geo_loc_name = row['*geo_loc_name'].replace(': ', ':')
40 |
41 | if geo_loc_name not in field_to_term_to_uri_dict['ncbi_countries']:
42 | if geo_loc_name in [
43 | 'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern',
44 | 'New Zealand:Waikato',
45 | 'New Zealand:Lakes', 'New Zealand:Nelson Marlborough', 'New Zealand:South Canterbury',
46 | 'New Zealand:MidCentral',
47 | 'New Zealand:Tairawhiti', 'New Zealand:Hawkes Bay', 'New Zealand:NA', 'New Zealand:Taranaki'
48 | ]:
49 | geo_loc_name = 'New Zealand'
50 | else:
51 | print(geo_loc_name)
52 | break
53 |
54 | country = field_to_term_to_uri_dict['ncbi_countries'][geo_loc_name]
55 |
56 | d = {
57 | 'host_species': field_to_term_to_uri_dict['ncbi_host_species'][row['*host']],
58 | 'sample_id': row['*sample_name'],
59 | 'collection_date': parse(row['*collection_date']).strftime('%Y-%m-%d'),
60 | 'collection_location': country,
61 | 'specimen_source': field_to_term_to_uri_dict['ncbi_speciesman_source'][row['*isolation_source']],
62 | 'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049',
63 |
64 | 'submitter_sample_id': row['bioproject_accession'],
65 | }
66 |
67 | with open(path_template_yaml) as f:
68 | src = Template(f.read())
69 |
70 | with open(os.path.join(dir_output, '{}{}.yaml'.format(row['*sample_name'], suffix)), 'w') as fw:
71 | fw.write(src.substitute(d))
72 |
73 | print('{} YAML files created.'.format(len([x for x in os.listdir(dir_output) if x.endswith('.yaml')])))
74 |
--------------------------------------------------------------------------------
/scripts/esr_samples/jetson/21JETSONTEST001.consensus.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 | host:
4 | host_species: "http://purl.obolibrary.org/obo/NCBITaxon_9606"
5 |
6 | sample:
7 | sample_id: "JetsonXavNX_SARSCOV_TESTRUN001"
8 | collection_date: "2020-12-05"
9 | collection_location: "http://www.wikidata.org/entity/Q37100"
10 | specimen_source: ["http://purl.obolibrary.org/obo/NCIT_C155831"]
11 |
12 | virus:
13 | virus_species: "http://purl.obolibrary.org/obo/NCBITaxon_2697049"
14 |
15 | technology:
16 | sample_sequencing_technology: ["http://www.ebi.ac.uk/efo/EFO_0008632"]
17 | alignment_protocol: "https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics"
18 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
19 | additional_technology_information: "Modified Artic V3 workflow for Nvidia Jetson Xavier NX/AGX"
20 |
21 | submitter:
22 | authors: ["Miles Benton", "Matthew Storey", "Joep de Ligt"]
23 | submitter_name: ["Miles Benton"]
24 | submitter_address: "PO Box 50348, Porirua 5240, New Zealand"
25 | originating_lab: "ESR"
26 | submitter_sample_id: "PRJNA648792"
27 | submitter_orcid: ["https://orcid.org/0000-0003-3442-965X"]
28 | additional_submitter_information: "2021-01-20"
29 |
30 |
--------------------------------------------------------------------------------
/scripts/esr_samples/template.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 | host:
4 | host_species: "$host_species"
5 |
6 | sample:
7 | sample_id: "$sample_id"
8 | collection_date: "$collection_date"
9 | collection_location: "$collection_location"
10 | specimen_source: ["$specimen_source"]
11 |
12 | virus:
13 | virus_species: "$virus_species"
14 |
15 | technology:
16 | sample_sequencing_technology: ["http://www.ebi.ac.uk/efo/EFO_0008632"]
17 | alignment_protocol: "https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics"
18 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
19 | additional_technology_information: "Artic V3 workflow"
20 |
21 | submitter:
22 | authors: ["Jemma L Geoghegan", "Xiaoyun Ren", "Matthew Storey", "James Hadfield", "Lauren Jelley", "Sarah Jefferies", "Jill Sherwood", "Shevaun Paine", "Sue Huang", "Jordan Douglas", "Fabio K Mendes", "Andrew Sporle", "Michael G Baker", "David R Murdoch", "Nigel French", "Colin R Simpson", "David Welch", "Alexei J Drummond", "Edward C Holmes", "Sebastian Duchene", "Joep de Ligt"]
23 | submitter_name: ["Joep de Ligt"]
24 | submitter_address: "PO Box 50348, Porirua 5240, New Zealand"
25 | originating_lab: "ESR"
26 | submitter_sample_id: "$submitter_sample_id"
27 | publication: "https://doi.org/10.1101/2020.08.05.20168930"
28 | submitter_orcid: ["https://orcid.org/0000-0003-0970-0153"]
29 | additional_submitter_information: "2020-08-20"
30 |
--------------------------------------------------------------------------------
/scripts/fasta2vcf/fasta2vcf.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | path_reference=$1
4 | path_query=$2
5 | output_prefix=$3
6 | path_annotation=$4
7 |
8 | echo "Contatenating reference and query in the same FASTA file"
9 | cat $path_reference $path_query > ref+qry.fasta
10 |
11 | echo "Aligning reference and query with MAFFT"
12 | mafft ref+qry.fasta > ref+qry.alignment
13 |
14 | python3 alignment2vcf.py $path_reference ref+qry.alignment $output_prefix
15 |
16 | python3 simpleVcfAnnotation.py $output_prefix.vcf $path_annotation
17 |
18 | bcftools norm -f $path_reference $output_prefix.vcf -Ou | bcftools annotate --set-id '%CHROM\_%POS\_%REF\_%FIRST_ALT' -Ov -o - | bgzip -c > $output_prefix.vcf.gz
19 | #tabix -p vcf $output_prefix.vcf.gz
20 |
21 | #java -jar /home/tools/snpEff/5.0e/snpEff.jar NC_045512.2 $output_prefix.vcf | bgzip -c > $output_prefix.annotated.vcf.gz && tabix -p vcf $output_prefix.annotated.vcf.gz
22 |
23 | echo "Removing temporary files"
24 | #rm snpEff_genes.txt snpEff_summary.html
25 | rm ref+qry.fasta ref+qry.alignment $output_prefix.vcf
--------------------------------------------------------------------------------
/scripts/fasta2vcf/resources/NC_045512.2.fasta.fai:
--------------------------------------------------------------------------------
1 | NC_045512.2 29903 97 70 71
2 |
--------------------------------------------------------------------------------
/scripts/fasta2vcf/resources/README.md:
--------------------------------------------------------------------------------
1 | `NC_045512.2.fasta` and `MN908947.3.fasta` are the same sequence.
--------------------------------------------------------------------------------
/scripts/fasta2vcf/simpleVcfAnnotation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 |
4 | path_vcf = sys.argv[1]
5 | path_annotation = sys.argv[2]
6 |
7 | variant_to_phenotypes_dict = {}
8 |
9 | with open(path_annotation) as f:
10 | f.readline() # Skip header
11 |
12 | for line in f:
13 | if line.startswith('MN908947'):
14 | variant = line.split(',')[0]
15 | pos, ref, alt = variant.split(':')[1:]
16 |
17 | # Ugly, Pjotr will not like it
18 | f.readline()
19 | f.readline()
20 | phenotype = f.readline().split(',')[-3].strip('"')
21 |
22 | if (pos, ref, alt) not in variant_to_phenotypes_dict:
23 | variant_to_phenotypes_dict[(pos, ref, alt)] = []
24 | variant_to_phenotypes_dict[(pos, ref, alt)].append(phenotype)
25 |
26 | new_row_in_header = '##INFO=\n'
27 |
28 | with open(path_vcf) as fin, open(f"{path_vcf}.tmp", "w") as fout:
29 | for line in fin:
30 | if line:
31 | if line.startswith('#CHROM'):
32 | if new_row_in_header:
33 | fout.write(new_row_in_header)
34 | new_row_in_header = ''
35 |
36 | if not line.startswith("#"):
37 | split_line = line.strip().split("\t")
38 | pos = split_line[1]
39 | ref, alt = split_line[3:5]
40 |
41 | if (pos, ref, alt) in variant_to_phenotypes_dict:
42 | split_line[7] = "ANN={}".format(
43 | ",".join(variant_to_phenotypes_dict[(pos, ref, alt)])
44 | )
45 | line = "\t".join(split_line) + "\n"
46 |
47 | fout.write(line)
48 | os.remove(path_vcf)
49 | os.rename(f"{path_vcf}.tmp", path_vcf)
50 |
--------------------------------------------------------------------------------
/scripts/fetch_from_genbank.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | importScript:
5 | type: File
6 | default:
7 | class: File
8 | location: download_genbank_data/from_genbank_to_fasta_and_yaml.py
9 | inputBinding: {position: 1}
10 | dict:
11 | type: Directory
12 | inputBinding:
13 | prefix: --dict-ontology
14 | position: 2
15 | default:
16 | class: Directory
17 | location: dict_ontology_standardization
18 | existing_metadata_from_nuccore:
19 | type: Directory?
20 | inputBinding:
21 | valueFrom: "--skip-request"
22 | position: 3
23 | outputs:
24 | fasta_and_yaml:
25 | type: Directory
26 | outputBinding:
27 | glob: fasta_and_yaml
28 | metadata_from_nuccore:
29 | type: Directory
30 | outputBinding:
31 | glob: metadata_from_nuccore
32 | accessions:
33 | type: File?
34 | outputBinding:
35 | glob: "*.acc"
36 | missing_terms:
37 | type: File
38 | outputBinding:
39 | glob: missing_terms.tsv
40 | requirements:
41 | InitialWorkDirRequirement:
42 | listing:
43 | - entry: $(inputs.existing_metadata_from_nuccore)
44 | entryname: metadata_from_nuccore
45 | DockerRequirement:
46 | dockerPull: bh20-seq-uploader/import
47 | NetworkAccess:
48 | networkAccess: true
49 | baseCommand: python3
50 |
--------------------------------------------------------------------------------
/scripts/foreach.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | rm -rf validated fasta_and_yaml_*
3 | mkdir -p validated
4 | ./from_genbank_to_fasta_and_yaml.py
5 | fasta_files=$(find fasta_and_yaml/ -name "*.fasta")
6 | for f in $fasta_files ; do
7 | yaml=$(echo $f | rev | cut -c7- | rev).yaml
8 | echo $f
9 | echo $yaml
10 | if bh20-seq-uploader --validate $f $yaml ; then
11 | sz=$(stat --format=%s $f)
12 | if test $sz -gt 20000 ; then
13 | mv $f $yaml validated
14 | else
15 | echo "Fasta file too small"
16 | fi
17 | fi
18 | done
19 |
--------------------------------------------------------------------------------
/scripts/gen_docs/org2html.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # This script converts .org files to .html so these generated
4 | # files do not have to live in the git repo.
5 |
6 | echo "Convert $1 from .org to .html"
7 |
8 | guix environment --ad-hoc emacs-minimal emacs-org emacs-htmlize -- emacs -batch -visit $1 -eval "(progn (require 'org) (let ((org-export-htmlize-output-type 'css)) (org-html-export-to-html nil nil nil t nil)))"
9 |
--------------------------------------------------------------------------------
/scripts/import.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | baseCommand: python3
4 | inputs:
5 | scripts:
6 | type: File
7 | default:
8 | class: File
9 | location: import_to_arvados.py
10 | inputBinding: {position: 1}
11 | importScript:
12 | type: File
13 | default:
14 | class: File
15 | location: download_genbank_data/from_genbank_to_fasta_and_yaml.py
16 | inputBinding: {position: 2}
17 | dict:
18 | type: Directory
19 | default:
20 | class: Directory
21 | location: dict_ontology_standardization
22 | inputBinding: {position: 3}
23 | outputs: []
24 | requirements:
25 | DockerRequirement:
26 | dockerPull: bh20-seq-uploader/import
27 | NetworkAccess:
28 | networkAccess: true
29 | WorkReuse:
30 | enableReuse: false
31 |
--------------------------------------------------------------------------------
/scripts/import_from_genbank.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: Workflow
3 | inputs:
4 | existing_metadata_from_nuccore:
5 | type: Directory?
6 | outputs: []
7 | requirements:
8 | ScatterFeatureRequirement: {}
9 | steps:
10 | fetch_from_genbank:
11 | in:
12 | existing_metadata_from_nuccore: existing_metadata_from_nuccore
13 | out: [fasta_and_yaml, metadata_from_nuccore, accessions]
14 | run: fetch_from_genbank.cwl
15 | split_into_arrays:
16 | in:
17 | dir: fetch_from_genbank/fasta_and_yaml
18 | out: [fasta, metadata]
19 | run: split_into_arrays.cwl
20 | upload:
21 | in:
22 | fasta: split_into_arrays/fasta
23 | metadata: split_into_arrays/metadata
24 | out: []
25 | scatter: [fasta, metadata]
26 | scatterMethod: dotproduct
27 | run: upload.cwl
28 |
--------------------------------------------------------------------------------
/scripts/import_to_arvados.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import glob
4 | import sys
5 |
6 | os.chdir(os.environ["TMPDIR"])
7 | os.symlink(sys.argv[2], "dict_ontology_standardization")
8 | subprocess.run(sys.argv[1])
9 |
10 | os.chdir("fasta_and_yaml")
11 | fasta_files = glob.glob("*.fasta")
12 |
13 | for f in fasta_files:
14 | subprocess.run(["bh20-seq-uploader", "%s.yaml" %f[:-6], f])
15 |
--------------------------------------------------------------------------------
/scripts/split_into_arrays.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: ExpressionTool
3 | requirements:
4 | InlineJavascriptRequirement: {}
5 | inputs:
6 | dir:
7 | type: Directory
8 | loadListing: shallow_listing
9 | outputs:
10 | fasta: File[]
11 | metadata: File[]
12 | expression: |
13 | ${
14 | var dir = inputs.dir;
15 | var fasta = [];
16 | var metadata = [];
17 | dir.listing.sort(function(a, b) { return a.basename < b.basename ? -1 : (a.basename > b.basename ? 1 : 0); });
18 | for (var i = 0; i < dir.listing.length; i++) {
19 | if (dir.listing[i].basename.substr(-6) == ".fasta") {
20 | fasta.push(dir.listing[i]);
21 | }
22 | if (dir.listing[i].basename.substr(-5) == ".yaml") {
23 | metadata.push(dir.listing[i]);
24 | }
25 | }
26 | if (fasta.length != metadata.length) {
27 | throw "Number of FASTA files does not match number of metadata files";
28 | }
29 | return {"fasta": fasta, "metadata": metadata};
30 | }
31 |
--------------------------------------------------------------------------------
/scripts/submit_ebi/example/project-submission.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/scripts/submit_ebi/example/project.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Testing PubSeq Sample uploads
5 | This is a test to allow for uploading sequences from PubSeq
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/scripts/submit_ebi/example/sample-submission.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/scripts/submit_ebi/example/sample.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | human gastric microbiota, mucosal
5 |
6 | 1284369
7 | stomach metagenome
8 |
9 |
10 |
11 |
12 | investigation type
13 | mimarks-survey
14 |
15 |
16 | sequencing method
17 | pyrosequencing
18 |
19 |
20 | collection date
21 | 2010
22 |
23 |
24 | host body site
25 | Mucosa of stomach
26 |
27 |
28 | human-associated environmental package
29 | human-associated
30 |
31 |
32 | geographic location (latitude)
33 | 1.81
34 | DD
35 |
36 |
37 | geographic location (longitude)
38 | -78.76
39 | DD
40 |
41 |
42 | geographic location (country and/or sea)
43 | Colombia
44 |
45 |
46 | geographic location (region and locality)
47 | Tumaco
48 |
49 |
50 | environment (biome)
51 | coast
52 |
53 |
54 | environment (feature)
55 | human-associated habitat
56 |
57 |
58 | environment (material)
59 | gastric biopsy
60 |
61 |
62 | ENA-CHECKLIST
63 | ERC000011
64 |
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/scripts/upload.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | fasta: File
5 | metadata: File
6 | outputs: []
7 | requirements:
8 | DockerRequirement:
9 | dockerPull: bh20-seq-uploader/import
10 | NetworkAccess:
11 | networkAccess: true
12 | baseCommand: bh20-seq-uploader
13 | arguments: [--skip-qc, $(inputs.metadata), $(inputs.fasta)]
14 |
--------------------------------------------------------------------------------
/scripts/uthsc_samples/.gitignore:
--------------------------------------------------------------------------------
1 | yaml
2 |
--------------------------------------------------------------------------------
/scripts/uthsc_samples/template.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 | license:
4 | license_type: https://creativecommons.org/licenses/by/4.0/
5 | title: "$strain"
6 | attribution_name: "Mariah Taylor, Colleen B. Jonsson"
7 | attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
8 |
9 | host:
10 | host_id: "$sample_id"
11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
12 |
13 | sample:
14 | sample_id: "$sample_id"
15 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
16 | collection_date: "$collection_date"
17 | collection_location: $location
18 |
19 | virus:
20 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
21 | virus_strain: "$strain"
22 |
23 | technology:
24 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
25 | alignment_protocol: https://bio.tools/BWA#!
26 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
27 | additional_technology_information: "Oxford Nanopore MiniIon RNA long reads"
28 |
29 | submitter:
30 | authors: [Mariah Taylor, Colleen B. Jonsson]
31 | submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins]
32 | submitter_address: UTHSC, Memphis, Tennessee 38163, USA
33 | originating_lab: Regional Biocontainment Laboratory, Memphis, TN
34 | submitter_sample_id: $sample_id
35 | submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162]
36 |
--------------------------------------------------------------------------------
/scripts/uthsc_samples/uthsc_samples.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | from string import Template
4 | from dateutil.parser import parse
5 | import re
6 |
7 | import sys
8 |
9 | # Metadata in tabular format in a spreadsheet(?!)
10 | xlsx = '../../test/data/10_samples.xlsx'
11 |
12 | # Template in a text file
13 | template_yaml = 'template.yaml'
14 |
15 | dir_output = 'yaml'
16 |
17 | if not os.path.exists(dir_output):
18 | os.makedirs(dir_output)
19 |
20 | table = pd.read_excel(xlsx)
21 |
22 | print(table)
23 |
24 | for index, row in table.iterrows():
25 | sample = row['Sample ID']
26 | print(f"Processing sample {sample}...")
27 |
28 | with open(template_yaml) as f:
29 | text = Template(f.read())
30 | with open(os.path.join(dir_output,f"{sample}.yaml"), 'w') as fw:
31 | sample_id = sample
32 | sample_name = sample
33 | collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d')
34 | locationx = row['City']+", "+row['State']+", USA"
35 | location = "http://www.wikidata.org/entity/Q16563" # Memphis by default
36 | map = {
37 | "Pegram": "http://www.wikidata.org/entity/Q3289517",
38 | "Alexander": "http://www.wikidata.org/entity/Q79663",
39 | "Smithville": "http://www.wikidata.org/entity/Q2145339",
40 | "Nashville": "http://www.wikidata.org/entity/Q23197",
41 | "Madison": "http://www.wikidata.org/entity/Q494755"
42 | }
43 |
44 | for name in map:
45 | p = re.compile(name)
46 | if p.match(locationx):
47 | location = map[name]
48 | break
49 |
50 | strain = f"SARS-CoV-2/human/USA/{sample}/2020"
51 | fw.write(text.substitute(sample_id=sample_id,
52 | sample_name=sample_name,
53 | collection_date=collection_date,
54 | location=location,
55 | locationx=locationx,
56 | strain=strain
57 | ))
58 |
59 | print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa")
60 |
--------------------------------------------------------------------------------
/scripts/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | def is_integer(string_to_check):
4 | try:
5 | int(string_to_check)
6 | return True
7 | except ValueError:
8 | return False
9 |
10 | def chunks(lst, n):
11 | for i in range(0, len(lst), n):
12 | yield lst[i:i + n]
13 |
14 | def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
15 | # Check for duplicated entries across all dictionaries
16 | field_to_term_to_uri_dict = {}
17 |
18 | path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
19 | os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')]
20 |
21 | for path_dict_xxx_csv in path_dict_xxx_csv_list:
22 | print('Read {}'.format(path_dict_xxx_csv))
23 |
24 | with open(path_dict_xxx_csv) as f:
25 | for line in f:
26 | if len(line.split(',')) > 2:
27 | term, uri = line.strip('\n').split('",')
28 | else:
29 | term, uri = line.strip('\n').split(',')
30 |
31 | term = term.strip('"')
32 |
33 | if term in field_to_term_to_uri_dict:
34 | print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term))
35 | continue
36 |
37 | field_to_term_to_uri_dict[term] = uri
38 |
39 | # Prepare separate per-field dictionaries (to avoid, for example, a valid IRI for a species being accepted as a specimen)
40 | field_to_term_to_uri_dict = {}
41 |
42 | for path_dict_xxx_csv in path_dict_xxx_csv_list:
43 | field = os.path.basename(path_dict_xxx_csv).split('.')[0]
44 |
45 | field_to_term_to_uri_dict[field] = {}
46 |
47 | with open(path_dict_xxx_csv) as f:
48 | for line in f:
49 | if len(line.split(',')) > 2:
50 | term, uri = line.strip('\n').split('",')
51 | else:
52 | term, uri = line.strip('\n').split(',')
53 |
54 | term = term.strip('"')
55 |
56 | if term in field_to_term_to_uri_dict[field]:
57 | print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term))
58 | continue
59 |
60 | field_to_term_to_uri_dict[field][term] = uri
61 |
62 | return field_to_term_to_uri_dict
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import sys
4 |
5 | import setuptools.command.egg_info as egg_info_cmd
6 | from setuptools import setup
7 |
8 | SETUP_DIR = os.path.dirname(__file__)
9 | README = os.path.join(SETUP_DIR, "README.md")
10 |
11 | try:
12 | import gittaggers
13 |
14 | tagger = gittaggers.EggInfoFromGit
15 | except ImportError:
16 | tagger = egg_info_cmd.egg_info
17 |
18 | install_requires = ["arvados-python-client", "schema-salad",
19 | "python-magic", "pyshex", "pyshexc==0.7.0", "py-dateutil"]
20 | web_requires = ["flask", "pyyaml", "redis"]
21 |
22 | needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv)
23 | pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else []
24 |
25 | setup(
26 | name="bh20-seq-uploader",
27 | version="1.0",
28 | description="Biohackathon sequence uploader",
29 | long_description=open(README).read(),
30 | long_description_content_type="text/markdown",
31 | author="Peter Amstutz",
32 | author_email="peter.amstutz@curii.com",
33 | license="Apache 2.0",
34 | packages=["bh20sequploader", "bh20seqanalyzer", "bh20simplewebuploader"],
35 | package_data={"bh20sequploader": ["bh20seq-schema.yml",
36 | "bh20seq-options.yml",
37 | "bh20seq-shex.rdf",
38 | "validation/formats",
39 | "SARS-CoV-2-reference.fasta",],
40 | },
41 | install_requires=install_requires,
42 | extras_require={
43 | 'web': web_requires
44 | },
45 | setup_requires=[] + pytest_runner,
46 | tests_require=["pytest<5"],
47 | entry_points={
48 | "console_scripts": [
49 | "bh20-seq-uploader=bh20sequploader.main:main",
50 | "bh20-seq-analyzer=bh20seqanalyzer.main:main"
51 | ]
52 | },
53 | zip_safe=True,
54 | cmdclass={"egg_info": tagger},
55 | python_requires=">=3.5, <4",
56 | )
57 |
--------------------------------------------------------------------------------
/test/data/10_samples.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/test/data/10_samples.xlsx
--------------------------------------------------------------------------------
/test/data/input/TN_UT2.yaml:
--------------------------------------------------------------------------------
1 | id: placeholder
2 |
3 | license:
4 | license_type: https://creativecommons.org/licenses/by/4.0/
5 | title: "TN_UT2 - Pegram, Tennessee, USA"
6 | attribution_name: "Mariah Taylor, Colleen Jonsson"
7 | attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php
8 |
9 | host:
10 | host_id: "TN_UT2"
11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606
12 |
13 | sample:
14 | sample_id: "TN_UT2"
15 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831]
16 | collection_date: "2020-04-26"
17 | collection_location: http://www.wikidata.org/entity/Q3289517
18 |
19 | virus:
20 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049
21 | virus_strain: "SARS-CoV-2/human/USA/TN_UT2/2020"
22 |
23 | technology:
24 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632]
25 | alignment_protocol: https://bio.tools/BWA#!
26 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628"
27 | additional_technology_information: Oxford Nanopore MinION RNA long reads
28 |
29 | submitter:
30 | authors: [Mariah Taylor, Colleen Jonsson]
31 | submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins]
32 | submitter_address: UTHSC, Memphis, Tennessee 38163, USA
33 | originating_lab: Regional Biocontainment Laboratory, Memphis, TN
34 | submitter_sample_id: TN_UT2
35 | submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162]
36 |
--------------------------------------------------------------------------------
/test/runner.py:
--------------------------------------------------------------------------------
1 | # Run tests. python3 test/runner.py
2 |
3 | import unittest
4 |
5 | # initialize the test suite
6 | loader = unittest.TestLoader()
7 | suite = unittest.TestSuite()
8 |
9 | import test_shex
10 | import test_sparql
11 |
12 | suite.addTests(loader.loadTestsFromModule(test_shex))
13 | suite.addTests(loader.loadTestsFromModule(test_sparql))
14 |
15 | # initialize a runner, pass it your suite and run it
16 | runner = unittest.TextTestRunner(verbosity=3)
17 | result = runner.run(suite)
18 |
--------------------------------------------------------------------------------
/test/test_shex.py:
--------------------------------------------------------------------------------
1 | # Run with python3 test/test_shex.py
2 |
3 | import schema_salad.schema
4 | import schema_salad.ref_resolver
5 | import schema_salad.jsonld_context
6 | from pyshex.evaluate import evaluate
7 | import unittest
8 |
9 | class TestShexMethods(unittest.TestCase):
10 |
11 | def test_schema(self):
12 | with open("bh20sequploader/bh20seq-schema.yml") as schema_resource:
13 | metadata_schema = schema_salad.schema.load_schema("bh20sequploader/bh20seq-schema.yml")
14 | (document_loader,
15 | avsc_names,
16 | schema_metadata,
17 | metaschema_loader) = metadata_schema
18 | # print(metadata_schema)
19 | self.assertTrue(isinstance(avsc_names, schema_salad.avro.schema.Names))
20 | metadatafile = "test/data/input/TN_UT2.yaml"
21 | doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True)
22 | print(doc)
23 | g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx)
24 | with open("bh20sequploader/bh20seq-shex.rdf") as f:
25 | shex = f.read()
26 | # Note the https link simply acts as a URI descriptor (it does not fetch)
27 | rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape")
28 |
29 | with open("test/data/regression/TN_UT2.rdf","w") as f:
30 | f.write(g.serialize(format="ntriples").decode("utf-8"))
31 |
32 | if not rslt:
33 | raise Exception(reason)
34 |
35 | if __name__ == '__main__':
36 | unittest.main()
37 |
--------------------------------------------------------------------------------
/test/test_sparql.py:
--------------------------------------------------------------------------------
1 | # Run with python3 test/test_sparql.py
2 |
3 | import unittest
4 | import requests
5 | import logging
6 |
7 | class TestSPARQL(unittest.TestCase):
8 |
9 | def test_sparql(self):
10 | # sparqlURL='http://sparql.genenetwork.org/sparql/'
11 | sparqlURL='http://127.0.0.1:8890//sparql/'
12 | id = "http://collections.lugli.arvadosapi.com/c=0002e93b86ad77824620bf938b97e134+126/sequence.fasta"
13 | id = "MT800005.1"
14 | query=f"""
15 | PREFIX pubseq:
16 | PREFIX sio:
17 | select distinct ?sample ?geoname ?date ?source ?geo ?sampletype ?institute ?sequenceuri
18 | {{
19 | ?sample sio:SIO_000115 "{id}" .
20 | ?sequenceuri pubseq:sample ?sample .
21 | ?sample ?geo .
22 | ?geo rdfs:label ?geoname .
23 | ?sample ?date .
24 | OPTIONAL {{ ?sample ?source }}
25 | OPTIONAL {{ ?sample ?sampletype }}
26 | OPTIONAL {{ ?sample ?institute }}
27 | }}
28 | """
29 | print(query)
30 | payload = {'query': query, 'format': 'json'}
31 | r = requests.get(sparqlURL, params=payload)
32 | result = r.json()['results']['bindings']
33 | # for now we just take the first one
34 | print(result)
35 | self.assertEqual(result[0]['geoname']['value'],'Mahuva')
36 |
37 | if __name__ == '__main__':
38 | unittest.main()
39 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bam2fasta.cwl:
--------------------------------------------------------------------------------
1 | # Reference:
2 | # https://github.com/VGP/vgp-assembly/blob/33cd6236a68a1aee5f282e365dfe6b97e0b4ebb7/pipeline/freebayes-polish/freebayes.sh
3 | # https://github.com/VGP/vgp-assembly/blob/33cd6236a68a1aee5f282e365dfe6b97e0b4ebb7/pipeline/freebayes-polish/consensus.sh
4 | class: Workflow
5 | cwlVersion: v1.1
6 | id: bam2fasta
7 | label: bam2fasta
8 | requirements: []
9 |
10 | inputs:
11 | bam:
12 | type: File
13 | fasta:
14 | type: File
15 | threads:
16 | type: int
17 | default: 4
18 | sample_id: string
19 |
20 | outputs:
21 | out_fasta:
22 | type: File
23 | outputSource: bcftools_consensus/out_fasta
24 |
25 | steps:
26 | freebayes:
27 | in:
28 | bam: bam
29 | ref_fasta: fasta
30 | out: [vcf]
31 | run: freebayes.cwl
32 | bcftools_view_exclude_ref:
33 | in:
34 | vcf: freebayes/vcf
35 | threads: threads
36 | out: [bcf]
37 | run: bcftools-view-exclude-ref.cwl
38 | bcftools_norm:
39 | in:
40 | ref_fasta: fasta
41 | bcf: bcftools_view_exclude_ref/bcf
42 | threads: threads
43 | out: [normalized_bcf]
44 | run: bcftools-norm.cwl
45 | bcftools_index_after_normalization:
46 | in:
47 | bcf: bcftools_norm/normalized_bcf
48 | out: [indexed]
49 | run: bcftools-index.cwl
50 | bcftools_view_qc:
51 | in:
52 | bcf: bcftools_index_after_normalization/indexed
53 | threads: threads
54 | out: [vcf]
55 | run: bcftools-view-qc.cwl
56 | bcftools_index_after_qc:
57 | in:
58 | bcf: bcftools_view_qc/vcf
59 | out: [indexed]
60 | run: bcftools-index.cwl
61 | bcftools_consensus:
62 | in:
63 | ref_fasta: fasta
64 | vcf: bcftools_index_after_qc/indexed
65 | sample_id: sample_id
66 | out: [out_fasta]
67 | run: bcftools-consensus.cwl
68 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bcftools-concat.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
7 | baseCommand: bcftools
8 | arguments:
9 | - concat
10 | - -Ou
11 | - -o
12 | - $(inputs.output_name)
13 | - $(inputs.bcf_files)
14 | inputs:
15 | - id: output_name
16 | type: string
17 | default: "merged.bcf"
18 | - id: bcf_files
19 | type: File[]
20 | outputs:
21 | - id: merged_bcf
22 | type: File
23 | outputBinding:
24 | glob: "$(inputs.output_name)"
25 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bcftools-consensus.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
7 | ShellCommandRequirement: {}
8 | baseCommand: bcftools
9 | arguments:
10 | - consensus
11 | - -i
12 | - 'QUAL > 10 && GT="a"'
13 | - -Hla
14 | - -f
15 | - $(inputs.ref_fasta)
16 | - $(inputs.vcf)
17 | - {shellQuote: false, valueFrom: "|"}
18 | - sed
19 | - "s/^>.*/>$(inputs.sample_id)/g"
20 | inputs:
21 | - id: ref_fasta
22 | type: File
23 | - id: vcf
24 | type: File
25 | secondaryFiles: [.csi]
26 | - id: sample_id
27 | type: string
28 | outputs:
29 | - id: out_fasta
30 | type: stdout
31 | stdout: sequence.fasta
32 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bcftools-index.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
7 | InitialWorkDirRequirement:
8 | listing:
9 | - $(inputs.bcf)
10 | baseCommand: bcftools
11 | arguments:
12 | - index
13 | - $(inputs.bcf)
14 | inputs:
15 | - id: bcf
16 | type: File
17 | outputs:
18 | - id: indexed
19 | type: File
20 | outputBinding:
21 | glob: "$(inputs.bcf.basename)"
22 | secondaryFiles:
23 | - .csi
24 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bcftools-norm.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
7 | baseCommand: bcftools
8 | arguments:
9 | - norm
10 | - -Ob
11 | - -f
12 | - $(inputs.ref_fasta)
13 | - -o
14 | - $(inputs.output_name)
15 | - --threads
16 | - $(inputs.threads)
17 | - $(inputs.bcf)
18 | inputs:
19 | - id: ref_fasta
20 | type: File
21 | - id: output_name
22 | type: string
23 | default: "normalized.bcf"
24 | - id: threads
25 | type: int
26 | - id: bcf
27 | type: File
28 | outputs:
29 | - id: normalized_bcf
30 | type: File
31 | outputBinding:
32 | glob: "$(inputs.output_name)"
33 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bcftools-view-exclude-ref.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
7 | baseCommand: bcftools
8 | arguments:
9 | - view
10 | - --no-version
11 | - -Ou
12 | - -e'type=ref'
13 | - --threads=$(inputs.threads)
14 | - $(inputs.vcf)
15 | inputs:
16 | - id: vcf
17 | type: File
18 | - id: threads
19 | type: int
20 | outputs:
21 | - id: bcf
22 | type: stdout
23 | stdout: $(inputs.vcf.nameroot).without-ref.bcf
24 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bcftools-view-qc.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
7 | baseCommand: bcftools
8 | arguments:
9 | - view
10 | - -i
11 | - 'QUAL > 10 && GT="a"'
12 | - -Oz
13 | - --threads=$(inputs.threads)
14 | - $(inputs.bcf)
15 | inputs:
16 | - id: threads
17 | type: int
18 | - id: bcf
19 | type: File
20 | secondaryFiles: [.csi]
21 | outputs:
22 | - id: vcf
23 | type: stdout
24 | stdout: out.changes.vcf.gz
25 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bcftools-view.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0"
7 | baseCommand: bcftools
8 | arguments:
9 | - view
10 | - --no-version
11 | - -Ou
12 | - $(inputs.vcf)
13 | inputs:
14 | - id: vcf
15 | type: File
16 | outputs:
17 | - id: bcf
18 | type: stdout
19 | stdout: $(inputs.vcf.nameroot).bcf
20 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bwa-index.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.1
3 | class: CommandLineTool
4 | doc: "bwa index, build index files for a reference fasta"
5 | requirements:
6 | DockerRequirement:
7 | dockerPull: quay.io/biocontainers/bwa:0.7.17--h84994c4_5
8 | InitialWorkDirRequirement:
9 | listing:
10 | - $(inputs.input_fasta)
11 | baseCommand: [bwa, index]
12 | inputs:
13 | input_fasta:
14 | type: File
15 | label: "input fasta file"
16 | inputBinding:
17 | position: 1
18 | outputs:
19 | indexed_fasta:
20 | type: File
21 | outputBinding:
22 | glob: $(inputs.input_fasta.basename)
23 | secondaryFiles:
24 | - .amb
25 | - .ann
26 | - .bwt
27 | - .pac
28 | - .sa
29 | stdout: bwa-index-stdout.log
30 | stderr: bwa-index-stderr.log
31 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/bwa-mem.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.1
3 | class: CommandLineTool
4 | doc: "bwa mem, map reads to an indexed reference fasta"
5 | requirements:
6 | DockerRequirement:
7 | dockerPull: quay.io/biocontainers/bwa:0.7.17--h84994c4_5
8 |
9 | baseCommand: [bwa, mem]
10 |
11 | inputs:
12 | threads:
13 | type: int
14 | label: "number of threads"
15 | default: 4
16 | inputBinding:
17 | prefix: -t
18 | output_sam:
19 | type: string
20 | label: "sam file to output results to"
21 | default: "out.sam"
22 | inputBinding:
23 | prefix: -o
24 | group_header_line:
25 | type: string?
26 | label: "read group header line such as '@RG\tID:foo\tSM:bar'"
27 | inputBinding:
28 | prefix: -R
29 | index_base:
30 | type: File
31 | label: "fasta file for index basename"
32 | inputBinding:
33 | position: 1
34 | secondaryFiles:
35 | - .amb
36 | - .ann
37 | - .bwt
38 | - .pac
39 | - .sa
40 | fastq_forward:
41 | type: File
42 | label: "input fastq file to map (single-end or forward for pair-end)"
43 | inputBinding:
44 | position: 2
45 | fastq_reverse:
46 | type: File?
47 | label: "input fastq file to map (reverse for pair-end)"
48 | inputBinding:
49 | position: 3
50 |
51 | outputs:
52 | output:
53 | type: File
54 | outputBinding:
55 | glob: "$(inputs.output_sam)"
56 | stdout: bwa-mem-stdout.log
57 | stderr: bwa-mem-stderr.log
58 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/fastq2fasta-create-bwaindex.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: Workflow
3 | requirements:
4 | SubworkflowFeatureRequirement: {}
5 | hints:
6 | ResourceRequirement:
7 | ramMin: 3000
8 |
9 | inputs:
10 | ref_fasta:
11 | type: File
12 | fastq_forward:
13 | type: File
14 | fastq_reverse:
15 | type: File?
16 | threads:
17 | type: int
18 | default: 4
19 |
20 | outputs:
21 | out_fasta:
22 | type: File
23 | outputSource: fastq2fasta/out_fasta
24 |
25 | steps:
26 | bwa-index:
27 | in: {input_fasta: ref_fasta}
28 | out: [indexed_fasta]
29 | run: bwa-index.cwl
30 | samtools-faidx:
31 | in: {input_fasta: bwa-index/indexed_fasta}
32 | out: [indexed_fasta]
33 | run: samtools-faidx.cwl
34 | fastq2fasta:
35 | in:
36 | fastq_forward: fastq_forward
37 | fastq_reverse: fastq_reverse
38 | ref_fasta: samtools-faidx/indexed_fasta
39 | threads: threads
40 | out: [out_fasta]
41 | run: fastq2fasta.cwl
42 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/fastq2fasta.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: Workflow
3 | requirements:
4 | SubworkflowFeatureRequirement: {}
5 | hints:
6 | ResourceRequirement:
7 | ramMin: 3000
8 |
9 | inputs:
10 | fastq_forward: File
11 | fastq_reverse: File?
12 | ref_fasta:
13 | type: File
14 | secondaryFiles:
15 | - .amb
16 | - .ann
17 | - .bwt
18 | - .pac
19 | - .sa
20 | - .fai
21 | threads:
22 | type: int
23 | default: 4
24 | metadata: File?
25 | sample_id: string
26 |
27 | outputs:
28 | out_fasta:
29 | type: File
30 | outputSource: bam2fasta/out_fasta
31 | out_metadata:
32 | type: File?
33 | outputSource: metadata
34 |
35 | steps:
36 | bwa-mem:
37 | in:
38 | threads: threads
39 | fastq_forward: fastq_forward
40 | fastq_reverse: fastq_reverse
41 | index_base: ref_fasta
42 | out: [output]
43 | run: bwa-mem.cwl
44 | samtools-view:
45 | in:
46 | threads: threads
47 | input_file: bwa-mem/output
48 | out: [bam]
49 | run: samtools-view.cwl
50 | samtools-sort:
51 | in:
52 | input_bamfile: samtools-view/bam
53 | threads: threads
54 | out: [sorted_bam]
55 | run: samtools-sort.cwl
56 | bam2fasta:
57 | in:
58 | bam: samtools-sort/sorted_bam
59 | fasta: ref_fasta
60 | threads: threads
61 | sample_id: sample_id
62 | out: [out_fasta]
63 | run: bam2fasta.cwl
64 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/freebayes.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: "quay.io/biocontainers/freebayes:1.3.2--py37hc088bd4_0"
7 | baseCommand: freebayes
8 | arguments: [
9 | --bam, $(inputs.bam),
10 | # --region=$(inputs.contig):1-$(inputs.contig_end)
11 | --ploidy, "1",
12 | -f, $(inputs.ref_fasta)]
13 | inputs:
14 | - id: bam
15 | type: File
16 | # - id: contig
17 | # type: string
18 | # - id: contig_end
19 | # type: int
20 | - id: ref_fasta
21 | type: File
22 | outputs:
23 | - id: vcf
24 | type: stdout
25 | stdout: var.vcf
26 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/samtools-faidx.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.0
3 | class: CommandLineTool
4 | doc: "samtools sort, sort given bam file"
5 | requirements:
6 | DockerRequirement:
7 | dockerPull: quay.io/biocontainers/samtools:1.9--h8571acd_11
8 | InitialWorkDirRequirement:
9 | listing:
10 | - $(inputs.input_fasta)
11 | baseCommand: [samtools, faidx]
12 | inputs:
13 | input_fasta:
14 | type: File
15 | label: "Input fasta"
16 | inputBinding:
17 | position: 1
18 | secondaryFiles:
19 | - .amb
20 | - .ann
21 | - .bwt
22 | - .pac
23 | - .sa
24 | outputs:
25 | indexed_fasta:
26 | type: File
27 | outputBinding:
28 | glob: "$(inputs.input_fasta.basename)"
29 | secondaryFiles:
30 | - .amb
31 | - .ann
32 | - .bwt
33 | - .pac
34 | - .sa
35 | - .fai
36 | stdout: samtools-faidx-stdout.log
37 | stderr: samtools-faidx-stderr.log
38 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/samtools-sort.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.0
3 | class: CommandLineTool
4 | doc: "samtools sort, sort given bam file"
5 | requirements:
6 | DockerRequirement:
7 | dockerPull: quay.io/biocontainers/samtools:1.9--h8571acd_11
8 | baseCommand: [samtools, sort]
9 | inputs:
10 | threads:
11 | type: int
12 | default: 4
13 | inputBinding:
14 | prefix: -@
15 | tmpfile:
16 | type: string
17 | default: sort.tmp
18 | label: "Write temporary files to PREFIX.nnnn.bam"
19 | inputBinding:
20 | prefix: -T
21 | output_bam:
22 | type: string
23 | default: aln.sorted.bam
24 | label: "Write final output to FILENAME"
25 | inputBinding:
26 | prefix: -o
27 | input_bamfile:
28 | type: File
29 | label: "Input bamfile"
30 | inputBinding:
31 | position: 1
32 |
33 | outputs:
34 | sorted_bam:
35 | type: File
36 | outputBinding:
37 | glob: "$(inputs.output_bam)"
38 | stdout: samtools-sort-stdout.log
39 | stderr: samtools-sort-stderr.log
40 |
--------------------------------------------------------------------------------
/workflows/fastq2fasta/samtools-view.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.0
3 | class: CommandLineTool
4 | doc: "samtools view to convert sam format to bam format"
5 | requirements:
6 | DockerRequirement:
7 | dockerPull: quay.io/biocontainers/samtools:1.9--h8571acd_11
8 | baseCommand: [samtools, view]
9 | inputs:
10 | threads:
11 | type: int
12 | label: "Number of additional threads to use"
13 | default: 4
14 | inputBinding:
15 | prefix: -@
16 | output_bam:
17 | type: boolean
18 | label: "output BAM"
19 | default: true
20 | inputBinding:
21 | prefix: -b
22 | output_filename:
23 | type: string
24 | label: "output file name"
25 | default: "aln.bam"
26 | inputBinding:
27 | prefix: -o
28 | input_file:
29 | type: File
30 | label: "input file"
31 | inputBinding:
32 | position: 1
33 | include_header:
34 | type: boolean
35 | label: "include the header in the output"
36 | default: false
37 | inputBinding:
38 | prefix: -h
39 | ignore_previous_version:
40 | type: boolean
41 | label: "ignored for compatibility with previous samtools versions"
42 | default: false
43 | inputBinding:
44 | prefix: -S
45 | filter_alignments:
46 | type: string?
47 | label: "Do not output alignments with any bits set in INT present in the FLAG field. INT can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' (i.e. /^0[0-7]+/) [0]."
48 | inputBinding:
49 | prefix: -F
50 | skip_alignments:
51 | type: int?
52 | label: "Skip alignments with MAPQ smaller than INT [0]."
53 | inputBinding:
54 | prefix: -q
55 | outputs:
56 | bam:
57 | type: File
58 | outputBinding:
59 | glob: "$(inputs.output_filename)"
60 | stdout: samtools-view-stdout.log
61 | stderr: samtools-view-stderr.log
62 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/abpoa.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | readsFA: File
5 | script:
6 | type: File
7 | default: {class: File, location: relabel-seqs.py}
8 | outputs:
9 | abpoaGFA:
10 | type: stdout
11 | requirements:
12 | InlineJavascriptRequirement: {}
13 | hints:
14 | DockerRequirement:
15 | dockerPull: "quay.io/biocontainers/abpoa:1.0.5--hed695b0_0"
16 | ResourceRequirement:
17 | coresMin: 1
18 | ramMin: $(15 * 1024)
19 | outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
20 | baseCommand: abpoa
21 | stdout: $(inputs.readsFA.nameroot).O0.gfa
22 | arguments: [
23 | $(inputs.readsFA),
24 | -r 3,
25 | -O, '0'
26 | ]
27 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/arv-main.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: Workflow
3 | requirements:
4 | SubworkflowFeatureRequirement: {}
5 | inputs:
6 | src_project: string
7 | metadataSchema: File
8 | exclude: File?
9 | outputs:
10 | odgiGraph:
11 | type: File
12 | outputSource: pangenome-generate/odgiGraph
13 | # odgiPNG:
14 | # type: File
15 | # outputSource: pangenome-generate/odgiPNG
16 | spoaGFA:
17 | type: File
18 | outputSource: pangenome-generate/spoaGFA
19 | odgiRDF:
20 | type: File
21 | outputSource: pangenome-generate/odgiRDF
22 | readsMergeDedup:
23 | type: File
24 | outputSource: pangenome-generate/readsMergeDedupSortedByQualAndLen
25 | mergedMetadata:
26 | type: File
27 | outputSource: pangenome-generate/mergedMetadata
28 | # indexed_paths:
29 | # type: File
30 | # outputSource: pangenome-generate/indexed_paths
31 | # colinear_components:
32 | # type: Directory
33 | # outputSource: pangenome-generate/colinear_components
34 | steps:
35 | collect-seqs:
36 | run: collect-seqs.cwl
37 | in:
38 | src_project: src_project
39 | metadataSchema: metadataSchema
40 | exclude: exclude
41 | out: [relabeledSeqs, mergedMetadata]
42 | pangenome-generate:
43 | run: pangenome-generate_spoa.cwl
44 | in:
45 | seqs: collect-seqs/relabeledSeqs
46 | metadata: collect-seqs/mergedMetadata
47 | exclude: exclude
48 | out: [odgiGraph, spoaGFA, odgiRDF, readsMergeDedupSortedByQualAndLen, mergedMetadata]
49 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/arvados-and-samtools-dockerfile/1078ECD7.key:
--------------------------------------------------------------------------------
1 | -----BEGIN PGP PUBLIC KEY BLOCK-----
2 |
3 | mQENBEzhgeoBCAChhoK1dqpWzNyDWqRGEvdFdkJaA9D2HRwKPfBfjAoePX6ZyrpA
4 | ItlUsvt/8s/DRiTiPEFQR4S7VqocmU6whJc3gDEGyOM6b1NF873lIfSVwUoE42QE
5 | a76dO8woOYgLUyxu2mKG+bJgGMumjBJt6ZOndYVjTYB/7sEeVxwmMVulfZe0s6zg
6 | ut0+SoTYg2R36qIqeIcWllYt97sEYnyy1qXMis4/3IZnuWkS/frsPR3aeUI4W+o2
7 | NDN1kj49+LMe7Fb5b7jZY08rZbAWXi1rU1hQx4jC9RvYqlT4HNld4Bn7os1IvOOA
8 | wNiR0oiVdiuDbBxcMvRPktxMrFVjowusRLq/ABEBAAG0PUN1cm92ZXJzZSwgSW5j
9 | IEF1dG9tYXRpYyBTaWduaW5nIEtleSA8c3lzYWRtaW5AY3Vyb3ZlcnNlLmNvbT6J
10 | ATgEEwECACIFAlNgYIECGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJEFcW
11 | WREQeOzXPkEH/jQJDIYI1dxWcYiA+hczmpaZvN2/pc/kwIW/6a03+6zqmSNkebOE
12 | TgoDILacSYc17hy20R1/rWyUstOMKcEgFDBlSehhHyl0f7q/w7d8Ais6MabzsPfx
13 | IceJpsjUg87+BR7qWhgQ0sxmtIF2TKuTFLs+nkGsgSsiBOEF4NvHxuj3HD4y8F27
14 | HNqrkqwjLS8xJwwH5Gp2uMEVr1AXIH3iSRjJ8X124s8iEP97Q/3IazoYRf9/MCSm
15 | QEx8KzxwDX6t4bW6O4D01K+e9gdkTY70dcMgJoqm5IsX7yxjEubiOunphtlJnZ9d
16 | Oi1yBN5UM3pWKAdcfRj4rcfV9Simvpx9av+5AQ0ETOGB6gEIAMAA0HVMG0BbdnU7
17 | wWgl5eFdT0AUSrXK/WdcKqVEGGv+c68NETSHWZOJX7O46Eao4gY4cTYprVMBzxpY
18 | /BtQSYLpE0HLvBc1fcFd61Yz4H/9rGSNY0GcIQEbOjbJY5mr8qFsQ1K/mAf3aUL3
19 | b6ni4sHVicRiRr0Gl4Ihorlskpfu1SHs/C5tvTSVNF9p4vtl5892y1yILQeVpcBs
20 | NCR7MUpdS49xCpvnAWsDZX+ij6LTR3lzCm/ZLCg4gNuZkjgU9oqVfGkqysW7WZ8S
21 | OLvzAwUw7i1EIFX8q6QdudGoezxz8m8OgZM1v8AFpYEKlhEPf1W0MSfaRDwrj866
22 | 8nCLruEAEQEAAYkBHwQYAQIACQUCTOGB6gIbDAAKCRBXFlkREHjs199EB/4+p0G1
23 | 3PHxt6rLWSCGXobDOu4ZOA/qnv0D/JhOLroFds5TzQv6vnS8eAkhCTjHVA+b58cm
24 | kXpI0oYcD4ZP+KK1CHKq2rGfwou7HfAF+icnNqYkeBOkjjbCgkvBlcCInuAuU8JX
25 | DZMkfFk52+eBKwTjS/J/fQp0vDru8bHLp98WgdRHWfJQ3mc3gz4A5sR6zhrGPW6/
26 | ssnROS4dC2Ohp35GpgN1KjD3EmEw5RoSBYlyrARCaMsivgIKMxGUEyFZWhuJt3N1
27 | 2MTddRwz28hbmYCi+MzHYDbRv+cSyUDmvXaWhfkNKBepClBA1rTWBcldit5vvlqr
28 | yPet6wIKrtLGhAqZ
29 | =CLkG
30 | -----END PGP PUBLIC KEY BLOCK-----
31 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/arvados-and-samtools-dockerfile/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian:10
2 | ENV DEBIAN_FRONTEND noninteractive
3 | RUN apt-get update -q
4 | RUN apt-get install -yq --no-install-recommends gnupg
5 | ADD 1078ECD7.key /tmp/
6 | RUN cat /tmp/1078ECD7.key | apt-key add -
7 | RUN echo 'deb http://apt.arvados.org/ buster main' > /etc/apt/sources.list.d/apt.arvados.org-stable.list
8 | RUN apt-get update -q && apt-get install -yq --no-install-recommends samtools python3-arvados-python-client
9 | RUN rm -f /usr/bin/python && ln -s /usr/share/python3/dist/python3-arvados-python-client/bin/python /usr/bin/python
10 | RUN rm -f /usr/bin/python3 && ln -s /usr/share/python3/dist/python3-arvados-python-client/bin/python /usr/bin/python3
11 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/collect-seqs.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | $namespaces:
4 | arv: "http://arvados.org/cwl#"
5 | cwltool: "http://commonwl.org/cwltool#"
6 | requirements:
7 | arv:APIRequirement: {}
8 | arv:RuntimeConstraints:
9 | outputDirType: keep_output_dir
10 | DockerRequirement:
11 | dockerImageId: arvados-and-samtools
12 | WorkReuse:
13 | enableReuse: false
14 | ResourceRequirement:
15 | coresMin: 1
16 | ramMin: 1024
17 | baseCommand: python3
18 | inputs:
19 | script:
20 | type: File
21 | default:
22 | class: File
23 | location: collect-seqs.py
24 | inputBinding: {position: 1}
25 | src_project:
26 | type: string
27 | inputBinding: {position: 2}
28 | metadataSchema:
29 | type: File
30 | inputBinding: {position: 3}
31 | exclude:
32 | type: File?
33 | inputBinding: {position: 4}
34 | outputs:
35 | relabeledSeqs:
36 | type: File
37 | outputBinding:
38 | glob: relabeledSeqs.fasta
39 | secondaryFiles: [.fai]
40 | mergedMetadata:
41 | type: File
42 | outputBinding:
43 | glob: mergedMetadata.ttl
44 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/collect-seqs.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import arvados
3 | import json
4 | import shutil
5 | import logging
6 | import subprocess
7 | import arvados.collection
8 | import ruamel.yaml
9 | import schema_salad.schema
10 | import schema_salad.jsonld_context
11 | from schema_salad.sourceline import add_lc_filename
12 |
13 | api = arvados.api()
14 | keepclient = arvados.keep.KeepClient(api_client=api)
15 |
16 | validated = arvados.util.list_all(api.collections().list, filters=[
17 | ["owner_uuid", "=", sys.argv[1]],
18 | ["properties.status", "=", "validated"]])
19 |
20 | validated.sort(key=lambda v: v["portable_data_hash"])
21 |
22 | relabeled_fasta = open("relabeledSeqs.fasta", "wt")
23 | merged_metadata = open("mergedMetadata.ttl", "wt")
24 |
25 | metadataSchema = sys.argv[2]
26 |
27 | blacklist = set()
28 | if len(sys.argv) > 3:
29 | with open(sys.argv[3]) as bl:
30 | for l in bl:
31 | blacklist.add(l.strip())
32 |
33 | (document_loader,
34 | avsc_names,
35 | schema_metadata,
36 | metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)
37 |
38 |
39 | for item in validated:
40 | pdh = item["portable_data_hash"]
41 | uuid = item["uuid"]
42 | try:
43 | subject = "http://covid19.genenetwork.org/resource/%s" % uuid
44 | with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col:
45 | with col.open("metadata.yaml", "rt") as md:
46 | metadata_content = ruamel.yaml.round_trip_load(md)
47 | metadata_content["id"] = subject
48 | add_lc_filename(metadata_content, metadata_content["id"])
49 | doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False)
50 | g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx)
51 |
52 | with col.open("sequence.fasta", "rt") as fa:
53 | label = fa.readline().strip()
54 | merged_metadata.write("<%s> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"')))
55 | merged_metadata.write("<%s> \"%s\" .\n" % (subject, pdh))
56 | merged_metadata.write("<%s> \"%s\" .\n" % (subject, item["version"]))
57 | skip = (subject in blacklist or label[1:] in blacklist)
58 | if skip:
59 | merged_metadata.write("<%s> \"true\"^^ .\n" % subject)
60 | if not skip:
61 | relabeled_fasta.write(">"+subject+"\n")
62 | data = fa.read(8096)
63 | while data:
64 | if not skip:
65 | relabeled_fasta.write(data)
66 | endswithnewline = data.endswith("\n")
67 | data = fa.read(8096)
68 | if not skip and not endswithnewline:
69 | relabeled_fasta.write("\n")
70 |
71 | merged_metadata.write(g.serialize(format="ntriples").decode("utf-8"))
72 | except Exception as e:
73 | logging.exception("Error processing collection %s" % uuid)
74 |
75 | # make sure both output files are flushed to disk before samtools indexes the fasta
76 | relabeled_fasta.close()
77 | merged_metadata.close()
78 | subprocess.run(["samtools", "faidx", "relabeledSeqs.fasta"])
79 |
80 | shutil.rmtree(".cache")
81 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/dups2metadata.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | baseCommand: python
4 | inputs:
5 | script:
6 | type: File
7 | default:
8 | class: File
9 | location: dups2metadata.py
10 | inputBinding: {position: 1}
11 | metadata:
12 | type: File
13 | inputBinding: {position: 2}
14 | dups:
15 | type: File?
16 | inputBinding: {position: 3}
17 | stdout: mergedmetadata.ttl
18 | outputs:
19 | merged: stdout
20 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/dups2metadata.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import re
3 |
4 | md = open(sys.argv[1], "rt")
5 | for d in md:
6 | sys.stdout.write(d)
7 |
8 | if len(sys.argv) < 3:
9 | exit(0)
10 |
11 | sameseqs = open(sys.argv[2], "rt")
12 | for d in sameseqs:
13 | g = re.match(r"\d+\t(.*)", d)
14 | sp = g.group(1).split(",")
15 | for n in sp[1:]:
16 | sys.stdout.write("<%s> <%s> .\n" % (n.strip(), sp[0].strip()))
17 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/from_sparql.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | $namespaces:
4 | arv: "http://arvados.org/cwl#"
5 | requirements:
6 | DockerRequirement:
7 | dockerFile: |
8 | FROM debian:10
9 | RUN apt-get update && apt-get -yq --no-install-recommends install samtools python3-rdflib
10 | dockerImageId: rdflib-and-samtools
11 | ResourceRequirement:
12 | ramMin: 768
13 | arv:RuntimeConstraints:
14 | keep_cache: 2048
15 | outputDirType: keep_output_dir
16 | inputs:
17 | script:
18 | type: File
19 | default:
20 | class: File
21 | location: from_sparql.py
22 | metadata: File
23 | fasta:
24 | type: File
25 | secondaryFiles: [.fai]
26 | query: string
27 | stdout: selected.fasta
28 | outputs:
29 | selected: stdout
30 | arguments: [python3, $(inputs.script), $(inputs.metadata), $(inputs.fasta), $(inputs.query)]
31 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/from_sparql.py:
--------------------------------------------------------------------------------
1 | from rdflib import Graph
2 | import sys
3 | import subprocess
4 | g = Graph()
5 | g.parse(sys.argv[1], format="nt")
6 | res = g.query(sys.argv[3])
7 | for r in res:
8 | subprocess.run(["samtools", "faidx", sys.argv[2], r[0]])
9 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/merge-metadata.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | hints:
4 | DockerRequirement:
5 | dockerPull: commonworkflowlanguage/cwltool_module
6 | inputs:
7 | metadata: File[]
8 | subjects: string[]
9 | metadataSchema:
10 | type: File
11 | inputBinding: {position: 2}
12 | originalLabels:
13 | type: File
14 | inputBinding: {position: 3}
15 | dups:
16 | type: File?
17 | inputBinding: {position: 4}
18 | script:
19 | type: File
20 | inputBinding: {position: 1}
21 | default: {class: File, location: merge-metadata.py}
22 | outputs:
23 | merged: stdout
24 | stdout: mergedmetadata.ttl
25 | requirements:
26 | InlineJavascriptRequirement: {}
27 | InitialWorkDirRequirement:
28 | listing: |
29 | ${
30 | var i = 0;
31 | var b = 1;
32 | var out = [];
33 | for (; i < inputs.metadata.length; i++) {
34 | var block = [];
35 | var sub = [];
36 | for (; i < (b*150) && i < inputs.metadata.length; i++) {
37 | block.push(inputs.metadata[i]);
38 | sub.push(inputs.subjects[i]);
39 | }
40 | out.push({
41 | entryname: "block"+b,
42 | entry: JSON.stringify(block)
43 | });
44 | out.push({
45 | entryname: "subs"+b,
46 | entry: JSON.stringify(sub)
47 | });
48 | b++;
49 | }
50 | return out;
51 | }
52 | baseCommand: python
53 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/merge-metadata.py:
--------------------------------------------------------------------------------
1 | import re
2 | import schema_salad.schema
3 | import schema_salad.jsonld_context
4 | import json
5 | import sys
6 | import os
7 | import logging
8 |
9 | metadataSchema = sys.argv[1]
10 | originalLabels = sys.argv[2]
11 | dups = None
12 | if len(sys.argv) == 4:
13 | dups = sys.argv[3]
14 |
15 | def readitems(stem):
16 | items = []
17 | b = 1
18 | while os.path.exists("%s%i" % (stem, b)):
19 | with open("%s%i" % (stem, b)) as f:
20 | items.extend(json.load(f))
21 | b += 1
22 | return items
23 |
24 | metadata = readitems("block")
25 | subjects = readitems("subs")
26 |
27 | (document_loader,
28 | avsc_names,
29 | schema_metadata,
30 | metaschema_loader) = schema_salad.schema.load_schema(metadataSchema)
31 |
32 | for i, m in enumerate(metadata):
33 | doc, _ = schema_salad.schema.load_and_validate(document_loader, avsc_names, m["path"], False, False)
34 | doc["id"] = subjects[i]
35 | g = schema_salad.jsonld_context.makerdf(subjects[i], doc, document_loader.ctx)
36 | print(g.serialize(format="ntriples").decode("utf-8"))
37 |
38 | if dups:
39 | sameseqs = open(dups, "rt")
40 | for d in sameseqs:
41 | logging.warning(d)
42 | g = re.match(r"\d+\t(.*)", d)
43 | logging.warning("%s", g.group(1))
44 | sp = g.group(1).split(",")
45 | for n in sp[1:]:
46 | print("<%s> <%s> ." % (n.strip(), sp[0].strip()))
47 |
48 | orig = open(originalLabels, "rt")
49 | print(orig.read())
50 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | inputGFA: File
5 | outputs:
6 | odgiGraph:
7 | type: File
8 | outputBinding:
9 | glob: $(inputs.inputGFA.nameroot).unchop.sorted.odgi
10 | requirements:
11 | InlineJavascriptRequirement: {}
12 | hints:
13 | DockerRequirement:
14 | dockerPull: "odgi-bash-binutils:latest"
15 | ResourceRequirement:
16 | coresMin: 4
17 | ramMin: $(15 * 1024)
18 | outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2))
19 | InitialWorkDirRequirement:
20 | # Will fail if input file is not writable (odgi bug)
21 | listing:
22 | - entry: $(inputs.inputGFA)
23 | writable: true
24 | arguments:
25 | - "sh"
26 | - "-c"
27 | - >-
28 | odgi build -g '$(inputs.inputGFA.path)' -o - | odgi unchop -i - -o - |
29 | odgi sort -i - -p s -o $(inputs.inputGFA.nameroot).unchop.sorted.odgi
30 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/odgi-build.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | inputGFA: File
5 | outputs:
6 | odgiGraph:
7 | type: File
8 | outputBinding:
9 | glob: $(inputs.inputGFA.nameroot).odgi
10 | requirements:
11 | InlineJavascriptRequirement: {}
12 | ShellCommandRequirement: {}
13 | hints:
14 | DockerRequirement:
15 | dockerPull: "quay.io/biocontainers/odgi:v0.3--py37h8b12597_0"
16 | ResourceRequirement:
17 | coresMin: 4
18 | ramMin: $(7 * 1024)
19 | outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2))
20 | InitialWorkDirRequirement:
21 | listing:
22 | - entry: $(inputs.inputGFA)
23 | writable: true
24 | arguments: [odgi, build, -g, $(inputs.inputGFA), -o, -,
25 | {shellQuote: false, valueFrom: "|"},
26 | odgi, sort, -i, -, -p, s, -o, $(inputs.inputGFA.nameroot).odgi]
27 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/odgi_to_rdf.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | class: CommandLineTool
3 | cwlVersion: v1.1
4 | hints:
5 | DockerRequirement:
6 | dockerPull: jerven/spodgi:0.0.6
7 | requirements:
8 | InlineJavascriptRequirement: {}
9 | ShellCommandRequirement: {}
10 | ResourceRequirement:
11 | ramMin: $((2 * 1024) + 1)
12 | inputs:
13 | odgi: File
14 | output_name: string?
15 |
16 | stdout: $(inputs.output_name || inputs.odgi.nameroot+'.ttl.xz')
17 |
18 | arguments:
19 | [odgi_to_rdf.py, $(inputs.odgi), "-",
20 | {valueFrom: "|", shellQuote: false},
21 | xz, --stdout]
22 |
23 | outputs:
24 | rdf: stdout
25 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/pangenome-generate.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.1
3 | class: Workflow
4 | requirements:
5 | ScatterFeatureRequirement: {}
6 | StepInputExpressionRequirement: {}
7 | inputs:
8 | inputReads: File[]
9 | metadata: File[]
10 | metadataSchema: File
11 | subjects: string[]
12 | exclude: File?
13 | bin_widths:
14 | type: int[]
15 | default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
16 | doc: width of each bin in basepairs along the graph vector
17 | cells_per_file:
18 | type: int
19 | default: 100
20 | doc: Cells per file on component_segmentation
21 | outputs:
22 | odgiGraph:
23 | type: File
24 | outputSource: buildGraph/odgiGraph
25 | odgiPNG:
26 | type: File
27 | outputSource: vizGraph/graph_image
28 | seqwishGFA:
29 | type: File
30 | outputSource: induceGraph/seqwishGFA
31 | odgiRDF:
32 | type: File
33 | outputSource: odgi2rdf/rdf
34 | readsMergeDedup:
35 | type: File
36 | outputSource: dedup/reads_dedup
37 | mergedMetadata:
38 | type: File
39 | outputSource: mergeMetadata/merged
40 | indexed_paths:
41 | type: File
42 | outputSource: index_paths/indexed_paths
43 | colinear_components:
44 | type: Directory
45 | outputSource: segment_components/colinear_components
46 | steps:
47 | relabel:
48 | in:
49 | readsFA: inputReads
50 | subjects: subjects
51 | exclude: exclude
52 | out: [relabeledSeqs, originalLabels]
53 | run: relabel-seqs.cwl
54 | dedup:
55 | in: {reads: relabel/relabeledSeqs}
56 | out: [reads_dedup, dups]
57 | run: ../tools/seqkit/seqkit_rmdup.cwl
58 | overlapReads:
59 | in:
60 | target: dedup/reads_dedup
61 | query: dedup/reads_dedup
62 | outputCIGAR: {default: true}
63 | preset: {default: asm20}
64 | miniWinSize: {default: 1}
65 | out: [alignments]
66 | run: ../tools/minimap2/minimap2_paf.cwl
67 | induceGraph:
68 | in:
69 | readsFA: dedup/reads_dedup
70 | readsPAF: overlapReads/alignments
71 | out: [seqwishGFA]
72 | run: seqwish.cwl
73 | buildGraph:
74 | in: {inputGFA: induceGraph/seqwishGFA}
75 | out: [odgiGraph]
76 | run: odgi-build.cwl
77 | vizGraph:
78 | in:
79 | sparse_graph_index: buildGraph/odgiGraph
80 | width:
81 | default: 50000
82 | height:
83 | default: 500
84 | path_per_row:
85 | default: true
86 | path_height:
87 | default: 4
88 | out: [graph_image]
89 | run: ../tools/odgi/odgi_viz.cwl
90 | odgi2rdf:
91 | in: {odgi: buildGraph/odgiGraph}
92 | out: [rdf]
93 | run: odgi_to_rdf.cwl
94 | mergeMetadata:
95 | in:
96 | metadata: metadata
97 | metadataSchema: metadataSchema
98 | subjects: subjects
99 | dups: dedup/dups
100 | originalLabels: relabel/originalLabels
101 | out: [merged]
102 | run: merge-metadata.cwl
103 | bin_paths:
104 | run: ../tools/odgi/odgi_bin.cwl
105 | in:
106 | sparse_graph_index: buildGraph/odgiGraph
107 | bin_width: bin_widths
108 | scatter: bin_width
109 | out: [ bins, pangenome_sequence ]
110 | index_paths:
111 | label: Create path index
112 | run: ../tools/odgi/odgi_pathindex.cwl
113 | in:
114 | sparse_graph_index: buildGraph/odgiGraph
115 | out: [ indexed_paths ]
116 | segment_components:
117 | label: Run component segmentation
118 | run: ../tools/graph-genome-segmentation/component_segmentation.cwl
119 | in:
120 | bins: bin_paths/bins
121 | cells_per_file: cells_per_file
122 | pangenome_sequence:
123 | source: bin_paths/pangenome_sequence
124 | valueFrom: $(self[0])
125 | # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
126 | # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
127 | # regardless of bin_width, so we take the first pangenome_sequence as input for this step
128 | out: [ colinear_components ]
129 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/pangenome-generate_abpoa.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.1
3 | class: Workflow
4 | requirements:
5 | ScatterFeatureRequirement: {}
6 | StepInputExpressionRequirement: {}
7 | inputs:
8 | seqs: File
9 | metadata: File
10 | bin_widths:
11 | type: int[]
12 | default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
13 | doc: width of each bin in basepairs along the graph vector
14 | cells_per_file:
15 | type: int
16 | default: 100
17 | doc: Cells per file on component_segmentation
18 | reversed_sort:
19 | type: string
20 | default: "false"
21 | outputs:
22 | odgiGraph:
23 | type: File
24 | outputSource: buildGraph/odgiGraph
25 | odgiPNG:
26 | type: File
27 | outputSource: vizGraph/graph_image
28 | abpoaGFA:
29 | type: File
30 | outputSource: induceGraph/abpoaGFA
31 | # odgiRDF:
32 | # type: File
33 | # outputSource: odgi2rdf/rdf
34 | readsMergeDedupSortedByQualAndLen:
35 | type: File
36 | outputSource: dedup_and_sort_by_quality_and_len/sortedReadsFA
37 | mergedMetadata:
38 | type: File
39 | outputSource: dups2metadata/merged
40 | # indexed_paths:
41 | # type: File
42 | # outputSource: index_paths/indexed_paths
43 | # colinear_components:
44 | # type: Directory
45 | # outputSource: segment_components/colinear_components
46 | steps:
47 | dedup_and_sort_by_quality_and_len:
48 | in: {readsFA: seqs, reversed_sorting: reversed_sort}
49 | out: [sortedReadsFA, dups]
50 | run: sort_fasta_by_quality_and_len.cwl
51 | induceGraph:
52 | in:
53 | readsFA: dedup_and_sort_by_quality_and_len/sortedReadsFA
54 | out: [abpoaGFA]
55 | run: abpoa.cwl
56 | buildGraph:
57 | in: {inputGFA: induceGraph/abpoaGFA}
58 | out: [odgiGraph]
59 | run: odgi-build-from-xpoa-gfa.cwl
60 | vizGraph:
61 | in:
62 | sparse_graph_index: buildGraph/odgiGraph
63 | width:
64 | default: 50000
65 | height:
66 | default: 500
67 | path_per_row:
68 | default: true
69 | path_height:
70 | default: 4
71 | out: [graph_image]
72 | requirements:
73 | ResourceRequirement:
74 | ramMin: $(15 * 1024)
75 | outdirMin: 10
76 | run: ../tools/odgi/odgi_viz.cwl
77 | # odgi2rdf:
78 | # in: {odgi: buildGraph/odgiGraph}
79 | # out: [rdf]
80 | # run: odgi_to_rdf.cwl
81 | dups2metadata:
82 | in:
83 | metadata: metadata
84 | dups: dedup_and_sort_by_quality_and_len/dups
85 | out: [merged]
86 | run: dups2metadata.cwl
87 | # bin_paths:
88 | # requirements:
89 | # ResourceRequirement:
90 | # ramMin: 3000
91 | # outdirMin: 10
92 | # run: ../tools/odgi/odgi_bin.cwl
93 | # in:
94 | # sparse_graph_index: buildGraph/odgiGraph
95 | # bin_width: bin_widths
96 | # scatter: bin_width
97 | # out: [ bins, pangenome_sequence ]
98 | # index_paths:
99 | # label: Create path index
100 | # requirements:
101 | # ResourceRequirement:
102 | # ramMin: 3000
103 | # outdirMin: 10
104 | # run: ../tools/odgi/odgi_pathindex.cwl
105 | # in:
106 | # sparse_graph_index: buildGraph/odgiGraph
107 | # out: [ indexed_paths ]
108 | # segment_components:
109 | # label: Run component segmentation
110 | # run: ../tools/graph-genome-segmentation/component_segmentation.cwl
111 | # in:
112 | # bins: bin_paths/bins
113 | # cells_per_file: cells_per_file
114 | # pangenome_sequence:
115 | # source: bin_paths/pangenome_sequence
116 | # valueFrom: $(self[0])
117 | # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
118 | # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
119 | # # regardless of bin_width, so we take the first pangenome_sequence as input for this step
120 | # out: [ colinear_components ]
121 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/pangenome-generate_spoa.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 | cwlVersion: v1.1
3 | class: Workflow
4 | requirements:
5 | ScatterFeatureRequirement: {}
6 | StepInputExpressionRequirement: {}
7 | inputs:
8 | seqs: File
9 | metadata: File
10 | bin_widths:
11 | type: int[]
12 | default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000]
13 | doc: width of each bin in basepairs along the graph vector
14 | cells_per_file:
15 | type: int
16 | default: 100
17 | doc: Cells per file on component_segmentation
18 | reversed_sort:
19 | type: string
20 | default: "true"
21 | outputs:
22 | odgiGraph:
23 | type: File
24 | outputSource: buildGraph/odgiGraph
25 | odgiPNG:
26 | type: File
27 | outputSource: vizGraph/graph_image
28 | spoaGFA:
29 | type: File
30 | outputSource: induceGraph/spoaGFA
31 | # odgiRDF:
32 | # type: File
33 | # outputSource: odgi2rdf/rdf
34 | readsMergeDedupSortedByQualAndLen:
35 | type: File
36 | outputSource: dedup_and_sort_by_quality_and_len/sortedReadsFA
37 | mergedMetadata:
38 | type: File
39 | outputSource: dups2metadata/merged
40 | # indexed_paths:
41 | # type: File
42 | # outputSource: index_paths/indexed_paths
43 | # colinear_components:
44 | # type: Directory
45 | # outputSource: segment_components/colinear_components
46 | steps:
47 | dedup_and_sort_by_quality_and_len:
48 | in: {readsFA: seqs, reversed_sorting: reversed_sort}
49 | out: [sortedReadsFA, dups]
50 | run: sort_fasta_by_quality_and_len.cwl
51 | induceGraph:
52 | in:
53 | readsFA: dedup_and_sort_by_quality_and_len/sortedReadsFA
54 | out: [spoaGFA]
55 | run: spoa.cwl
56 | buildGraph:
57 | in: {inputGFA: induceGraph/spoaGFA}
58 | out: [odgiGraph]
59 | run: odgi-build-from-xpoa-gfa.cwl
60 | vizGraph:
61 | in:
62 | sparse_graph_index: buildGraph/odgiGraph
63 | width:
64 | default: 50000
65 | height:
66 | default: 500
67 | path_per_row:
68 | default: true
69 | path_height:
70 | default: 4
71 | out: [graph_image]
72 | requirements:
73 | ResourceRequirement:
74 | ramMin: $(15 * 1024)
75 | outdirMin: 10
76 | run: ../tools/odgi/odgi_viz.cwl
77 | # odgi2rdf:
78 | # in: {odgi: buildGraph/odgiGraph}
79 | # out: [rdf]
80 | # run: odgi_to_rdf.cwl
81 | dups2metadata:
82 | in:
83 | metadata: metadata
84 | dups: dedup_and_sort_by_quality_and_len/dups
85 | out: [merged]
86 | run: dups2metadata.cwl
87 | # bin_paths:
88 | # requirements:
89 | # ResourceRequirement:
90 | # ramMin: 3000
91 | # outdirMin: 10
92 | # run: ../tools/odgi/odgi_bin.cwl
93 | # in:
94 | # sparse_graph_index: buildGraph/odgiGraph
95 | # bin_width: bin_widths
96 | # scatter: bin_width
97 | # out: [ bins, pangenome_sequence ]
98 | # index_paths:
99 | # label: Create path index
100 | # requirements:
101 | # ResourceRequirement:
102 | # ramMin: 3000
103 | # outdirMin: 10
104 | # run: ../tools/odgi/odgi_pathindex.cwl
105 | # in:
106 | # sparse_graph_index: buildGraph/odgiGraph
107 | # out: [ indexed_paths ]
108 | # segment_components:
109 | # label: Run component segmentation
110 | # run: ../tools/graph-genome-segmentation/component_segmentation.cwl
111 | # in:
112 | # bins: bin_paths/bins
113 | # cells_per_file: cells_per_file
114 | # pangenome_sequence:
115 | # source: bin_paths/pangenome_sequence
116 | # valueFrom: $(self[0])
117 | # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index
118 | # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index
119 | # # regardless of bin_width, so we take the first pangenome_sequence as input for this step
120 | # out: [ colinear_components ]
121 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/query-to-gfa.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: Workflow
3 | requirements:
4 | SubworkflowFeatureRequirement: {}
5 | inputs:
6 | metadata: File
7 | fasta:
8 | type: File
9 | secondaryFiles: [.fai]
10 | query: string
11 | outputs:
12 | odgiGraph:
13 | type: File
14 | outputSource: make-gfa/odgiGraph
15 | spoaGFA:
16 | type: File
17 | outputSource: make-gfa/spoaGFA
18 | readsMergeDedupSortedByQualAndLen:
19 | type: File
20 | outputSource: make-gfa/readsMergeDedupSortedByQualAndLen
21 | mergedMetadata:
22 | type: File
23 | outputSource: make-gfa/mergedMetadata
24 | steps:
25 | get-subset:
26 | run: from_sparql.cwl
27 | in: {metadata: metadata, query: query, fasta: fasta}
28 | out: [selected]
29 | make-gfa:
30 | run: pangenome-generate_spoa.cwl
31 | in: {metadata: metadata, seqs: get-subset/selected}
32 | out: [odgiGraph, spoaGFA, readsMergeDedupSortedByQualAndLen, mergedMetadata]
33 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/relabel-seqs.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | readsFA: File[]
5 | subjects: string[]
6 | exclude:
7 | type: File?
8 | inputBinding: {position: 2}
9 | script:
10 | type: File
11 | default: {class: File, location: relabel-seqs.py}
12 | inputBinding: {position: 1}
13 | outputs:
14 | relabeledSeqs:
15 | type: File
16 | outputBinding:
17 | glob: relabeledSeqs.fasta
18 | originalLabels:
19 | type: File
20 | outputBinding:
21 | glob: originalLabels.ttl
22 | requirements:
23 | InlineJavascriptRequirement: {}
24 | InitialWorkDirRequirement:
25 | listing: |
26 | ${
27 | var i = 0;
28 | var b = 1;
29 | var out = [];
30 | for (; i < inputs.readsFA.length; i++) {
31 | var block = [];
32 | var sub = [];
33 | for (; i < (b*150) && i < inputs.readsFA.length; i++) {
34 | block.push(inputs.readsFA[i]);
35 | sub.push(inputs.subjects[i]);
36 | }
37 | out.push({
38 | entryname: "block"+b,
39 | entry: JSON.stringify(block)
40 | });
41 | out.push({
42 | entryname: "subs"+b,
43 | entry: JSON.stringify(sub)
44 | });
45 | b++;
46 | }
47 | return out;
48 | }
49 | hints:
50 | DockerRequirement:
51 | dockerPull: commonworkflowlanguage/cwltool_module
52 | baseCommand: [python]
53 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/relabel-seqs.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import sys
4 |
5 | def readitems(stem):
6 | items = []
7 | b = 1
8 | while os.path.exists("%s%i" % (stem, b)):
9 | with open("%s%i" % (stem, b)) as f:
10 | items.extend(json.load(f))
11 | b += 1
12 | return items
13 |
14 | reads = readitems("block")
15 | subjects = readitems("subs")
16 |
17 | relabeled_fasta = open("relabeledSeqs.fasta", "wt")
18 | original_labels = open("originalLabels.ttl", "wt")
19 |
20 | blacklist = set()
21 | if len(sys.argv) > 1:
22 | with open(sys.argv[1]) as bl:
23 | for l in bl:
24 | blacklist.add(l.strip())
25 |
26 | for i, r in enumerate(reads):
27 | with open(r["path"], "rt") as fa:
28 | label = fa.readline().strip()
29 | original_labels.write("<%s> \"%s\" .\n" % (subjects[i], label[1:].replace('"', '\\"')))
30 | skip = (subjects[i] in blacklist or label[1:] in blacklist)
31 | if skip:
32 | original_labels.write("<%s> \"true\"^^ .\n" % (subjects[i]))
33 | if not skip:
34 | relabeled_fasta.write(">"+subjects[i]+"\n")
35 | data = fa.read(8096)
36 | while data:
37 | if not skip:
38 | relabeled_fasta.write(data)
39 | endswithnewline = data.endswith("\n")
40 | data = fa.read(8096)
41 | if not skip and not endswithnewline:
42 | relabeled_fasta.write("\n")
43 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/seqwish.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | readsFA: File
5 | readsPAF: File
6 | kmerSize:
7 | type: int
8 | default: 16
9 | outputs:
10 | seqwishGFA:
11 | type: File
12 | outputBinding:
13 | glob: $(inputs.readsPAF.nameroot).gfa
14 | requirements:
15 | InlineJavascriptRequirement: {}
16 | hints:
17 | DockerRequirement:
18 | dockerPull: "quay.io/biocontainers/seqwish:0.4.1--h8b12597_0"
19 | ResourceRequirement:
20 | coresMin: 4
21 | ramMin: $(7 * 1024)
22 | outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
23 | stdout: $(inputs.readsFA.nameroot).paf
24 | baseCommand: seqwish
25 | arguments: [-t, $(runtime.cores),
26 | -k, $(inputs.kmerSize),
27 | -s, $(inputs.readsFA),
28 | -p, $(inputs.readsPAF),
29 | -g, $(inputs.readsPAF.nameroot).gfa]
30 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl:
--------------------------------------------------------------------------------
1 |
2 | cwlVersion: v1.1
3 | class: CommandLineTool
4 | hints:
5 | ResourceRequirement:
6 | coresMin: 1
7 | ramMin: 3000
8 | inputs:
9 | reversed_sorting:
10 | type: string
11 | inputBinding: {position: 3}
12 | readsFA:
13 | type: File
14 | inputBinding: {position: 2}
15 | script:
16 | type: File
17 | inputBinding: {position: 1}
18 | default: {class: File, location: sort_fasta_by_quality_and_len.py}
19 | stdout: $(inputs.readsFA.nameroot).sorted_by_quality_and_len.fasta
20 | outputs:
21 | sortedReadsFA:
22 | type: stdout
23 | dups:
24 | type: File
25 | outputBinding: {glob: dups.txt}
26 | requirements:
27 | InlineJavascriptRequirement: {}
28 | ShellCommandRequirement: {}
29 | baseCommand: [python]
30 |
31 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/sort_fasta_by_quality_and_len.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Sort the sequences by quality (fraction of called, non-N bases) and then by length; with reversed sorting the best sequence, i.e. the longest one with no uncalled bases, comes first.
4 | # Identical sequences are collapsed: only the first header per hash is written to stdout, and every header sharing a hash is recorded in dups.txt.
5 |
6 | import os
7 | import sys
8 | import gzip
9 |
10 | # import xxhash # Faster library
11 | import hashlib
12 |
13 |
14 | def open_gzipsafe(path_file):
15 | if path_file.endswith('.gz'):
16 | return gzip.open(path_file, 'rt')
17 | else:
18 | return open(path_file)
19 |
20 |
21 | path_fasta = sys.argv[1]
22 |
23 | hash_to_count_and_headers_dict = {}
24 |
25 | header_to_seq_dict = {}
26 | header_percCalledBases_seqLength_list = []
27 |
28 | with open_gzipsafe(path_fasta) as f:
29 | for fasta in f.read().strip('\n>').split('>'):
30 | header = fasta.strip('\n').split('\n')[0]
31 | sequence = ''.join(fasta.strip('\n').split('\n')[1:])
32 |
33 | # hash = xxhash.xxh64(sequence).hexdigest() # Faster library
34 | hash = hashlib.md5(sequence.encode('utf-8')).hexdigest()
35 |
36 | if hash not in hash_to_count_and_headers_dict:
37 | # New sequence
38 | hash_to_count_and_headers_dict[hash] = [0, []]
39 |
40 | header_to_seq_dict[header] = sequence
41 |
42 | seq_len = len(sequence)
43 | header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len])
44 |
45 | hash_to_count_and_headers_dict[hash][0] += 1
46 | hash_to_count_and_headers_dict[hash][1].append(header)
47 |
48 | with open('dups.txt', 'w') as fw:
49 | for count, header_list in hash_to_count_and_headers_dict.values():
50 | fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n')
51 |
52 | reversed_sorting = True if len(sys.argv) > 2 and sys.argv[2].lower() == 'true' else False
53 |
54 | for header, percCalledBases, seqLength_list in sorted(
55 | header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting
56 | ):
57 | sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header]))
58 |
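59 | # Usage sketch (any multi-fasta works; passing 'true' as the second argument requests reversed, best-first sorting):
60 | #   python3 sort_fasta_by_quality_and_len.py relabeledSeqs.fasta true > sorted.fasta
61 | #   cat dups.txt   # one line per unique sequence: "<count>\t<header1>, <header2>, ..."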
--------------------------------------------------------------------------------
/workflows/pangenome-generate/spoa.cwl:
--------------------------------------------------------------------------------
1 | cwlVersion: v1.1
2 | class: CommandLineTool
3 | inputs:
4 | readsFA: File
5 | script:
6 | type: File
7 | default: {class: File, location: relabel-seqs.py}
8 | outputs:
9 | spoaGFA:
10 | type: stdout
11 | requirements:
12 | InlineJavascriptRequirement: {}
13 | hints:
14 | DockerRequirement:
15 | dockerPull: "quay.io/biocontainers/spoa:3.4.0--hc9558a2_0"
16 | ResourceRequirement:
17 | coresMin: 1
18 | ramMin: $(15 * 1024)
19 | outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20))
20 | baseCommand: spoa
21 | stdout: $(inputs.readsFA.nameroot).g6.gfa
22 | arguments: [
23 | $(inputs.readsFA),
24 | -G,
25 | -g, '-6'
26 | ]
27 |
--------------------------------------------------------------------------------
/workflows/pangenome-generate/testjob.yml:
--------------------------------------------------------------------------------
1 | inputReads:
2 | - class: File
3 | location: ../../example/sequence.fasta
4 | - class: File
5 | location: ../../example/sequence.fasta
6 | metadata:
7 | - class: File
8 | location: ../../example/metadata.yaml
9 | - class: File
10 | location: ../../example/metadata.yaml
11 | metadataSchema:
12 | class: File
13 | location: ../../bh20sequploader/bh20seq-schema.yml
14 | subjects:
15 | - http://arvados.org/keep/seq1
16 | - http://arvados.org/keep/seq2
17 |
--------------------------------------------------------------------------------
/workflows/phylogeny/README.md:
--------------------------------------------------------------------------------
1 | A workflow to generate a phylogeny that can be visualized using [auspice](https://github.com/urbanslug/auspice).
2 | Expects a multi-fasta file path at [pggb_fasta][1] and generates a tree in `json` format.
3 |
4 | #### Dependencies
5 |
6 | Depends on:
7 | - [pggb](https://github.com/pangenome/pggb/blob/master/pggb)
8 | * [wfmash](https://github.com/ekg/wfmash)
9 | * [seqwish](https://github.com/ekg/seqwish)
10 | * [smoothxg](https://github.com/pangenome/smoothxg)
11 | * [odgi](https://github.com/vgteam/odgi)
12 |
13 | - [taxophages](https://github.com/urbanslug/taxophages/)
14 | * Clone and run with `python main.py ...`
15 |
16 | - [augur](https://github.com/nextstrain/augur)
17 |
18 |
19 | #### Running
20 |
21 | Expects taxophages to be cloned in the parent directory, but you can update the path at [main_py_script][2] to point to wherever it lives.
22 |
23 | Run the phylogeny workflow with the command below after setting your own path at [pggb_fasta][1].
24 |
25 | ```bash
26 | R_PACKAGES="${HOME}/RLibraries" \ # directory holding R packages; needed if they were installed with install.packages on the server, e.g. https://github.com/urbanslug/taxophages/blob/master/scripts/deps.R
27 | TAXOPHAGES_ENV=server \ # helps taxophages figure out where it is being run
28 | AUGUR_RECURSION_LIMIT=30000 \ # augur needs a higher recursion limit for trees this deeply nested
29 | cwltool --preserve-entire-environment --no-container phylogeny.cwl clado-job.yml
30 | ```
31 |
32 | Alternatively, run any single tool with
33 | ```
34 | cwltool --no-container <tool>.cwl clado-job.yml
35 | ```
36 |
37 | [1]: clado-job.yml#L8
38 | [2]: clado-job.yml#L28
39 |
--------------------------------------------------------------------------------
/workflows/phylogeny/align.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 |
3 | cwlVersion: v1.1
4 |
5 | class: CommandLineTool
6 | baseCommand: pggb
7 |
8 | inputs:
9 | threads:
10 | type: int
11 | inputBinding:
12 | position: 1
13 | prefix: -t
14 |
15 | pggb_wfmash:
16 | type: boolean
17 | inputBinding:
18 | position: 2
19 | prefix: --wfmash
20 |
21 | pggb_fasta:
22 | type: File
23 | inputBinding:
24 | position: 3
25 | prefix: -i
26 |
27 | pggb_mash_k_mer:
28 | type: int
29 | inputBinding:
30 | position: 4
31 | prefix: -K
32 |
33 | pggb_map_percent_identity:
34 | type: int
35 | inputBinding:
36 | position: 5
37 | prefix: -p
38 |
39 | pggb_num_secondary_mappings:
40 | type: int
41 | inputBinding:
42 | position: 6
43 | prefix: -n
44 |
45 | pggb_segment_length:
46 | type: int
47 | inputBinding:
48 | position: 7
49 | prefix: -s
50 |
51 | pggb_output_dir:
52 | type: string
53 | inputBinding:
54 | position: 8
55 | prefix: -o
56 |
57 | outputs:
58 | pggb_odgi_graph:
59 | type: File
60 | outputBinding:
61 | glob: '*.smooth.og'
--------------------------------------------------------------------------------
/workflows/phylogeny/augur.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 |
3 | cwlVersion: v1.1
4 |
5 | class: CommandLineTool
6 | baseCommand: bash
7 |
8 | requirements:
9 | InitialWorkDirRequirement:
10 | listing:
11 | - $(inputs.dataDir)
12 |
13 | inputs:
14 | nextstrain_bash_script:
15 | type: File
16 | inputBinding:
17 | position: 1
18 |
19 | newick_tree_2:
20 | type: File
21 | inputBinding:
22 | position: 2
23 |
24 | metadata_newick:
25 | type: File
26 | inputBinding:
27 | position: 3
28 |
29 | dataDir:
30 | type: Directory
31 |
32 | outputs:
33 | newick_json:
34 | type: File
35 | outputBinding:
36 | glob: 'covid.json'
--------------------------------------------------------------------------------
/workflows/phylogeny/awk-coverage.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 |
3 | cwlVersion: v1.1
4 | class: CommandLineTool
5 |
6 | baseCommand: awk
7 |
8 | inputs:
9 | consensus_regex:
10 | type: string
11 | inputBinding:
12 | position: 1
13 |
14 | coverage_tsv:
15 | type: File
16 | inputBinding:
17 | position: 2
18 |
19 | outputs:
20 | awk_coverage_matrix:
21 | type: stdout
22 |
23 | stdout: coverage.no_consensus.tsv
--------------------------------------------------------------------------------
/workflows/phylogeny/clado-job.yml:
--------------------------------------------------------------------------------
1 | message: Hello world!
2 |
3 | threads: 16
4 |
5 | pggb_wfmash: true
6 | pggb_fasta:
7 | class: File
8 | path: ../data/qc/relabeledSeqs.sorted.qc.100sample.fasta
9 | pggb_mash_k_mer: 19
10 | pggb_map_percent_identity: 95
11 | pggb_num_secondary_mappings: 10000
12 | pggb_segment_length: 5000
13 | pggb_output_dir: "."
14 |
15 | odgi_paths: paths
16 | odgi_graph:
17 | class: File
18 | path: ./relabeledSeqs.sorted.qc.100sample.fasta.pggb-W-s5000-l15000-p95-n10000-a0-K19-k19-w10000-j5000-e5000-I0-R0.smooth.og
19 | haplotypes: true
20 |
21 | consensus_regex: '!/^Consensus/'
22 | coverage_tsv:
23 | class: File
24 | path: ./coverage.tsv
25 |
26 | main_py_script:
27 | class: File
28 | path: ../main.py
29 | metadata: get-metadata
30 | coverage_matrix:
31 | class: File
32 | path: ./coverage.no_consensus.tsv
33 | coverage_matrix_with_metadata: ./coverage.metadata.tsv
34 |
35 | clado-rsvd: clado-rsvd
36 | cladogram_matrix:
37 | class: File
38 | path: ./coverage.metadata.tsv
39 | reduced_matrix: ./coverage.reduced.tsv
40 | svg_figure: 30k_700cm.svg
41 |
42 | newick: gen-newick
43 | newick_dimensions: 100
44 | newick_coverage_matrix:
45 | class: File
46 | path: ./coverage.metadata.tsv
47 | newick_metadata: ./metadata.tsv
48 | newick_tree: ./tree.workflow.nwk
49 |
50 | nextstrain_R_script:
51 | class: File
52 | path: ../taxophages/viz/nextstrain.R
53 |
54 | coverage_matrix_with_metadata_2:
55 | class: File
56 | path: ../data/5k/covmatrix.5k.metadata.tsv
57 |
58 | metadata_only: ./metadata.tsv
59 | newick_tree: tree.workflow.nwk
60 | distance_matrix: distance_matrix.workflow.tsv
61 | rsvd_dimensions: "1000"
62 | filter_unknowns: "TRUE"
63 |
64 | nextstrain_bash_script:
65 | class: File
66 | path: ../scripts/nextstrain.sh
67 |
68 | newick_tree_2:
69 | class: File
70 | path: ./tree.workflow.nwk
71 |
72 | metadata_newick:
73 | class: File
74 | path: ./metadata.tsv
75 |
76 | dataDir:
77 | class: Directory
78 | path: ../config
79 |
--------------------------------------------------------------------------------
/workflows/phylogeny/coverage.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 |
3 | cwlVersion: v1.1
4 |
5 | class: CommandLineTool
6 | baseCommand: odgi
7 |
8 | inputs:
9 | odgi_paths:
10 | type: string
11 | inputBinding:
12 | position: 1
13 |
14 | odgi_graph:
15 | type: File
16 | inputBinding:
17 | position: 2
18 | prefix: -i
19 |
20 | haplotypes:
21 | type: boolean
22 | inputBinding:
23 | position: 4
24 | prefix: -H
25 |
26 | threads:
27 | type: int
28 | inputBinding:
29 | position: 5
30 | prefix: -t
31 |
32 | outputs:
33 | coverage_matrix:
34 | type: stdout
35 |
36 | stdout: coverage.tsv
--------------------------------------------------------------------------------
/workflows/phylogeny/metadata.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 |
3 | cwlVersion: v1.1
4 |
5 | class: CommandLineTool
6 | baseCommand: python
7 |
8 | inputs:
9 | main_py_script:
10 | type: File
11 | inputBinding:
12 | position: 1
13 |
14 | metadata:
15 | type: string
16 | inputBinding:
17 | position: 2
18 |
19 | coverage_matrix:
20 | type: File
21 | inputBinding:
22 | position: 3
23 |
24 | coverage_matrix_with_metadata:
25 | type: string
26 | inputBinding:
27 | position: 4
28 |
29 | outputs:
30 | coverage_matrix_with_metadata_out:
31 | type: File
32 | outputBinding:
33 | glob: '*.metadata.tsv'
--------------------------------------------------------------------------------
/workflows/phylogeny/newick.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 |
3 | cwlVersion: v1.1
4 |
5 | class: CommandLineTool
6 | baseCommand: python
7 |
8 | inputs:
9 | main_py_script:
10 | type: File
11 | inputBinding:
12 | position: 1
13 |
14 | newick:
15 | type: string
16 | inputBinding:
17 | position: 2
18 |
19 | newick_dimensions:
20 | type: int
21 | inputBinding:
22 | position: 3
23 | prefix: -d
24 |
25 | newick_coverage_matrix:
26 | type: File
27 | inputBinding:
28 | position: 3
29 |
30 | newick_metadata:
31 | type: string
32 | inputBinding:
33 | position: 4
34 |
35 | newick_tree:
36 | type: string
37 | inputBinding:
38 | position: 5
39 |
40 | outputs:
41 | metadata_out:
42 | type: File
43 | outputBinding:
44 | glob: 'metadata.tsv'
45 |
46 | newick_tree_out:
47 | type: File
48 | outputBinding:
49 | glob: '*.nwk'
--------------------------------------------------------------------------------
/workflows/phylogeny/phylogeny.cwl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env cwl-runner
2 |
3 | cwlVersion: v1.1
4 | class: Workflow
5 |
6 | #############################################
7 |
8 | inputs:
9 |
10 | # align
11 | threads: int
12 | pggb_wfmash: boolean
13 | pggb_fasta: File
14 | pggb_mash_k_mer: int
15 | pggb_map_percent_identity: int
16 | pggb_num_secondary_mappings: int
17 | pggb_segment_length: int
18 | pggb_output_dir: string
19 |
20 | # extract coverage vector
21 | odgi_paths: string
22 | odgi_graph: File
23 | haplotypes: boolean
24 |   # threads: int  (already declared above; also used by the odgi step)
25 |
26 | # remove consensus paths
27 | consensus_regex: string
28 | coverage_tsv: File
29 |
30 | # Get metadata
31 | main_py_script: File
32 | metadata: string
33 | coverage_matrix: File
34 | coverage_matrix_with_metadata: string
35 |
36 | # Generate newick tree
37 |   # main_py_script: File  (already declared above; reused by the newick step)
38 | newick: string
39 | newick_dimensions: int
40 | newick_coverage_matrix: File
41 | newick_metadata: string
42 | newick_tree: string
43 |
44 |   # Generate augur JSON file
45 | nextstrain_bash_script: File
46 | newick_tree_2: File
47 | metadata_newick: File
48 | dataDir: Directory
49 |
50 |
51 | #############################################
52 |
53 | outputs:
54 | augur_json:
55 | type: File
56 | outputSource: augur/newick_json
57 |
58 | #############################################
59 |
60 | steps:
61 | align:
62 | run: align.cwl
63 | in:
64 | threads: threads
65 | pggb_wfmash: pggb_wfmash
66 | pggb_fasta: pggb_fasta
67 | pggb_mash_k_mer: pggb_mash_k_mer
68 | pggb_map_percent_identity: pggb_map_percent_identity
69 | pggb_num_secondary_mappings: pggb_num_secondary_mappings
70 | pggb_segment_length: pggb_segment_length
71 | pggb_output_dir: pggb_output_dir
72 | out: [pggb_odgi_graph]
73 |
74 | odgi:
75 | run: coverage.cwl
76 | in:
77 | odgi_paths: odgi_paths
78 | odgi_graph: align/pggb_odgi_graph
79 | haplotypes: haplotypes
80 | threads: threads
81 | out: [coverage_matrix]
82 |
83 | awk:
84 | run: awk-coverage.cwl
85 | in:
86 | consensus_regex: consensus_regex
87 | coverage_tsv: odgi/coverage_matrix
88 | out: [awk_coverage_matrix]
89 |
90 | metadata:
91 | run: metadata.cwl
92 | in:
93 | main_py_script: main_py_script
94 | metadata: metadata
95 | coverage_matrix: awk/awk_coverage_matrix
96 | coverage_matrix_with_metadata: coverage_matrix_with_metadata
97 | out: [coverage_matrix_with_metadata_out]
98 |
99 | newick:
100 | run: newick.cwl
101 | in:
102 | main_py_script: main_py_script
103 | newick: newick
104 | newick_dimensions: newick_dimensions
105 | newick_coverage_matrix: metadata/coverage_matrix_with_metadata_out
106 | newick_metadata: newick_metadata
107 | newick_tree: newick_tree
108 | out: [metadata_out, newick_tree_out]
109 |
110 | augur:
111 | run: augur.cwl
112 | in:
113 | nextstrain_bash_script: nextstrain_bash_script
114 | newick_tree_2: newick/newick_tree_out
115 | metadata_newick: newick/metadata_out
116 | dataDir: dataDir
117 |
118 | out: [newick_json]
119 |
--------------------------------------------------------------------------------
/workflows/pubseq/generate-rdf.rb:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env ruby
2 | #
3 | # -*- coding: UTF-8 -*-
4 | #
5 | # This script transforms pass2 JSON to JSON-LD (ready for RDF)
6 | # See also https://github.com/pubseq/bh20-seq-resource/doc/blog/covid19-pubseq-update-rdf.org
7 | #
8 | # Author:: Pjotr Prins
9 | # License:: MIT
10 | #
11 | # Copyright (C) 2021 Pjotr Prins
12 | #
13 |
14 | TOOL=File.basename($0)
15 |
16 | GEMPATH = File.dirname(__FILE__) + '/../../lib/ruby'
17 | $: << File.join(GEMPATH,'lib/ruby/pubseq')
18 |
19 | VERSION_FILENAME=File.join(GEMPATH,'VERSION')
20 | VERSION = File.new(VERSION_FILENAME).read.chomp
21 |
22 | require 'optparse'
23 | require 'ostruct'
24 | require 'fileutils'
25 | require 'json'
26 | require 'zlib'
27 |
28 | options = { show_help: false, source: 'https://github.com/pubseq', version: VERSION+' (Pjotr Prins)', date: Time.now.to_s }
29 |
30 | opts = OptionParser.new do |o|
31 | o.banner = "Usage: #{TOOL} [options] path"
32 | o.on('--dir path',String, 'Path to JSON files [REQUIRED]') do |path|
33 | options[:path] = path
34 | end
35 | o.on('--out path',String, 'Dir to write to [REQUIRED]') do |path|
36 | options[:out] = path
37 | end
38 |
39 | o.separator ""
40 |
41 | o.on("-q", "--quiet", "Run quietly") do |q|
42 | # Bio::Log::CLI.trace('error')
43 | options[:quiet] = true
44 | end
45 |
46 | o.on("-v", "--verbose", "Run verbosely") do |v|
47 | options[:verbose] = true
48 | end
49 |
50 | o.on("--progress", "Show progress") do |p|
51 | options[:progress] = true
52 | end
53 |
54 | o.on("--debug", "Show debug messages and keep intermediate output") do |v|
55 | # Bio::Log::CLI.trace('debug')
56 | options[:debug] = true
57 | end
58 |
59 | o.separator ""
60 | o.on_tail('-h', '--help', 'display this help and exit') do
61 | options[:show_help] = true
62 | end
63 | end
64 |
65 | opts.parse!(ARGV)
66 |
67 | BANNER = "#{TOOL} #{VERSION} (Ruby #{RUBY_VERSION}) by Pjotr Prins 2021\n"
68 | $stderr.print BANNER if !options[:quiet]
69 |
70 | if options[:show_help]
71 | print opts
72 | exit 1
73 | end
74 |
75 | if RUBY_VERSION =~ /^[12]/
76 | $stderr.print "WARNING: #{TOOL} may not run properly on Ruby <3.x\n"
77 | end
78 |
79 | $stderr.print "Options: ",options,"\n" if !options[:quiet]
80 |
81 | GLOBAL = OpenStruct.new(options)
82 |
83 | raise "--dir directory is required" if not GLOBAL.path
84 | raise "--out directory is required" if not GLOBAL.out
85 |
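The JSON-to-JSON-LD conversion itself is not shown in the listing above. As a rough, generic illustration of the idea (not this script's actual mapping, which is defined by the PubSeq schema), converting plain JSON to JSON-LD amounts to attaching an `@context` that maps keys to RDF terms and an `@id` for the subject:

```python
import json

# Generic illustration only: the context URI and subject IRI below are placeholders,
# not the terms generate-rdf.rb actually emits.
sample = {"id": "MT810507", "collection_date": "2020-06-01"}
jsonld = {
    "@context": {"collection_date": "http://example.org/terms/collection_date"},
    "@id": f"http://example.org/sample/{sample['id']}",
    "collection_date": sample["collection_date"],
}
print(json.dumps(jsonld, indent=2))
```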
--------------------------------------------------------------------------------
/workflows/pubseq/normalize/README.md:
--------------------------------------------------------------------------------
1 | # Normalization steps
2 |
3 | This library contains generic logic to normalize (string) data and
4 | transform strings to URIs. It should be applicable to data from
5 | any source (GenBank, ENA, etc.).
6 |
7 | Important: missing data should stay missing or None! Do not fill
8 | in data by 'guessing'.
9 |
10 | When data is malformed, a warning should be logged and added to the
11 | warning list. Functions should be small enough to return at most one
12 | warning!
13 |
14 | Pjotr Prins (c) 2021
15 |
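As a minimal sketch of the contract above (illustrative only; the function name, mapping, and warning text are not the library's actual code), a normalization function could look like this:

```python
# Illustrative sketch of the normalization contract described above;
# not an actual function from this package.
def normalize_host_species(value, warnings):
    """Return a URI for a host species string, or None when the data is missing/unknown."""
    if value is None or not value.strip():
        return None                      # missing data stays missing -- never guess
    mapping = {
        "homo sapiens": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
        "human": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
    }
    uri = mapping.get(value.strip().lower())
    if uri is None:
        warnings.append(f"Unknown host species '{value}'")   # at most one warning per function
        return None
    return uri

warnings = []
print(normalize_host_species("Human", warnings))    # -> NCBITaxon URI, no warning
print(normalize_host_species("unicorn", warnings))  # -> None, one warning appended
print(warnings)
```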
--------------------------------------------------------------------------------
/workflows/pubseq/normalize/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/workflows/pubseq/normalize/__init__.py
--------------------------------------------------------------------------------
/workflows/pubseq/pubseq-fetch-data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import json
5 | import os
6 | import requests
7 | import sys
8 | import time
9 |
10 | parser = argparse.ArgumentParser(description="""
11 |
12 | Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs
13 | can be passed in on the command line or in a file.
14 |
15 | """)
16 | parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records')
17 | parser.add_argument('--out', type=str, help='Directory to write to',
18 | required=True)
19 | parser.add_argument('--ids', type=str, help='File with ids', required=False)
20 | parser.add_argument('id', nargs='*', help='id(s)')
21 | args = parser.parse_args()
22 |
23 | dir = args.out
24 | if not os.path.exists(dir):
25 | raise Exception(f"Directory {dir} does not exist")
26 |
27 | ids = args.id
28 | if (len(ids)==0):
29 | print(f"Reading {args.ids}")
30 | with open(args.ids) as f:
31 | ids = [ l.strip() for l in f.readlines() ]
32 |
33 | for id in ids:
34 | print(id)
35 | jsonfn = dir+"/"+id+".json"
36 | if not os.path.exists(jsonfn):
37 | count = 0
38 | r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
39 | while not r:
40 | count += 1
41 | if count>10: raise Exception(f"Can not find record for {id}")
42 | time.sleep(15)
43 | r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
44 | m_url = r.json()[0]['metadata']
45 | mr = requests.get(m_url)
46 | with open(dir+"/"+id+".json","w") as outf:
47 | outf.write(mr.text)
48 |     if args.fasta:
49 |         fastafn = dir+"/"+id+".fa"
50 |         if os.path.exists(fastafn): continue
51 |         # refetch the API record: r is not set when the JSON was already cached above
52 |         r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json")
53 |         fa_url = r.json()[0]['fasta']
54 |         fr = requests.get(fa_url)
55 |         with open(fastafn,"w") as outf:
56 |             outf.write(fr.text)
57 |
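The script retries the API call up to ten times with a 15 second pause between attempts. The same pattern can be factored into a small helper; this is only a sketch for illustration, not part of the script:

```python
import time
import requests

def get_with_retry(url, tries=10, pause=15):
    """Poll a URL until it returns a successful status code, as the loop above does."""
    for _ in range(tries):
        r = requests.get(url)
        if r:                   # requests.Response is truthy for status codes < 400
            return r
        time.sleep(pause)
    raise Exception(f"Can not fetch {url} after {tries} attempts")

# Example (hypothetical id):
# r = get_with_retry("http://covid19.genenetwork.org/api/sample/MT810507.json")
```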
--------------------------------------------------------------------------------
/workflows/pubseq/pubseq-fetch-ids:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | #
3 | # Use a SPARQL query to fetch all IDs in the PubSeq database
4 | #
5 | # pubseq-fetch-ids > pubseq_ids.txt
6 | #
7 | # Note: requires Ruby 3.x. Older Ruby gives a syntax error
8 | #
9 | # See also
10 |
11 | require 'net/http'
12 | require 'json'
13 | require 'ostruct'
14 | require 'erb'
15 | require 'pp'
16 |
17 | MAX=5_000
18 |
19 | SPARQL_HEADER="
20 | prefix rdfs:
21 | prefix rdf:
22 | prefix dc:
23 | prefix schema:
24 | PREFIX pubseq:
25 | "
26 |
27 | # Build a SPARQL query, submit and return results. Apply transform
28 | # lambda when passed in
29 | def sparql query, transform = nil
30 | api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}"
31 |
32 | response = Net::HTTP.get_response(URI.parse(api_url))
33 | data = JSON.parse(response.body,symbolize_names: true)
34 | data => { head: { vars: }, results: { bindings: results} } # Ruby3 destructuring
35 | vars = vars.map { |v| v.to_sym }
36 | results.map { |rec|
37 | # return results after transforming to a Hash and applying the
38 | # optional transform lambda. Note the transform can not only
39 | # reduce results, or create an array, but also may transform into
40 | # an OpenStruct.
41 | res = {}
42 | vars.each { |name| res[name] = rec[name][:value] }
43 | if transform
44 | transform.call(res)
45 | else
46 | res
47 | end
48 | }
49 | end
50 |
51 | start = 0
52 | num = MAX
53 | begin
54 | query = "
55 | SELECT DISTINCT ?id
56 | FROM
57 | WHERE {
58 |
59 | ?arvid ?id .
60 |
61 | } LIMIT #{num} OFFSET #{start}
62 | "
63 | list = sparql(query, lambda { |rec| rec[:id] })
64 | list.each do | l |
65 | print(l,"\n")
66 | end
67 | $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
68 | start += num
69 | end while list.size == MAX
70 |
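For comparison, the same LIMIT/OFFSET paging scheme can be sketched in Python; note that the SPARQL prefixes, graph IRI, and predicate were stripped from the listing above, so the query string below is a placeholder rather than the real query:

```python
import requests

ENDPOINT = "http://sparql.genenetwork.org/sparql/"
MAX = 5_000

def sparql(query):
    """Submit a SPARQL query and return the bindings as a list of dicts."""
    r = requests.get(ENDPOINT, params={
        "format": "application/sparql-results+json",
        "query": query,
    })
    data = r.json()
    names = data["head"]["vars"]
    return [{name: rec[name]["value"] for name in names if name in rec}
            for rec in data["results"]["bindings"]]

start = 0
while True:
    # placeholder query; the real one selects DISTINCT ?id from the PubSeq graph
    rows = sparql(f"SELECT DISTINCT ?id WHERE {{ ?arvid ?p ?id }} LIMIT {MAX} OFFSET {start}")
    for row in rows:
        print(row["id"])
    if len(rows) < MAX:
        break
    start += MAX
```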
--------------------------------------------------------------------------------
/workflows/pubseq/validate.rb:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env ruby
2 | #
3 | # -*- coding: UTF-8 -*-
4 | #
5 | # Metadata validation routine - does one JSON file. At this stage this
6 | # is mostly a debugging tool
7 | #
8 | # Author:: Pjotr Prins
9 | # License:: MIT
10 | #
11 | # Copyright (C) 2021 Pjotr Prins
12 | #
13 | #
14 |
15 | TOOL=File.basename($0)
16 |
17 | GEMPATH = File.dirname(__FILE__) + '/../../lib/ruby'
18 | $: << File.join(GEMPATH,'lib/ruby/pubseq')
19 |
20 | VERSION_FILENAME=File.join(GEMPATH,'VERSION')
21 | VERSION = File.new(VERSION_FILENAME).read.chomp
22 |
23 | require 'colorize'
24 | require 'optparse'
25 | require 'ostruct'
26 | require 'fileutils'
27 | require 'json'
28 | require 'zlib'
29 |
30 | options = { show_help: false, source: 'https://github.com/pubseq', version: VERSION+' (Pjotr Prins)', date: Time.now.to_s }
31 |
32 | opts = OptionParser.new do |o|
33 | o.banner = "Usage: #{TOOL} [options] path"
34 |
35 | o.separator ""
36 |
37 | o.on("-q", "--quiet", "Run quietly") do |q|
38 | # Bio::Log::CLI.trace('error')
39 | options[:quiet] = true
40 | end
41 |
42 | o.on("-v", "--verbose", "Run verbosely") do |v|
43 | options[:verbose] = true
44 | end
45 |
46 | o.on("--debug", "Show debug messages and keep intermediate output") do |v|
47 | # Bio::Log::CLI.trace('debug')
48 | options[:debug] = true
49 | end
50 |
51 | o.separator ""
52 | o.on_tail('-h', '--help', 'display this help and exit') do
53 | options[:show_help] = true
54 | end
55 | end
56 |
57 | opts.parse!(ARGV)
58 |
59 | BANNER = "#{TOOL} #{VERSION} (Ruby #{RUBY_VERSION}) by Pjotr Prins 2021\n"
60 | $stderr.print BANNER if !options[:quiet]
61 |
62 | if options[:show_help]
63 | print opts
64 | print "\nExample: ruby validate.rb MT810507.json -q|jq\n"
65 | exit 1
66 | end
67 |
68 | if RUBY_VERSION =~ /^[12]/
69 | $stderr.print "WARNING: #{TOOL} may not run properly on Ruby <3.x\n"
70 | end
71 |
72 | $stderr.print "Options: ",options,"\n" if !options[:quiet]
73 |
74 | GLOBAL = OpenStruct.new(options)
75 | $has_error = false
76 |
77 |
78 | for fn in ARGV
79 | next if fn == "state.json"
80 | json = JSON.parse(File.read(fn))
81 | meta = OpenStruct.new(json)
82 | sample = OpenStruct.new(meta.sample)
83 |
84 | error = lambda { |msg|
85 | print(json.to_json,"\n")
86 | $stderr.print "ERROR: ".red,msg.red,"\n"
87 | $has_error = true
88 | }
89 |
90 | # ---- Check for location
91 | location = meta.sample['collection_location']
92 | error.call "Missing collection_location" if not location
93 | error.call "Collection_location <#{location}> not normalized" if location !~ /^http:\/\/www.wikidata.org\/entity\/Q/
94 |
95 | # ---- Dates
96 | error.call "Sample collection_date <#{sample.collection_date}> malformed" if sample.collection_date !~ /\d\d\d\d-\d\d-\d\d/
97 |
98 | end
99 |
100 | exit 1 if $has_error
101 |
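For readers who prefer Python, the same two checks can be expressed as below; this is an illustrative rendering of the Ruby logic above, not an additional validator in the repository:

```python
import json
import re
import sys

has_error = False
for fn in sys.argv[1:]:
    if fn == "state.json":
        continue
    sample = json.load(open(fn)).get("sample") or {}
    location = sample.get("collection_location")
    if not location:
        print(f"ERROR: {fn}: missing collection_location", file=sys.stderr)
        has_error = True
    elif not re.match(r"^http://www\.wikidata\.org/entity/Q", location):
        print(f"ERROR: {fn}: collection_location <{location}> not normalized", file=sys.stderr)
        has_error = True
    date = sample.get("collection_date") or ""
    if not re.search(r"\d{4}-\d{2}-\d{2}", date):
        print(f"ERROR: {fn}: collection_date <{date}> malformed", file=sys.stderr)
        has_error = True

sys.exit(1 if has_error else 0)
```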
--------------------------------------------------------------------------------
/workflows/pubseq/wikidata/README.org:
--------------------------------------------------------------------------------
1 | This directory contains scripts to directly download
2 | data from wikidata.org
3 |
--------------------------------------------------------------------------------
/workflows/pubseq/wikidata/fetch-places.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | #
3 | # This query fetches approx 80K places
4 | #
5 | curl -G https://query.wikidata.org/sparql -H "Accept: text/csv; charset=utf-8" --data-urlencode query="
6 | SELECT DISTINCT ?placename ?place ?country ?coor ?population where {
7 | ?place wdt:P17 ?country ;
8 | wdt:P625 ?coor ;
9 | wdt:P1082 ?population .
10 | FILTER (?population > 9999)
11 | # minus { ?place wdt:P31 wd:Q3024240 } .
12 | ?place rdfs:label ?placename .
13 | FILTER (lang(?placename)='en')
14 | }
15 | "
16 |
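The same query can also be issued from Python with the requests library, which may be more convenient when the CSV needs further processing; a sketch (the User-Agent string is a placeholder) follows:

```python
import requests

# Same place query as the curl call above; Wikidata's endpoint predefines the
# wdt:/rdfs: prefixes, so no prefix header is needed here either.
QUERY = """
SELECT DISTINCT ?placename ?place ?country ?coor ?population WHERE {
  ?place wdt:P17 ?country ;
         wdt:P625 ?coor ;
         wdt:P1082 ?population .
  FILTER (?population > 9999)
  ?place rdfs:label ?placename .
  FILTER (lang(?placename)='en')
}
"""

r = requests.get(
    "https://query.wikidata.org/sparql",
    params={"query": QUERY},
    headers={
        "Accept": "text/csv; charset=utf-8",
        "User-Agent": "pubseq-places-fetch/0.1 (placeholder contact)",  # placeholder
    },
)
print(r.text)  # CSV, one row per place
```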
--------------------------------------------------------------------------------
/workflows/pubseq/wikidata/fetch-regions.sh:
--------------------------------------------------------------------------------
1 | # curl -G https://query.wikidata.org/sparql -H "Accept: text/tab-separated-values; charset=utf-8" --data-urlencode query="
2 | curl -G https://query.wikidata.org/sparql -H "Accept: text/csv; charset=utf-8" --data-urlencode query="
3 | select distinct ?placename ?place ?country ?coor ?population where {
4 | VALUES ?v { wd:Q82794 wd:Q107390 wd:Q34876 wd:Q9316670 wd:Q515 }
5 | ?statetype wdt:P279+ ?v .
6 | ?place wdt:P31 ?statetype ;
7 | wdt:P17 ?country ;
8 | wdt:P625 ?coor;
9 | wdt:P1082 ?population .
10 | FILTER (?population > 99999)
11 | ?place rdfs:label ?placename .
12 | FILTER (lang(?placename)='en')
13 | }
14 |
15 | "
16 |
--------------------------------------------------------------------------------
/workflows/pubseq/wikidata/wikidata-fetch-places.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | #
3 | # Use a SPARQL query to fetch Wikidata places
4 | #
5 | # Note: requires Ruby 3.x. Older Ruby gives a syntax error
6 | #
7 | # You may need to set
8 | #
9 | # export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt
10 | #
11 | # See also
12 |
13 | raise "Currently not used"
14 |
15 | require 'net/http'
16 | require 'json'
17 | require 'ostruct'
18 | require 'erb'
19 | require 'pp'
20 |
21 | MAX=10
22 |
23 | USER_AGENT = {'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0', "Accept": "text/csv"}
24 |
25 | SPARQL_HEADER="
26 | prefix rdfs:
27 | prefix rdf:
28 | prefix dc:
29 | prefix schema:
30 | "
31 |
32 | # Build a SPARQL query, submit and return results. Apply transform
33 | # lambda when passed in
34 | def sparql query, transform = nil
35 |
36 | api_url = 'https://query.wikidata.org/sparql'
37 | response = Net::HTTP.get_response(URI.parse(api_url),USER_AGENT)
38 | data = JSON.parse(response.body,symbolize_names: true)
39 | data => { head: { vars: }, results: { bindings: results} } # Ruby3 destructuring
40 | vars = vars.map { |v| v.to_sym }
41 | results.map { |rec|
42 | # return results after transforming to a Hash and applying the
43 | # optional transform lambda. Note the transform can not only
44 | # reduce results, or create an array, but also may transform into
45 | # an OpenStruct.
46 | res = {}
47 | vars.each { |name| res[name] = rec[name][:value] }
48 | if transform
49 | transform.call(res)
50 | else
51 | res
52 | end
53 | }
54 | end
55 |
56 | start = 0
57 | num = MAX
58 | begin
59 | query = "
60 | SELECT DISTINCT ?place ?placename ?country ?coor ?population where {
61 | ?place wdt:P17 ?country ;
62 | wdt:P625 ?coor ;
63 | wdt:P1082 ?population .
64 | FILTER (?population > 9999)
65 | ?place rdfs:label ?placename .
66 | FILTER (lang(?placename)='en')
67 | } LIMIT #{num} OFFSET #{start}
68 | "
69 | list = sparql(query) # , lambda { |rec| rec[:id] })
70 | list.each do | l |
71 | print(l,"\n")
72 | end
73 | $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress
74 | start += num
75 | exit 1
76 | end while list.size == MAX
77 |
--------------------------------------------------------------------------------
/workflows/pull-data/genbank/.gitignore:
--------------------------------------------------------------------------------
1 | fasta_and_yaml/
2 | *.tsv
3 | *.acc
4 | *.txt
5 |
--------------------------------------------------------------------------------
/workflows/pull-data/genbank/.guix-run:
--------------------------------------------------------------------------------
1 | # Set up the Guix environment with dependencies
2 | #
3 |
4 | echo # next run:
5 | echo 'export PATH=$GUIX_ENVIRONMENT/bin:$PATH'
6 |
7 | ~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby jq
8 |
9 |
--------------------------------------------------------------------------------
/workflows/pull-data/genbank/genbank-fetch-ids.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Find all genbank IDs
4 | #
5 | # genbank-fetch-ids.py --max 1000 --skip pubseq_ids.txt
6 | #
7 | # See also directory .guix-run and README.md
8 |
9 | BATCH_SIZE=5000
10 |
11 | import argparse
12 | import json
13 | import os
14 | import requests
15 | import sys
16 | import xml.etree.ElementTree as ET
17 | from datetime import date, datetime
18 | from dateutil.parser import parse
19 |
20 | parser = argparse.ArgumentParser()
21 | parser.add_argument('--max', type=int, help='Max queries', required=False)
22 | parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False)
23 | args = parser.parse_args()
24 |
25 | from Bio import Entrez
26 | Entrez.email = 'another_email@gmail.com' # FIXME
27 |
28 | # min_acceptable_collection_date = datetime(2019, 12, 1)
29 |
30 | today_date = date.today().strftime("%Y.%m.%d")
31 |
32 | skip = set()
33 | if args.skip:
34 | with open(args.skip) as f:
35 | content = f.readlines()
36 | for line in content:
37 | skip.add(line.strip())
38 |
39 | print(f"Skip size is {len(skip)}",file=sys.stderr)
40 |
41 | # Try to search several strings
42 | TERMS = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]']
43 | # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) starting with
44 | PREFIX = ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP']
45 |
46 |
47 | ids = set()
48 | for term in TERMS:
49 | num_read = BATCH_SIZE
50 | retstart = 0
51 | while num_read == BATCH_SIZE:
52 | record = Entrez.read(
53 | Entrez.esearch(db='nuccore', term=term, idtype='acc',
54 | retstart=retstart, retmax=BATCH_SIZE)
55 | )
56 | idlist = record['IdList']
57 | new_ids = set(idlist)
58 | num_read = len(new_ids)
59 | print(num_read,":",idlist[0],file=sys.stderr)
60 | retstart += num_read
61 | new_ids.difference_update(skip) # remove skip ids
62 | new_ids = set([id for id in new_ids if id[:2] not in PREFIX])
63 | ids.update(new_ids) # add to total set
64 | print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)})",file=sys.stderr)
65 | if args.max and len(ids) > args.max:
66 | print(f"Stopping past #{args.max} items",file=sys.stderr)
67 | break
68 |
69 | for id in ids:
70 | print(id)
71 |
--------------------------------------------------------------------------------
/workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Create a single YAML/FASTA for each genbank entry in GenBank XML file
4 | #
5 | # transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s)
6 | #
7 | # Also writes a validation file in the outdir named state.json
8 | # ----------------------------------------------------------------------
9 |
10 | # See also directory .guix-run and README.md
11 |
12 | import argparse
13 | import gzip
14 | import json
15 | import os
16 | import sys
17 | import types
18 | import xml.etree.ElementTree as ET
19 | from utils import chunks
20 | import genbank
21 |
22 | parser = argparse.ArgumentParser()
23 | parser.add_argument('--out', type=str, help='Directory to write to',
24 | required=True)
25 | parser.add_argument('files', nargs='+', help='file(s)')
26 | args = parser.parse_args()
27 |
28 | dir = args.out
29 | if not os.path.exists(dir):
30 | raise Exception(f"Directory {dir} does not exist")
31 |
32 | states = {}
33 |
34 | for xmlfn in args.files:
35 | print(f"--- Reading {xmlfn}")
36 | with gzip.open(xmlfn, 'r') as f:
37 | xml = f.read().decode()
38 | tree = ET.fromstring(xml)
39 | for gb in tree.findall('./GBSeq'):
40 | valid = None
41 | error = None
42 | meta = {}
43 | id = gb.find("GBSeq_locus").text
44 | basename = dir+"/"+id
45 | print(f" parsing {xmlfn} {id}")
46 | try:
47 | valid,meta = genbank.get_metadata(id,gb)
48 | if valid:
49 | # --- write JSON
50 | jsonfn = basename + ".json"
51 | with open(jsonfn, 'w') as outfile:
52 | print(f" writing {jsonfn}")
53 | json.dump(meta, outfile, indent=4)
54 | # --- write FASTA
55 | fa = basename+".fa"
56 | seq = genbank.get_sequence(id,gb)
57 | if seq:
58 | print(f" writing {fa}")
59 | with open(fa,"w") as f2:
60 | f2.write(f"> {id}\n")
61 | f2.write(seq)
62 | else:
63 | valid = False
64 | except genbank.GBError as e:
65 | error = f"{e} for {id}"
66 | print(error,file=sys.stderr)
67 | valid = False
68 | state = {}
69 | state['valid'] = valid
70 | if error:
71 | state['error'] = error
72 |         if meta.get('warnings'):  # meta may still be empty if get_metadata raised
73 | state['warnings'] = meta['warnings']
74 | states[id] = state
75 |
76 | statefn = dir + '/state.json'
77 | with open(statefn, 'w') as outfile:
78 | print(f" Writing {statefn}")
79 | json.dump(states, outfile, indent=4)
80 |
--------------------------------------------------------------------------------
/workflows/pull-data/genbank/update-from-genbank.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | #
3 | # Bulk download GenBank data and metadata, preparing the FASTA and
4 | # YAML files
5 | #
6 | # update-from-genbank.py --max 10 --ids ids.txt --out ~/tmp/genbank-xml
7 | #
8 | # See also directory .guix-run and README.md
9 |
10 | import argparse
11 | import gzip
12 | import os
13 | import sys
14 | from utils import chunks
15 |
16 | from Bio import Entrez
17 | Entrez.email = 'another_email@gmail.com' # FIXME
18 |
19 | BATCH=100
20 |
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument('--max', type=int, help='Max queries', required=False)
23 | parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True)
24 | parser.add_argument('--out', type=str, help='Directory to write to', required=True)
25 |
26 | args = parser.parse_args()
27 |
28 | ids = set()
29 | with open(args.ids) as f:
30 | content = f.readlines()
31 | for line in content:
32 | ids.add(line.strip())
33 |
34 | dir = args.out
35 | if not os.path.exists(dir):
36 | raise Exception(f"Directory {dir} does not exist")
37 |
38 | request_num = BATCH
39 | if args.max:
40 | request_num = min(BATCH,args.max)
41 |
42 | for i, idsx in enumerate(chunks(list(ids), request_num)):
43 | xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz")
44 | if os.path.exists(xmlfn):
45 | print(f"Skipping {xmlfn} ({i*request_num})",file=sys.stderr)
46 | else:
47 | print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr)
48 | with gzip.open(xmlfn, 'w') as f:
49 | f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode())
50 | if args.max and i*request_num >= args.max:
51 | break
52 |
--------------------------------------------------------------------------------
/workflows/pull-data/genbank/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | def is_integer(string_to_check):
5 | try:
6 | int(string_to_check)
7 | return True
8 | except ValueError:
9 | return False
10 |
11 |
12 | def chunks(lst, n):
13 | for i in range(0, len(lst), n):
14 | yield lst[i:i + n]
15 |
16 |
17 | def check_and_get_ontology_dictionaries(dir_ontology_dictionaries):
18 | """
19 | Check duplicated entry by looking in all dictionaries
20 | """
21 |
22 | field_to_term_to_uri_dict = {}
23 |
24 | path_dict_xxx_csv_list = [
25 | os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in
26 | os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')
27 | ]
28 |
29 | for path_dict_xxx_csv in path_dict_xxx_csv_list:
30 | print(f'Read {path_dict_xxx_csv}')
31 |
32 | with open(path_dict_xxx_csv) as f:
33 | for line in f:
34 | if len(line.split(',')) > 2:
35 | term, uri = line.strip('\n').split('",')
36 | else:
37 | term, uri = line.strip('\n').split(',')
38 |
39 | term = term.strip('"')
40 |
41 | if term in field_to_term_to_uri_dict:
42 | print(f'Warning: in the dictionaries there are more entries for the same term ({term}).')
43 | continue
44 |
45 | field_to_term_to_uri_dict[term] = uri
46 |
47 | # Prepare separated dictionaries (to avoid, for example, that a valid IRI for species is accepted as specimen)
48 | field_to_term_to_uri_dict = {}
49 |
50 | for path_dict_xxx_csv in path_dict_xxx_csv_list:
51 | field = os.path.basename(path_dict_xxx_csv).split('.')[0]
52 |
53 | field_to_term_to_uri_dict[field] = {}
54 |
55 | with open(path_dict_xxx_csv) as f:
56 | for line in f:
57 | if len(line.split(',')) > 2:
58 | term, uri = line.strip('\n').split('",')
59 | else:
60 | term, uri = line.strip('\n').split(',')
61 |
62 | term = term.strip('"')
63 |
64 | if term in field_to_term_to_uri_dict[field]:
65 | print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).')
66 | continue
67 |
68 | field_to_term_to_uri_dict[field][term] = uri
69 |
70 | return field_to_term_to_uri_dict
71 |
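A quick usage example of `chunks()`, with hypothetical accession IDs, showing how update-from-genbank.py batches its Entrez requests:

```python
from utils import chunks

ids = [f"ACC{i}" for i in range(7)]   # hypothetical accession ids
print(list(chunks(ids, 3)))
# [['ACC0', 'ACC1', 'ACC2'], ['ACC3', 'ACC4', 'ACC5'], ['ACC6']]
```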
--------------------------------------------------------------------------------
/workflows/update-workflows.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-2zp9q4jo5xpif9y fastq2fasta/fastq2fasta.cwl
3 | arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-mqfu9y3ofnpnho1 pangenome-generate/collect-seqs.cwl
4 |
--------------------------------------------------------------------------------