├── .dockstore.yml ├── .gitignore ├── .gitmodules ├── .guix-deploy ├── .guix-run ├── .guix-test ├── Dockerfile ├── LICENSE ├── README.md ├── bh20seqanalyzer ├── __init__.py └── main.py ├── bh20sequploader ├── SARS-CoV-2-reference.fasta ├── __init__.py ├── bh20seq-options.yml ├── bh20seq-schema.yml ├── bh20seq-shex.rdf ├── main.py ├── qc_fasta.py ├── qc_metadata.py └── validation │ ├── Makefile │ ├── formats │ └── formats.mgc ├── bh20simplewebuploader ├── __init__.py ├── api.py ├── main.py ├── static │ ├── blog.css │ ├── image │ │ ├── AWS-Logo.png │ │ ├── AWS.jpg │ │ ├── BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.pdf │ │ ├── BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.png │ │ ├── CWL-Logo-Header.png │ │ ├── CWL.png │ │ ├── ESR.png │ │ ├── REDCap.png │ │ ├── UTHSC-primary-stacked-logo-4c.png │ │ ├── arvados-logo.png │ │ ├── arvados-workflow-output.png │ │ ├── coronasmallcomp.gif │ │ ├── covid19biohackathon.png │ │ ├── curii.logo.ai.png │ │ ├── curii.logo.ai.svg │ │ ├── edit.png │ │ ├── oxford-nanopore.jpg │ │ ├── oxford-nanopore2.jpg │ │ ├── pubseq-aln.png │ │ ├── redcap_logo_high_res_white_on_black.svg │ │ └── redcap_logo_high_res_white_on_black.svg.png │ ├── main.css │ ├── main.js │ └── map.js └── templates │ ├── about.html │ ├── banner.html │ ├── blog.html │ ├── blurb.html │ ├── demo.html │ ├── download.html │ ├── ebi-sample.xml │ ├── error.html │ ├── export.html │ ├── footer.html │ ├── form.html │ ├── header.html │ ├── home.html │ ├── list.html │ ├── mapheader.html │ ├── menu.html │ ├── org-header.html │ ├── permalink.html │ ├── resource.html │ ├── search.html │ ├── status.html │ ├── success.html │ └── validated.html ├── data └── original_semantic_enrichment │ ├── cases_per_country.txt │ ├── countries.ttl │ ├── death_per_country.txt │ └── labels.ttl ├── doc ├── DEVELOPMENT.md ├── INSTALL.md ├── blog │ ├── covid19-pubseq-location-data.html │ ├── covid19-pubseq-location-data.org │ ├── covid19-pubseq-update-rdf.org │ ├── using-covid-19-pubseq-part1.html │ ├── using-covid-19-pubseq-part1.org │ ├── using-covid-19-pubseq-part2.html │ ├── using-covid-19-pubseq-part2.org │ ├── using-covid-19-pubseq-part3.html │ ├── using-covid-19-pubseq-part3.org │ ├── using-covid-19-pubseq-part4.html │ ├── using-covid-19-pubseq-part4.org │ ├── using-covid-19-pubseq-part5.html │ ├── using-covid-19-pubseq-part5.org │ ├── using-covid-19-pubseq-part6.html │ └── using-covid-19-pubseq-part6.org ├── talks │ └── Utrecht-20210510 │ │ └── presentation.org └── web │ ├── about.html │ ├── about.org │ ├── contact.html │ ├── contact.org │ ├── download.html │ ├── download.org │ ├── export.html │ └── export.org ├── etc └── virtuoso-ose │ └── virtuoso.ini ├── example ├── esr_example.yaml ├── maximum_metadata_example.yaml ├── minimal_metadata_example.yaml ├── sequence.fasta └── uthsc_example.yaml ├── gittaggers.py ├── image └── homepage.png ├── lib └── ruby │ └── VERSION ├── paper ├── paper.bib └── paper.md ├── scripts ├── README.md ├── cleanup.py ├── create_sra_metadata │ ├── SraExperimentPackage.2020.07.09.xml.gz │ └── create_sra_metadata.py ├── db_enrichment │ ├── .gitignore │ ├── country_enrichment.py │ ├── input_location.csv │ ├── readme.md │ └── update │ │ └── README.org ├── delete_entries_on_arvados.py ├── dict_ontology_standardization │ ├── ncbi_countries.csv │ ├── ncbi_host_health_status.csv │ ├── ncbi_host_species.csv │ ├── ncbi_sequencing_technology.csv │ └── ncbi_speciesman_source.csv ├── docker │ └── Dockerfile ├── esr_samples │ ├── Pathogen.cl.1.0.xlsx │ ├── esr_samples.py │ ├── jetson │ │ ├── 
21JETSONTEST001.consensus.yaml │ │ └── 21JETSONTEST001.fasta │ └── template.yaml ├── fasta2vcf │ ├── MZ026486.1.fasta │ ├── README.md │ ├── alignment2vcf.py │ ├── fasta2vcf.sh │ ├── resources │ │ ├── MN908947.3.fasta │ │ ├── NC_045512.2.fasta │ │ ├── NC_045512.2.fasta.fai │ │ ├── README.md │ │ └── ensembl-export.csv │ └── simpleVcfAnnotation.py ├── fetch_from_genbank.cwl ├── foreach.sh ├── gen_docs │ └── org2html.sh ├── import.cwl ├── import_from_genbank.cwl ├── import_to_arvados.py ├── split_into_arrays.cwl ├── submit_ebi │ └── example │ │ ├── project-submission.xml │ │ ├── project.xml │ │ ├── sample-submission.xml │ │ └── sample.xml ├── update_virtuoso │ └── check_for_updates.py ├── upload.cwl ├── uthsc_samples │ ├── .gitignore │ ├── template.yaml │ └── uthsc_samples.py └── utils.py ├── setup.py ├── test ├── data │ ├── 10_samples.fa │ ├── 10_samples.xlsx │ ├── input │ │ ├── TN_UT2.fa │ │ └── TN_UT2.yaml │ ├── regression │ │ └── TN_UT2.rdf │ └── test.ttl ├── rest-api.html ├── rest-api.org ├── runner.py ├── test_shex.py └── test_sparql.py └── workflows ├── fastq2fasta ├── bam2fasta.cwl ├── bcftools-concat.cwl ├── bcftools-consensus.cwl ├── bcftools-index.cwl ├── bcftools-norm.cwl ├── bcftools-view-exclude-ref.cwl ├── bcftools-view-qc.cwl ├── bcftools-view.cwl ├── bwa-index.cwl ├── bwa-mem.cwl ├── fastq2fasta-create-bwaindex.cwl ├── fastq2fasta.cwl ├── freebayes.cwl ├── samtools-faidx.cwl ├── samtools-sort.cwl └── samtools-view.cwl ├── pangenome-generate ├── abpoa.cwl ├── arv-main.cwl ├── arvados-and-samtools-dockerfile │ ├── 1078ECD7.key │ └── Dockerfile ├── collect-seqs.cwl ├── collect-seqs.py ├── dups2metadata.cwl ├── dups2metadata.py ├── from_sparql.cwl ├── from_sparql.py ├── merge-metadata.cwl ├── merge-metadata.py ├── odgi-build-from-xpoa-gfa.cwl ├── odgi-build.cwl ├── odgi_to_rdf.cwl ├── pangenome-generate.cwl ├── pangenome-generate_abpoa.cwl ├── pangenome-generate_spoa.cwl ├── query-to-gfa.cwl ├── relabel-seqs.cwl ├── relabel-seqs.py ├── seqwish.cwl ├── sort_fasta_by_quality_and_len.cwl ├── sort_fasta_by_quality_and_len.py ├── spoa.cwl └── testjob.yml ├── phylogeny ├── README.md ├── align.cwl ├── augur.cwl ├── awk-coverage.cwl ├── clado-job.yml ├── coverage.cwl ├── metadata.cwl ├── newick.cwl └── phylogeny.cwl ├── pubseq ├── generate-rdf.rb ├── normalize-step1.py ├── normalize-step2.rb ├── normalize │ ├── README.md │ ├── __init__.py │ └── mapping.py ├── pubseq-fetch-data.py ├── pubseq-fetch-ids ├── validate.rb └── wikidata │ ├── README.org │ ├── fetch-places.sh │ ├── fetch-regions.sh │ └── wikidata-fetch-places.rb ├── pull-data └── genbank │ ├── .gitignore │ ├── .guix-run │ ├── README.md │ ├── genbank-fetch-ids.py │ ├── genbank.py │ ├── transform-genbank-xml2yamlfa.py │ ├── update-from-genbank.py │ └── utils.py └── update-workflows.sh /.dockstore.yml: -------------------------------------------------------------------------------- 1 | version: 1.2 2 | workflows: 3 | - name: Pangenome Generator 4 | subclass: CWL 5 | primaryDescriptorPath: /workflows/pangenome-generate/pangenome-generate.cwl 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py~ 2 | 3 | # Distribution / packaging 4 | build/ 5 | cache.txt 6 | metadata.ttl 7 | __pycache__/ 8 | eggs/ 9 | .eggs/ 10 | *.egg-info/ 11 | *.egg 12 | 13 | # Temp files 14 | metadata.ttl 15 | metadata* 16 | cache.txt 17 | data/wikidata 18 | 19 | # Environments 20 | .env 21 | .venv 22 | env/ 23 | venv/ 24 | ENV/ 25 
| env.bak/ 26 | venv.bak/ 27 | 28 | relabeledSeqs* 29 | 30 | # Generated dirs/files 31 | metadata_from_nuccore/ 32 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "workflows/tools"] 2 | path = workflows/tools 3 | url = https://github.com/common-workflow-library/bio-cwl-tools.git 4 | -------------------------------------------------------------------------------- /.guix-deploy: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # This script runs the web server in a Guix container 4 | 5 | GUIX_PROFILE=/home/wrk/.config/guix/current 6 | export GUILE_LOAD_PATH=$GUIX_PROFILE/share/guile/site/3.0/ 7 | export GUILE_LOAD_COMPILED_PATH=$GUIX_PROFILE/share/guile/site/3.0/ 8 | 9 | ls $GUILE_LOAD_PATH 10 | 11 | env GUIX_PACKAGE_PATH=/home/wrk/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-redis python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client --share=/export/tmp -- env TMPDIR=/export/tmp FLASK_ENV=development FLASK_RUN_PORT=5067 FLASK_APP=bh20simplewebuploader/main.py flask run 12 | 13 | -------------------------------------------------------------------------------- /.guix-run: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | # 3 | # Set up a container to run the scripts 4 | 5 | GUIX_PROFILE=/home/wrk/.config/guix/current 6 | export GUILE_LOAD_PATH=$GUIX_PROFILE/share/guile/site/3.0/ 7 | export GUILE_LOAD_COMPILED_PATH=$GUIX_PROFILE/share/guile/site/3.0/ 8 | 9 | ls $GUILE_LOAD_PATH 10 | 11 | env GUIX_PACKAGE_PATH=/home/wrk/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl minimap2 python-schema-salad --share=/export/tmp 12 | 13 | -------------------------------------------------------------------------------- /.guix-test: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | # 3 | # This script runs the tests in a Guix container 4 | 5 | GUIX_PROFILE=~/.config/guix/current 6 | export GUILE_LOAD_PATH=$GUIX_PROFILE/share/guile/site/3.0/ 7 | export GUILE_LOAD_COMPILED_PATH=$GUIX_PROFILE/share/guile/site/3.0/ 8 | 9 | ls $GUILE_LOAD_PATH 10 | 11 | env GUIX_PACKAGE_PATH=~/iwrk/opensource/guix/guix-bioinformatics/ $GUIX_PROFILE/bin/guix environment -C guix --ad-hoc git python python-flask python-pyyaml python-pycurl python-magic nss-certs python-pyshex python-pyyaml --network openssl python-pyshex python-pyshexc minimap2 python-schema-salad python-arvados-python-client -- python3 test/runner.py 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for containerizing the web interface 2 | FROM python:3.6-jessie 3 | WORKDIR /app 4 | 5 | RUN pip3 install gunicorn 6 | 7 | ADD LICENSE /app/ 8 | ADD gittaggers.py /app/ 9 | ADD setup.py /app/ 10 | ADD README.md /app/ 11 | ADD example /app/example 12 | ADD bh20seqanalyzer /app/bh20seqanalyzer 13 | ADD bh20sequploader /app/bh20sequploader 14 | ADD bh20simplewebuploader /app/bh20simplewebuploader 15 | 16 | RUN pip3 install -e .[web] 17 | 18 | ENV PORT 8080 19 | CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:8080", "bh20simplewebuploader.main:app"] 20 | -------------------------------------------------------------------------------- /bh20seqanalyzer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20seqanalyzer/__init__.py -------------------------------------------------------------------------------- /bh20sequploader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20sequploader/__init__.py -------------------------------------------------------------------------------- /bh20sequploader/bh20seq-options.yml: -------------------------------------------------------------------------------- 1 | # Contains suggested human-readable field values and their corresponding IRIs. 2 | # Keyed on the field names in the types in the schema. Relies on field names 3 | # being unique or at least using the same options in different containing 4 | types.
5 | 6 | license_type: 7 | CC0 Public Domain Dedication: http://creativecommons.org/publicdomain/zero/1.0/ 8 | CC-BY-4.0 Attribution 4.0 International: http://creativecommons.org/licenses/by/4.0/ 9 | 10 | host_age_unit: 11 | Years: http://purl.obolibrary.org/obo/UO_0000036 12 | Months: http://purl.obolibrary.org/obo/UO_0000035 13 | Weeks: http://purl.obolibrary.org/obo/UO_0000034 14 | Days: http://purl.obolibrary.org/obo/UO_0000033 15 | Hours: http://purl.obolibrary.org/obo/UO_0000032 16 | 17 | host_sex: 18 | Male: http://purl.obolibrary.org/obo/PATO_0000384 19 | Female: http://purl.obolibrary.org/obo/PATO_0000383 20 | Intersex: http://purl.obolibrary.org/obo/PATO_0001340 21 | 22 | host_health_status: 23 | healthy: http://purl.obolibrary.org/obo/NCIT_C115935 24 | asymptomatic: http://purl.obolibrary.org/obo/NCIT_C3833 25 | symptomatic: http://purl.obolibrary.org/obo/NCIT_C25269 26 | admitted to hospital: http://purl.obolibrary.org/obo/GENEPIO_0002020 27 | discharged from hospital: http://purl.obolibrary.org/obo/GENEPIO_0001849 28 | dead: http://purl.obolibrary.org/obo/NCIT_C28554 29 | alive: http://purl.obolibrary.org/obo/NCIT_C37987 30 | 31 | sample_sequencing_technology: 32 | Illumina NextSeq 500: http://www.ebi.ac.uk/efo/EFO_0009173 33 | Illumina NextSeq 550: http://www.ebi.ac.uk/efo/EFO_0008566 34 | Illumina HiSeq X: http://www.ebi.ac.uk/efo/EFO_0008567 35 | Illumina MiSeq: http://www.ebi.ac.uk/efo/EFO_0004205 36 | Illumina: http://purl.obolibrary.org/obo/OBI_0000759 37 | IonTorrent: http://purl.obolibrary.org/obo/NCIT_C125894 38 | Oxford Nanopore MinION: http://www.ebi.ac.uk/efo/EFO_0008632 39 | Oxford Nanopore Sequencing: http://purl.obolibrary.org/obo/NCIT_C146818 40 | Sanger dideoxy sequencing: http://purl.obolibrary.org/obo/NCIT_C19641 41 | 42 | specimen_source: 43 | nasopharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155831 44 | oropharyngeal swab: http://purl.obolibrary.org/obo/NCIT_C155835 45 | sputum: http://purl.obolibrary.org/obo/NCIT_C13278 46 | bronchoalveolar lavage fluid: http://purl.obolibrary.org/obo/NCIT_C13195 47 | saliva: http://purl.obolibrary.org/obo/NCIT_C13275 48 | aspirate: http://purl.obolibrary.org/obo/NCIT_C13347 49 | -------------------------------------------------------------------------------- /bh20sequploader/qc_fasta.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | import tempfile 3 | import magic 4 | import subprocess 5 | import tempfile 6 | import logging 7 | import re 8 | import io 9 | import gzip 10 | 11 | log = logging.getLogger(__name__ ) 12 | 13 | def read_fasta(sequence): 14 | entries = 0 15 | bases = [] 16 | label = None 17 | for line in sequence: 18 | if line.startswith(">"): 19 | label = line 20 | entries += 1 21 | else: 22 | bases.append(line) 23 | if entries > 1: 24 | log.debug("FASTA file contains multiple entries") 25 | raise ValueError("FASTA file contains multiple entries") 26 | return label, bases 27 | 28 | def qc_fasta(arg_sequence, check_with_mimimap2=True): 29 | log.debug("Starting qc_fasta") 30 | schema_resource = pkg_resources.resource_stream(__name__, "validation/formats") 31 | with tempfile.NamedTemporaryFile() as tmp: 32 | tmp.write(schema_resource.read()) 33 | tmp.flush() 34 | val = magic.Magic(magic_file=tmp.name, 35 | uncompress=False, mime=True) 36 | 37 | gz = "" 38 | if arg_sequence.name.endswith(".gz"): 39 | sequence = gzip.GzipFile(fileobj=arg_sequence, mode='rb') 40 | gz = ".gz" 41 | else: 42 | sequence = arg_sequence 43 | 44 | sequence =
io.TextIOWrapper(sequence) 45 | r = sequence.read(4096) 46 | sequence.seek(0) 47 | 48 | seqlabel = r[1:r.index("\n")] 49 | seq_type = val.from_buffer(r).lower() 50 | 51 | if seq_type == "text/fasta": 52 | # ensure that contains only one entry 53 | submitlabel, submitseq = read_fasta(sequence) 54 | sequence.seek(0) 55 | sequence.detach() 56 | 57 | if check_with_mimimap2: 58 | with tempfile.NamedTemporaryFile() as tmp1: 59 | with tempfile.NamedTemporaryFile() as tmp2: 60 | refstring = pkg_resources.resource_string(__name__, "SARS-CoV-2-reference.fasta") 61 | tmp1.write(refstring) 62 | tmp1.flush() 63 | tmp2.write(submitlabel.encode("utf8")) 64 | tmp2.write(("".join(submitseq)).encode("utf8")) 65 | tmp2.flush() 66 | 67 | similarity = 0 68 | try: 69 | log.debug("Trying to run minimap2") 70 | cmd = ["minimap2", "-c", "-x", "asm20", tmp1.name, tmp2.name] 71 | logging.info("QC checking similarity to reference") 72 | logging.info(" ".join(cmd)) 73 | result = subprocess.run(cmd, stdout=subprocess.PIPE) 74 | result.check_returncode() 75 | res = result.stdout.decode("utf-8") 76 | mm = res.split("\t") 77 | if len(mm) >= 10: 78 | # divide Number of matching bases in the mapping / Target sequence length 79 | similarity = (float(mm[9]) / float(mm[6])) * 100.0 80 | else: 81 | similarity = 0 82 | except Exception as e: 83 | logging.warn("QC against reference sequence using 'minimap2': %s", e, exc_info=e) 84 | 85 | if similarity < 70.0: 86 | raise ValueError( 87 | f"QC fail for {seqlabel}: alignment to reference was less than 70% (was {similarity})") 88 | 89 | return "sequence.fasta" + gz, seqlabel, seq_type 90 | elif seq_type == "text/fastq": 91 | sequence.seek(0) 92 | sequence.detach() 93 | return "reads.fastq" + gz, seqlabel, seq_type 94 | else: 95 | log.debug(seqlabel) 96 | log.debug(seq_type) 97 | raise ValueError("Sequence file ({}) does not look like a DNA FASTA or FASTQ".format(arg_sequence)) 98 | -------------------------------------------------------------------------------- /bh20sequploader/qc_metadata.py: -------------------------------------------------------------------------------- 1 | import schema_salad.schema 2 | import schema_salad.ref_resolver 3 | import schema_salad.jsonld_context 4 | import logging 5 | import pkg_resources 6 | import logging 7 | import traceback 8 | from rdflib import Graph, Namespace 9 | from pyshex.evaluate import evaluate 10 | 11 | metadata_schema = None 12 | 13 | def qc_metadata(metadatafile): 14 | global metadata_schema 15 | log = logging.getLogger(__name__ ) 16 | if metadata_schema is None: 17 | schema_resource = pkg_resources.resource_stream(__name__, "bh20seq-schema.yml") 18 | cache = {"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml": schema_resource.read().decode("utf-8")} 19 | metadata_schema = schema_salad.schema.load_schema("https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-schema.yml", cache=cache) 20 | 21 | (document_loader, 22 | avsc_names, 23 | schema_metadata, 24 | metaschema_loader) = metadata_schema 25 | 26 | shex = pkg_resources.resource_stream(__name__, "bh20seq-shex.rdf").read().decode("utf-8") 27 | 28 | if not isinstance(avsc_names, schema_salad.avro.schema.Names): 29 | raise Exception(avsc_names) 30 | 31 | doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) 32 | g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx) 33 | rslt, reason = evaluate(g, shex, doc["id"], 
"https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape") 34 | 35 | # As part of QC make sure serialization works too, this will raise 36 | # an exception if there are invalid URIs. 37 | g.serialize(format="ntriples") 38 | 39 | if not rslt: 40 | raise Exception(reason) 41 | 42 | return metadata['sample']['sample_id'] 43 | -------------------------------------------------------------------------------- /bh20sequploader/validation/Makefile: -------------------------------------------------------------------------------- 1 | compile: formats.mgc 2 | 3 | formats.mgc : 4 | file -C -m formats 5 | -------------------------------------------------------------------------------- /bh20sequploader/validation/formats: -------------------------------------------------------------------------------- 1 | 0 regex \^\>.+\r?\n([A-Za-z]+\r?\n)*[A-Za-z]+(\r?\n)?$ FASTA 2 | !:mime text/fasta 3 | 0 regex \^@.+\r?\n[A-Za-z]*\n\\+.*\n[!-i]*(\r\n)? FASTQ 4 | !:mime text/fastq -------------------------------------------------------------------------------- /bh20sequploader/validation/formats.mgc: -------------------------------------------------------------------------------- 1 | �@=.^>.+ 2 | ? 3 | ([A-Za-z]+ 4 | ? 5 | )*[A-Za-z]+( 6 | ? 7 | )?$FASTAtext/fasta@=%^@.+ 8 | ? 9 | [A-Za-z]* 10 | \+.* 11 | [!-i]*( 12 | )?FASTQtext/fastq -------------------------------------------------------------------------------- /bh20simplewebuploader/__init__.py: -------------------------------------------------------------------------------- 1 | import bh20simplewebuploader.api 2 | -------------------------------------------------------------------------------- /bh20simplewebuploader/static/blog.css: -------------------------------------------------------------------------------- 1 | .title { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace } 2 | .table-of-contents { font-family: monospace; color: red; } 3 | /* .text-table-of-contents { font-family: monospace; color: black; font-size:80%; } */ 4 | .timestamp { font-family: monospace; color: darkgreen; } 5 | 6 | h1,h2 { font-family: Lucida Sans Typewriter,Lucida Console,monaco,Bitstream Vera Sans Mono,monospace; color:black;background-color:white; } 7 | h2 { color: black; } 8 | h3,h4 { color: black; margin:0; } 9 | code { color: darkblue; } 10 | body {font-family: Palatino, 'Palatino Linotype', serif; color:black; background-color:white; font-size: large; padding: 10px; } 11 | 12 | div.verbatim { margin: 30px; color: black; background-color: white; border-style:outset; 13 | font-family: palatino font, monospace; font-size:80%; font-weight:bold; } 14 | div.quote { font-family: palatino font, monospace; font-size:80%; } 15 | div.quotation { font-family: palatino font, monospace; font-size:80%; } 16 | pre.example { margin: 30px; font-family: prestige, monospace; color:black; font-size:70%; background-color: lightyellow; } 17 | pre.src { margin: 30px; font-family: prestige, monospace; font-weight: bold; color:white; font-size:80%; background-color: black; } 18 | 19 | div[id="text-table-of-contents"]{ 20 | font-family: palatino font, monospace; background-color:white; 21 | border-style: dotted; 22 | border-color: #98bf21; 23 | border-width: 1px; 24 | } 25 | div[class^="outline-text"] { 26 | margin: 10px; 27 | // background-color:white; 28 | // border-style: dotted; 29 | // border-color: #98bf21; 30 | // border-width: 1px; 31 | font-family: Palatino, 'Palatino Linotype', serif; 
color:black; font-size: large 32 | } 33 | span[class="todo TESTING"] { 34 | color:purple; 35 | } 36 | span[class="todo IN_PROGRESS"] { 37 | color:brown; 38 | } 39 | span[class^="section-number"] { 40 | color:grey; 41 | } 42 | span[class="journal"] { 43 | color:darkblue; 44 | } 45 | span[class="year"] { 46 | color:darkred; 47 | } 48 | -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/AWS-Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/AWS-Logo.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/AWS.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/AWS.jpg -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.pdf -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/BCC2020_AndreaGuarracino_COVID19PubSeq_Poster.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/CWL-Logo-Header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/CWL-Logo-Header.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/CWL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/CWL.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/ESR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/ESR.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/REDCap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/REDCap.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/UTHSC-primary-stacked-logo-4c.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/UTHSC-primary-stacked-logo-4c.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/arvados-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/arvados-logo.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/arvados-workflow-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/arvados-workflow-output.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/coronasmallcomp.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/coronasmallcomp.gif -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/covid19biohackathon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/covid19biohackathon.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/curii.logo.ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/curii.logo.ai.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/edit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/edit.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/oxford-nanopore.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/oxford-nanopore.jpg -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/oxford-nanopore2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/oxford-nanopore2.jpg -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/pubseq-aln.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/pubseq-aln.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/image/redcap_logo_high_res_white_on_black.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/static/image/redcap_logo_high_res_white_on_black.svg.png -------------------------------------------------------------------------------- /bh20simplewebuploader/static/map.js: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Draws the map using Leaflet and OpenStreetmap 4 | 5 | drawMap() is the main function. 6 | 7 | */ 8 | var map = L.map( 'mapid', { 9 | center: [51.505, -0.09], // Default to U.S.A 10 | minZoom: 2, 11 | zoom: 0 12 | }); 13 | 14 | L.tileLayer( 'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', { 15 | attribution: '© OpenStreetMap | COVID-19 PubSeq', 16 | subdomains: ['a','b','c'] 17 | }).addTo(map); 18 | 19 | /* 20 | * When a page gets rendered this function draws the map 21 | */ 22 | 23 | function drawMap(){ 24 | var mymap = map; 25 | 26 | // ---- fetch all counts 27 | fetch(scriptRoot + "api/getCountByGPS") 28 | .then(response => { 29 | console.log(response) 30 | return response.json(); 31 | }) 32 | .then(data => { 33 | buildMapMarkers(data); 34 | 35 | }); 36 | document.getElementById("map_view").classList.remove("invisible"); 37 | map.invalidateSize(); 38 | } 39 | 40 | 41 | /* 42 | * Register a marker with special attribute track # sequences 43 | */ 44 | 45 | seqMarker = L.Marker.extend({ 46 | options: { 47 | seqMarkerLocation: "Loc", 48 | contributors: "countContrib", 49 | sequences: "countSeq" 50 | } 51 | }); 52 | 53 | /* 54 | * Builds markers on the map. We use cluster groups to allow 55 | * counts at different zoom levels. This function is called 56 | * once on page loading. markerClusterGroup just handles it. 57 | * Note the display is handled in CSS (main.css) as .my-custom-icon* 58 | */ 59 | 60 | function buildMapMarkers(data) { 61 | let markers = L.markerClusterGroup({ 62 | singleMarkerMode: true, 63 | iconCreateFunction: function (cluster) { 64 | // ---- add marker 65 | // array of each marker in the cluster: 66 | var theseMarkers = cluster.getAllChildMarkers(); 67 | 68 | // --- compute zoom level and set style 69 | 70 | sumCount = 0; 71 | for (var i = 0; i < theseMarkers.length; i++) { 72 | sumCount += theseMarkers[i].options.sequences; 73 | } 74 | 75 | if (theseMarkers.length < 2) { 76 | return L.divIcon({ 77 | html: sumCount, 78 | className: 'my-custom-icon my-custom-icon-0', 79 | }) 80 | } else { 81 | var digits = (sumCount + '').length; 82 | return L.divIcon({ 83 | html: sumCount, 84 | className: 'my-custom-icon my-custom-icon-'+digits, 85 | }); 86 | }}}); 87 | // ---- Build the marker list 88 | for (let i = 0; i < data.length; i++) { 89 | let {"count": fastaCount, GPS, Location: location, LocationLabel: label } = data[i]; 90 | let countSeq = Number(fastaCount); 91 | 92 | let coordinates = GPS.split(" "); 93 | if (!(coordinates == null)) { 94 | let lat, lon; 95 | [lon, lat] = coordinates.map(parseFloat); 96 | let point = L.point() 97 | marker = new seqMarker([lat, lon],markerOptions={title: fastaCount+" sequences",sequences: countSeq}); 98 | marker.bindPopup("" + label + "
" + "SARS-CoV-2
sequences: " +fastaCount + ""); 99 | markers.addLayer(marker); 100 | } 101 | } 102 | map.addLayer(markers); 103 | } 104 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/about.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'org-header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 | {{ embed|safe }} 9 | 10 | {% include 'footer.html' %} 11 | 12 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/banner.html: -------------------------------------------------------------------------------- 1 |
2 | 3 |

COVID-19 PubSeq: Public SARS-CoV-2 Sequence Resource

4 | 5 |

public sequences ready for download!

6 | 7 | May 2021 update: we are now at 86,377 sequences with normalized metadata on AWS OpenData! 8 |
9 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/blog.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 | {% if embed %} 9 | {{ embed|safe }} 10 |
11 | 12 |

Other documents

13 | 14 | {% else %} 15 | 16 |

Documents:

17 | {% endif %} 18 | 19 |
20 |
21 |
22 |
23 | 26 |
27 | We fetch sequence data and metadata. We query 28 | the metadata in multiple ways using SPARQL and ontologies
30 |
31 |
32 | 35 |
36 | We submit a sequence to the database. In this BLOG we fetch 37 | a sequence from GenBank and add it to the database. 38 |
39 |
40 |
41 |
42 | Modify workflow 43 |
44 |
45 | We modify a workflow to get new output 46 |
47 |
48 |
49 |
50 | Modify metadata 51 |
52 |
53 | We modify metadata for all to use! In this BLOG we add a field 54 | for a Creative Commons license. 55 |
56 |
57 |
58 |
59 | Geo information 60 |
61 |
62 | Dealing with PubSeq localisation data 63 |
64 |
65 |
66 | 69 |
70 | We explore the Arvados command line and API 71 |
72 |
73 |
74 | 77 |
78 | Generate the files needed for uploading to EBI/ENA 79 |
80 |
81 |
82 |
83 | REST API 84 |
85 |
86 | Documentation for PubSeq REST API 87 |
88 |
89 |
90 |
91 |
92 | 93 | {% include 'footer.html' %} 94 | 95 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/blurb.html: -------------------------------------------------------------------------------- 1 |

2 | COVID-19 PubSeq is a free and open online bioinformatics public 3 | sequence resource with federated data using unique identifiers and 4 | with unique metadata, such as disambiguated 5 | 6 | Geo localisation. PubSeq comes with on-the-fly analysis of 7 | sequenced SARS-CoV-2 samples that allows for a quick turnaround in 8 | identification of new virus strains. PubSeq allows anyone to upload 9 | sequence material in the form of FASTA or FASTQ files with 10 | accompanying metadata through a web interface or REST API. 11 |
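As a sketch of what a scripted submission could look like, here is a minimal Python example assuming a single multipart POST; the endpoint path and the form-field names in it are hypothetical placeholders for illustration, not the documented PubSeq API (see the REST API documentation for the real interface). The example files it references do ship in this repository's example/ directory.

```python
# Hypothetical sketch of a scripted PubSeq submission.  The endpoint
# path ("/submit") and the form-field names ("fasta", "metadata") are
# illustrative assumptions, NOT the documented PubSeq API.
import requests

def upload_sequence(fasta_path, metadata_path,
                    url="http://covid19.genenetwork.org/submit"):
    # Send the FASTA file and the metadata YAML as one multipart POST.
    with open(fasta_path, "rb") as fasta, open(metadata_path, "rb") as metadata:
        response = requests.post(url, files={"fasta": fasta,
                                             "metadata": metadata})
    response.raise_for_status()
    return response.text

if __name__ == "__main__":
    # These example files are included in this repository.
    print(upload_sequence("example/sequence.fasta",
                          "example/minimal_metadata_example.yaml"))
```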

12 |

13 | PubSeq is not owned by anyone. There is no central authority and 14 | there is no (single) company that owns the data or workflows. Our 15 | goal is simply to help map the viral variants. Early identification 16 | of variants helps with testing and treatments! COVID-19 PubSeq 17 | accepts sequence material from all sources. In addition, PubSeq has 18 | specific workflows for Oxford Nanopore analysis in FAST5 and FASTQ 19 | format. If you have an Oxford Nanopore and need (free) help 20 | analysing SARS-CoV-2 FAST5 or FASTQ data, feel free 21 | to contact us! 22 |

23 |

24 | COVID-19 PubSeq is also a repository for sequences with a low 25 | barrier to entry for uploading sequence data using best practices, 26 | including FAIR 27 | data. Data are published with metadata using state-of-the-art 28 | standards and, perhaps most importantly, providing standardised 29 | workflows that get triggered on upload, so that results are 30 | immediately available in standardised data formats. Note that, in 31 | general, there is no conflict in also uploading your data to other 32 | repositories, including EBI/ENA and GISAID. 33 |

34 |

35 | Your uploaded sequence will automatically be processed and 36 | incorporated into the public pangenome with metadata using workflows 37 | from the High Performance Open Biology Lab 38 | defined here. Importantly, all 39 | data is published under 40 | a Creative 41 | Commons license (CC0 or CC-BY-4.0). Anyone can take the 42 | published (GFA/RDF/FASTA) data and use it for 43 | further processing. 44 |
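As a sketch of such further processing, the published RDF metadata can be inspected with rdflib; this assumes a local copy of the Turtle export (the file name metadata.ttl is an assumption taken from this repository's conventions, not a fixed contract).

```python
# Sketch: inspect the published PubSeq RDF metadata with rdflib.
# Assumes a local Turtle export, e.g. a metadata.ttl fetched from the
# download page; the exact file name may differ.
from collections import Counter
from rdflib import Graph

graph = Graph()
graph.parse("metadata.ttl", format="turtle")

# Count how often each predicate occurs: a quick overview of the schema.
predicate_counts = Counter(str(predicate) for _, predicate, _ in graph)
for predicate, count in predicate_counts.most_common(10):
    print(f"{count:8d}  {predicate}")
```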

45 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 |

The Virtuoso database contains public sequences! The examples here should provide a starting point to explore our data at our public SPARQL endpoint or via the SIB COVID-19 Integrated Knowledgebase. See also our documentation here for more information!
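As a minimal sketch of querying the endpoint from Python (the endpoint URL below is the public one documented elsewhere on this site; the query is a schema-agnostic triple count, so it does not depend on the exact data model):

```python
# Sketch: run a SPARQL query against the public PubSeq endpoint.  The
# query is a schema-agnostic triple count; result parsing assumes the
# standard SPARQL 1.1 JSON results format served by Virtuoso.
import requests

ENDPOINT = "http://sparql.genenetwork.org/sparql/"
QUERY = "SELECT (COUNT(*) AS ?triples) WHERE { ?s ?p ?o }"

response = requests.get(ENDPOINT,
                        params={"query": QUERY},
                        headers={"Accept": "application/sparql-results+json"},
                        timeout=60)
response.raise_for_status()
for binding in response.json()["results"]["bindings"]:
    print("triples in the store:", binding["triples"]["value"])
```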

9 | 20 | 21 |
22 |
23 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 44 |
45 | 46 |
47 | 48 |
49 | 51 | 52 | 53 |
54 |
55 |
56 | 57 | {% include 'footer.html' %} 58 | 59 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/download.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'org-header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 | {{ embed|safe }} 9 | 10 | {% include 'footer.html' %} 11 | 12 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/ebi-sample.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | COVID-19 PubSeq Sample 5 | 6 | 2697049 7 | Severe acute respiratory syndrome coronavirus 2 8 | SARS-CoV-2 9 | 10 | 11 | 12 | investigation type 13 | {{ investigation_type }} 14 | 15 | 16 | sequencing method 17 | {{ sequencer }} 18 | 19 | 20 | collection date 21 | {{ date }} 22 | 23 | 24 | geographic location (latitude) 25 | {{ latidude }} 26 | DD 27 | 28 | 29 | geographic location (longitude) 30 | {{ longitude }} 31 | DD 32 | 33 | 34 | geographic location (country and/or sea) 35 | {{ country }} 36 | 37 | 38 | geographic location (region and locality) 39 | {{ locality }} 40 | 41 | 42 | environment (material) 43 | {{ specimen }} 44 | 45 | 46 | ENA-CHECKLIST 47 | ERC000011 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/error.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Upload Failed 7 | 8 | 9 |

Upload Failed

10 |
11 |

12 | Your upload has failed. 13 |

14 |             {{error_message|safe}}
15 |           
16 |

17 |

18 | Click here to try again. 19 |

20 |
21 | 22 | 23 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/export.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 | 9 | {% if embed %} 10 | {{ embed|safe }} 11 | {% endif %} 12 | 13 | 21 | 22 |
23 | 24 |
25 | 26 | {% include 'footer.html' %} 27 | 28 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/footer.html: -------------------------------------------------------------------------------- 1 |
2 | 62 | {% if load_map %} 63 | 64 | {% endif %} 65 | 66 | 67 | 80 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | COVID-19 PubSeq: Public SARS-CoV-2 Sequence Resource 9 | {% if blog %} 10 | 11 | {% endif %} 12 | 13 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/list.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 |
9 | 10 |

{{id}}

11 | 12 |
13 |
14 |
15 | {% for row in l: %} 16 |
17 |
18 | {% for col in h: %} 19 |
20 | {% if col == 'name': %} 21 | {{ row[col]['value'] }} 22 | {% else %} 23 | {{ row[col]['value'] }} 24 | {% endif %} 25 |
26 | {% endfor %} 27 |
28 |
29 | {% endfor %} 30 |
31 |
32 | 33 |
34 | 35 | {% include 'footer.html' %} 36 | 37 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/mapheader.html: -------------------------------------------------------------------------------- 1 | 4 | 7 | 10 | 11 | 14 | 17 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/menu.html: -------------------------------------------------------------------------------- 1 | 16 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/permalink.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 |
9 | 10 |

{{id}}

11 | 12 |

13 | This page represents a permanent COVID-19 PubSeq SARS-CoV-2 sequence resource

15 | 16 |
17 |
18 |
19 |
20 |
21 |
22 | Identifier 23 |
24 |
25 | {{id}} 26 |
27 |
28 |
29 |
30 |
31 |
32 | Permanent link 33 |
34 |
35 | {{uri}} 36 |
37 |
38 |
39 |
40 |
41 |
42 | Location 43 |
44 |
45 | {{location}} 46 |
47 |
48 |
49 |
50 |
51 |
52 | Sampling date 53 |
54 |
55 | {{ date }} 56 |
57 |
58 |
59 |
60 |
61 |
62 | Institute 63 |
64 |
65 | {{ institute }} 66 |
67 |
68 |
69 |
70 |
71 |
72 | Sample type 73 |
74 |
75 | {{sampletype}} 76 |
77 |
78 |
79 |
80 |
81 |
82 | Sequence 83 |
84 |
85 | {{sequenceuri}} 86 |
87 |
88 |
89 |
90 |
91 |
92 | Metadata 93 |
94 |
95 | {{metauri}} 96 |
97 |
98 |
99 |
100 |
101 |
102 | Source 103 |
104 |
105 | {{source}} 106 |
107 |
108 |
109 |
110 |
111 | 112 |
113 | 114 | {% include 'footer.html' %} 115 | 116 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/resource.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 |
9 | 10 |

All sequences project

11 |

All sequences (FASTA) relabeled and deduplicated

12 |

Metadata (RDF) for all sequences

13 | 14 | 15 | 16 | 17 | 18 |

SPARQL endpoint - Sample query for accessions 19 | 20 | {{ embed|safe }} 21 | 22 |

23 | 24 | {% include 'footer.html' %} 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/search.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/bh20simplewebuploader/templates/search.html -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/status.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 |

Sequence upload processing status

9 | 10 |
11 | 12 | {{ table }} 13 |
14 | 15 | {% include 'footer.html' %} 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/success.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Upload Successful 7 | 8 | 9 |

Upload Successful

10 |
11 |

12 | Your files have been uploaded. You can track their QC status; once validated, they will become part of the Public SARS-CoV-2 Sequence Resource.

14 |

15 | The upload log was: 16 |

17 |
{{log}}
18 |
19 |

20 | Click here to upload more files. 21 |

22 |
23 | 24 | 25 | -------------------------------------------------------------------------------- /bh20simplewebuploader/templates/validated.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'header.html' %} 4 | 5 | {% include 'banner.html' %} 6 | {% include 'menu.html' %} 7 | 8 |

Validated sequences

9 | 10 |
11 | {{ table }} 12 |
13 | 14 | {% include 'footer.html' %} 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /doc/DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Upload resume 4 | 5 | When data files get large we may want to implement resume, 6 | like put does. See 7 | [/sdk/python/arvados/commands/put.py](https://dev.arvados.org/projects/arvados/repository/revisions/master/entry/sdk/python/arvados/commands/put.py) 8 | -------------------------------------------------------------------------------- /doc/blog/using-covid-19-pubseq-part4.org: -------------------------------------------------------------------------------- 1 | #+TITLE: COVID-19 PubSeq (part 4) 2 | #+AUTHOR: Pjotr Prins 3 | # C-c C-e h h publish 4 | # C-c ! insert date (use . for active agenda, C-u C-c ! for date, C-u C-c . for time) 5 | # C-c C-t task rotate 6 | # RSS_IMAGE_URL: http://xxxx.xxxx.free.fr/rss_icon.png 7 | 8 | #+HTML_HEAD: 9 | 10 | 11 | * Table of Contents :TOC:noexport: 12 | - [[#what-does-this-mean][What does this mean?]] 13 | - [[#where-can-i-find-the-workflows][Where can I find the workflows?]] 14 | - [[#modify-workflow][Modify Workflow]] 15 | 16 | * What does this mean? 17 | 18 | When someone uploads a SARS-CoV-2 sequence using one 19 | of our tools (CLI or web-based), they add a sequence and some metadata, 20 | which triggers a rerun of our workflows. 21 | 22 | * Where can I find the workflows? 23 | 24 | Workflows are written in the Common Workflow Language (CWL) and listed 25 | on [[https://github.com/arvados/bh20-seq-resource/tree/master/workflows][github]]. PubSeq being an open project, these workflows can be studied 26 | and modified! 27 | 28 | * Modify Workflow 29 | 30 | /Work in progress!/ 31 | -------------------------------------------------------------------------------- /doc/web/contact.org: -------------------------------------------------------------------------------- 1 | #+TITLE: CONTACT 2 | #+AUTHOR: Pjotr Prins 3 | 4 | * CONTACT and SUPPORT 5 | 6 | COVID-19 PubSeq is run by a community of [[https://github.com/arvados/bh20-seq-resource/graphs/contributors][bioinformaticians]] and 7 | software developers working at leading institutes (see sponsors below) 8 | with the goal of making online analysis available to everyone. You can 9 | talk with us directly in the [[https://matrix.to/#/!kyAxaAAAOgUKAMmXRz:matrix.org?via=matrix.org][matrix PubSeq room]]. We are open to 10 | improving tools, workflows and analysis. 11 | 12 | ** Oxford Nanopore Analysis 13 | 14 | @@html: @@ 15 | 16 | We run [[https://en.wikipedia.org/wiki/Oxford_Nanopore_Technologies][Oxford Nanopore]] ourselves. It is an exciting technology because 17 | it gives us an accurate SARS-CoV-2 sequencer for a few thousand 18 | dollars, which can be run in a living room! With PubSeq we aim to make 19 | it easy to analyse Nanopore material using our *free* Cloud 20 | infrastructure and the [[https://github.com/pubseq/jetson_nanopore_sequencing][NVIDIA Jetson computer]]. If you need help in 21 | using the online workflows, don't hesitate to contact us. 22 | 23 | @@html:


@@ 24 | 25 | ** Data from other sequencers 26 | 27 | We accept FASTA sequences of SARS-CoV-2. Simply upload them using the 28 | web form and/or REST API. No sign-up required! If you have large-scale 29 | short-read data and require support, we can discuss that. We also run 30 | Illumina sequencing in several places. 31 | 32 | ** Professional support 33 | 34 | To use COVID-19 PubSeq solutions for professional purposes you can 35 | contact Boston-based [[mailto:info@curii.com][Curii]], the creators of [[https://arvados.org/][Arvados]], directly. 36 | 37 | COVID-19 PubSeq is built on Arvados using CWL workflows. 38 | 39 | ** E-mail 40 | 41 | For questions feel free to write directly to [[mailto:pjotr.public821@thebird.nl][Pjotr Prins]]. 42 | -------------------------------------------------------------------------------- /doc/web/export.org: -------------------------------------------------------------------------------- 1 | #+TITLE: About/FAQ 2 | #+AUTHOR: Pjotr Prins 3 | 4 | * Table of Contents :TOC:noexport: 5 | - [[#export-data][Export data]] 6 | - [[#sparql-api][SPARQL API]] 7 | - [[#rest-api][REST API]] 8 | - [[#export-ebiena-forms][Export EBI/ENA Forms]] 9 | 10 | * Export data 11 | 12 | Apart from straight file [[http://covid19.genenetwork.org/download][downloads]], COVID-19 PubSeq allows for 13 | exporting forms and data for other services. 14 | 15 | * SPARQL API 16 | 17 | 18 | First of all, PubSeq exports a SPARQL endpoint [[http://sparql.genenetwork.org/sparql/][here]] that allows you to 19 | do any query on the data. See this [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part1][document]] for examples. 20 | 21 | * REST API 22 | 23 | In addition to the above flexible SPARQL endpoint - which is essentially 24 | a query REST API - PubSeq exports its own 25 | [[http://covid19.genenetwork.org/apidoc][REST API]]. 26 | 27 | * Export EBI/ENA Forms 28 | 29 | Uploading data to EBI/ENA with PubSeq is described [[http://covid19.genenetwork.org/blog?id=using-covid-19-pubseq-part6][here]].
30 | 31 | To export, first search for an uploaded entry through its identifier: 32 | -------------------------------------------------------------------------------- /example/esr_example.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | license: 4 | license_type: http://creativecommons.org/licenses/by/4.0/ 5 | title: "SARS-CoV-2 New Zealand" 6 | attribution_name: "ESR" 7 | attribution_url: https://www.esr.cri.nz/ 8 | 9 | host: 10 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 11 | additional_host_information: Optional free text field for additional information 12 | 13 | sample: 14 | sample_id: "20VR0174" 15 | collection_date: "2020-02-26" 16 | collection_location: https://www.wikidata.org/wiki/Q37100 17 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831] 18 | source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] 19 | additional_collection_information: Optional free text field for additional information 20 | 21 | virus: 22 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 23 | virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 24 | 25 | technology: 26 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION 27 | alignment_protocol: https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics 28 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" 29 | additional_technology_information: "Artic V3 workflow" 30 | 31 | submitter: 32 | authors: [Jemma L Geoghegan, Xiaoyun Ren, Matthew Storey, James Hadfield, Lauren Jelley, Sarah Jefferies, Jill Sherwood, Shevaun Paine, Sue Huang, Jordan Douglas, Fabio K Mendes, Andrew Sporle, Michael G Baker, David R Murdoch, Nigel French, Colin R Simpson, David Welch, Alexei J Drummond, Edward C Holmes, Sebastian Duchene, Joep de Ligt] 33 | submitter_name: [Joep de Ligt] 34 | submitter_address: "PO Box 50348, Porirua 5240, New Zealand" 35 | originating_lab: ESR 36 | submitter_sample_id: "PRJNA648792" 37 | submitted_to: https://www.ncbi.nlm.nih.gov/biosample 38 | publication: https://doi.org/10.1101/2020.08.05.20168930 39 | public_date: "2020-08-20" 40 | submitter_orcid: [https://orcid.org/0000-0003-0970-0153] 41 | additional_submitter_information: Optional free text field for additional information 42 | -------------------------------------------------------------------------------- /example/maximum_metadata_example.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | license: 4 | license_type: http://creativecommons.org/licenses/by/4.0/ 5 | title: "Sample" 6 | attribution_name: "John doe, Joe Boe, Jonny Oe" 7 | attribution_url: http://covid19.genenetwork.org/id 8 | 9 | host: 10 | host_id: XX1 11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 12 | host_sex: http://purl.obolibrary.org/obo/PATO_0000384 13 | host_age: 20 14 | host_age_unit: http://purl.obolibrary.org/obo/UO_0000036 15 | host_health_status: http://purl.obolibrary.org/obo/NCIT_C25269 16 | host_treatment: Process in which the act is intended to modify or alter host status (Compounds) 17 | host_vaccination: [vaccines1,vaccine2] 18 | ethnicity: http://purl.obolibrary.org/obo/HANCESTRO_0010 19 | additional_host_information: Optional free text field for additional information 20 | 21 | sample: 22 | sample_id: Id of the sample as defined by the submitter 23 | collector_name: Name of the person that took the sample 24 | collecting_institution: Institute that was 
responsible of sampling 25 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831,http://purl.obolibrary.org/obo/NCIT_C155835] 26 | collection_date: "2020-01-01" 27 | collection_location: http://www.wikidata.org/entity/Q148 28 | sample_storage_conditions: frozen specimen 29 | source_database_accession: [http://identifiers.org/insdc/LC522350.1#sequence] 30 | additional_collection_information: Optional free text field for additional information 31 | 32 | virus: 33 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 34 | virus_strain: SARS-CoV-2/human/CHN/HS_8/2020 35 | 36 | technology: 37 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0009173,http://www.ebi.ac.uk/efo/EFO_0009173] 38 | alignment_protocol: Protocol used for assembly 39 | sequencing_coverage: [70.0, 100.0] 40 | additional_technology_information: Optional free text field for additional information 41 | 42 | submitter: 43 | authors: [John Doe, Joe Boe, Jonny Oe] 44 | submitter_name: [John Doe] 45 | submitter_address: John Doe's address 46 | originating_lab: John Doe kitchen 47 | lab_address: John Doe's address 48 | provider: XXX1 49 | submitter_sample_id: XXX2 50 | publication: PMID00001113 51 | submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001] 52 | additional_submitter_information: Optional free text field for additional information 53 | -------------------------------------------------------------------------------- /example/minimal_metadata_example.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | 4 | license: 5 | license_type: http://creativecommons.org/licenses/by/4.0/ 6 | 7 | host: 8 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 9 | 10 | sample: 11 | sample_id: XX 12 | collection_date: "2020-01-01" 13 | collection_location: http://www.wikidata.org/entity/Q148 14 | 15 | virus: 16 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 17 | 18 | technology: 19 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] 20 | 21 | submitter: 22 | authors: [John Doe] 23 | -------------------------------------------------------------------------------- /example/uthsc_example.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | license: 4 | license_type: https://creativecommons.org/licenses/by/4.0/ 5 | title: "Sample" 6 | attribution_name: "Mariah Taylor, Colleen Jonsson" 7 | attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php 8 | 9 | host: 10 | host_id: TN_UT2 11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 12 | additional_host_information: Optional free text field for additional information 13 | 14 | sample: 15 | sample_id: TN_UT2 16 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831] 17 | collection_date: "2020-04-26" 18 | collection_location: https://www.wikidata.org/wiki/Q3289517 19 | additional_collection_information: Optional free text field for additional information 20 | 21 | virus: 22 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 23 | virus_strain: SARS-CoV-2/human/USA/AL_UT14/2020 24 | 25 | technology: 26 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] # Nanopore MinION 27 | alignment_protocol: guppy 28 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" 29 | additional_technology_information: Optional free text field for additional information 
30 | 31 | submitter: 32 | authors: [Mariah Taylor, Colleen Jonsson] 33 | submitter_name: [Mariah Taylor] 34 | submitter_address: UTHSC, Memphis, Tennessee 38163, USA 35 | originating_lab: Regional Biocontainment Laboratory 36 | provider: XXX1 37 | submitter_sample_id: XXX2 38 | publication: PMID00001113 39 | submitter_orcid: [https://orcid.org/0000-0000-0000-0000,https://orcid.org/0000-0000-0000-0001] 40 | additional_submitter_information: Optional free text field for additional information 41 | -------------------------------------------------------------------------------- /gittaggers.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import pkg_resources 4 | from setuptools.command.egg_info import egg_info 5 | 6 | SETUPTOOLS_VER = pkg_resources.get_distribution( 7 | "setuptools").version.split('.') 8 | 9 | RECENT_SETUPTOOLS = int(SETUPTOOLS_VER[0]) > 40 or ( 10 | int(SETUPTOOLS_VER[0]) == 40 and int(SETUPTOOLS_VER[1]) > 0) or ( 11 | int(SETUPTOOLS_VER[0]) == 40 and int(SETUPTOOLS_VER[1]) == 0 and 12 | int(SETUPTOOLS_VER[2]) > 0) 13 | 14 | class EggInfoFromGit(egg_info): 15 | """Tag the build with git commit timestamp. 16 | 17 | If a build tag has already been set (e.g., "egg_info -b", building 18 | from source package), leave it alone. 19 | """ 20 | 21 | def git_timestamp_tag(self): 22 | gitinfo = subprocess.check_output( 23 | ['git', 'log', '--first-parent', '--max-count=1', 24 | '--format=format:%ct', '.']).strip() 25 | return time.strftime('.%Y%m%d%H%M%S', time.gmtime(int(gitinfo))) 26 | 27 | def tags(self): 28 | if self.tag_build is None: 29 | try: 30 | self.tag_build = self.git_timestamp_tag() 31 | except subprocess.CalledProcessError: 32 | pass 33 | return egg_info.tags(self) 34 | 35 | if RECENT_SETUPTOOLS: 36 | vtags = property(tags) 37 | -------------------------------------------------------------------------------- /image/homepage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/image/homepage.png -------------------------------------------------------------------------------- /lib/ruby/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | @book{CWL, 2 | title = "Common Workflow Language, v1.0", 3 | abstract = "The Common Workflow Language (CWL) is an informal, multi-vendor working group consisting of various organizations and individuals that have an interest in portability of data analysis workflows. Our goal is to create specifications that enable data scientists to describe analysis tools and workflows that are powerful, easy to use, portable, and support reproducibility. CWL builds on technologies such as JSON-LD and Avro for data modeling and Docker for portable runtime environments. 
CWL is designed to express workflows for data-intensive science, such as Bioinformatics, Medical Imaging, Chemistry, Physics, and Astronomy. This is v1.0 of the CWL tool and workflow specification, released on 2016-07-08", 4 | keywords = "cwl, workflow, specification", 5 | author = "Brad Chapman and John Chilton and Michael Heuer and Andrey Kartashov and Dan Leehr and Herv{\'e} M{\'e}nager and Maya Nedeljkovich and Matt Scales and Stian Soiland-Reyes and Luka Stojanovic", 6 | editor = "Peter Amstutz and Crusoe, {Michael R.} and Nebojša Tijanić", 7 | note = "Specification, product of the Common Workflow Language working group. http://www.commonwl.org/v1.0/", 8 | year = "2016", 9 | month = "7", 10 | day = "8", 11 | doi = "10.6084/m9.figshare.3115156.v2", 12 | language = "English", 13 | publisher = "figshare", 14 | address = "United States", 15 | 16 | } -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | ### Instructions to download and/or prepare the data and/or the metadata 2 | 3 | Just go into the `download_genbank_data` or `download_sra_data` directory and execute the python3 script inside. 4 | 5 | - `download_genbank_data/from_genbank_to_fasta_and_yaml.py` downloads the data and the metadata, preparing the FASTA and the YAML files; 6 | - `download_sra_data/download_sra_data.py` creates the metadata in the form of YAML files from the SraExperimentPackage.XXX.xml.gz file in the same directory. 7 | -------------------------------------------------------------------------------- /scripts/cleanup.py: -------------------------------------------------------------------------------- 1 | import arvados 2 | import arvados.util 3 | import arvados.keep 4 | import ruamel.yaml 5 | 6 | api = arvados.api() 7 | keepclient = arvados.keep.KeepClient(api_client=api) 8 | 9 | UPLOADER_PROJECT = 'lugli-j7d0g-n5clictpuvwk8aa' 10 | VALIDATED_PROJECT = 'lugli-j7d0g-5ct8p1i1wrgyjvp' 11 | 12 | delete_patterns = [ 13 | "%missing%`collection_location`%", 14 | "%missing%`technology`%", 15 | "%missing%`host_species`%", 16 | "%QC fail: alignment%", 17 | "%does not look like a valid URI%", 18 | "%Duplicate of%", 19 | "%No matching triples found for predicate obo:NCIT_C42781%", 20 | "%does not look like a valid URI%" 21 | ] 22 | 23 | revalidate_patterns = [ 24 | "%missing%`license`%", 25 | "%QC fail%" 26 | ] 27 | 28 | for p in delete_patterns: 29 | c = arvados.util.list_all(api.collections().list, filters=[ 30 | ["owner_uuid", "=", UPLOADER_PROJECT], 31 | ["properties.errors", "like", p]]) 32 | for i in c: 33 | print("trashing %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) 34 | api.collections().delete(uuid=i["uuid"]).execute() 35 | 36 | for p in revalidate_patterns: 37 | c = arvados.util.list_all(api.collections().list, filters=[ 38 | ["owner_uuid", "=", UPLOADER_PROJECT], 39 | ["properties.errors", "like", p]]) 40 | for i in c: 41 | print("clearing status %s %s" % (i["uuid"], i["properties"].get("sequence_label"))) 42 | pr = i["properties"] 43 | if "status" in pr: 44 | del pr["status"] 45 | if "errors" in pr: 46 | del pr["errors"] 47 | api.collections().update(uuid=i["uuid"], body={"properties": pr}).execute() 48 | 49 | c = arvados.util.list_all(api.collections().list, filters=[ 50 | ["owner_uuid", "=", VALIDATED_PROJECT], 51 | ["properties.sequence_label", "exists", False]]) 52 | for i in c: 53 | col = arvados.collection.Collection(i["uuid"], api_client=api, 
keep_client=keepclient) 54 | with col.open("metadata.yaml") as md: 55 | metadata_content = ruamel.yaml.round_trip_load(md) 56 | colprop = col.get_properties() 57 | colprop["sequence_label"] = metadata_content["sample"]["sample_id"] 58 | 59 | print("fixing sequence label %s %s" % (i["uuid"], colprop.get("sequence_label"))) 60 | api.collections().update(uuid=i["uuid"], body={"properties": colprop}).execute() 61 | -------------------------------------------------------------------------------- /scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/scripts/create_sra_metadata/SraExperimentPackage.2020.07.09.xml.gz -------------------------------------------------------------------------------- /scripts/db_enrichment/.gitignore: -------------------------------------------------------------------------------- 1 | enriched_output.txt 2 | -------------------------------------------------------------------------------- /scripts/db_enrichment/input_location.csv: -------------------------------------------------------------------------------- 1 | http://www.wikidata.org/entity/Q7960498 2 | http://www.wikidata.org/entity/Q692895 3 | http://www.wikidata.org/entity/Q2722074 4 | http://www.wikidata.org/entity/Q25622187 5 | http://www.wikidata.org/entity/Q27684996 6 | http://www.wikidata.org/entity/Q2757125 7 | -------------------------------------------------------------------------------- /scripts/db_enrichment/readme.md: -------------------------------------------------------------------------------- 1 | We have two files in the folder *semantic_enrichment* that are used to enrich the identifiers in our triple store with additional information, e.g. human-readable labels and semantics (e.g. *which countries are summarized under a continent*). This describes how to update these two files. 2 | 3 | ### semantic_enrichment/labels.ttl 4 | Static labels for the ontology vocabulary terms we use. This file has to be updated manually. Use the OLS or BioPortal to find more information about a given ontology term. 5 | 6 | ### semantic_enrichment/countries.ttl 7 | File containing information about the countries in our database, e.g. their labels and GPS coordinates. We enrich the country identifiers via Wikidata. Please ensure that the .ttl file is valid, e.g. by using this online validator (http://ttl.summerofcode.be/). 8 | 9 | #### Update process 10 | - Which countries (= Wikidata identifiers) do we have to enrich? 11 | This SPARQL query (http://sparql.genenetwork.org/sparql/) retrieves all countries (ids) from our database that do not have a label yet: 12 | 13 | 14 | ```sparql 15 | SELECT DISTINCT ?geoLocation WHERE 16 | { 17 | ?fasta ?x [ <http://purl.obolibrary.org/obo/GAZ_00000448> ?geoLocation] . 18 | FILTER NOT EXISTS {?geoLocation <http://www.w3.org/2000/01/rdf-schema#label> ?geoLocation_tmp_label} 19 | } 20 | ``` 21 | 22 | [Run query](http://sparql.genenetwork.org/sparql/?default-graph-uri=&query=%0D%0ASELECT+DISTINCT+%3FgeoLocation++WHERE%0D%0A%7B%0D%0A++%3Ffasta+%3Fx+%5B+%3Chttp%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGAZ_00000448%3E+%3FgeoLocation%5D+.%0D%0A++FILTER+NOT+EXISTS+%7B%3FgeoLocation+%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23label%3E+%3FgeoLocation_tmp_label%7D%0D%0A%7D&format=text%2Fhtml&timeout=0&debug=on&run=+Run+Query+) 23 | 24 | - Use the list of identifiers created with the query above as input for the update script *country_enrichment.py*. 
The script creates a temporary .ttl file in this folder 25 | - Merge the output of the script above manually into the file semantic_enrichment/countries.ttl (TODO: Improve script output so manual intervention no longer needed. Currently there are "double entries" for continents in the output) 26 | -------------------------------------------------------------------------------- /scripts/db_enrichment/update/README.org: -------------------------------------------------------------------------------- 1 | select distinct ?item ?country ?place ?official ?countryname ?loc where { 2 | ?item wdt:P17 ?country ; 3 | wdt:P1705 ?place ; 4 | wdt:P625 ?loc . 5 | ?country wdt:P1448 ?countryname . 6 | FILTER(LANG(?countryname) = "en") 7 | FILTER(LANG(?place) = "en") 8 | optional { ?item wdt:P1448 ?official } . 9 | SERVICE wikibase:label { bd:serviceParam wikibase:language "en" } 10 | } 11 | 12 | 13 | https://query.wikidata.org/#%23%20Find%20place%20and%20location%20coordinates%0Aselect%20distinct%20%3Fitem%20%3Fcountry%20%3Fplace%20%3Fofficial%20%3Fcountryname%20%3Floc%20where%20%7B%0A%20%20%20%20%3Fitem%20%20wdt%3AP17%20%3Fcountry%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20wdt%3AP1705%20%3Fplace%20%3B%0A%20%20%20%20%20%20%20%20%20%20%20wdt%3AP625%20%3Floc%20.%0A%20%20%20%20%3Fcountry%20wdt%3AP1448%20%3Fcountryname%20.%0A%20%20%20%20FILTER%28LANG%28%3Fcountryname%29%20%3D%20%22en%22%29%0A%20%20%20%20FILTER%28LANG%28%3Fplace%29%20%3D%20%22en%22%29%0A%20%20%20%20optional%20%7B%20%3Fitem%20%20wdt%3AP1448%20%3Fofficial%20%7D%20.%0A%20%20%20%20SERVICE%20wikibase%3Alabel%20%7B%20bd%3AserviceParam%20wikibase%3Alanguage%20%22en%22%20%7D%0A%7D%0A 14 | 15 | Fetches a TSV: 16 | 17 | item country place official countryname loc 18 | http://www.wikidata.org/entity/Q1297 http://www.wikidata.org/entity/Q30 Chicago City of Chicago the United States of America Point(-87.627777777 41.881944444) 19 | http://www.wikidata.org/entity/Q1297 http://www.wikidata.org/entity/Q30 Chicago City of Chicago United States Point(-87.627777777 41.881944444) 20 | http://www.wikidata.org/entity/Q686 http://www.wikidata.org/entity/Q686 Republic of Vanuatu Ripablik blong Vanuatu Republic of Vanuatu Point(168.016669444 -16.633330555) 21 | http://www.wikidata.org/entity/Q686 http://www.wikidata.org/entity/Q686 Republic of Vanuatu Vanuatu Republic of Vanuatu Point(168.016669444 -16.633330555) 22 | http://www.wikidata.org/entity/Q686 http://www.wikidata.org/entity/Q686 Republic of Vanuatu Republic of Vanuatu Republic of Vanuatu Point(168.016669444 -16.633330555) 23 | 24 | -------------------------------------------------------------------------------- /scripts/delete_entries_on_arvados.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import arvados 3 | import arvados.collection 4 | 5 | from datetime import datetime 6 | 7 | date_time_str = '2020-08-20' 8 | date_time_obj = datetime.strptime(date_time_str, '%Y-%m-%d') 9 | 10 | api = arvados.api() 11 | keepclient = arvados.keep.KeepClient(api_client=api) 12 | 13 | validated = arvados.util.list_all(api.collections().list, filters=[ 14 | ["owner_uuid", "=", sys.argv[1]], 15 | # ["properties.status", "=", "validated"] 16 | ]) 17 | 18 | # validated.sort(key=lambda v: v["portable_data_hash"]) 19 | 20 | num_sample_deleted = 0 21 | for item in validated: 22 | sequence_label = item['properties']["sequence_label"] 23 | 24 | # The SRA samples start with SRR or ERR 25 | if not sequence_label.startswith('SRR') and not 
sequence_label.startswith('ERR'): 26 | created_at_obj = datetime.strptime(item["created_at"], '%Y-%m-%dT%H:%M:%S.%fZ') 27 | # print(item, created_at_obj) 28 | 29 | if created_at_obj < date_time_obj: 30 | api.collections().delete(uuid=item['current_version_uuid']).execute() 31 | num_sample_deleted += 1 32 | print(sequence_label) 33 | 34 | print('num_sample_deleted: {}'.format(num_sample_deleted)) 35 | -------------------------------------------------------------------------------- /scripts/dict_ontology_standardization/ncbi_host_health_status.csv: -------------------------------------------------------------------------------- 1 | healthy,http://purl.obolibrary.org/obo/NCIT_C115935 2 | asymptomatic,http://purl.obolibrary.org/obo/NCIT_C3833 3 | symptomatic,http://purl.obolibrary.org/obo/NCIT_C25269 4 | admitted to hospital,http://purl.obolibrary.org/obo/GENEPIO_0002020 5 | hospitalized patient,http://purl.obolibrary.org/obo/GENEPIO_0002020 6 | discharged from hospital,http://purl.obolibrary.org/obo/GENEPIO_0001849 7 | dead,http://purl.obolibrary.org/obo/NCIT_C28554 8 | alive,http://purl.obolibrary.org/obo/NCIT_C37987 9 | -------------------------------------------------------------------------------- /scripts/dict_ontology_standardization/ncbi_host_species.csv: -------------------------------------------------------------------------------- 1 | Homo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 2 | human,http://purl.obolibrary.org/obo/NCBITaxon_9606 3 | Human,http://purl.obolibrary.org/obo/NCBITaxon_9606 4 | sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 5 | homosapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 6 | howo sapiens,http://purl.obolibrary.org/obo/NCBITaxon_9606 7 | Mustela lutreola,http://purl.obolibrary.org/obo/NCBITaxon_9666 8 | Manis javanica,http://purl.obolibrary.org/obo/NCBITaxon_9974 9 | Felis catus,http://purl.obolibrary.org/obo/NCBITaxon_9685 10 | Felis catus; Domestic Shorthair,http://purl.obolibrary.org/obo/NCBITaxon_9685 11 | Panthera tigris jacksoni,http://purl.obolibrary.org/obo/NCBITaxon_419130 12 | Canis lupus familiaris,http://purl.obolibrary.org/obo/NCBITaxon_9615 13 | Neovison vison,http://purl.obolibrary.org/obo/NCBITaxon_452646 14 | -------------------------------------------------------------------------------- /scripts/dict_ontology_standardization/ncbi_sequencing_technology.csv: -------------------------------------------------------------------------------- 1 | Illumina HiSeq 1000,http://www.ebi.ac.uk/efo/EFO_0004204 2 | Illumina HiSeq 2000,http://www.ebi.ac.uk/efo/EFO_0004203 3 | Illumina HiSeq 2500,http://www.ebi.ac.uk/efo/EFO_0008565 4 | Illumina HiSeq 3000,http://www.ebi.ac.uk/efo/EFO_0008564 5 | Illumina HiSeq 4000,http://www.ebi.ac.uk/efo/EFO_0008563 6 | Illumina iSeq 100,http://www.ebi.ac.uk/efo/EFO_0008635 7 | Illumian NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173 8 | Illumina NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173 9 | NextSeq500,http://www.ebi.ac.uk/efo/EFO_0009173 10 | NextSeq 500,http://www.ebi.ac.uk/efo/EFO_0009173 11 | Illumian NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566 12 | Illumina NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566 13 | NextSeq550,http://www.ebi.ac.uk/efo/EFO_0008566 14 | NextSeq 550,http://www.ebi.ac.uk/efo/EFO_0008566 15 | Illumina MiniSeq,http://www.ebi.ac.uk/efo/EFO_0008636 16 | Illumina NovaSeq,http://www.ebi.ac.uk/efo/EFO_0008637 17 | Illumina NovaSeq 6000,http://www.ebi.ac.uk/efo/EFO_0008637 18 | Nanopore MinION,http://www.ebi.ac.uk/efo/EFO_0008632 19 | Oxford Nanopore 
MinION,http://www.ebi.ac.uk/efo/EFO_0008632 20 | ONT (Oxford Nanopore Technologies),http://purl.obolibrary.org/obo/NCIT_C146818 21 | Oxford Nanopore Technology,http://purl.obolibrary.org/obo/NCIT_C146818 22 | Oxford Nanopore technologies MinION,http://www.ebi.ac.uk/efo/EFO_0008632 23 | Oxford Nanopore Sequencing,http://purl.obolibrary.org/obo/NCIT_C146818 24 | MinION Oxford Nanopore,http://www.ebi.ac.uk/efo/EFO_0008632 25 | MinION,http://www.ebi.ac.uk/efo/EFO_0008632 26 | Nanopore,http://purl.obolibrary.org/obo/NCIT_C146818 27 | Illumina MiSeq,http://www.ebi.ac.uk/efo/EFO_0004205 28 | Illumina,http://purl.obolibrary.org/obo/OBI_0000759 29 | Oxford Nanopore technology,http://purl.obolibrary.org/obo/NCIT_C146818 30 | Oxford Nanopore Technologies,http://purl.obolibrary.org/obo/NCIT_C146818 31 | Oxford Nanopore,http://purl.obolibrary.org/obo/NCIT_C146818 32 | IonTorrent,http://purl.obolibrary.org/obo/NCIT_C125894 33 | Ion Torrent X5Plus,http://purl.obolibrary.org/obo/NCIT_C125894 34 | ThermoFisher S5Plus,http://purl.obolibrary.org/obo/NCIT_C125894 35 | Sanger dideoxy sequencing,http://purl.obolibrary.org/obo/NCIT_C19641 36 | MGISEQ 2000,http://virtual-bh/MGISEQ2000 37 | MGISEQ2000,http://virtual-bh/MGISEQ2000 38 | Illumina HiSeq X,http://www.ebi.ac.uk/efo/EFO_0008567 39 | ONT GridION X5,http://www.ebi.ac.uk/efo/EFO_0008633 40 | ONT PremethION,http://www.ebi.ac.uk/efo/EFO_0008634 41 | PromethION,http://www.ebi.ac.uk/efo/EFO_0008634 42 | PacBio RS II,http://www.ebi.ac.uk/efo/EFO_0008631 43 | PacBio Sequel System,http://www.ebi.ac.uk/efo/EFO_0008630 44 | Illumina Genome Analyzer,http://www.ebi.ac.uk/efo/EFO_0004200 45 | Illumina Genome Analyzer II,http://www.ebi.ac.uk/efo/EFO_0004201 46 | Illumina Genome Analyzer IIx,http://www.ebi.ac.uk/efo/EFO_0004202 47 | 454 GS 20 sequencer,http://www.ebi.ac.uk/efo/EFO_0004206 48 | 454 GS FLX Titanium sequencer,http://www.ebi.ac.uk/efo/EFO_0004433 49 | 454 GS FLX sequencer,http://www.ebi.ac.uk/efo/EFO_0004432 50 | 454 GS Junior sequencer,http://www.ebi.ac.uk/efo/EFO_0004434 51 | 454 GS sequencer,http://www.ebi.ac.uk/efo/EFO_0004431 52 | AB SOLiD 4 System,http://www.ebi.ac.uk/efo/EFO_0004438 53 | AB SOLiD 4hq System,http://www.ebi.ac.uk/efo/EFO_0004441 54 | AB SOLiD 5500,http://www.ebi.ac.uk/efo/EFO_0004440 55 | AB SOLiD 5500xl,http://www.ebi.ac.uk/efo/EFO_0004436 56 | AB SOLiD PI System,http://www.ebi.ac.uk/efo/EFO_0004437 57 | AB SOLiD System,http://www.ebi.ac.uk/efo/EFO_0004435 58 | AB SOLiD System 2.0,http://www.ebi.ac.uk/efo/EFO_0004442 59 | AB SOLiD System 3.0,http://www.ebi.ac.uk/efo/EFO_0004439 60 | -------------------------------------------------------------------------------- /scripts/dict_ontology_standardization/ncbi_speciesman_source.csv: -------------------------------------------------------------------------------- 1 | nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 2 | Nasopharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 3 | NPS,http://purl.obolibrary.org/obo/NCIT_C155831 4 | NasopharyngealSwab,http://purl.obolibrary.org/obo/NCIT_C155831 5 | Naso-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 6 | nasopharingeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 7 | Nasopharyngeal (NP) Swab,http://purl.obolibrary.org/obo/NCIT_C155831 8 | nasopharyngeal swabs,http://purl.obolibrary.org/obo/NCIT_C155831 9 | nasopharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 10 | nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831 11 | Nasopharyngeal,http://purl.obolibrary.org/obo/NCIT_C155831 12 | 
respiratory swab,http://purl.obolibrary.org/obo/NCIT_C155831 13 | naso-pharyngeal exudate,http://purl.obolibrary.org/obo/NCIT_C155831 14 | nasopharyngeal aspirate,http://purl.obolibrary.org/obo/NCIT_C155831 15 | nasal swab specimen,http://purl.obolibrary.org/obo/NCIT_C155831 16 | nasal swal,http://purl.obolibrary.org/obo/NCIT_C155831 17 | pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155831 18 | respiratory secretion,http://purl.obolibrary.org/obo/NCIT_C155831 19 | mid-nasal swab,http://purl.obolibrary.org/obo/NCIT_C155831 20 | Mid-nasal swab,http://purl.obolibrary.org/obo/NCIT_C155831 21 | nasopharyngeal (throat) washings,http://purl.obolibrary.org/obo/NCIT_C155831 22 | oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835 23 | throat swab,http://purl.obolibrary.org/obo/NCIT_C155835 24 | oro-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835 25 | Oropharyngal,http://purl.obolibrary.org/obo/NCIT_C155835 26 | oralpharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835 27 | Oral-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835 28 | oral-pharyngeal,http://purl.obolibrary.org/obo/NCIT_C155835 29 | oro-pharngyl swab,http://purl.obolibrary.org/obo/NCIT_C155835 30 | Oro-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835 31 | oro-pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835 32 | Oropharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835 33 | oro pharyngeal swab,http://purl.obolibrary.org/obo/NCIT_C155835 34 | buccal swab,http://purl.obolibrary.org/obo/NCIT_C155835 35 | throat washing,http://purl.obolibrary.org/obo/NCIT_C155835 36 | Throat Swab,http://purl.obolibrary.org/obo/NCIT_C155835 37 | throat (oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835 38 | Throat (Oropharyngeal) swab,http://purl.obolibrary.org/obo/NCIT_C155835 39 | bronchoalveolar lavage fluid,http://purl.obolibrary.org/obo/NCIT_C13195 40 | swab,http://purl.obolibrary.org/obo/NCIT_C13195 41 | oral swab,http://purl.obolibrary.org/obo/NCIT_C13195 42 | bronchoalveolar lavage,http://purl.obolibrary.org/obo/NCIT_C13195 43 | sputum,http://purl.obolibrary.org/obo/NCIT_C13278 44 | aspirate,http://purl.obolibrary.org/obo/NCIT_C13347 45 | stool,http://purl.obolibrary.org/obo/NCIT_C13234 46 | serum,http://purl.obolibrary.org/obo/NCIT_C13325 47 | saliva,http://purl.obolibrary.org/obo/NCIT_C13275 48 | Deep throat saliva,http://purl.obolibrary.org/obo/NCIT_C13275 49 | nasal swab,http://purl.obolibrary.org/obo/NCIT_C132119 50 | -------------------------------------------------------------------------------- /scripts/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:10 2 | 3 | RUN apt-get update && \ 4 | apt-get -yq --no-install-recommends -o Acquire::Retries=6 install \ 5 | python3 python3-pip python3-setuptools python3-dev python-pycurl \ 6 | minimap2 python3-biopython libcurl4-openssl-dev build-essential \ 7 | libssl-dev libmagic-dev python3-magic && \ 8 | apt-get clean 9 | 10 | RUN pip3 install bh20-seq-uploader py-dateutil 11 | -------------------------------------------------------------------------------- /scripts/esr_samples/Pathogen.cl.1.0.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/scripts/esr_samples/Pathogen.cl.1.0.xlsx -------------------------------------------------------------------------------- /scripts/esr_samples/esr_samples.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from string import Template 4 | from dateutil.parser import parse 5 | 6 | import sys 7 | 8 | sys.path.append('../') 9 | from utils import check_and_get_ontology_dictionaries 10 | 11 | # Metadata in tabular format 12 | path_metadata_xlsx = 'Pathogen.cl.1.0.xlsx' 13 | 14 | path_template_yaml = 'template.yaml' 15 | # Removed from the template (for now) 16 | # license: 17 | # license_type: "http://creativecommons.org/licenses/by/4.0/" 18 | # title: "SARS-CoV-2 New Zealand" 19 | # attribution_name: "ESR" 20 | # attribution_url: "https://www.esr.cri.nz/" 21 | 22 | 23 | # Read the dictionaries for the ontology 24 | dir_dict_ontology_standardization = '../dict_ontology_standardization/' 25 | field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(dir_dict_ontology_standardization) 26 | 27 | dir_output = 'yaml' 28 | suffix = '.consensus' 29 | 30 | if not os.path.exists(dir_output): 31 | os.makedirs(dir_output) 32 | 33 | metadata_df = pd.read_excel(path_metadata_xlsx, skiprows=12) 34 | 35 | # Maybe not the best pandas-way to do this 36 | for index, row in metadata_df.iterrows(): 37 | # print(row['*sample_name']) 38 | 39 | geo_loc_name = row['*geo_loc_name'].replace(': ', ':') 40 | 41 | if geo_loc_name not in field_to_term_to_uri_dict['ncbi_countries']: 42 | if geo_loc_name in [ 43 | 'New Zealand:Counties Manukau', 'New Zealand:Capital and Coast', 'New Zealand:Southern', 44 | 'New Zealand:Waikato', 45 | 'New Zealand:Lakes', 'New Zealand:Nelson Marlborough', 'New Zealand:South Canterbury', 46 | 'New Zealand:MidCentral', 47 | 'New Zealand:Tairawhiti', 'New Zealand:Hawkes Bay', 'New Zealand:NA', 'New Zealand:Taranaki' 48 | ]: 49 | geo_loc_name = 'New Zealand' 50 | else: 51 | print(geo_loc_name) 52 | break 53 | 54 | country = field_to_term_to_uri_dict['ncbi_countries'][geo_loc_name] 55 | 56 | d = { 57 | 'host_species': field_to_term_to_uri_dict['ncbi_host_species'][row['*host']], 58 | 'sample_id': row['*sample_name'], 59 | 'collection_date': parse(row['*collection_date']).strftime('%Y-%m-%d'), 60 | 'collection_location': country, 61 | 'specimen_source': field_to_term_to_uri_dict['ncbi_speciesman_source'][row['*isolation_source']], 62 | 'virus_species': 'http://purl.obolibrary.org/obo/NCBITaxon_2697049', 63 | 64 | 'submitter_sample_id': row['bioproject_accession'], 65 | } 66 | 67 | with open(path_template_yaml) as f: 68 | src = Template(f.read()) 69 | 70 | with open(os.path.join(dir_output, '{}{}.yaml'.format(row['*sample_name'], suffix)), 'w') as fw: 71 | fw.write(src.substitute(d)) 72 | 73 | print('{} YAML files created.'.format(len([x for x in os.listdir(dir_output) if x.endswith('.yaml')]))) 74 | -------------------------------------------------------------------------------- /scripts/esr_samples/jetson/21JETSONTEST001.consensus.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | host: 4 | host_species: "http://purl.obolibrary.org/obo/NCBITaxon_9606" 5 | 6 | sample: 7 | sample_id: "JetsonXavNX_SARSCOV_TESTRUN001" 8 | collection_date: "2020-12-05" 9 | collection_location: "http://www.wikidata.org/entity/Q37100" 10 | specimen_source: ["http://purl.obolibrary.org/obo/NCIT_C155831"] 11 | 12 | virus: 13 | virus_species: "http://purl.obolibrary.org/obo/NCBITaxon_2697049" 14 | 15 | technology: 16 | sample_sequencing_technology: ["http://www.ebi.ac.uk/efo/EFO_0008632"] 17 | alignment_protocol: 
"https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics" 18 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" 19 | additional_technology_information: "Modified Artic V3 workflow for Nvidia Jetson Xavier NX/AGX" 20 | 21 | submitter: 22 | authors: ["Miles Benton", "Matthew Storey", "Joep de Ligt"] 23 | submitter_name: ["Miles Benton"] 24 | submitter_address: "PO Box 50348, Porirua 5240, New Zealand" 25 | originating_lab: "ESR" 26 | submitter_sample_id: "PRJNA648792" 27 | submitter_orcid: ["https://orcid.org/0000-0003-3442-965X"] 28 | additional_submitter_information: "2021-01-20" 29 | 30 | -------------------------------------------------------------------------------- /scripts/esr_samples/template.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | host: 4 | host_species: "$host_species" 5 | 6 | sample: 7 | sample_id: "$sample_id" 8 | collection_date: "$collection_date" 9 | collection_location: "$collection_location" 10 | specimen_source: ["$specimen_source"] 11 | 12 | virus: 13 | virus_species: "$virus_species" 14 | 15 | technology: 16 | sample_sequencing_technology: ["http://www.ebi.ac.uk/efo/EFO_0008632"] 17 | alignment_protocol: "https://github.com/ESR-NZ/NZ_SARS-CoV-2_genomics" 18 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" 19 | additional_technology_information: "Artic V3 workflow" 20 | 21 | submitter: 22 | authors: ["Jemma L Geoghegan", "Xiaoyun Ren", "Matthew Storey", "James Hadfield", "Lauren Jelley", "Sarah Jefferies", "Jill Sherwood", "Shevaun Paine", "Sue Huang", "Jordan Douglas", "Fabio K Mendes", "Andrew Sporle", "Michael G Baker", "David R Murdoch", "Nigel French", "Colin R Simpson", "David Welch", "Alexei J Drummond", "Edward C Holmes", "Sebastian Duchene", "Joep de Ligt"] 23 | submitter_name: ["Joep de Ligt"] 24 | submitter_address: "PO Box 50348, Porirua 5240, New Zealand" 25 | originating_lab: "ESR" 26 | submitter_sample_id: "$submitter_sample_id" 27 | publication: "https://doi.org/10.1101/2020.08.05.20168930" 28 | submitter_orcid: ["https://orcid.org/0000-0003-0970-0153"] 29 | additional_submitter_information: "2020-08-20" 30 | -------------------------------------------------------------------------------- /scripts/fasta2vcf/fasta2vcf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | path_reference=$1 4 | path_query=$2 5 | output_prefix=$3 6 | path_annotation=$4 7 | 8 | echo "Contatenating reference and query in the same FASTA file" 9 | cat $path_reference $path_query > ref+qry.fasta 10 | 11 | echo "Aligning reference and query with MAFFT" 12 | mafft ref+qry.fasta > ref+qry.alignment 13 | 14 | python3 alignment2vcf.py $path_reference ref+qry.alignment $output_prefix 15 | 16 | python3 simpleVcfAnnotation.py $output_prefix.vcf $path_annotation 17 | 18 | bcftools norm -f $path_reference $output_prefix.vcf -Ou | bcftools annotate --set-id '%CHROM\_%POS\_%REF\_%FIRST_ALT' -Ov -o - | bgzip -c > $output_prefix.vcf.gz 19 | #tabix -p vcf $output_prefix.vcf.gz 20 | 21 | #java -jar /home/tools/snpEff/5.0e/snpEff.jar NC_045512.2 $output_prefix.vcf | bgzip -c > $output_prefix.annotated.vcf.gz && tabix -p vcf $output_prefix.annotated.vcf.gz 22 | 23 | echo "Removing temporary files" 24 | #rm snpEff_genes.txt snpEff_summary.html 25 | rm ref+qry.fasta ref+qry.alignment $output_prefix.vcf -------------------------------------------------------------------------------- /scripts/fasta2vcf/resources/NC_045512.2.fasta.fai: 
-------------------------------------------------------------------------------- 1 | NC_045512.2 29903 97 70 71 2 | -------------------------------------------------------------------------------- /scripts/fasta2vcf/resources/README.md: -------------------------------------------------------------------------------- 1 | `NC_045512.2.fasta` and `MN908947.3.fasta` are the same sequence. -------------------------------------------------------------------------------- /scripts/fasta2vcf/simpleVcfAnnotation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | path_vcf = sys.argv[1] 5 | path_annotation = sys.argv[2] 6 | 7 | variant_to_phenotypes_dict = {} 8 | 9 | with open(path_annotation) as f: 10 | f.readline() # Skip header 11 | 12 | for line in f: 13 | if line.startswith('MN908947'): 14 | variant = line.split(',')[0] 15 | pos, ref, alt = variant.split(':')[1:] 16 | 17 | # Ugly, Pjotr will not like it 18 | f.readline() 19 | f.readline() 20 | phenotype = f.readline().split(',')[-3].strip('"') 21 | 22 | if (pos, ref, alt) not in variant_to_phenotypes_dict: 23 | variant_to_phenotypes_dict[(pos, ref, alt)] = [] 24 | variant_to_phenotypes_dict[(pos, ref, alt)].append(phenotype) 25 | 26 | new_row_in_header = '##INFO=\n' 27 | 28 | with open(path_vcf) as fin, open(f"{path_vcf}.tmp", "w") as fout: 29 | for line in fin: 30 | if line: 31 | if line.startswith('#CHROM'): 32 | if new_row_in_header: 33 | fout.write(new_row_in_header) 34 | new_row_in_header = '' 35 | 36 | if not line.startswith("#"): 37 | split_line = line.strip().split("\t") 38 | pos = split_line[1] 39 | ref, alt = split_line[3:5] 40 | 41 | if (pos, ref, alt) in variant_to_phenotypes_dict: 42 | split_line[7] = "ANN={}".format( 43 | ",".join(variant_to_phenotypes_dict[(pos, ref, alt)]) 44 | ) 45 | line = "\t".join(split_line) + "\n" 46 | 47 | fout.write(line) 48 | os.remove(path_vcf) 49 | os.rename(f"{path_vcf}.tmp", path_vcf) 50 | -------------------------------------------------------------------------------- /scripts/fetch_from_genbank.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | importScript: 5 | type: File 6 | default: 7 | class: File 8 | location: download_genbank_data/from_genbank_to_fasta_and_yaml.py 9 | inputBinding: {position: 1} 10 | dict: 11 | type: Directory 12 | inputBinding: 13 | prefix: --dict-ontology 14 | position: 2 15 | default: 16 | class: Directory 17 | location: dict_ontology_standardization 18 | existing_metadata_from_nuccore: 19 | type: Directory? 20 | inputBinding: 21 | valueFrom: "--skip-request" 22 | position: 3 23 | outputs: 24 | fasta_and_yaml: 25 | type: Directory 26 | outputBinding: 27 | glob: fasta_and_yaml 28 | metadata_from_nuccore: 29 | type: Directory 30 | outputBinding: 31 | glob: metadata_from_nuccore 32 | accessions: 33 | type: File? 
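# (descriptive note, inferred from the glob below) accessions: optional file of the GenBank accession ids fetched in this run, collected via the *.acc glob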
34 | outputBinding: 35 | glob: "*.acc" 36 | missing_terms: 37 | type: File 38 | outputBinding: 39 | glob: missing_terms.tsv 40 | requirements: 41 | InitialWorkDirRequirement: 42 | listing: 43 | - entry: $(inputs.existing_metadata_from_nuccore) 44 | entryname: metadata_from_nuccore 45 | DockerRequirement: 46 | dockerPull: bh20-seq-uploader/import 47 | NetworkAccess: 48 | networkAccess: true 49 | baseCommand: python3 50 | -------------------------------------------------------------------------------- /scripts/foreach.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | rm -rf validated fasta_and_yaml_* 3 | mkdir -p validated 4 | ./from_genbank_to_fasta_and_yaml.py 5 | fasta_files=$(find fasta_and_yaml/ -name "*.fasta") 6 | for f in $fasta_files ; do 7 | yaml=$(echo $f | rev | cut -c7- | rev).yaml 8 | echo $f 9 | echo $yaml 10 | if bh20-seq-uploader --validate $f $yaml ; then 11 | sz=$(stat --format=%s $f) 12 | if test $sz -gt 20000 ; then 13 | mv $f $yaml validated 14 | else 15 | echo "Fasta file too small" 16 | fi 17 | fi 18 | done 19 | -------------------------------------------------------------------------------- /scripts/gen_docs/org2html.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script converts .org files to .html so these generated 4 | # files do not have to live in the git repo. 5 | 6 | echo "Convert $1 from .org to .html" 7 | 8 | guix environment --ad-hoc emacs-minimal emacs-org emacs-htmlize -- emacs -batch -visit $1 -eval "(progn (require 'org) (let ((org-export-htmlize-output-type 'css)) (org-html-export-to-html nil nil nil t nil)))" 9 | -------------------------------------------------------------------------------- /scripts/import.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | baseCommand: python3 4 | inputs: 5 | scripts: 6 | type: File 7 | default: 8 | class: File 9 | location: import_to_arvados.py 10 | inputBinding: {position: 1} 11 | importScript: 12 | type: File 13 | default: 14 | class: File 15 | location: download_genbank_data/from_genbank_to_fasta_and_yaml.py 16 | inputBinding: {position: 2} 17 | dict: 18 | type: Directory 19 | default: 20 | class: Directory 21 | location: dict_ontology_standardization 22 | inputBinding: {position: 3} 23 | outputs: [] 24 | requirements: 25 | DockerRequirement: 26 | dockerPull: bh20-seq-uploader/import 27 | NetworkAccess: 28 | networkAccess: true 29 | WorkReuse: 30 | enableReuse: false 31 | -------------------------------------------------------------------------------- /scripts/import_from_genbank.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: Workflow 3 | inputs: 4 | existing_metadata_from_nuccore: 5 | type: Directory? 
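# Optional cache of nuccore metadata from a previous run; when provided, fetch_from_genbank stages it as metadata_from_nuccore and passes --skip-request so GenBank is not queried again (see fetch_from_genbank.cwl above).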
6 | outputs: [] 7 | requirements: 8 | ScatterFeatureRequirement: {} 9 | steps: 10 | fetch_from_genbank: 11 | in: 12 | existing_metadata_from_nuccore: existing_metadata_from_nuccore 13 | out: [fasta_and_yaml, metadata_from_nuccore, accessions] 14 | run: fetch_from_genbank.cwl 15 | split_into_arrays: 16 | in: 17 | dir: fetch_from_genbank/fasta_and_yaml 18 | out: [fasta, metadata] 19 | run: split_into_arrays.cwl 20 | upload: 21 | in: 22 | fasta: split_into_arrays/fasta 23 | metadata: split_into_arrays/metadata 24 | out: [] 25 | scatter: [fasta, metadata] 26 | scatterMethod: dotproduct 27 | run: upload.cwl 28 | -------------------------------------------------------------------------------- /scripts/import_to_arvados.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import glob 4 | import sys 5 | 6 | os.chdir(os.environ["TMPDIR"]) 7 | os.symlink(sys.argv[2], "dict_ontology_standardization") 8 | subprocess.run(sys.argv[1]) 9 | 10 | os.chdir("fasta_and_yaml") 11 | fasta_files = glob.glob("*.fasta") 12 | 13 | for f in fasta_files: 14 | subprocess.run(["bh20-seq-uploader", "%s.yaml" %f[:-6], f]) 15 | -------------------------------------------------------------------------------- /scripts/split_into_arrays.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: ExpressionTool 3 | requirements: 4 | InlineJavascriptRequirement: {} 5 | inputs: 6 | dir: 7 | type: Directory 8 | loadListing: shallow_listing 9 | outputs: 10 | fasta: File[] 11 | metadata: File[] 12 | expression: | 13 | ${ 14 | var dir = inputs.dir; 15 | var fasta = []; 16 | var metadata = []; 17 | dir.listing.sort(function(a, b) { return a.basename < b.basename ? -1 : (a.basename > b.basename ? 1 : 0); }); 18 | for (var i = 0; i < dir.listing.length; i++) { 19 | if (dir.listing[i].basename.substr(-6) == ".fasta") { 20 | fasta.push(dir.listing[i]); 21 | } 22 | if (dir.listing[i].basename.substr(-5) == ".yaml") { 23 | metadata.push(dir.listing[i]); 24 | } 25 | } 26 | if (fasta.length != metadata.length) { 27 | throw "They don't match"; 28 | } 29 | return {"fasta": fasta, "metadata": metadata}; 30 | } 31 | -------------------------------------------------------------------------------- /scripts/submit_ebi/example/project-submission.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/submit_ebi/example/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Testing PubSeq Sample uploads 5 | This is a test to allow for uploading sequences from PubSeq 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /scripts/submit_ebi/example/sample-submission.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /scripts/submit_ebi/example/sample.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | human gastric microbiota, mucosal 5 | 6 | 1284369 7 | stomach metagenome 8 | 9 | 10 | 11 | 12 | investigation type 13 | mimarks-survey 14 | 15 | 16 | sequencing method 17 | pyrosequencing 18 | 19 | 20 | collection date 21 | 2010 22 | 23 | 24 | host body site 25 | Mucosa of stomach 26 | 27 | 
28 | human-associated environmental package 29 | human-associated 30 | 31 | 32 | geographic location (latitude) 33 | 1.81 34 | DD 35 | 36 | 37 | geographic location (longitude) 38 | -78.76 39 | DD 40 | 41 | 42 | geographic location (country and/or sea) 43 | Colombia 44 | 45 | 46 | geographic location (region and locality) 47 | Tumaco 48 | 49 | 50 | environment (biome) 51 | coast 52 | 53 | 54 | environment (feature) 55 | human-associated habitat 56 | 57 | 58 | environment (material) 59 | gastric biopsy 60 | 61 | 62 | ENA-CHECKLIST 63 | ERC000011 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /scripts/upload.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | fasta: File 5 | metadata: File 6 | outputs: [] 7 | requirements: 8 | DockerRequirement: 9 | dockerPull: bh20-seq-uploader/import 10 | NetworkAccess: 11 | networkAccess: true 12 | baseCommand: bh20-seq-uploader 13 | arguments: [--skip-qc, $(inputs.metadata), $(inputs.fasta)] 14 | -------------------------------------------------------------------------------- /scripts/uthsc_samples/.gitignore: -------------------------------------------------------------------------------- 1 | yaml 2 | -------------------------------------------------------------------------------- /scripts/uthsc_samples/template.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | license: 4 | license_type: https://creativecommons.org/licenses/by/4.0/ 5 | title: "$strain" 6 | attribution_name: "Mariah Taylor, Colleen B. Jonsson" 7 | attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php 8 | 9 | host: 10 | host_id: "$sample_id" 11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 12 | 13 | sample: 14 | sample_id: "$sample_id" 15 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831] 16 | collection_date: "$collection_date" 17 | collection_location: $location 18 | 19 | virus: 20 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 21 | virus_strain: "$strain" 22 | 23 | technology: 24 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] 25 | alignment_protocol: https://bio.tools/BWA#! 26 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" 27 | additional_technology_information: "Oxford Nanopore MiniIon RNA long reads" 28 | 29 | submitter: 30 | authors: [Mariah Taylor, Colleen B. Jonsson] 31 | submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins] 32 | submitter_address: UTHSC, Memphis, Tennessee 38163, USA 33 | originating_lab: Regional Biocontainment Laboratory, Memphis, TN 34 | submitter_sample_id: $sample_id 35 | submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162] 36 | -------------------------------------------------------------------------------- /scripts/uthsc_samples/uthsc_samples.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from string import Template 4 | from dateutil.parser import parse 5 | import re 6 | 7 | import sys 8 | 9 | # Metadata in tabular format in a spreadsheet(?!) 
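# Flow of the code below: each spreadsheet row is rendered through template.yaml; the "City, State, USA" string is matched by prefix (re.match) against a small city-to-Wikidata map, falling back to Memphis (Q16563) when no entry matches.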
10 | xlsx = '../../test/data/10_samples.xlsx' 11 | 12 | # Template in a text file 13 | template_yaml = 'template.yaml' 14 | 15 | dir_output = 'yaml' 16 | 17 | if not os.path.exists(dir_output): 18 | os.makedirs(dir_output) 19 | 20 | table = pd.read_excel(xlsx) 21 | 22 | print(table) 23 | 24 | for index, row in table.iterrows(): 25 | sample = row['Sample ID'] 26 | print(f"Processing sample {sample}...") 27 | 28 | with open(template_yaml) as f: 29 | text = Template(f.read()) 30 | with open(os.path.join(dir_output,f"{sample}.yaml"), 'w') as fw: 31 | sample_id = sample 32 | sample_name = sample 33 | collection_date = parse(str(row['Collection Date'])).strftime('%Y-%m-%d') 34 | locationx = row['City']+", "+row['State']+", USA" 35 | location = "http://www.wikidata.org/entity/Q16563" # Memphis by default 36 | map = { 37 | "Pegram": "http://www.wikidata.org/entity/Q3289517", 38 | "Alexander": "http://www.wikidata.org/entity/Q79663", 39 | "Smithville": "http://www.wikidata.org/entity/Q2145339", 40 | "Nashville": "http://www.wikidata.org/entity/Q23197", 41 | "Madison": "http://www.wikidata.org/entity/Q494755" 42 | } 43 | 44 | for name in map: 45 | p = re.compile(name) 46 | if p.match(locationx): 47 | location = map[name] 48 | break 49 | 50 | strain = f"SARS-CoV-2/human/USA/{sample}/2020" 51 | fw.write(text.substitute(sample_id=sample_id, 52 | sample_name=sample_name, 53 | collection_date=collection_date, 54 | location=location, 55 | locationx=locationx, 56 | strain=strain 57 | )) 58 | 59 | print(f"Run: python3 bh20sequploader/main.py scripts/uthsc_samples/yaml/{sample}.yaml scripts/uthsc_samples/yaml/{sample}.fa") 60 | -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def is_integer(string_to_check): 4 | try: 5 | int(string_to_check) 6 | return True 7 | except ValueError: 8 | return False 9 | 10 | def chunks(lst, n): 11 | for i in range(0, len(lst), n): 12 | yield lst[i:i + n] 13 | 14 | def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): 15 | # Check duplicated entry looking at all dictionaries 16 | field_to_term_to_uri_dict = {} 17 | 18 | path_dict_xxx_csv_list = [os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in 19 | os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv')] 20 | 21 | for path_dict_xxx_csv in path_dict_xxx_csv_list: 22 | print('Read {}'.format(path_dict_xxx_csv)) 23 | 24 | with open(path_dict_xxx_csv) as f: 25 | for line in f: 26 | if len(line.split(',')) > 2: 27 | term, uri = line.strip('\n').split('",') 28 | else: 29 | term, uri = line.strip('\n').split(',') 30 | 31 | term = term.strip('"') 32 | 33 | if term in field_to_term_to_uri_dict: 34 | print('Warning: in the dictionaries there are more entries for the same term ({}).'.format(term)) 35 | continue 36 | 37 | field_to_term_to_uri_dict[term] = uri 38 | 39 | # Prepare separated dictionaries (to avoid, for example, that a valid IRI for species is accepted as specimen) 40 | field_to_term_to_uri_dict = {} 41 | 42 | for path_dict_xxx_csv in path_dict_xxx_csv_list: 43 | field = os.path.basename(path_dict_xxx_csv).split('.')[0] 44 | 45 | field_to_term_to_uri_dict[field] = {} 46 | 47 | with open(path_dict_xxx_csv) as f: 48 | for line in f: 49 | if len(line.split(',')) > 2: 50 | term, uri = line.strip('\n').split('",') 51 | else: 52 | term, uri = line.strip('\n').split(',') 53 | 54 | term = term.strip('"') 55 | 56 | if term in 
field_to_term_to_uri_dict[field]: 57 | print('Warning: in the {} dictionary there are more entries for the same term ({}).'.format(field, term)) 58 | continue 59 | 60 | field_to_term_to_uri_dict[field][term] = uri 61 | 62 | return field_to_term_to_uri_dict -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | 5 | import setuptools.command.egg_info as egg_info_cmd 6 | from setuptools import setup 7 | 8 | SETUP_DIR = os.path.dirname(__file__) 9 | README = os.path.join(SETUP_DIR, "README.md") 10 | 11 | try: 12 | import gittaggers 13 | 14 | tagger = gittaggers.EggInfoFromGit 15 | except ImportError: 16 | tagger = egg_info_cmd.egg_info 17 | 18 | install_requires = ["arvados-python-client", "schema-salad", 19 | "python-magic", "pyshex", "pyshexc==0.7.0", "py-dateutil"] 20 | web_requires = ["flask", "pyyaml", "redis"] 21 | 22 | needs_pytest = {"pytest", "test", "ptr"}.intersection(sys.argv) 23 | pytest_runner = ["pytest < 6", "pytest-runner < 5"] if needs_pytest else [] 24 | 25 | setup( 26 | name="bh20-seq-uploader", 27 | version="1.0", 28 | description="Biohackathon sequence uploader", 29 | long_description=open(README).read(), 30 | long_description_content_type="text/markdown", 31 | author="Peter Amstutz", 32 | author_email="peter.amstutz@curii.com", 33 | license="Apache 2.0", 34 | packages=["bh20sequploader", "bh20seqanalyzer", "bh20simplewebuploader"], 35 | package_data={"bh20sequploader": ["bh20seq-schema.yml", 36 | "bh20seq-options.yml", 37 | "bh20seq-shex.rdf", 38 | "validation/formats", 39 | "SARS-CoV-2-reference.fasta",], 40 | }, 41 | install_requires=install_requires, 42 | extras_require={ 43 | 'web': web_requires 44 | }, 45 | setup_requires=[] + pytest_runner, 46 | tests_require=["pytest<5"], 47 | entry_points={ 48 | "console_scripts": [ 49 | "bh20-seq-uploader=bh20sequploader.main:main", 50 | "bh20-seq-analyzer=bh20seqanalyzer.main:main" 51 | ] 52 | }, 53 | zip_safe=True, 54 | cmdclass={"egg_info": tagger}, 55 | python_requires=">=3.5, <4", 56 | ) 57 | -------------------------------------------------------------------------------- /test/data/10_samples.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/test/data/10_samples.xlsx -------------------------------------------------------------------------------- /test/data/input/TN_UT2.yaml: -------------------------------------------------------------------------------- 1 | id: placeholder 2 | 3 | license: 4 | license_type: https://creativecommons.org/licenses/by/4.0/ 5 | title: "TN_UT2 - Pegram, Tennessee, USA" 6 | attribution_name: "Mariah Taylor, Colleen Jonsson" 7 | attribution_url: https://www.uthsc.edu/medicine/molecular-sciences/faculty-directory/jonsson.php 8 | 9 | host: 10 | host_id: "TN_UT2" 11 | host_species: http://purl.obolibrary.org/obo/NCBITaxon_9606 12 | 13 | sample: 14 | sample_id: "TN_UT2" 15 | specimen_source: [http://purl.obolibrary.org/obo/NCIT_C155831] 16 | collection_date: "2020-04-26" 17 | collection_location: http://www.wikidata.org/entity/Q3289517 18 | 19 | virus: 20 | virus_species: http://purl.obolibrary.org/obo/NCBITaxon_2697049 21 | virus_strain: "SARS-CoV-2/human/USA/TN_UT2/2020" 22 | 23 | technology: 24 | sample_sequencing_technology: [http://www.ebi.ac.uk/efo/EFO_0008632] 25 | alignment_protocol: 
https://bio.tools/BWA#! 26 | assembly_method: "http://purl.obolibrary.org/obo/GENEPIO_0001628" 27 | additional_technology_information: Oxford Nanopore MiniIon RNA long reads 28 | 29 | submitter: 30 | authors: [Mariah Taylor, Colleen Jonsson] 31 | submitter_name: [Mariah Taylor, Colleen B. Jonsson, Pjotr Prins] 32 | submitter_address: UTHSC, Memphis, Tennessee 38163, USA 33 | originating_lab: Regional Biocontainment Laboratory, Memphis, TN 34 | submitter_sample_id: TN_UT2 35 | submitter_orcid: [https://orcid.org/0000-0002-2640-7672,https://orcid.org/0000-0002-8021-9162] 36 | -------------------------------------------------------------------------------- /test/runner.py: -------------------------------------------------------------------------------- 1 | # Run tests. python3 test/runner.py 2 | 3 | import unittest 4 | 5 | # initialize the test suite 6 | loader = unittest.TestLoader() 7 | suite = unittest.TestSuite() 8 | 9 | import test_shex 10 | import test_sparql 11 | 12 | suite.addTests(loader.loadTestsFromModule(test_shex)) 13 | suite.addTests(loader.loadTestsFromModule(test_sparql)) 14 | 15 | # initialize a runner, pass it your suite and run it 16 | runner = unittest.TextTestRunner(verbosity=3) 17 | result = runner.run(suite) 18 | -------------------------------------------------------------------------------- /test/test_shex.py: -------------------------------------------------------------------------------- 1 | # Run with python3 test/test_shex.py 2 | 3 | import schema_salad.schema 4 | import schema_salad.ref_resolver 5 | import schema_salad.jsonld_context 6 | from pyshex.evaluate import evaluate 7 | import unittest 8 | 9 | class TestShexMethods(unittest.TestCase): 10 | 11 | def test_schema(self): 12 | with open("bh20sequploader/bh20seq-schema.yml") as schema_resource: 13 | metadata_schema = schema_salad.schema.load_schema("bh20sequploader/bh20seq-schema.yml") 14 | (document_loader, 15 | avsc_names, 16 | schema_metadata, 17 | metaschema_loader) = metadata_schema 18 | # print(metadata_schema) 19 | self.assertTrue(isinstance(avsc_names, schema_salad.avro.schema.Names)) 20 | metadatafile = "test/data/input/TN_UT2.yaml" 21 | doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadatafile, True) 22 | print(doc) 23 | g = schema_salad.jsonld_context.makerdf("workflow", doc, document_loader.ctx) 24 | with open("bh20sequploader/bh20seq-shex.rdf") as f: 25 | shex = f.read() 26 | # Note the https link simply acts as a URI descriptor (it does not fetch) 27 | rslt, reason = evaluate(g, shex, doc["id"], "https://raw.githubusercontent.com/arvados/bh20-seq-resource/master/bh20sequploader/bh20seq-shex.rdf#submissionShape") 28 | 29 | with open("test/data/regression/TN_UT2.rdf","w") as f: 30 | f.write(g.serialize(format="ntriples").decode("utf-8")) 31 | 32 | if not rslt: 33 | raise Exception(reason) 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /test/test_sparql.py: -------------------------------------------------------------------------------- 1 | # Run with python3 test/test_sparql.py 2 | 3 | import unittest 4 | import requests 5 | import logging 6 | 7 | class TestSPARQL(unittest.TestCase): 8 | 9 | def test_sparql(self): 10 | # sparqlURL='http://sparql.genenetwork.org/sparql/' 11 | sparqlURL='http://127.0.0.1:8890//sparql/' 12 | id = "http://collections.lugli.arvadosapi.com/c=0002e93b86ad77824620bf938b97e134+126/sequence.fasta" 13 | id = "MT800005.1" 14 | query=f""" 15 | 
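# The query below resolves the sample by its sequence id (sio:SIO_000115), then pulls the location label, collection date and, optionally, specimen source, sample type and institute, plus the sequence URI.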
PREFIX pubseq: 16 | PREFIX sio: 17 | select distinct ?sample ?geoname ?date ?source ?geo ?sampletype ?institute ?sequenceuri 18 | {{ 19 | ?sample sio:SIO_000115 "{id}" . 20 | ?sequenceuri pubseq:sample ?sample . 21 | ?sample ?geo . 22 | ?geo rdfs:label ?geoname . 23 | ?sample ?date . 24 | OPTIONAL {{ ?sample ?source }} 25 | OPTIONAL {{ ?sample ?sampletype }} 26 | OPTIONAL {{ ?sample ?institute }} 27 | }} 28 | """ 29 | print(query) 30 | payload = {'query': query, 'format': 'json'} 31 | r = requests.get(sparqlURL, params=payload) 32 | result = r.json()['results']['bindings'] 33 | # for now we just take the first one 34 | print(result) 35 | self.assertEqual(result[0]['geoname']['value'],'Mahuva') 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bam2fasta.cwl: -------------------------------------------------------------------------------- 1 | # Reference: 2 | # https://github.com/VGP/vgp-assembly/blob/33cd6236a68a1aee5f282e365dfe6b97e0b4ebb7/pipeline/freebayes-polish/freebayes.sh 3 | # https://github.com/VGP/vgp-assembly/blob/33cd6236a68a1aee5f282e365dfe6b97e0b4ebb7/pipeline/freebayes-polish/consensus.sh 4 | class: Workflow 5 | cwlVersion: v1.1 6 | id: bam2fasta 7 | label: bam2fasta 8 | requirements: [] 9 | 10 | inputs: 11 | bam: 12 | type: File 13 | fasta: 14 | type: File 15 | threads: 16 | type: int 17 | default: 4 18 | sample_id: string 19 | 20 | outputs: 21 | out_fasta: 22 | type: File 23 | outputSource: bcftools_consensus/out_fasta 24 | 25 | steps: 26 | freebayes: 27 | in: 28 | bam: bam 29 | ref_fasta: fasta 30 | out: [vcf] 31 | run: freebayes.cwl 32 | bcftools_view_exclude_ref: 33 | in: 34 | vcf: freebayes/vcf 35 | threads: threads 36 | out: [bcf] 37 | run: bcftools-view-exclude-ref.cwl 38 | bcftools_norm: 39 | in: 40 | ref_fasta: fasta 41 | bcf: bcftools_view_exclude_ref/bcf 42 | threads: threads 43 | out: [normalized_bcf] 44 | run: bcftools-norm.cwl 45 | bcftools_index_after_normalization: 46 | in: 47 | bcf: bcftools_norm/normalized_bcf 48 | out: [indexed] 49 | run: bcftools-index.cwl 50 | bcftools_view_qc: 51 | in: 52 | bcf: bcftools_index_after_normalization/indexed 53 | threads: threads 54 | out: [vcf] 55 | run: bcftools-view-qc.cwl 56 | bcftools_index_after_qc: 57 | in: 58 | bcf: bcftools_view_qc/vcf 59 | out: [indexed] 60 | run: bcftools-index.cwl 61 | bcftools_consensus: 62 | in: 63 | ref_fasta: fasta 64 | vcf: bcftools_index_after_qc/indexed 65 | sample_id: sample_id 66 | out: [out_fasta] 67 | run: bcftools-consensus.cwl 68 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bcftools-concat.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" 7 | baseCommand: bcftools 8 | arguments: 9 | - concat 10 | - -Ou 11 | - -o 12 | - $(inputs.output_name) 13 | - $(inputs.bcf_files) 14 | inputs: 15 | - id: output_name 16 | type: string 17 | default: "merged.bcf" 18 | - id: bcf_files 19 | type: File[] 20 | outputs: 21 | - id: merged_bcf 22 | type: File 23 | outputBinding: 24 | glob: "$(inputs.output_name)" 25 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bcftools-consensus.cwl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" 7 | ShellCommandRequirement: {} 8 | baseCommand: bcftools 9 | arguments: 10 | - consensus 11 | - -i 12 | - 'QUAL > 10 && GT="a"' 13 | - -Hla 14 | - -f 15 | - $(inputs.ref_fasta) 16 | - $(inputs.vcf) 17 | - {shellQuote: false, valueFrom: "|"} 18 | - sed 19 | - "s/^>.*/>$(inputs.sample_id)/g" 20 | inputs: 21 | - id: ref_fasta 22 | type: File 23 | - id: vcf 24 | type: File 25 | secondaryFiles: [.csi] 26 | - id: sample_id 27 | type: string 28 | outputs: 29 | - id: out_fasta 30 | type: stdout 31 | stdout: sequence.fasta 32 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bcftools-index.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" 7 | InitialWorkDirRequirement: 8 | listing: 9 | - $(inputs.bcf) 10 | baseCommand: bcftools 11 | arguments: 12 | - index 13 | - $(inputs.bcf) 14 | inputs: 15 | - id: bcf 16 | type: File 17 | outputs: 18 | - id: indexed 19 | type: File 20 | outputBinding: 21 | glob: "$(inputs.bcf.basename)" 22 | secondaryFiles: 23 | - .csi 24 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bcftools-norm.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" 7 | baseCommand: bcftools 8 | arguments: 9 | - norm 10 | - -Ob 11 | - -f 12 | - $(inputs.ref_fasta) 13 | - -o 14 | - $(inputs.output_name) 15 | - --threads 16 | - $(inputs.threads) 17 | - $(inputs.bcf) 18 | inputs: 19 | - id: ref_fasta 20 | type: File 21 | - id: output_name 22 | type: string 23 | default: "normalized.bcf" 24 | - id: threads 25 | type: int 26 | - id: bcf 27 | type: File 28 | outputs: 29 | - id: normalized_bcf 30 | type: File 31 | outputBinding: 32 | glob: "$(inputs.output_name)" 33 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bcftools-view-exclude-ref.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" 7 | baseCommand: bcftools 8 | arguments: 9 | - view 10 | - --no-version 11 | - -Ou 12 | - -e'type=ref' 13 | - --threads=$(inputs.threads) 14 | - $(inputs.vcf) 15 | inputs: 16 | - id: vcf 17 | type: File 18 | - id: threads 19 | type: int 20 | outputs: 21 | - id: bcf 22 | type: stdout 23 | stdout: $(inputs.vcf.nameroot).without-ref.bcf 24 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bcftools-view-qc.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" 7 | baseCommand: bcftools 8 | arguments: 
9 | - view 10 | - -i 11 | - 'QUAL > 10 && GT="a"' 12 | - -Oz 13 | - --threads=$(inputs.threads) 14 | - $(inputs.bcf) 15 | inputs: 16 | - id: threads 17 | type: int 18 | - id: bcf 19 | type: File 20 | secondaryFiles: [.csi] 21 | outputs: 22 | - id: vcf 23 | type: stdout 24 | stdout: out.changes.vcf.gz 25 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bcftools-view.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/bcftools:1.10.2--hd2cd319_0" 7 | baseCommand: bcftools 8 | arguments: 9 | - view 10 | - --no-version 11 | - -Ou 12 | - $(inputs.vcf) 13 | inputs: 14 | - id: vcf 15 | type: File 16 | outputs: 17 | - id: bcf 18 | type: stdout 19 | stdout: $(inputs.vcf.nameroot).bcf 20 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bwa-index.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | cwlVersion: v1.1 3 | class: CommandLineTool 4 | doc: "bwa index, build BWA index files for the input fasta" 5 | requirements: 6 | DockerRequirement: 7 | dockerPull: quay.io/biocontainers/bwa:0.7.17--h84994c4_5 8 | InitialWorkDirRequirement: 9 | listing: 10 | - $(inputs.input_fasta) 11 | baseCommand: [bwa, index] 12 | inputs: 13 | input_fasta: 14 | type: File 15 | label: "input fasta file" 16 | inputBinding: 17 | position: 1 18 | outputs: 19 | indexed_fasta: 20 | type: File 21 | outputBinding: 22 | glob: $(inputs.input_fasta.basename) 23 | secondaryFiles: 24 | - .amb 25 | - .ann 26 | - .bwt 27 | - .pac 28 | - .sa 29 | stdout: stdout 30 | stderr: stderr 31 | stdout: bwa-index-stdout.log 32 | stderr: bwa-index-stderr.log 33 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/bwa-mem.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | cwlVersion: v1.1 3 | class: CommandLineTool 4 | doc: "bwa mem, map reads against an indexed reference" 5 | requirements: 6 | DockerRequirement: 7 | dockerPull: quay.io/biocontainers/bwa:0.7.17--h84994c4_5 8 | 9 | baseCommand: [bwa, mem] 10 | 11 | inputs: 12 | threads: 13 | type: int 14 | label: "number of threads" 15 | default: 4 16 | inputBinding: 17 | prefix: -t 18 | output_sam: 19 | type: string 20 | label: "sam file to output results to" 21 | default: "out.sam" 22 | inputBinding: 23 | prefix: -o 24 | group_header_line: 25 | type: string? 26 | label: "read group header line such as '@RG\tID:foo\tSM:bar'" 27 | inputBinding: 28 | prefix: -R 29 | index_base: 30 | type: File 31 | label: "fasta file for index basename" 32 | inputBinding: 33 | position: 1 34 | secondaryFiles: 35 | - .amb 36 | - .ann 37 | - .bwt 38 | - .pac 39 | - .sa 40 | fastq_forward: 41 | type: File 42 | label: "input fastq file to map (single-end, or forward reads for paired-end)" 43 | inputBinding: 44 | position: 2 45 | fastq_reverse: 46 | type: File?
47 | label: "input fastq file to map (reverse reads for paired-end)" 48 | inputBinding: 49 | position: 3 50 | 51 | outputs: 52 | output: 53 | type: File 54 | outputBinding: 55 | glob: "$(inputs.output_sam)" 56 | stdout: stdout 57 | stderr: stderr 58 | stdout: bwa-mem-stdout.log 59 | stderr: bwa-mem-stderr.log 60 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/fastq2fasta-create-bwaindex.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: Workflow 3 | requirements: 4 | SubworkflowFeatureRequirement: {} 5 | hints: 6 | ResourceRequirement: 7 | ramMin: 3000 8 | 9 | inputs: 10 | ref_fasta: 11 | type: File 12 | fastq_forward: 13 | type: File 14 | fastq_reverse: 15 | type: File? 16 | threads: 17 | type: int 18 | default: 4 19 | 20 | outputs: 21 | out_fasta: 22 | type: File 23 | outputSource: fastq2fasta/out_fasta 24 | 25 | steps: 26 | bwa-index: 27 | in: {input_fasta: ref_fasta} 28 | out: [indexed_fasta] 29 | run: bwa-index.cwl 30 | samtools-faidx: 31 | in: {input_fasta: bwa-index/indexed_fasta} 32 | out: [indexed_fasta] 33 | run: samtools-faidx.cwl 34 | fastq2fasta: 35 | in: 36 | fastq_forward: fastq_forward 37 | fastq_reverse: fastq_reverse 38 | ref_fasta: samtools-faidx/indexed_fasta 39 | threads: threads 40 | out: [out_fasta] 41 | run: fastq2fasta.cwl 42 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/fastq2fasta.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: Workflow 3 | requirements: 4 | SubworkflowFeatureRequirement: {} 5 | hints: 6 | ResourceRequirement: 7 | ramMin: 3000 8 | 9 | inputs: 10 | fastq_forward: File 11 | fastq_reverse: File? 12 | ref_fasta: 13 | type: File 14 | secondaryFiles: 15 | - .amb 16 | - .ann 17 | - .bwt 18 | - .pac 19 | - .sa 20 | - .fai 21 | threads: 22 | type: int 23 | default: 4 24 | metadata: File? 25 | sample_id: string 26 | 27 | outputs: 28 | out_fasta: 29 | type: File 30 | outputSource: bam2fasta/out_fasta 31 | out_metadata: 32 | type: File?
33 | outputSource: metadata 34 | 35 | steps: 36 | bwa-mem: 37 | in: 38 | threads: threads 39 | fastq_forward: fastq_forward 40 | fastq_reverse: fastq_reverse 41 | index_base: ref_fasta 42 | out: [output] 43 | run: bwa-mem.cwl 44 | samtools-view: 45 | in: 46 | threads: threads 47 | input_file: bwa-mem/output 48 | out: [bam] 49 | run: samtools-view.cwl 50 | samtools-sort: 51 | in: 52 | input_bamfile: samtools-view/bam 53 | threads: threads 54 | out: [sorted_bam] 55 | run: samtools-sort.cwl 56 | bam2fasta: 57 | in: 58 | bam: samtools-sort/sorted_bam 59 | fasta: ref_fasta 60 | threads: threads 61 | sample_id: sample_id 62 | out: [out_fasta] 63 | run: bam2fasta.cwl 64 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/freebayes.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: "quay.io/biocontainers/freebayes:1.3.2--py37hc088bd4_0" 7 | baseCommand: freebayes 8 | arguments: [ 9 | --bam, $(inputs.bam), 10 | # --region=$(inputs.contig):1-$(inputs.contig_end) 11 | --ploidy, "1", 12 | -f, $(inputs.ref_fasta)] 13 | inputs: 14 | - id: bam 15 | type: File 16 | # - id: contig 17 | # type: string 18 | # - id: contig_end 19 | # type: int 20 | - id: ref_fasta 21 | type: File 22 | outputs: 23 | - id: vcf 24 | type: stdout 25 | stdout: var.vcf 26 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/samtools-faidx.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | cwlVersion: v1.0 3 | class: CommandLineTool 4 | doc: "samtools faidx, index the given fasta file" 5 | requirements: 6 | DockerRequirement: 7 | dockerPull: quay.io/biocontainers/samtools:1.9--h8571acd_11 8 | InitialWorkDirRequirement: 9 | listing: 10 | - $(inputs.input_fasta) 11 | baseCommand: [samtools, faidx] 12 | inputs: 13 | input_fasta: 14 | type: File 15 | label: "Input fasta" 16 | inputBinding: 17 | position: 1 18 | secondaryFiles: 19 | - .amb 20 | - .ann 21 | - .bwt 22 | - .pac 23 | - .sa 24 | outputs: 25 | indexed_fasta: 26 | type: File 27 | outputBinding: 28 | glob: "$(inputs.input_fasta.basename)" 29 | secondaryFiles: 30 | - .amb 31 | - .ann 32 | - .bwt 33 | - .pac 34 | - .sa 35 | - .fai 36 | stdout: stdout 37 | stderr: stderr 38 | stdout: samtools-faidx-stdout.log 39 | stderr: samtools-faidx-stderr.log 40 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/samtools-sort.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | cwlVersion: v1.0 3 | class: CommandLineTool 4 | doc: "samtools sort, sort given bam file" 5 | requirements: 6 | DockerRequirement: 7 | dockerPull: quay.io/biocontainers/samtools:1.9--h8571acd_11 8 | baseCommand: [samtools, sort] 9 | inputs: 10 | threads: 11 | type: int 12 | default: 4 13 | inputBinding: 14 | prefix: -@ 15 | tmpfile: 16 | type: string 17 | default: sort.tmp 18 | label: "Write temporary files to PREFIX.nnnn.bam" 19 | inputBinding: 20 | prefix: -T 21 | output_bam: 22 | type: string 23 | default: aln.sorted.bam 24 | label: "Write final output to FILENAME" 25 | inputBinding: 26 | prefix: -o 27 | input_bamfile: 28 | type: File 29 | label: "Input bamfile" 30 | inputBinding: 31 | position: 1 32 | 33 | outputs: 34 | sorted_bam: 35 | type: File 36 | outputBinding: 37 |
glob: "$(inputs.output_bam)" 38 | stdout: stdout 39 | stderr: stderr 40 | stdout: samtools-sort-stdout.log 41 | stderr: samtools-sort-stderr.log 42 | -------------------------------------------------------------------------------- /workflows/fastq2fasta/samtools-view.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | cwlVersion: v1.0 3 | class: CommandLineTool 4 | doc: "samtools view to convert sam format to bam format" 5 | requirements: 6 | DockerRequirement: 7 | dockerPull: quay.io/biocontainers/samtools:1.9--h8571acd_11 8 | baseCommand: [samtools, view] 9 | inputs: 10 | threads: 11 | type: int 12 | label: "Number of additional threads to use" 13 | default: 4 14 | inputBinding: 15 | prefix: -@ 16 | output_bam: 17 | type: boolean 18 | label: "output BAM" 19 | default: true 20 | inputBinding: 21 | prefix: -b 22 | output_filename: 23 | type: string 24 | label: "output file name" 25 | default: "aln.bam" 26 | inputBinding: 27 | prefix: -o 28 | input_file: 29 | type: File 30 | label: "input file" 31 | inputBinding: 32 | position: 1 33 | include_header: 34 | type: boolean 35 | label: "include the header in the output" 36 | default: false 37 | inputBinding: 38 | prefix: -h 39 | ignore_previous_version: 40 | type: boolean 41 | label: "ignored for compatibility with previous samtools versions" 42 | default: false 43 | inputBinding: 44 | prefix: -S 45 | filter_alignments: 46 | type: string? 47 | label: "Do not output alignments with any bits set in INT present in the FLAG field. INT can be specified in hex by beginning with `0x' (i.e. /^0x[0-9A-F]+/) or in octal by beginning with `0' (i.e. /^0[0-7]+/) [0]." 48 | inputBinding: 49 | prefix: -F 50 | skip_alignments: 51 | type: int? 52 | label: "Skip alignments with MAPQ smaller than INT [0]." 53 | inputBinding: 54 | prefix: -q 55 | outputs: 56 | bam: 57 | type: File 58 | outputBinding: 59 | glob: "$(inputs.output_filename)" 60 | stdout: stdout 61 | stderr: stderr 62 | stdout: samtools-view-stdout.log 63 | stderr: samtools-view-stderr.log 64 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/abpoa.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | readsFA: File 5 | script: 6 | type: File 7 | default: {class: File, location: relabel-seqs.py} 8 | outputs: 9 | abpoaGFA: 10 | type: stdout 11 | requirements: 12 | InlineJavascriptRequirement: {} 13 | hints: 14 | DockerRequirement: 15 | dockerPull: "quay.io/biocontainers/abpoa:1.0.5--hed695b0_0" 16 | ResourceRequirement: 17 | coresMin: 1 18 | ramMin: $(15 * 1024) 19 | outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20)) 20 | baseCommand: abpoa 21 | stdout: $(inputs.readsFA.nameroot).O0.gfa 22 | arguments: [ 23 | $(inputs.readsFA), 24 | -r 3, 25 | -O, '0' 26 | ] 27 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/arv-main.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: Workflow 3 | requirements: 4 | SubworkflowFeatureRequirement: {} 5 | inputs: 6 | src_project: string 7 | metadataSchema: File 8 | exclude: File? 
9 | outputs: 10 | odgiGraph: 11 | type: File 12 | outputSource: pangenome-generate/odgiGraph 13 | # odgiPNG: 14 | # type: File 15 | # outputSource: pangenome-generate/odgiPNG 16 | spoaGFA: 17 | type: File 18 | outputSource: pangenome-generate/spoaGFA 19 | odgiRDF: 20 | type: File 21 | outputSource: pangenome-generate/odgiRDF 22 | readsMergeDedup: 23 | type: File 24 | outputSource: pangenome-generate/readsMergeDedupSortedByQualAndLen 25 | mergedMetadata: 26 | type: File 27 | outputSource: pangenome-generate/mergedMetadata 28 | # indexed_paths: 29 | # type: File 30 | # outputSource: pangenome-generate/indexed_paths 31 | # colinear_components: 32 | # type: Directory 33 | # outputSource: pangenome-generate/colinear_components 34 | steps: 35 | collect-seqs: 36 | run: collect-seqs.cwl 37 | in: 38 | src_project: src_project 39 | metadataSchema: metadataSchema 40 | exclude: exclude 41 | out: [relabeledSeqs, mergedMetadata] 42 | pangenome-generate: 43 | run: pangenome-generate_spoa.cwl 44 | in: 45 | seqs: collect-seqs/relabeledSeqs 46 | metadata: collect-seqs/mergedMetadata 47 | exclude: exclude 48 | out: [odgiGraph, spoaGFA, odgiRDF, readsMergeDedupSortedByQualAndLen, mergedMetadata] 49 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/arvados-and-samtools-dockerfile/1078ECD7.key: -------------------------------------------------------------------------------- 1 | -----BEGIN PGP PUBLIC KEY BLOCK----- 2 | 3 | mQENBEzhgeoBCAChhoK1dqpWzNyDWqRGEvdFdkJaA9D2HRwKPfBfjAoePX6ZyrpA 4 | ItlUsvt/8s/DRiTiPEFQR4S7VqocmU6whJc3gDEGyOM6b1NF873lIfSVwUoE42QE 5 | a76dO8woOYgLUyxu2mKG+bJgGMumjBJt6ZOndYVjTYB/7sEeVxwmMVulfZe0s6zg 6 | ut0+SoTYg2R36qIqeIcWllYt97sEYnyy1qXMis4/3IZnuWkS/frsPR3aeUI4W+o2 7 | NDN1kj49+LMe7Fb5b7jZY08rZbAWXi1rU1hQx4jC9RvYqlT4HNld4Bn7os1IvOOA 8 | wNiR0oiVdiuDbBxcMvRPktxMrFVjowusRLq/ABEBAAG0PUN1cm92ZXJzZSwgSW5j 9 | IEF1dG9tYXRpYyBTaWduaW5nIEtleSA8c3lzYWRtaW5AY3Vyb3ZlcnNlLmNvbT6J 10 | ATgEEwECACIFAlNgYIECGwMGCwkIBwMCBhUIAgkKCwQWAgMBAh4BAheAAAoJEFcW 11 | WREQeOzXPkEH/jQJDIYI1dxWcYiA+hczmpaZvN2/pc/kwIW/6a03+6zqmSNkebOE 12 | TgoDILacSYc17hy20R1/rWyUstOMKcEgFDBlSehhHyl0f7q/w7d8Ais6MabzsPfx 13 | IceJpsjUg87+BR7qWhgQ0sxmtIF2TKuTFLs+nkGsgSsiBOEF4NvHxuj3HD4y8F27 14 | HNqrkqwjLS8xJwwH5Gp2uMEVr1AXIH3iSRjJ8X124s8iEP97Q/3IazoYRf9/MCSm 15 | QEx8KzxwDX6t4bW6O4D01K+e9gdkTY70dcMgJoqm5IsX7yxjEubiOunphtlJnZ9d 16 | Oi1yBN5UM3pWKAdcfRj4rcfV9Simvpx9av+5AQ0ETOGB6gEIAMAA0HVMG0BbdnU7 17 | wWgl5eFdT0AUSrXK/WdcKqVEGGv+c68NETSHWZOJX7O46Eao4gY4cTYprVMBzxpY 18 | /BtQSYLpE0HLvBc1fcFd61Yz4H/9rGSNY0GcIQEbOjbJY5mr8qFsQ1K/mAf3aUL3 19 | b6ni4sHVicRiRr0Gl4Ihorlskpfu1SHs/C5tvTSVNF9p4vtl5892y1yILQeVpcBs 20 | NCR7MUpdS49xCpvnAWsDZX+ij6LTR3lzCm/ZLCg4gNuZkjgU9oqVfGkqysW7WZ8S 21 | OLvzAwUw7i1EIFX8q6QdudGoezxz8m8OgZM1v8AFpYEKlhEPf1W0MSfaRDwrj866 22 | 8nCLruEAEQEAAYkBHwQYAQIACQUCTOGB6gIbDAAKCRBXFlkREHjs199EB/4+p0G1 23 | 3PHxt6rLWSCGXobDOu4ZOA/qnv0D/JhOLroFds5TzQv6vnS8eAkhCTjHVA+b58cm 24 | kXpI0oYcD4ZP+KK1CHKq2rGfwou7HfAF+icnNqYkeBOkjjbCgkvBlcCInuAuU8JX 25 | DZMkfFk52+eBKwTjS/J/fQp0vDru8bHLp98WgdRHWfJQ3mc3gz4A5sR6zhrGPW6/ 26 | ssnROS4dC2Ohp35GpgN1KjD3EmEw5RoSBYlyrARCaMsivgIKMxGUEyFZWhuJt3N1 27 | 2MTddRwz28hbmYCi+MzHYDbRv+cSyUDmvXaWhfkNKBepClBA1rTWBcldit5vvlqr 28 | yPet6wIKrtLGhAqZ 29 | =CLkG 30 | -----END PGP PUBLIC KEY BLOCK----- 31 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/arvados-and-samtools-dockerfile/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
debian:10 2 | ENV DEBIAN_FRONTEND noninteractive 3 | RUN apt-get update -q 4 | RUN apt-get install -yq --no-install-recommends gnupg 5 | ADD 1078ECD7.key /tmp/ 6 | RUN cat /tmp/1078ECD7.key | apt-key add - 7 | RUN echo 'deb http://apt.arvados.org/ buster main' > /etc/apt/sources.list.d/apt.arvados.org-stable.list 8 | RUN apt-get update -q && apt-get install -yq --no-install-recommends samtools python3-arvados-python-client 9 | RUN rm -f /usr/bin/python && ln -s /usr/share/python3/dist/python3-arvados-python-client/bin/python /usr/bin/python 10 | RUN rm -f /usr/bin/python3 && ln -s /usr/share/python3/dist/python3-arvados-python-client/bin/python /usr/bin/python3 11 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/collect-seqs.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | $namespaces: 4 | arv: "http://arvados.org/cwl#" 5 | cwltool: "http://commonwl.org/cwltool#" 6 | requirements: 7 | arv:APIRequirement: {} 8 | arv:RuntimeConstraints: 9 | outputDirType: keep_output_dir 10 | DockerRequirement: 11 | dockerImageId: arvados-and-samtools 12 | WorkReuse: 13 | enableReuse: false 14 | ResourceRequirement: 15 | coresMin: 1 16 | ramMin: 1024 17 | baseCommand: python3 18 | inputs: 19 | script: 20 | type: File 21 | default: 22 | class: File 23 | location: collect-seqs.py 24 | inputBinding: {position: 1} 25 | src_project: 26 | type: string 27 | inputBinding: {position: 2} 28 | metadataSchema: 29 | type: File 30 | inputBinding: {position: 3} 31 | exclude: 32 | type: File? 33 | inputBinding: {position: 4} 34 | outputs: 35 | relabeledSeqs: 36 | type: File 37 | outputBinding: 38 | glob: relabeledSeqs.fasta 39 | secondaryFiles: [.fai] 40 | mergedMetadata: 41 | type: File 42 | outputBinding: 43 | glob: mergedMetadata.ttl 44 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/collect-seqs.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import arvados 3 | import json 4 | import shutil 5 | import logging 6 | import subprocess 7 | import arvados.collection 8 | import ruamel.yaml 9 | import schema_salad.schema 10 | import schema_salad.jsonld_context 11 | from schema_salad.sourceline import add_lc_filename 12 | 13 | api = arvados.api() 14 | keepclient = arvados.keep.KeepClient(api_client=api) 15 | 16 | validated = arvados.util.list_all(api.collections().list, filters=[ 17 | ["owner_uuid", "=", sys.argv[1]], 18 | ["properties.status", "=", "validated"]]) 19 | 20 | validated.sort(key=lambda v: v["portable_data_hash"]) 21 | 22 | relabeled_fasta = open("relabeledSeqs.fasta", "wt") 23 | merged_metadata = open("mergedMetadata.ttl", "wt") 24 | 25 | metadataSchema = sys.argv[2] 26 | 27 | blacklist = set() 28 | if len(sys.argv) > 3: 29 | with open(sys.argv[3]) as bl: 30 | for l in bl: 31 | blacklist.add(l.strip()) 32 | 33 | (document_loader, 34 | avsc_names, 35 | schema_metadata, 36 | metaschema_loader) = schema_salad.schema.load_schema(metadataSchema) 37 | 38 | 39 | for item in validated: 40 | pdh = item["portable_data_hash"] 41 | uuid = item["uuid"] 42 | try: 43 | subject = "http://covid19.genenetwork.org/resource/%s" % uuid 44 | with arvados.collection.CollectionReader(pdh, api_client=api, keep_client=keepclient) as col: 45 | with col.open("metadata.yaml", "rt") as md: 46 | metadata_content = ruamel.yaml.round_trip_load(md) 47 | metadata_content["id"] = subject 48 |
add_lc_filename(metadata_content, metadata_content["id"]) 49 | doc, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, metadata_content, False, False) 50 | g = schema_salad.jsonld_context.makerdf(subject, doc, document_loader.ctx) 51 | 52 | with col.open("sequence.fasta", "rt") as fa: 53 | label = fa.readline().strip() 54 | merged_metadata.write("<%s> \"%s\" .\n" % (subject, label[1:].replace('"', '\\"'))) 55 | merged_metadata.write("<%s> \"%s\" .\n" % (subject, pdh)) 56 | merged_metadata.write("<%s> \"%s\" .\n" % (subject, item["version"])) 57 | skip = (subject in blacklist or label[1:] in blacklist) 58 | if skip: 59 | merged_metadata.write("<%s> \"true\"^^ .\n" % subject) 60 | if not skip: 61 | relabeled_fasta.write(">"+subject+"\n") 62 | data = fa.read(8096) 63 | while data: 64 | if not skip: 65 | relabeled_fasta.write(data) 66 | endswithnewline = data.endswith("\n") 67 | data = fa.read(8096) 68 | if not skip and not endswithnewline: 69 | relabeled_fasta.write("\n") 70 | 71 | merged_metadata.write(g.serialize(format="ntriples").decode("utf-8")) 72 | except Exception as e: 73 | logging.exception("Error processing collection %s" % uuid) 74 | 75 | subprocess.run(["samtools", "faidx", "relabeledSeqs.fasta"]) 76 | 77 | shutil.rmtree(".cache") 78 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/dups2metadata.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | baseCommand: python 4 | inputs: 5 | script: 6 | type: File 7 | default: 8 | class: File 9 | location: dups2metadata.py 10 | inputBinding: {position: 1} 11 | metadata: 12 | type: File 13 | inputBinding: {position: 2} 14 | dups: 15 | type: File? 
16 | inputBinding: {position: 3} 17 | stdout: mergedmetadata.ttl 18 | outputs: 19 | merged: stdout 20 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/dups2metadata.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | 4 | md = open(sys.argv[1], "rt") 5 | for d in md: 6 | sys.stdout.write(d) 7 | 8 | if len(sys.argv) < 3: 9 | exit(0) 10 | 11 | sameseqs = open(sys.argv[2], "rt") 12 | for d in sameseqs: 13 | g = re.match(r"\d+\t(.*)", d) 14 | sp = g.group(1).split(",") 15 | for n in sp[1:]: 16 | sys.stdout.write("<%s> <%s> .\n" % (n.strip(), sp[0].strip())) 17 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/from_sparql.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | $namespaces: 4 | arv: "http://arvados.org/cwl#" 5 | requirements: 6 | DockerRequirement: 7 | dockerFile: | 8 | FROM debian:10 9 | RUN apt-get update && apt-get -yq --no-install-recommends install samtools python3-rdflib 10 | dockerImageId: rdflib-and-samtools 11 | ResourceRequirement: 12 | ramMin: 768 13 | arv:RuntimeConstraints: 14 | keep_cache: 2048 15 | outputDirType: keep_output_dir 16 | inputs: 17 | script: 18 | type: File 19 | default: 20 | class: File 21 | location: from_sparql.py 22 | metadata: File 23 | fasta: 24 | type: File 25 | secondaryFiles: [.fai] 26 | query: string 27 | stdout: selected.fasta 28 | outputs: 29 | selected: stdout 30 | arguments: [python3, $(inputs.script), $(inputs.metadata), $(inputs.fasta), $(inputs.query)] 31 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/from_sparql.py: -------------------------------------------------------------------------------- 1 | from rdflib import Graph 2 | import sys 3 | import subprocess 4 | g = Graph() 5 | g.parse(sys.argv[1], format="nt") 6 | res = g.query(sys.argv[3]) 7 | for r in res: 8 | subprocess.run(["samtools", "faidx", sys.argv[2], r[0]]) 9 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/merge-metadata.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | hints: 4 | DockerRequirement: 5 | dockerPull: commonworkflowlanguage/cwltool_module 6 | inputs: 7 | metadata: File[] 8 | subjects: string[] 9 | metadataSchema: 10 | type: File 11 | inputBinding: {position: 2} 12 | originalLabels: 13 | type: File 14 | inputBinding: {position: 3} 15 | dups: 16 | type: File? 
17 | inputBinding: {position: 4} 18 | script: 19 | type: File 20 | inputBinding: {position: 1} 21 | default: {class: File, location: merge-metadata.py} 22 | outputs: 23 | merged: stdout 24 | stdout: mergedmetadata.ttl 25 | requirements: 26 | InlineJavascriptRequirement: {} 27 | InitialWorkDirRequirement: 28 | listing: | 29 | ${ 30 | var i = 0; 31 | var b = 1; 32 | var out = []; 33 | for (; i < inputs.metadata.length; i++) { 34 | var block = []; 35 | var sub = []; 36 | for (; i < (b*150) && i < inputs.metadata.length; i++) { 37 | block.push(inputs.metadata[i]); 38 | sub.push(inputs.subjects[i]); 39 | } 40 | out.push({ 41 | entryname: "block"+b, 42 | entry: JSON.stringify(block) 43 | }); 44 | out.push({ 45 | entryname: "subs"+b, 46 | entry: JSON.stringify(sub) 47 | }); 48 | b++; 49 | } 50 | return out; 51 | } 52 | baseCommand: python 53 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/merge-metadata.py: -------------------------------------------------------------------------------- 1 | import re 2 | import schema_salad.schema 3 | import schema_salad.jsonld_context 4 | import json 5 | import sys 6 | import os 7 | import logging 8 | 9 | metadataSchema = sys.argv[1] 10 | originalLabels = sys.argv[2] 11 | dups = None 12 | if len(sys.argv) == 4: 13 | dups = sys.argv[3] 14 | 15 | def readitems(stem): 16 | items = [] 17 | b = 1 18 | while os.path.exists("%s%i" % (stem, b)): 19 | with open("%s%i" % (stem, b)) as f: 20 | items.extend(json.load(f)) 21 | b += 1 22 | return items 23 | 24 | metadata = readitems("block") 25 | subjects = readitems("subs") 26 | 27 | (document_loader, 28 | avsc_names, 29 | schema_metadata, 30 | metaschema_loader) = schema_salad.schema.load_schema(metadataSchema) 31 | 32 | for i, m in enumerate(metadata): 33 | doc, _metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, m["path"], False, False)  # throwaway name so the metadata list being iterated is not clobbered 34 | doc["id"] = subjects[i] 35 | g = schema_salad.jsonld_context.makerdf(subjects[i], doc, document_loader.ctx) 36 | print(g.serialize(format="ntriples").decode("utf-8")) 37 | 38 | if dups: 39 | sameseqs = open(dups, "rt") 40 | for d in sameseqs: 41 | logging.warning(d) 42 | g = re.match(r"\d+\t(.*)", d) 43 | logging.warning("%s", g.group(1)) 44 | sp = g.group(1).split(",") 45 | for n in sp[1:]: 46 | print("<%s> <%s> ."
% (n.strip(), sp[0].strip())) 47 | 48 | orig = open(originalLabels, "rt") 49 | print(orig.read()) 50 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/odgi-build-from-xpoa-gfa.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | inputGFA: File 5 | outputs: 6 | odgiGraph: 7 | type: File 8 | outputBinding: 9 | glob: $(inputs.inputGFA.nameroot).unchop.sorted.odgi 10 | requirements: 11 | InlineJavascriptRequirement: {} 12 | hints: 13 | DockerRequirement: 14 | dockerPull: "odgi-bash-binutils:latest" 15 | ResourceRequirement: 16 | coresMin: 4 17 | ramMin: $(15 * 1024) 18 | outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2)) 19 | InitialWorkDirRequirement: 20 | # Will fail if input file is not writable (odgi bug) 21 | listing: 22 | - entry: $(inputs.inputGFA) 23 | writable: true 24 | arguments: 25 | - "sh" 26 | - "-c" 27 | - >- 28 | odgi build -g '$(inputs.inputGFA.path)' -o - | odgi unchop -i - -o - | 29 | odgi sort -i - -p s -o $(inputs.inputGFA.nameroot).unchop.sorted.odgi 30 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/odgi-build.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | inputGFA: File 5 | outputs: 6 | odgiGraph: 7 | type: File 8 | outputBinding: 9 | glob: $(inputs.inputGFA.nameroot).odgi 10 | requirements: 11 | InlineJavascriptRequirement: {} 12 | ShellCommandRequirement: {} 13 | hints: 14 | DockerRequirement: 15 | dockerPull: "quay.io/biocontainers/odgi:v0.3--py37h8b12597_0" 16 | ResourceRequirement: 17 | coresMin: 4 18 | ramMin: $(7 * 1024) 19 | outdirMin: $(Math.ceil((inputs.inputGFA.size/(1024*1024*1024)+1) * 2)) 20 | InitialWorkDirRequirement: 21 | listing: 22 | - entry: $(inputs.inputGFA) 23 | writable: true 24 | arguments: [odgi, build, -g, $(inputs.inputGFA), -o, -, 25 | {shellQuote: false, valueFrom: "|"}, 26 | odgi, sort, -i, -, -p, s, -o, $(inputs.inputGFA.nameroot).odgi] 27 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/odgi_to_rdf.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | class: CommandLineTool 3 | cwlVersion: v1.1 4 | hints: 5 | DockerRequirement: 6 | dockerPull: jerven/spodgi:0.0.6 7 | requirements: 8 | InlineJavascriptRequirement: {} 9 | ShellCommandRequirement: {} 10 | ResourceRequirement: 11 | ramMin: $((2 * 1024) + 1) 12 | inputs: 13 | odgi: File 14 | output_name: string? 15 | 16 | stdout: $(inputs.output_name || inputs.odgi.nameroot+'.ttl.xz') 17 | 18 | arguments: 19 | [odgi_to_rdf.py, $(inputs.odgi), "-", 20 | {valueFrom: "|", shellQuote: false}, 21 | xz, --stdout] 22 | 23 | outputs: 24 | rdf: stdout 25 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/pangenome-generate.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | cwlVersion: v1.1 3 | class: Workflow 4 | requirements: 5 | ScatterFeatureRequirement: {} 6 | StepInputExpressionRequirement: {} 7 | inputs: 8 | inputReads: File[] 9 | metadata: File[] 10 | metadataSchema: File 11 | subjects: string[] 12 | exclude: File? 
13 | bin_widths: 14 | type: int[] 15 | default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000] 16 | doc: width of each bin in basepairs along the graph vector 17 | cells_per_file: 18 | type: int 19 | default: 100 20 | doc: Cells per file on component_segmentation 21 | outputs: 22 | odgiGraph: 23 | type: File 24 | outputSource: buildGraph/odgiGraph 25 | odgiPNG: 26 | type: File 27 | outputSource: vizGraph/graph_image 28 | seqwishGFA: 29 | type: File 30 | outputSource: induceGraph/seqwishGFA 31 | odgiRDF: 32 | type: File 33 | outputSource: odgi2rdf/rdf 34 | readsMergeDedup: 35 | type: File 36 | outputSource: dedup/reads_dedup 37 | mergedMetadata: 38 | type: File 39 | outputSource: mergeMetadata/merged 40 | indexed_paths: 41 | type: File 42 | outputSource: index_paths/indexed_paths 43 | colinear_components: 44 | type: Directory 45 | outputSource: segment_components/colinear_components 46 | steps: 47 | relabel: 48 | in: 49 | readsFA: inputReads 50 | subjects: subjects 51 | exclude: exclude 52 | out: [relabeledSeqs, originalLabels] 53 | run: relabel-seqs.cwl 54 | dedup: 55 | in: {reads: relabel/relabeledSeqs} 56 | out: [reads_dedup, dups] 57 | run: ../tools/seqkit/seqkit_rmdup.cwl 58 | overlapReads: 59 | in: 60 | target: dedup/reads_dedup 61 | query: dedup/reads_dedup 62 | outputCIGAR: {default: true} 63 | preset: {default: asm20} 64 | miniWinSize: {default: 1} 65 | out: [alignments] 66 | run: ../tools/minimap2/minimap2_paf.cwl 67 | induceGraph: 68 | in: 69 | readsFA: dedup/reads_dedup 70 | readsPAF: overlapReads/alignments 71 | out: [seqwishGFA] 72 | run: seqwish.cwl 73 | buildGraph: 74 | in: {inputGFA: induceGraph/seqwishGFA} 75 | out: [odgiGraph] 76 | run: odgi-build.cwl 77 | vizGraph: 78 | in: 79 | sparse_graph_index: buildGraph/odgiGraph 80 | width: 81 | default: 50000 82 | height: 83 | default: 500 84 | path_per_row: 85 | default: true 86 | path_height: 87 | default: 4 88 | out: [graph_image] 89 | run: ../tools/odgi/odgi_viz.cwl 90 | odgi2rdf: 91 | in: {odgi: buildGraph/odgiGraph} 92 | out: [rdf] 93 | run: odgi_to_rdf.cwl 94 | mergeMetadata: 95 | in: 96 | metadata: metadata 97 | metadataSchema: metadataSchema 98 | subjects: subjects 99 | dups: dedup/dups 100 | originalLabels: relabel/originalLabels 101 | out: [merged] 102 | run: merge-metadata.cwl 103 | bin_paths: 104 | run: ../tools/odgi/odgi_bin.cwl 105 | in: 106 | sparse_graph_index: buildGraph/odgiGraph 107 | bin_width: bin_widths 108 | scatter: bin_width 109 | out: [ bins, pangenome_sequence ] 110 | index_paths: 111 | label: Create path index 112 | run: ../tools/odgi/odgi_pathindex.cwl 113 | in: 114 | sparse_graph_index: buildGraph/odgiGraph 115 | out: [ indexed_paths ] 116 | segment_components: 117 | label: Run component segmentation 118 | run: ../tools/graph-genome-segmentation/component_segmentation.cwl 119 | in: 120 | bins: bin_paths/bins 121 | cells_per_file: cells_per_file 122 | pangenome_sequence: 123 | source: bin_paths/pangenome_sequence 124 | valueFrom: $(self[0]) 125 | # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index 126 | # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index 127 | # regardless of bin_width, so we take the first pangenome_sequence as input for this step 128 | out: [ colinear_components ] 129 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/pangenome-generate_abpoa.cwl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env cwl-runner 2 | cwlVersion: v1.1 3 | class: Workflow 4 | requirements: 5 | ScatterFeatureRequirement: {} 6 | StepInputExpressionRequirement: {} 7 | inputs: 8 | seqs: File 9 | metadata: File 10 | bin_widths: 11 | type: int[] 12 | default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000] 13 | doc: width of each bin in basepairs along the graph vector 14 | cells_per_file: 15 | type: int 16 | default: 100 17 | doc: Cells per file on component_segmentation 18 | reversed_sort: 19 | type: string 20 | default: "false" 21 | outputs: 22 | odgiGraph: 23 | type: File 24 | outputSource: buildGraph/odgiGraph 25 | odgiPNG: 26 | type: File 27 | outputSource: vizGraph/graph_image 28 | abpoaGFA: 29 | type: File 30 | outputSource: induceGraph/abpoaGFA 31 | # odgiRDF: 32 | # type: File 33 | # outputSource: odgi2rdf/rdf 34 | readsMergeDedupSortedByQualAndLen: 35 | type: File 36 | outputSource: dedup_and_sort_by_quality_and_len/sortedReadsFA 37 | mergedMetadata: 38 | type: File 39 | outputSource: dups2metadata/merged 40 | # indexed_paths: 41 | # type: File 42 | # outputSource: index_paths/indexed_paths 43 | # colinear_components: 44 | # type: Directory 45 | # outputSource: segment_components/colinear_components 46 | steps: 47 | dedup_and_sort_by_quality_and_len: 48 | in: {readsFA: seqs, reversed_sorting: reversed_sort} 49 | out: [sortedReadsFA, dups] 50 | run: sort_fasta_by_quality_and_len.cwl 51 | induceGraph: 52 | in: 53 | readsFA: dedup_and_sort_by_quality_and_len/sortedReadsFA 54 | out: [abpoaGFA] 55 | run: abpoa.cwl 56 | buildGraph: 57 | in: {inputGFA: induceGraph/abpoaGFA} 58 | out: [odgiGraph] 59 | run: odgi-build-from-xpoa-gfa.cwl 60 | vizGraph: 61 | in: 62 | sparse_graph_index: buildGraph/odgiGraph 63 | width: 64 | default: 50000 65 | height: 66 | default: 500 67 | path_per_row: 68 | default: true 69 | path_height: 70 | default: 4 71 | out: [graph_image] 72 | requirements: 73 | ResourceRequirement: 74 | ramMin: $(15 * 1024) 75 | outdirMin: 10 76 | run: ../tools/odgi/odgi_viz.cwl 77 | # odgi2rdf: 78 | # in: {odgi: buildGraph/odgiGraph} 79 | # out: [rdf] 80 | # run: odgi_to_rdf.cwl 81 | dups2metadata: 82 | in: 83 | metadata: metadata 84 | dups: dedup_and_sort_by_quality_and_len/dups 85 | out: [merged] 86 | run: dups2metadata.cwl 87 | # bin_paths: 88 | # requirements: 89 | # ResourceRequirement: 90 | # ramMin: 3000 91 | # outdirMin: 10 92 | # run: ../tools/odgi/odgi_bin.cwl 93 | # in: 94 | # sparse_graph_index: buildGraph/odgiGraph 95 | # bin_width: bin_widths 96 | # scatter: bin_width 97 | # out: [ bins, pangenome_sequence ] 98 | # index_paths: 99 | # label: Create path index 100 | # requirements: 101 | # ResourceRequirement: 102 | # ramMin: 3000 103 | # outdirMin: 10 104 | # run: ../tools/odgi/odgi_pathindex.cwl 105 | # in: 106 | # sparse_graph_index: buildGraph/odgiGraph 107 | # out: [ indexed_paths ] 108 | # segment_components: 109 | # label: Run component segmentation 110 | # run: ../tools/graph-genome-segmentation/component_segmentation.cwl 111 | # in: 112 | # bins: bin_paths/bins 113 | # cells_per_file: cells_per_file 114 | # pangenome_sequence: 115 | # source: bin_paths/pangenome_sequence 116 | # valueFrom: $(self[0]) 117 | # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index 118 | # # the pangenome_sequence that is extracted is exactly the same for the same sparse_graph_index 119 | # # regardless of bin_width, so we take the first pangenome_sequence as input for this step 120 | # out: [ colinear_components ] 121 | 
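For orientation, the abpoa variant above boils down to four commands. A minimal sketch of the equivalent manual pipeline, assuming `abpoa` and `odgi` are on the PATH and the scripts are run from this directory; the input name `reads.fasta` is illustrative:

```bash
# Dedup and sort by quality/length; writes the sorted fasta to stdout
# and duplicate groups to dups.txt ("false" matches this workflow's reversed_sort default).
python sort_fasta_by_quality_and_len.py reads.fasta false > reads.sorted.fasta

# Build a partial-order-alignment graph in GFA (flags as in abpoa.cwl).
abpoa reads.sorted.fasta -r 3 -O 0 > reads.sorted.O0.gfa

# Load into odgi, unchop, and sort (as in odgi-build-from-xpoa-gfa.cwl).
odgi build -g reads.sorted.O0.gfa -o - \
  | odgi unchop -i - -o - \
  | odgi sort -i - -p s -o reads.sorted.O0.unchop.sorted.odgi

# Fold the duplicate groups back into the metadata (as in dups2metadata.cwl).
python dups2metadata.py metadata.ttl dups.txt > mergedmetadata.ttl
```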
-------------------------------------------------------------------------------- /workflows/pangenome-generate/pangenome-generate_spoa.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | cwlVersion: v1.1 3 | class: Workflow 4 | requirements: 5 | ScatterFeatureRequirement: {} 6 | StepInputExpressionRequirement: {} 7 | inputs: 8 | seqs: File 9 | metadata: File 10 | bin_widths: 11 | type: int[] 12 | default: [ 1, 4, 16, 64, 256, 1000, 4000, 16000] 13 | doc: width of each bin in basepairs along the graph vector 14 | cells_per_file: 15 | type: int 16 | default: 100 17 | doc: Cells per file on component_segmentation 18 | reversed_sort: 19 | type: string 20 | default: "true" 21 | outputs: 22 | odgiGraph: 23 | type: File 24 | outputSource: buildGraph/odgiGraph 25 | odgiPNG: 26 | type: File 27 | outputSource: vizGraph/graph_image 28 | spoaGFA: 29 | type: File 30 | outputSource: induceGraph/spoaGFA 31 | # odgiRDF: 32 | # type: File 33 | # outputSource: odgi2rdf/rdf 34 | readsMergeDedupSortedByQualAndLen: 35 | type: File 36 | outputSource: dedup_and_sort_by_quality_and_len/sortedReadsFA 37 | mergedMetadata: 38 | type: File 39 | outputSource: dups2metadata/merged 40 | # indexed_paths: 41 | # type: File 42 | # outputSource: index_paths/indexed_paths 43 | # colinear_components: 44 | # type: Directory 45 | # outputSource: segment_components/colinear_components 46 | steps: 47 | dedup_and_sort_by_quality_and_len: 48 | in: {readsFA: seqs, reversed_sorting: reversed_sort} 49 | out: [sortedReadsFA, dups] 50 | run: sort_fasta_by_quality_and_len.cwl 51 | induceGraph: 52 | in: 53 | readsFA: dedup_and_sort_by_quality_and_len/sortedReadsFA 54 | out: [spoaGFA] 55 | run: spoa.cwl 56 | buildGraph: 57 | in: {inputGFA: induceGraph/spoaGFA} 58 | out: [odgiGraph] 59 | run: odgi-build-from-xpoa-gfa.cwl 60 | vizGraph: 61 | in: 62 | sparse_graph_index: buildGraph/odgiGraph 63 | width: 64 | default: 50000 65 | height: 66 | default: 500 67 | path_per_row: 68 | default: true 69 | path_height: 70 | default: 4 71 | out: [graph_image] 72 | requirements: 73 | ResourceRequirement: 74 | ramMin: $(15 * 1024) 75 | outdirMin: 10 76 | run: ../tools/odgi/odgi_viz.cwl 77 | # odgi2rdf: 78 | # in: {odgi: buildGraph/odgiGraph} 79 | # out: [rdf] 80 | # run: odgi_to_rdf.cwl 81 | dups2metadata: 82 | in: 83 | metadata: metadata 84 | dups: dedup_and_sort_by_quality_and_len/dups 85 | out: [merged] 86 | run: dups2metadata.cwl 87 | # bin_paths: 88 | # requirements: 89 | # ResourceRequirement: 90 | # ramMin: 3000 91 | # outdirMin: 10 92 | # run: ../tools/odgi/odgi_bin.cwl 93 | # in: 94 | # sparse_graph_index: buildGraph/odgiGraph 95 | # bin_width: bin_widths 96 | # scatter: bin_width 97 | # out: [ bins, pangenome_sequence ] 98 | # index_paths: 99 | # label: Create path index 100 | # requirements: 101 | # ResourceRequirement: 102 | # ramMin: 3000 103 | # outdirMin: 10 104 | # run: ../tools/odgi/odgi_pathindex.cwl 105 | # in: 106 | # sparse_graph_index: buildGraph/odgiGraph 107 | # out: [ indexed_paths ] 108 | # segment_components: 109 | # label: Run component segmentation 110 | # run: ../tools/graph-genome-segmentation/component_segmentation.cwl 111 | # in: 112 | # bins: bin_paths/bins 113 | # cells_per_file: cells_per_file 114 | # pangenome_sequence: 115 | # source: bin_paths/pangenome_sequence 116 | # valueFrom: $(self[0]) 117 | # # the bin_paths step is scattered over the bin_width array, but always using the same sparse_graph_index 118 | # # the pangenome_sequence that is 
extracted is exactly the same for the same sparse_graph_index 119 | # # regardless of bin_width, so we take the first pangenome_sequence as input for this step 120 | # out: [ colinear_components ] 121 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/query-to-gfa.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: Workflow 3 | requirements: 4 | SubworkflowFeatureRequirement: {} 5 | inputs: 6 | metadata: File 7 | fasta: 8 | type: File 9 | secondaryFiles: [.fai] 10 | query: string 11 | outputs: 12 | odgiGraph: 13 | type: File 14 | outputSource: make-gfa/odgiGraph 15 | spoaGFA: 16 | type: File 17 | outputSource: make-gfa/spoaGFA 18 | readsMergeDedupSortedByQualAndLen: 19 | type: File 20 | outputSource: make-gfa/readsMergeDedupSortedByQualAndLen 21 | mergedMetadata: 22 | type: File 23 | outputSource: make-gfa/mergedMetadata 24 | steps: 25 | get-subset: 26 | run: from_sparql.cwl 27 | in: {metadata: metadata, query: query, fasta: fasta} 28 | out: [selected] 29 | make-gfa: 30 | run: pangenome-generate_spoa.cwl 31 | in: {metadata: metadata, seqs: get-subset/selected} 32 | out: [odgiGraph, spoaGFA, readsMergeDedupSortedByQualAndLen, mergedMetadata] 33 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/relabel-seqs.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | readsFA: File[] 5 | subjects: string[] 6 | exclude: 7 | type: File? 8 | inputBinding: {position: 2} 9 | script: 10 | type: File 11 | default: {class: File, location: relabel-seqs.py} 12 | inputBinding: {position: 1} 13 | outputs: 14 | relabeledSeqs: 15 | type: File 16 | outputBinding: 17 | glob: relabeledSeqs.fasta 18 | originalLabels: 19 | type: File 20 | outputBinding: 21 | glob: originalLabels.ttl 22 | requirements: 23 | InlineJavascriptRequirement: {} 24 | InitialWorkDirRequirement: 25 | listing: | 26 | ${ 27 | var i = 0; 28 | var b = 1; 29 | var out = []; 30 | for (; i < inputs.readsFA.length; i++) { 31 | var block = []; 32 | var sub = []; 33 | for (; i < (b*150) && i < inputs.readsFA.length; i++) { 34 | block.push(inputs.readsFA[i]); 35 | sub.push(inputs.subjects[i]); 36 | } 37 | out.push({ 38 | entryname: "block"+b, 39 | entry: JSON.stringify(block) 40 | }); 41 | out.push({ 42 | entryname: "subs"+b, 43 | entry: JSON.stringify(sub) 44 | }); 45 | b++; 46 | } 47 | return out; 48 | } 49 | hints: 50 | DockerRequirement: 51 | dockerPull: commonworkflowlanguage/cwltool_module 52 | stdout: 53 | baseCommand: [python] 54 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/relabel-seqs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import sys 4 | 5 | def readitems(stem): 6 | items = [] 7 | b = 1 8 | while os.path.exists("%s%i" % (stem, b)): 9 | with open("%s%i" % (stem, b)) as f: 10 | items.extend(json.load(f)) 11 | b += 1 12 | return items 13 | 14 | reads = readitems("block") 15 | subjects = readitems("subs") 16 | 17 | relabeled_fasta = open("relabeledSeqs.fasta", "wt") 18 | original_labels = open("originalLabels.ttl", "wt") 19 | 20 | blacklist = set() 21 | if len(sys.argv) > 1: 22 | with open(sys.argv[1]) as bl: 23 | for l in bl: 24 | blacklist.add(l.strip()) 25 | 26 | for i, r in enumerate(reads): 27 | with open(r["path"], 
"rt") as fa: 28 | label = fa.readline().strip() 29 | original_labels.write("<%s> \"%s\" .\n" % (subjects[i], label[1:].replace('"', '\\"'))) 30 | skip = (subjects[i] in blacklist or label[1:] in blacklist) 31 | if skip: 32 | original_labels.write("<%s> \"true\"^^ .\n" % (subjects[i])) 33 | if not skip: 34 | relabeled_fasta.write(">"+subjects[i]+"\n") 35 | data = fa.read(8096) 36 | while data: 37 | if not skip: 38 | relabeled_fasta.write(data) 39 | endswithnewline = data.endswith("\n") 40 | data = fa.read(8096) 41 | if not skip and not endswithnewline: 42 | relabeled_fasta.write("\n") 43 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/seqwish.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | readsFA: File 5 | readsPAF: File 6 | kmerSize: 7 | type: int 8 | default: 16 9 | outputs: 10 | seqwishGFA: 11 | type: File 12 | outputBinding: 13 | glob: $(inputs.readsPAF.nameroot).gfa 14 | requirements: 15 | InlineJavascriptRequirement: {} 16 | hints: 17 | DockerRequirement: 18 | dockerPull: "quay.io/biocontainers/seqwish:0.4.1--h8b12597_0" 19 | ResourceRequirement: 20 | coresMin: 4 21 | ramMin: $(7 * 1024) 22 | outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20)) 23 | stdout: $(inputs.readsFA.nameroot).paf 24 | baseCommand: seqwish 25 | arguments: [-t, $(runtime.cores), 26 | -k, $(inputs.kmerSize), 27 | -s, $(inputs.readsFA), 28 | -p, $(inputs.readsPAF), 29 | -g, $(inputs.readsPAF.nameroot).gfa] 30 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/sort_fasta_by_quality_and_len.cwl: -------------------------------------------------------------------------------- 1 | 2 | cwlVersion: v1.1 3 | class: CommandLineTool 4 | hints: 5 | ResourceRequirement: 6 | coresMin: 1 7 | ramMin: 3000 8 | inputs: 9 | reversed_sorting: 10 | type: string 11 | inputBinding: {position: 3} 12 | readsFA: 13 | type: File 14 | inputBinding: {position: 2} 15 | script: 16 | type: File 17 | inputBinding: {position: 1} 18 | default: {class: File, location: sort_fasta_by_quality_and_len.py} 19 | stdout: $(inputs.readsFA.nameroot).sorted_by_quality_and_len.fasta 20 | outputs: 21 | sortedReadsFA: 22 | type: stdout 23 | dups: 24 | type: File 25 | outputBinding: {glob: dups.txt} 26 | requirements: 27 | InlineJavascriptRequirement: {} 28 | ShellCommandRequirement: {} 29 | baseCommand: [python] 30 | 31 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/sort_fasta_by_quality_and_len.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Sort the sequences by quality (percentage of number of N bases not called, descending) and by length (descending). 4 | # The best sequence is the longest one, with no uncalled bases. 
5 | 6 | import os 7 | import sys 8 | import gzip 9 | 10 | # import xxhash # Faster library 11 | import hashlib 12 | 13 | 14 | def open_gzipsafe(path_file): 15 | if path_file.endswith('.gz'): 16 | return gzip.open(path_file, 'rt') 17 | else: 18 | return open(path_file) 19 | 20 | 21 | path_fasta = sys.argv[1] 22 | 23 | hash_to_count_and_headers_dict = {} 24 | 25 | header_to_seq_dict = {} 26 | header_percCalledBases_seqLength_list = [] 27 | 28 | with open_gzipsafe(path_fasta) as f: 29 | for fasta in f.read().strip('\n>').split('>'): 30 | header = fasta.strip('\n').split('\n')[0] 31 | sequence = ''.join(fasta.strip('\n').split('\n')[1:]) 32 | 33 | # hash = xxhash.xxh64(sequence).hexdigest() # Faster library 34 | hash = hashlib.md5(sequence.encode('utf-8')).hexdigest() 35 | 36 | if hash not in hash_to_count_and_headers_dict: 37 | # New sequence 38 | hash_to_count_and_headers_dict[hash] = [0, []] 39 | 40 | header_to_seq_dict[header] = sequence 41 | 42 | seq_len = len(sequence) 43 | header_percCalledBases_seqLength_list.append([header, (seq_len - sequence.count('N')) / seq_len, seq_len]) 44 | 45 | hash_to_count_and_headers_dict[hash][0] += 1 46 | hash_to_count_and_headers_dict[hash][1].append(header) 47 | 48 | with open('dups.txt', 'w') as fw: 49 | for count, header_list in hash_to_count_and_headers_dict.values(): 50 | fw.write('\t'.join([str(count), ', '.join(header_list)]) + '\n') 51 | 52 | reversed_sorting = len(sys.argv) > 2 and sys.argv[2].lower() == 'true' 53 | 54 | for header, percCalledBases, seqLength in sorted( 55 | header_percCalledBases_seqLength_list, key=lambda x: (x[-2], x[-1]), reverse=reversed_sorting 56 | ): 57 | sys.stdout.write('>{}\n{}\n'.format(header, header_to_seq_dict[header])) 58 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/spoa.cwl: -------------------------------------------------------------------------------- 1 | cwlVersion: v1.1 2 | class: CommandLineTool 3 | inputs: 4 | readsFA: File 5 | script: 6 | type: File 7 | default: {class: File, location: relabel-seqs.py} 8 | outputs: 9 | spoaGFA: 10 | type: stdout 11 | requirements: 12 | InlineJavascriptRequirement: {} 13 | hints: 14 | DockerRequirement: 15 | dockerPull: "quay.io/biocontainers/spoa:3.4.0--hc9558a2_0" 16 | ResourceRequirement: 17 | coresMin: 1 18 | ramMin: $(15 * 1024) 19 | outdirMin: $(Math.ceil(inputs.readsFA.size/(1024*1024*1024) + 20)) 20 | baseCommand: spoa 21 | stdout: $(inputs.readsFA.nameroot).g6.gfa 22 | arguments: [ 23 | $(inputs.readsFA), 24 | -G, 25 | -g, '-6' 26 | ] 27 | -------------------------------------------------------------------------------- /workflows/pangenome-generate/testjob.yml: -------------------------------------------------------------------------------- 1 | inputReads: 2 | - class: File 3 | location: ../../example/sequence.fasta 4 | - class: File 5 | location: ../../example/sequence.fasta 6 | metadata: 7 | - class: File 8 | location: ../../example/metadata.yaml 9 | - class: File 10 | location: ../../example/metadata.yaml 11 | metadataSchema: 12 | class: File 13 | location: ../../bh20sequploader/bh20seq-schema.yml 14 | subjects: 15 | - http://arvados.org/keep/seq1 16 | - http://arvados.org/keep/seq2 17 | -------------------------------------------------------------------------------- /workflows/phylogeny/README.md: -------------------------------------------------------------------------------- 1 | A workflow to generate a phylogeny that can be visualized using
[auspice](https://github.com/urbanslug/auspice). 2 | Expects a multi-fasta file path at [pggb_fasta][1] and generates a tree in `json` format. 3 | 4 | #### Dependencies 5 | 6 | Depends on: 7 | - [pggb](https://github.com/pangenome/pggb/blob/master/pggb) 8 | * [wfmash](https://github.com/ekg/wfmash) 9 | * [seqwish](https://github.com/ekg/seqwish) 10 | * [smoothxg](https://github.com/pangenome/smoothxg) 11 | * [odgi](https://github.com/vgteam/odgi) 12 | 13 | - [taxophages](https://github.com/urbanslug/taxophages/) 14 | * Clone and run with `python main.py ...` 15 | 16 | - [augur](https://github.com/nextstrain/augur) 17 | 18 | 19 | #### Running 20 | 21 | Expects taxophages to be cloned in the parent directory, but you can update the path [main_py_script][2] to wherever it is. 22 | 23 | Run the phylogeny workflow with the command below after specifying your path to [pggb_fasta][1]. 24 | 25 | ```bash 26 | R_PACKAGES="${HOME}/RLibraries" \ # a directory holding R packages. Needed if R packages were installed with install.packages on a server, e.g. https://github.com/urbanslug/taxophages/blob/master/scripts/deps.R 27 | TAXOPHAGES_ENV=server \ # helps taxophages figure out where it is being run 28 | AUGUR_RECURSION_LIMIT=30000 \ # augur isn't used to working with so many nested values 29 | cwltool --preserve-entire-environment --no-container phylogeny.cwl clado-job.yml 30 | ``` 31 | 32 | Alternatively run any workflow with 33 | ``` 34 | cwltool --no-container <workflow>.cwl clado-job.yml 35 | ``` 36 | 37 | [1]: clado-job.yml#L8 38 | [2]: clado-job.yml#L28 39 | -------------------------------------------------------------------------------- /workflows/phylogeny/align.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | cwlVersion: v1.1 4 | 5 | class: CommandLineTool 6 | baseCommand: pggb 7 | 8 | inputs: 9 | threads: 10 | type: int 11 | inputBinding: 12 | position: 1 13 | prefix: -t 14 | 15 | pggb_wfmash: 16 | type: boolean 17 | inputBinding: 18 | position: 2 19 | prefix: --wfmash 20 | 21 | pggb_fasta: 22 | type: File 23 | inputBinding: 24 | position: 3 25 | prefix: -i 26 | 27 | pggb_mash_k_mer: 28 | type: int 29 | inputBinding: 30 | position: 4 31 | prefix: -K 32 | 33 | pggb_map_percent_identity: 34 | type: int 35 | inputBinding: 36 | position: 5 37 | prefix: -p 38 | 39 | pggb_num_secondary_mappings: 40 | type: int 41 | inputBinding: 42 | position: 6 43 | prefix: -n 44 | 45 | pggb_segment_length: 46 | type: int 47 | inputBinding: 48 | position: 7 49 | prefix: -s 50 | 51 | pggb_output_dir: 52 | type: string 53 | inputBinding: 54 | position: 8 55 | prefix: -o 56 | 57 | outputs: 58 | pggb_odgi_graph: 59 | type: File 60 | outputBinding: 61 | glob: '*.smooth.og' -------------------------------------------------------------------------------- /workflows/phylogeny/augur.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | cwlVersion: v1.1 4 | 5 | class: CommandLineTool 6 | baseCommand: bash 7 | 8 | requirements: 9 | InitialWorkDirRequirement: 10 | listing: 11 | - $(inputs.dataDir) 12 | 13 | inputs: 14 | nextstrain_bash_script: 15 | type: File 16 | inputBinding: 17 | position: 1 18 | 19 | newick_tree_2: 20 | type: File 21 | inputBinding: 22 | position: 2 23 | 24 | metadata_newick: 25 | type: File 26 | inputBinding: 27 | position: 3 28 | 29 | dataDir: 30 | type: Directory 31 | 32 | outputs: 33 | newick_json: 34 | type: File 35 | outputBinding: 36 | glob: 'covid.json'
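Stripped of the CWL staging, augur.cwl only copies `dataDir` into the working directory and invokes the wrapped script with the tree and metadata as positional arguments, then globs `covid.json`. A sketch of the equivalent call, assuming the file names from clado-job.yml and that the `config` directory is already present in the working directory:

```bash
# augur.cwl runs: bash <nextstrain_bash_script> <newick_tree_2> <metadata_newick>
AUGUR_RECURSION_LIMIT=30000 bash scripts/nextstrain.sh tree.workflow.nwk metadata.tsv
ls covid.json   # the output augur.cwl collects
```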
-------------------------------------------------------------------------------- /workflows/phylogeny/awk-coverage.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | cwlVersion: v1.1 4 | class: CommandLineTool 5 | 6 | baseCommand: awk 7 | 8 | inputs: 9 | consensus_regex: 10 | type: string 11 | inputBinding: 12 | position: 1 13 | 14 | coverage_tsv: 15 | type: File 16 | inputBinding: 17 | position: 2 18 | 19 | outputs: 20 | awk_coverage_matrix: 21 | type: stdout 22 | 23 | stdout: coverage.no_consensus.tsv -------------------------------------------------------------------------------- /workflows/phylogeny/clado-job.yml: -------------------------------------------------------------------------------- 1 | message: Hello world! 2 | 3 | threads: 16 4 | 5 | pggb_wfmash: true 6 | pggb_fasta: 7 | class: File 8 | path: ../data/qc/relabeledSeqs.sorted.qc.100sample.fasta 9 | pggb_mash_k_mer: 19 10 | pggb_map_percent_identity: 95 11 | pggb_num_secondary_mappings: 10000 12 | pggb_segment_length: 5000 13 | pggb_output_dir: "." 14 | 15 | odgi_paths: paths 16 | odgi_graph: 17 | class: File 18 | path: ./relabeledSeqs.sorted.qc.100sample.fasta.pggb-W-s5000-l15000-p95-n10000-a0-K19-k19-w10000-j5000-e5000-I0-R0.smooth.og 19 | haplotypes: true 20 | 21 | consensus_regex: '!/^Consensus/' 22 | coverage_tsv: 23 | class: File 24 | path: ./coverage.tsv 25 | 26 | main_py_script: 27 | class: File 28 | path: ../main.py 29 | metadata: get-metadata 30 | coverage_matrix: 31 | class: File 32 | path: ./coverage.no_consensus.tsv 33 | coverage_matrix_with_metadata: ./coverage.metadata.tsv 34 | 35 | clado-rsvd: clado-rsvd 36 | cladogram_matrix: 37 | class: File 38 | path: ./coverage.metadata.tsv 39 | reduced_matrix: ./coverage.reduced.tsv 40 | svg_figure: 30k_700cm.svg 41 | 42 | newick: gen-newick 43 | newick_dimensions: 100 44 | newick_coverage_matrix: 45 | class: File 46 | path: ./coverage.metadata.tsv 47 | newick_metadata: ./metadata.tsv 48 | newick_tree: ./tree.workflow.nwk 49 | 50 | nextstrain_R_script: 51 | class: File 52 | path: ../taxophages/viz/nextstrain.R 53 | 54 | coverage_matrix_with_metadata_2: 55 | class: File 56 | path: ../data/5k/covmatrix.5k.metadata.tsv 57 | 58 | metadata_only: ./metadata.tsv 59 | newick_tree: tree.workflow.nwk 60 | distance_matrix: distance_matrix.workflow.tsv 61 | rsvd_dimensions: "1000" 62 | filter_unknowns: "TRUE" 63 | 64 | nextstrain_bash_script: 65 | class: File 66 | path: ../scripts/nextstrain.sh 67 | 68 | newick_tree_2: 69 | class: File 70 | path: ./tree.workflow.nwk 71 | 72 | metadata_newick: 73 | class: File 74 | path: ./metadata.tsv 75 | 76 | dataDir: 77 | class: Directory 78 | path: ../config 79 | -------------------------------------------------------------------------------- /workflows/phylogeny/coverage.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | cwlVersion: v1.1 4 | 5 | class: CommandLineTool 6 | baseCommand: odgi 7 | 8 | inputs: 9 | odgi_paths: 10 | type: string 11 | inputBinding: 12 | position: 1 13 | 14 | odgi_graph: 15 | type: File 16 | inputBinding: 17 | position: 2 18 | prefix: -i 19 | 20 | haplotypes: 21 | type: boolean 22 | inputBinding: 23 | position: 4 24 | prefix: -H 25 | 26 | threads: 27 | type: int 28 | inputBinding: 29 | position: 5 30 | prefix: -t 31 | 32 | outputs: 33 | coverage_matrix: 34 | type: stdout 35 | 36 | stdout: coverage.tsv 
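The `awk` step above only strips smoothxg consensus paths out of the odgi coverage matrix: the `consensus_regex` input in `clado-job.yml` is `'!/^Consensus/'`, writing `coverage.tsv` to `coverage.no_consensus.tsv`. For reference, here is a hedged Python equivalent of that filter; the file names follow `clado-job.yml`, and nothing in this sketch is part of the workflow's interface.

```python
#!/usr/bin/env python3
# Sketch of what awk-coverage.cwl does with consensus_regex '!/^Consensus/':
# copy the coverage matrix, dropping rows that start with "Consensus".
import sys

infile = sys.argv[1] if len(sys.argv) > 1 else "coverage.tsv"

with open(infile) as fin, open("coverage.no_consensus.tsv", "w") as fout:
    for line in fin:
        if not line.startswith("Consensus"):  # same test as the awk /^Consensus/ regex
            fout.write(line)
```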
-------------------------------------------------------------------------------- /workflows/phylogeny/metadata.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | cwlVersion: v1.1 4 | 5 | class: CommandLineTool 6 | baseCommand: python 7 | 8 | inputs: 9 |   main_py_script: 10 |     type: File 11 |     inputBinding: 12 |       position: 1 13 | 14 |   metadata: 15 |     type: string 16 |     inputBinding: 17 |       position: 2 18 | 19 |   coverage_matrix: 20 |     type: File 21 |     inputBinding: 22 |       position: 3 23 | 24 |   coverage_matrix_with_metadata: 25 |     type: string 26 |     inputBinding: 27 |       position: 4 28 | 29 | outputs: 30 |   coverage_matrix_with_metadata_out: 31 |     type: File 32 |     outputBinding: 33 |       glob: '*.metadata.tsv' -------------------------------------------------------------------------------- /workflows/phylogeny/newick.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | cwlVersion: v1.1 4 | 5 | class: CommandLineTool 6 | baseCommand: python 7 | 8 | inputs: 9 |   main_py_script: 10 |     type: File 11 |     inputBinding: 12 |       position: 1 13 | 14 |   newick: 15 |     type: string 16 |     inputBinding: 17 |       position: 2 18 | 19 |   newick_dimensions: 20 |     type: int 21 |     inputBinding: 22 |       position: 3 23 |       prefix: -d 24 | 25 |   newick_coverage_matrix: 26 |     type: File 27 |     inputBinding: 28 |       position: 3 29 | 30 |   newick_metadata: 31 |     type: string 32 |     inputBinding: 33 |       position: 4 34 | 35 |   newick_tree: 36 |     type: string 37 |     inputBinding: 38 |       position: 5 39 | 40 | outputs: 41 |   metadata_out: 42 |     type: File 43 |     outputBinding: 44 |       glob: 'metadata.tsv' 45 | 46 |   newick_tree_out: 47 |     type: File 48 |     outputBinding: 49 |       glob: '*.nwk' -------------------------------------------------------------------------------- /workflows/phylogeny/phylogeny.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | cwlVersion: v1.1 4 | class: Workflow 5 | 6 | ############################################# 7 | 8 | inputs: 9 | 10 |   # align 11 |   threads: int 12 |   pggb_wfmash: boolean 13 |   pggb_fasta: File 14 |   pggb_mash_k_mer: int 15 |   pggb_map_percent_identity: int 16 |   pggb_num_secondary_mappings: int 17 |   pggb_segment_length: int 18 |   pggb_output_dir: string 19 | 20 |   # extract coverage vector 21 |   odgi_paths: string 22 |   odgi_graph: File 23 |   haplotypes: boolean 24 |   # threads is declared once under "align" above (duplicate YAML keys are invalid) 25 | 26 |   # remove consensus paths 27 |   consensus_regex: string 28 |   coverage_tsv: File 29 | 30 |   # Get metadata 31 |   main_py_script: File 32 |   metadata: string 33 |   coverage_matrix: File 34 |   coverage_matrix_with_metadata: string 35 | 36 |   # Generate newick tree 37 |   # main_py_script is declared once under "Get metadata" above (duplicate YAML keys are invalid) 38 |   newick: string 39 |   newick_dimensions: int 40 |   newick_coverage_matrix: File 41 |   newick_metadata: string 42 |   newick_tree: string 43 | 44 |   # Generate augur JSON file 45 |   nextstrain_bash_script: File 46 |   newick_tree_2: File 47 |   metadata_newick: File 48 |   dataDir: Directory 49 | 50 | 51 | ############################################# 52 | 53 | outputs: 54 |   augur_json: 55 |     type: File 56 |     outputSource: augur/newick_json 57 | 58 | ############################################# 59 | 60 | steps: 61 |   align: 62 |     run: align.cwl 63 |     in: 64 |       threads: threads 65 |       pggb_wfmash: pggb_wfmash 66 |       pggb_fasta: pggb_fasta 67 |       pggb_mash_k_mer: pggb_mash_k_mer 68 |       pggb_map_percent_identity: pggb_map_percent_identity 69 |       pggb_num_secondary_mappings: pggb_num_secondary_mappings 70 |
pggb_segment_length: pggb_segment_length 71 | pggb_output_dir: pggb_output_dir 72 | out: [pggb_odgi_graph] 73 | 74 | odgi: 75 | run: coverage.cwl 76 | in: 77 | odgi_paths: odgi_paths 78 | odgi_graph: align/pggb_odgi_graph 79 | haplotypes: haplotypes 80 | threads: threads 81 | out: [coverage_matrix] 82 | 83 | awk: 84 | run: awk-coverage.cwl 85 | in: 86 | consensus_regex: consensus_regex 87 | coverage_tsv: odgi/coverage_matrix 88 | out: [awk_coverage_matrix] 89 | 90 | metadata: 91 | run: metadata.cwl 92 | in: 93 | main_py_script: main_py_script 94 | metadata: metadata 95 | coverage_matrix: awk/awk_coverage_matrix 96 | coverage_matrix_with_metadata: coverage_matrix_with_metadata 97 | out: [coverage_matrix_with_metadata_out] 98 | 99 | newick: 100 | run: newick.cwl 101 | in: 102 | main_py_script: main_py_script 103 | newick: newick 104 | newick_dimensions: newick_dimensions 105 | newick_coverage_matrix: metadata/coverage_matrix_with_metadata_out 106 | newick_metadata: newick_metadata 107 | newick_tree: newick_tree 108 | out: [metadata_out, newick_tree_out] 109 | 110 | augur: 111 | run: augur.cwl 112 | in: 113 | nextstrain_bash_script: nextstrain_bash_script 114 | newick_tree_2: newick/newick_tree_out 115 | metadata_newick: newick/metadata_out 116 | dataDir: dataDir 117 | 118 | out: [newick_json] 119 | -------------------------------------------------------------------------------- /workflows/pubseq/generate-rdf.rb: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env ruby 2 | # 3 | # -*- coding: UTF-8 -*- 4 | # 5 | # This script transforms pass2 JSON to JSON-LD (ready for RDF) 6 | # See also https://github.com/pubseq/bh20-seq-resource/doc/blog/covid19-pubseq-update-rdf.org 7 | # 8 | # Author:: Pjotr Prins 9 | # License:: MIT 10 | # 11 | # Copyright (C) 2021 Pjotr Prins 12 | # 13 | 14 | TOOL=File.basename($0) 15 | 16 | GEMPATH = File.dirname(__FILE__) + '/../../lib/ruby' 17 | $: << File.join(GEMPATH,'lib/ruby/pubseq') 18 | 19 | VERSION_FILENAME=File.join(GEMPATH,'VERSION') 20 | VERSION = File.new(VERSION_FILENAME).read.chomp 21 | 22 | require 'optparse' 23 | require 'ostruct' 24 | require 'fileutils' 25 | require 'json' 26 | require 'zlib' 27 | 28 | options = { show_help: false, source: 'https://github.com/pubseq', version: VERSION+' (Pjotr Prins)', date: Time.now.to_s } 29 | 30 | opts = OptionParser.new do |o| 31 | o.banner = "Usage: #{TOOL} [options] path" 32 | o.on('--dir path',String, 'Path to JSON files [REQUIRED]') do |path| 33 | options[:path] = path 34 | end 35 | o.on('--out path',String, 'Dir to write to [REQUIRED]') do |path| 36 | options[:out] = path 37 | end 38 | 39 | o.separator "" 40 | 41 | o.on("-q", "--quiet", "Run quietly") do |q| 42 | # Bio::Log::CLI.trace('error') 43 | options[:quiet] = true 44 | end 45 | 46 | o.on("-v", "--verbose", "Run verbosely") do |v| 47 | options[:verbose] = true 48 | end 49 | 50 | o.on("--progress", "Show progress") do |p| 51 | options[:progress] = true 52 | end 53 | 54 | o.on("--debug", "Show debug messages and keep intermediate output") do |v| 55 | # Bio::Log::CLI.trace('debug') 56 | options[:debug] = true 57 | end 58 | 59 | o.separator "" 60 | o.on_tail('-h', '--help', 'display this help and exit') do 61 | options[:show_help] = true 62 | end 63 | end 64 | 65 | opts.parse!(ARGV) 66 | 67 | BANNER = "#{TOOL} #{VERSION} (Ruby #{RUBY_VERSION}) by Pjotr Prins 2021\n" 68 | $stderr.print BANNER if !options[:quiet] 69 | 70 | if options[:show_help] 71 | print opts 72 | exit 1 73 | end 74 | 75 | if RUBY_VERSION 
=~ /^[12]/ 76 | $stderr.print "WARNING: #{TOOL} may not run properly on Ruby <3.x\n" 77 | end 78 | 79 | $stderr.print "Options: ",options,"\n" if !options[:quiet] 80 | 81 | GLOBAL = OpenStruct.new(options) 82 | 83 | raise "--dir directory is required" if not GLOBAL.path 84 | raise "--out directory is required" if not GLOBAL.out 85 | -------------------------------------------------------------------------------- /workflows/pubseq/normalize/README.md: -------------------------------------------------------------------------------- 1 | # Normalization steps 2 | 3 | This library contains generic logic to normalize (string) data and 4 | transforms strings to URIs. It should be applicable to data from 5 | any source (GenBank, ENA etc). 6 | 7 | Important: missing data should be missing or None! Do not fill 8 | in data by 'guessing'. 9 | 10 | When data is malformed a warning should be logged and added to the 11 | warning list. Functions should be small enough to return only 1 12 | warning! 13 | 14 | Pjotr Prins (c) 2021 15 | -------------------------------------------------------------------------------- /workflows/pubseq/normalize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pubseq/bh20-seq-resource/2ae71911cd87ce4f2eabdff21e538267b3270d45/workflows/pubseq/normalize/__init__.py -------------------------------------------------------------------------------- /workflows/pubseq/pubseq-fetch-data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import json 5 | import os 6 | import requests 7 | import sys 8 | import time 9 | 10 | parser = argparse.ArgumentParser(description=""" 11 | 12 | Fetch metadata (JSON) from PubSeq and optionally the FASTA files. IDs 13 | can be passed in on the command line or in a file. 
14 | 15 | """) 16 | parser.add_argument('--fasta', action='store_true', help='Also fetch FASTA records') 17 | parser.add_argument('--out', type=str, help='Directory to write to', 18 | required=True) 19 | parser.add_argument('--ids', type=str, help='File with ids', required=False) 20 | parser.add_argument('id', nargs='*', help='id(s)') 21 | args = parser.parse_args() 22 | 23 | dir = args.out 24 | if not os.path.exists(dir): 25 | raise Exception(f"Directory {dir} does not exist") 26 | 27 | ids = args.id 28 | if (len(ids)==0): 29 | print(f"Reading {args.ids}") 30 | with open(args.ids) as f: 31 | ids = [ l.strip() for l in f.readlines() ] 32 | 33 | for id in ids: 34 | print(id) 35 | jsonfn = dir+"/"+id+".json" 36 | if not os.path.exists(jsonfn): 37 | count = 0 38 | r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") 39 | while not r: 40 | count += 1 41 | if count>10: raise Exception(f"Can not find record for {id}") 42 | time.sleep(15) 43 | r = requests.get(f"http://covid19.genenetwork.org/api/sample/{id}.json") 44 | m_url = r.json()[0]['metadata'] 45 | mr = requests.get(m_url) 46 | with open(dir+"/"+id+".json","w") as outf: 47 | outf.write(mr.text) 48 | if args.fasta: 49 | fastafn = dir+"/"+id+".fa" 50 | if os.path.exists(fastafn): continue 51 | fa_url = r.json()[0]['fasta'] 52 | fr = requests.get(fa_url) 53 | with open(fastafn,"w") as outf: 54 | outf.write(fr.text) 55 | 56 | -------------------------------------------------------------------------------- /workflows/pubseq/pubseq-fetch-ids: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # 3 | # Use a SPARQL query to fetch all IDs in the PubSeq database 4 | # 5 | # pubseq-fetch-ids > pubseq_ids.txt 6 | # 7 | # Note: requires Ruby 3.x. Older Ruby gives a syntax error 8 | # 9 | # See also 10 | 11 | require 'net/http' 12 | require 'json' 13 | require 'ostruct' 14 | require 'erb' 15 | require 'pp' 16 | 17 | MAX=5_000 18 | 19 | SPARQL_HEADER=" 20 | prefix rdfs: 21 | prefix rdf: 22 | prefix dc: 23 | prefix schema: 24 | PREFIX pubseq: 25 | " 26 | 27 | # Build a SPARQL query, submit and return results. Apply transform 28 | # lambda when passed in 29 | def sparql query, transform = nil 30 | api_url = "http://sparql.genenetwork.org/sparql/?default-graph-uri=&format=application%2Fsparql-results%2Bjson&timeout=0&debug=on&run=+Run+Query+&query=#{ERB::Util.url_encode(SPARQL_HEADER + query)}" 31 | 32 | response = Net::HTTP.get_response(URI.parse(api_url)) 33 | data = JSON.parse(response.body,symbolize_names: true) 34 | data => { head: { vars: }, results: { bindings: results} } # Ruby3 destructuring 35 | vars = vars.map { |v| v.to_sym } 36 | results.map { |rec| 37 | # return results after transforming to a Hash and applying the 38 | # optional transform lambda. Note the transform can not only 39 | # reduce results, or create an array, but also may transform into 40 | # an OpenStruct. 41 | res = {} 42 | vars.each { |name| res[name] = rec[name][:value] } 43 | if transform 44 | transform.call(res) 45 | else 46 | res 47 | end 48 | } 49 | end 50 | 51 | start = 0 52 | num = MAX 53 | begin 54 | query = " 55 | SELECT DISTINCT ?id 56 | FROM 57 | WHERE { 58 | 59 | ?arvid ?id . 
60 | 61 | } LIMIT #{num} OFFSET #{start} 62 | " 63 | list = sparql(query, lambda { |rec| rec[:id] }) 64 | list.each do | l | 65 | print(l,"\n") 66 | end 67 | $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress 68 | start += num 69 | end while list.size == MAX 70 | -------------------------------------------------------------------------------- /workflows/pubseq/validate.rb: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env ruby 2 | # 3 | # -*- coding: UTF-8 -*- 4 | # 5 | # Metadata validation routine - does one JSON file. At this stage this 6 | # is mostly a debugging tool 7 | # 8 | # Author:: Pjotr Prins 9 | # License:: MIT 10 | # 11 | # Copyright (C) 2021 Pjotr Prins 12 | # 13 | # 14 | 15 | TOOL=File.basename($0) 16 | 17 | GEMPATH = File.dirname(__FILE__) + '/../../lib/ruby' 18 | $: << File.join(GEMPATH,'lib/ruby/pubseq') 19 | 20 | VERSION_FILENAME=File.join(GEMPATH,'VERSION') 21 | VERSION = File.new(VERSION_FILENAME).read.chomp 22 | 23 | require 'colorize' 24 | require 'optparse' 25 | require 'ostruct' 26 | require 'fileutils' 27 | require 'json' 28 | require 'zlib' 29 | 30 | options = { show_help: false, source: 'https://github.com/pubseq', version: VERSION+' (Pjotr Prins)', date: Time.now.to_s } 31 | 32 | opts = OptionParser.new do |o| 33 | o.banner = "Usage: #{TOOL} [options] path" 34 | 35 | o.separator "" 36 | 37 | o.on("-q", "--quiet", "Run quietly") do |q| 38 | # Bio::Log::CLI.trace('error') 39 | options[:quiet] = true 40 | end 41 | 42 | o.on("-v", "--verbose", "Run verbosely") do |v| 43 | options[:verbose] = true 44 | end 45 | 46 | o.on("--debug", "Show debug messages and keep intermediate output") do |v| 47 | # Bio::Log::CLI.trace('debug') 48 | options[:debug] = true 49 | end 50 | 51 | o.separator "" 52 | o.on_tail('-h', '--help', 'display this help and exit') do 53 | options[:show_help] = true 54 | end 55 | end 56 | 57 | opts.parse!(ARGV) 58 | 59 | BANNER = "#{TOOL} #{VERSION} (Ruby #{RUBY_VERSION}) by Pjotr Prins 2021\n" 60 | $stderr.print BANNER if !options[:quiet] 61 | 62 | if options[:show_help] 63 | print opts 64 | print "\nExample: ruby validate.rb MT810507.json -q|jq\n" 65 | exit 1 66 | end 67 | 68 | if RUBY_VERSION =~ /^[12]/ 69 | $stderr.print "WARNING: #{TOOL} may not run properly on Ruby <3.x\n" 70 | end 71 | 72 | $stderr.print "Options: ",options,"\n" if !options[:quiet] 73 | 74 | GLOBAL = OpenStruct.new(options) 75 | $has_error = false 76 | 77 | 78 | for fn in ARGV 79 | next if fn == "state.json" 80 | json = JSON.parse(File.read(fn)) 81 | meta = OpenStruct.new(json) 82 | sample = OpenStruct.new(meta.sample) 83 | 84 | error = lambda { |msg| 85 | print(json.to_json,"\n") 86 | $stderr.print "ERROR: ".red,msg.red,"\n" 87 | $has_error = true 88 | } 89 | 90 | # ---- Check for location 91 | location = meta.sample['collection_location'] 92 | error.call "Missing collection_location" if not location 93 | error.call "Collection_location <#{location}> not normalized" if location !~ /^http:\/\/www.wikidata.org\/entity\/Q/ 94 | 95 | # ---- Dates 96 | error.call "Sample collection_date <#{sample.collection_date}> malformed" if sample.collection_date !~ /\d\d\d\d-\d\d-\d\d/ 97 | 98 | end 99 | 100 | exit 1 if $has_error 101 | -------------------------------------------------------------------------------- /workflows/pubseq/wikidata/README.org: -------------------------------------------------------------------------------- 1 | This directory contains scripts to directly download 2 | data from wikidata.org 3 
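The shell scripts below page data out of the Wikidata SPARQL endpoint with `curl`. A rough Python equivalent of the places query is sketched here: the query text is copied from `fetch-places.sh`, while the User-Agent string is an assumption (Wikidata asks clients to send a descriptive one with contact details).

```python
#!/usr/bin/env python3
# Rough Python port of fetch-places.sh: same SPARQL query, CSV output.
import requests

QUERY = """
SELECT DISTINCT ?placename ?place ?country ?coor ?population WHERE {
  ?place wdt:P17 ?country ;
         wdt:P625 ?coor ;
         wdt:P1082 ?population .
  FILTER (?population > 9999)
  ?place rdfs:label ?placename .
  FILTER (lang(?placename)='en')
}
"""

r = requests.get(
    "https://query.wikidata.org/sparql",
    params={"query": QUERY},
    headers={
        "Accept": "text/csv; charset=utf-8",
        # Assumed UA string; replace with your own project/contact info.
        "User-Agent": "pubseq-example/0.1 (illustrative)",
    },
)
r.raise_for_status()
print(r.text, end="")
```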
| -------------------------------------------------------------------------------- /workflows/pubseq/wikidata/fetch-places.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # 3 | # This query fetches approx 80K places 4 | # 5 | curl -G https://query.wikidata.org/sparql -H "Accept: text/csv; charset=utf-8" --data-urlencode query=" 6 | SELECT DISTINCT ?placename ?place ?country ?coor ?population where { 7 | ?place wdt:P17 ?country ; 8 | wdt:P625 ?coor ; 9 | wdt:P1082 ?population . 10 | FILTER (?population > 9999) 11 | # minus { ?place wdt:P31 wd:Q3024240 } . 12 | ?place rdfs:label ?placename . 13 | FILTER (lang(?placename)='en') 14 | } 15 | " 16 | -------------------------------------------------------------------------------- /workflows/pubseq/wikidata/fetch-regions.sh: -------------------------------------------------------------------------------- 1 | # curl -G https://query.wikidata.org/sparql -H "Accept: text/tab-separated-values; charset=utf-8" --data-urlencode query=" 2 | curl -G https://query.wikidata.org/sparql -H "Accept: text/csv; charset=utf-8" --data-urlencode query=" 3 | select distinct ?placename ?place ?country ?coor ?population where { 4 | VALUES ?v { wd:Q82794 wd:Q107390 wd:Q34876 wd:Q9316670 wd:Q515 } 5 | ?statetype wdt:P279+ ?v . 6 | ?place wdt:P31 ?statetype ; 7 | wdt:P17 ?country ; 8 | wdt:P625 ?coor; 9 | wdt:P1082 ?population . 10 | FILTER (?population > 99999) 11 | ?place rdfs:label ?placename . 12 | FILTER (lang(?placename)='en') 13 | } 14 | 15 | " 16 | -------------------------------------------------------------------------------- /workflows/pubseq/wikidata/wikidata-fetch-places.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # 3 | # Use a SPARQL query to fetch Wikidata places 4 | # 5 | # Note: requires Ruby 3.x. Older Ruby gives a syntax error 6 | # 7 | # You may need to set 8 | # 9 | # export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt 10 | # 11 | # See also 12 | 13 | raise "Currently not used" 14 | 15 | require 'net/http' 16 | require 'json' 17 | require 'ostruct' 18 | require 'erb' 19 | require 'pp' 20 | 21 | MAX=10 22 | 23 | USER_AGENT = {'User-Agent': 'genenetworkCrawler/1.0 (covid-19.genenetwork.org; pjotr.public821@thebird.nl) genenetworkCrawler/1.0', "Accept": "text/csv"} 24 | 25 | SPARQL_HEADER=" 26 | prefix rdfs: 27 | prefix rdf: 28 | prefix dc: 29 | prefix schema: 30 | " 31 | 32 | # Build a SPARQL query, submit and return results. Apply transform 33 | # lambda when passed in 34 | def sparql query, transform = nil 35 | 36 | api_url = 'https://query.wikidata.org/sparql' 37 | response = Net::HTTP.get_response(URI.parse(api_url),USER_AGENT) 38 | data = JSON.parse(response.body,symbolize_names: true) 39 | data => { head: { vars: }, results: { bindings: results} } # Ruby3 destructuring 40 | vars = vars.map { |v| v.to_sym } 41 | results.map { |rec| 42 | # return results after transforming to a Hash and applying the 43 | # optional transform lambda. Note the transform can not only 44 | # reduce results, or create an array, but also may transform into 45 | # an OpenStruct. 
46 | res = {} 47 | vars.each { |name| res[name] = rec[name][:value] } 48 | if transform 49 | transform.call(res) 50 | else 51 | res 52 | end 53 | } 54 | end 55 | 56 | start = 0 57 | num = MAX 58 | begin 59 | query = " 60 | SELECT DISTINCT ?place ?placename ?country ?coor ?population where { 61 | ?place wdt:P17 ?country ; 62 | wdt:P625 ?coor ; 63 | wdt:P1082 ?population . 64 | FILTER (?population > 9999) 65 | ?place rdfs:label ?placename . 66 | FILTER (lang(?placename)='en') 67 | } LIMIT #{num} OFFSET #{start} 68 | " 69 | list = sparql(query) # , lambda { |rec| rec[:id] }) 70 | list.each do | l | 71 | print(l,"\n") 72 | end 73 | $stderr.print("#{start}-#{start+list.size}:#{list.first}\n") # show progress 74 | start += num 75 | exit 1 76 | end while list.size == MAX 77 | -------------------------------------------------------------------------------- /workflows/pull-data/genbank/.gitignore: -------------------------------------------------------------------------------- 1 | fasta_and_yaml/ 2 | *.tsv 3 | *.acc 4 | *.txt 5 | -------------------------------------------------------------------------------- /workflows/pull-data/genbank/.guix-run: -------------------------------------------------------------------------------- 1 | # Set up the Guix environment with dependencies 2 | # 3 | 4 | echo # next run: 5 | echo 'export PATH=$GUIX_ENVIRONMENT/bin:$PATH' 6 | 7 | ~/.config/guix/current/bin/guix environment guix --ad-hoc python python-biopython python-requests python-dateutil ruby jq 8 | 9 | -------------------------------------------------------------------------------- /workflows/pull-data/genbank/genbank-fetch-ids.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Find all genbank IDs 4 | # 5 | # genbank-fetch-ids.py --max 1000 --skip pubseq_ids.txt 6 | # 7 | # See also directory .guix-run and README.md 8 | 9 | BATCH_SIZE=5000 10 | 11 | import argparse 12 | import json 13 | import os 14 | import requests 15 | import sys 16 | import xml.etree.ElementTree as ET 17 | from datetime import date, datetime 18 | from dateutil.parser import parse 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--max', type=int, help='Max queries', required=False) 22 | parser.add_argument('--skip', type=str, help='File with ids to skip, 1 id per line', required=False) 23 | args = parser.parse_args() 24 | 25 | from Bio import Entrez 26 | Entrez.email = 'another_email@gmail.com' # FIXME 27 | 28 | # min_acceptable_collection_date = datetime(2019, 12, 1) 29 | 30 | today_date = date.today().strftime("%Y.%m.%d") 31 | 32 | skip = set() 33 | if args.skip: 34 | with open(args.skip) as f: 35 | content = f.readlines() 36 | for line in content: 37 | skip.add(line.strip()) 38 | 39 | print(f"Skip size is {len(skip)}",file=sys.stderr) 40 | 41 | # Try to search several strings 42 | TERMS = ['SARS-CoV-2', 'SARS-CoV2', 'SARS CoV2', 'SARSCoV2', 'txid2697049[Organism]'] 43 | # Remove mRNAs, ncRNAs, Proteins, and predicted models (more information here: https://en.wikipedia.org/wiki/RefSeq) starting with 44 | PREFIX = ['NM', 'NR', 'NP', 'XM', 'XR', 'XP', 'WP'] 45 | 46 | 47 | ids = set() 48 | for term in TERMS: 49 | num_read = BATCH_SIZE 50 | retstart = 0 51 | while num_read == BATCH_SIZE: 52 | record = Entrez.read( 53 | Entrez.esearch(db='nuccore', term=term, idtype='acc', 54 | retstart=retstart, retmax=BATCH_SIZE) 55 | ) 56 | idlist = record['IdList'] 57 | new_ids = set(idlist) 58 | num_read = len(new_ids) 59 | 
print(num_read,":",idlist[0] if idlist else "-",file=sys.stderr)  # guard: Entrez may return an empty batch 60 |         retstart += num_read 61 |         new_ids.difference_update(skip) # remove skip ids 62 |         new_ids = set([id for id in new_ids if id[:2] not in PREFIX]) 63 |         ids.update(new_ids) # add to total set 64 |         print(f"Term: {term} --> #{len(new_ids)} new IDs ---> Total unique IDs #{len(ids)}",file=sys.stderr) 65 |         if args.max and len(ids) > args.max: 66 |             print(f"Stopping past #{args.max} items",file=sys.stderr) 67 |             break 68 | 69 | for id in ids: 70 |     print(id) 71 | -------------------------------------------------------------------------------- /workflows/pull-data/genbank/transform-genbank-xml2yamlfa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Create a single YAML/FASTA for each genbank entry in GenBank XML file 4 | # 5 | #   transform-genbank-xml2yamlfa --out ~/tmp/pubseq file(s) 6 | # 7 | # Also writes a validation file in the outdir named state.json 8 | # ---------------------------------------------------------------------- 9 | 10 | # See also directory .guix-run and README.md 11 | 12 | import argparse 13 | import gzip 14 | import json 15 | import os 16 | import sys 17 | import types 18 | import xml.etree.ElementTree as ET 19 | from utils import chunks 20 | import genbank 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--out', type=str, help='Directory to write to', 24 |                     required=True) 25 | parser.add_argument('files', nargs='+', help='file(s)') 26 | args = parser.parse_args() 27 | 28 | dir = args.out 29 | if not os.path.exists(dir): 30 |     raise Exception(f"Directory {dir} does not exist") 31 | 32 | states = {} 33 | 34 | for xmlfn in args.files: 35 |     print(f"--- Reading {xmlfn}") 36 |     with gzip.open(xmlfn, 'r') as f: 37 |         xml = f.read().decode() 38 |     tree = ET.fromstring(xml) 39 |     for gb in tree.findall('./GBSeq'): 40 |         valid = None 41 |         error = None 42 |         meta = {} 43 |         id = gb.find("GBSeq_locus").text 44 |         basename = dir+"/"+id 45 |         print(f"    parsing {xmlfn} {id}") 46 |         try: 47 |             valid,meta = genbank.get_metadata(id,gb) 48 |             if valid: 49 |                 # --- write JSON 50 |                 jsonfn = basename + ".json" 51 |                 with open(jsonfn, 'w') as outfile: 52 |                     print(f"    writing {jsonfn}") 53 |                     json.dump(meta, outfile, indent=4) 54 |                 # --- write FASTA 55 |                 fa = basename+".fa" 56 |                 seq = genbank.get_sequence(id,gb) 57 |                 if seq: 58 |                     print(f"    writing {fa}") 59 |                     with open(fa,"w") as f2: 60 |                         f2.write(f">{id}\n")  # no space after '>': the ID is the FASTA record name 61 |                         f2.write(seq) 62 |                 else: 63 |                     valid = False 64 |         except genbank.GBError as e: 65 |             error = f"{e} for {id}" 66 |             print(error,file=sys.stderr) 67 |             valid = False 68 |         state = {} 69 |         state['valid'] = valid 70 |         if error: 71 |             state['error'] = error 72 |         if meta.get('warnings'):  # meta may still be empty when parsing failed early 73 |             state['warnings'] = meta['warnings'] 74 |         states[id] = state 75 | 76 | statefn = dir + '/state.json' 77 | with open(statefn, 'w') as outfile: 78 |     print(f"    Writing {statefn}") 79 |     json.dump(states, outfile, indent=4) 80 | -------------------------------------------------------------------------------- /workflows/pull-data/genbank/update-from-genbank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # 3 | # Bulk download GenBank data and metadata, preparing the FASTA and the 4 | # YAML files 5 | # 6 | #   update-from-genbank.py --max 10 --ids ids.txt --out ~/tmp/genbank-xml 7 | # 8 | # See also directory .guix-run and README.md 9 | 10 | import argparse 11 | import gzip 12 | import os 13 | import sys 14 | from utils import chunks 15 |
16 | from Bio import Entrez 17 | Entrez.email = 'another_email@gmail.com' # FIXME 18 | 19 | BATCH=100 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--max', type=int, help='Max queries', required=False) 23 | parser.add_argument('--ids', type=str, help='File with ids to fetch, 1 id per line', required=True) 24 | parser.add_argument('--out', type=str, help='Directory to write to', required=True) 25 | 26 | args = parser.parse_args() 27 | 28 | ids = set() 29 | with open(args.ids) as f: 30 | content = f.readlines() 31 | for line in content: 32 | ids.add(line.strip()) 33 | 34 | dir = args.out 35 | if not os.path.exists(dir): 36 | raise Exception(f"Directory {dir} does not exist") 37 | 38 | request_num = BATCH 39 | if args.max: 40 | request_num = min(BATCH,args.max) 41 | 42 | for i, idsx in enumerate(chunks(list(ids), request_num)): 43 | xmlfn = os.path.join(dir, f"metadata_{i}.xml.gz") 44 | if os.path.exists(xmlfn): 45 | print(f"Skipping {xmlfn} ({i*request_num})",file=sys.stderr) 46 | else: 47 | print(f"Fetching {xmlfn} ({i*request_num})",file=sys.stderr) 48 | with gzip.open(xmlfn, 'w') as f: 49 | f.write((Entrez.efetch(db='nuccore', id=idsx, retmode='xml').read()).encode()) 50 | if args.max and i*request_num >= args.max: 51 | break 52 | -------------------------------------------------------------------------------- /workflows/pull-data/genbank/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def is_integer(string_to_check): 5 | try: 6 | int(string_to_check) 7 | return True 8 | except ValueError: 9 | return False 10 | 11 | 12 | def chunks(lst, n): 13 | for i in range(0, len(lst), n): 14 | yield lst[i:i + n] 15 | 16 | 17 | def check_and_get_ontology_dictionaries(dir_ontology_dictionaries): 18 | """ 19 | Check duplicated entry by looking in all dictionaries 20 | """ 21 | 22 | field_to_term_to_uri_dict = {} 23 | 24 | path_dict_xxx_csv_list = [ 25 | os.path.join(dir_ontology_dictionaries, name_xxx_csv) for name_xxx_csv in 26 | os.listdir(dir_ontology_dictionaries) if name_xxx_csv.endswith('.csv') 27 | ] 28 | 29 | for path_dict_xxx_csv in path_dict_xxx_csv_list: 30 | print(f'Read {path_dict_xxx_csv}') 31 | 32 | with open(path_dict_xxx_csv) as f: 33 | for line in f: 34 | if len(line.split(',')) > 2: 35 | term, uri = line.strip('\n').split('",') 36 | else: 37 | term, uri = line.strip('\n').split(',') 38 | 39 | term = term.strip('"') 40 | 41 | if term in field_to_term_to_uri_dict: 42 | print(f'Warning: in the dictionaries there are more entries for the same term ({term}).') 43 | continue 44 | 45 | field_to_term_to_uri_dict[term] = uri 46 | 47 | # Prepare separated dictionaries (to avoid, for example, that a valid IRI for species is accepted as specimen) 48 | field_to_term_to_uri_dict = {} 49 | 50 | for path_dict_xxx_csv in path_dict_xxx_csv_list: 51 | field = os.path.basename(path_dict_xxx_csv).split('.')[0] 52 | 53 | field_to_term_to_uri_dict[field] = {} 54 | 55 | with open(path_dict_xxx_csv) as f: 56 | for line in f: 57 | if len(line.split(',')) > 2: 58 | term, uri = line.strip('\n').split('",') 59 | else: 60 | term, uri = line.strip('\n').split(',') 61 | 62 | term = term.strip('"') 63 | 64 | if term in field_to_term_to_uri_dict[field]: 65 | print(f'Warning: in the {field} dictionary there are more entries for the same term ({term}).') 66 | continue 67 | 68 | field_to_term_to_uri_dict[field][term] = uri 69 | 70 | return field_to_term_to_uri_dict 71 | 
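`check_and_get_ontology_dictionaries()` above returns one term-to-URI dictionary per CSV file, keyed by the file's basename. A small usage sketch follows, assuming it is run from the repository root with this directory on `PYTHONPATH` and pointed at the ontology CSVs shipped in `scripts/dict_ontology_standardization`; the lookup term is illustrative and may not appear verbatim in the dictionaries.

```python
#!/usr/bin/env python3
# Usage sketch for utils.check_and_get_ontology_dictionaries().
# Assumes the repo root as working directory (an assumption, not a rule).
from utils import check_and_get_ontology_dictionaries

field_to_term_to_uri_dict = check_and_get_ontology_dictionaries(
    "scripts/dict_ontology_standardization")

# One sub-dictionary per CSV, e.g. 'ncbi_host_species' -> {term: URI}
for field, term_to_uri in field_to_term_to_uri_dict.items():
    print(field, "->", len(term_to_uri), "terms")

# Guarded lookup inside one field's dictionary
species = field_to_term_to_uri_dict.get("ncbi_host_species", {})
print(species.get("Homo sapiens", "term not found"))
```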
-------------------------------------------------------------------------------- /workflows/update-workflows.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-2zp9q4jo5xpif9y fastq2fasta/fastq2fasta.cwl 3 | arvados-cwl-runner --project-uuid=lugli-j7d0g-5hswinmpyho8dju --update-workflow=lugli-7fd4e-mqfu9y3ofnpnho1 pangenome-generate/collect-seqs.cwl 4 | --------------------------------------------------------------------------------
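`update-workflows.sh` above hard-codes one `arvados-cwl-runner` call per registered workflow. If the list grows, a table-driven wrapper keeps the UUID-to-path mapping in one place; this is a sketch with the UUIDs copied from the shell script, not an Arvados API.

```python
#!/usr/bin/env python3
# Sketch: the same registrations as update-workflows.sh, driven from a
# table so adding a workflow means adding one row.
import subprocess

PROJECT = "lugli-j7d0g-5hswinmpyho8dju"
WORKFLOWS = [
    ("lugli-7fd4e-2zp9q4jo5xpif9y", "fastq2fasta/fastq2fasta.cwl"),
    ("lugli-7fd4e-mqfu9y3ofnpnho1", "pangenome-generate/collect-seqs.cwl"),
]

for uuid, path in WORKFLOWS:
    subprocess.run(
        ["arvados-cwl-runner",
         f"--project-uuid={PROJECT}",
         f"--update-workflow={uuid}",
         path],
        check=True,  # stop at the first failed registration
    )
```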