├── xml2json
│   ├── schema.pickle
│   ├── serialize_schema.py
│   ├── xml_dir2json.py
│   ├── xml2json.py
│   └── xml_dir2json_random.py
├── sra_metadata_libs
│   ├── __init__.py
│   └── bcolors.py
├── json
│   ├── README.md
│   ├── extract_runs.py
│   └── print_json_fields.py
├── LICENSE
├── json_examples
│   ├── README.md
│   └── json
│       ├── ERA
│       │   ├── ERA570
│       │   │   └── ERA570895.json
│       │   └── ERA693
│       │       └── ERA693801.json
│       └── SRA
│           ├── SRA490
│           │   └── SRA490640.json
│           ├── SRA268
│           │   └── SRA268165.json
│           ├── SRA889
│           │   └── SRA889255.json
│           ├── SRA609
│           │   └── SRA609343.json
│           ├── SRA563
│           │   └── SRA563707.json
│           ├── SRA575
│           │   └── SRA575213.json
│           └── SRA245
│               └── SRA245334.json
├── .gitignore
└── README.md

/xml2json/schema.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/SRA_Metadata/HEAD/xml2json/schema.pickle
--------------------------------------------------------------------------------
/sra_metadata_libs/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 
3 | """
4 | 
5 | 
6 | from .bcolors import bcolors
7 | 
8 | __all__ = [
9 |     'bcolors'
10 | ]
11 | 
--------------------------------------------------------------------------------
/json/README.md:
--------------------------------------------------------------------------------
1 | # Parse JSON Files
2 | 
3 | Each of these examples uses the JSON data in the [json examples](../json_examples) directory.
4 | 
5 | 
6 | To extract all the run IDs and accession IDs, you can use:
7 | 
8 | ```bash
9 | python3 json/extract_runs.py -d json_examples/
10 | ```
11 | 
12 | To take a look at the fields in a specific file, you can use:
13 | 
14 | ```bash
15 | python3 json/print_json_fields.py -f json_examples/json/SRA/SRA575/SRA575213.json | less
16 | ```
--------------------------------------------------------------------------------
/sra_metadata_libs/bcolors.py:
--------------------------------------------------------------------------------
1 | """
2 | Colors that you can import and make the text look pretty
3 | 
4 | Source: https://stackoverflow.com/questions/287871/print-in-terminal-with-colors
5 | """
6 | 
7 | __author__ = 'Rob Edwards'
8 | 
9 | 
10 | class bcolors(object):
11 |     HEADER = '\033[95m'
12 |     OKBLUE = '\033[94m'
13 |     OKGREEN = '\033[92m'
14 |     WARNING = '\033[93m'
15 |     FAIL = '\033[91m'
16 |     ENDC = '\033[0m'
17 |     BOLD = '\033[1m'
18 |     UNDERLINE = '\033[4m'
19 | 
20 |     PINK = '\033[95m'
21 |     BLUE = '\033[94m'
22 |     GREEN = '\033[92m'
23 |     YELLOW = '\033[93m'
24 |     RED = '\033[91m'
25 |     WHITE = '\033[0m'
26 | 
27 | 
28 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Rob Edwards
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/json_examples/README.md:
--------------------------------------------------------------------------------
1 | # JSON Examples
2 | 
3 | These are some example data from the August 2019 SRA metadata that were chosen at random. Quite literally!
4 | 
5 | I used this command:
6 | 
7 | ```bash
8 | for F in $(find . | grep json$ | sort -R | head); do mkdir -p json_examples/json/${F:2:3}/${F:2:6}; cp $F json_examples/json/${F:2:3}/${F:2:6}; done
9 | ```
10 | 
11 | (`sort -R` is a good command to have up your sleeve. The construct ${F:2:3} takes three characters of the string $F, starting at the third character, i.e. zero-based offset 2.)
12 | 
13 | These ten files represent a random selection of metadata, and hopefully will have some of the variation we see in the whole directory (but probably not all of it). They are good to use with the code in [the json directory](../json) to see what the contents of typical json files are.
14 | 
15 | Note that the directory structure mirrors that of the complete [metadata in JSON format](https://edwards.sdsu.edu/data/sra/current.tar.gz) we make available:
16 | 
17 | ```text
18 | json/
19 | ├── ERA
20 | │   ├── ERA570
21 | │   │   └── ERA570895.json
22 | │   └── ERA693
23 | │       └── ERA693801.json
24 | └── SRA
25 |     ├── SRA245
26 |     │   └── SRA245334.json
27 |     ├── SRA268
28 |     │   └── SRA268165.json
29 |     ├── SRA490
30 |     │   └── SRA490640.json
31 |     ├── SRA563
32 |     │   └── SRA563707.json
33 |     ├── SRA575
34 |     │   └── SRA575213.json
35 |     ├── SRA609
36 |     │   └── SRA609343.json
37 |     └── SRA889
38 |         └── SRA889255.json
39 | ```
40 | 
41 | We use this structure to reduce the number of files per directory and make commands like `ls` work!
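42 | 
43 | For a quick first look at one of these files, a few lines of Python will list its top-level sections. Here is a minimal sketch (the path is one of the examples listed above, run from the repository root):
44 | 
45 | ```python
46 | import json
47 | 
48 | # load one example submission and list its top-level sections
49 | with open("json_examples/json/ERA/ERA570/ERA570895.json") as fin:
50 |     data = json.load(fin)
51 | 
52 | # these files have SUBMISSION plus some of SAMPLE, EXPERIMENT, and RUN
53 | for key in sorted(data):
54 |     print(key)
55 | ```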
--------------------------------------------------------------------------------
/json/extract_runs.py:
--------------------------------------------------------------------------------
1 | """
2 | Read all the json files in a directory and extract the runs associated with each ID
3 | """
4 | 
5 | import json
6 | import os
7 | import sys
8 | import argparse
9 | 
10 | from sra_metadata_libs import bcolors
11 | 
12 | def extract_runs(jf, verbose=False):
13 |     """
14 |     Extract the run information
15 |     :param jf: The JSON file to parse
16 |     :param verbose: more information
17 |     :return: prints out the Submission @accession and the run
18 |     """
19 | 
20 |     with open(jf, 'r') as json_in:
21 |         data = json.load(json_in)
22 |         if 'SUBMISSION' in data and '@accession' in data['SUBMISSION']:
23 |             acc = data['SUBMISSION']['@accession']
24 |         else:
25 |             if verbose:
26 |                 sys.stderr.write(f"{bcolors.RED}No @accession found in {jf}{bcolors.ENDC}\n")
27 |             return
28 | 
29 |         if 'RUN' in data:
30 |             for r in data['RUN']:
31 |                 if 'IDENTIFIERS' in r and 'PRIMARY_ID' in r['IDENTIFIERS']:
32 |                     print(f"{acc}\t{r['IDENTIFIERS']['PRIMARY_ID']}")
33 |         elif verbose:
34 |             sys.stderr.write(f"{bcolors.PINK}No runs found in {acc}{bcolors.ENDC}\n")
35 | 
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     parser = argparse.ArgumentParser(description='Extract the run IDs and submission accessions from a directory of json files')
40 |     parser.add_argument('-d', help='Directory of json files', required=True)
41 |     parser.add_argument('-v', help='verbose output', action='store_true')
42 |     args = parser.parse_args()
43 | 
44 |     # walk the tree so this also works with the nested json/SRA/SRA889/ layout
45 |     for dirpath, dirnames, filenames in os.walk(args.d):
46 |         for j in filenames:
47 |             if not j.endswith('.json'):
48 |                 sys.stderr.write(f"{bcolors.WARNING}{j} does not end in .json and so we skipped it. Is this a json file?{bcolors.ENDC}\n")
49 |                 continue
50 |             extract_runs(os.path.join(dirpath, j), args.v)
51 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pycharm 107 | .idea 108 | -------------------------------------------------------------------------------- /json/print_json_fields.py: -------------------------------------------------------------------------------- 1 | """ 2 | Explore the contents of a JSON file. This code prints all the headings in the file, in a tree format 3 | so you can see the relationship between elements. There is a new line between root elements in the tree 4 | so you know which elements you can call directly. 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | import argparse 11 | 12 | from sra_metadata_libs import bcolors 13 | import json 14 | 15 | def print_str(s, l, verbose=False): 16 | """ 17 | Print the string at level l 18 | :param s: string to print 19 | :param l: level to print it 20 | :param verbose: more output 21 | :return: 22 | """ 23 | 24 | lo = l * '.' 
25 |     print(f"{lo} {s}")
26 | 
27 | def get_keys(js, l, verbose=False):
28 |     """
29 |     Get the keys at this level, and test for more dicts
30 |     :param js: the json object
31 |     :param l: the current level
32 |     :param verbose: more output
33 |     :return:
34 |     """
35 | 
36 |     for k in js:
37 |         print_str(k, l, verbose)
38 |         if isinstance(js[k], dict):
39 |             get_keys(js[k], l+1, verbose)
40 |         elif isinstance(js[k], list) and js[k] and isinstance(js[k][0], dict):
41 |             get_keys(js[k][0], l+1, verbose)
42 |         if l == 0:
43 |             print()
44 | 
45 | 
46 | 
47 | def print_json(jf, verbose=False):
48 |     """
49 |     Parse and print the fields
50 |     :param jf: JSON file to parse
51 |     :param verbose: more output
52 |     :return:
53 |     """
54 | 
55 |     if verbose:
56 |         sys.stderr.write(f"{bcolors.GREEN}Parsing {jf}{bcolors.ENDC}\n")
57 | 
58 |     with open(jf, 'r') as ji:
59 |         data = json.load(ji)
60 | 
61 |     get_keys(data, 0, verbose)
62 | 
63 | 
64 | 
65 | if __name__ == '__main__':
66 |     parser = argparse.ArgumentParser(description='Print the fields in a JSON file as a tree')
67 |     parser.add_argument('-f', help='JSON file to query', required=True)
68 |     parser.add_argument('-v', help='verbose output', action='store_true')
69 |     args = parser.parse_args()
70 | 
71 |     print_json(args.f, args.v)
72 | 
--------------------------------------------------------------------------------
/json_examples/json/ERA/ERA570/ERA570895.json:
--------------------------------------------------------------------------------
1 | {
2 |     "SAMPLE": [
3 |         {
4 |             "@alias": "SAMEA3889671",
5 |             "@accession": "ERS1076805",
6 |             "IDENTIFIERS": {
7 |                 "PRIMARY_ID": "ERS1076805",
8 |                 "EXTERNAL_ID": [
9 |                     {
10 |                         "@namespace": "BioSample",
11 |                         "$": "SAMEA3889671"
12 |                     }
13 |                 ]
14 |             },
15 |             "TITLE": "4ef65e50-c4e2-11e5-88b1-3c4a9275d6c6",
16 |             "SAMPLE_NAME": {
17 |                 "TAXON_ID": 4932,
18 |                 "SCIENTIFIC_NAME": "Saccharomyces cerevisiae"
19 |             },
20 |             "SAMPLE_ATTRIBUTES": {
21 |                 "SAMPLE_ATTRIBUTE": [
22 |                     {
23 |                         "TAG": "Alias",
24 |                         "VALUE": "4ef65e50-c4e2-11e5-88b1-3c4a9275d6c6"
25 |                     },
26 |                     {
27 |                         "TAG": "ENA checklist",
28 |                         "VALUE": "ERC000011"
29 |                     },
30 |                     {
31 |                         "TAG": "INSDC center name",
32 |                         "VALUE": "SC"
33 |                     },
34 |                     {
35 |                         "TAG": "INSDC first public",
36 |                         "VALUE": "2016-10-05T09:26:57Z"
37 |                     },
38 |                     {
39 |                         "TAG": "INSDC last update",
40 |                         "VALUE": "2016-03-08T15:17:43Z"
41 |                     },
42 |                     {
43 |                         "TAG": "INSDC status",
44 |                         "VALUE": "public"
45 |                     },
46 |                     {
47 |                         "TAG": "SRA accession",
48 |                         "VALUE": "ERS1076805"
49 |                     },
50 |                     {
51 |                         "TAG": "Sample Name",
52 |                         "VALUE": "ERS1076805"
53 |                     },
54 |                     {
55 |                         "TAG": "Title",
56 |                         "VALUE": "3858STDY6309587"
57 |                     },
58 |                     {
59 |                         "TAG": "sample_description",
60 |                         "VALUE": "unknown"
61 |                     },
62 |                     {
63 |                         "TAG": "strain",
64 |                         "VALUE": "unknown"
65 |                     }
66 |                 ]
67 |             }
68 |         }
69 |     ],
70 |     "SUBMISSION": {
71 |         "@broker_name": "",
72 |         "@alias": "3858STDY6309587-sc-2517133",
73 |         "@accession": "ERA570895",
74 |         "@lab_name": "European Nucleotide Archive"
75 |     }
76 | }
--------------------------------------------------------------------------------
/xml2json/serialize_schema.py:
--------------------------------------------------------------------------------
1 | """
2 | serialize the schema so we can time loading it. We need it in a quicker format.
3 | """
4 | 
5 | import os
6 | import sys
7 | import argparse
8 | import json
9 | from random import randint
10 | import time
11 | import pickle
12 | import xmlschema
13 | from sra_metadata_libs import bcolors
14 | __author__ = 'Rob Edwards'
15 | 
16 | def read_schemas(schemadir, verbose=True):
17 |     """
18 |     Read the XML Schema definition files, and return a dict of schema objects.
19 | :param schemadir: directory with all the schemas in it. 20 | :param verbose: more output 21 | :return: dict of schema objects 22 | """ 23 | 24 | # known XML Schemas 25 | schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd", 26 | "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd", 27 | "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd", 28 | "submission" : "SRA.submission.xsd"} 29 | 30 | 31 | schemas = {} 32 | if verbose: 33 | sys.stderr.write(f"Reading schemas\n") 34 | 35 | for s in schema_types: 36 | if verbose: 37 | sys.stderr.write(f"Schema parsing {s}\n") 38 | schemas[s] = xmlschema.XMLSchema(os.path.join(schemadir, schema_types[s])) 39 | 40 | if verbose: 41 | sys.stderr.write(f"Done reading schemas\n") 42 | 43 | return schemas 44 | 45 | 46 | def write_json(schemas, jsonfile, verbose=False): 47 | """ 48 | Write the json file 49 | """ 50 | 51 | if verbose: 52 | sys.stderr.write(f"Writing json file {jsonfile}\n") 53 | 54 | with open(jsonfile, 'w') as f: 55 | json.dump(dict(schemas), f) 56 | 57 | def read_json(jsonfile, verbose=False): 58 | """ 59 | Read the json file 60 | """ 61 | 62 | if verbose: 63 | sys.stderr.write(f"Reading json file {jsonfile}\n") 64 | 65 | with open(jsonfile, 'r') as f: 66 | schemas = json.load(f) 67 | 68 | return schemas 69 | 70 | 71 | def write_pickle(schemas, picklefile, verbose=False): 72 | """ 73 | Write the pickle file 74 | """ 75 | if verbose: 76 | sys.stderr.write(f"Writing pickle file {picklefile}\n") 77 | 78 | with open(picklefile, 'wb') as f: 79 | pickle.dump(schemas, f) 80 | 81 | 82 | def read_pickle(picklefile, verbose=False): 83 | """ 84 | Read the pickle file 85 | """ 86 | if verbose: 87 | sys.stderr.write(f"Reading pickle file {picklefile}\n") 88 | 89 | with open(picklefile, 'rb') as f: 90 | schemas = pickle.load(f) 91 | 92 | return schemas 93 | 94 | 95 | 96 | if __name__ == "__main__": 97 | parser = argparse.ArgumentParser(description=' ') 98 | parser.add_argument('-x', help='xml schema directory', required=True) 99 | parser.add_argument('-j', help='json output to write') 100 | parser.add_argument('-p', help='pickle to write') 101 | parser.add_argument('-v', help='verbose output', action='store_true') 102 | args = parser.parse_args() 103 | 104 | schemas = read_schemas(args.x, args.v) 105 | 106 | """ 107 | write_json(schemas, args.j, args.v) 108 | for i in range(5): 109 | start = time.time() 110 | s = read_json(args.j, args.v) 111 | end = time.time() 112 | print(f"JSON: {end - start}") 113 | """ 114 | 115 | write_pickle(schemas, args.p, args.v) 116 | pick = [] 117 | xml = [] 118 | for i in range(5): 119 | if randint(0,10) < 5: 120 | sys.stderr.write("PICKLE\n") 121 | start = time.time() 122 | s = read_pickle(args.p, False) 123 | end = time.time() 124 | pick.append(end - start) 125 | sys.stderr.write("XML\n") 126 | start = time.time() 127 | s = read_schemas(args.x, False) 128 | end = time.time() 129 | xml.append(end - start) 130 | else: 131 | sys.stderr.write("XML\n") 132 | start = time.time() 133 | s = read_schemas(args.x, False) 134 | end = time.time() 135 | xml.append(end - start) 136 | sys.stderr.write("PICKLE\n") 137 | start = time.time() 138 | s = read_pickle(args.p, False) 139 | end = time.time() 140 | pick.append(end - start) 141 | 142 | print(f"Pickle: {sum(pick)/len(pick)} XML: {sum(xml)/len(xml)}\n") 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA490/SRA490640.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "SAMPLE": [ 3 | { 4 | "@alias": "NWD511638", 5 | "@accession": "SRS1784730", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRS1784730", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "BioSample", 11 | "$": "SAMN05552233" 12 | }, 13 | { 14 | "@namespace": "dbGaP", 15 | "@label": "Sample name", 16 | "$": "951-NWD511638" 17 | }, 18 | { 19 | "@namespace": "phs000951", 20 | "@label": "submitted sample id", 21 | "$": "NWD511638" 22 | } 23 | ] 24 | }, 25 | "TITLE": "Non-tumor DNA sample from Blood of a human female participant in the dbGaP study \"NHLBI TOPMed: Genetic Epidemiology of COPD (COPDGene) in the TOPMed Program\"", 26 | "SAMPLE_NAME": { 27 | "TAXON_ID": 9606, 28 | "SCIENTIFIC_NAME": "Homo sapiens" 29 | }, 30 | "SAMPLE_ATTRIBUTES": { 31 | "SAMPLE_ATTRIBUTE": [ 32 | { 33 | "TAG": "gap_accession", 34 | "VALUE": "phs000951" 35 | }, 36 | { 37 | "TAG": "submitter handle", 38 | "VALUE": "NHLBI_COPDGene" 39 | }, 40 | { 41 | "TAG": "biospecimen repository", 42 | "VALUE": "NHLBI_COPDGene" 43 | }, 44 | { 45 | "TAG": "study name", 46 | "VALUE": "NHLBI TOPMed: Genetic Epidemiology of COPD (COPDGene) in the TOPMed Program" 47 | }, 48 | { 49 | "TAG": "study design", 50 | "VALUE": "Case-Control" 51 | }, 52 | { 53 | "TAG": "biospecimen repository sample id", 54 | "VALUE": "NWD511638" 55 | }, 56 | { 57 | "TAG": "submitted sample id", 58 | "VALUE": "NWD511638" 59 | }, 60 | { 61 | "TAG": "submitted subject id", 62 | "VALUE": "COPDGene_G31211" 63 | }, 64 | { 65 | "TAG": "gap_sample_id", 66 | "VALUE": "2064233" 67 | }, 68 | { 69 | "TAG": "gap_subject_id", 70 | "VALUE": "432226" 71 | }, 72 | { 73 | "TAG": "sex", 74 | "VALUE": "female" 75 | }, 76 | { 77 | "TAG": "body site", 78 | "VALUE": "Blood" 79 | }, 80 | { 81 | "TAG": "histological type", 82 | "VALUE": "Blood" 83 | }, 84 | { 85 | "TAG": "analyte type", 86 | "VALUE": "DNA" 87 | }, 88 | { 89 | "TAG": "is tumor", 90 | "VALUE": "No" 91 | }, 92 | { 93 | "TAG": "subject is affected", 94 | "VALUE": "No" 95 | }, 96 | { 97 | "TAG": "molecular data type", 98 | "VALUE": "SNP/CNV Genotypes (NGS)" 99 | }, 100 | { 101 | "TAG": "gap_consent_code", 102 | "VALUE": "1" 103 | }, 104 | { 105 | "TAG": "gap_consent_short_name", 106 | "VALUE": "HMB-MDS" 107 | } 108 | ] 109 | } 110 | } 111 | ], 112 | "SUBMISSION": { 113 | "@alias": "NWD511638.expt.submit", 114 | "@center_name": "Broad Institute", 115 | "@broker_name": "UM-SPH", 116 | "@lab_name": "Abecasis", 117 | "@submission_comment": "", 118 | "@accession": "SRA490640" 119 | } 120 | } -------------------------------------------------------------------------------- /xml2json/xml_dir2json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read a directory of XML files and convert the output to JSON. 3 | We write the JSON object to a file. By default, we do not 4 | overwrite existing files, but you can force that with the -o 5 | flag. 6 | """ 7 | 8 | import os 9 | import sys 10 | import argparse 11 | from sra_metadata_libs import bcolors 12 | import xmlschema 13 | from xmlschema.validators.exceptions import XMLSchemaValidationError 14 | import json 15 | from pprint import pprint 16 | __author__ = 'Rob Edwards' 17 | 18 | 19 | def validation_errors(sample, error, verbose=False): 20 | """ 21 | Log the validation error to a file. We append to the file. 
22 | 
23 |     :param sample: The sample ID
24 |     :param error: The python Error object
25 |     :param verbose: More output
26 |     """
27 | 
28 |     if verbose:
29 |         sys.stderr.write(f"{bcolors.PINK}Logging error for {sample}{bcolors.ENDC}\n")
30 | 
31 |     with open("XML_validation_errors.txt", "a") as out:
32 |         out.write(f"\n=== BEGIN {sample} ===\n")
33 |         out.write(str(error))
34 |         out.write(f"\n=== END {sample} ===\n")
35 | 
36 | def read_schemas(verbose=True):
37 |     """
38 |     Read the XML Schema definition files, and return a dict of schema objects.
39 |     :param verbose: more output
40 |     :return: dict of schema objects
41 |     """
42 | 
43 |     # known XML Schemas
44 |     schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd",
45 |                     "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd",
46 |                     "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd",
47 |                     "submission" : "SRA.submission.xsd"}
48 | 
49 | 
50 |     schemas = {}
51 | 
52 |     for s in schema_types:
53 |         if verbose:
54 |             sys.stderr.write(f"{bcolors.GREEN}Schema parsing{bcolors.ENDC} {s}\n")
55 |         schemas[s] = xmlschema.XMLSchema(os.path.join("Schemas", schema_types[s]))
56 | 
57 |     return schemas
58 | 
59 | def read_directory(basedir, subdir, schemas, verbose=False):
60 |     """
61 |     Read a directory and create a single dict for that directory
62 | 
63 |     :param basedir: The base directory of all the XML files
64 |     :param subdir: The sample directory with each of the individual XML files
65 |     :param schemas: The dictionary of XML Schema Definitions
66 |     :param verbose: more output
67 |     :return: a dict of all the data
68 |     """
69 | 
70 |     data = {}
71 |     for s in schemas:
72 |         sc = schemas[s]
73 |         if not os.path.exists(os.path.join(basedir, subdir, f"{subdir}.{s}.xml")):
74 |             if verbose and s not in ['analysis', 'common', 'package']:
75 |                 sys.stderr.write(f"{bcolors.RED}WARN: {basedir}/{subdir}/{subdir}.{s}.xml not found{bcolors.ENDC}\n")
76 |             continue
77 | 
78 |         try:
79 |             xm = sc.to_dict(os.path.join(basedir, subdir, f"{subdir}.{s}.xml"), decimal_type=str)
80 |         except XMLSchemaValidationError as e:
81 |             validation_errors(subdir, e, verbose)
82 |             continue
83 | 
84 |         # the submission document parses to the object itself; the others are nested one level down
85 |         if 'submission' == s:
86 |             data['SUBMISSION'] = xm
87 |         else:
88 |             data[s.upper()] = xm[s.upper()]
89 | 
90 |     return data
91 | 
92 | def write_json(xml, outfile, verbose=False):
93 |     """
94 |     Write the dictionary to a JSON file
95 | 
96 |     :param xml: The dictionary of the XML object
97 |     :param outfile: The file to write to
98 |     :param verbose: more output
99 |     """
100 | 
101 |     with open(outfile, 'w') as out:
102 |         out.write(json.dumps(xml, indent=4))
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     parser = argparse.ArgumentParser(description='Parse a directory or directories and create a json output for each one')
107 |     parser.add_argument('-d', help='directory to parse', required=True)
108 |     parser.add_argument('-o', help='where to put the json files.', required=True)
109 |     parser.add_argument('-f', help='force writing of the file, even if it exists', action='store_true')
110 |     parser.add_argument('-v', help='verbose output', action='store_true')
111 |     args = parser.parse_args()
112 | 
113 | 
114 |     # read all the known schemas
115 |     if not os.path.exists("Schemas"):
116 |         sys.stderr.write(f"{bcolors.RED}FATAL: Schemas/ directory with known xml schemas not found{bcolors.ENDC}\n")
117 |         sys.exit(-1)
118 |     schemas = read_schemas(True)
119 | 
120 |     if not os.path.exists(args.o):
121 |         os.mkdir(args.o)
122 | 
123 |     # read all the files in the
base directory 124 | for submission in os.listdir(args.d): 125 | outfile = os.path.join(args.o, f"{submission}.json") 126 | if (not args.f) and os.path.exists(outfile): 127 | continue 128 | 129 | if args.v: 130 | sys.stderr.write(f"{bcolors.GREEN}Parsing {bcolors.ENDC} {submission}\n") 131 | 132 | data = read_directory(args.d, submission, schemas, args.v) 133 | 134 | if args.v: 135 | sys.stderr.write(f"{bcolors.BLUE}Writing {bcolors.ENDC} {submission}\n") 136 | write_json(data, outfile, args.v) 137 | 138 | -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA268/SRA268165.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@alias": "2b-RAD data of Patinopecten yessoensis", 5 | "@accession": "SRX1027271", 6 | "@center_name": "Ocean University of China", 7 | "IDENTIFIERS": { 8 | "PRIMARY_ID": "SRX1027271", 9 | "SUBMITTER_ID": [ 10 | { 11 | "@namespace": "Ocean University of China", 12 | "$": "2b-RAD data of Patinopecten yessoensis" 13 | } 14 | ] 15 | }, 16 | "TITLE": "2b-RAD data from Genome Sequencing Project of Patinopecten yessoensis", 17 | "STUDY_REF": { 18 | "@accession": "SRP046829", 19 | "IDENTIFIERS": { 20 | "PRIMARY_ID": "SRP046829" 21 | } 22 | }, 23 | "DESIGN": { 24 | "DESIGN_DESCRIPTION": null, 25 | "SAMPLE_DESCRIPTOR": { 26 | "@accession": "SRS935919", 27 | "IDENTIFIERS": { 28 | "PRIMARY_ID": "SRS935919" 29 | } 30 | }, 31 | "LIBRARY_DESCRIPTOR": { 32 | "LIBRARY_NAME": null, 33 | "LIBRARY_STRATEGY": "OTHER", 34 | "LIBRARY_SOURCE": "GENOMIC", 35 | "LIBRARY_SELECTION": "PCR", 36 | "LIBRARY_LAYOUT": { 37 | "SINGLE": null 38 | } 39 | }, 40 | "SPOT_DESCRIPTOR": { 41 | "SPOT_DECODE_SPEC": { 42 | "SPOT_LENGTH": 36, 43 | "READ_SPEC": [ 44 | { 45 | "READ_INDEX": 0, 46 | "READ_CLASS": "Application Read", 47 | "READ_TYPE": "Forward", 48 | "BASE_COORD": 1 49 | } 50 | ] 51 | } 52 | } 53 | }, 54 | "PLATFORM": { 55 | "ILLUMINA": { 56 | "INSTRUMENT_MODEL": "Illumina HiSeq 2000" 57 | } 58 | } 59 | } 60 | ], 61 | "RUN": [ 62 | { 63 | "@accession": "SRR2027758", 64 | "@center_name": "Ocean University of China", 65 | "@alias": "2b-RAD data of Patinopecten yessoensis", 66 | "IDENTIFIERS": { 67 | "PRIMARY_ID": "SRR2027758", 68 | "SUBMITTER_ID": [ 69 | { 70 | "@namespace": "Ocean University of China", 71 | "$": "2b-RAD data of Patinopecten yessoensis" 72 | } 73 | ] 74 | }, 75 | "EXPERIMENT_REF": { 76 | "@accession": "SRX1027271" 77 | } 78 | } 79 | ], 80 | "SAMPLE": [ 81 | { 82 | "@alias": "2b-RAD data of Patinopecten yessoensis", 83 | "@accession": "SRS935919", 84 | "IDENTIFIERS": { 85 | "PRIMARY_ID": "SRS935919", 86 | "EXTERNAL_ID": [ 87 | { 88 | "@namespace": "BioSample", 89 | "$": "SAMN03657538" 90 | } 91 | ] 92 | }, 93 | "SAMPLE_NAME": { 94 | "TAXON_ID": 6573, 95 | "SCIENTIFIC_NAME": "Mizuhopecten yessoensis" 96 | }, 97 | "SAMPLE_ATTRIBUTES": { 98 | "SAMPLE_ATTRIBUTE": [ 99 | { 100 | "TAG": "strain", 101 | "VALUE": "cellular organisms; Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Protostomia; Lophotrochozoa; Mollusca; Bivalvia; Pteriomorphia; Pectinoida; Pectinoidea; Pectinidae; Mizuhopecten" 102 | }, 103 | { 104 | "TAG": "age", 105 | "VALUE": "one-year and two-year old" 106 | }, 107 | { 108 | "TAG": "sex", 109 | "VALUE": "not determined" 110 | }, 111 | { 112 | "TAG": "tissue", 113 | "VALUE": "striated muscle" 114 | }, 115 | { 116 | "TAG": "BioSampleModel", 117 | "VALUE": "Model organism or animal" 118 | } 119 | ] 120 | } 121 | } 122 | ], 123 | "SUBMISSION": { 124 
| "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", 125 | "@alias": "2b-RAD data of Patinopecten yessoensis", 126 | "@submission_comment": "2b-RAD data from Genome Sequencing Project of Patinopecten yessoensis", 127 | "@center_name": "Ocean University of China", 128 | "@lab_name": "Ministry of Education Key Laboratory of Marine Gen", 129 | "@accession": "SRA268165" 130 | } 131 | } -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA889/SRA889255.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@accession": "SRX5882665", 5 | "@alias": "PNUSAS076100:wgs", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRX5882665", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "EDLB-CDC", 11 | "$": "PNUSAS076100:wgs" 12 | } 13 | ] 14 | }, 15 | "TITLE": null, 16 | "STUDY_REF": { 17 | "@accession": "SRP040281", 18 | "IDENTIFIERS": { 19 | "PRIMARY_ID": "SRP040281", 20 | "EXTERNAL_ID": [ 21 | { 22 | "@namespace": "BioProject", 23 | "$": "PRJNA230403" 24 | } 25 | ] 26 | } 27 | }, 28 | "DESIGN": { 29 | "DESIGN_DESCRIPTION": null, 30 | "SAMPLE_DESCRIPTOR": { 31 | "@accession": "SRS4805223", 32 | "IDENTIFIERS": { 33 | "PRIMARY_ID": "SRS4805223", 34 | "EXTERNAL_ID": [ 35 | { 36 | "@namespace": "EDLB-CDC", 37 | "$": "PNUSAS076100" 38 | } 39 | ] 40 | } 41 | }, 42 | "LIBRARY_DESCRIPTOR": { 43 | "LIBRARY_NAME": "NexteraXT", 44 | "LIBRARY_STRATEGY": "WGS", 45 | "LIBRARY_SOURCE": "GENOMIC", 46 | "LIBRARY_SELECTION": "RANDOM", 47 | "LIBRARY_LAYOUT": { 48 | "PAIRED": null 49 | }, 50 | "LIBRARY_CONSTRUCTION_PROTOCOL": "NexteraXT" 51 | } 52 | }, 53 | "PLATFORM": { 54 | "ILLUMINA": { 55 | "INSTRUMENT_MODEL": "Illumina MiSeq" 56 | } 57 | } 58 | } 59 | ], 60 | "RUN": [ 61 | { 62 | "@accession": "SRR9108043", 63 | "@alias": "PNUSAS076100:wgs", 64 | "IDENTIFIERS": { 65 | "PRIMARY_ID": "SRR9108043", 66 | "EXTERNAL_ID": [ 67 | { 68 | "@namespace": "EDLB-CDC", 69 | "$": "PNUSAS076100:wgs" 70 | } 71 | ] 72 | }, 73 | "EXPERIMENT_REF": { 74 | "@accession": "SRX5882665", 75 | "IDENTIFIERS": { 76 | "EXTERNAL_ID": [ 77 | { 78 | "@namespace": "EDLB-CDC", 79 | "$": "PNUSAS076100:wgs" 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | ], 86 | "SAMPLE": [ 87 | { 88 | "@alias": "PNUSAS076100", 89 | "@accession": "SRS4805223", 90 | "IDENTIFIERS": { 91 | "PRIMARY_ID": "SRS4805223", 92 | "EXTERNAL_ID": [ 93 | { 94 | "@namespace": "BioSample", 95 | "$": "SAMN11822565" 96 | } 97 | ] 98 | }, 99 | "TITLE": "Salmonella enterica", 100 | "SAMPLE_NAME": { 101 | "TAXON_ID": 28901, 102 | "SCIENTIFIC_NAME": "Salmonella enterica" 103 | }, 104 | "SAMPLE_LINKS": { 105 | "SAMPLE_LINK": [ 106 | { 107 | "XREF_LINK": { 108 | "DB": "bioproject", 109 | "ID": "230403", 110 | "LABEL": "PRJNA230403" 111 | } 112 | } 113 | ] 114 | }, 115 | "SAMPLE_ATTRIBUTES": { 116 | "SAMPLE_ATTRIBUTE": [ 117 | { 118 | "TAG": "strain", 119 | "VALUE": "PNUSAS076100" 120 | }, 121 | { 122 | "TAG": "isolate", 123 | "VALUE": "Missing" 124 | }, 125 | { 126 | "TAG": "isolation_source", 127 | "VALUE": "missing" 128 | }, 129 | { 130 | "TAG": "collected_by", 131 | "VALUE": "CDC" 132 | }, 133 | { 134 | "TAG": "collection_date", 135 | "VALUE": "missing" 136 | }, 137 | { 138 | "TAG": "geo_loc_name", 139 | "VALUE": "USA" 140 | }, 141 | { 142 | "TAG": "lat_lon", 143 | "VALUE": "missing" 144 | }, 145 | { 146 | "TAG": "host", 147 | "VALUE": "missing" 148 | }, 149 | { 150 | "TAG": "host_disease", 151 | "VALUE": "missing" 152 | }, 153 | { 154 | "TAG": "BioSampleModel", 155 | "VALUE": 
"Pathogen.cl" 156 | } 157 | ] 158 | } 159 | } 160 | ], 161 | "SUBMISSION": { 162 | "@lab_name": "", 163 | "@center_name": "Pulsenet", 164 | "@accession": "SRA889255", 165 | "@alias": "SUB5658461" 166 | } 167 | } -------------------------------------------------------------------------------- /xml2json/xml2json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Process a single SRA directory and convert it to JSON. 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | import fcntl 9 | import pickle 10 | import xmlschema 11 | from xmlschema.validators.exceptions import XMLSchemaValidationError 12 | import json 13 | __author__ = 'Rob Edwards' 14 | 15 | 16 | def validation_errors(sample, error, verbose=False): 17 | """ 18 | Log the validation error to a file. We append to the file. 19 | 20 | :param sample: The sample ID 21 | :param error: The python Error object 22 | :param verbose: More output 23 | """ 24 | 25 | if verbose: 26 | sys.stderr.write(f"Logging error for {sample}\n") 27 | 28 | with open("XML_validation_errors.txt", "a") as out: 29 | # get an exclusive lock 30 | fcntl.flock(out, fcntl.LOCK_EX) 31 | out.write(f"\n=== BEGIN {sample} ===\n") 32 | out.write(str(error)) 33 | out.write(f"\n=== END {sample} ===\n") 34 | fcntl.flock(out, fcntl.LOCK_UN) 35 | 36 | def read_schema_pickle(picklefile, verbose=False): 37 | """ 38 | Read the schema from a pickle file 39 | """ 40 | if verbose: 41 | sys.stderr.write(f"Reading pickle file {picklefile}\n") 42 | 43 | with open(picklefile, 'rb') as f: 44 | schemas = pickle.load(f) 45 | 46 | return schemas 47 | 48 | def read_schemas(schemadir, verbose=True): 49 | """ 50 | Read the XML Schema defintion files, and return a dict of schema objects. 51 | :param schemadir: directory with all the schemas in it. 
52 |     :param verbose: more output
53 |     :return: dict of schema objects
54 |     """
55 | 
56 |     # known XML Schemas
57 |     schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd",
58 |                     "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd",
59 |                     "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd",
60 |                     "submission" : "SRA.submission.xsd"}
61 | 
62 | 
63 |     schemas = {}
64 | 
65 |     for s in schema_types:
66 |         if verbose:
67 |             sys.stderr.write(f"Schema parsing {s}\n")
68 |         schemas[s] = xmlschema.XMLSchema(os.path.join(schemadir, schema_types[s]))
69 | 
70 |     return schemas
71 | 
72 | def read_directory(basedir, sampleid, schemas, verbose=False):
73 |     """
74 |     Read a directory and create a single dict for that directory
75 | 
76 |     :param basedir: The base directory of all the XML files
77 |     :param sampleid: The sample directory with each of the individual XML files
78 |     :param schemas: The dictionary of XML Schema Definitions
79 |     :param verbose: more output
80 |     :return: a dict of all the data
81 |     """
82 | 
83 |     data = {}
84 |     for s in schemas:
85 |         sc = schemas[s]
86 |         if not os.path.exists(os.path.join(basedir, sampleid, f"{sampleid}.{s}.xml")):
87 |             if verbose and s not in ['analysis', 'common', 'package']:
88 |                 sys.stderr.write(f"WARN: {basedir}/{sampleid}/{sampleid}.{s}.xml not found\n")
89 |             continue
90 | 
91 |         try:
92 |             xm = sc.to_dict(os.path.join(basedir, sampleid, f"{sampleid}.{s}.xml"), decimal_type=str)
93 |         except XMLSchemaValidationError as e:
94 |             validation_errors(sampleid, e, verbose)
95 |             continue
96 | 
97 |         # the submission document parses to the object itself; the others are nested one level down
98 |         if 'submission' == s:
99 |             data['SUBMISSION'] = xm
100 |         else:
101 |             data[s.upper()] = xm[s.upper()]
102 | 
103 |     return data
104 | 
105 | def write_json(xml, outfile, verbose=False):
106 |     """
107 |     Write the dictionary to a JSON file
108 | 
109 |     :param xml: The dictionary of the XML object
110 |     :param outfile: The file to write to
111 |     :param verbose: more output
112 |     """
113 | 
114 |     with open(outfile, 'w') as out:
115 |         out.write(json.dumps(xml, indent=4))
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     parser = argparse.ArgumentParser(description='Parse a directory or directories and create a json output for each one')
120 |     parser.add_argument('-d', help='directory with the submission directories', required=True)
121 |     parser.add_argument('-x', help='Sample ID to parse', required=True)
122 |     parser.add_argument('-o', help='where to put the json files.', required=True)
123 |     parser.add_argument('-s', help='Schema directory')
124 |     parser.add_argument('-p', help='Schema pickle')
125 |     parser.add_argument('-f', help='force writing of the file, even if it exists', action='store_true')
126 |     parser.add_argument('-v', help='verbose output', action='store_true')
127 |     args = parser.parse_args()
128 | 
129 | 
130 |     if not args.s and not args.p:
131 |         sys.stderr.write("FATAL: Please provide a schema either as a directory or a pickle file\n")
132 |         sys.exit(-1)
133 | 
134 |     # read all the files in the base directory
135 |     outfile = os.path.join(args.o, f"{args.x}.json")
136 |     if (not args.f) and os.path.exists(outfile):
137 |         sys.exit(0)
138 | 
139 |     # read all the known schemas
140 |     schemas = None
141 |     if args.p:
142 |         schemas = read_schema_pickle(args.p, args.v)
143 | 
144 |     if args.s:
145 |         if not os.path.exists(args.s):
146 |             sys.stderr.write(f"FATAL: {args.s} directory with known xml schemas not found\n")
147 |             sys.exit(-1)
148 |         schemas = read_schemas(args.s, args.v)
149 | 
150 |     if not
schemas: 151 | sys.stderr.write("FATAL: Could not read your schemas\n") 152 | sys.exit(-1) 153 | 154 | 155 | if not os.path.exists(args.o): 156 | os.mkdir(args.o) 157 | 158 | if args.v: 159 | sys.stderr.write(f"Parsing {args.x}\n") 160 | 161 | data = read_directory(args.d, args.x, schemas, args.v) 162 | 163 | if args.v: 164 | sys.stderr.write(f"Writing {args.x}\n") 165 | write_json(data, outfile, args.v) 166 | 167 | -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA609/SRA609343.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@accession": "SRX3197638", 5 | "@alias": "PNUSAS022123:wgs", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRX3197638", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "EDLB-CDC", 11 | "$": "PNUSAS022123:wgs" 12 | } 13 | ] 14 | }, 15 | "TITLE": null, 16 | "STUDY_REF": { 17 | "@accession": "SRP040281", 18 | "IDENTIFIERS": { 19 | "PRIMARY_ID": "SRP040281", 20 | "EXTERNAL_ID": [ 21 | { 22 | "@namespace": "BioProject", 23 | "$": "PRJNA230403" 24 | } 25 | ] 26 | } 27 | }, 28 | "DESIGN": { 29 | "DESIGN_DESCRIPTION": null, 30 | "SAMPLE_DESCRIPTOR": { 31 | "@accession": "SRS2524717", 32 | "IDENTIFIERS": { 33 | "PRIMARY_ID": "SRS2524717", 34 | "EXTERNAL_ID": [ 35 | { 36 | "@namespace": "BioSample", 37 | "$": "SAMN07638965" 38 | } 39 | ] 40 | } 41 | }, 42 | "LIBRARY_DESCRIPTOR": { 43 | "LIBRARY_NAME": "NexteraXT", 44 | "LIBRARY_STRATEGY": "WGS", 45 | "LIBRARY_SOURCE": "GENOMIC", 46 | "LIBRARY_SELECTION": "RANDOM", 47 | "LIBRARY_LAYOUT": { 48 | "PAIRED": null 49 | }, 50 | "LIBRARY_CONSTRUCTION_PROTOCOL": "NexteraXT" 51 | } 52 | }, 53 | "PLATFORM": { 54 | "ILLUMINA": { 55 | "INSTRUMENT_MODEL": "Illumina MiSeq" 56 | } 57 | } 58 | } 59 | ], 60 | "RUN": [ 61 | { 62 | "@accession": "SRR6050671", 63 | "@alias": "PNUSAS022123:wgs", 64 | "IDENTIFIERS": { 65 | "PRIMARY_ID": "SRR6050671", 66 | "EXTERNAL_ID": [ 67 | { 68 | "@namespace": "EDLB-CDC", 69 | "$": "PNUSAS022123:wgs" 70 | } 71 | ] 72 | }, 73 | "EXPERIMENT_REF": { 74 | "@accession": "SRX3197638", 75 | "IDENTIFIERS": { 76 | "EXTERNAL_ID": [ 77 | { 78 | "@namespace": "EDLB-CDC", 79 | "$": "PNUSAS022123:wgs" 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | ], 86 | "SAMPLE": [ 87 | { 88 | "@alias": "PNUSAS022123", 89 | "@accession": "SRS2524717", 90 | "IDENTIFIERS": { 91 | "PRIMARY_ID": "SRS2524717", 92 | "EXTERNAL_ID": [ 93 | { 94 | "@namespace": "BioSample", 95 | "$": "SAMN07638965" 96 | } 97 | ] 98 | }, 99 | "SAMPLE_NAME": { 100 | "TAXON_ID": 436295, 101 | "SCIENTIFIC_NAME": "Salmonella enterica subsp. 
enterica serovar Poona" 102 | }, 103 | "SAMPLE_LINKS": { 104 | "SAMPLE_LINK": [ 105 | { 106 | "XREF_LINK": { 107 | "DB": "bioproject", 108 | "ID": "230403", 109 | "LABEL": "PRJNA230403" 110 | } 111 | } 112 | ] 113 | }, 114 | "SAMPLE_ATTRIBUTES": { 115 | "SAMPLE_ATTRIBUTE": [ 116 | { 117 | "TAG": "collection_date", 118 | "VALUE": "Jul-2017" 119 | }, 120 | { 121 | "TAG": "strain", 122 | "VALUE": "PNUSAS022123" 123 | }, 124 | { 125 | "TAG": "collected_by", 126 | "VALUE": "CDC" 127 | }, 128 | { 129 | "TAG": "serovar", 130 | "VALUE": "Poona" 131 | }, 132 | { 133 | "TAG": "lat_lon", 134 | "VALUE": "Missing" 135 | }, 136 | { 137 | "TAG": "geo_loc_name", 138 | "VALUE": "USA" 139 | }, 140 | { 141 | "TAG": "host", 142 | "VALUE": "Missing" 143 | }, 144 | { 145 | "TAG": "isolation_source", 146 | "VALUE": "urine" 147 | }, 148 | { 149 | "TAG": "HHS_region", 150 | "VALUE": "9" 151 | }, 152 | { 153 | "TAG": "host_age", 154 | "VALUE": "10-19" 155 | }, 156 | { 157 | "TAG": "host_disease", 158 | "VALUE": "Missing" 159 | }, 160 | { 161 | "TAG": "sub_species", 162 | "VALUE": "enterica" 163 | }, 164 | { 165 | "TAG": "BioSampleModel", 166 | "VALUE": "Pathogen.cl" 167 | } 168 | ] 169 | } 170 | } 171 | ], 172 | "SUBMISSION": { 173 | "@lab_name": "", 174 | "@center_name": "edlb-cdc", 175 | "@accession": "SRA609343", 176 | "@alias": "SUB3055691" 177 | } 178 | } -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA563/SRA563707.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@accession": "SRX2841104", 5 | "@alias": "367250", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRX2841104", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "PHE", 11 | "$": "367250" 12 | } 13 | ] 14 | }, 15 | "TITLE": null, 16 | "STUDY_REF": { 17 | "@accession": "SRP042645", 18 | "IDENTIFIERS": { 19 | "PRIMARY_ID": "SRP042645", 20 | "EXTERNAL_ID": [ 21 | { 22 | "@namespace": "BioProject", 23 | "$": "PRJNA248792" 24 | } 25 | ] 26 | } 27 | }, 28 | "DESIGN": { 29 | "DESIGN_DESCRIPTION": null, 30 | "SAMPLE_DESCRIPTOR": { 31 | "@accession": "SRS2214451", 32 | "IDENTIFIERS": { 33 | "PRIMARY_ID": "SRS2214451", 34 | "EXTERNAL_ID": [ 35 | { 36 | "@namespace": "PHE", 37 | "$": "367250.biosample" 38 | } 39 | ] 40 | } 41 | }, 42 | "LIBRARY_DESCRIPTOR": { 43 | "LIBRARY_NAME": "367250", 44 | "LIBRARY_STRATEGY": "WGS", 45 | "LIBRARY_SOURCE": "GENOMIC", 46 | "LIBRARY_SELECTION": "RANDOM", 47 | "LIBRARY_LAYOUT": { 48 | "PAIRED": null 49 | }, 50 | "LIBRARY_CONSTRUCTION_PROTOCOL": "Illumina Nextera XT" 51 | } 52 | }, 53 | "PLATFORM": { 54 | "ILLUMINA": { 55 | "INSTRUMENT_MODEL": "Illumina HiSeq 2500" 56 | } 57 | } 58 | } 59 | ], 60 | "RUN": [ 61 | { 62 | "@accession": "SRR5583191", 63 | "@alias": "367250", 64 | "IDENTIFIERS": { 65 | "PRIMARY_ID": "SRR5583191", 66 | "EXTERNAL_ID": [ 67 | { 68 | "@namespace": "PHE", 69 | "$": "367250" 70 | } 71 | ] 72 | }, 73 | "EXPERIMENT_REF": { 74 | "@accession": "SRX2841104", 75 | "IDENTIFIERS": { 76 | "EXTERNAL_ID": [ 77 | { 78 | "@namespace": "PHE", 79 | "$": "367250" 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | ], 86 | "SAMPLE": [ 87 | { 88 | "@alias": "367250.biosample", 89 | "@accession": "SRS2214451", 90 | "IDENTIFIERS": { 91 | "PRIMARY_ID": "SRS2214451", 92 | "EXTERNAL_ID": [ 93 | { 94 | "@namespace": "BioSample", 95 | "$": "SAMN07152381" 96 | } 97 | ] 98 | }, 99 | "TITLE": "Salmonella enterica enterica serovar Salmonella Typhimurium 367250", 100 | "SAMPLE_NAME": { 101 | "TAXON_ID": 59201, 102 | 
"SCIENTIFIC_NAME": "Salmonella enterica subsp. enterica" 103 | }, 104 | "SAMPLE_LINKS": { 105 | "SAMPLE_LINK": [ 106 | { 107 | "XREF_LINK": { 108 | "DB": "bioproject", 109 | "ID": "248792", 110 | "LABEL": "PRJNA248792" 111 | } 112 | } 113 | ] 114 | }, 115 | "SAMPLE_ATTRIBUTES": { 116 | "SAMPLE_ATTRIBUTE": [ 117 | { 118 | "TAG": "strain", 119 | "VALUE": "367250" 120 | }, 121 | { 122 | "TAG": "collected_by", 123 | "VALUE": "PHE" 124 | }, 125 | { 126 | "TAG": "collection_date", 127 | "VALUE": "Apr-2017" 128 | }, 129 | { 130 | "TAG": "isolation_source", 131 | "VALUE": "human" 132 | }, 133 | { 134 | "TAG": "geo_loc_name", 135 | "VALUE": "United Kingdom: United Kingdom" 136 | }, 137 | { 138 | "TAG": "lat_lon", 139 | "VALUE": "Missing" 140 | }, 141 | { 142 | "TAG": "serovar", 143 | "VALUE": "Salmonella Typhimurium" 144 | }, 145 | { 146 | "TAG": "isolate_name_alias", 147 | "VALUE": "367250" 148 | }, 149 | { 150 | "TAG": "sequence_type", 151 | "VALUE": "19" 152 | }, 153 | { 154 | "TAG": "sub_species", 155 | "VALUE": "enterica" 156 | }, 157 | { 158 | "TAG": "potential_contaminant", 159 | "VALUE": "None detected" 160 | }, 161 | { 162 | "TAG": "host", 163 | "VALUE": "Homo sapiens" 164 | }, 165 | { 166 | "TAG": "host_disease", 167 | "VALUE": "Not available" 168 | }, 169 | { 170 | "TAG": "BioSampleModel", 171 | "VALUE": "Pathogen.cl" 172 | } 173 | ] 174 | } 175 | } 176 | ], 177 | "SUBMISSION": { 178 | "@lab_name": "", 179 | "@center_name": "PHE", 180 | "@accession": "SRA563707", 181 | "@alias": "SUB2709487" 182 | } 183 | } -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA245/SRA245334.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@alias": "1 Themisto libellula", 5 | "@accession": "SRX895485", 6 | "@center_name": "McGill University", 7 | "IDENTIFIERS": { 8 | "PRIMARY_ID": "SRX895485", 9 | "SUBMITTER_ID": [ 10 | { 11 | "@namespace": "McGill University", 12 | "$": "1 Themisto libellula" 13 | } 14 | ] 15 | }, 16 | "TITLE": "1 Themisto libellula V4 18S", 17 | "STUDY_REF": { 18 | "@accession": "SRP055766", 19 | "IDENTIFIERS": { 20 | "PRIMARY_ID": "SRP055766", 21 | "EXTERNAL_ID": [ 22 | { 23 | "@namespace": "SubPortal", 24 | "$": "SUB832829" 25 | } 26 | ] 27 | } 28 | }, 29 | "DESIGN": { 30 | "DESIGN_DESCRIPTION": "A single individual of the species Themisto libellula was PCR amplified at the V4 region of 18S, using barcoded primers, and 454 sequenced along with PCR products of 19 other species.", 31 | "SAMPLE_DESCRIPTOR": { 32 | "@accession": "SRS861262", 33 | "IDENTIFIERS": { 34 | "PRIMARY_ID": "SRS861262", 35 | "EXTERNAL_ID": [ 36 | { 37 | "@namespace": "BioSample", 38 | "$": "SAMN03382336" 39 | } 40 | ] 41 | } 42 | }, 43 | "LIBRARY_DESCRIPTOR": { 44 | "LIBRARY_NAME": "1 Themisto libellula V4 18S", 45 | "LIBRARY_STRATEGY": "AMPLICON", 46 | "LIBRARY_SOURCE": "METAGENOMIC", 47 | "LIBRARY_SELECTION": "PCR", 48 | "LIBRARY_LAYOUT": { 49 | "SINGLE": null 50 | } 51 | }, 52 | "SPOT_DESCRIPTOR": { 53 | "SPOT_DECODE_SPEC": { 54 | "SPOT_LENGTH": 0, 55 | "READ_SPEC": [ 56 | { 57 | "READ_INDEX": 0, 58 | "READ_CLASS": "Technical Read", 59 | "READ_TYPE": "Adapter", 60 | "BASE_COORD": 1 61 | }, 62 | { 63 | "READ_INDEX": 1, 64 | "READ_CLASS": "Application Read", 65 | "READ_TYPE": "Forward", 66 | "BASE_COORD": 5 67 | } 68 | ] 69 | } 70 | } 71 | }, 72 | "PLATFORM": { 73 | "LS454": { 74 | "INSTRUMENT_MODEL": "454 GS FLX Titanium" 75 | } 76 | }, 77 | "PROCESSING": null 78 | } 79 | ], 80 | 
"RUN": [ 81 | { 82 | "@alias": "1 Themisto libellula V4 18S", 83 | "@accession": "SRR1823977", 84 | "@center_name": "McGill University", 85 | "IDENTIFIERS": { 86 | "PRIMARY_ID": "SRR1823977", 87 | "SUBMITTER_ID": [ 88 | { 89 | "@namespace": "McGill University", 90 | "$": "1 Themisto libellula V4 18S" 91 | } 92 | ] 93 | }, 94 | "EXPERIMENT_REF": { 95 | "@accession": "SRX895485" 96 | } 97 | } 98 | ], 99 | "SAMPLE": [ 100 | { 101 | "@center_name": "McGill University", 102 | "@alias": "1 Themisto libellula", 103 | "@accession": "SRS861262", 104 | "IDENTIFIERS": { 105 | "PRIMARY_ID": "SRS861262", 106 | "EXTERNAL_ID": [ 107 | { 108 | "@namespace": "BioSample", 109 | "$": "SAMN03382336" 110 | } 111 | ] 112 | }, 113 | "TITLE": "1 Themisto libellula", 114 | "SAMPLE_NAME": { 115 | "TAXON_ID": 1169740, 116 | "SCIENTIFIC_NAME": "aquatic metagenome" 117 | }, 118 | "DESCRIPTION": "An individual Themisto libellula was DNA extracted, amplified with a barcoded primer at the V4 region of 18S, and 454 sequenced with 19 other species.", 119 | "SAMPLE_LINKS": { 120 | "SAMPLE_LINK": [ 121 | { 122 | "XREF_LINK": { 123 | "DB": "bioproject", 124 | "ID": "277040", 125 | "LABEL": "PRJNA277040" 126 | } 127 | } 128 | ] 129 | }, 130 | "SAMPLE_ATTRIBUTES": { 131 | "SAMPLE_ATTRIBUTE": [ 132 | { 133 | "TAG": "collection_date", 134 | "VALUE": "missing" 135 | }, 136 | { 137 | "TAG": "env_biome", 138 | "VALUE": "Aquatic" 139 | }, 140 | { 141 | "TAG": "env_feature", 142 | "VALUE": "Aquatic" 143 | }, 144 | { 145 | "TAG": "env_material", 146 | "VALUE": "Water" 147 | }, 148 | { 149 | "TAG": "geo_loc_name", 150 | "VALUE": "USA: Chukchi Sea, Alaska" 151 | }, 152 | { 153 | "TAG": "lat_lon", 154 | "VALUE": "missing" 155 | }, 156 | { 157 | "TAG": "BioSampleModel", 158 | "VALUE": "MIMARKS.survey" 159 | }, 160 | { 161 | "TAG": "BioSampleModel", 162 | "VALUE": "MIGS/MIMS/MIMARKS.miscellaneous" 163 | } 164 | ] 165 | } 166 | } 167 | ], 168 | "SUBMISSION": { 169 | "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", 170 | "@alias": "1 Themisto libellula", 171 | "@lab_name": "Cristescu lab", 172 | "@center_name": "McGill University", 173 | "@accession": "SRA245334" 174 | } 175 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Edwards Lab](https://img.shields.io/badge/Bioinformatics-EdwardsLab-03A9F4)](https://edwards.flinders.edu.au) 2 | # SRA_Metadata 3 | Get, parse, and extract information from the SRA metadata files 4 | 5 | ## About the SRA metadata 6 | 7 | The SRA contains over 1.5 million samples, and each sample contains lots of runs. The metadata is really key to understanding that data, but the metadata is difficult to organize and understand. Here we collate the metadata information available from the SRA to make it easier to search and find things. 8 | 9 | 10 | ## See also 11 | 12 | You might also look at our [collection of blog posts](https://edwards.flinders.edu.au/sra) about the SRA that explain the organization of the SRA data, and provide alternate mechanisms to download the data, and so on. 13 | 14 | # Downloading the SRA metadata 15 | 16 | There are several components to the SRA data that we are going to download. 17 | 18 | ## SRA_Accessions.tab 19 | 20 | This tab separated file can be downloaded directly from the NCBI: [ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab](ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab). 
39 | 
40 | This file lists all the submissions to the SRA, and lists every accession number associated with each submission. It tells you the status of the datasets.
41 | 
42 | It contains the following columns:
43 | 
44 | * Accession
45 | * Submission
46 | * Status
47 | * Updated
48 | * Published
49 | * Received
50 | * Type
51 | * Center
52 | * Visibility
53 | * Alias
54 | * Experiment
55 | * Sample
56 | * Study
57 | * Loaded
58 | * Spots
59 | * Bases
60 | * Md5sum
61 | * BioSample
62 | * BioProject
63 | * ReplacedBy
64 | 
65 | The key columns here are Accession, Submission, and Status.
66 | 
67 | The data in this file is replicated. A single submission may occur multiple times, represented once for each of the accessions associated with it.
68 | 
69 | At the time of writing there were 27,838,771 entries (lines) in that file. However, there are only 1,413,223 unique submission IDs.
70 | 
71 | From those 1,413,223 unique submission IDs, the `Status` field reports
72 | 
73 | * 1,290,528 live
74 | * 161,652 suppressed
75 | * 92,103 unpublished
76 | * 10 withdrawn
77 | 
78 | (These numbers don't quite add up because there are some projects where the project may be live, but the runs or other parts of the data release may be suppressed or unpublished.)
79 | 
80 | # XML Metadata
81 | 
82 | The XML metadata is available for download from [ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/](ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/). There are daily files, and then once per month, or so, there is a complete release.
83 | 
84 | For example, this file was downloaded:
85 | 
86 | ```bash
87 | curl -o NCBI_SRA_Metadata_Full.tar.gz ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_Full_20180205.tar.gz
88 | ```
89 | 
90 | When you extract these files, you will get 1,000,000+ directories! Each directory is a single submission, and contains several files describing the data. I extract these using a command like:
91 | 
92 | ```bash
93 | mkdir xml
94 | tar -C xml/ -zxf NCBI_SRA_Metadata_Full.tar.gz
95 | ```
96 | 
97 | There are several [XML Schema Definition files](https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=xml_schemas) that define the data sets. Currently, there are `.xsd` files for:
98 | 
99 | 1. SRA Common
100 | 2. SRA Submission
101 | 3. SRA Study
102 | 4. SRA Sample
103 | 5. SRA Experiment
104 | 6. SRA Run
105 | 7. SRA Analysis
106 | 
107 | # Converting the XML files to JSON
108 | 
109 | We batch process the XML files and convert them to JSON, using [a Python script](xml2json/xml_dir2json_random.py). This code uses the XML Schema Definition files to validate the XML files, and then dumps a single file per submission in JSON format.
110 | 
111 | This version chooses a file at random from the XML directory, checks to see if it has already been processed, and if not, it processes it. This allows us to run the code in parallel (using the awesome [GNU parallel](https://www.gnu.org/software/parallel/)) and process lots of XML files all at once. For example, to process this code using 30 different processors, we can do:
112 | 
113 | ```bash
114 | echo "xml_dir2json_random.py -s $HOME/SRA/SRAdb/XML/Schemas/ -d xml -o json -m srr_sra_ids.tsv" > ./run_xml.sh
115 | chmod +x ./run_xml.sh
116 | seq 1 30 | parallel ./run_xml.sh {}
117 | ```
118 | 
119 | This command creates a directory called `json` with three subdirectories, one each for `SRA`, `ERA`, `DRA`. Within those three directories, there are directories for each submission, grouped by the first three numbers of the accession.
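120 | 
121 | To find the JSON file for a given accession in this layout you only need string slices. Here is a minimal sketch (the `json_path` helper name is just for illustration):
122 | 
123 | ```python
124 | import os
125 | 
126 | def json_path(acc, base="json"):
127 |     # e.g. SRA889255 -> json/SRA/SRA889/SRA889255.json
128 |     return os.path.join(base, acc[0:3], acc[0:6], f"{acc}.json")
129 | 
130 | print(json_path("SRA889255"))
131 | ```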
132 | 
133 | We use this structure because (a) it mirrors the structure at NCBI and elsewhere, and (b) breaking up the files into multiple subdirectories is much better for your filesystem. There are over 1,000,000 files, and so it takes commands like `ls` a long time to read the [inodes](http://www.grymoire.com/Unix/Inodes.html). By splitting the files out, we can more readily access and process them.
134 | 
135 | 
136 | > *Tip:* If you have an SRA ID such as `SRA889255`, you can set `SRR=SRA889255` and access the appropriate file with, for example, `ls json/${SRR:0:3}/${SRR:0:6}/$SRR.json`.
137 | 
138 | This command also creates an *id mapping* file called `srr_sra_ids.tsv` that has two columns, the SRA submission ID (or ERA/DRA ID) and the SRA Run ID. The most common association we are looking for is from SRR -> SRA. For example, we usually know the SRR IDs associated with a sequence run, and would like to explore the metadata associated with that run. Alternatively, we know a sample and would like to get the DNA sequences associated with it. This mapping provides that connection, and you can quickly look for either a run or a submission using `grep`.
139 | 
140 | In addition, we create a file called `XML_validation_errors.txt` that reports any improper XML data that does not match the XML Schema Definitions.
141 | 
142 | We now have a directory with all the metadata as json objects that you can analyze in different ways.
143 | 
144 | # JSON
145 | 
146 | We have some [JSON](json/) parsing code to help you explore the data. Before you begin, however, take a look at the [json_examples](json_examples/) data directory. These are ten samples chosen completely at random from the August 2019 metadata to demonstrate the organization of the metadata there.
147 | 
148 | I also recommend using [jq](https://stedolan.github.io/jq/) for processing the data on the command line.
149 | 
150 | Here are a couple of examples from our [partie](https://github.com/linsalrob/partie) analysis of SRA datasets.
151 | 
152 | First, find all the submissions that are metagenomes or microbiomes using grep. You could also do this with the XML files; there is nothing json-specific about this grep.
153 | 
154 | ```bash
155 | egrep -rli 'metagenome|microbiome' json | perl -pe 's#json/##; s#.json##' > metagenomes.txt
156 | ```
157 | 
158 | We now have a file called `metagenomes.txt` that has one SRA submission per line; somewhere in each of those files the word `metagenome` or `microbiome` appears.
159 | 
160 | Now we can use [jq](https://stedolan.github.io/jq/) to extract just the run identifiers from these files:
161 | 
162 | ```bash
163 | cat metagenomes.txt | xargs -i jq -r "try .RUN[].IDENTIFIERS.PRIMARY_ID" json/{}.json > metagenome_runs.txt
164 | ```
165 | 
166 | In this command, we cat the file of IDs, and for each file, we use `jq` to parse the json data. We look for any `RUN` and from that pull the `IDENTIFIERS` entry, and then the `PRIMARY_ID` for that run. This prints out one `PRIMARY_ID` per line. The `try` in that command is a jq option that is basic error handling. We could add both a `try` and a `catch`, and use that to report on any JSON files that do not have a RUN associated with them; however, at the moment we don't care about those ... we just ignore them!
167 | 
168 | I don't know how to succinctly parse the XML to get this information (though you could probably do it with `grep`).
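169 | 
170 | If you would rather stay in Python, here is a rough equivalent of that jq pipeline; like the `try`, it silently skips entries without a `RUN` (a minimal sketch, assuming the `metagenomes.txt` produced by the grep above):
171 | 
172 | ```python
173 | import json
174 | 
175 | # print RUN -> IDENTIFIERS -> PRIMARY_ID for every submission in the list
176 | with open("metagenomes.txt") as ids:
177 |     for acc in (line.strip() for line in ids):
178 |         with open(f"json/{acc}.json") as fin:
179 |             data = json.load(fin)
180 |         for run in data.get("RUN", []):
181 |             pid = run.get("IDENTIFIERS", {}).get("PRIMARY_ID")
182 |             if pid:
183 |                 print(pid)
184 | ```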
185 | 
186 | 
187 | 
--------------------------------------------------------------------------------
/xml2json/xml_dir2json_random.py:
--------------------------------------------------------------------------------
1 | """
2 | Read a directory of XML files and convert the output to JSON.
3 | We write the JSON object to a file. By default, we do not
4 | overwrite existing files, but you can force that with the -f
5 | flag.
6 | """
7 | 
8 | import os
9 | import sys
10 | import argparse
11 | from sra_metadata_libs import bcolors
12 | import xmlschema
13 | from xmlschema.validators.exceptions import XMLSchemaValidationError
14 | import json
15 | import random
16 | import errno
17 | import time
18 | import fcntl
19 | from pprint import pprint
20 | __author__ = 'Rob Edwards'
21 | 
22 | 
23 | def validation_errors(sample, error, verbose=False):
24 |     """
25 |     Log the validation error to a file. We append to the file.
26 | 
27 |     :param sample: The sample ID
28 |     :param error: The python Error object
29 |     :param verbose: More output
30 |     """
31 | 
32 |     if verbose:
33 |         sys.stderr.write(f"{bcolors.PINK}Logging error for {sample}{bcolors.ENDC}\n")
34 | 
35 |     out = open("XML_validation_errors.txt", "a")
36 |     while True:
37 |         try:
38 |             fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
39 |             break
40 |         except IOError as e:
41 |             # raise on unrelated IOErrors
42 |             if e.errno != errno.EAGAIN:
43 |                 raise
44 |             else:
45 |                 time.sleep(0.1)
46 | 
47 |     out.write(f"\n=== BEGIN {sample} ===\n")
48 |     out.write(str(error))
49 |     out.write(f"\n=== END {sample} ===\n")
50 |     fcntl.flock(out, fcntl.LOCK_UN)
51 |     out.close()
52 | 
53 | def write_id_map(data, imf, verbose=False):
54 |     """
55 |     Write an ID mapping file that has the SRA submission ID and the SRR Run ID
56 |     :param data: The JSON data object
57 |     :param imf: The id mapping file to write to
58 |     :param verbose: more output
59 |     :return:
60 |     """
61 | 
62 |     out = open(imf, "a")
63 |     while True:
64 |         try:
65 |             fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
66 |             break
67 |         except IOError as e:
68 |             # raise on unrelated IOErrors
69 |             if e.errno != errno.EAGAIN:
70 |                 raise
71 |             else:
72 |                 time.sleep(0.1)
73 |     acc = None
74 |     if 'SUBMISSION' in data and '@accession' in data['SUBMISSION']:
75 |         acc = data['SUBMISSION']['@accession']
76 |         if verbose:
77 |             sys.stderr.write(f"{bcolors.GREEN}ACCESSION: {acc}. Writing runs{bcolors.ENDC}\n")
78 |     else:
79 |         # note that we now test for this earlier, so shouldn't really get here!
80 |         sys.stderr.write(f"{bcolors.RED}FATAL. NO @accession in {data}{bcolors.ENDC}")
81 |         sys.exit(-1)
82 | 
83 |     if 'RUN' in data:
84 |         for run in data['RUN']:
85 |             if 'PRIMARY_ID' in run['IDENTIFIERS']:
86 |                 if verbose:
87 |                     sys.stderr.write(f"\t{bcolors.GREEN}{run['IDENTIFIERS']['PRIMARY_ID']}{bcolors.ENDC}\n")
88 |                 out.write(f"{acc}\t{run['IDENTIFIERS']['PRIMARY_ID']}\n")
89 |     else:
90 |         if verbose:
91 |             sys.stderr.write(f"{bcolors.PINK}No RUN found in {acc}{bcolors.ENDC}\n")
92 |     fcntl.flock(out, fcntl.LOCK_UN)
93 |     out.close()
94 | 
95 | def read_schemas(schemadir, verbose=True):
96 |     """
97 |     Read the XML Schema definition files, and return a dict of schema objects.
98 | :param verbose: more output 99 | :return: dict of schema objects 100 | """ 101 | 102 | # known XML Schemas 103 | schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd", 104 | "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd", 105 | "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd", 106 | "submission" : "SRA.submission.xsd"} 107 | 108 | 109 | schemas = {} 110 | 111 | for s in schema_types: 112 | if verbose: 113 | sys.stderr.write(f"{bcolors.GREEN}Schema parsing{bcolors.ENDC} {s}\n") 114 | schemas[s] = xmlschema.XMLSchema(os.path.join(schemadir, schema_types[s])) 115 | 116 | return schemas 117 | 118 | def read_directory(basedir, subdir, schemas, verbose=False): 119 | """ 120 | Read a directory and create a single dict for that directory 121 | 122 | :param basedir: The base directory of all the XML files 123 | :param subdir: The sample directory with each of the individual XML files 124 | :param schemas: The dictionary of XML Schema Definitions 125 | :param verbose: more output 126 | :return: a dict of all the data 127 | """ 128 | 129 | data = {} 130 | for s in schemas: 131 | sc = schemas[s] 132 | if not os.path.exists(os.path.join(basedir, subdir, f"{subdir}.{s}.xml")): 133 | if verbose and s not in ['analysis', 'common', 'package']: 134 | sys.stderr.write(f"{bcolors.RED}WARN: {basedir}/{subdir}/{subdir}.{s}.xml not found{bcolors.ENDC}\n") 135 | continue 136 | 137 | try: 138 | xm = sc.to_dict(os.path.join(basedir, subdir, f"{subdir}.{s}.xml"), decimal_type=str) 139 | except XMLSchemaValidationError as e: 140 | validation_errors(subdir, e, verbose) 141 | continue 142 | 143 | # the submission XML is the object itself; the other document types are wrapped in a container element that we unwrap here 144 | if s == 'submission': 145 | data['SUBMISSION'] = xm 146 | else: 147 | data[s.upper()] = xm[s.upper()] 148 | 149 | return data 150 | 151 | def write_json(xml, outfile, verbose=False): 152 | """ 153 | Write the dictionary to a JSON file 154 | 155 | :param xml: The dictionary of the XML object 156 | :param outfile: The file to write to 157 | :param verbose: more output 158 | """ 159 | 160 | with open(outfile, 'w') as out: 161 | out.write(json.dumps(xml, indent=4)) 162 | 163 | 164 | if __name__ == "__main__": 165 | parser = argparse.ArgumentParser(description='Parse a directory or directories and create a json output for each one') 166 | parser.add_argument('-d', help='directory to parse', required=True) 167 | parser.add_argument('-o', help='where to put the json files', required=True) 168 | parser.add_argument('-s', help='Schema directory', required=True) 169 | parser.add_argument('-f', help='force writing of the file, even if it exists', action='store_true') 170 | parser.add_argument('-m', help="Run ID to Submission ID mapping file (default=srr_sra_ids.tsv)", default="srr_sra_ids.tsv") 171 | parser.add_argument('-v', help='verbose output', action='store_true') 172 | args = parser.parse_args() 173 | 174 | 175 | # read all the known schemas 176 | if not os.path.exists(args.s): 177 | sys.stderr.write(f"{bcolors.RED}FATAL: Schemas/ directory with known xml schemas not found{bcolors.ENDC}\n") 178 | sys.exit(-1) 179 | schemas = read_schemas(args.s, args.v) 180 | 181 | if not os.path.exists(args.o): 182 | os.mkdir(args.o) 183 | 184 | # read all the files in the base directory and randomize the order.
This is so 185 | # we can run multiple instances in parallel 186 | fs = os.listdir(args.d) 187 | random.shuffle(fs) 188 | for submission in fs: 189 | outpath = os.path.join(args.o, submission[0:3], submission[0:6]) 190 | if not os.path.exists(outpath): 191 | os.makedirs(outpath, exist_ok=True) 192 | outfile = os.path.join(outpath, f"{submission}.json") 193 | if (not args.f) and os.path.exists(outfile): 194 | continue 195 | # we create the semaphore file that is empty so we don't try and do this twice 196 | with open(outfile, 'w') as w: 197 | w.write("") 198 | 199 | 200 | if args.v: 201 | sys.stderr.write(f"{bcolors.GREEN}Parsing {bcolors.ENDC} {submission}\n") 202 | 203 | data = read_directory(args.d, submission, schemas, args.v) 204 | 205 | if 'SUBMISSION' not in data: 206 | sys.stderr.write(f"No SUBMISSION read for {submission}. Skipped\n") 207 | continue 208 | elif '@accession' not in data['SUBMISSION']: 209 | sys.stderr.write(f"No @accession in the SUBMISSION for {submission}. Skipped\n") 210 | continue 211 | 212 | write_id_map(data, args.m, args.v) 213 | 214 | if args.v: 215 | sys.stderr.write(f"{bcolors.BLUE}Writing {bcolors.ENDC} {submission}\n") 216 | write_json(data, outfile, args.v) 217 | 218 | -------------------------------------------------------------------------------- /json_examples/json/ERA/ERA693/ERA693801.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@alias": "EXP-68-1-2016-08-23_13-12-12", 5 | "@accession": "ERX1667430", 6 | "@broker_name": "", 7 | "IDENTIFIERS": { 8 | "PRIMARY_ID": "ERX1667430", 9 | "SUBMITTER_ID": [ 10 | { 11 | "@namespace": "IFH_MS", 12 | "$": "EXP-68-1-2016-08-23_13-12-12" 13 | }, 14 | { 15 | "@namespace": "University Hospital Muenster", 16 | "$": "EXP-68-1-2016-08-23_13-12-12" 17 | } 18 | ] 19 | }, 20 | "TITLE": "Illumina MiSeq paired end sequencing; EXP-68-1-2016-08-23_13-12-12", 21 | "STUDY_REF": { 22 | "@accession": "ERP016940", 23 | "IDENTIFIERS": { 24 | "PRIMARY_ID": "ERP016940" 25 | } 26 | }, 27 | "DESIGN": { 28 | "DESIGN_DESCRIPTION": null, 29 | "SAMPLE_DESCRIPTOR": { 30 | "@accession": "ERS1305242", 31 | "IDENTIFIERS": { 32 | "PRIMARY_ID": "ERS1305242", 33 | "EXTERNAL_ID": [ 34 | { 35 | "@namespace": "BioSample", 36 | "$": "SAMEA4393793" 37 | } 38 | ] 39 | } 40 | }, 41 | "LIBRARY_DESCRIPTOR": { 42 | "LIBRARY_NAME": "unspecified", 43 | "LIBRARY_STRATEGY": "WGS", 44 | "LIBRARY_SOURCE": "GENOMIC", 45 | "LIBRARY_SELECTION": "RANDOM", 46 | "LIBRARY_LAYOUT": { 47 | "PAIRED": { 48 | "@NOMINAL_LENGTH": 300 49 | } 50 | } 51 | }, 52 | "SPOT_DESCRIPTOR": { 53 | "SPOT_DECODE_SPEC": { 54 | "SPOT_LENGTH": 500, 55 | "READ_SPEC": [ 56 | { 57 | "READ_INDEX": 0, 58 | "READ_LABEL": "F1", 59 | "READ_CLASS": "Application Read", 60 | "READ_TYPE": "Forward", 61 | "BASE_COORD": 1 62 | }, 63 | { 64 | "READ_INDEX": 1, 65 | "READ_LABEL": "R2", 66 | "READ_CLASS": "Application Read", 67 | "READ_TYPE": "Reverse", 68 | "BASE_COORD": 251 69 | } 70 | ] 71 | } 72 | } 73 | }, 74 | "PLATFORM": { 75 | "ILLUMINA": { 76 | "INSTRUMENT_MODEL": "Illumina MiSeq" 77 | } 78 | } 79 | } 80 | ], 81 | "RUN": [ 82 | { 83 | "@accession": "ERR1596844", 84 | "@alias": "NGSRT18C2 exp EXP-68-1-2016-08-23_13-12-12 run 1", 85 | "IDENTIFIERS": { 86 | "PRIMARY_ID": "ERR1596844", 87 | "SUBMITTER_ID": [ 88 | { 89 | "@namespace": "IFH_MS", 90 | "$": "NGSRT18C2 exp EXP-68-1-2016-08-23_13-12-12 run 1" 91 | }, 92 | { 93 | "@namespace": "University Hospital Muenster", 94 | "$": "NGSRT18C2 exp EXP-68-1-2016-08-23_13-12-12 run 1" 
95 | } 96 | ] 97 | }, 98 | "TITLE": "Illumina MiSeq paired end sequencing; EXP-68-1-2016-08-23_13-12-12", 99 | "EXPERIMENT_REF": { 100 | "@accession": "ERX1667430", 101 | "IDENTIFIERS": { 102 | "PRIMARY_ID": "ERX1667430" 103 | } 104 | }, 105 | "SPOT_DESCRIPTOR": { 106 | "SPOT_DECODE_SPEC": { 107 | "SPOT_LENGTH": 502, 108 | "READ_SPEC": [ 109 | { 110 | "READ_INDEX": 0, 111 | "READ_CLASS": "Application Read", 112 | "READ_TYPE": "Forward", 113 | "BASE_COORD": 1 114 | }, 115 | { 116 | "READ_INDEX": 1, 117 | "READ_CLASS": "Application Read", 118 | "READ_TYPE": "Reverse", 119 | "BASE_COORD": 252 120 | } 121 | ] 122 | } 123 | }, 124 | "RUN_ATTRIBUTES": { 125 | "RUN_ATTRIBUTE": [ 126 | { 127 | "TAG": "ENA-FIRST-PUBLIC", 128 | "VALUE": "2016-12-25" 129 | }, 130 | { 131 | "TAG": "ENA-LAST-UPDATE", 132 | "VALUE": "2018-11-16" 133 | } 134 | ] 135 | } 136 | } 137 | ], 138 | "SAMPLE": [ 139 | { 140 | "@alias": "SAMEA4393793", 141 | "@accession": "ERS1305242", 142 | "IDENTIFIERS": { 143 | "PRIMARY_ID": "ERS1305242", 144 | "EXTERNAL_ID": [ 145 | { 146 | "@namespace": "BioSample", 147 | "$": "SAMEA4393793" 148 | } 149 | ] 150 | }, 151 | "TITLE": "NGSRT18C2", 152 | "SAMPLE_NAME": { 153 | "TAXON_ID": 1280, 154 | "SCIENTIFIC_NAME": "Staphylococcus aureus" 155 | }, 156 | "SAMPLE_ATTRIBUTES": { 157 | "SAMPLE_ATTRIBUTE": [ 158 | { 159 | "TAG": "Alias", 160 | "VALUE": "NGSRT18C2" 161 | }, 162 | { 163 | "TAG": "ENA checklist", 164 | "VALUE": "ERC000011" 165 | }, 166 | { 167 | "TAG": "INSDC center name", 168 | "VALUE": "IFH_MS" 169 | }, 170 | { 171 | "TAG": "INSDC first public", 172 | "VALUE": "2016-12-25T17:01:10Z" 173 | }, 174 | { 175 | "TAG": "INSDC last update", 176 | "VALUE": "2016-08-23T15:37:20Z" 177 | }, 178 | { 179 | "TAG": "INSDC status", 180 | "VALUE": "public" 181 | }, 182 | { 183 | "TAG": "SRA accession", 184 | "VALUE": "ERS1305242" 185 | }, 186 | { 187 | "TAG": "Sample Name", 188 | "VALUE": "ERS1305242" 189 | }, 190 | { 191 | "TAG": "Title", 192 | "VALUE": "NGSRT18C2" 193 | }, 194 | { 195 | "TAG": "collection_date", 196 | "VALUE": "not available" 197 | }, 198 | { 199 | "TAG": "country", 200 | "VALUE": "not available" 201 | }, 202 | { 203 | "TAG": "host", 204 | "VALUE": "Homo sapiens" 205 | }, 206 | { 207 | "TAG": "host_associated", 208 | "VALUE": "Yes" 209 | } 210 | ] 211 | } 212 | } 213 | ], 214 | "SUBMISSION": { 215 | "@alias": "NGSRT18C2 sub 1", 216 | "@accession": "ERA693801", 217 | "@lab_name": "European Nucleotide Archive" 218 | } 219 | } --------------------------------------------------------------------------------