├── xml2json
│   ├── schema.pickle
│   ├── serialize_schema.py
│   ├── xml_dir2json.py
│   ├── xml2json.py
│   └── xml_dir2json_random.py
├── sra_metadata_libs
│   ├── __init__.py
│   └── bcolors.py
├── json
│   ├── README.md
│   ├── extract_runs.py
│   └── print_json_fields.py
├── LICENSE
├── json_examples
│   ├── README.md
│   └── json
│       ├── ERA
│       │   ├── ERA570
│       │   │   └── ERA570895.json
│       │   └── ERA693
│       │       └── ERA693801.json
│       └── SRA
│           ├── SRA490
│           │   └── SRA490640.json
│           ├── SRA268
│           │   └── SRA268165.json
│           ├── SRA889
│           │   └── SRA889255.json
│           ├── SRA609
│           │   └── SRA609343.json
│           ├── SRA563
│           │   └── SRA563707.json
│           ├── SRA575
│           │   └── SRA575213.json
│           └── SRA245
│               └── SRA245334.json
├── .gitignore
└── README.md

/xml2json/schema.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/linsalrob/SRA_Metadata/HEAD/xml2json/schema.pickle
--------------------------------------------------------------------------------
/sra_metadata_libs/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 
3 | """
4 | 
5 | 
6 | from .bcolors import bcolors
7 | 
8 | __all__ = [
9 |     'bcolors'
10 | ]
11 | 
--------------------------------------------------------------------------------
/json/README.md:
--------------------------------------------------------------------------------
1 | # Parse JSON Files
2 | 
3 | Each of these examples uses the JSON data in the [json examples](../json_examples) directory.
4 | 
5 | 
6 | To extract all the run IDs and accession IDs, you can use:
7 | 
8 | ```bash
9 | python3 json/extract_runs.py -d json_examples/
10 | ```
11 | 
12 | To take a look at the fields in a specific file, you can use:
13 | 
14 | ```bash
15 | python3 json/print_json_fields.py -f json_examples/json/SRA/SRA575/SRA575213.json | less
16 | ```
--------------------------------------------------------------------------------
/sra_metadata_libs/bcolors.py:
--------------------------------------------------------------------------------
1 | """
2 | Colors that you can import and make the text look pretty
3 | 
4 | Source: https://stackoverflow.com/questions/287871/print-in-terminal-with-colors
5 | """
6 | 
7 | __author__ = 'Rob Edwards'
8 | 
9 | 
10 | class bcolors(object):
11 |     HEADER = '\033[95m'
12 |     OKBLUE = '\033[94m'
13 |     OKGREEN = '\033[92m'
14 |     WARNING = '\033[93m'
15 |     FAIL = '\033[91m'
16 |     ENDC = '\033[0m'
17 |     BOLD = '\033[1m'
18 |     UNDERLINE = '\033[4m'
19 | 
20 |     PINK = '\033[95m'
21 |     BLUE = '\033[94m'
22 |     GREEN = '\033[92m'
23 |     YELLOW = '\033[93m'
24 |     RED = '\033[91m'
25 |     WHITE = '\033[0m'
26 | 
27 | 
28 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 Rob Edwards
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/json_examples/README.md:
--------------------------------------------------------------------------------
1 | # JSON Examples
2 | 
3 | These are some example data from the August 2019 SRA metadata that were chosen at random. Quite literally!
4 | 
5 | I used this command:
6 | 
7 | ```bash
8 | for F in $(find . | grep json$ | sort -R | head); do mkdir -p json_examples/json/${F:2:3}/${F:2:6}; cp $F json_examples/json/${F:2:3}/${F:2:6}; done
9 | ```
10 | 
11 | (`sort -R` is a good command to have up your sleeve. The construct ${F:2:3} takes three characters of the string $F, starting at the third character, i.e. zero-based offset 2.)
12 | 
13 | These ten files represent a random selection of metadata, and hopefully will have some of the variation we see in the whole directory (but probably not all of it). They are good to use with the code in [the json directory](../json) to see what the contents of typical json files are.
14 | 
15 | Note that the directory structure mirrors that of the complete [metadata in JSON format](https://edwards.sdsu.edu/data/sra/current.tar.gz) we make available:
16 | 
17 | ```text
18 | json/
19 | ├── ERA
20 | │   ├── ERA570
21 | │   │   └── ERA570895.json
22 | │   └── ERA693
23 | │       └── ERA693801.json
24 | └── SRA
25 |     ├── SRA245
26 |     │   └── SRA245334.json
27 |     ├── SRA268
28 |     │   └── SRA268165.json
29 |     ├── SRA490
30 |     │   └── SRA490640.json
31 |     ├── SRA563
32 |     │   └── SRA563707.json
33 |     ├── SRA575
34 |     │   └── SRA575213.json
35 |     ├── SRA609
36 |     │   └── SRA609343.json
37 |     └── SRA889
38 |         └── SRA889255.json
39 | ```
40 | 
41 | We use this structure to reduce the number of files per directory and make commands like `ls` work!
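42 | 
43 | For a quick first look at one of these files, a few lines of Python will list its top-level sections. Here is a minimal sketch (the path is one of the examples listed above, run from the repository root):
44 | 
45 | ```python
46 | import json
47 | 
48 | # load one example submission and list its top-level sections
49 | with open("json_examples/json/ERA/ERA570/ERA570895.json") as fin:
50 |     data = json.load(fin)
51 | 
52 | # these files have SUBMISSION plus some of SAMPLE, EXPERIMENT, and RUN
53 | for key in sorted(data):
54 |     print(key)
55 | ```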
--------------------------------------------------------------------------------
/json/extract_runs.py:
--------------------------------------------------------------------------------
1 | """
2 | Read all the json files in a directory and extract the runs associated with each ID
3 | """
4 | 
5 | import json
6 | import os
7 | import sys
8 | import argparse
9 | 
10 | from sra_metadata_libs import bcolors
11 | 
12 | def extract_runs(jf, verbose=False):
13 |     """
14 |     Extract the run information
15 |     :param jf: The JSON file to parse
16 |     :param verbose: more information
17 |     :return: prints out the Submission @accession and the run
18 |     """
19 | 
20 |     with open(jf, 'r') as json_in:
21 |         data = json.load(json_in)
22 |         if 'SUBMISSION' in data and '@accession' in data['SUBMISSION']:
23 |             acc = data['SUBMISSION']['@accession']
24 |         else:
25 |             if verbose:
26 |                 sys.stderr.write(f"{bcolors.RED}No @accession found in {jf}{bcolors.ENDC}\n")
27 |             return
28 | 
29 |         if 'RUN' in data:
30 |             for r in data['RUN']:
31 |                 if 'IDENTIFIERS' in r and 'PRIMARY_ID' in r['IDENTIFIERS']:
32 |                     print(f"{acc}\t{r['IDENTIFIERS']['PRIMARY_ID']}")
33 |         elif verbose:
34 |             sys.stderr.write(f"{bcolors.PINK}No runs found in {acc}{bcolors.ENDC}\n")
35 | 
36 | 
37 | 
38 | if __name__ == '__main__':
39 |     parser = argparse.ArgumentParser(description='Extract the run IDs and submission accessions from a directory of json files')
40 |     parser.add_argument('-d', help='Directory of json files', required=True)
41 |     parser.add_argument('-v', help='verbose output', action='store_true')
42 |     args = parser.parse_args()
43 | 
44 |     # walk the tree so this also works with the nested json/SRA/SRA889/ layout
45 |     for dirpath, dirnames, filenames in os.walk(args.d):
46 |         for j in filenames:
47 |             if not j.endswith('.json'):
48 |                 sys.stderr.write(f"{bcolors.WARNING}{j} does not end in .json and so we skipped it. Is this a json file?{bcolors.ENDC}\n")
49 |                 continue
50 |             extract_runs(os.path.join(dirpath, j), args.v)
51 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pycharm 107 | .idea 108 | -------------------------------------------------------------------------------- /json/print_json_fields.py: -------------------------------------------------------------------------------- 1 | """ 2 | Explore the contents of a JSON file. This code prints all the headings in the file, in a tree format 3 | so you can see the relationship between elements. There is a new line between root elements in the tree 4 | so you know which elements you can call directly. 5 | 6 | """ 7 | 8 | import os 9 | import sys 10 | import argparse 11 | 12 | from sra_metadata_libs import bcolors 13 | import json 14 | 15 | def print_str(s, l, verbose=False): 16 | """ 17 | Print the string at level l 18 | :param s: string to print 19 | :param l: level to print it 20 | :param verbose: more output 21 | :return: 22 | """ 23 | 24 | lo = l * '.' 
25 |     print(f"{lo} {s}")
26 | 
27 | def get_keys(js, l, verbose=False):
28 |     """
29 |     Get the keys at this level, and test for more dicts
30 |     :param js: the json object
31 |     :param l: the current level
32 |     :param verbose: more output
33 |     :return:
34 |     """
35 | 
36 |     for k in js:
37 |         print_str(k, l, verbose)
38 |         if isinstance(js[k], dict):
39 |             get_keys(js[k], l+1, verbose)
40 |         elif isinstance(js[k], list) and js[k] and isinstance(js[k][0], dict):
41 |             get_keys(js[k][0], l+1, verbose)
42 |         if l == 0:
43 |             print()
44 | 
45 | 
46 | 
47 | def print_json(jf, verbose=False):
48 |     """
49 |     Parse and print the fields
50 |     :param jf: JSON file to parse
51 |     :param verbose: more output
52 |     :return:
53 |     """
54 | 
55 |     if verbose:
56 |         sys.stderr.write(f"{bcolors.GREEN}Parsing {jf}{bcolors.ENDC}\n")
57 | 
58 |     with open(jf, 'r') as ji:
59 |         data = json.load(ji)
60 | 
61 |     get_keys(data, 0, verbose)
62 | 
63 | 
64 | 
65 | if __name__ == '__main__':
66 |     parser = argparse.ArgumentParser(description='Print the fields in a JSON file as a tree')
67 |     parser.add_argument('-f', help='JSON file to query', required=True)
68 |     parser.add_argument('-v', help='verbose output', action='store_true')
69 |     args = parser.parse_args()
70 | 
71 |     print_json(args.f, args.v)
72 | 
--------------------------------------------------------------------------------
/json_examples/json/ERA/ERA570/ERA570895.json:
--------------------------------------------------------------------------------
1 | {
2 |     "SAMPLE": [
3 |         {
4 |             "@alias": "SAMEA3889671",
5 |             "@accession": "ERS1076805",
6 |             "IDENTIFIERS": {
7 |                 "PRIMARY_ID": "ERS1076805",
8 |                 "EXTERNAL_ID": [
9 |                     {
10 |                         "@namespace": "BioSample",
11 |                         "$": "SAMEA3889671"
12 |                     }
13 |                 ]
14 |             },
15 |             "TITLE": "4ef65e50-c4e2-11e5-88b1-3c4a9275d6c6",
16 |             "SAMPLE_NAME": {
17 |                 "TAXON_ID": 4932,
18 |                 "SCIENTIFIC_NAME": "Saccharomyces cerevisiae"
19 |             },
20 |             "SAMPLE_ATTRIBUTES": {
21 |                 "SAMPLE_ATTRIBUTE": [
22 |                     {
23 |                         "TAG": "Alias",
24 |                         "VALUE": "4ef65e50-c4e2-11e5-88b1-3c4a9275d6c6"
25 |                     },
26 |                     {
27 |                         "TAG": "ENA checklist",
28 |                         "VALUE": "ERC000011"
29 |                     },
30 |                     {
31 |                         "TAG": "INSDC center name",
32 |                         "VALUE": "SC"
33 |                     },
34 |                     {
35 |                         "TAG": "INSDC first public",
36 |                         "VALUE": "2016-10-05T09:26:57Z"
37 |                     },
38 |                     {
39 |                         "TAG": "INSDC last update",
40 |                         "VALUE": "2016-03-08T15:17:43Z"
41 |                     },
42 |                     {
43 |                         "TAG": "INSDC status",
44 |                         "VALUE": "public"
45 |                     },
46 |                     {
47 |                         "TAG": "SRA accession",
48 |                         "VALUE": "ERS1076805"
49 |                     },
50 |                     {
51 |                         "TAG": "Sample Name",
52 |                         "VALUE": "ERS1076805"
53 |                     },
54 |                     {
55 |                         "TAG": "Title",
56 |                         "VALUE": "3858STDY6309587"
57 |                     },
58 |                     {
59 |                         "TAG": "sample_description",
60 |                         "VALUE": "unknown"
61 |                     },
62 |                     {
63 |                         "TAG": "strain",
64 |                         "VALUE": "unknown"
65 |                     }
66 |                 ]
67 |             }
68 |         }
69 |     ],
70 |     "SUBMISSION": {
71 |         "@broker_name": "",
72 |         "@alias": "3858STDY6309587-sc-2517133",
73 |         "@accession": "ERA570895",
74 |         "@lab_name": "European Nucleotide Archive"
75 |     }
76 | }
--------------------------------------------------------------------------------
/xml2json/serialize_schema.py:
--------------------------------------------------------------------------------
1 | """
2 | serialize the schema so we can time loading it. We need it in a quicker format.
3 | """
4 | 
5 | import os
6 | import sys
7 | import argparse
8 | import json
9 | from random import randint
10 | import time
11 | import pickle
12 | import xmlschema
13 | from sra_metadata_libs import bcolors
14 | __author__ = 'Rob Edwards'
15 | 
16 | def read_schemas(schemadir, verbose=True):
17 |     """
18 |     Read the XML Schema definition files, and return a dict of schema objects.
19 | :param schemadir: directory with all the schemas in it. 20 | :param verbose: more output 21 | :return: dict of schema objects 22 | """ 23 | 24 | # known XML Schemas 25 | schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd", 26 | "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd", 27 | "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd", 28 | "submission" : "SRA.submission.xsd"} 29 | 30 | 31 | schemas = {} 32 | if verbose: 33 | sys.stderr.write(f"Reading schemas\n") 34 | 35 | for s in schema_types: 36 | if verbose: 37 | sys.stderr.write(f"Schema parsing {s}\n") 38 | schemas[s] = xmlschema.XMLSchema(os.path.join(schemadir, schema_types[s])) 39 | 40 | if verbose: 41 | sys.stderr.write(f"Done reading schemas\n") 42 | 43 | return schemas 44 | 45 | 46 | def write_json(schemas, jsonfile, verbose=False): 47 | """ 48 | Write the json file 49 | """ 50 | 51 | if verbose: 52 | sys.stderr.write(f"Writing json file {jsonfile}\n") 53 | 54 | with open(jsonfile, 'w') as f: 55 | json.dump(dict(schemas), f) 56 | 57 | def read_json(jsonfile, verbose=False): 58 | """ 59 | Read the json file 60 | """ 61 | 62 | if verbose: 63 | sys.stderr.write(f"Reading json file {jsonfile}\n") 64 | 65 | with open(jsonfile, 'r') as f: 66 | schemas = json.load(f) 67 | 68 | return schemas 69 | 70 | 71 | def write_pickle(schemas, picklefile, verbose=False): 72 | """ 73 | Write the pickle file 74 | """ 75 | if verbose: 76 | sys.stderr.write(f"Writing pickle file {picklefile}\n") 77 | 78 | with open(picklefile, 'wb') as f: 79 | pickle.dump(schemas, f) 80 | 81 | 82 | def read_pickle(picklefile, verbose=False): 83 | """ 84 | Read the pickle file 85 | """ 86 | if verbose: 87 | sys.stderr.write(f"Reading pickle file {picklefile}\n") 88 | 89 | with open(picklefile, 'rb') as f: 90 | schemas = pickle.load(f) 91 | 92 | return schemas 93 | 94 | 95 | 96 | if __name__ == "__main__": 97 | parser = argparse.ArgumentParser(description=' ') 98 | parser.add_argument('-x', help='xml schema directory', required=True) 99 | parser.add_argument('-j', help='json output to write') 100 | parser.add_argument('-p', help='pickle to write') 101 | parser.add_argument('-v', help='verbose output', action='store_true') 102 | args = parser.parse_args() 103 | 104 | schemas = read_schemas(args.x, args.v) 105 | 106 | """ 107 | write_json(schemas, args.j, args.v) 108 | for i in range(5): 109 | start = time.time() 110 | s = read_json(args.j, args.v) 111 | end = time.time() 112 | print(f"JSON: {end - start}") 113 | """ 114 | 115 | write_pickle(schemas, args.p, args.v) 116 | pick = [] 117 | xml = [] 118 | for i in range(5): 119 | if randint(0,10) < 5: 120 | sys.stderr.write("PICKLE\n") 121 | start = time.time() 122 | s = read_pickle(args.p, False) 123 | end = time.time() 124 | pick.append(end - start) 125 | sys.stderr.write("XML\n") 126 | start = time.time() 127 | s = read_schemas(args.x, False) 128 | end = time.time() 129 | xml.append(end - start) 130 | else: 131 | sys.stderr.write("XML\n") 132 | start = time.time() 133 | s = read_schemas(args.x, False) 134 | end = time.time() 135 | xml.append(end - start) 136 | sys.stderr.write("PICKLE\n") 137 | start = time.time() 138 | s = read_pickle(args.p, False) 139 | end = time.time() 140 | pick.append(end - start) 141 | 142 | print(f"Pickle: {sum(pick)/len(pick)} XML: {sum(xml)/len(xml)}\n") 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA490/SRA490640.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "SAMPLE": [ 3 | { 4 | "@alias": "NWD511638", 5 | "@accession": "SRS1784730", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRS1784730", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "BioSample", 11 | "$": "SAMN05552233" 12 | }, 13 | { 14 | "@namespace": "dbGaP", 15 | "@label": "Sample name", 16 | "$": "951-NWD511638" 17 | }, 18 | { 19 | "@namespace": "phs000951", 20 | "@label": "submitted sample id", 21 | "$": "NWD511638" 22 | } 23 | ] 24 | }, 25 | "TITLE": "Non-tumor DNA sample from Blood of a human female participant in the dbGaP study \"NHLBI TOPMed: Genetic Epidemiology of COPD (COPDGene) in the TOPMed Program\"", 26 | "SAMPLE_NAME": { 27 | "TAXON_ID": 9606, 28 | "SCIENTIFIC_NAME": "Homo sapiens" 29 | }, 30 | "SAMPLE_ATTRIBUTES": { 31 | "SAMPLE_ATTRIBUTE": [ 32 | { 33 | "TAG": "gap_accession", 34 | "VALUE": "phs000951" 35 | }, 36 | { 37 | "TAG": "submitter handle", 38 | "VALUE": "NHLBI_COPDGene" 39 | }, 40 | { 41 | "TAG": "biospecimen repository", 42 | "VALUE": "NHLBI_COPDGene" 43 | }, 44 | { 45 | "TAG": "study name", 46 | "VALUE": "NHLBI TOPMed: Genetic Epidemiology of COPD (COPDGene) in the TOPMed Program" 47 | }, 48 | { 49 | "TAG": "study design", 50 | "VALUE": "Case-Control" 51 | }, 52 | { 53 | "TAG": "biospecimen repository sample id", 54 | "VALUE": "NWD511638" 55 | }, 56 | { 57 | "TAG": "submitted sample id", 58 | "VALUE": "NWD511638" 59 | }, 60 | { 61 | "TAG": "submitted subject id", 62 | "VALUE": "COPDGene_G31211" 63 | }, 64 | { 65 | "TAG": "gap_sample_id", 66 | "VALUE": "2064233" 67 | }, 68 | { 69 | "TAG": "gap_subject_id", 70 | "VALUE": "432226" 71 | }, 72 | { 73 | "TAG": "sex", 74 | "VALUE": "female" 75 | }, 76 | { 77 | "TAG": "body site", 78 | "VALUE": "Blood" 79 | }, 80 | { 81 | "TAG": "histological type", 82 | "VALUE": "Blood" 83 | }, 84 | { 85 | "TAG": "analyte type", 86 | "VALUE": "DNA" 87 | }, 88 | { 89 | "TAG": "is tumor", 90 | "VALUE": "No" 91 | }, 92 | { 93 | "TAG": "subject is affected", 94 | "VALUE": "No" 95 | }, 96 | { 97 | "TAG": "molecular data type", 98 | "VALUE": "SNP/CNV Genotypes (NGS)" 99 | }, 100 | { 101 | "TAG": "gap_consent_code", 102 | "VALUE": "1" 103 | }, 104 | { 105 | "TAG": "gap_consent_short_name", 106 | "VALUE": "HMB-MDS" 107 | } 108 | ] 109 | } 110 | } 111 | ], 112 | "SUBMISSION": { 113 | "@alias": "NWD511638.expt.submit", 114 | "@center_name": "Broad Institute", 115 | "@broker_name": "UM-SPH", 116 | "@lab_name": "Abecasis", 117 | "@submission_comment": "", 118 | "@accession": "SRA490640" 119 | } 120 | } -------------------------------------------------------------------------------- /xml2json/xml_dir2json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Read a directory of XML files and convert the output to JSON. 3 | We write the JSON object to a file. By default, we do not 4 | overwrite existing files, but you can force that with the -o 5 | flag. 6 | """ 7 | 8 | import os 9 | import sys 10 | import argparse 11 | from sra_metadata_libs import bcolors 12 | import xmlschema 13 | from xmlschema.validators.exceptions import XMLSchemaValidationError 14 | import json 15 | from pprint import pprint 16 | __author__ = 'Rob Edwards' 17 | 18 | 19 | def validation_errors(sample, error, verbose=False): 20 | """ 21 | Log the validation error to a file. We append to the file. 
22 | 
23 |     :param sample: The sample ID
24 |     :param error: The python Error object
25 |     :param verbose: More output
26 |     """
27 | 
28 |     if verbose:
29 |         sys.stderr.write(f"{bcolors.PINK}Logging error for {sample}{bcolors.ENDC}\n")
30 | 
31 |     with open("XML_validation_errors.txt", "a") as out:
32 |         out.write(f"\n=== BEGIN {sample} ===\n")
33 |         out.write(str(error))
34 |         out.write(f"\n=== END {sample} ===\n")
35 | 
36 | def read_schemas(verbose=True):
37 |     """
38 |     Read the XML Schema definition files, and return a dict of schema objects.
39 |     :param verbose: more output
40 |     :return: dict of schema objects
41 |     """
42 | 
43 |     # known XML Schemas
44 |     schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd",
45 |                     "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd",
46 |                     "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd",
47 |                     "submission" : "SRA.submission.xsd"}
48 | 
49 | 
50 |     schemas = {}
51 | 
52 |     for s in schema_types:
53 |         if verbose:
54 |             sys.stderr.write(f"{bcolors.GREEN}Schema parsing{bcolors.ENDC} {s}\n")
55 |         schemas[s] = xmlschema.XMLSchema(os.path.join("Schemas", schema_types[s]))
56 | 
57 |     return schemas
58 | 
59 | def read_directory(basedir, subdir, schemas, verbose=False):
60 |     """
61 |     Read a directory and create a single dict for that directory
62 | 
63 |     :param basedir: The base directory of all the XML files
64 |     :param subdir: The sample directory with each of the individual XML files
65 |     :param schemas: The dictionary of XML Schema Definitions
66 |     :param verbose: more output
67 |     :return: a dict of all the data
68 |     """
69 | 
70 |     data = {}
71 |     for s in schemas:
72 |         sc = schemas[s]
73 |         if not os.path.exists(os.path.join(basedir, subdir, f"{subdir}.{s}.xml")):
74 |             if verbose and s not in ['analysis', 'common', 'package']:
75 |                 sys.stderr.write(f"{bcolors.RED}WARN: {basedir}/{subdir}/{subdir}.{s}.xml not found{bcolors.ENDC}\n")
76 |             continue
77 | 
78 |         try:
79 |             xm = sc.to_dict(os.path.join(basedir, subdir, f"{subdir}.{s}.xml"), decimal_type=str)
80 |         except XMLSchemaValidationError as e:
81 |             validation_errors(subdir, e, verbose)
82 |             continue
83 | 
84 |         # the submission document parses to the object itself; the others are nested one level down
85 |         if 'submission' == s:
86 |             data['SUBMISSION'] = xm
87 |         else:
88 |             data[s.upper()] = xm[s.upper()]
89 | 
90 |     return data
91 | 
92 | def write_json(xml, outfile, verbose=False):
93 |     """
94 |     Write the dictionary to a JSON file
95 | 
96 |     :param xml: The dictionary of the XML object
97 |     :param outfile: The file to write to
98 |     :param verbose: more output
99 |     """
100 | 
101 |     with open(outfile, 'w') as out:
102 |         out.write(json.dumps(xml, indent=4))
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     parser = argparse.ArgumentParser(description='Parse a directory or directories and create a json output for each one')
107 |     parser.add_argument('-d', help='directory to parse', required=True)
108 |     parser.add_argument('-o', help='where to put the json files.', required=True)
109 |     parser.add_argument('-f', help='force writing of the file, even if it exists', action='store_true')
110 |     parser.add_argument('-v', help='verbose output', action='store_true')
111 |     args = parser.parse_args()
112 | 
113 | 
114 |     # read all the known schemas
115 |     if not os.path.exists("Schemas"):
116 |         sys.stderr.write(f"{bcolors.RED}FATAL: Schemas/ directory with known xml schemas not found{bcolors.ENDC}\n")
117 |         sys.exit(-1)
118 |     schemas = read_schemas(True)
119 | 
120 |     if not os.path.exists(args.o):
121 |         os.mkdir(args.o)
122 | 
123 |     # read all the files in the
base directory 124 | for submission in os.listdir(args.d): 125 | outfile = os.path.join(args.o, f"{submission}.json") 126 | if (not args.f) and os.path.exists(outfile): 127 | continue 128 | 129 | if args.v: 130 | sys.stderr.write(f"{bcolors.GREEN}Parsing {bcolors.ENDC} {submission}\n") 131 | 132 | data = read_directory(args.d, submission, schemas, args.v) 133 | 134 | if args.v: 135 | sys.stderr.write(f"{bcolors.BLUE}Writing {bcolors.ENDC} {submission}\n") 136 | write_json(data, outfile, args.v) 137 | 138 | -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA268/SRA268165.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@alias": "2b-RAD data of Patinopecten yessoensis", 5 | "@accession": "SRX1027271", 6 | "@center_name": "Ocean University of China", 7 | "IDENTIFIERS": { 8 | "PRIMARY_ID": "SRX1027271", 9 | "SUBMITTER_ID": [ 10 | { 11 | "@namespace": "Ocean University of China", 12 | "$": "2b-RAD data of Patinopecten yessoensis" 13 | } 14 | ] 15 | }, 16 | "TITLE": "2b-RAD data from Genome Sequencing Project of Patinopecten yessoensis", 17 | "STUDY_REF": { 18 | "@accession": "SRP046829", 19 | "IDENTIFIERS": { 20 | "PRIMARY_ID": "SRP046829" 21 | } 22 | }, 23 | "DESIGN": { 24 | "DESIGN_DESCRIPTION": null, 25 | "SAMPLE_DESCRIPTOR": { 26 | "@accession": "SRS935919", 27 | "IDENTIFIERS": { 28 | "PRIMARY_ID": "SRS935919" 29 | } 30 | }, 31 | "LIBRARY_DESCRIPTOR": { 32 | "LIBRARY_NAME": null, 33 | "LIBRARY_STRATEGY": "OTHER", 34 | "LIBRARY_SOURCE": "GENOMIC", 35 | "LIBRARY_SELECTION": "PCR", 36 | "LIBRARY_LAYOUT": { 37 | "SINGLE": null 38 | } 39 | }, 40 | "SPOT_DESCRIPTOR": { 41 | "SPOT_DECODE_SPEC": { 42 | "SPOT_LENGTH": 36, 43 | "READ_SPEC": [ 44 | { 45 | "READ_INDEX": 0, 46 | "READ_CLASS": "Application Read", 47 | "READ_TYPE": "Forward", 48 | "BASE_COORD": 1 49 | } 50 | ] 51 | } 52 | } 53 | }, 54 | "PLATFORM": { 55 | "ILLUMINA": { 56 | "INSTRUMENT_MODEL": "Illumina HiSeq 2000" 57 | } 58 | } 59 | } 60 | ], 61 | "RUN": [ 62 | { 63 | "@accession": "SRR2027758", 64 | "@center_name": "Ocean University of China", 65 | "@alias": "2b-RAD data of Patinopecten yessoensis", 66 | "IDENTIFIERS": { 67 | "PRIMARY_ID": "SRR2027758", 68 | "SUBMITTER_ID": [ 69 | { 70 | "@namespace": "Ocean University of China", 71 | "$": "2b-RAD data of Patinopecten yessoensis" 72 | } 73 | ] 74 | }, 75 | "EXPERIMENT_REF": { 76 | "@accession": "SRX1027271" 77 | } 78 | } 79 | ], 80 | "SAMPLE": [ 81 | { 82 | "@alias": "2b-RAD data of Patinopecten yessoensis", 83 | "@accession": "SRS935919", 84 | "IDENTIFIERS": { 85 | "PRIMARY_ID": "SRS935919", 86 | "EXTERNAL_ID": [ 87 | { 88 | "@namespace": "BioSample", 89 | "$": "SAMN03657538" 90 | } 91 | ] 92 | }, 93 | "SAMPLE_NAME": { 94 | "TAXON_ID": 6573, 95 | "SCIENTIFIC_NAME": "Mizuhopecten yessoensis" 96 | }, 97 | "SAMPLE_ATTRIBUTES": { 98 | "SAMPLE_ATTRIBUTE": [ 99 | { 100 | "TAG": "strain", 101 | "VALUE": "cellular organisms; Eukaryota; Opisthokonta; Metazoa; Eumetazoa; Bilateria; Protostomia; Lophotrochozoa; Mollusca; Bivalvia; Pteriomorphia; Pectinoida; Pectinoidea; Pectinidae; Mizuhopecten" 102 | }, 103 | { 104 | "TAG": "age", 105 | "VALUE": "one-year and two-year old" 106 | }, 107 | { 108 | "TAG": "sex", 109 | "VALUE": "not determined" 110 | }, 111 | { 112 | "TAG": "tissue", 113 | "VALUE": "striated muscle" 114 | }, 115 | { 116 | "TAG": "BioSampleModel", 117 | "VALUE": "Model organism or animal" 118 | } 119 | ] 120 | } 121 | } 122 | ], 123 | "SUBMISSION": { 124 
| "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", 125 | "@alias": "2b-RAD data of Patinopecten yessoensis", 126 | "@submission_comment": "2b-RAD data from Genome Sequencing Project of Patinopecten yessoensis", 127 | "@center_name": "Ocean University of China", 128 | "@lab_name": "Ministry of Education Key Laboratory of Marine Gen", 129 | "@accession": "SRA268165" 130 | } 131 | } -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA889/SRA889255.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@accession": "SRX5882665", 5 | "@alias": "PNUSAS076100:wgs", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRX5882665", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "EDLB-CDC", 11 | "$": "PNUSAS076100:wgs" 12 | } 13 | ] 14 | }, 15 | "TITLE": null, 16 | "STUDY_REF": { 17 | "@accession": "SRP040281", 18 | "IDENTIFIERS": { 19 | "PRIMARY_ID": "SRP040281", 20 | "EXTERNAL_ID": [ 21 | { 22 | "@namespace": "BioProject", 23 | "$": "PRJNA230403" 24 | } 25 | ] 26 | } 27 | }, 28 | "DESIGN": { 29 | "DESIGN_DESCRIPTION": null, 30 | "SAMPLE_DESCRIPTOR": { 31 | "@accession": "SRS4805223", 32 | "IDENTIFIERS": { 33 | "PRIMARY_ID": "SRS4805223", 34 | "EXTERNAL_ID": [ 35 | { 36 | "@namespace": "EDLB-CDC", 37 | "$": "PNUSAS076100" 38 | } 39 | ] 40 | } 41 | }, 42 | "LIBRARY_DESCRIPTOR": { 43 | "LIBRARY_NAME": "NexteraXT", 44 | "LIBRARY_STRATEGY": "WGS", 45 | "LIBRARY_SOURCE": "GENOMIC", 46 | "LIBRARY_SELECTION": "RANDOM", 47 | "LIBRARY_LAYOUT": { 48 | "PAIRED": null 49 | }, 50 | "LIBRARY_CONSTRUCTION_PROTOCOL": "NexteraXT" 51 | } 52 | }, 53 | "PLATFORM": { 54 | "ILLUMINA": { 55 | "INSTRUMENT_MODEL": "Illumina MiSeq" 56 | } 57 | } 58 | } 59 | ], 60 | "RUN": [ 61 | { 62 | "@accession": "SRR9108043", 63 | "@alias": "PNUSAS076100:wgs", 64 | "IDENTIFIERS": { 65 | "PRIMARY_ID": "SRR9108043", 66 | "EXTERNAL_ID": [ 67 | { 68 | "@namespace": "EDLB-CDC", 69 | "$": "PNUSAS076100:wgs" 70 | } 71 | ] 72 | }, 73 | "EXPERIMENT_REF": { 74 | "@accession": "SRX5882665", 75 | "IDENTIFIERS": { 76 | "EXTERNAL_ID": [ 77 | { 78 | "@namespace": "EDLB-CDC", 79 | "$": "PNUSAS076100:wgs" 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | ], 86 | "SAMPLE": [ 87 | { 88 | "@alias": "PNUSAS076100", 89 | "@accession": "SRS4805223", 90 | "IDENTIFIERS": { 91 | "PRIMARY_ID": "SRS4805223", 92 | "EXTERNAL_ID": [ 93 | { 94 | "@namespace": "BioSample", 95 | "$": "SAMN11822565" 96 | } 97 | ] 98 | }, 99 | "TITLE": "Salmonella enterica", 100 | "SAMPLE_NAME": { 101 | "TAXON_ID": 28901, 102 | "SCIENTIFIC_NAME": "Salmonella enterica" 103 | }, 104 | "SAMPLE_LINKS": { 105 | "SAMPLE_LINK": [ 106 | { 107 | "XREF_LINK": { 108 | "DB": "bioproject", 109 | "ID": "230403", 110 | "LABEL": "PRJNA230403" 111 | } 112 | } 113 | ] 114 | }, 115 | "SAMPLE_ATTRIBUTES": { 116 | "SAMPLE_ATTRIBUTE": [ 117 | { 118 | "TAG": "strain", 119 | "VALUE": "PNUSAS076100" 120 | }, 121 | { 122 | "TAG": "isolate", 123 | "VALUE": "Missing" 124 | }, 125 | { 126 | "TAG": "isolation_source", 127 | "VALUE": "missing" 128 | }, 129 | { 130 | "TAG": "collected_by", 131 | "VALUE": "CDC" 132 | }, 133 | { 134 | "TAG": "collection_date", 135 | "VALUE": "missing" 136 | }, 137 | { 138 | "TAG": "geo_loc_name", 139 | "VALUE": "USA" 140 | }, 141 | { 142 | "TAG": "lat_lon", 143 | "VALUE": "missing" 144 | }, 145 | { 146 | "TAG": "host", 147 | "VALUE": "missing" 148 | }, 149 | { 150 | "TAG": "host_disease", 151 | "VALUE": "missing" 152 | }, 153 | { 154 | "TAG": "BioSampleModel", 155 | "VALUE": 
"Pathogen.cl" 156 | } 157 | ] 158 | } 159 | } 160 | ], 161 | "SUBMISSION": { 162 | "@lab_name": "", 163 | "@center_name": "Pulsenet", 164 | "@accession": "SRA889255", 165 | "@alias": "SUB5658461" 166 | } 167 | } -------------------------------------------------------------------------------- /xml2json/xml2json.py: -------------------------------------------------------------------------------- 1 | """ 2 | Process a single SRA directory and convert it to JSON. 3 | """ 4 | 5 | import os 6 | import sys 7 | import argparse 8 | import fcntl 9 | import pickle 10 | import xmlschema 11 | from xmlschema.validators.exceptions import XMLSchemaValidationError 12 | import json 13 | __author__ = 'Rob Edwards' 14 | 15 | 16 | def validation_errors(sample, error, verbose=False): 17 | """ 18 | Log the validation error to a file. We append to the file. 19 | 20 | :param sample: The sample ID 21 | :param error: The python Error object 22 | :param verbose: More output 23 | """ 24 | 25 | if verbose: 26 | sys.stderr.write(f"Logging error for {sample}\n") 27 | 28 | with open("XML_validation_errors.txt", "a") as out: 29 | # get an exclusive lock 30 | fcntl.flock(out, fcntl.LOCK_EX) 31 | out.write(f"\n=== BEGIN {sample} ===\n") 32 | out.write(str(error)) 33 | out.write(f"\n=== END {sample} ===\n") 34 | fcntl.flock(out, fcntl.LOCK_UN) 35 | 36 | def read_schema_pickle(picklefile, verbose=False): 37 | """ 38 | Read the schema from a pickle file 39 | """ 40 | if verbose: 41 | sys.stderr.write(f"Reading pickle file {picklefile}\n") 42 | 43 | with open(picklefile, 'rb') as f: 44 | schemas = pickle.load(f) 45 | 46 | return schemas 47 | 48 | def read_schemas(schemadir, verbose=True): 49 | """ 50 | Read the XML Schema defintion files, and return a dict of schema objects. 51 | :param schemadir: directory with all the schemas in it. 
52 |     :param verbose: more output
53 |     :return: dict of schema objects
54 |     """
55 | 
56 |     # known XML Schemas
57 |     schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd",
58 |                     "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd",
59 |                     "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd",
60 |                     "submission" : "SRA.submission.xsd"}
61 | 
62 | 
63 |     schemas = {}
64 | 
65 |     for s in schema_types:
66 |         if verbose:
67 |             sys.stderr.write(f"Schema parsing {s}\n")
68 |         schemas[s] = xmlschema.XMLSchema(os.path.join(schemadir, schema_types[s]))
69 | 
70 |     return schemas
71 | 
72 | def read_directory(basedir, sampleid, schemas, verbose=False):
73 |     """
74 |     Read a directory and create a single dict for that directory
75 | 
76 |     :param basedir: The base directory of all the XML files
77 |     :param sampleid: The sample directory with each of the individual XML files
78 |     :param schemas: The dictionary of XML Schema Definitions
79 |     :param verbose: more output
80 |     :return: a dict of all the data
81 |     """
82 | 
83 |     data = {}
84 |     for s in schemas:
85 |         sc = schemas[s]
86 |         if not os.path.exists(os.path.join(basedir, sampleid, f"{sampleid}.{s}.xml")):
87 |             if verbose and s not in ['analysis', 'common', 'package']:
88 |                 sys.stderr.write(f"WARN: {basedir}/{sampleid}/{sampleid}.{s}.xml not found\n")
89 |             continue
90 | 
91 |         try:
92 |             xm = sc.to_dict(os.path.join(basedir, sampleid, f"{sampleid}.{s}.xml"), decimal_type=str)
93 |         except XMLSchemaValidationError as e:
94 |             validation_errors(sampleid, e, verbose)
95 |             continue
96 | 
97 |         # the submission document parses to the object itself; the others are nested one level down
98 |         if 'submission' == s:
99 |             data['SUBMISSION'] = xm
100 |         else:
101 |             data[s.upper()] = xm[s.upper()]
102 | 
103 |     return data
104 | 
105 | def write_json(xml, outfile, verbose=False):
106 |     """
107 |     Write the dictionary to a JSON file
108 | 
109 |     :param xml: The dictionary of the XML object
110 |     :param outfile: The file to write to
111 |     :param verbose: more output
112 |     """
113 | 
114 |     with open(outfile, 'w') as out:
115 |         out.write(json.dumps(xml, indent=4))
116 | 
117 | 
118 | if __name__ == "__main__":
119 |     parser = argparse.ArgumentParser(description='Parse a directory or directories and create a json output for each one')
120 |     parser.add_argument('-d', help='directory with the submission directories', required=True)
121 |     parser.add_argument('-x', help='Sample ID to parse', required=True)
122 |     parser.add_argument('-o', help='where to put the json files.', required=True)
123 |     parser.add_argument('-s', help='Schema directory')
124 |     parser.add_argument('-p', help='Schema pickle')
125 |     parser.add_argument('-f', help='force writing of the file, even if it exists', action='store_true')
126 |     parser.add_argument('-v', help='verbose output', action='store_true')
127 |     args = parser.parse_args()
128 | 
129 | 
130 |     if not args.s and not args.p:
131 |         sys.stderr.write("FATAL: Please provide a schema either as a directory or a pickle file\n")
132 |         sys.exit(-1)
133 | 
134 |     # read all the files in the base directory
135 |     outfile = os.path.join(args.o, f"{args.x}.json")
136 |     if (not args.f) and os.path.exists(outfile):
137 |         sys.exit(0)
138 | 
139 |     # read all the known schemas
140 |     schemas = None
141 |     if args.p:
142 |         schemas = read_schema_pickle(args.p, args.v)
143 | 
144 |     if args.s:
145 |         if not os.path.exists(args.s):
146 |             sys.stderr.write(f"FATAL: {args.s} directory with known xml schemas not found\n")
147 |             sys.exit(-1)
148 |         schemas = read_schemas(args.s, args.v)
149 | 
150 |     if not
schemas: 151 | sys.stderr.write("FATAL: Could not read your schemas\n") 152 | sys.exit(-1) 153 | 154 | 155 | if not os.path.exists(args.o): 156 | os.mkdir(args.o) 157 | 158 | if args.v: 159 | sys.stderr.write(f"Parsing {args.x}\n") 160 | 161 | data = read_directory(args.d, args.x, schemas, args.v) 162 | 163 | if args.v: 164 | sys.stderr.write(f"Writing {args.x}\n") 165 | write_json(data, outfile, args.v) 166 | 167 | -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA609/SRA609343.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@accession": "SRX3197638", 5 | "@alias": "PNUSAS022123:wgs", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRX3197638", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "EDLB-CDC", 11 | "$": "PNUSAS022123:wgs" 12 | } 13 | ] 14 | }, 15 | "TITLE": null, 16 | "STUDY_REF": { 17 | "@accession": "SRP040281", 18 | "IDENTIFIERS": { 19 | "PRIMARY_ID": "SRP040281", 20 | "EXTERNAL_ID": [ 21 | { 22 | "@namespace": "BioProject", 23 | "$": "PRJNA230403" 24 | } 25 | ] 26 | } 27 | }, 28 | "DESIGN": { 29 | "DESIGN_DESCRIPTION": null, 30 | "SAMPLE_DESCRIPTOR": { 31 | "@accession": "SRS2524717", 32 | "IDENTIFIERS": { 33 | "PRIMARY_ID": "SRS2524717", 34 | "EXTERNAL_ID": [ 35 | { 36 | "@namespace": "BioSample", 37 | "$": "SAMN07638965" 38 | } 39 | ] 40 | } 41 | }, 42 | "LIBRARY_DESCRIPTOR": { 43 | "LIBRARY_NAME": "NexteraXT", 44 | "LIBRARY_STRATEGY": "WGS", 45 | "LIBRARY_SOURCE": "GENOMIC", 46 | "LIBRARY_SELECTION": "RANDOM", 47 | "LIBRARY_LAYOUT": { 48 | "PAIRED": null 49 | }, 50 | "LIBRARY_CONSTRUCTION_PROTOCOL": "NexteraXT" 51 | } 52 | }, 53 | "PLATFORM": { 54 | "ILLUMINA": { 55 | "INSTRUMENT_MODEL": "Illumina MiSeq" 56 | } 57 | } 58 | } 59 | ], 60 | "RUN": [ 61 | { 62 | "@accession": "SRR6050671", 63 | "@alias": "PNUSAS022123:wgs", 64 | "IDENTIFIERS": { 65 | "PRIMARY_ID": "SRR6050671", 66 | "EXTERNAL_ID": [ 67 | { 68 | "@namespace": "EDLB-CDC", 69 | "$": "PNUSAS022123:wgs" 70 | } 71 | ] 72 | }, 73 | "EXPERIMENT_REF": { 74 | "@accession": "SRX3197638", 75 | "IDENTIFIERS": { 76 | "EXTERNAL_ID": [ 77 | { 78 | "@namespace": "EDLB-CDC", 79 | "$": "PNUSAS022123:wgs" 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | ], 86 | "SAMPLE": [ 87 | { 88 | "@alias": "PNUSAS022123", 89 | "@accession": "SRS2524717", 90 | "IDENTIFIERS": { 91 | "PRIMARY_ID": "SRS2524717", 92 | "EXTERNAL_ID": [ 93 | { 94 | "@namespace": "BioSample", 95 | "$": "SAMN07638965" 96 | } 97 | ] 98 | }, 99 | "SAMPLE_NAME": { 100 | "TAXON_ID": 436295, 101 | "SCIENTIFIC_NAME": "Salmonella enterica subsp. 
enterica serovar Poona" 102 | }, 103 | "SAMPLE_LINKS": { 104 | "SAMPLE_LINK": [ 105 | { 106 | "XREF_LINK": { 107 | "DB": "bioproject", 108 | "ID": "230403", 109 | "LABEL": "PRJNA230403" 110 | } 111 | } 112 | ] 113 | }, 114 | "SAMPLE_ATTRIBUTES": { 115 | "SAMPLE_ATTRIBUTE": [ 116 | { 117 | "TAG": "collection_date", 118 | "VALUE": "Jul-2017" 119 | }, 120 | { 121 | "TAG": "strain", 122 | "VALUE": "PNUSAS022123" 123 | }, 124 | { 125 | "TAG": "collected_by", 126 | "VALUE": "CDC" 127 | }, 128 | { 129 | "TAG": "serovar", 130 | "VALUE": "Poona" 131 | }, 132 | { 133 | "TAG": "lat_lon", 134 | "VALUE": "Missing" 135 | }, 136 | { 137 | "TAG": "geo_loc_name", 138 | "VALUE": "USA" 139 | }, 140 | { 141 | "TAG": "host", 142 | "VALUE": "Missing" 143 | }, 144 | { 145 | "TAG": "isolation_source", 146 | "VALUE": "urine" 147 | }, 148 | { 149 | "TAG": "HHS_region", 150 | "VALUE": "9" 151 | }, 152 | { 153 | "TAG": "host_age", 154 | "VALUE": "10-19" 155 | }, 156 | { 157 | "TAG": "host_disease", 158 | "VALUE": "Missing" 159 | }, 160 | { 161 | "TAG": "sub_species", 162 | "VALUE": "enterica" 163 | }, 164 | { 165 | "TAG": "BioSampleModel", 166 | "VALUE": "Pathogen.cl" 167 | } 168 | ] 169 | } 170 | } 171 | ], 172 | "SUBMISSION": { 173 | "@lab_name": "", 174 | "@center_name": "edlb-cdc", 175 | "@accession": "SRA609343", 176 | "@alias": "SUB3055691" 177 | } 178 | } -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA563/SRA563707.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@accession": "SRX2841104", 5 | "@alias": "367250", 6 | "IDENTIFIERS": { 7 | "PRIMARY_ID": "SRX2841104", 8 | "EXTERNAL_ID": [ 9 | { 10 | "@namespace": "PHE", 11 | "$": "367250" 12 | } 13 | ] 14 | }, 15 | "TITLE": null, 16 | "STUDY_REF": { 17 | "@accession": "SRP042645", 18 | "IDENTIFIERS": { 19 | "PRIMARY_ID": "SRP042645", 20 | "EXTERNAL_ID": [ 21 | { 22 | "@namespace": "BioProject", 23 | "$": "PRJNA248792" 24 | } 25 | ] 26 | } 27 | }, 28 | "DESIGN": { 29 | "DESIGN_DESCRIPTION": null, 30 | "SAMPLE_DESCRIPTOR": { 31 | "@accession": "SRS2214451", 32 | "IDENTIFIERS": { 33 | "PRIMARY_ID": "SRS2214451", 34 | "EXTERNAL_ID": [ 35 | { 36 | "@namespace": "PHE", 37 | "$": "367250.biosample" 38 | } 39 | ] 40 | } 41 | }, 42 | "LIBRARY_DESCRIPTOR": { 43 | "LIBRARY_NAME": "367250", 44 | "LIBRARY_STRATEGY": "WGS", 45 | "LIBRARY_SOURCE": "GENOMIC", 46 | "LIBRARY_SELECTION": "RANDOM", 47 | "LIBRARY_LAYOUT": { 48 | "PAIRED": null 49 | }, 50 | "LIBRARY_CONSTRUCTION_PROTOCOL": "Illumina Nextera XT" 51 | } 52 | }, 53 | "PLATFORM": { 54 | "ILLUMINA": { 55 | "INSTRUMENT_MODEL": "Illumina HiSeq 2500" 56 | } 57 | } 58 | } 59 | ], 60 | "RUN": [ 61 | { 62 | "@accession": "SRR5583191", 63 | "@alias": "367250", 64 | "IDENTIFIERS": { 65 | "PRIMARY_ID": "SRR5583191", 66 | "EXTERNAL_ID": [ 67 | { 68 | "@namespace": "PHE", 69 | "$": "367250" 70 | } 71 | ] 72 | }, 73 | "EXPERIMENT_REF": { 74 | "@accession": "SRX2841104", 75 | "IDENTIFIERS": { 76 | "EXTERNAL_ID": [ 77 | { 78 | "@namespace": "PHE", 79 | "$": "367250" 80 | } 81 | ] 82 | } 83 | } 84 | } 85 | ], 86 | "SAMPLE": [ 87 | { 88 | "@alias": "367250.biosample", 89 | "@accession": "SRS2214451", 90 | "IDENTIFIERS": { 91 | "PRIMARY_ID": "SRS2214451", 92 | "EXTERNAL_ID": [ 93 | { 94 | "@namespace": "BioSample", 95 | "$": "SAMN07152381" 96 | } 97 | ] 98 | }, 99 | "TITLE": "Salmonella enterica enterica serovar Salmonella Typhimurium 367250", 100 | "SAMPLE_NAME": { 101 | "TAXON_ID": 59201, 102 | 
"SCIENTIFIC_NAME": "Salmonella enterica subsp. enterica" 103 | }, 104 | "SAMPLE_LINKS": { 105 | "SAMPLE_LINK": [ 106 | { 107 | "XREF_LINK": { 108 | "DB": "bioproject", 109 | "ID": "248792", 110 | "LABEL": "PRJNA248792" 111 | } 112 | } 113 | ] 114 | }, 115 | "SAMPLE_ATTRIBUTES": { 116 | "SAMPLE_ATTRIBUTE": [ 117 | { 118 | "TAG": "strain", 119 | "VALUE": "367250" 120 | }, 121 | { 122 | "TAG": "collected_by", 123 | "VALUE": "PHE" 124 | }, 125 | { 126 | "TAG": "collection_date", 127 | "VALUE": "Apr-2017" 128 | }, 129 | { 130 | "TAG": "isolation_source", 131 | "VALUE": "human" 132 | }, 133 | { 134 | "TAG": "geo_loc_name", 135 | "VALUE": "United Kingdom: United Kingdom" 136 | }, 137 | { 138 | "TAG": "lat_lon", 139 | "VALUE": "Missing" 140 | }, 141 | { 142 | "TAG": "serovar", 143 | "VALUE": "Salmonella Typhimurium" 144 | }, 145 | { 146 | "TAG": "isolate_name_alias", 147 | "VALUE": "367250" 148 | }, 149 | { 150 | "TAG": "sequence_type", 151 | "VALUE": "19" 152 | }, 153 | { 154 | "TAG": "sub_species", 155 | "VALUE": "enterica" 156 | }, 157 | { 158 | "TAG": "potential_contaminant", 159 | "VALUE": "None detected" 160 | }, 161 | { 162 | "TAG": "host", 163 | "VALUE": "Homo sapiens" 164 | }, 165 | { 166 | "TAG": "host_disease", 167 | "VALUE": "Not available" 168 | }, 169 | { 170 | "TAG": "BioSampleModel", 171 | "VALUE": "Pathogen.cl" 172 | } 173 | ] 174 | } 175 | } 176 | ], 177 | "SUBMISSION": { 178 | "@lab_name": "", 179 | "@center_name": "PHE", 180 | "@accession": "SRA563707", 181 | "@alias": "SUB2709487" 182 | } 183 | } -------------------------------------------------------------------------------- /json_examples/json/SRA/SRA245/SRA245334.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@alias": "1 Themisto libellula", 5 | "@accession": "SRX895485", 6 | "@center_name": "McGill University", 7 | "IDENTIFIERS": { 8 | "PRIMARY_ID": "SRX895485", 9 | "SUBMITTER_ID": [ 10 | { 11 | "@namespace": "McGill University", 12 | "$": "1 Themisto libellula" 13 | } 14 | ] 15 | }, 16 | "TITLE": "1 Themisto libellula V4 18S", 17 | "STUDY_REF": { 18 | "@accession": "SRP055766", 19 | "IDENTIFIERS": { 20 | "PRIMARY_ID": "SRP055766", 21 | "EXTERNAL_ID": [ 22 | { 23 | "@namespace": "SubPortal", 24 | "$": "SUB832829" 25 | } 26 | ] 27 | } 28 | }, 29 | "DESIGN": { 30 | "DESIGN_DESCRIPTION": "A single individual of the species Themisto libellula was PCR amplified at the V4 region of 18S, using barcoded primers, and 454 sequenced along with PCR products of 19 other species.", 31 | "SAMPLE_DESCRIPTOR": { 32 | "@accession": "SRS861262", 33 | "IDENTIFIERS": { 34 | "PRIMARY_ID": "SRS861262", 35 | "EXTERNAL_ID": [ 36 | { 37 | "@namespace": "BioSample", 38 | "$": "SAMN03382336" 39 | } 40 | ] 41 | } 42 | }, 43 | "LIBRARY_DESCRIPTOR": { 44 | "LIBRARY_NAME": "1 Themisto libellula V4 18S", 45 | "LIBRARY_STRATEGY": "AMPLICON", 46 | "LIBRARY_SOURCE": "METAGENOMIC", 47 | "LIBRARY_SELECTION": "PCR", 48 | "LIBRARY_LAYOUT": { 49 | "SINGLE": null 50 | } 51 | }, 52 | "SPOT_DESCRIPTOR": { 53 | "SPOT_DECODE_SPEC": { 54 | "SPOT_LENGTH": 0, 55 | "READ_SPEC": [ 56 | { 57 | "READ_INDEX": 0, 58 | "READ_CLASS": "Technical Read", 59 | "READ_TYPE": "Adapter", 60 | "BASE_COORD": 1 61 | }, 62 | { 63 | "READ_INDEX": 1, 64 | "READ_CLASS": "Application Read", 65 | "READ_TYPE": "Forward", 66 | "BASE_COORD": 5 67 | } 68 | ] 69 | } 70 | } 71 | }, 72 | "PLATFORM": { 73 | "LS454": { 74 | "INSTRUMENT_MODEL": "454 GS FLX Titanium" 75 | } 76 | }, 77 | "PROCESSING": null 78 | } 79 | ], 80 | 
"RUN": [ 81 | { 82 | "@alias": "1 Themisto libellula V4 18S", 83 | "@accession": "SRR1823977", 84 | "@center_name": "McGill University", 85 | "IDENTIFIERS": { 86 | "PRIMARY_ID": "SRR1823977", 87 | "SUBMITTER_ID": [ 88 | { 89 | "@namespace": "McGill University", 90 | "$": "1 Themisto libellula V4 18S" 91 | } 92 | ] 93 | }, 94 | "EXPERIMENT_REF": { 95 | "@accession": "SRX895485" 96 | } 97 | } 98 | ], 99 | "SAMPLE": [ 100 | { 101 | "@center_name": "McGill University", 102 | "@alias": "1 Themisto libellula", 103 | "@accession": "SRS861262", 104 | "IDENTIFIERS": { 105 | "PRIMARY_ID": "SRS861262", 106 | "EXTERNAL_ID": [ 107 | { 108 | "@namespace": "BioSample", 109 | "$": "SAMN03382336" 110 | } 111 | ] 112 | }, 113 | "TITLE": "1 Themisto libellula", 114 | "SAMPLE_NAME": { 115 | "TAXON_ID": 1169740, 116 | "SCIENTIFIC_NAME": "aquatic metagenome" 117 | }, 118 | "DESCRIPTION": "An individual Themisto libellula was DNA extracted, amplified with a barcoded primer at the V4 region of 18S, and 454 sequenced with 19 other species.", 119 | "SAMPLE_LINKS": { 120 | "SAMPLE_LINK": [ 121 | { 122 | "XREF_LINK": { 123 | "DB": "bioproject", 124 | "ID": "277040", 125 | "LABEL": "PRJNA277040" 126 | } 127 | } 128 | ] 129 | }, 130 | "SAMPLE_ATTRIBUTES": { 131 | "SAMPLE_ATTRIBUTE": [ 132 | { 133 | "TAG": "collection_date", 134 | "VALUE": "missing" 135 | }, 136 | { 137 | "TAG": "env_biome", 138 | "VALUE": "Aquatic" 139 | }, 140 | { 141 | "TAG": "env_feature", 142 | "VALUE": "Aquatic" 143 | }, 144 | { 145 | "TAG": "env_material", 146 | "VALUE": "Water" 147 | }, 148 | { 149 | "TAG": "geo_loc_name", 150 | "VALUE": "USA: Chukchi Sea, Alaska" 151 | }, 152 | { 153 | "TAG": "lat_lon", 154 | "VALUE": "missing" 155 | }, 156 | { 157 | "TAG": "BioSampleModel", 158 | "VALUE": "MIMARKS.survey" 159 | }, 160 | { 161 | "TAG": "BioSampleModel", 162 | "VALUE": "MIGS/MIMS/MIMARKS.miscellaneous" 163 | } 164 | ] 165 | } 166 | } 167 | ], 168 | "SUBMISSION": { 169 | "@xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance", 170 | "@alias": "1 Themisto libellula", 171 | "@lab_name": "Cristescu lab", 172 | "@center_name": "McGill University", 173 | "@accession": "SRA245334" 174 | } 175 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Edwards Lab](https://img.shields.io/badge/Bioinformatics-EdwardsLab-03A9F4)](https://edwards.flinders.edu.au) 2 | # SRA_Metadata 3 | Get, parse, and extract information from the SRA metadata files 4 | 5 | ## About the SRA metadata 6 | 7 | The SRA contains over 1.5 million samples, and each sample contains lots of runs. The metadata is really key to understanding that data, but the metadata is difficult to organize and understand. Here we collate the metadata information available from the SRA to make it easier to search and find things. 8 | 9 | 10 | ## See also 11 | 12 | You might also look at our [collection of blog posts](https://edwards.flinders.edu.au/sra) about the SRA that explain the organization of the SRA data, and provide alternate mechanisms to download the data, and so on. 13 | 14 | # Downloading the SRA metadata 15 | 16 | There are several components to the SRA data that we are going to download. 17 | 18 | ## SRA_Accessions.tab 19 | 20 | This tab separated file can be downloaded directly from the NCBI: [ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab](ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab). 
39 | 
40 | This file lists all the submissions to the SRA, and lists every accession number associated with each submission. It tells you the status of the datasets.
41 | 
42 | It contains the following columns:
43 | 
44 | * Accession
45 | * Submission
46 | * Status
47 | * Updated
48 | * Published
49 | * Received
50 | * Type
51 | * Center
52 | * Visibility
53 | * Alias
54 | * Experiment
55 | * Sample
56 | * Study
57 | * Loaded
58 | * Spots
59 | * Bases
60 | * Md5sum
61 | * BioSample
62 | * BioProject
63 | * ReplacedBy
64 | 
65 | The key columns here are Accession, Submission, and Status.
66 | 
67 | The data in this file is replicated. A single submission may occur multiple times, represented once for each of the accessions associated with it.
68 | 
69 | At the time of writing there were 27,838,771 entries (lines) in that file. However, there are only 1,413,223 unique submission IDs.
70 | 
71 | From those 1,413,223 unique submission IDs, the `Status` field reports
72 | 
73 | * 1,290,528 live
74 | * 161,652 suppressed
75 | * 92,103 unpublished
76 | * 10 withdrawn
77 | 
78 | (These numbers don't quite add up because there are some projects where the project may be live, but the runs or other parts of the data release may be suppressed or unpublished.)
79 | 
80 | # XML Metadata
81 | 
82 | The XML metadata is available for download from [ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/](ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/). There are daily files, and then once per month, or so, there is a complete release.
83 | 
84 | For example, this file was downloaded:
85 | 
86 | ```bash
87 | curl -o NCBI_SRA_Metadata_Full.tar.gz ftp://ftp.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_Full_20180205.tar.gz
88 | ```
89 | 
90 | When you extract these files, you will get 1,000,000+ directories! Each directory is a single submission, and contains several files describing the data. I extract these using a command like:
91 | 
92 | ```bash
93 | mkdir xml
94 | tar -C xml/ -zxf NCBI_SRA_Metadata_Full.tar.gz
95 | ```
96 | 
97 | There are several [XML Schema Definition files](https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=xml_schemas) that define the data sets. Currently, there are `.xsd` files for:
98 | 
99 | 1. SRA Common
100 | 2. SRA Submission
101 | 3. SRA Study
102 | 4. SRA Sample
103 | 5. SRA Experiment
104 | 6. SRA Run
105 | 7. SRA Analysis
106 | 
107 | # Converting the XML files to JSON
108 | 
109 | We batch process the XML files and convert them to JSON, using [a Python script](xml2json/xml_dir2json_random.py). This code uses the XML Schema Definition files to validate the XML files, and then dumps a single file per submission in JSON format.
110 | 
111 | This version chooses a file at random from the XML directory, checks to see if it has already been processed, and if not, it processes it. This allows us to run the code in parallel (using the awesome [GNU parallel](https://www.gnu.org/software/parallel/)) and process lots of XML files all at once. For example, to process this code using 30 different processors, we can do:
112 | 
113 | ```bash
114 | echo "xml_dir2json_random.py -s $HOME/SRA/SRAdb/XML/Schemas/ -d xml -o json -m srr_sra_ids.tsv" > ./run_xml.sh
115 | chmod +x ./run_xml.sh
116 | seq 1 30 | parallel ./run_xml.sh {}
117 | ```
118 | 
119 | This command creates a directory called `json` with three subdirectories, one each for `SRA`, `ERA`, `DRA`. Within those three directories, there are directories for each submission, grouped by the first three numbers of the accession.
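120 | 
121 | To find the JSON file for a given accession in this layout you only need string slices. Here is a minimal sketch (the `json_path` helper name is just for illustration):
122 | 
123 | ```python
124 | import os
125 | 
126 | def json_path(acc, base="json"):
127 |     # e.g. SRA889255 -> json/SRA/SRA889/SRA889255.json
128 |     return os.path.join(base, acc[0:3], acc[0:6], f"{acc}.json")
129 | 
130 | print(json_path("SRA889255"))
131 | ```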
132 | 
133 | We use this structure because (a) it mirrors the structure at NCBI and elsewhere, and (b) breaking up the files into multiple subdirectories is much better for your filesystem. There are over 1,000,000 files, and so it takes commands like `ls` a long time to read the [inodes](http://www.grymoire.com/Unix/Inodes.html). By splitting the files out, we can more readily access and process them.
134 | 
135 | 
136 | > *Tip:* If you have an SRA ID such as `SRA889255`, you can set `SRR=SRA889255` and access the appropriate file with, for example, `ls json/${SRR:0:3}/${SRR:0:6}/$SRR.json`.
137 | 
138 | This command also creates an *id mapping* file called `srr_sra_ids.tsv` that has two columns, the SRA submission ID (or ERA/DRA ID) and the SRA Run ID. The most common association we are looking for is from SRR -> SRA. For example, we usually know the SRR IDs associated with a sequence run, and would like to explore the metadata associated with that run. Alternatively, we know a sample and would like to get the DNA sequences associated with it. This mapping provides that connection, and you can quickly look for either a run or a submission using `grep`.
139 | 
140 | In addition, we create a file called `XML_validation_errors.txt` that reports any improper XML data that does not match the XML Schema Definitions.
141 | 
142 | We now have a directory with all the metadata as json objects that you can analyze in different ways.
143 | 
144 | # JSON
145 | 
146 | We have some [JSON](json/) parsing code to help you explore the data. Before you begin, however, take a look at the [json_examples](json_examples/) data directory. These are ten samples chosen completely at random from the August 2019 metadata to demonstrate the organization of the metadata there.
147 | 
148 | I also recommend using [jq](https://stedolan.github.io/jq/) for processing the data on the command line.
149 | 
150 | Here are a couple of examples from our [partie](https://github.com/linsalrob/partie) analysis of SRA datasets.
151 | 
152 | First, find all the submissions that are metagenomes or microbiomes using grep. You could also do this with the XML files; there is nothing json-specific about this grep.
153 | 
154 | ```bash
155 | egrep -rli 'metagenome|microbiome' json | perl -pe 's#json/##; s#.json##' > metagenomes.txt
156 | ```
157 | 
158 | We now have a file called `metagenomes.txt` that has one SRA submission per line; somewhere in each of those files the word `metagenome` or `microbiome` appears.
159 | 
160 | Now we can use [jq](https://stedolan.github.io/jq/) to extract just the run identifiers from these files:
161 | 
162 | ```bash
163 | cat metagenomes.txt | xargs -i jq -r "try .RUN[].IDENTIFIERS.PRIMARY_ID" json/{}.json > metagenome_runs.txt
164 | ```
165 | 
166 | In this command, we cat the file of IDs, and for each file, we use `jq` to parse the json data. We look for any `RUN` and from that pull the `IDENTIFIERS` entry, and then the `PRIMARY_ID` for that run. This prints out one `PRIMARY_ID` per line. The `try` in that command is a jq option that is basic error handling. We could add both a `try` and a `catch`, and use that to report on any JSON files that do not have a RUN associated with them; however, at the moment we don't care about those ... we just ignore them!
167 | 
168 | I don't know how to succinctly parse the XML to get this information (though you could probably do it with `grep`).
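169 | 
170 | If you would rather stay in Python, here is a rough equivalent of that jq pipeline; like the `try`, it silently skips entries without a `RUN` (a minimal sketch, assuming the `metagenomes.txt` produced by the grep above):
171 | 
172 | ```python
173 | import json
174 | 
175 | # print RUN -> IDENTIFIERS -> PRIMARY_ID for every submission in the list
176 | with open("metagenomes.txt") as ids:
177 |     for acc in (line.strip() for line in ids):
178 |         with open(f"json/{acc}.json") as fin:
179 |             data = json.load(fin)
180 |         for run in data.get("RUN", []):
181 |             pid = run.get("IDENTIFIERS", {}).get("PRIMARY_ID")
182 |             if pid:
183 |                 print(pid)
184 | ```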
185 | 
186 | 
187 | 
--------------------------------------------------------------------------------
/xml2json/xml_dir2json_random.py:
--------------------------------------------------------------------------------
1 | """
2 | Read a directory of XML files and convert the output to JSON.
3 | We write the JSON object to a file. By default, we do not
4 | overwrite existing files, but you can force that with the -f
5 | flag.
6 | """
7 | 
8 | import os
9 | import sys
10 | import argparse
11 | from sra_metadata_libs import bcolors
12 | import xmlschema
13 | from xmlschema.validators.exceptions import XMLSchemaValidationError
14 | import json
15 | import random
16 | import errno
17 | import time
18 | import fcntl
19 | from pprint import pprint
20 | __author__ = 'Rob Edwards'
21 | 
22 | 
23 | def validation_errors(sample, error, verbose=False):
24 |     """
25 |     Log the validation error to a file. We append to the file.
26 | 
27 |     :param sample: The sample ID
28 |     :param error: The python Error object
29 |     :param verbose: More output
30 |     """
31 | 
32 |     if verbose:
33 |         sys.stderr.write(f"{bcolors.PINK}Logging error for {sample}{bcolors.ENDC}\n")
34 | 
35 |     out = open("XML_validation_errors.txt", "a")
36 |     while True:
37 |         try:
38 |             fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
39 |             break
40 |         except IOError as e:
41 |             # raise on unrelated IOErrors
42 |             if e.errno != errno.EAGAIN:
43 |                 raise
44 |             else:
45 |                 time.sleep(0.1)
46 | 
47 |     out.write(f"\n=== BEGIN {sample} ===\n")
48 |     out.write(str(error))
49 |     out.write(f"\n=== END {sample} ===\n")
50 |     fcntl.flock(out, fcntl.LOCK_UN)
51 |     out.close()
52 | 
53 | def write_id_map(data, imf, verbose=False):
54 |     """
55 |     Write an ID mapping file that has the SRA submission ID and the SRR Run ID
56 |     :param data: The JSON data object
57 |     :param imf: The id mapping file to write to
58 |     :param verbose: more output
59 |     :return:
60 |     """
61 | 
62 |     out = open(imf, "a")
63 |     while True:
64 |         try:
65 |             fcntl.flock(out, fcntl.LOCK_EX | fcntl.LOCK_NB)
66 |             break
67 |         except IOError as e:
68 |             # raise on unrelated IOErrors
69 |             if e.errno != errno.EAGAIN:
70 |                 raise
71 |             else:
72 |                 time.sleep(0.1)
73 |     acc = None
74 |     if 'SUBMISSION' in data and '@accession' in data['SUBMISSION']:
75 |         acc = data['SUBMISSION']['@accession']
76 |         if verbose:
77 |             sys.stderr.write(f"{bcolors.GREEN}ACCESSION: {acc}. Writing runs{bcolors.ENDC}\n")
78 |     else:
79 |         # note that we now test for this earlier, so shouldn't really get here!
80 |         sys.stderr.write(f"{bcolors.RED}FATAL. NO @accession in {data}{bcolors.ENDC}")
81 |         sys.exit(-1)
82 | 
83 |     if 'RUN' in data:
84 |         for run in data['RUN']:
85 |             if 'PRIMARY_ID' in run['IDENTIFIERS']:
86 |                 if verbose:
87 |                     sys.stderr.write(f"\t{bcolors.GREEN}{run['IDENTIFIERS']['PRIMARY_ID']}{bcolors.ENDC}\n")
88 |                 out.write(f"{acc}\t{run['IDENTIFIERS']['PRIMARY_ID']}\n")
89 |     else:
90 |         if verbose:
91 |             sys.stderr.write(f"{bcolors.PINK}No RUN found in {acc}{bcolors.ENDC}\n")
92 |     fcntl.flock(out, fcntl.LOCK_UN)
93 |     out.close()
94 | 
95 | def read_schemas(schemadir, verbose=True):
96 |     """
97 |     Read the XML Schema definition files, and return a dict of schema objects.
98 | :param verbose: more output 99 | :return: dict of schema objects 100 | """ 101 | 102 | # known XML Schemas 103 | schema_types = {"analysis" : "SRA.analysis.xsd", "common" : "SRA.common.xsd", 104 | "experiment" : "SRA.experiment.xsd", "package" : "SRA.package.xsd", 105 | "run" : "SRA.run.xsd", "sample" : "SRA.sample.xsd", "study": "SRA.study.xsd", 106 | "submission" : "SRA.submission.xsd"} 107 | 108 | 109 | schemas = {} 110 | 111 | for s in schema_types: 112 | if verbose: 113 | sys.stderr.write(f"{bcolors.GREEN}Schema parsing{bcolors.ENDC} {s}\n") 114 | schemas[s] = xmlschema.XMLSchema(os.path.join(schemadir, schema_types[s])) 115 | 116 | return schemas 117 | 118 | def read_directory(basedir, subdir, schemas, verbose=False): 119 | """ 120 | Read a directory and create a single dict for that directory 121 | 122 | :param basedir: The base directory of all the XML files 123 | :param subdir: The sample directory with each of the individual XML files 124 | :param schemas: The dictionary of XML Schema Definitions 125 | :param verbose: more output 126 | :return: a dict of all the data 127 | """ 128 | 129 | data = {} 130 | for s in schemas: 131 | sc = schemas[s] 132 | if not os.path.exists(os.path.join(basedir, subdir, f"{subdir}.{s}.xml")): 133 | if verbose and s not in ['analysis', 'common', 'package']: 134 | sys.stderr.write(f"{bcolors.RED}WARN: {basedir}/{subdir}/{subdir}.{s}.xml not found{bcolors.ENDC}\n") 135 | continue 136 | 137 | try: 138 | xm = sc.to_dict(os.path.join(basedir, subdir, f"{subdir}.{s}.xml"), decimal_type=str) 139 | except XMLSchemaValidationError as e: 140 | validation_errors(subdir, e, verbose) 141 | continue 142 | 143 | # the submission XML is the object itself; the other document types are wrapped in a container element that we unwrap here 144 | if s == 'submission': 145 | data['SUBMISSION'] = xm 146 | else: 147 | data[s.upper()] = xm[s.upper()] 148 | 149 | return data 150 | 151 | def write_json(xml, outfile, verbose=False): 152 | """ 153 | Write the dictionary to a JSON file 154 | 155 | :param xml: The dictionary of the XML object 156 | :param outfile: The file to write to 157 | :param verbose: more output 158 | """ 159 | 160 | with open(outfile, 'w') as out: 161 | out.write(json.dumps(xml, indent=4)) 162 | 163 | 164 | if __name__ == "__main__": 165 | parser = argparse.ArgumentParser(description='Parse a directory or directories and create a json output for each one') 166 | parser.add_argument('-d', help='directory to parse', required=True) 167 | parser.add_argument('-o', help='where to put the json files', required=True) 168 | parser.add_argument('-s', help='Schema directory', required=True) 169 | parser.add_argument('-f', help='force writing of the file, even if it exists', action='store_true') 170 | parser.add_argument('-m', help="Run ID to Submission ID mapping file (default=srr_sra_ids.tsv)", default="srr_sra_ids.tsv") 171 | parser.add_argument('-v', help='verbose output', action='store_true') 172 | args = parser.parse_args() 173 | 174 | 175 | # read all the known schemas 176 | if not os.path.exists(args.s): 177 | sys.stderr.write(f"{bcolors.RED}FATAL: Schemas/ directory with known xml schemas not found{bcolors.ENDC}\n") 178 | sys.exit(-1) 179 | schemas = read_schemas(args.s, args.v) 180 | 181 | if not os.path.exists(args.o): 182 | os.mkdir(args.o) 183 | 184 | # read all the files in the base directory and randomize the order.
This is so 185 | # we can run multiple instances in parallel 186 | fs = os.listdir(args.d) 187 | random.shuffle(fs) 188 | for submission in fs: 189 | outpath = os.path.join(args.o, submission[0:3], submission[0:6]) 190 | if not os.path.exists(outpath): 191 | os.makedirs(outpath, exist_ok=True) 192 | outfile = os.path.join(outpath, f"{submission}.json") 193 | if (not args.f) and os.path.exists(outfile): 194 | continue 195 | # we create the semaphore file that is empty so we don't try and do this twice 196 | with open(outfile, 'w') as w: 197 | w.write("") 198 | 199 | 200 | if args.v: 201 | sys.stderr.write(f"{bcolors.GREEN}Parsing {bcolors.ENDC} {submission}\n") 202 | 203 | data = read_directory(args.d, submission, schemas, args.v) 204 | 205 | if 'SUBMISSION' not in data: 206 | sys.stderr.write(f"No SUBMISSION read for {submission}. Skipped\n") 207 | continue 208 | elif '@accession' not in data['SUBMISSION']: 209 | sys.stderr.write(f"No @accession in the SUBMISSION for {submission}. Skipped\n") 210 | continue 211 | 212 | write_id_map(data, args.m, args.v) 213 | 214 | if args.v: 215 | sys.stderr.write(f"{bcolors.BLUE}Writing {bcolors.ENDC} {submission}\n") 216 | write_json(data, outfile, args.v) 217 | 218 | -------------------------------------------------------------------------------- /json_examples/json/ERA/ERA693/ERA693801.json: -------------------------------------------------------------------------------- 1 | { 2 | "EXPERIMENT": [ 3 | { 4 | "@alias": "EXP-68-1-2016-08-23_13-12-12", 5 | "@accession": "ERX1667430", 6 | "@broker_name": "", 7 | "IDENTIFIERS": { 8 | "PRIMARY_ID": "ERX1667430", 9 | "SUBMITTER_ID": [ 10 | { 11 | "@namespace": "IFH_MS", 12 | "$": "EXP-68-1-2016-08-23_13-12-12" 13 | }, 14 | { 15 | "@namespace": "University Hospital Muenster", 16 | "$": "EXP-68-1-2016-08-23_13-12-12" 17 | } 18 | ] 19 | }, 20 | "TITLE": "Illumina MiSeq paired end sequencing; EXP-68-1-2016-08-23_13-12-12", 21 | "STUDY_REF": { 22 | "@accession": "ERP016940", 23 | "IDENTIFIERS": { 24 | "PRIMARY_ID": "ERP016940" 25 | } 26 | }, 27 | "DESIGN": { 28 | "DESIGN_DESCRIPTION": null, 29 | "SAMPLE_DESCRIPTOR": { 30 | "@accession": "ERS1305242", 31 | "IDENTIFIERS": { 32 | "PRIMARY_ID": "ERS1305242", 33 | "EXTERNAL_ID": [ 34 | { 35 | "@namespace": "BioSample", 36 | "$": "SAMEA4393793" 37 | } 38 | ] 39 | } 40 | }, 41 | "LIBRARY_DESCRIPTOR": { 42 | "LIBRARY_NAME": "unspecified", 43 | "LIBRARY_STRATEGY": "WGS", 44 | "LIBRARY_SOURCE": "GENOMIC", 45 | "LIBRARY_SELECTION": "RANDOM", 46 | "LIBRARY_LAYOUT": { 47 | "PAIRED": { 48 | "@NOMINAL_LENGTH": 300 49 | } 50 | } 51 | }, 52 | "SPOT_DESCRIPTOR": { 53 | "SPOT_DECODE_SPEC": { 54 | "SPOT_LENGTH": 500, 55 | "READ_SPEC": [ 56 | { 57 | "READ_INDEX": 0, 58 | "READ_LABEL": "F1", 59 | "READ_CLASS": "Application Read", 60 | "READ_TYPE": "Forward", 61 | "BASE_COORD": 1 62 | }, 63 | { 64 | "READ_INDEX": 1, 65 | "READ_LABEL": "R2", 66 | "READ_CLASS": "Application Read", 67 | "READ_TYPE": "Reverse", 68 | "BASE_COORD": 251 69 | } 70 | ] 71 | } 72 | } 73 | }, 74 | "PLATFORM": { 75 | "ILLUMINA": { 76 | "INSTRUMENT_MODEL": "Illumina MiSeq" 77 | } 78 | } 79 | } 80 | ], 81 | "RUN": [ 82 | { 83 | "@accession": "ERR1596844", 84 | "@alias": "NGSRT18C2 exp EXP-68-1-2016-08-23_13-12-12 run 1", 85 | "IDENTIFIERS": { 86 | "PRIMARY_ID": "ERR1596844", 87 | "SUBMITTER_ID": [ 88 | { 89 | "@namespace": "IFH_MS", 90 | "$": "NGSRT18C2 exp EXP-68-1-2016-08-23_13-12-12 run 1" 91 | }, 92 | { 93 | "@namespace": "University Hospital Muenster", 94 | "$": "NGSRT18C2 exp EXP-68-1-2016-08-23_13-12-12 run 1" 
95 | } 96 | ] 97 | }, 98 | "TITLE": "Illumina MiSeq paired end sequencing; EXP-68-1-2016-08-23_13-12-12", 99 | "EXPERIMENT_REF": { 100 | "@accession": "ERX1667430", 101 | "IDENTIFIERS": { 102 | "PRIMARY_ID": "ERX1667430" 103 | } 104 | }, 105 | "SPOT_DESCRIPTOR": { 106 | "SPOT_DECODE_SPEC": { 107 | "SPOT_LENGTH": 502, 108 | "READ_SPEC": [ 109 | { 110 | "READ_INDEX": 0, 111 | "READ_CLASS": "Application Read", 112 | "READ_TYPE": "Forward", 113 | "BASE_COORD": 1 114 | }, 115 | { 116 | "READ_INDEX": 1, 117 | "READ_CLASS": "Application Read", 118 | "READ_TYPE": "Reverse", 119 | "BASE_COORD": 252 120 | } 121 | ] 122 | } 123 | }, 124 | "RUN_ATTRIBUTES": { 125 | "RUN_ATTRIBUTE": [ 126 | { 127 | "TAG": "ENA-FIRST-PUBLIC", 128 | "VALUE": "2016-12-25" 129 | }, 130 | { 131 | "TAG": "ENA-LAST-UPDATE", 132 | "VALUE": "2018-11-16" 133 | } 134 | ] 135 | } 136 | } 137 | ], 138 | "SAMPLE": [ 139 | { 140 | "@alias": "SAMEA4393793", 141 | "@accession": "ERS1305242", 142 | "IDENTIFIERS": { 143 | "PRIMARY_ID": "ERS1305242", 144 | "EXTERNAL_ID": [ 145 | { 146 | "@namespace": "BioSample", 147 | "$": "SAMEA4393793" 148 | } 149 | ] 150 | }, 151 | "TITLE": "NGSRT18C2", 152 | "SAMPLE_NAME": { 153 | "TAXON_ID": 1280, 154 | "SCIENTIFIC_NAME": "Staphylococcus aureus" 155 | }, 156 | "SAMPLE_ATTRIBUTES": { 157 | "SAMPLE_ATTRIBUTE": [ 158 | { 159 | "TAG": "Alias", 160 | "VALUE": "NGSRT18C2" 161 | }, 162 | { 163 | "TAG": "ENA checklist", 164 | "VALUE": "ERC000011" 165 | }, 166 | { 167 | "TAG": "INSDC center name", 168 | "VALUE": "IFH_MS" 169 | }, 170 | { 171 | "TAG": "INSDC first public", 172 | "VALUE": "2016-12-25T17:01:10Z" 173 | }, 174 | { 175 | "TAG": "INSDC last update", 176 | "VALUE": "2016-08-23T15:37:20Z" 177 | }, 178 | { 179 | "TAG": "INSDC status", 180 | "VALUE": "public" 181 | }, 182 | { 183 | "TAG": "SRA accession", 184 | "VALUE": "ERS1305242" 185 | }, 186 | { 187 | "TAG": "Sample Name", 188 | "VALUE": "ERS1305242" 189 | }, 190 | { 191 | "TAG": "Title", 192 | "VALUE": "NGSRT18C2" 193 | }, 194 | { 195 | "TAG": "collection_date", 196 | "VALUE": "not available" 197 | }, 198 | { 199 | "TAG": "country", 200 | "VALUE": "not available" 201 | }, 202 | { 203 | "TAG": "host", 204 | "VALUE": "Homo sapiens" 205 | }, 206 | { 207 | "TAG": "host_associated", 208 | "VALUE": "Yes" 209 | } 210 | ] 211 | } 212 | } 213 | ], 214 | "SUBMISSION": { 215 | "@alias": "NGSRT18C2 sub 1", 216 | "@accession": "ERA693801", 217 | "@lab_name": "European Nucleotide Archive" 218 | } 219 | } --------------------------------------------------------------------------------