├── tests ├── __init__.py ├── data │ ├── image │ │ └── dog2.jpeg │ ├── fail_file.dat │ ├── test.csv │ ├── pwscf │ │ └── NaF.scf.tar.gz │ ├── vasp │ │ └── AlNi_static_LDA.tar.gz │ ├── electron_microscopy │ │ └── test_files.tar.gz │ ├── yaml │ │ └── test_yaml.yaml │ ├── xml │ │ └── test_xml.xml │ ├── json │ │ └── test_json.json │ ├── crystal_structure │ │ ├── diamond.cif │ │ ├── Ce3VO16.cif │ │ ├── Al2O3.cif │ │ └── C13H22O3.cif │ ├── tdb │ │ ├── test_PbTe.TDB │ │ ├── test_AuSi.TDB │ │ └── PbSSeTe_Na.TDB │ └── cif │ │ └── 1548397.cif ├── test_version.py ├── test_image.py ├── test_ase.py ├── test_adapter.py ├── test_crystal_structure.py ├── conftest.py ├── test_file.py ├── test_base.py ├── test_csv.py ├── test_filename.py ├── test_tdb.py ├── test_dft.py ├── test_json.py ├── test_yaml.py ├── test_xml.py └── test_utils.py ├── .python-version ├── MANIFEST.in ├── docs ├── requirements.txt ├── source │ ├── api │ │ └── scythe.rst │ ├── index.rst │ ├── extractors.rst │ ├── goals.rst │ ├── conf.py │ ├── user-guide.rst │ └── contributor-guide.rst ├── Makefile └── make.bat ├── test-requirements.txt ├── scythe ├── __init__.py ├── adapters │ ├── __init__.py │ └── base.py ├── version.py ├── testing.py ├── image.py ├── schemas │ └── file.json ├── json.py ├── yaml.py ├── xml.py ├── tdb.py ├── filename.py ├── crystal_structure.py ├── ase.py ├── file.py ├── utils │ ├── grouping.py │ ├── __init__.py │ └── interface.py ├── csv.py ├── dft.py └── base.py ├── setup.cfg ├── .gitignore ├── readthedocs.yaml ├── .github └── workflows │ ├── release.yml │ └── test-suite-and-docs.yml ├── README.md ├── pyproject.toml └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.8.12 2 | 3.9.12 3 | 3.10.4 4 | 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include scythe/schemas *.json 2 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>5 2 | sphinx-rtd-theme 3 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-cov 3 | flake8 4 | diff-cover 5 | -------------------------------------------------------------------------------- /scythe/__init__.py: -------------------------------------------------------------------------------- 1 | from scythe.version import __version__ # noqa: F401 2 | -------------------------------------------------------------------------------- /scythe/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | """Functions and classes related to adapters""" 2 | -------------------------------------------------------------------------------- /tests/data/image/dog2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/materials-data-facility/scythe/HEAD/tests/data/image/dog2.jpeg -------------------------------------------------------------------------------- /tests/data/fail_file.dat: -------------------------------------------------------------------------------- 1 | This is a test file that is designed to fail most extraction, because it contains no data. 
def test_version():
    """The package's version attribute must be exposed as a plain string."""
    ver = version.__version__
    assert isinstance(ver, str)
# we target 3.8+, so this should be okay without fallback to importlib_metadata
import importlib.metadata

# single source of truth for package version,
# see https://packaging.python.org/en/latest/single_source_version/

# NOTE: the *distribution* is published as 'scythe-extractors' even though the
# import package is 'scythe'; importlib.metadata looks up the distribution name.
__version__ = importlib.metadata.version('scythe-extractors')
@pytest.fixture
def test_image():
    # Path to the sample JPEG fixture bundled with the test suite.
    return os.path.join(os.path.dirname(__file__), 'data', 'image', 'dog2.jpeg')


def test_parse(test_image):
    """ImageExtractor should report format, dimensions, megapixels and shape.

    Shape is [height, width, channels]; dog2.jpeg is a 1910x1000 RGB JPEG.
    """
    p = ImageExtractor()
    assert (p.extract([test_image]) == {'image': {'format': 'JPEG', 'height': 1000,
                                                  'megapixels': 1.91, 'width': 1910,
                                                  'shape': [1000, 1910, 3]}})
class NOOPExtractor(BaseExtractor):
    """Determine whether files exist, used for debugging

    Is not truly a "noop" parser, as it does perform a check as to whether the parser
    has access to a certain file. It is more a "check if the parser could run and then do
    nothing" parser.
    """

    def extract(self, group: Iterable[str], context: dict = None):
        """Map each path in ``group`` to whether it exists on disk.

        Args:
            group: Paths of the files to check.
            context: Unused; accepted for API compatibility with other extractors.
        Returns:
            dict: ``{path: os.path.exists(path)}`` for every path in ``group``.
        """
        # Dict comprehension is the idiomatic form of dict(generator expression)
        return {f: os.path.exists(f) for f in group}

    def version(self):
        return '0.0.1'

    def implementors(self):
        return ['Logan Ward ']
class ImageExtractor(BaseSingleFileExtractor):
    """Retrieves basic information about an image"""

    def _extract_file(self, file_path, context=None):
        """Read basic metadata (size, format, band count) from an image file.

        Args:
            file_path (str): Path to the image file.
            context (dict): Unused; accepted for API compatibility.
        Returns:
            dict: Image metadata under the ``image`` key.
        """
        # Use a context manager so Pillow closes the underlying file handle
        # promptly instead of leaking it until garbage collection.
        with Image.open(file_path) as im:
            return {
                "image": {
                    "width": im.width,
                    "height": im.height,
                    "format": im.format,
                    "megapixels": (im.width * im.height) / 1000000,
                    "shape": [
                        im.height,
                        im.width,
                        len(im.getbands())
                    ]
                }
            }

    def implementors(self):
        return ['Jonathon Gaff']

    def version(self):
        return '0.0.2'
-------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /scythe/schemas/file.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "description": "Output from the Scythe File Parser", 4 | "type": "object", 5 | "properties": { 6 | "mime_type": { 7 | "type": "string", 8 | "description": "MIME type of the file. 
def test_compatibility():
    """An adapter whose version() is None is compatible with any extractor;
    otherwise the adapter and extractor version strings must match exactly."""
    adapter = NOOPAdapter()
    parser = NOOPExtractor()

    # Make sure `None` is always compatible
    assert adapter.version() is None
    assert adapter.check_compatibility(parser)

    # Make sure giving the adapter the same version number works
    adapter.version = lambda: parser.version()
    assert adapter.check_compatibility(parser)

    # Make sure giving it a different version number breaks compatibility
    adapter.version = lambda: parser.version() + '1'
    assert not adapter.check_compatibility(parser)


def test_greedy_adapter_unserializable():
    """Values that json cannot serialize (e.g. bytes) should be replaced by a
    placeholder string rather than raising during transform()."""
    adapter = GreedySerializeAdapter()
    unserializable_bytes = {'key': b'\x03\xdd'}
    s = adapter.transform(unserializable_bytes)
    assert s == '{"key": "<>"}'
# Compressed archive of the electron-microscopy test fixtures; unpacked for
# the duration of the test session by the hooks below.
tar_f = pathlib.Path(__file__).parent / 'data' / 'electron_microscopy' / \
    'test_files.tar.gz'


def pytest_sessionstart(session):
    """
    Called after the Session object has been created and
    before performing collection and entering the run test loop.

    Unpack the compressed electron_microscopy test files.
    """
    # NOTE(review): extractall performs no path sanitisation on Pythons < 3.12;
    # acceptable here because the archive is a trusted, repo-local fixture, but
    # consider the ``filter='data'`` argument if the archive source ever changes.
    with tarfile.open(tar_f, 'r:gz') as tar:
        tar.extractall(path=pathlib.Path(tar_f).parent)
25 | """ 26 | with tarfile.open(tar_f, 'r:gz') as tar: 27 | fn_list = tar.getnames() 28 | 29 | fn_list = [pathlib.Path(__file__).parent / 'data' / 30 | 'electron_microscopy' / f for f in fn_list] 31 | for path in fn_list: 32 | if path.is_file(): 33 | path.unlink() 34 | -------------------------------------------------------------------------------- /scythe/json.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from mdf_toolbox import translate_json 4 | 5 | from scythe.base import BaseSingleFileExtractor 6 | 7 | 8 | class JSONExtractor(BaseSingleFileExtractor): 9 | """Extracts fields in JSON into a user-defined new schema.""" 10 | 11 | def _extract_file(self, path, context=None): 12 | """Context used: 13 | mapping (dict): Required. The mapping of desired_fields: existing_fields, 14 | using dot notation. For example: 15 | {"good_schema.good_field": "oldSchema.longpath.nestedDicts.old_field"} 16 | na_values (list of str): Values to treat as N/A. Default None. 17 | """ 18 | if not context.get("mapping"): 19 | raise ValueError("Mapping is required for the JSONExtractor.") 20 | with open(path) as f: 21 | file_json = json.load(f) 22 | return translate_json(file_json, context["mapping"], 23 | na_values=context.get("na_values", None)) 24 | 25 | def implementors(self): 26 | return ['Jonathon Gaff'] 27 | 28 | def version(self): 29 | return '0.0.1' 30 | -------------------------------------------------------------------------------- /scythe/yaml.py: -------------------------------------------------------------------------------- 1 | from mdf_toolbox import translate_json 2 | import yaml 3 | 4 | from scythe.base import BaseSingleFileExtractor 5 | 6 | 7 | class YAMLExtractor(BaseSingleFileExtractor): 8 | """Extracts fields in YAML into a user-defined new schema in JSON.""" 9 | 10 | def _extract_file(self, path, context=None): 11 | """Context used: 12 | mapping (dict): Required. 
The mapping of desired_fields: existing_fields, 13 | using dot notation. For example: 14 | {"good_schema.good_field": "oldSchema.longpath.nestedDicts.old_field"} 15 | na_values (list of str): Values to treat as N/A. Default None. 16 | """ 17 | if not context.get("mapping"): 18 | raise ValueError("Mapping is required for the YAMLExtractor.") 19 | with open(path) as f: 20 | file_json = yaml.safe_load(f) 21 | return translate_json(file_json, context["mapping"], 22 | na_values=context.get("na_values", None)) 23 | 24 | def implementors(self): 25 | return ['Jonathon Gaff'] 26 | 27 | def version(self): 28 | return '0.0.1' 29 | -------------------------------------------------------------------------------- /tests/data/crystal_structure/diamond.cif: -------------------------------------------------------------------------------- 1 | # generated using pymatgen 2 | data_C 3 | _symmetry_space_group_name_H-M 'P 1' 4 | _cell_length_a 3.57370926 5 | _cell_length_b 3.57370926 6 | _cell_length_c 3.57370926 7 | _cell_angle_alpha 90.00000000 8 | _cell_angle_beta 90.00000000 9 | _cell_angle_gamma 90.00000000 10 | _symmetry_Int_Tables_number 1 11 | _chemical_formula_structural C 12 | _chemical_formula_sum C8 13 | _cell_volume 45.64126285 14 | _cell_formula_units_Z 8 15 | loop_ 16 | _symmetry_equiv_pos_site_id 17 | _symmetry_equiv_pos_as_xyz 18 | 1 'x, y, z' 19 | loop_ 20 | _atom_site_type_symbol 21 | _atom_site_label 22 | _atom_site_symmetry_multiplicity 23 | _atom_site_fract_x 24 | _atom_site_fract_y 25 | _atom_site_fract_z 26 | _atom_site_occupancy 27 | C C1 1 0.250000 0.250000 0.250000 1 28 | C C2 1 0.000000 0.000000 0.000000 1 29 | C C3 1 0.250000 0.750000 0.750000 1 30 | C C4 1 0.000000 0.500000 0.500000 1 31 | C C5 1 0.750000 0.250000 0.750000 1 32 | C C6 1 0.500000 0.000000 0.500000 1 33 | C C7 1 0.750000 0.750000 0.250000 1 34 | C C8 1 0.500000 0.500000 0.000000 1 35 | -------------------------------------------------------------------------------- /scythe/xml.py: 
class XMLExtractor(BaseSingleFileExtractor):
    """Extracts fields in XML into a user-defined new schema in JSON."""

    def _extract_file(self, path, context=None):
        """Context used:
            mapping (dict): Required. The mapping of desired_fields: existing_fields,
                    using dot notation. For example:
                    {"good_schema.good_field": "oldSchema.longpath.nestedDicts.old_field"}
            na_values (list of str): Values to treat as N/A. Default None.

        Raises:
            ValueError: If no mapping is supplied (including ``context=None``).
        """
        # Guard against context being None (its declared default): the bare
        # context.get() call would raise AttributeError instead of the
        # intended, documented ValueError.
        if not context or not context.get("mapping"):
            raise ValueError("Mapping is required for the XMLExtractor.")
        with open(path) as f:
            file_json = xmltodict.parse(f.read())
        return translate_json(file_json, context["mapping"],
                              na_values=context.get("na_values", None))

    def implementors(self):
        return ['Jonathon Gaff']

    def version(self):
        return '0.0.1'
class TDBExtractor(BaseSingleFileExtractor):
    """Extract metadata from a Thermodynamic Database (TDB) file.

    Built atop `PyCALPHAD `_.
    """

    def _extract_file(self, path, context=None):
        """Parse a TDB file into material composition and CALPHAD phase data.

        Args:
            path (str): Path to the TDB file.
            context (dict): Unused; accepted for API compatibility.
        Returns:
            dict: With optional ``material`` (composition string) and
                ``calphad`` (list of phase names) sections; empty dict if
                the database yields neither.
        """
        material = {}
        calphad = {}
        # Attempt to read the file
        calphad_db = pycalphad.Database(path)
        composition = ""
        for element in calphad_db.elements:
            # Only alphanumeric symbols are kept — presumably to skip
            # pseudo-elements such as '/-' in CALPHAD databases (TODO confirm).
            # Real symbols are normalised to capitalised form, e.g. 'FE' -> 'Fe'.
            if element.isalnum():
                element = element.lower()
                element = element[0].upper() + element[1:]
                composition += element

        phases = list(calphad_db.phases.keys())

        if composition:
            material['composition'] = composition
        if phases:
            calphad['phases'] = phases

        # Create record; omit sections that came back empty
        record = {}
        if material:
            record["material"] = material
        if calphad:
            record["calphad"] = calphad
        return record

    def implementors(self):
        return ['Jonathon Gaff']

    def version(self):
        return '0.0.1'
18 | """ 19 | if not context.get("mapping"): 20 | raise ValueError("Mapping is required for the FilenameExtractor.") 21 | 22 | record = {} 23 | filename = os.path.basename(path) 24 | for json_path, pattern in flatten_json(context["mapping"]).items(): 25 | match = re.search(pattern, filename) 26 | if match: 27 | fields = json_path.split(".") 28 | last_field = fields.pop() 29 | current_field = record 30 | # Create all missing fields 31 | for field in fields: 32 | if current_field.get(field) is None: 33 | current_field[field] = {} 34 | current_field = current_field[field] 35 | # Add value to end 36 | current_field[last_field] = match.group() 37 | return record 38 | 39 | def implementors(self): 40 | return ['Jonathon Gaff'] 41 | 42 | def version(self): 43 | return '0.0.1' 44 | -------------------------------------------------------------------------------- /tests/test_file.py: -------------------------------------------------------------------------------- 1 | from scythe.file import GenericFileExtractor 2 | import pytest 3 | import os 4 | 5 | 6 | def test_file(): 7 | my_file = os.path.join(os.path.dirname(__file__), 'data', 'image', 'dog2.jpeg') 8 | parser = GenericFileExtractor(store_path=True, compute_hash=True) 9 | output = parser.extract([my_file]) 10 | expected = { 11 | 'mime_type': 'image/jpeg', 12 | 'length': 269360, 13 | 'filename': 'dog2.jpeg', 14 | 'path': my_file, 15 | 'data_type': 'JPEG image data, JFIF standard 1.01, resolution (DPI), ' 16 | 'density 300x300, segment length 16, Exif Standard: [TIFF ' 17 | 'image data, little-endian, direntries=2, GPS-Data], ' 18 | 'baseline, precision 8, 1910x1000, frames 3', 19 | 'sha512': '1f47ed450ad23e92caf1a0e5307e2af9b13edcd7735ac9685c9f21c' 20 | '9faec62cb95892e890a73480b06189ed5b842d8b265c5e47cc6cf27' 21 | '9d281270211cff8f90'} 22 | 23 | # be defensive against data_type, which will only be present if the user has libmagic installed 24 | if 'data_type' not in output: 25 | del expected['data_type'] 26 | del 
expected['mime_type'] 27 | assert output == expected 28 | assert isinstance(parser.schema, dict) 29 | pytest.xfail("'data_type' was not present in the parser output, most likely because " 30 | "libmagic is not properly installed") 31 | 32 | for i in ['JPEG image data', 'density 300x300', 'TIFF image data', 33 | '1910x1000']: 34 | assert i in output['data_type'] 35 | del output['data_type'] 36 | del expected['data_type'] 37 | assert output == expected 38 | assert isinstance(parser.schema, dict) 39 | -------------------------------------------------------------------------------- /tests/test_base.py: -------------------------------------------------------------------------------- 1 | from scythe.base import BaseExtractor, BaseSingleFileExtractor 2 | from glob import glob 3 | import pytest 4 | import os 5 | 6 | 7 | class FakeParser(BaseExtractor): 8 | 9 | def extract(self, group, context=None): 10 | return {'group': list(group)} 11 | 12 | def implementors(self): 13 | return ['Logan Ward'] 14 | 15 | def version(self): 16 | return '0.0.0' 17 | 18 | 19 | class FakeSingleParser(BaseSingleFileExtractor): 20 | 21 | def _extract_file(self, path, context=None): 22 | return {'dirname': os.path.dirname(path)} 23 | 24 | def implementors(self): 25 | return ['Logan Ward'] 26 | 27 | def version(self): 28 | return '0.0.0' 29 | 30 | 31 | @pytest.fixture 32 | def directory(): 33 | return os.path.dirname(__file__) 34 | 35 | 36 | @pytest.fixture 37 | def parser(): 38 | return FakeParser() 39 | 40 | 41 | @pytest.fixture 42 | def my_files(directory): 43 | return [p for p in glob(os.path.join(directory, '**', '*'), recursive=True) 44 | if os.path.isfile(p)] 45 | 46 | 47 | def test_group(parser, directory, my_files): 48 | groups = set(parser.group(my_files)) 49 | assert groups == set(zip(my_files)) # Each file own group 50 | 51 | 52 | def test_parse_dir(caplog, parser, directory, my_files): 53 | assert len(list(parser.extract_directory(directory))) == len(my_files) 54 | 55 | 56 | def 
def test_single_file(directory):
    """A single-file extractor accepts a bare path or a 1-element list, not more."""
    extractor = FakeSingleParser()
    expected = {'dirname': directory}
    # A bare string is technically an incorrect input, but handled sensibly
    assert extractor.extract(__file__) == expected
    assert extractor.extract([__file__]) == expected
    # More than one file must raise for a single-file extractor
    with pytest.raises(ValueError):
        extractor.extract(['/fake/file.in', '/fake/file.out'])
def test_filename(extractor, test_files, mappings):
    """Regex-based mappings pull metadata out of the filename alone."""
    expected = [{
        'custom': {
            'ext': '.txt',
            'foo': 'foo:FOO'
        },
        'material': {
            'composition': 'He'
        }
    }, {
        'custom': {
            'ext': '.data'
        },
        'material': {
            'composition': 'Al'
        }
    }, {
        'custom': {
            'foo': 'foo:bar'
        },
        'material': {
            'composition': 'O2'
        }
    }]

    # Each test filename should yield its corresponding record
    for path, record in zip(test_files, expected):
        assert extractor.extract(path, context={"mapping": mappings[0]}) == record

    # Test failure modes
    # Extraction without a mapping is an error
    with pytest.raises(Exception):
        extractor.extract(test_files[0])
def test_tdb(extractor, test_files, fail_file):
    """Extract composition and phase lists from CALPHAD TDB files."""
    expected = [
        ("VaPbSNaTeSe", ['LIQUID', 'FCC_A1', 'HALITE', 'HEXAGONAL_A8',
                         'ORTHORHOMBIC_S', 'BCC_A2', 'NA2TE', 'NATE', 'NATE3',
                         'NA2SE', 'NASE', 'NASE2', 'NA2S', 'NAS', 'NAS2']),
        ("SiVaAu", ['LIQUID', 'BCC_A2', 'CBCC_A12', 'CUB_A13',
                    'DIAMOND_A4', 'FCC_A1', 'HCP_A3', 'HCP_ZN']),
        ("TeVaPb", ['LIQUID', 'PBTE', 'HEXAGONAL_A8', 'RHOMBOHEDRAL_A7']),
    ]
    for path, (composition, phases) in zip(test_files, expected):
        output = extractor.extract(path)
        # Element order and case vary, so compare compositions insensitively
        assert eqi(output["material"]["composition"], composition, string_insensitive=True)
        assert eqi(output["calphad"]["phases"], phases)

    # Test failure modes: a file without TDB content should raise
    with pytest.raises(Exception):
        extractor.extract(fail_file)
@pytest.fixture
def multi_vasp_dir(vasp_dir):
    """VASP directory with two calculations with different extensions

    Duplicates every file of the unpacked ``AlNi_static_LDA`` calculation
    with a ``.2`` suffix so grouping logic must separate two calculations.

    Args:
        vasp_dir (str): Directory holding the unpacked VASP calculation
    Returns:
        (str): The same directory, now containing two calculations
    """
    # Fix: collapsed a redundant nested os.path.join(os.path.join(...)) call;
    # os.path.join accepts any number of path components
    for f in glob(os.path.join(vasp_dir, 'AlNi_static_LDA', '*')):
        if os.path.isfile(f):
            copy(f, f + '.2')
    return str(vasp_dir)
class CrystalStructureExtractor(BaseSingleFileExtractor):
    """Extract information about a crystal structure from many types of files.

    Uses either ASE or Pymatgen on the back end"""

    def _extract_file(self, path, context=None):
        """Read a structure file and summarize its composition and lattice.

        Tries ASE first; if ASE cannot read the file or the result is not a
        fully periodic structure, falls back to Pymatgen.

        Args:
            path (str): Path to the structure file
            context (dict): Ignored by this extractor
        Returns:
            (dict): ``material`` and ``crystal_structure`` summary blocks
        Raises:
            ValueError: If neither ASE nor Pymatgen can read the file
        """
        # First attempt: ASE, converting its Atoms into a Pymatgen Structure
        try:
            atoms = read(path)
            # Require data and full periodic boundary conditions
            if not atoms or not all(atoms.get_pbc()):
                raise ValueError("No valid data")
            structure = AseAtomsAdaptor.get_structure(atoms)
        except Exception:
            # Second attempt: read directly with Pymatgen
            try:
                structure = Structure.from_file(path)
            except Exception:
                raise ValueError('File not readable by pymatgen or ase: {}'.format(path))

        # Summarize the composition
        material = {"composition": structure.formula.replace(" ", "")}

        # Summarize the crystal structure
        crystal_structure = {
            "space_group_number": structure.get_space_group_info()[1],
            "number_of_atoms": float(structure.composition.num_atoms),
            "volume": float(structure.volume),
            "stoichiometry": structure.composition.anonymized_formula,
        }

        record = {}
        if material:
            record["material"] = material
        if crystal_structure:
            record["crystal_structure"] = crystal_structure
        return record

    def implementors(self):
        return ['Jonathon Gaff']

    def version(self):
        return '0.0.1'
pages detail all of the extractors currently available in Scythe. 5 | 6 | Quick Summary 7 | ~~~~~~~~~~~~~ 8 | 9 | The extractors that are configured to work with the stevedore plugin are: 10 | 11 | .. list-plugins:: scythe.extractor 12 | 13 | 14 | Detailed Listing 15 | ~~~~~~~~~~~~~~~~ 16 | 17 | Generic File Extractors 18 | ----------------------- 19 | 20 | Extractors that work for any kind of file 21 | 22 | .. automodule:: scythe.file 23 | :members: 24 | :exclude-members: implementors, schema, version, group 25 | 26 | Image Extractors 27 | ---------------- 28 | 29 | Extractors that read image data 30 | 31 | .. automodule:: scythe.image 32 | :members: 33 | :exclude-members: implementors, schema, version, group 34 | 35 | Electron Microscopy Extractors 36 | ------------------------------ 37 | 38 | Extractors that read electron microscopy data of various sorts (images, spectra, spectrum images, 39 | etc.) using the `HyperSpy `_ package. 40 | 41 | .. automodule:: scythe.electron_microscopy 42 | :members: 43 | :exclude-members: implementors, schema, version, group 44 | 45 | Atomistic Data Extractors 46 | ------------------------- 47 | 48 | Extractors related to data files that encode atom-level structure 49 | 50 | .. automodule:: scythe.crystal_structure 51 | :members: 52 | :exclude-members: implementors, schema, version, group 53 | 54 | .. automodule:: scythe.ase 55 | :members: 56 | :noindex: 57 | :exclude-members: implementors, schema, version, group 58 | 59 | Calculation Extractors 60 | ---------------------- 61 | 62 | Extractors that retrieve results from calculations 63 | 64 | .. automodule:: scythe.dft 65 | :members: 66 | :exclude-members: implementors, schema, version, group 67 | 68 | .. automodule:: scythe.ase 69 | :members: 70 | :noindex: 71 | :exclude-members: implementors, schema, version, group 72 | 73 | Structured Data Files 74 | --------------------- 75 | 76 | Extractors that read data from structured files 77 | 78 | .. 
def object_hook(dct):
    """Custom decoder for ASE JSON objects

    Does everything *except* reconstitute the JSON object and
    also converts numpy arrays to lists

    Adapted from ase.io.jsonio

    Args:
        dct (dict): Dictionary to reconstitute to an ASE object
    """
    if '__datetime__' in dct:
        # ISO-like timestamp with microseconds
        return datetime.datetime.strptime(dct['__datetime__'], '%Y-%m-%dT%H:%M:%S.%f')

    if '__complex__' in dct:
        real_imag = dct['__complex__']
        return complex(*real_imag)

    if '__ndarray__' in dct:
        return create_ndarray(*dct['__ndarray__'])

    # Legacy encoding, kept only for backwards compatibility
    if '__complex_ndarray__' in dct:
        real_part, imag_part = (np.array(part) for part in dct['__complex_ndarray__'])
        return real_part + imag_part * 1j

    # Not a special ASE marker: return the mapping unchanged
    return dct
    def implementors(self):
        """List the people who implemented this extractor."""
        # NOTE(review): the trailing space suggests a contact detail (e.g. an
        # email address in angle brackets) was stripped from this entry --
        # confirm against the repository history
        return ['Ben Blaiszik ']
class GenericFileExtractor(BaseSingleFileExtractor):
    """Gather basic file information"""

    def __init__(self, store_path=True, compute_hash=True):
        """
        Args:
            store_path (bool): Whether to record the path of the file
            compute_hash (bool): Whether to compute the hash of a file
        """
        super().__init__()
        self.store_path = store_path
        self.compute_hash = compute_hash

    def _extract_file(self, path, context=None):
        """Collect size, name, and optionally type, path and SHA-512 of a file."""
        record = {
            "length": os.path.getsize(path),
            "filename": os.path.basename(path),
        }

        # File-type detection is only available when libmagic imported cleanly
        if magic is not None:
            record["mime_type"] = magic.from_file(path, mime=True)
            record["data_type"] = magic.from_file(path)

        if self.store_path:
            record['path'] = path

        if self.compute_hash:
            digest = sha512()
            with open(path, 'rb') as stream:
                # Stream in 64 KiB chunks to bound memory use on large files
                for chunk in iter(lambda: stream.read(65536), b''):
                    digest.update(chunk)
            record['sha512'] = digest.hexdigest()

        return record

    def implementors(self):
        return ['Logan Ward']

    def version(self):
        return '0.0.1'

    @property
    def schema(self):
        """(dict) JSON schema describing this extractor's output"""
        schema_path = os.path.join(os.path.dirname(__file__), 'schemas', 'file.json')
        with open(schema_path) as fp:
            return json.load(fp)
def test_json(extractor, test_files, fail_file, mappings):
    """JSON extraction honors nested and dotted mappings, plus na_values."""
    with_missing = {
        "material": {
            "composition": "CN25"
        },
        "custom": {
            "foo": "value1",
            "bar": True,
            "missing": "na"
        }
    }
    without_missing = {
        "material": {
            "composition": "CN25"
        },
        "custom": {
            "foo": "value1",
            "bar": True
        }
    }

    # Both mapping styles give the same record when na_values is unset
    for mapping in mappings:
        assert extractor.extract(test_files[0], context={"mapping": mapping}) == with_missing

    # Values listed in na_values (list or scalar) are dropped from the record
    assert extractor.extract(test_files[0], context={"mapping": mappings[0],
                                                     "na_values": ["na"]}) == without_missing
    assert extractor.extract(test_files[0], context={"mapping": mappings[1],
                                                     "na_values": "na"}) == without_missing

    # Test failure modes
    with pytest.raises(Exception):
        extractor.extract(fail_file)
    # No mapping provided
    with pytest.raises(Exception):
        extractor.extract(test_files[0])
def test_yaml(extractor, test_files, fail_file, mappings):
    """YAML extraction honors nested and dotted mappings, plus na_values."""
    with_missing = {
        "material": {
            "composition": "CN25"
        },
        "custom": {
            "foo": "value1",
            "bar": True,
            "missing": "na"
        }
    }
    without_missing = {
        "material": {
            "composition": "CN25"
        },
        "custom": {
            "foo": "value1",
            "bar": True
        }
    }

    # Both mapping styles give the same record when na_values is unset
    for mapping in mappings:
        assert extractor.extract(test_files[0], context={"mapping": mapping}) == with_missing

    # Values listed in na_values (list or scalar) are dropped from the record
    assert extractor.extract(test_files[0], context={"mapping": mappings[0],
                                                     "na_values": ["na"]}) == without_missing
    assert extractor.extract(test_files[0], context={"mapping": mappings[1],
                                                     "na_values": "na"}) == without_missing

    # Test failure modes
    with pytest.raises(Exception):
        extractor.extract(fail_file)
    # No mapping provided
    with pytest.raises(Exception):
        extractor.extract(test_files[0])
def test_xml(extractor, test_files, fail_file, mappings):
    """XML extraction honors nested and dotted mappings, plus na_values."""
    with_missing = {
        "material": {
            "composition": "CN25"
        },
        "custom": {
            "foo": "value1",
            "bar": "baz",
            "missing": "na"
        }
    }
    without_missing = {
        "material": {
            "composition": "CN25"
        },
        "custom": {
            "foo": "value1",
            "bar": "baz"
        }
    }

    # Both mapping styles give the same record when na_values is unset
    for mapping in mappings:
        assert extractor.extract(test_files[0], context={"mapping": mapping}) == with_missing

    # Values listed in na_values (list or scalar) are dropped from the record
    assert extractor.extract(test_files[0], context={"mapping": mappings[0],
                                                     "na_values": ["na"]}) == without_missing
    assert extractor.extract(test_files[0], context={"mapping": mappings[1],
                                                     "na_values": "na"}) == without_missing

    # Test failure modes
    with pytest.raises(Exception):
        extractor.extract(fail_file)
    # No mapping provided
    with pytest.raises(Exception):
        extractor.extract(test_files[0])
is to minimize the amount of code duplication between scientific databases. 5 | Many databases rely on custom software to extract information from scientific files and transform that data into a standardized format. 6 | Automation or analysis software also require extracting information from files. 7 | While the data needs of application vary, they all rely on similar algorithms to extract information from the 8 | same types of files. 9 | *Scythe is designed to be a shared repository for these algorithms*. 10 | 11 | The core of Scythe is a collection of "extractors" which each generate simplified, standardized 12 | data from a certain class of files. For example, the 13 | :class:`~scythe.electron_microscopy.ElectronMicroscopyExtractor` produces structured data from 14 | file types specific to brands of electron microscopes. 15 | 16 | Each extractor does not necessarily generate data in a format needed by any tool. Rather, the extractors 17 | are designed to produce *all* of the information needed by all projects that utilize the 18 | libraries. In this way, the extractors can service every user without modification. 19 | 20 | What Does Scythe *Do*? 21 | --------------------------- 22 | 23 | Scythe is designed to provide the answer to two limited questions: 24 | 25 | 1. *Which files can I parse with a certain tool?* 26 | Scythe provides tools for quickly finding files of a certain type 27 | 28 | 2. *What information does a set of files contain?* 29 | Scythe provides a library of tools that transform data into a simpler formats 30 | 31 | What Does Scythe *Not Do*? 32 | ------------------------------- 33 | 34 | There are several questions that are specifically out-of-scope for Scythe: 35 | 36 | 1. *How do I get access to files that I want to parse?* 37 | Scythe does not solve the data transfer problem 38 | 2. 
def preprocess_paths(paths: Union[str, Path, List[str], List[Path]]) -> List[str]:
    """Transform paths to absolute paths

    Designed to be used to simplify grouping logic

    Args:
        paths (Union[str, Path, List[str], List[Path]]): Files and directories to be parsed
    Returns:
        (List[str]): List of paths in standardized form
    """

    # Promote a single str/Path argument to a one-element list
    single = isinstance(paths, (str, Path))
    entries = [paths] if single else paths

    # Expand '~' and resolve every entry to an absolute path
    return [os.path.abspath(os.path.expanduser(entry)) for entry in entries]
class CSVExtractor(BaseSingleFileExtractor):
    """Describe the contents of a comma-separated value (CSV) file

    The context dictionary for the CSV parser includes several fields:
        - ``schema``: Dictionary defining the schema for this dataset, following that of
            FrictionlessIO
        - ``na_values``: Any values that should be interpreted as missing
    """

    def __init__(self, return_records=True, **kwargs):
        """
        Args:
            return_records (bool): Whether to return each row in the CSV file
        Keyword:
            All kwargs are passed to TableSchema's ``infer`` method
        """
        # Fix: base-class initialization was previously skipped, which is
        # inconsistent with the other extractors (e.g., GenericFileExtractor)
        super().__init__()
        self.return_records = return_records
        self.infer_kwargs = kwargs

    def _extract_file(self, path: str, context=None):
        """Infer the schema of a CSV file and optionally cast its rows.

        Args:
            path (str): Path to the CSV file
            context (dict): Optional settings; supports ``schema`` (pre-defined
                Table Schema) and ``na_values`` (list of missing-value markers)
        Returns:
            (dict): The inferred ``schema`` and, if requested, the ``records``
        Raises:
            ValueError: If ``na_values`` is provided but is not a list
        """
        # Set the default value
        if context is None:
            context = dict()

        # Load in the table, honoring any user-supplied schema
        table = Table(path, schema=context.get('schema', None))

        # Infer the table's schema
        table.infer(**self.infer_kwargs)

        # Register user-defined missing-value markers with the schema
        if 'na_values' in context:
            if not isinstance(context['na_values'], list):
                raise ValueError('context["na_values"] must be a list')
            table.schema.descriptor['missingValues'] = sorted(set([''] + context['na_values']))
            table.schema.commit()

        # Store the schema
        output = {'schema': table.schema.descriptor}

        # If desired, store the data
        if self.return_records:
            headers = table.schema.headers
            records = []
            failed_records = 0
            for row in table.iter(keyed=False, cast=False):
                try:
                    row = table.schema.cast_row(row)
                except CastError:
                    # The raw (uncast) row is kept so no data are dropped
                    failed_records += 1

                # TODO (wardlt): Use json output from tableschema once it's supported
                #  https://github.com/frictionlessdata/tableschema-py/issues/213
                # NOTE(review): eval(repr(...)) round-trips the row through Python
                # literal syntax; flagged because eval on data is risky in general --
                # here the input was already parsed by tableschema, but confirm
                records.append(eval(repr(dict(zip(headers, row)))))
            if failed_records > 0:
                logger.warning(f'{failed_records} records failed casting with schema')
            output['records'] = records

        return output

    def implementors(self) -> List[str]:
        return ['Logan Ward']

    def citations(self) -> List[str]:
        return ["https://github.com/frictionlessdata/tableschema-py"]

    def version(self) -> str:
        return '0.0.1'
You can install them either module-by-module using the pip "extras" installation (e.g.,
`pip install "scythe-extractors[image]"`),
or install all extractors with
`pip install "scythe-extractors[all]"`.
class BaseAdapter:
    """Template for tools that transform metadata into a new form."""

    @abstractmethod
    def transform(self, metadata: dict, context: Union[None, dict] = None) -> Any:
        """Process metadata into a new form.

        Args:
            metadata (dict): Metadata to transform
            context (dict): Any context information used during transformation
        Returns:
            Metadata in a new form; may be any type of object.
        """

    def check_compatibility(self, parser: 'BaseExtractor') -> bool:
        """Evaluate whether this adapter is compatible with a certain parser.

        Args:
            parser (BaseExtractor): Parser to evaluate
        Returns:
            (bool) Whether this parser is compatible
        """
        pinned = self.version()
        if pinned is None:
            # Adapters that do not pin a version work with any parser version
            return True
        mine = tuple(int(part) for part in pinned.split('.'))
        theirs = tuple(int(part) for part in parser.version().split('.'))
        return mine == theirs

    def version(self) -> Union[None, str]:
        """Version of the parser that this adapter was created for.

        Returns:
            (str) Version of parser this adapter was designed for,
            or ``None`` if not applicable
        """
        return None
JSON""" 64 | 65 | def transform(self, metadata: dict, context=None) -> str: 66 | return json.dumps(metadata) 67 | 68 | 69 | class GreedySerializeAdapter(BaseAdapter): 70 | """Converts the metadata to a string by serializing with JSON, making some (hopefully) informed 71 | choices about what to do with various types commonly seen, and otherwise reporting that the 72 | data type could not be serialized. May not work in all situations, but should cover a large 73 | number of cases.""" 74 | @staticmethod 75 | def default(o): 76 | success = False 77 | if isinstance(o, np.void): 78 | return None 79 | elif isinstance(o, (np.ndarray, np.generic)): 80 | return o.tolist() 81 | elif isinstance(o, bytes): 82 | try: 83 | return o.decode() 84 | except UnicodeDecodeError: 85 | pass 86 | 87 | if not success: 88 | type_name = o.__class__.__name__ 89 | return f"<>" 90 | 91 | def transform(self, metadata: dict, context=None) -> str: 92 | s = json.dumps(metadata, default=GreedySerializeAdapter.default) 93 | return s 94 | -------------------------------------------------------------------------------- /scythe/dft.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Iterable, Tuple, List 2 | from scythe.utils.grouping import preprocess_paths, group_by_postfix 3 | from scythe.base import BaseExtractor 4 | from dfttopif import files_to_pif 5 | from operator import itemgetter 6 | import itertools 7 | import os 8 | 9 | 10 | # List of files that are known to the VASP parser 11 | _vasp_file_names = ["outcar", "incar", "chgcar", "wavecar", "wavcar", "oszicar", "ibzcar", 12 | "kpoints", "doscar", "poscar", "contcar", "vasp_run.xml", "xdatcar"] 13 | 14 | 15 | class DFTExtractor(BaseExtractor): 16 | """Extract metadata from Density Functional Theory calculation results 17 | 18 | Uses the `dfttopif `_ parser to extract metadata from each file 19 | """ 20 | 21 | def __init__(self, quality_report=False): 22 | """Initialize the extractor 
class DFTExtractor(BaseExtractor):
    """Extract metadata from Density Functional Theory calculation results.

    Uses the ``dfttopif`` parser to extract metadata from each file.
    """

    def __init__(self, quality_report=False):
        """Initialize the extractor

        Args:
            quality_report (bool): Whether to generate a quality report
        """
        self.quality_report = quality_report

    def group(self, files: Union[str, List[str]], directories: List[str] = None,
              context: dict = None):
        # Normalize paths; a set makes it cheap to remove already-claimed files
        remaining = set(preprocess_paths(files))

        # VASP grouping is reliable, so matched files are claimed exclusively
        for vasp_group in self._group_vasp(remaining):
            remaining.difference_update(vasp_group)
            yield vasp_group

        # PWSCF grouping is only a heuristic, so its files are not removed
        yield from self._group_pwscf(remaining)

    def _group_vasp(self, files: Iterable[str]) -> Iterable[Tuple[str, ...]]:
        """Find groupings of files associated with VASP calculations.

        Finds files whose names start with known VASP file names (case insensitive)
        and groups those that share the same postfix (e.g., "OUTCAR.1" and
        "INCAR.1" end up together).

        Args:
            files ([str]): List of files to be grouped
        Yields:
            ((files)): List of VASP files from the same calculation
        """
        yield from group_by_postfix(files, _vasp_file_names)

    def _group_pwscf(self, files: Iterable[str]) -> Iterable[Tuple[str, ...]]:
        """Assemble groups of files that are potentially PWSCF calculations.

        Args:
            files ([str]): List of files to be grouped
        Yields:
            ((str)): Groups of potential-pwscf files
        """
        # For now, files are simply grouped by their containing directory
        # TODO (lw): Find files that have PWSCF flags in them
        # TODO (lw): Read PWSCF input files to know the save directory
        keyed = sorted((os.path.dirname(f), f) for f in files)
        for _, members in itertools.groupby(keyed, key=itemgetter(0)):
            yield [pair[1] for pair in members]

    def extract(self, group: Iterable[str], context: dict = None):
        return files_to_pif(group, quality_report=self.quality_report).as_dictionary()

    def implementors(self):
        # NOTE(review): the trailing space suggests an email address was stripped
        # from this string upstream — confirm against the project history
        return ['Logan Ward ']

    def version(self):
        return '0.0.1'
in 1951 39 | ; 40 | _audit_update_record 41 | 'created by Girish Upreti, Portland State University' 42 | _cell_angle_alpha 90 43 | _cell_angle_beta 90 44 | _cell_angle_gamma 90 45 | _cell_length_a 7.399 46 | _cell_length_b 7.399 47 | _cell_length_c 6.496 48 | _cell_volume 355.625 49 | _cod_original_sg_symbol_H-M 'I 41/a m d' 50 | _cod_original_formula_sum 'Ce V O4' 51 | _cod_database_code 5910204 52 | loop_ 53 | _symmetry_equiv_pos_as_xyz 54 | x,y,z 55 | -y+1/4,x+3/4,z+1/4 56 | y+1/4,-x+1/4,z+3/4 57 | x,-y,-z 58 | -x,y+1/2,-z 59 | -x,-y+1/2,z 60 | y+1/4,x+3/4,-z+1/4 61 | -y+1/4,-x+1/4,-z+3/4 62 | -x,-y,-z 63 | y+3/4,-x+1/4,-z+3/4 64 | -y+3/4,x+3/4,-z+1/4 65 | -x,y,z 66 | x,-y+1/2,z 67 | x,y+1/2,-z 68 | -y+3/4,-x+1/4,z+3/4 69 | y+3/4,x+3/4,z+1/4 70 | x+1/2,y+1/2,z+1/2 71 | -y+3/4,x+1/4,z+3/4 72 | y+3/4,-x+3/4,z+1/4 73 | x+1/2,-y+1/2,-z+1/2 74 | -x+1/2,y,-z+1/2 75 | -x+1/2,-y,z+1/2 76 | y+3/4,x+1/4,-z+3/4 77 | -y+3/4,-x+3/4,-z+1/4 78 | -x+1/2,-y+1/2,-z+1/2 79 | y+1/4,-x+3/4,-z+1/4 80 | -y+1/4,x+1/4,-z+3/4 81 | -x+1/2,y+1/2,z+1/2 82 | x+1/2,-y,z+1/2 83 | x+1/2,y,-z+1/2 84 | -y+1/4,-x+3/4,z+1/4 85 | y+1/4,x+1/4,z+3/4 86 | loop_ 87 | _atom_site_fract_x 88 | _atom_site_fract_y 89 | _atom_site_fract_z 90 | _atom_site_label 91 | 0.00000 0.00000 0.00000 Ce1 92 | 0.00000 0.50000 0.25000 Ce2 93 | 0.50000 0.00000 0.75000 Ce3 94 | 0.50000 0.50000 0.50000 Ce4 95 | 0.00000 0.00000 0.50000 V1 96 | 0.00000 0.50000 0.75000 V2 97 | 0.50000 0.00000 0.25000 V3 98 | 0.50000 0.50000 0.00000 V4 99 | 0.00000 0.20000 0.34000 O1 100 | 0.00000 -0.20000 0.34000 O2 101 | 0.20000 0.00000 -0.34000 O3 102 | -0.20000 0.00000 -0.34000 O4 103 | 0.00000 0.70000 -0.09000 O5 104 | 0.00000 0.30000 -0.09000 O6 105 | -0.20000 0.50000 0.59000 O7 106 | -------------------------------------------------------------------------------- /tests/data/crystal_structure/Al2O3.cif: -------------------------------------------------------------------------------- 1 | 
#------------------------------------------------------------------------------ 2 | #$Date: 2017-10-13 02:32:00 +0300 (Fri, 13 Oct 2017) $ 3 | #$Revision: 201954 $ 4 | #$URL: file:///home/coder/svn-repositories/cod/cif/1/00/00/1000017.cif $ 5 | #------------------------------------------------------------------------------ 6 | # 7 | # This file is available in the Crystallography Open Database (COD), 8 | # http://www.crystallography.net/ 9 | # 10 | # All data on this site have been placed in the public domain by the 11 | # contributors. 12 | # 13 | data_1000017 14 | loop_ 15 | _publ_author_name 16 | 'Tsirelson, V G' 17 | 'Antipin, M Y' 18 | 'Gerr, R G' 19 | 'Ozerov, R P' 20 | 'Struchkov, Y T' 21 | _publ_section_title 22 | ; 23 | Ruby structure peculiarities derived from X-ray data. Localization of 24 | chromium atoms and electron deformation density 25 | ; 26 | _journal_coden_ASTM PSSABA 27 | _journal_name_full 28 | ; 29 | Physica Status Solidi, Sectio A: Applied Research 30 | ; 31 | _journal_page_first 425 32 | _journal_page_last 433 33 | _journal_paper_doi 10.1002/pssa.2210870204 34 | _journal_volume 87 35 | _journal_year 1985 36 | _chemical_formula_structural 'Al2 O3' 37 | _chemical_formula_sum 'Al2 O3' 38 | _chemical_name_mineral Corundum 39 | _chemical_name_systematic 'Aluminium oxide' 40 | _space_group_IT_number 167 41 | _symmetry_cell_setting trigonal 42 | _symmetry_space_group_name_Hall '-R 3 2"c' 43 | _symmetry_space_group_name_H-M 'R -3 c :H' 44 | _audit_creation_date 102-05-16 45 | _cell_angle_alpha 90 46 | _cell_angle_beta 90 47 | _cell_angle_gamma 120 48 | _cell_formula_units_Z 6 49 | _cell_length_a 4.7606(5) 50 | _cell_length_b 4.7606(5) 51 | _cell_length_c 12.994(1) 52 | _cell_volume 255.0 53 | _refine_ls_R_factor_all 0.063 54 | _cod_original_sg_symbol_H-M 'R -3 c' 55 | _cod_database_code 1000017 56 | loop_ 57 | _symmetry_equiv_pos_as_xyz 58 | x,y,z 59 | -y,x-y,z 60 | y-x,-x,z 61 | -y,-x,1/2+z 62 | x,x-y,1/2+z 63 | y-x,y,1/2+z 64 | -x,-y,-z 65 | 
y,y-x,-z 66 | x-y,x,-z 67 | y,x,1/2-z 68 | -x,y-x,1/2-z 69 | x-y,-y,1/2-z 70 | 1/3+x,2/3+y,2/3+z 71 | 2/3+x,1/3+y,1/3+z 72 | 1/3-y,2/3+x-y,2/3+z 73 | 2/3-y,1/3+x-y,1/3+z 74 | 1/3-x+y,2/3-x,2/3+z 75 | 2/3-x+y,1/3-x,1/3+z 76 | 1/3-y,2/3-x,1/6+z 77 | 2/3-y,1/3-x,5/6+z 78 | 1/3+x,2/3+x-y,1/6+z 79 | 2/3+x,1/3+x-y,5/6+z 80 | 1/3-x+y,2/3+y,1/6+z 81 | 2/3-x+y,1/3+y,5/6+z 82 | 1/3-x,2/3-y,2/3-z 83 | 2/3-x,1/3-y,1/3-z 84 | 1/3+y,2/3-x+y,2/3-z 85 | 2/3+y,1/3-x+y,1/3-z 86 | 1/3+x-y,2/3+x,2/3-z 87 | 2/3+x-y,1/3+x,1/3-z 88 | 1/3+y,2/3+x,1/6-z 89 | 2/3+y,1/3+x,5/6-z 90 | 1/3-x,2/3-x+y,1/6-z 91 | 2/3-x,1/3-x+y,5/6-z 92 | 1/3+x-y,2/3-y,1/6-z 93 | 2/3+x-y,1/3-y,5/6-z 94 | loop_ 95 | _atom_site_label 96 | _atom_site_type_symbol 97 | _atom_site_symmetry_multiplicity 98 | _atom_site_Wyckoff_symbol 99 | _atom_site_fract_x 100 | _atom_site_fract_y 101 | _atom_site_fract_z 102 | _atom_site_occupancy 103 | _atom_site_attached_hydrogens 104 | _atom_site_calc_flag 105 | O1 O2- 18 e 0.69365(3) 0. 0.25 1. 0 d 106 | Al1 Al3+ 12 c 0. 0. 0.35217(1) 1. 
0 d 107 | loop_ 108 | _atom_type_symbol 109 | _atom_type_oxidation_number 110 | O2- -2.000 111 | Al3+ 3.000 112 | loop_ 113 | _cod_related_entry_id 114 | _cod_related_entry_database 115 | _cod_related_entry_code 116 | 1 ChemSpider 8164808 117 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "scythe-extractors" 3 | version = "0.1.1" 4 | description = "A library of tools that generate summaries of the data contained in scientific data files" 5 | authors = ["Materials Data Facility "] 6 | license = "Apache" 7 | readme = "README.md" 8 | repository = "https://github.com/materials-data-facility/scythe" 9 | 10 | packages = [ 11 | { include = "scythe" }, 12 | ] 13 | 14 | [tool.poetry.dependencies] 15 | python = ">=3.8.0,<3.11" 16 | mdf-toolbox = "^0.5.3" 17 | stevedore = "^3.5.0" 18 | pandas = "^1.4.2" 19 | llvmlite = "^0.38.0" 20 | numba = "^0.55" 21 | 22 | ase = { version = "~3.19", optional = true } 23 | pymatgen = { version = "^2022.3.24", optional = true } 24 | tableschema = { version = "^1,<2", optional = true } 25 | dfttopif = { version = "^1.1.0", optional = true } 26 | hyperspy = { version = "^1.4.1", optional = true } 27 | python-magic = { version = "^0.4.15", optional = true } 28 | Pillow = { version = "^9.0.1", optional = true } 29 | xmltodict = { version = "^0.12.0", optional = true } 30 | pycalphad = { version = "^0.10.0", optional = true } 31 | 32 | [tool.poetry.dev-dependencies] 33 | flake8 = "^3.9.2" # pinned due to incompatibility with flake8 v4 and sphinx 34 | pytest = "^7.1.1" 35 | coveralls = "^3.3.1" 36 | pytest-cov = "^3.0.0" 37 | tox = "^3.25.0" 38 | Sphinx = "^4.5.0" 39 | sphinx-rtd-theme = "^1.0.0" 40 | diff-cover = "^6.4.5" 41 | 42 | [tool.poetry.extras] 43 | ase = ['ase'] 44 | crystal_structure = ['pymatgen', 'ase'] 45 | csv = ['tableschema'] 46 | dft = ['dfttopif'] 47 | 
electron_microscopy = ['hyperspy'] 48 | file = ['python-magic'] 49 | image = ['Pillow'] 50 | tdb = ['pycalphad'] 51 | xml = ['xmltodict'] 52 | # to make it easy to add all extras, maintain the list below as the sum 53 | # of all the dependencies above 54 | all = ['ase', 55 | 'pymatgen', 56 | 'tableschema', 57 | 'dfttopif', 58 | 'hyperspy', 59 | 'python-magic', 60 | 'Pillow', 61 | 'xmltodict', 62 | 'pycalphad'] 63 | 64 | [tool.poetry.plugins] 65 | 66 | [tool.poetry.plugins."scythe.extractor"] 67 | "ase" = "scythe.ase:ASEExtractor" 68 | "crystal" = "scythe.crystal_structure:CrystalStructureExtractor" 69 | "csv" = "scythe.csv:CSVExtractor" 70 | "dft" = "scythe.dft:DFTExtractor" 71 | "em" = "scythe.electron_microscopy:ElectronMicroscopyExtractor" 72 | "filename" = "scythe.filename:FilenameExtractor" 73 | "generic" = "scythe.file:GenericFileExtractor" 74 | "image" = "scythe.image:ImageExtractor" 75 | "json" = "scythe.json:JSONExtractor" 76 | "noop" = "scythe.testing:NOOPExtractor" 77 | "tdb" = "scythe.tdb:TDBExtractor" 78 | "xml" = "scythe.xml:XMLExtractor" 79 | "yaml" = "scythe.yaml:YAMLExtractor" 80 | 81 | [tool.poetry.plugins."scythe.adapter"] 82 | "noop" = "scythe.adapters.base:NOOPAdapter" 83 | "serialize" = "scythe.adapters.base:SerializeAdapter" 84 | "greedy_serialize" = "scythe.adapters.base:GreedySerializeAdapter" 85 | 86 | [build-system] 87 | requires = ["poetry-core>=1.0.0"] 88 | build-backend = "poetry.core.masonry.api" 89 | 90 | [tool.pytest.ini_options] 91 | addopts = "--ignore=.venv --ignore=.tox --cov=scythe --cov-report html --cov-report term-missing --cov-report=xml" 92 | testpaths = ['tests'] 93 | 94 | [tool.coverage.run] 95 | omit = [".tox/*"] 96 | dynamic_context = "test_function" 97 | 98 | [tool.coverage.report] 99 | show_missing = true 100 | 101 | [tool.coverage.html] 102 | directory = "coverage_html" 103 | show_contexts = true 104 | 105 | [tool.tox] 106 | legacy_tox_ini = """ 107 | [tox] 108 | isolated_build = true 109 | envlist = py3{8,9,10} 110 
def test_run_all_parsers():
    """Integration test of ``run_all_extractors_on_directory``.

    Exercises the default run, adapter selection (default and per-extractor map),
    extractor include/exclude filtering, and the error paths for bad arguments.
    Depends on the extractor/adapter plugins registered in the environment.
    """
    path = os.path.join(cwd, 'data', 'image')
    # Every result is a 3-tuple: (file group, extractor name, metadata dict)
    output = list(run_all_extractors_on_directory(path))
    assert len(output) > 0
    assert len(output[0]) == 3
    assert isinstance(output[0][0], tuple)
    assert isinstance(output[0][1], str)
    assert isinstance(output[0][2], dict)

    # Re-run parsers with adapters: 'noop' must not change anything, and
    # 'serialize' output must deserialize back to the unadapted metadata
    output_noop = list(run_all_extractors_on_directory(path, default_adapter='noop'))
    assert output == output_noop
    output_json = list(run_all_extractors_on_directory(path, default_adapter='serialize'))
    assert output == [ExtractResult(x.group, x.extractor, json.loads(x.metadata)) for x in output_json]

    # Test the matching: adapter_map applies per-extractor, falling back to default_adapter
    output_matching = list(run_all_extractors_on_directory(path, adapter_map={'file': 'serialize'}))
    assert all(isinstance(x.metadata, str if x.extractor == 'file' else dict)
               for x in output_matching)
    output_matching = list(run_all_extractors_on_directory(path, adapter_map={'file': 'noop'},
                                                           default_adapter='serialize'))
    assert all(isinstance(x.metadata, str if x.extractor != 'file' else dict)
               for x in output_matching)

    # This matching test fails if we have other packages with adapters on the system,
    # so only run it when exactly the built-in adapters are installed
    adapters = set(get_available_adapters().keys())
    if adapters == {'noop', 'serialize'}:
        output_matching = list(run_all_extractors_on_directory(path, adapter_map='match',
                                                               default_adapter='serialize'))
        assert all(isinstance(x.metadata, str if x.extractor != 'noop' else dict)
                   for x in output_matching)

    # Test the error case: only 'match' is a valid string for adapter_map
    with pytest.raises(ValueError):
        list(run_all_extractors_on_directory(path, adapter_map='matching',
                                             default_adapter='serialize'))

    # Test specifying parsers: include/exclude lists, which are mutually exclusive
    assert set([x.extractor for x in output]).issuperset(['image', 'generic'])
    output_limit = list(run_all_extractors_on_directory(path, exclude_extractors=['image']))
    assert 'image' not in [x.extractor for x in output_limit]
    output_limit = list(run_all_extractors_on_directory(path, include_extractors=['image']))
    assert set([x.extractor for x in output_limit]) == {'image'}
    with pytest.raises(ValueError):
        list(run_all_extractors_on_directory(path, include_extractors=['image'],
                                             exclude_extractors=['image']))
    with pytest.raises(ValueError):
        list(run_all_extractors_on_directory(path, include_extractors=['totally-not-a-parser']))
def test_set_nested_dict():
    """``set_nested_dict_value`` should create, keep, or override nested keys as requested."""
    baseline = {
        'key1': 'val1',
        'key2': {
            'key2.1': 'val2.1',
            'key2.2': 'val2.2'}
    }
    target = {
        'key1': 'val1',
        'key2': {
            'key2.1': 'val2.1',
            'key2.2': 'val2.2'}
    }

    # Writing None is a no-op
    set_nested_dict_value(target, ('key3', 'key3.1'), None)
    assert baseline == target

    # A real value creates the nested path
    set_nested_dict_value(target, ('key3', 'key3.1'), 4)
    expected = {
        'key1': 'val1',
        'key2': {
            'key2.1': 'val2.1',
            'key2.2': 'val2.2'},
        'key3': {'key3.1': 4}
    }
    assert target == expected

    # Without override, an existing value is preserved
    set_nested_dict_value(target, ('key3', 'key3.1'), 5, override=False)
    assert target == expected

    # With override, the existing value is replaced
    set_nested_dict_value(target, ('key3', 'key3.1'), 5, override=True)
    expected['key3']['key3.1'] = 5
    assert target == expected
11 | 12 | 13 | FUNCTION GHSERPB 298.15 -7650.09+101.7*T-24.5242*T*LN(T) 14 | -0.00365895*T**2-2.4395E-007*T**3; 600.61 Y 15 | -10531.1+154.243*T-32.4914*T*LN(T) 16 | +0.00154613*T**2+8.05448E+025*T**(-9); 1200 Y 17 | 4157.62+53.1391*T-18.9641*T*LN(T) 18 | -0.00288294*T**2+9.8144E-008*T**3-2.69676E+006*T**(-1)+8.05448E+025*T**(-9); 2100 N ! 19 | FUNCTION GHSERTE 2.98150E+02 -10544.679+183.372894*T-35.6687*T*LN(T) 20 | +.01583435*T**2-5.240417E-06*T**3+155015*T**(-1); 7.22660E+02 Y 21 | +9160.595-129.265373*T+13.004*T*LN(T)-.0362361*T**2+5.006367E-06*T**3 22 | -1286810*T**(-1); 1.15000E+03 Y 23 | -12781.349+174.901226*T-32.5596*T*LN(T); 1.60000E+03 N ! 24 | FUNCTION GLIQTE 2.98150E+02 -17554.731+685.877639*T 25 | -126.318*T*LN(T)+.2219435*T**2-9.42075E-05*T**3+827930*T**(-1); 26 | 6.26490E+02 Y 27 | -3165763.48+46756.357*T-7196.41*T*LN(T)+7.09775*T**2-.00130692833*T**3 28 | +2.58051E+08*T**(-1); 7.22660E+02 Y 29 | +180326.959-1500.57909*T+202.743*T*LN(T)-.142016*T**2+1.6129733E-05*T**3 30 | -24238450*T**(-1); 1.15000E+03 Y 31 | +6328.687+148.708299*T-32.5596*T*LN(T); 1.60000E+03 N REF0 ! 32 | FUNCTION GLIQPB 298.15 -2977.96+93.9496*T-24.5242*T*LN(T) 33 | -0.00365895*T**2-2.4395E-007*T**3-6.019E-019*T**7; 600.61 Y 34 | -5677.96+146.176*T-32.4914*T*LN(T) 35 | +0.00154613*T**2; 1200 Y 36 | 9010.75+45.0719*T-18.9641*T*LN(T) 37 | -0.00288294*T**2+9.8144E-008*T**3-2.69676E+006*T**(-1); 2100 N ! 38 | 39 | 40 | PHASE LIQUID % 1 1 ! 41 | CONSTITUENT LIQUID :PB,TE,PBTE_L : ! 42 | PARAMETER G(LIQUID,PB;0) 298.15 -2977.96+93.9496*T-24.5242*T*LN(T) 43 | -0.00365895*T**2-2.4395E-007*T**3-6.019E-019*T**7; 600.61 Y 44 | -5677.96+146.176*T-32.4914*T*LN(T) 45 | +0.00154613*T**2; 1200 Y 46 | 9010.75+45.0719*T-18.9641*T*LN(T) 47 | -0.00288294*T**2+9.8144E-008*T**3-2.69676E+006*T**(-1); 2100 N ! 
48 | PARAMETER G(LIQUID,TE;0) 2.98150E+02 -17554.731+685.877639*T 49 | -126.318*T*LN(T)+.2219435*T**2-9.42075E-05*T**3+827930*T**(-1); 50 | 6.26490E+02 Y 51 | -3165763.48+46756.357*T-7196.41*T*LN(T)+7.09775*T**2-.00130692833*T**3 52 | +2.58051E+08*T**(-1); 7.22660E+02 Y 53 | +180326.959-1500.57909*T+202.743*T*LN(T)-.142016*T**2+1.6129733E-05*T**3 54 | -24238450*T**(-1); 1.15000E+03 Y 55 | +6328.687+148.708299*T-32.5596*T*LN(T); 1.60000E+03 N REF0 ! 56 | PARAMETER G(LIQUID,PBTE_L;0) 2.98150E+02 GLIQPB#+GLIQTE#-61700+18.9*T; 2.00000E+03 N REF0 ! 57 | PARAMETER G(LIQUID,PB,PBTE_L;0) 2.98150E+02 15965.83-3.8*T; 2.00000E+03 N REF0 ! 58 | PARAMETER G(LIQUID,PB,PBTE_L;1) 2.98150E+02 3681.91; 2.00000E+03 N REF0 ! 59 | PARAMETER G(LIQUID,TE,PBTE_L;0) 2.98150E+02 -6216.19+5.56*T; 2.00000E+03 N REF0 ! 60 | PARAMETER G(LIQUID,TE,PBTE_L;1) 2.98150E+02 1174.92; 2.00000E+03 N REF0 ! 61 | 62 | $OPTIMIZATION P1 -200000 -86518; -25000 N ! 63 | $OPTIMIZATION P2 -250 -25.657; 30 N ! 64 | $OPTIMIZATION P3 -200000 -85071; -25000 N ! 65 | $OPTIMIZATION P4 -250 -23; 50 N ! 66 | PHASE PBTE % 2 1 1 ! 67 | CONSTITUENT PBTE :PB,VA:TE,VA:! 68 | PARAMETER G(PBTE,PB:TE;0) 298.15 GHSERPB+GHSERTE-65055+5.87*T; 2000 N ! 69 | PARAMETER G(PBTE,PB:VA;0) 298.15 GHSERPB+174091.2; 2000 N ! 70 | PARAMETER G(PBTE,VA:TE;0) 298.15 GHSERTE+157960.355; 2000 N ! 71 | PARAMETER G(PBTE,PB,VA:TE;0) 298.15 -103462.5-5.2714*T; 2000 N ! 72 | PARAMETER G(PBTE,PB,VA:TE;1) 298.15 -12000+5.5*T; 2000 N ! 73 | PARAMETER G(PBTE,PB:VA,TE;0) 298.15 -84750.59-28.0930*T; 2000 N ! 74 | PARAMETER G(PBTE,PB:VA,TE;1) 298.15 -8000+4.5*T; 2000 N ! 75 | PARAMETER G(PBTE,PB,VA:TE,VA;0) 298.15 -62405.5-9.919*T; 2000 N ! 76 | 77 | PHASE HEXAGONAL_A8 % 1 1 ! 78 | CONSTITUENT HEXAGONAL_A8 :TE:! 
class BaseExtractor(ABC):
    """Abstract base class for a metadata extractor

    This class defines the interface for all extractors in Scythe. Each new extractor must
    implement the :meth:`extract`, :meth:`version`, and :meth:`implementors` functions. The
    :meth:`group` method should be overridden to generate smart groups of files (e.g.,
    associating the inputs and outputs of the same calculation). :meth:`citations` can be
    used if there are papers that should be cited if the extractor is used as part of a
    scientific publication.

    See the Scythe Contributor Guide for further details.
    """

    def identify_files(self, path: str, context: dict = None) -> \
            Iterator[Tuple[str]]:
        """Identify all groups of files likely to be compatible with this extractor

        Uses the :meth:`group` function to determine groups of files that should be
        extracted together.

        Args:
            path (str): Root of directory to group together
            context (dict): Context about the files
        Yields:
            ([str]) Groups of eligible files
        """

        # Walk through the directories
        for root, dirs, files in os.walk(path):
            # Generate the full paths
            dirs = [os.path.join(root, d) for d in dirs]
            files = [os.path.join(root, f) for f in files]

            # Get any groups from this directory
            for group in self.group(files, dirs, context):
                yield group

    def extract_directory(self, path: str, context: dict = None) -> \
            Iterator[Tuple[Tuple[str], dict]]:
        """Run extractor on all appropriate files in a directory

        Skips groups that throw exceptions while extracting, logging each
        failure at debug level.

        Args:
            path (str): Root of directory to extract metadata from
            context (dict): Context about the files
        Yields:
            ([str], dict): Tuple of the group identity and the metadata unit
        """

        for group in self.identify_files(path, context):
            try:
                metadata_unit = self.extract(group, context)
            except Exception:
                # Fix: failures were previously swallowed silently, which made
                # extraction problems impossible to diagnose. Best-effort behavior
                # is preserved, but the skipped group is now recorded.
                logger.debug('Failed to extract metadata from %s; skipping', group,
                             exc_info=True)
                continue
            else:
                yield group, metadata_unit

    @abstractmethod
    def extract(self, group: Iterable[str], context: dict = None) -> dict:
        """Extract metadata from a group of files

        A group of files is a set of 1 or more files that describe the same object
        and will be used together to create a single metadata record.

        Arguments:
            group ([str]): A list of one or more files that should be parsed together
            context (dict): Context about the files

        Returns:
            (dict): The parsed results, in JSON-serializable format.
        """

    def group(self, files: Union[str, List[str]], directories: List[str] = None,
              context: dict = None) -> Iterator[Tuple[str, ...]]:
        """Identify groups of files and directories that should be parsed together

        Will create groups using only the files and directories included as input.

        The groups of files are _all_ files that could be read by this extractor,
        which may include many false positives.

        Args:
            files (str or [str]): List of files to consider grouping
            directories ([str]): Any directories to consider group as well
            context (dict): Context about the files
        Yields:
            ((str)): Groups of files
        """

        # Make sure file paths are strings or Path-like objects
        files = preprocess_paths(files)

        # Default: Every file is in its own group
        for f in files:
            yield f,

    def citations(self) -> List[str]:
        """Citation(s) and reference(s) for this extractor

        Returns:
            ([str]): each element should be a string citation in BibTeX format
        """
        return []

    @abstractmethod
    def implementors(self) -> List[str]:
        """List of implementors of the extractor

        These people are the points-of-contact for addressing errors or modifying
        the extractor

        Returns:
            ([str]): List of implementors in the form "FirstName LastName <email@provider>"
        """

    @abstractmethod
    def version(self) -> str:
        """Return the version of the extractor

        Returns:
            (str): Version of the extractor
        """

    @property
    def schema(self) -> dict:
        """Schema for the output of the extractor"""
        return {
            "$schema": "http://json-schema.org/schema#"
        }
:meth:`_parse_file`""" 144 | 145 | @abstractmethod 146 | def _extract_file(self, path: str, context=None): 147 | """Generate the metadata for a single file 148 | 149 | Args: 150 | path (str): Path to the file 151 | context (dict): Optional context information about the file 152 | Returns: 153 | (dict): Metadata for the file 154 | """ 155 | 156 | def extract(self, group: Union[str, Sequence[str]], context=None): 157 | # Error catching: allows for single files to passed not as list 158 | if isinstance(group, str): 159 | return self._extract_file(group, context) 160 | 161 | # Assumes that the group must have exactly one file 162 | if len(group) > 1: 163 | raise ValueError('Extractor only takes a single file at a time') 164 | 165 | return self._extract_file(group[0], context) 166 | -------------------------------------------------------------------------------- /tests/data/tdb/test_AuSi.TDB: -------------------------------------------------------------------------------- 1 | $ Reference: "Phase stability in nanoscale material systems: extension from bulk phase diagrams", Bajaj, S. et al, 2 | $ Nanoscale 7, 2015, doi: 10.1039/C5NR01535A 3 | $ Database file written 2013- 8-24 4 | $ From database: PURE4 5 | ELEMENT /- ELECTRON_GAS 0.0000E+00 0.0000E+00 0.0000E+00! 6 | ELEMENT VA VACUUM 0.0000E+00 0.0000E+00 0.0000E+00! 7 | ELEMENT AU FCC_A1 1.9697E+02 6.0166E+03 4.7488E+01! 8 | ELEMENT SI DIAMOND_A4 2.8085E+01 3.2175E+03 1.8820E+01! 9 | 10 | 11 | FUNCTION GHSERAU 2.98150E+02 -6938.856+106.830098*T-22.75455*T*LN(T) 12 | -.00385924*T**2+3.79625E-07*T**3-25097*T**(-1); 9.29400E+02 Y 13 | -93586.481+1021.69543*T-155.706745*T*LN(T)+.08756015*T**2 14 | -1.1518713E-05*T**3+10637210*T**(-1); 1.33733E+03 Y 15 | +314067.829-2016.37825*T+263.252259*T*LN(T)-.118216828*T**2 16 | +8.923844E-06*T**3-67999832*T**(-1); 1.73580E+03 Y 17 | -12133.783+165.272524*T-30.9616*T*LN(T); 3.20000E+03 N ! 
18 | FUNCTION GHSERSI 2.98150E+02 -8162.609+137.236859*T-22.8317533*T*LN(T) 19 | -.001912904*T**2-3.552E-09*T**3+176667*T**(-1); 1.68700E+03 Y 20 | -9457.642+167.281367*T-27.196*T*LN(T)-4.20369E+30*T**(-9); 21 | 3.60000E+03 N ! 22 | FUNCTION UN_ASS 298.15 0; 300 N ! 23 | 24 | TYPE_DEFINITION % SEQ *! 25 | DEFINE_SYSTEM_DEFAULT ELEMENT 2 ! 26 | DEFAULT_COMMAND DEF_SYS_ELEMENT VA /- ! 27 | 28 | 29 | PHASE LIQUID:L % 1 1.0 ! 30 | CONSTITUENT LIQUID:L :AU,SI : ! 31 | 32 | PARAMETER G(LIQUID,AU;0) 2.98150E+02 +5613.144+97.444232*T 33 | -22.75455*T*LN(T)-.00385924*T**2+3.79625E-07*T**3-25097*T**(-1); 34 | 9.29400E+02 Y 35 | -81034.481+1012.30956*T-155.706745*T*LN(T)+.08756015*T**2 36 | -1.1518713E-05*T**3+10637210*T**(-1); 1.33733E+03 Y 37 | +326619.829-2025.76412*T+263.252259*T*LN(T)-.118216828*T**2 38 | +8.923844E-06*T**3-67999832*T**(-1); 1.73580E+03 Y 39 | +418.217+155.886658*T-30.9616*T*LN(T); 3.20000E+03 N REF1 ! 40 | PARAMETER G(LIQUID,SI;0) 2.98150E+02 +42533.751+107.13742*T 41 | -22.8317533*T*LN(T)-.001912904*T**2-3.552E-09*T**3+176667*T**(-1) 42 | +2.09307E-21*T**7; 1.68700E+03 Y 43 | +40370.523+137.722298*T-27.196*T*LN(T); 3.60000E+03 N REF1 ! 44 | PARAMETER G(LIQUID,AU,SI;0) 2.98150E+02 -24103.3028-15.13883*T; 45 | 6.00000E+03 N REF0 ! 46 | PARAMETER G(LIQUID,AU,SI;1) 2.98150E+02 -29375.2777+1.1065*T; 47 | 6.00000E+03 N REF0 ! 48 | PARAMETER G(LIQUID,AU,SI;2) 2.98150E+02 -13032.2412; 6.00000E+03 N 49 | REF0 ! 50 | 51 | 52 | TYPE_DEFINITION & GES A_P_D BCC_A2 MAGNETIC -1.0 4.00000E-01 ! 53 | PHASE BCC_A2 %& 2 1 3 ! 54 | CONSTITUENT BCC_A2 :AU,SI : VA : ! 
55 | 56 | PARAMETER G(BCC_A2,AU:VA;0) 2.98150E+02 -2688.856+105.730098*T 57 | -22.75455*T*LN(T)-.00385924*T**2+3.79625E-07*T**3-25097*T**(-1); 58 | 9.29400E+02 Y 59 | -89336.481+1020.59543*T-155.706745*T*LN(T)+.08756015*T**2 60 | -1.1518713E-05*T**3+10637210*T**(-1); 1.33733E+03 Y 61 | +318317.829-2017.47825*T+263.252259*T*LN(T)-.118216828*T**2 62 | +8.923844E-06*T**3-67999832*T**(-1); 1.73580E+03 Y 63 | -7883.783+164.172524*T-30.9616*T*LN(T); 3.20000E+03 N REF1 ! 64 | PARAMETER G(BCC_A2,SI:VA;0) 2.98150E+02 +38837.391+114.736859*T 65 | -22.8317533*T*LN(T)-.001912904*T**2-3.552E-09*T**3+176667*T**(-1); 66 | 1.68700E+03 Y 67 | +37542.358+144.781367*T-27.196*T*LN(T)-4.20369E+30*T**(-9); 3.60000E+03 68 | N REF1 ! 69 | 70 | 71 | TYPE_DEFINITION ' GES A_P_D CBCC_A12 MAGNETIC -3.0 2.80000E-01 ! 72 | PHASE CBCC_A12 %' 2 1 1 ! 73 | CONSTITUENT CBCC_A12 :SI : VA : ! 74 | 75 | PARAMETER G(CBCC_A12,SI:VA;0) 2.98150E+02 +42045.391+116.859859*T 76 | -22.8317533*T*LN(T)-.001912904*T**2-3.552E-09*T**3+176667*T**(-1); 77 | 1.68700E+03 Y 78 | +40750.358+146.904367*T-27.196*T*LN(T)-4.20369E+30*T**(-9); 3.60000E+03 79 | N REF1 ! 80 | 81 | 82 | PHASE CUB_A13 % 2 1 1 ! 83 | CONSTITUENT CUB_A13 :SI : VA : ! 84 | 85 | PARAMETER G(CUB_A13,SI:VA;0) 2.98150E+02 +39116.391+116.859859*T 86 | -22.8317533*T*LN(T)-.001912904*T**2-3.552E-09*T**3+176667*T**(-1); 87 | 1.68700E+03 Y 88 | +37821.358+146.904367*T-27.196*T*LN(T)-4.20369E+30*T**(-9); 3.60000E+03 89 | N REF1 ! 90 | 91 | 92 | PHASE DIAMOND_A4 % 1 1.0 ! 93 | CONSTITUENT DIAMOND_A4 :AU,SI : ! 94 | 95 | PARAMETER G(DIAMOND_A4,AU;0) 2.98150E+02 +GHSERAU#+12552+20.61589*T; 96 | 6.00000E+03 N REF0 ! 97 | PARAMETER G(DIAMOND_A4,SI;0) 2.98150E+02 +GHSERSI#; 3.60000E+03 N 98 | REF1 ! 99 | PARAMETER G(DIAMOND_A4,AU,SI;0) 2.98150E+02 40000; 6.00000E+03 N 100 | REF0 ! 101 | 102 | 103 | TYPE_DEFINITION ( GES A_P_D FCC_A1 MAGNETIC -3.0 2.80000E-01 ! 104 | PHASE FCC_A1 %( 2 1 1 ! 105 | CONSTITUENT FCC_A1 :AU,SI : VA : ! 
106 | 107 | PARAMETER G(FCC_A1,AU:VA;0) 2.98150E+02 +GHSERAU#; 3.20000E+03 N REF1 ! 108 | PARAMETER G(FCC_A1,SI:VA;0) 2.98150E+02 +42837.391+115.436859*T 109 | -22.8317533*T*LN(T)-.001912904*T**2-3.552E-09*T**3+176667*T**(-1); 110 | 1.68700E+03 Y 111 | +41542.358+145.481367*T-27.196*T*LN(T)-4.20369E+30*T**(-9); 3.60000E+03 112 | N REF1 ! 113 | PARAMETER G(FCC_A1,AU,SI:VA;0) 2.98150E+02 2000; 6.00000E+03 N REF0 ! 114 | 115 | 116 | TYPE_DEFINITION ) GES A_P_D HCP_A3 MAGNETIC -3.0 2.80000E-01 ! 117 | PHASE HCP_A3 %) 2 1 .5 ! 118 | CONSTITUENT HCP_A3 :AU,SI : VA : ! 119 | 120 | PARAMETER G(HCP_A3,AU:VA;0) 2.98150E+02 -6698.106+108.430098*T 121 | -22.75455*T*LN(T)-.00385924*T**2+3.79625E-07*T**3-25097*T**(-1); 122 | 9.29400E+02 Y 123 | -93345.731+1023.29543*T-155.706745*T*LN(T)+.08756015*T**2 124 | -1.1518713E-05*T**3+10637210*T**(-1); 1.33733E+03 Y 125 | +314308.579-2014.77825*T+263.252259*T*LN(T)-.118216828*T**2 126 | +8.923844E-06*T**3-67999832*T**(-1); 1.73580E+03 Y 127 | -11893.033+166.872524*T-30.9616*T*LN(T); 3.20000E+03 N REF1 ! 128 | PARAMETER G(HCP_A3,SI:VA;0) 2.98150E+02 +41037.391+116.436859*T 129 | -22.8317533*T*LN(T)-.001912904*T**2-3.552E-09*T**3+176667*T**(-1); 130 | 1.68700E+03 Y 131 | +39742.358+146.481367*T-27.196*T*LN(T)-4.20369E+30*T**(-9); 3.60000E+03 132 | N REF1 ! 133 | 134 | 135 | PHASE HCP_ZN % 2 1 .5 ! 136 | CONSTITUENT HCP_ZN :SI : VA : ! 137 | 138 | PARAMETER G(HCP_ZN,SI:VA;0) 2.98150E+02 +41038.391+116.436859*T 139 | -22.8317533*T*LN(T)-.001912904*T**2-3.552E-09*T**3+176667*T**(-1); 140 | 1.68700E+03 Y 141 | +39743.358+146.481367*T-27.196*T*LN(T)-4.20369E+30*T**(-9); 3.60000E+03 142 | N REF1 ! 143 | 144 | LIST_OF_REFERENCES 145 | NUMBER SOURCE 146 | REF1 'PURE4 - SGTE Pure Elements (Unary) Database (Version 4.6), 147 | developed by SGTE (Scientific Group Thermodata Europe), 1991-2008, 148 | and provided by TCSAB (Jan. 2008). ' 149 | ! 
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

from scythe import __version__


# -- Project information -----------------------------------------------------

project = 'Scythe'
copyright = '2019 - 2022, Materials Data Facility Team, Citrine Informatics'
author = 'Materials Data Facility Team, Citrine Informatics'

# The short X.Y version
version = __version__
# The full version, including alpha/beta/rc tags
release = version


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Sphinx extension module names, as strings. They can be extensions coming
# with Sphinx (named 'sphinx.ext.*') or third-party ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.intersphinx',
    'sphinx.ext.todo',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'stevedore.sphinxext',
    'sphinx.ext.autosectionlabel',
    'sphinx.ext.githubpages'
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx.
# NOTE: Sphinx expects a lowercase language code such as 'en'; the previous
# value 'EN-US' is not a recognized code and causes warnings on newer Sphinx.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific; see each theme's documentation.
#
# html_theme_options = {}

# Paths that contain custom static files (such as style sheets), relative to
# this directory. They are copied after the builtin static files, so a file
# named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'Scythedoc'


# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    # 'preamble': '',

    # Latex figure (float) alignment
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'Scythe.tex', 'Scythe Documentation',
     'Materials Data Facility Team, Citrine Informatics', 'manual'),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
# NOTE: the man page name was updated from the stale 'materialsio' project
# name so the generated page matches the current package name.
man_pages = [
    (master_doc, 'scythe', 'Scythe Documentation',
     [author], 1)
]


# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'Scythe', 'Scythe Documentation',
     author, 'Scythe', 'A library of extractors for scientific data files.',
     'Miscellaneous'),
]


# -- Options for Epub output -------------------------------------------------

# Bibliographic Dublin Core info.
epub_title = project

# The unique identifier of the text. This can be an ISBN number
# or the project homepage.
#
# epub_identifier = ''

# A unique identification for the text.
#
# epub_uid = ''

# A list of files that should not be packed into the epub file.
epub_exclude_files = ['search.html']


# -- Extension configuration -------------------------------------------------

autoclass_content = "both"

# -- Options for intersphinx extension ---------------------------------------

# Refer to the Python standard library for cross-references.
intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}

# -- Options for todo extension ----------------------------------------------

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True
from typing import Dict, Union, Tuple, Any, Callable, Optional, List, TypedDict
import logging

logger = logging.getLogger(__name__)


def get_nested_dict_value_by_path(nest_dict: Dict,
                                  path: Union[Tuple, str],
                                  cast: Optional[Callable] = None) -> Any:
    """Get the value from within a nested dictionary structure by traversing
    into the dictionary as deep as that path found and returning that value

    Args:
        nest_dict: A dictionary of dictionaries that is to be queried
        path: A string or tuple that specifies the subsequent keys needed to
            get to a value within `nest_dict`. If a string, the value will
            return just from the first level (mostly for convenience)
        cast: A function that (if provided) will be applied to the value. This
            helps with serialization. If it raises an error, the value will
            be returned as is without conversion

    Returns:
        The value at the path within the nested dictionary; if there's no
        value there, return ``None``
    """
    sub_dict = nest_dict

    # A bare string is treated as a one-element path for convenience
    if isinstance(path, str):
        path = (path,)

    for key in path:
        try:
            sub_dict = sub_dict[key]
        except KeyError:
            return None

    # Coerce empty values ({}, [], '', None) to None so callers can treat
    # "missing" and "empty" uniformly
    if sub_dict is None or sub_dict in ({}, [], ''):
        return None

    if cast is not None:
        # noinspection PyBroadException
        try:
            return cast(sub_dict)
        except Exception as e:
            logger.warning(f"Exception encountered when casting value using {cast}: {e}; returning "
                           f"value as is without casting")
            return sub_dict
    else:
        return sub_dict


def set_nested_dict_value(nest_dict: Dict, path: Tuple,
                          value: Any, override: Optional[bool] = False, ):
    """Set a value within a nested dictionary structure by traversing into
    the dictionary as deep as that path found and changing it to ``value``.
    If ``value`` is ``None``, immediately return without performing an action
    Cribbed from https://stackoverflow.com/a/13688108/1435788

    Args:
        nest_dict: A dictionary of dictionaries that is to be queried
        path: A tuple (or other iterable type) that specifies the subsequent
            keys needed to get to a value within `nest_dict`
        value: The value which will be given to the path in the nested
            dictionary
        override: If the value is already present, this flag controls whether
            to override its existing value
    """
    if value is None:
        return
    prior = get_nested_dict_value_by_path(nest_dict, path)
    # Create intermediate dictionaries as needed while walking down the path
    for key in path[:-1]:
        nest_dict = nest_dict.setdefault(key, {})
    # Only set the value if nothing was there before, or we chose to override
    if prior is None or override:
        nest_dict[path[-1]] = value


def set_nested_dict_value_with_units(nest_dict: Dict, path: Tuple,
                                     value: Any, units: Optional[str] = None,
                                     override: bool = False,
                                     fn: Optional[Callable] = None):
    """Same as :func:`~scythe.utils.set_nested_dict_value`, but sets the
    value in the format of a dictionary with keys ``'value'`` and ``'units'``
    according to the specified units. If ``fn`` is supplied, it will be
    applied to the value prior to setting it.

    Args:
        nest_dict: A dictionary of dictionaries that is to be queried
        path: A tuple (or other iterable type) that specifies the subsequent
            keys needed to get to a value within ``nest_dict``.
        value: The value which will be given to the path in the nested
            dictionary
        units: If provided, will set the value at the given path to the
            provided units
        override: Whether to override a value if there is one already present
            at the path given
        fn: A callable function to apply to the value; can be used (for example)
            to convert a value from one unit to another, or any other purpose
    """
    if value is None:
        return
    if fn is not None:
        value = fn(value)
    to_set = {'value': value}
    if units is not None:
        to_set['units'] = units
    set_nested_dict_value(nest_dict, path, to_set, override)


# type definition for the mapping dictionaries
MappingElements = TypedDict('MappingElements',
                            {'source_dict': Dict,
                             'source_path': Union[str, Tuple[str, ...]],
                             'dest_dict': Dict,
                             'dest_path': Union[str, Tuple[str, ...]],
                             'cast_fn': Optional[Callable],
                             'units': Optional[Union[None, str]],
                             'conv_fn': Optional[Union[None, Callable]],
                             'override': Optional[bool]})
"""TypedDict: A TypedDict to specify the exact types expected when creating a
mapping dictionary to map metadata from one place to another.
"""


def map_dict_values(mapping: List[MappingElements]):
    """
    Helper method to apply map values from one dictionary into another.
    Inspired by the implementation in :func:`hyperspy.io.dict2signal`

    For each mapping we need a source dict and destination dict, then for
    each term, the source path, the destination path, the cast function,
    the units to set, and potentially a conversion function

    Args:
        mapping: should be a list of dicts, for example:
            [
                {'source_path': ('source', 'path',),
                 'dest_path': ('dest', 'path',),
                 'cast_fn': float,
                 'units': str,
                 'conv_fn': lambda x: x,
                 'override': bool}
            ]
    """
    for m in mapping:
        # All of these keys are documented as optional, so default each of
        # them (including 'override', which previously raised KeyError when
        # omitted) before use
        m.setdefault('cast_fn', None)
        m.setdefault('units', None)
        m.setdefault('conv_fn', None)
        m.setdefault('override', False)

        value = get_nested_dict_value_by_path(
            nest_dict=m['source_dict'],
            path=m['source_path'],
            cast=m['cast_fn'])
        set_nested_dict_value_with_units(
            nest_dict=m['dest_dict'], path=m['dest_path'], value=value,
            units=m['units'], fn=m['conv_fn'], override=m['override'])


def standardize_unit(u: str) -> str:
    """
    Helper method to convert typically seen unit representations into a
    standardized representation from QUDT
    (http://www.qudt.org/doc/DOC_VOCAB-UNITS.html). This is
    non-exhaustive, and may need to be updated as more types of units are
    encountered

    Args:
        u: The unit representation to convert

    Returns:
        The unit in a QUDT-standard representation (if known; otherwise just
        returns the unit representation as provided)
    """
    mapping = {
        # length
        'km': 'KiloM', 'cm': 'CentiM', 'm': 'M', 'mm': 'MilliM',
        'µm': 'MicroM', 'um': 'MicroM', 'nm': 'NanoM', 'pm': 'PicoM',
        'Å': 'ANGSTROM',
        # current
        'A': 'A', 'mA': 'MilliA', 'nA': 'NanoA', 'pA': 'PicoA',
        'µA': 'MicroA', 'uA': 'MicroA',
        # energy
        'eV': 'EV', 'GeV': 'GigaEV', 'keV': 'KiloEV', 'MeV': 'MegaEV',
        # mass
        'g': 'GM', 'kg': 'KiloGM',
        # potential
        'V': 'V', 'kV': 'KiloV', 'MV': 'MegaV', 'mV': 'MilliV',
        'uV': 'MicroV', 'µV': 'MicroV',
        # inverse lengths
        '1/nm': 'PER-NanoM', '1/mm': 'PER-MilliM', '1/m': 'PER-M',
        '1/cm': 'PER-CentiM', '1/um': 'PER-MicroM', '1/µm': 'PER-MicroM',
        '1/pm': 'PER-PicoM'
    }
    # Fall back to the input unchanged when the unit is unrecognized
    return mapping.get(u, u)
By default however, only a small subset of 16 | extractors will be installed (this is done so you do not need to install all the dependencies of 17 | extractors you may never use). To install additional extractors, you can specify "extras" at install time 18 | using the ``[...]`` syntax for ``pip``. For example, if you want to install all the extractors 19 | bundled with Scythe (and their dependencies), run:: 20 | 21 | pip install pip install scythe-extractors[all] 22 | 23 | This will pull in many more packages, but also enable as many extractors as possible. Check the list 24 | under ``[tool.poetry.extras]`` in ``pyproject.toml`` to see all the options you can specify in 25 | the brackets of the ``pip install`` command. 26 | 27 | 28 | Discovering an extractor 29 | ~~~~~~~~~~~~~~~~~~~~~~~~ 30 | 31 | Scythe uses `stevedore `_ to manage 32 | a collection of extractors, and has a utility function for listing available extractors:: 33 | 34 | from scythe.utils.interface import get_available_extractors 35 | print(get_available_extractors()) 36 | 37 | This snippet will print a dictionary of extractors installed on your system. Both extractors that are 38 | part of the Scythe base package and those defined by other packages will be included in this 39 | list. 40 | 41 | Simple Interface 42 | ~~~~~~~~~~~~~~~~ 43 | 44 | The methods in :mod:`scythe.utils.interface` are useful for most applications. As an 45 | example, we illustrate the use of :class:`scythe.file.GenericFileExtractor`, which is 46 | available through the ``'generic'`` extractor plugin:: 47 | 48 | from scythe.utils.interface import execute_extractor 49 | print(execute_extractor('generic', ['pyproject.toml'])) 50 | 51 | 52 | The above snippet creates the extractor object and runs it on a file named ``pyproject.toml``. 
Run 53 | in the root directory of the Scythe, it would produce output similar to the following, 54 | likely with a different ``sha512`` value if the contents of that file have changed since this 55 | documentation was written: 56 | 57 | .. code:: json 58 | 59 | [{ 60 | "data_type": "ASCII text", 61 | "filename": "pyproject.toml", 62 | "length": 2421, 63 | "mime_type": "text/plain", 64 | "path": "pyproject.toml", 65 | "sha512": "a7eb382c4a3e6cf469656453f9ff2e3c1ac2c02c9c2ba31c3d569a09883e2b2471801c39125dafb7c13bfcaf9cf6afbab92afa4c053c0c93a4c8c59acad1b85b" 66 | }] 67 | 68 | The other pre-built parsing function provides the ability to run all extractors on all files in a 69 | directory:: 70 | 71 | from scythe.utils.interface import run_all_extractors 72 | gen = run_all_extractors('.') 73 | for record in gen: 74 | print(record) 75 | 76 | A third route for using ``scythe`` is to employ the ``get_extractor`` operation to access a 77 | specific extractor, and then use its class interface (described below):: 78 | 79 | from scythe.utils.interface import get_extractor 80 | extractor = get_extractor('generic') 81 | gen = extractor.parse_directory('.') 82 | for record in gen: 83 | print(record) 84 | 85 | 86 | Advanced Usage: Adding Context 87 | ++++++++++++++++++++++++++++++ 88 | 89 | The function interface for Scythe supports using "context" and "adapters" to provide 90 | additional information Scythe into Applications <#id1>`_. Here, we describe the purpose 91 | of context and how to use it in our interface. 92 | 93 | Context is information about the data held in a file that is not contained within the file itself 94 | . Examples include human-friendly descriptions of columns names or which values actually 95 | represent a missing measurement in tabular data file (e.g., CSV files). 
A limited number of 96 | extractors support context and this information can be provided via the ``execute_extractor`` 97 | function:: 98 | 99 | execute_extractor('csv', 'tests/data/test.csv', context={'na_values': ['N/A']}) 100 | 101 | 102 | The types of context information used by an extractor, if any, is described in the 103 | `documentation for each extractor `_. 104 | 105 | The ``run_all_extractors_on_directory`` function has several options for providing context to the 106 | extractors. These options include specifying "global context" to be passed to every extractor or 107 | adapter and ways of limiting the metadata to specific extractors. See 108 | :meth:`scythe.utils.interface.run_all_extractors_on_directory` for further details on the 109 | syntax for this command. 110 | 111 | .. note:: 112 | 113 | *Context is still an experimental feature and APIs are subject to change* 114 | 115 | 116 | Class Interface 117 | ~~~~~~~~~~~~~~~ 118 | 119 | The class API of extractors provide access to more detailed features of individual extractors. The 120 | functionality of an extractor is broken into several simple operations. 121 | 122 | Initializing an extractor 123 | +++++++++++++++++++++++++ 124 | 125 | The first step to using an extractor is to initialize it. Most extractors do not have any options for 126 | the initializer, so you can create them with:: 127 | 128 | extractor = Extractor() 129 | 130 | Some extractors require configuration options that define how the extractor runs, such as the location 131 | of a non-Python executable. 132 | 133 | Parsing Method 134 | ++++++++++++++ 135 | 136 | The main operation for any extractor is the data extraction operation: ``parse``. 
137 | 138 | In most cases, the ``parse`` operation takes the path to a file and and returns a summary of the 139 | data the file holds:: 140 | 141 | metadata = extractor.parse(['/my/file']) 142 | 143 | Some extractors take multiple files that describe the same object (e.g., the input and output files 144 | of a simulation) and use them to generate a single metadata record:: 145 | 146 | metadata = extractor.parse(['/my/file.in', '/my/file.out']) 147 | 148 | The `grouping method <#grouping-files>`_ for these extractors provides logic to identify groups of 149 | related files. 150 | 151 | Some extractors also can use information that is not contained within the file themselves, which can 152 | be provided to the extractor as a "context":: 153 | 154 | metadata = extractor.parse(['/my/file1'], context={'headers': {'temp': 'temperature'}}) 155 | 156 | The documentation for the extractor should indicate valid types of context information. 157 | 158 | Grouping Files 159 | ++++++++++++++ 160 | 161 | Extractors also provide the ability to quickly find groups of associated files: ``group``. 162 | The ``group`` operation takes path or list of files and, optionally, directories and generates 163 | a list of files that should be treated together when parsing:: 164 | 165 | extractor.group(['input.file', 'output.file', 'unrelated']) # -> [('input.file', 'output.file'), ('unrelated',)] 166 | 167 | Parsing Entire Directories 168 | ++++++++++++++++++++++++++ 169 | 170 | ``scythe`` also provides a utility operation to parse all groups of valid files in a directory:: 171 | 172 | metadata = list(extractor.parse_directory('.')) 173 | 174 | ``parse_directory`` is a generator function, so we use ``list`` here to turn the output into a list 175 | format. 176 | 177 | Attribution Functions 178 | +++++++++++++++++++++ 179 | 180 | Two functions, ``citations`` and ``implementors``, are available to determine who contirbuted a 181 | extractor. 
``implementors`` returns the list of people who created an extractor, who are likely the 182 | points-of-contact for support. ``citations`` indicates if any publications are available that 183 | describe the underlying methods and should be reference in scientific articles. 184 | 185 | Full Extractor API 186 | ++++++++++++++++++ 187 | 188 | The full API for the extractors are described as a Python abstract class: 189 | 190 | .. autoclass:: scythe.base.BaseExtractor 191 | :members: 192 | :member-order: bysource 193 | 194 | Integrating Scythe into Applications 195 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 196 | 197 | Scythe is designed to create a documented, JSON-format version of scientific files, but 198 | these files might not yet be in a form useful for your application. We recommend an "adapter" 199 | approach to post-process these "generic JSON" files that can actually be used for your application. 200 | 201 | BaseAdapter 202 | +++++++++++ 203 | 204 | The ``BaseAdapter`` class defines the interface for all adapters. 205 | 206 | .. autoclass:: scythe.adapters.base.BaseAdapter 207 | :member-order: bysource 208 | :noindex: 209 | :members: 210 | 211 | Adapters must fulfill a single operation, ``transform``, which renders metadata from one of the 212 | Scythe extractors into a new form. There are no restrictions on the output for this function, 213 | except that ``None`` indicates that there is no valid transformation for an object. 214 | 215 | The ``check_compatibility`` and ``version`` method provide a route for marking which versions of 216 | an extractor are compatible with an adapter. ``scythe`` uses the version in utility operations 217 | to provide warnings to users about when an adapter is out-of-date. 218 | 219 | Using Adapters 220 | ++++++++++++++ 221 | 222 | The same utility operations `described above <#simple-interface>`_ support using adapters. 
The 223 | ``execute_extractor`` function has an argument, ``adapter``, that takes the name of the adapter as 224 | an input and causes the parsing operation to run the adapter after parsing. The 225 | ``run_all_extractors`` function also has arguments (e.g., ``adapter_map``) that associate each 226 | extractor with the adapter needed to run after parsing. 227 | 228 | As an example, we will demonstrate an adapter that comes packaged with Scythe: 229 | :class:`scythe.adapters.base.SerializeAdapter` 230 | The serialize adapter is registered using ``stevedore`` as the name "serialize". To use it after 231 | all extractors:: 232 | 233 | from scythe.utils.interface import run_all_extractors 234 | gen = run_all_extractors('.', default_adapter='serialize') 235 | 236 | Implementing Adapters 237 | +++++++++++++++++++++ 238 | 239 | Any new adapters must inherit from the ``BaseAdapter`` class defined above. You only need 240 | implement the ``transform`` operation. 241 | 242 | Once the adapter is implemented, you need to put it in a project that is installable via pip. See 243 | [python docs](https://docs.python.org/3.7/distutils/setupscript.html) for a detailed tutorial or 244 | copy the structure used by the 245 | `MDF's adapter library `_. 246 | 247 | Then, register the adapter with ``stevedore`` by adding it as an entry point in your project's 248 | ``setup.py`` or ``pyproject.toml`` file. See the 249 | `stevedore documentation for more detail `_. 250 | We recommend using the same name for a adapter as the extractor it is designed for so that 251 | ``scythe`` can auto-detect the adapters associated with each extractor. 
252 | 253 | Examples of Tools Using Scythe 254 | +++++++++++++++++++++++++++++++++++ 255 | 256 | Materials Data Facility: 257 | https://github.com/materials-data-facility/mdf-materialsio-adapters 258 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/source/contributor-guide.rst: -------------------------------------------------------------------------------- 1 | Contributor Guide 2 | ================= 3 | 4 | Setting up development environment 5 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | 7 | Scythe makes use of the `Poetry `_ project to manage 8 | dependencies and packaging. To install the latest version of Scythe, first install poetry 9 | following `their documentation `_. Once that's 10 | done, clone/download the Scythe repository locally from 11 | `Github `_. Change into that directory 12 | and run ``poetry install`` (it would be a good idea to create a new virtual environment for your 13 | project first too, so as to not mix dependencies with your system environment). 14 | 15 | By default, only a small subset of extractors will be installed (this is done so that you do not 16 | need to install all the dependencies of extractors you may never use). To install additional 17 | extractors, you can specify "extras" at install time using ``poetry``. 
Any of the values specified 18 | in the ``[tool.poetry.extras]`` section of ``pyproject.toml`` can be provided, including ``all``, 19 | which will install all bundled extractors and their dependencies. For example:: 20 | 21 | poetry install -E all 22 | 23 | Poetry will create a dedicated virtual environment for the project and the Scythe code will 24 | be installed in "editable" mode, so any changes you make to the code will be reflected when 25 | running tests, importing extractors, etc. It will use the default version of python available. 26 | Scythe is currently developed and tested against Python versions 3.8.12, 3.9.12, and 3.10.4. 27 | We recommend using the `pyenv `_ project to manage 28 | various python versions on your system if this does not match your system version of Python. It 29 | is required to use ``tox`` as well (see next paragraph). Make sure you install the versions 30 | specified in the ``.python-version`` file by running commands such as ``pyenv install 3.8.12`` etc. 31 | 32 | Additionally, the project uses `tox `_ to simplify common tasks and 33 | to be able to run tests in isolated environments. This will be installed automatically as a 34 | development package when running the ``poetry install`` command above. It can be used to run the 35 | test suite with common settings, as well as building the documentation. For example, to 36 | run the full Scythe test suite on all three versions of Python targeted, just run:: 37 | 38 | poetry run tox 39 | 40 | To build the HTML documentation (will be placed inside the ``./docs/_build/`` folder), run:: 41 | 42 | poetry run tox -e docs 43 | 44 | For the sake of speed, if you would like to focus your testing on just one Python version, you can 45 | temporarily override the environment list from ``pyproject.toml`` with an environment variable.
46 | For example, to only run the test/coverage suite on Python 3.8.X, run:: 47 | 48 | TOXENV=py38 poetry run tox 49 | 50 | Check out the ``[tool.tox]`` section of the ``pyproject.toml`` file to view how these tasks are 51 | configured, and the `tox documentation `_ on how to add your 52 | own custom tasks, if needed. 53 | 54 | Finally, Scythe uses ``flake8`` to enforce code styles, which will be run for you 55 | automatically when using ``tox`` as defined above. Any code-style errors, such as lines longer 56 | than 100 characters, trailing whitespace, etc. will be flagged when running ``poetry run tox``. 57 | 58 | The next part of the Scythe guide details how to add a new extractor to the ecosystem. 59 | 60 | Step 1: Implement the Extractor 61 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 62 | 63 | Creating a new extractor is accomplished by implementing the 64 | `BaseExtractor `_ abstract class. If you are new to Scythe, we 65 | recommend reviewing the `User Guide `_ first to learn about 66 | the available methods of BaseExtractor. Minimally, you need only implement the ``extract``, 67 | ``version``, and ``implementors`` operations for a new extractor. Each of these methods (and any 68 | other methods you override) must be stateless, so that running the operation does not change the 69 | behavior of the extractor. 70 | 71 | We also have subclasses of ``BaseExtractor`` that are useful for common types of extractors: 72 | 73 | - ``BaseSingleFileExtractor``: Extractors that only ever evaluate a single file at a time 74 | 75 | Class Attributes and Initializer 76 | -------------------------------- 77 | 78 | The ``BaseExtractor`` class supports configuration options as Python class attributes. 79 | These options are intended to define the behavior of an extractor for a particular environment 80 | (e.g., paths of required executables) or for a particular application (e.g., turning off unneeded 81 | features).
We recommend limiting these options to be only JSON-serializable data types and for 82 | all to be defined in the ``__init__`` function to simplify text-based configuration files. 83 | 84 | The initializer function should check if an extractor has access to all required external tools, and 85 | throw exceptions if not. For example, an extractor that relies on calling an external command-line 86 | tool should check whether the package is installed. In general, extractors should fail during 87 | initialization and not during the parsing operation if the system is misconfigured. 88 | 89 | Implementing ``extract`` 90 | ------------------------ 91 | 92 | The ``extract`` method contains the core logic of a Scythe extractor: rendering a summary of a 93 | group of data files. We do not specify any particular schema for the output but we do recommend 94 | best practices: 95 | 96 | 97 | #. *Summaries must be JSON-serializable.* 98 | Limiting to JSON data types ensures summaries are readable by most software without special 99 | libraries. JSON documents are also able to be documented easily. 100 | 101 | #. *Human-readability is desirable.* 102 | JSON summaries should be understandable to users without expert-level knowledge of the data. 103 | Avoid unfamiliar acronyms, such as names of variables in a specific simulation code or settings 104 | specific to a certain brand of instrument. 105 | 106 | #. *Adhere closely to the original format.* 107 | If feasible, try to stay close to the original data format of a file or the output of a library 108 | used for parsing. Deviating from already existing formats complicates modifications to an extractor. 109 | 110 | #. *Always return a dictionary.* 111 | If an extractor can return multiple records from a single file group, return the list as an element 112 | of the dictionary. Any metadata that pertains to each of the sub-records should be stored as 113 | a distinct element rather than being duplicated in each sub-record.
114 | 115 | 116 | We also have a few recommendations for the extractor behavior: 117 | 118 | #. *Avoid configuration options that change only output format.* 119 | Extractors can take configuration options that alter the output format, but configurations 120 | should be used sparingly. A good use of configuration would be to disable complex parsing 121 | operations if unneeded. A bad use of configuration would be to change the output to match a 122 | different schema. Operations that significantly alter the form but not the content of a 123 | summary should be implemented as adapters. 124 | 125 | #. *Consider whether context should be configuration.* 126 | Settings that are identical for each file could be better suited as configuration settings 127 | than as context. 128 | 129 | Implementing ``group`` 130 | ---------------------- 131 | 132 | The ``group`` operation finds all sets of files in a user-provided list of files and directories 133 | that should be parsed together. Implementing ``group`` is optional. Implementing a new ``group`` 134 | method is required only when the default behavior of "each file is its own group" (i.e., the 135 | extractor only treats files individually) is incorrect. 136 | 137 | The ``group`` operation should not require access to the content of the files or directories to 138 | determine groupings. Being able to determine file groups via only file names improves performance 139 | and allows for determining groups of parsable files without needing to download them from remote 140 | systems. 141 | 142 | Files are allowed to appear in more than one group, but we recommend generating only the largest 143 | valid group of files to minimize the same metadata being generated multiple times. 144 | 145 | It is important to note that file groups are specific to an extractor. Groupings of files that 146 | are meaningful to one extractor need not be meaningful to another.
For that reason, limit the 147 | definition of groups to sets of files that can be parsed together without consideration to what 148 | other information makes the files related (e.g., being in the same directory). 149 | 150 | Another appropriate use of the ``group`` operation is to filter out files which are very unlikely 151 | to parse correctly. For example, a PDF extractor could identify only files with a ".pdf" extension. 152 | However, we recommend using filtering sparingly to ensure no files are missed. 153 | 154 | Implementing ``citations`` and ``implementors`` 155 | ----------------------------------------------- 156 | 157 | The ``citations`` and ``implementors`` methods identify additional resources describing an extractor 158 | and provide credit to contributors. ``implementors`` is required, as this operation is also used 159 | to identify points-of-contact for support requests. 160 | 161 | ``citations`` should return a list of BibTeX-format references. 162 | 163 | ``implementors`` should return a list of people and, optionally, their contact information 164 | in the form: "FirstName LastName ". 165 | 166 | Implementing ``version`` 167 | ------------------------ 168 | 169 | We require using `semantic versioning `_ for specifying the version of extractors. 170 | As the API of the extractor should remain unchanged, use versioning to indicate changes in available 171 | options or the output schema. The ``version`` operation should return the version of the extractor. 172 | 173 | 174 | Step 2: Document the Extractor 175 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 176 | 177 | The docstring for an extractor must start with a short, one sentence summary of the extractor, which 178 | will be used by our autodocumentation tooling. The rest of the documentation should describe what 179 | types of files are compatible, what context information can be used, and 180 | summarize what types of metadata are generated. 181 | 182 | ..
todo:: Actually write these descriptors for the available extractors 183 | 184 | The Scythe project uses JSON documents as the output for all extractors and 185 | `JSON Schema `_ to describe the content of the documents. The 186 | BaseExtractor class includes a property, ``schema``, that stores a description of the output format. 187 | We recommend writing your description as a separate file and having the ``schema`` property read 188 | and output the contents of this file. See the 189 | `GenericFileExtractor source code `_ 190 | for a example. 191 | 192 | 193 | Step 3: Register the Extractor 194 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 195 | 196 | Preferred Route: Adding the Extractor to Scythe 197 | ----------------------------------------------- 198 | 199 | If your extractor has the same dependencies as existing extractors, add it to the existing module with 200 | the same dependencies. 201 | 202 | If your extractor has new dependencies, create a new module for your extractor in ``scythe``, and 203 | then add the requirements as a new key in the ``[tool.poetry.extras]`` section of ``pyproject 204 | .toml``, following the other extractor examples in that section. Next, add your extractor to 205 | ``docs/source/extractors.rst`` by adding an ``.. automodule::`` statement that refers to your new 206 | module (again, following the existing pattern). 207 | 208 | Scythe uses ``stevedore`` to simplify access to the extractors. After implementing and 209 | documenting the extractor, add it to the ``[tool.poetry.plugins."scythe.extractor"]`` section of the 210 | ``pyproject.toml`` file for Scythe. See 211 | `stevedore documentation for more information `_ 212 | (these docs reference ``setup.py``, but the equivalent can be done via plugins in ``pyproject 213 | .toml``; follow the existing structure if you're unsure, and ask for help from the developers if 214 | you run into issues). 
215 | 216 | 217 | Alternative Route: Including Extractors from Other Libraries 218 | ------------------------------------------------------------ 219 | 220 | If an extractor would be better suited as part of a different library, you can still register it as a 221 | extractor with Scythe by altering your ``pyproject.toml`` file. Add an entry point with the 222 | namespace ``"scythe.extractor"`` and point to the class object following the 223 | `stevedore documentation `_. 224 | Adding the entry point will let Scythe use your extractor if your library is installed in the 225 | same Python environment as Scythe. 226 | 227 | .. todo:: Provide a public listing of scythe-compatible software. 228 | 229 | So that people know where to find these external libraries 230 | -------------------------------------------------------------------------------- /tests/data/crystal_structure/C13H22O3.cif: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------ 2 | #$Date: 2015-01-07 18:25:02 +0200 (Wed, 07 Jan 2015) $ 3 | #$Revision: 129439 $ 4 | #$URL: file:///home/coder/svn-repositories/cod/cif/1/00/00/1000018.cif $ 5 | #------------------------------------------------------------------------------ 6 | # 7 | # This file is available in the Crystallography Open Database (COD), 8 | # http://www.crystallography.net/ 9 | # 10 | # All data on this site have been placed in the public domain by the 11 | # contributors. 
12 | # 13 | data_1000018 14 | loop_ 15 | _publ_author_name 16 | 'Mondal, Swastik' 17 | 'Mukherjee, Monika' 18 | 'Roy, Arnab' 19 | 'Mukherjee, Debabrata' 20 | 'Helliwell, Madeleine' 21 | _publ_section_title 22 | ; 23 | (1SR,2RS,5RS,6SR,8RS)-7,7-dimethyltricyclo[6.2.1.0^1,6^]undecane-2,5,6-triol: 24 | a supramolecular framework built from O-H...O hydrogen bonds 25 | ; 26 | _journal_coeditor_code SK1554 27 | _journal_issue 8 28 | _journal_name_full 'Acta Crystallographica, Section C' 29 | _journal_page_first o474 30 | _journal_page_last o476 31 | _journal_volume 58 32 | _journal_year 2002 33 | _chemical_formula_moiety 'C13 H22 O3' 34 | _chemical_formula_sum 'C13 H22 O3' 35 | _chemical_formula_weight 226.31 36 | _chemical_melting_point 453 37 | _chemical_name_systematic 38 | ; 39 | (1SR,2RS,5RS,6SR,8RS)-7,7-dimethyltricyclo[6.2.1.0^1,6^]undecane-2,5,6-triol 40 | ; 41 | _symmetry_cell_setting triclinic 42 | _symmetry_space_group_name_Hall '-P 1' 43 | _symmetry_space_group_name_H-M 'P -1' 44 | _atom_sites_solution_hydrogens geom 45 | _atom_sites_solution_primary direct 46 | _atom_sites_solution_secondary difmap 47 | _audit_creation_method SHELXL97 48 | _cell_angle_alpha 82.470(10) 49 | _cell_angle_beta 77.560(10) 50 | _cell_angle_gamma 89.460(10) 51 | _cell_formula_units_Z 4 52 | _cell_length_a 9.812(2) 53 | _cell_length_b 11.1410(10) 54 | _cell_length_c 11.443(2) 55 | _cell_measurement_reflns_used 18 56 | _cell_measurement_temperature 293(2) 57 | _cell_measurement_theta_max 8.1 58 | _cell_measurement_theta_min 6.9 59 | _cell_volume 1210.8(3) 60 | _computing_cell_refinement 'MSC/AFC Diffractometer Control Software' 61 | _computing_data_collection 62 | ; 63 | MSC/AFC Diffractometer Control Software (Molecular Structure Corporation, 1995) 64 | ; 65 | _computing_data_reduction 66 | 'TEXSAN (Molecular Structure Corporation, 1995)' 67 | _computing_molecular_graphics 68 | 'ZORTEP (Zsolnai, 1995) and WinGX (Farrugia, 1999)' 69 | _computing_publication_material 'SHELXL97 and 
PARST (Nardelli, 1995)' 70 | _computing_structure_refinement 'SHELXL97 (Sheldrick, 1997)' 71 | _computing_structure_solution 'MULTAN-88 (Debaerdemaeker et al., 1988)' 72 | _diffrn_ambient_temperature 293(2) 73 | _diffrn_measured_fraction_theta_full 0.996 74 | _diffrn_measured_fraction_theta_max 0.996 75 | _diffrn_measurement_device_type 'Rigaku AFC-5R' 76 | _diffrn_measurement_method \w/2\q 77 | _diffrn_radiation_monochromator graphite 78 | _diffrn_radiation_source 'fine-focus sealed tube' 79 | _diffrn_radiation_type MoK\a 80 | _diffrn_radiation_wavelength 0.71070 81 | _diffrn_reflns_av_R_equivalents 0.015 82 | _diffrn_reflns_av_sigmaI/netI 0.039 83 | _diffrn_reflns_limit_h_max 11 84 | _diffrn_reflns_limit_h_min -11 85 | _diffrn_reflns_limit_k_max 0 86 | _diffrn_reflns_limit_k_min -13 87 | _diffrn_reflns_limit_l_max 13 88 | _diffrn_reflns_limit_l_min -13 89 | _diffrn_reflns_number 4499 90 | _diffrn_reflns_theta_full 25.00 91 | _diffrn_reflns_theta_max 25.00 92 | _diffrn_reflns_theta_min 1.84 93 | _diffrn_standards_decay_% -1.09 94 | _diffrn_standards_interval_count 150 95 | _diffrn_standards_number 3 96 | _exptl_absorpt_coefficient_mu 0.086 97 | _exptl_absorpt_correction_type none 98 | _exptl_crystal_colour colourless 99 | _exptl_crystal_density_diffrn 1.241 100 | _exptl_crystal_density_meas ? 101 | _exptl_crystal_density_method . 
102 | _exptl_crystal_description block 103 | _exptl_crystal_F_000 496 104 | _exptl_crystal_size_max 0.5 105 | _exptl_crystal_size_mid 0.4 106 | _exptl_crystal_size_min 0.3 107 | _refine_diff_density_max 0.24 108 | _refine_diff_density_min -0.21 109 | _refine_ls_extinction_coef none 110 | _refine_ls_extinction_method none 111 | _refine_ls_goodness_of_fit_ref 1.101 112 | _refine_ls_hydrogen_treatment noref 113 | _refine_ls_matrix_type full 114 | _refine_ls_number_parameters 299 115 | _refine_ls_number_reflns 4259 116 | _refine_ls_number_restraints 0 117 | _refine_ls_restrained_S_all 1.101 118 | _refine_ls_R_factor_all 0.094 119 | _refine_ls_R_factor_gt 0.054 120 | _refine_ls_shift/su_max 0.014 121 | _refine_ls_shift/su_mean 0.000 122 | _refine_ls_structure_factor_coef Fsqd 123 | _refine_ls_weighting_details 124 | 'calc w = 1/[\s^2^(Fo^2^)+(0.0577P)^2^+0.9623P] where P=(Fo^2^+2Fc^2^)/3' 125 | _refine_ls_weighting_scheme calc 126 | _refine_ls_wR_factor_gt 0.132 127 | _refine_ls_wR_factor_ref 0.156 128 | _reflns_number_gt 2972 129 | _reflns_number_total 4259 130 | _reflns_threshold_expression I>2\s(I) 131 | _cod_duplicate_entry 2012894 132 | _cod_depositor_comments 133 | ; 134 | The following automatic conversions were performed: 135 | 136 | '_chemical_melting_point' value '453K' was changed to '453' - the 137 | value should be numeric and without a unit designator. 138 | 139 | Automatic conversion script 140 | Id: cif_fix_values 1646 2011-03-28 12:23:43Z adriana 141 | ; 142 | _cod_database_code 1000018 143 | loop_ 144 | _symmetry_equiv_pos_as_xyz 145 | 'x, y, z' 146 | '-x, -y, -z' 147 | loop_ 148 | _atom_site_label 149 | _atom_site_fract_x 150 | _atom_site_fract_y 151 | _atom_site_fract_z 152 | _atom_site_U_iso_or_equiv 153 | _atom_site_adp_type 154 | _atom_site_calc_flag 155 | _atom_site_refinement_flags 156 | _atom_site_occupancy 157 | _atom_site_type_symbol 158 | C1A 0.7141(3) 0.2593(2) 0.9184(2) 0.0303(6) Uani d . 
1 C 159 | C2A 0.6901(3) 0.2018(2) 0.8106(2) 0.0336(6) Uani d . 1 C 160 | H2A 0.7796 0.1978 0.7535 0.040 Uiso calc R 1 H 161 | C3A 0.6296(3) 0.0742(3) 0.8510(3) 0.0410(7) Uani d . 1 C 162 | H3A1 0.6222 0.0370 0.7806 0.049 Uiso calc R 1 H 163 | H3A2 0.5363 0.0784 0.9002 0.049 Uiso calc R 1 H 164 | C4A 0.7192(3) -0.0050(2) 0.9236(3) 0.0383(7) Uani d . 1 C 165 | H4A1 0.6725 -0.0828 0.9531 0.046 Uiso calc R 1 H 166 | H4A2 0.8078 -0.0191 0.8709 0.046 Uiso calc R 1 H 167 | C5A 0.7465(3) 0.0526(2) 1.0304(2) 0.0306(6) Uani d . 1 C 168 | H5A 0.6579 0.0596 1.0880 0.037 Uiso calc R 1 H 169 | C6A 0.8121(3) 0.1796(2) 0.9847(2) 0.0281(6) Uani d . 1 C 170 | C7A 0.8437(3) 0.2627(3) 1.0796(2) 0.0383(7) Uani d . 1 C 171 | C8A 0.7786(3) 0.3848(3) 1.0373(3) 0.0409(7) Uani d . 1 C 172 | H8A 0.8228 0.4562 1.0561 0.049 Uiso calc R 1 H 173 | C9A 0.6201(3) 0.3799(3) 1.0801(3) 0.0490(8) Uani d . 1 C 174 | H9A1 0.5797 0.4578 1.0608 0.059 Uiso calc R 1 H 175 | H9A2 0.5926 0.3545 1.1664 0.059 Uiso calc R 1 H 176 | C10A 0.5768(3) 0.2844(3) 1.0080(3) 0.0396(7) Uani d . 1 C 177 | H10A 0.5062 0.3157 0.9651 0.048 Uiso calc R 1 H 178 | H10B 0.5408 0.2113 1.0610 0.048 Uiso calc R 1 H 179 | C11A 0.7907(3) 0.3824(2) 0.9017(2) 0.0363(7) Uani d . 1 C 180 | H11A 0.8866 0.3808 0.8573 0.044 Uiso calc R 1 H 181 | H11B 0.7417 0.4481 0.8651 0.044 Uiso calc R 1 H 182 | C12A 1.0010(3) 0.2824(3) 1.0680(3) 0.0561(9) Uani d . 1 C 183 | H12A 1.0422 0.3190 0.9875 0.084 Uiso calc R 1 H 184 | H12B 1.0435 0.2058 1.0843 0.084 Uiso calc R 1 H 185 | H12C 1.0156 0.3345 1.1248 0.084 Uiso calc R 1 H 186 | C13A 0.7803(4) 0.2134(3) 1.2117(3) 0.0587(10) Uani d . 1 C 187 | H13A 0.6836 0.1927 1.2200 0.088 Uiso calc R 1 H 188 | H13B 0.7882 0.2742 1.2623 0.088 Uiso calc R 1 H 189 | H13C 0.8294 0.1426 1.2355 0.088 Uiso calc R 1 H 190 | O1A 0.59473(19) 0.2694(2) 0.74991(18) 0.0453(5) Uani d . 1 O 191 | H1A 0.6369 0.3255 0.7029 0.068 Uiso calc R 1 H 192 | O2A 0.84002(19) -0.01982(17) 1.08956(18) 0.0385(5) Uani d . 
1 O 193 | H2A1 0.7973 -0.0787 1.1316 0.058 Uiso calc R 1 H 194 | O3A 0.93661(17) 0.16502(17) 0.89587(15) 0.0319(4) Uani d . 1 O 195 | H3A 0.9861 0.1147 0.9245 0.048 Uiso calc R 1 H 196 | C1B 0.7940(3) 0.6323(2) 0.5335(2) 0.0300(6) Uani d . 1 C 197 | C2B 0.8156(3) 0.5029(2) 0.5043(2) 0.0330(6) Uani d . 1 C 198 | H2B 0.9074 0.4977 0.4509 0.040 Uiso calc R 1 H 199 | C3B 0.7034(3) 0.4690(2) 0.4412(3) 0.0359(6) Uani d . 1 C 200 | H3B1 0.6133 0.4672 0.4967 0.043 Uiso calc R 1 H 201 | H3B2 0.7205 0.3884 0.4186 0.043 Uiso calc R 1 H 202 | C4B 0.6993(3) 0.5582(3) 0.3280(3) 0.0396(7) Uani d . 1 C 203 | H4B1 0.7835 0.5499 0.2673 0.048 Uiso calc R 1 H 204 | H4B2 0.6203 0.5372 0.2956 0.048 Uiso calc R 1 H 205 | C5B 0.6875(3) 0.6889(2) 0.3516(2) 0.0313(6) Uani d . 1 C 206 | H5B 0.5961 0.6992 0.4039 0.038 Uiso calc R 1 H 207 | C6B 0.8003(2) 0.7213(2) 0.4158(2) 0.0280(6) Uani d . 1 C 208 | C7B 0.7980(3) 0.8496(2) 0.4626(3) 0.0362(7) Uani d . 1 C 209 | C8B 0.8150(3) 0.8138(3) 0.5955(3) 0.0426(7) Uani d . 1 C 210 | H8B 0.8596 0.8772 0.6269 0.051 Uiso calc R 1 H 211 | C9B 0.6780(3) 0.7643(3) 0.6783(3) 0.0531(9) Uani d . 1 C 212 | H9B1 0.6023 0.8200 0.6727 0.064 Uiso calc R 1 H 213 | H9B2 0.6867 0.7480 0.7617 0.064 Uiso calc R 1 H 214 | C10B 0.6553(3) 0.6465(3) 0.6274(3) 0.0387(7) Uani d . 1 C 215 | H10C 0.5762 0.6530 0.5887 0.046 Uiso calc R 1 H 216 | H10D 0.6393 0.5781 0.6909 0.046 Uiso calc R 1 H 217 | C11B 0.8977(3) 0.6965(3) 0.5892(2) 0.0391(7) Uani d . 1 C 218 | H11C 0.9049 0.6565 0.6682 0.047 Uiso calc R 1 H 219 | H11D 0.9896 0.7082 0.5365 0.047 Uiso calc R 1 H 220 | C12B 0.9221(3) 0.9312(3) 0.3914(3) 0.0434(7) Uani d . 1 C 221 | H12D 1.0079 0.8942 0.4026 0.065 Uiso calc R 1 H 222 | H12E 0.9155 1.0087 0.4201 0.065 Uiso calc R 1 H 223 | H12F 0.9203 0.9417 0.3071 0.065 Uiso calc R 1 H 224 | C13B 0.6646(3) 0.9206(3) 0.4549(3) 0.0571(9) Uani d . 
1 C 225 | H13D 0.6635 0.9908 0.4956 0.086 Uiso calc R 1 H 226 | H13E 0.5845 0.8700 0.4927 0.086 Uiso calc R 1 H 227 | H13F 0.6625 0.9453 0.3718 0.086 Uiso calc R 1 H 228 | O1B 0.8074(2) 0.41898(19) 0.61185(18) 0.0459(6) Uani d . 1 O 229 | H1B 0.8856 0.3947 0.6156 0.069 Uiso calc R 1 H 230 | O2B 0.6974(2) 0.76823(19) 0.23988(18) 0.0438(5) Uani d . 1 O 231 | H2B1 0.6220 0.7690 0.2199 0.066 Uiso calc R 1 H 232 | O3B 0.93444(17) 0.70004(17) 0.33988(16) 0.0320(4) Uani d . 1 O 233 | H3B 0.9459 0.7466 0.2767 0.048 Uiso calc R 1 H 234 | loop_ 235 | _atom_site_aniso_label 236 | _atom_site_aniso_U_11 237 | _atom_site_aniso_U_22 238 | _atom_site_aniso_U_33 239 | _atom_site_aniso_U_12 240 | _atom_site_aniso_U_13 241 | _atom_site_aniso_U_23 242 | C1A 0.0273(13) 0.0295(14) 0.0315(14) 0.0018(11) -0.0054(11) 0.0037(11) 243 | C2A 0.0269(14) 0.0401(16) 0.0325(14) -0.0007(12) -0.0084(11) 0.0033(12) 244 | C3A 0.0406(16) 0.0428(17) 0.0409(17) -0.0081(13) -0.0133(13) -0.0019(13) 245 | C4A 0.0371(16) 0.0312(15) 0.0445(17) -0.0042(12) -0.0060(13) -0.0019(13) 246 | C5A 0.0257(13) 0.0296(14) 0.0328(14) 0.0024(11) -0.0027(11) 0.0031(11) 247 | C6A 0.0261(13) 0.0300(14) 0.0249(13) -0.0015(11) -0.0016(10) 0.0018(11) 248 | C7A 0.0480(17) 0.0338(15) 0.0349(15) 0.0013(13) -0.0119(13) -0.0068(12) 249 | C8A 0.0490(18) 0.0320(16) 0.0428(17) 0.0007(13) -0.0105(14) -0.0076(13) 250 | C9A 0.055(2) 0.0422(18) 0.0481(18) 0.0172(15) -0.0045(15) -0.0102(15) 251 | C10A 0.0314(15) 0.0378(16) 0.0446(17) 0.0053(12) -0.0009(13) 0.0004(13) 252 | C11A 0.0376(16) 0.0318(15) 0.0367(15) 0.0021(12) -0.0067(12) 0.0037(12) 253 | C12A 0.058(2) 0.056(2) 0.067(2) -0.0009(17) -0.0346(18) -0.0182(18) 254 | C13A 0.097(3) 0.0470(19) 0.0328(17) 0.0111(19) -0.0162(18) -0.0059(14) 255 | O1A 0.0314(11) 0.0587(14) 0.0440(12) -0.0064(9) -0.0166(9) 0.0144(10) 256 | O2A 0.0324(10) 0.0347(11) 0.0427(11) 0.0016(8) -0.0055(9) 0.0120(9) 257 | O3A 0.0251(9) 0.0370(11) 0.0300(10) 0.0044(8) -0.0032(8) 0.0045(8) 258 | C1B 0.0242(13) 0.0374(15) 
0.0271(13) -0.0014(11) -0.0049(11) 0.0000(11) 259 | C2B 0.0262(13) 0.0340(15) 0.0342(14) -0.0005(11) -0.0033(11) 0.0067(12) 260 | C3B 0.0363(15) 0.0290(14) 0.0426(16) -0.0040(12) -0.0084(13) -0.0047(12) 261 | C4B 0.0429(17) 0.0405(17) 0.0394(16) -0.0067(13) -0.0183(13) -0.0040(13) 262 | C5B 0.0286(14) 0.0351(15) 0.0296(14) -0.0008(11) -0.0094(11) 0.0030(11) 263 | C6B 0.0220(13) 0.0332(14) 0.0275(13) 0.0007(11) -0.0035(10) -0.0023(11) 264 | C7B 0.0359(15) 0.0307(15) 0.0418(16) -0.0041(12) -0.0072(13) -0.0058(12) 265 | C8B 0.0457(18) 0.0488(18) 0.0337(15) -0.0103(14) -0.0038(13) -0.0145(13) 266 | C9B 0.055(2) 0.059(2) 0.0406(18) -0.0038(16) 0.0060(15) -0.0172(16) 267 | C10B 0.0327(15) 0.0457(17) 0.0336(15) -0.0046(13) 0.0003(12) -0.0023(13) 268 | C11B 0.0366(15) 0.0540(19) 0.0258(14) -0.0103(13) -0.0080(12) 0.0011(13) 269 | C12B 0.0503(18) 0.0342(16) 0.0468(18) -0.0105(13) -0.0133(14) -0.0040(13) 270 | C13B 0.053(2) 0.0416(19) 0.079(2) 0.0132(15) -0.0128(18) -0.0182(18) 271 | O1B 0.0318(11) 0.0540(13) 0.0431(12) 0.0025(10) -0.0043(9) 0.0194(10) 272 | O2B 0.0408(12) 0.0500(13) 0.0408(11) -0.0068(10) -0.0198(9) 0.0126(9) 273 | O3B 0.0263(9) 0.0381(11) 0.0277(10) 0.0012(8) -0.0020(8) 0.0033(8) 274 | loop_ 275 | _atom_type_symbol 276 | _atom_type_description 277 | _atom_type_scat_dispersion_real 278 | _atom_type_scat_dispersion_imag 279 | _atom_type_scat_source 280 | C C 0.0033 0.0016 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 281 | H H 0.0000 0.0000 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 282 | O O 0.0106 0.0060 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 283 | _journal_paper_doi 10.1107/S0108270102010843 284 | -------------------------------------------------------------------------------- /scythe/utils/interface.py: -------------------------------------------------------------------------------- 1 | """Utilities for working with extractors from other applications""" 2 | 3 | from stevedore.extension import ExtensionManager 4 | 
from stevedore.driver import DriverManager
from typing import Iterator, Union, Dict, List
from collections import namedtuple
from copy import deepcopy

from scythe.adapters.base import BaseAdapter
from scythe.base import BaseExtractor
import logging

logger = logging.getLogger(__name__)

# Result of running one extractor on one group of files:
# (paths in the group, extractor name, extracted metadata)
ExtractResult = namedtuple('ExtractResult', ['group', 'extractor', 'metadata'])


def _output_plugin_info(mgr: ExtensionManager) -> dict:
    """Gets information about all plugins attached to a particular manager

    Args:
        mgr (ExtensionManager): Plugin manager
    Returns:
        (dict): Dictionary where keys are plugin ids and values are descriptions
    """

    output = {}
    for name, ext in mgr.items():
        # ``ext.plugin`` is the plugin class; calling it instantiates the plugin
        plugin = ext.plugin()
        output[name] = {
            # NOTE(review): raises AttributeError if the plugin class has no
            # docstring (``__doc__`` is None) -- confirm all plugins are documented
            'description': plugin.__doc__.split("\n")[0],
            'version': plugin.version(),
            'class': ext.entry_point_target
        }
    return output


def get_available_extractors():
    """Get information about the available extractors

    Returns:
        (dict): Descriptions of available extractors, keyed by extractor name
    """
    mgr = ExtensionManager(
        namespace='scythe.extractor',
    )

    # Get information about each extractor
    return _output_plugin_info(mgr)


def get_available_adapters() -> dict:
    """Get information on all available adapters

    Returns:
        (dict) Where keys are adapter names and values are descriptions
    """

    return _output_plugin_info(ExtensionManager(namespace='scythe.adapter'))


def _get_adapter_map(adapter_map: str, extractors: list) -> dict:
    """Helper function to generate 'adapter map'

    Adapter map is a list of extractors and names of the appropriate adapters
    to use to format their output.

    Args:
        adapter_map (str): string argument for adapters.
70 | - 'match' means just find adapters with same names as corresponding extractors. 71 | extractors ([str]): list of extractors 72 | Returns: 73 | (dict) where keys are adapter names extractor/adapter names and values are adapter objects. 74 | """ 75 | if adapter_map is None: 76 | adapter_map = {} 77 | elif adapter_map == 'match': 78 | adapters = get_available_adapters() 79 | adapter_map = dict((x, x) for x in extractors if x in adapters) 80 | elif not isinstance(adapter_map, dict): 81 | raise ValueError('Adapter map must be a dict, None, or `matching`') 82 | 83 | # Give it to the user 84 | return adapter_map 85 | 86 | 87 | def get_extractor_and_adapter_contexts(name, global_context, extractor_context, adapter_context): 88 | """ 89 | Helper function to update the helper and adapter contexts and the 'name' 90 | of a extractor/adapter pair 91 | Args: 92 | name (str): adapter/extractor name. 93 | global_context (dict): Context of the files, used for every extractor and adapter 94 | adapter_context (dict): Context used for adapters. Key is the name of the adapter, 95 | value is the context. The key ``@all`` is used to for context used for every adapter 96 | extractor_context (dict): Context used for adapters. Key is the name of the extractor, 97 | value is the context. 
The key ``@all`` is used for context used for every extractor
    Returns:
        (dict, dict): (extractor context, adapter context) tuple
    """

    # Get the context information for the extractor and adapter.
    # Deep-copy so the per-name updates below cannot mutate the caller's
    # global_context; name-specific entries override the ``@all`` entries.
    my_extractor_context = deepcopy(global_context)
    my_extractor_context.update(extractor_context.get('@all', {}))
    my_extractor_context.update(extractor_context.get(name, {}))

    my_adapter_context = deepcopy(global_context)
    my_adapter_context.update(adapter_context.get('@all', {}))
    my_adapter_context.update(adapter_context.get(name, {}))

    return my_extractor_context, my_adapter_context


def _get_extractor_list(to_include: list, to_exclude: list) -> list:
    """ Helper function to get a list of extractors given lists of extractors to include/exclude

    Args:
        to_include ([str]): Predefined list of extractors to run. Only these will be used.
            Mutually exclusive with `to_exclude`.
        to_exclude ([str]): List of extractors to exclude.
            Mutually exclusive with `to_include`.
    Returns:
        List of all applicable extractors
    """

    extractors = get_available_extractors()
    if to_include is not None and to_exclude is not None:
        raise ValueError('Including and excluding extractors are mutually exclusive')
    elif to_include is not None:
        # Validate that every requested extractor is actually installed
        missing_extractors = set(to_include).difference(extractors.keys())
        if len(missing_extractors) > 0:
            raise ValueError('Some extractors are missing: ' + ' '.join(missing_extractors))
        extractors = to_include
    elif to_exclude is not None:
        # NOTE(review): set difference makes the resulting order arbitrary
        extractors = list(set(extractors.keys()).difference(to_exclude))

    # NOTE(review): when neither filter is given this returns the full dict
    # from get_available_extractors() rather than a list; iterating it yields
    # the extractor names, which is what callers rely on
    return extractors


def get_extractor(name: str) -> BaseExtractor:
    """Load an extractor object

    Args:
        name (str): Name of extractor
    Returns:
        Requested extractor
    """
    return DriverManager(
        namespace='scythe.extractor',
        name=name,
        invoke_on_load=True
    ).driver


def get_adapter(name: str) -> BaseAdapter:
    """Load an adapter

    Args:
        name (str): Name of adapter
    Returns:
        (BaseAdapter) Requested adapter
    """

    # Load the adapter (invoke_on_load instantiates the plugin class)
    mgr = DriverManager(
        namespace='scythe.adapter',
        name=name,
        invoke_on_load=True
    )

    # Give it to the user
    return mgr.driver


def run_extractor(name, group, context=None, adapter=None):
    """Invoke an extractor on a certain group of files

    Args:
        name (str): Name of the extractor
        group ([str]): Paths to group of files to be parsed
        context (dict): Context of the files, used in adapter and extractor
        adapter (str): Name of adapter to use to transform metadata
    Returns:
        ([dict]): Metadata generated by the extractor
    """
    metadata = get_extractor(name).extract(group, context)
    if adapter is not None:
        adapter = get_adapter(adapter)
        return 
adapter.transform(metadata, context=context)
    return metadata


def run_all_extractors_on_directory(directory: str, global_context=None,
                                    adapter_context: Union[None, dict] = None,
                                    extractor_context: Union[None, dict] = None,
                                    include_extractors: Union[None, List[str]] = None,
                                    exclude_extractors: Union[None, List] = None,
                                    adapter_map: Union[None, str, Dict[str, str]] = None,
                                    default_adapter: Union[None, str] = None) \
        -> Iterator[ExtractResult]:
    """Run all known extractors on a directory of files

    Args:
        directory (str): Path to directory to be parsed
        global_context (dict): Context of the files, used for every extractor and adapter
        adapter_context (dict): Context used for adapters. Key is the name of the adapter,
            value is the context. The key ``@all`` is used for context used for every adapter
        extractor_context (dict): Context used for extractors. Key is the name of the extractor,
            value is the context. The key ``@all`` is used for context used for every extractor
        include_extractors ([str]): Predefined list of extractors to run. Only these will be used.
            Mutually exclusive with `exclude_extractors`.
        exclude_extractors ([str]): List of extractors to exclude.
            Mutually exclusive with `include_extractors`.
        adapter_map (str, dict): Map of extractor name to the desired adapter.
            Use 'match' to find adapters with the same names
        default_adapter (str): Adapter to use if no other adapter is defined
    Yields
        ((str), str, dict) Tuple of (1) group of files, (2) name of extractor, (3) metadata
    """

    # Load in default arguments
    if global_context is None:
        global_context = dict()
    if adapter_context is None:
        adapter_context = dict()
    if extractor_context is None:
        extractor_context = dict()

    # Get the list of extractors
    extractors = _get_extractor_list(include_extractors, exclude_extractors)

    # Make the adapter map
    adapter_map = _get_adapter_map(adapter_map=adapter_map, extractors=extractors)

    # Get the list of known extractors
    for name in extractors:
        # Get the extractor and adapter
        extractor = get_extractor(name)
        adapter_name = adapter_map.get(name, default_adapter)
        if adapter_name is not None:
            adapter = get_adapter(adapter_name)
        else:
            adapter = None

        my_extractor_context, my_adapter_context = get_extractor_and_adapter_contexts(name,
                                                                                      global_context,
                                                                                      extractor_context,
                                                                                      adapter_context)

        for group, metadata in extractor.extract_directory(directory, context=my_extractor_context):
            # Run the adapter, if defined
            if adapter is not None:
                try:
                    metadata = adapter.transform(metadata, my_adapter_context)
                except Exception as e:
                    # A failing adapter skips this group entirely rather than
                    # yielding the raw (untransformed) metadata
                    logger.warning(f'Adapter for {extractor} failed with caught exception: {e}')
                    continue
            # Adapters may return None to signal "nothing useful"; skip the group
            if metadata is None:
                continue

            yield ExtractResult(group, name, metadata)


def run_all_extractors_on_group(group,
                                adapter_map=None,
                                global_context=None,
                                adapter_context: Union[None, dict] = None,
                                extractor_context: Union[None, dict] = None,
                                include_extractors: Union[None, List[str]] = None,
                                exclude_extractors: Union[None, List] = None,
                                default_adapter: Union[None, str] = None):
    """
    Parse metadata from a file-group and adapt its metadata per a user-supplied adapter_map.

    This function is effectively a wrapper to run_extractor() that enables us to output metadata
    in the same format as run_all_extractors_on_directory(), but just on a single file group.

    Args:
        group ([str]): Paths to group of files to be parsed
        global_context (dict): Context of the files, used for every extractor and adapter
        adapter_context (dict): Context used for adapters. Key is the name of the adapter,
            value is the context. The key ``@all`` is used for context used for every adapter
        extractor_context (dict): Context used for extractors. Key is the name of the extractor,
            value is the context. The key ``@all`` is used for context used for every extractor
        include_extractors ([str]): Predefined list of extractors to run. Only these will be used.
            Mutually exclusive with `exclude_extractors`.
        exclude_extractors ([str]): List of extractors to exclude.
            Mutually exclusive with `include_extractors`.
        adapter_map (str, dict): Map of extractor name to the desired adapter.
290 | Use 'match' to find adapters with the same names: 291 | default_adapter: 292 | Yields: 293 | Metadata for a certain 294 | """ 295 | 296 | # Load in default arguments 297 | if global_context is None: 298 | global_context = dict() 299 | if adapter_context is None: 300 | adapter_context = dict() 301 | if extractor_context is None: 302 | extractor_context = dict() 303 | 304 | # Get the list of extractors 305 | extractors = _get_extractor_list(include_extractors, exclude_extractors) 306 | 307 | # Make the adapter map 308 | adapter_map = _get_adapter_map(adapter_map=adapter_map, extractors=extractors) 309 | 310 | for name in extractors: 311 | # Get the extractor and adapter 312 | adapter_name = adapter_map.get(name, default_adapter) 313 | 314 | my_extractor_context, my_adapter_context = get_extractor_and_adapter_contexts(name, 315 | global_context, 316 | extractor_context, 317 | adapter_context) 318 | 319 | metadata = run_extractor(name, group, context=my_extractor_context, adapter=adapter_name) 320 | 321 | yield ExtractResult(group, name, metadata) 322 | -------------------------------------------------------------------------------- /tests/data/tdb/PbSSeTe_Na.TDB: -------------------------------------------------------------------------------- 1 | $ Database file written 1900- 1-** 2 | $ From database: User data 1900.01.** 3 | ELEMENT /- ELECTRON_GAS 0.0000E+00 0.0000E+00 0.0000E+00! 4 | ELEMENT VA VACUUM 0.0000E+00 0.0000E+00 0.0000E+00! 5 | ELEMENT NA BCC_A2 2.2990E+01 6.4475E+03 5.1447E+01 ! 6 | ELEMENT PB FCC_A1 2.0720E+02 6.8785E+03 6.4785E+01! 7 | ELEMENT S ORTHORHOMBIC_S 3.2066E+01 0.0000E+00 0.0000E+00! 8 | ELEMENT SE HEXAGONAL_A8 7.8960E+01 5.5145E+03 4.1966E+01! 9 | ELEMENT TE HEXAGONAL_A8 1.2760E+02 6.1212E+03 4.9497E+01! 10 | 11 | SPECIES PB2 PB2! 12 | SPECIES PBS PB1S1! 13 | SPECIES PBSE PB1SE1! 14 | SPECIES PBTE PB1TE1! 15 | SPECIES TE+2 TE1/+2! 16 | SPECIES TE2 TE2! 17 | SPECIES TE3 TE3! 18 | SPECIES TE4 TE4! 19 | SPECIES TE5 TE5! 
20 | SPECIES TE6 TE6! 21 | SPECIES TE7 TE7! 22 | SPECIES VA+1 VA1/+1! 23 | SPECIES VA+2 VA1/+2! 24 | SPECIES VA-1 VA1/-1! 25 | SPECIES VA-2 VA1/-2! 26 | SPECIES NA2TE NA2TE1! 27 | SPECIES NATE3 NA1TE3! 28 | 29 | 30 | FUNCTION GHSERPB 298.15 -7650.085+101.700244*T-24.5242231*T*LN(T) 31 | -.00365895*T**2-2.4395E-07*T**3; 600.61 Y 32 | -10531.095+154.243182*T-32.4913959*T*LN(T)+.00154613*T**2 33 | +8.05448E+25*T**(-9); 1200 Y 34 | +4157.616+53.139072*T-18.9640637*T*LN(T)-.002882943*T**2 35 | +9.8144E-08*T**3-2696755*T**(-1)+8.05448E+25*T**(-9); 2100 N ! 36 | FUNCTION GLIQPB 298.15 +GHSERPB#+4672.124-7.750683*T-6.019E-19*T**7; 37 | 600.61 Y 38 | -5677.958+146.176046*T-32.4913959*T*LN(T)+.00154613*T**2; 1200 Y 39 | +9010.753+45.071937*T-18.9640637*T*LN(T)-.002882943*T**2 40 | +9.8144E-08*T**3-2696755*T**(-1); 2100 N ! 41 | FUNCTION GLIQTE 298.15 -17554.731+685.877639*T-126.318*T*LN(T) 42 | +.2219435*T**2-9.42075E-05*T**3+827930*T**(-1); 626.49 Y 43 | -3165763.48+46756.357*T-7196.41*T*LN(T)+7.09775*T**2-.00130692833*T**3 44 | +2.58051E+08*T**(-1); 722.66 Y 45 | +180326.959-1500.57909*T+202.743*T*LN(T)-.142016*T**2 46 | +1.6129733E-05*T**3-24238450*T**(-1); 1150 Y 47 | +6328.687+148.708299*T-32.5596*T*LN(T); 1600 N ! 48 | FUNCTION GBCCPB 298.15 +GHSERPB#+2400-1.1*T; 2100 N ! 49 | FUNCTION GHCPPB 298.15 +GHSERPB#+300+T; 2100 N ! 50 | FUNCTION GHSERTE 298.15 -10544.679+183.372894*T-35.6687*T*LN(T) 51 | +.01583435*T**2-5.240417E-06*T**3+155015*T**(-1); 722.66 Y 52 | +9160.595-129.265373*T+13.004*T*LN(T)-.0362361*T**2+5.006367E-06*T**3 53 | -1286810*T**(-1); 1150 Y 54 | -12781.349+174.901226*T-32.5596*T*LN(T); 1600 N ! 55 | FUNCTION GHSERS 298.15 -5198.294+53.913855*T-10.726*T*LN(T) 56 | -.0273801*T**2+8.179537E-06*T**3; 368.30 Y 57 | -6475.706+94.182332*T-17.8693298*T*LN(T)-.010936877*T**2 58 | +1.406467E-06*T**3+36871*T**(-1); 1300 Y 59 | -12485.546+188.304687*T-32*T*LN(T); 1301 N ! 
60 | FUNCTION GLIQS 298.15 -4196.575+85.63027*T-17.413*T*LN(T) 61 | -.00993935*T**2-7.0062E-08*T**3+1250*T**(-1); 335 Y 62 | +1790361.98-44195.4514*T+7511.61943*T*LN(T)-13.9855175*T**2 63 | +.0048387386*T**3-79880891*T**(-1); 388.36 Y 64 | -876313.954+23366.873*T-4028.756*T*LN(T)+7.954595*T**2 65 | -.00290851333*T**3+33980035*T**(-1); 432.25 Y 66 | +454088.687-7814.67023*T+1237.001*T*LN(T)-1.5607295*T**2 67 | +3.59883667E-04*T**3-31765395*T**(-1); 500 Y 68 | +18554.561-144.895285*T+16.535*T*LN(T)-.0454119*T**2+8.327402E-06*T**3 69 | -2705030*T**(-1); 700 Y 70 | +21243.126-113.298877*T+9.944*T*LN(T)-.0288384*T**2+3.791365E-06*T**3 71 | -3507570*T**(-1); 900 Y 72 | +16117.849-32.79523*T-2.425*T*LN(T)-.01712545*T**2+1.84974E-06*T**3 73 | -3215170*T**(-1); 1300 Y 74 | -6461.814+175.590536*T-32*T*LN(T); 1301 N ! 75 | FUNCTION GBCCS 298.15 +105000+GHSERS#; 1301 N ! 76 | FUNCTION GFCCS 298.15 +105000+GHSERS#; 1301 N ! 77 | FUNCTION GHSERSE 298.15 -9376.371+174.205877*T-33.6527*T*LN(T) 78 | +.02424314*T**2-1.5318461E-05*T**3+102249*T**(-1); 494 Y 79 | -37546.134+507.111538*T-81.2006585*T*LN(T)+.037144892*T**2 80 | -5.611026E-06*T**3+2614263*T**(-1); 800 Y 81 | -12193.47+197.770166*T-35.1456*T*LN(T); 1000 N ! 82 | FUNCTION GLIQSE 298.15 +50533.347-1178.28824*T+194.107439*T*LN(T) 83 | -.390268991*T**2+1.19219297E-04*T**3-2224398*T**(-1); 494 Y 84 | -5228.304+183.72559*T-35.1456*T*LN(T); 1000 N ! 85 | FUNCTION GPBS 298.15 -98000+8.6*T+GHSERS#+GHSERPB#; 6000 N ! 86 | FUNCTION GPBTE 298.15 -128362.29+432.84959*T-52.848901*T*LN(T)+GHSERPB# 87 | +GHSERTE#; 6000 N ! 88 | FUNCTION GPBSE 298.15 -99783.25+22.58*T+GHSERPB#+GHSERSE#; 6000 N ! 89 | FUNCTION GVA 298.15 +2.3*R#*T; 6000 N ! 90 | FUNCTION UN_ASS 298.15 +0.0; 300 N ! 
91 | 92 | $ Na 93 | $ ------------------------------------- 94 | FUNCTION GHSERNA 200.00 95 | -11989.434+260.548732*T-51.0393608*T*LN(T)+72.306633E-3*T**2 96 | -43.638283E-6*T**3+132154*T**(-1); 370.87 Y 97 | -11009.884+199.619999*T-38.1198801*T*LN(T)+9.745854E-3*T**2-1.70664E-6*T**3 98 | +34342*T**(-1)+165.071E21*T**(-9); 2300.00 N ! 99 | 100 | FUNCTION GLIQNA 200 2581.02-6.95218*T-276.132E-20*T**7+GHSERNA; 370.87 Y 101 | -8400.44+192.587343*T-38.1198801*T*LN(T)+9.745854E-3*T**2-1.70664E-6*T**3 102 | +34342*T**(-1); 2300.00 N ! 103 | 104 | FUNCTION GHCPNA 200 -104+2*T+GHSERNA; 2300 N ! 105 | 106 | FUNCTION GFCCNA 200 -50+1.3*T+GHSERNA; 2300 N ! 107 | $ ------------------------------------- 108 | 109 | TYPE_DEFINITION % SEQ *! 110 | DEFINE_SYSTEM_DEFAULT ELEMENT 2 ! 111 | DEFAULT_COMMAND DEF_SYS_ELEMENT VA /- ! 112 | 113 | 114 | PHASE LIQUID:L % 1 1.0 ! 115 | CONSTITUENT LIQUID:L :PB,PBS,PBSE,PBTE,S,SE,TE,NA,NA2TE,NATE3 : ! 116 | 117 | PARAMETER G(LIQUID,PB;0) 298.15 +GLIQPB#; 2100 N REF0 ! 118 | PARAMETER G(LIQUID,PBS;0) 298.15 +GLIQPB#+GLIQS#-60000-6*T; 119 | 3000 N REF0 ! 120 | PARAMETER G(LIQUID,PBSE;0) 298.15 +GLIQPB#+GLIQSE#-91032.45 121 | +29.88*T; 3000 N REF0 ! 122 | PARAMETER G(LIQUID,PBTE;0) 298.15 +GLIQPB#+GLIQTE#-60870.273 123 | +18.088152*T; 3000 N REF0 ! 124 | PARAMETER G(LIQUID,S;0) 298.15 +GLIQS#; 1600 N REF0 ! 125 | PARAMETER G(LIQUID,SE;0) 298.15 +GLIQSE#; 1600 N REF0 ! 126 | PARAMETER G(LIQUID,TE;0) 298.15 +GLIQTE#; 1600 N REF0 ! 127 | PARAMETER G(LIQUID,PB,PBS;0) 298.15 +21000; 3000 N REF0 ! 128 | PARAMETER G(LIQUID,PB,PBS;1) 298.15 -2*T; 3000 N REF0 ! 129 | PARAMETER G(LIQUID,PB,PBTE;0) 298.15 +20634.752-9.7324602*T; 130 | 3000 N REF0 ! 131 | PARAMETER G(LIQUID,PB,PBTE;1) 298.15 +7.858817; 3000 N REF0 ! 132 | PARAMETER G(LIQUID,PB,PBSE;0) 298.15 +19500.32; 3000 N REF0 ! 133 | PARAMETER G(LIQUID,PB,PBSE;1) 298.15 -1003.26+.58*T; 3000 N REF0 ! 134 | PARAMETER G(LIQUID,PB,PBSE;2) 298.15 +8352.21-5.64*T; 3000 N 135 | REF0 ! 
136 | PARAMETER G(LIQUID,PBS,S;0) 298.15 +4500; 3000 N REF0 ! 137 | PARAMETER G(LIQUID,PBS,S;1) 298.15 -11000+2*T; 3000 N REF0 ! 138 | PARAMETER G(LIQUID,PBS,S;2) 298.15 +8000+7*T; 3000 N REF0 ! 139 | PARAMETER G(LIQUID,PBS,PBTE;0) 298.15 -7046.79603; 6000 N REF0 ! 140 | PARAMETER G(LIQUID,PBS,PBTE;1) 298.15 -12692.7833; 6000 N REF0 ! 141 | PARAMETER G(LIQUID,PBS,PBSE;0) 298.15 -17214.314+13.5254896*T; 142 | 6000 N REF0 ! 143 | PARAMETER G(LIQUID,PBSE,SE;0) 298.15 +17503.24-7.95*T; 3000 N 144 | REF0 ! 145 | PARAMETER G(LIQUID,PBSE,SE;1) 298.15 -4201.24+1.42*T; 3000 N 146 | REF0 ! 147 | PARAMETER G(LIQUID,PBSE,SE;2) 298.15 +16498.24-3.61*T; 3000 N 148 | REF0 ! 149 | PARAMETER G(LIQUID,PBSE,PBTE;0) 298.15 +9090.74243; 6000 N REF0 ! 150 | PARAMETER G(LIQUID,PBTE,TE;0) 298.15 -4167.4859-2.660869*T; 3000 151 | N REF0 ! 152 | PARAMETER G(LIQUID,PBTE,TE;1) 298.15 +3500.9393; 3000 N REF0 ! 153 | PARAMETER G(LIQUID,NA;0) 298.15 +GLIQNA#; 3000 N REF0 ! 154 | PARAMETER G(LIQUID,NA2TE;0) 298.15 +2*GLIQNA#+GLIQTE#-320000 155 | +52*T; 3000 N REF0 ! 156 | PARAMETER G(LIQUID,NATE3;0) 298.15 +GLIQNA#+3*GLIQTE#-185000 157 | +49*T; 3000 N REF0 ! 158 | PARAMETER G(LIQUID,NA,NA2TE;0) 298.15 +14000+4*T; 3000 N REF0 ! 159 | PARAMETER G(LIQUID,NA,NA2TE;1) 298.15 +16000-12*T; 3000 N REF0 ! 160 | PARAMETER G(LIQUID,NA,NA2TE;2) 298.15 +4000; 3000 N REF0 ! 161 | PARAMETER G(LIQUID,NA2TE,TE;0) 298.15 -40000-4*T; 3000 N REF0 ! 162 | PARAMETER G(LIQUID,NA2TE,TE;1) 298.15 +2000+4*T; 3000 N REF0 ! 163 | 164 | 165 | TYPE_DEFINITION & GES A_P_D FCC_A1 MAGNETIC -3.0 2.80000E-01 ! 166 | PHASE FCC_A1 %& 2 1 1 ! 167 | CONSTITUENT FCC_A1 :PB : VA : ! 168 | 169 | PARAMETER G(FCC_A1,PB:VA;0) 298.15 +GHSERPB#; 2100 N REF0 ! 170 | 171 | 172 | PHASE HALITE % 2 1 1 ! 173 | CONSTITUENT HALITE :PB,NA,VA : S,SE,TE,VA : ! 174 | 175 | PARAMETER G(HALITE,PB:S;0) 298.15 +GPBS#; 6000 N REF0 ! 176 | PARAMETER G(HALITE,VA:S;0) 298.15 +GHSERS#+GVA#; 6000 N REF0 ! 
177 | PARAMETER G(HALITE,PB:SE;0) 298.15 +GPBSE#; 6000 N REF0 ! 178 | PARAMETER G(HALITE,VA:SE;0) 298.15 +GHSERSE#+GVA#; 6000 N REF0 ! 179 | PARAMETER G(HALITE,PB:TE;0) 298.15 +GPBTE#; 6000 N REF0 ! 180 | PARAMETER G(HALITE,VA:TE;0) 298.15 +GHSERTE#+GVA#; 6000 N REF0 ! 181 | PARAMETER G(HALITE,PB:VA;0) 298.15 +GHSERPB#+GVA#; 6000 N REF0 ! 182 | PARAMETER G(HALITE,VA:VA;0) 298.15 +2*GVA#; 6000 N REF0 ! 183 | PARAMETER G(HALITE,PB,VA:S;0) 298.15 +38570.1008+4.42495068*T; 184 | 6000 N REF0 ! 185 | PARAMETER G(HALITE,PB:S,TE;0) 298.15 +38232.7229-20.9138489*T; 186 | 3600 N REF0 ! 187 | PARAMETER G(HALITE,PB:S,TE;1) 298.15 +4372.80543; 3600 N REF0 ! 188 | PARAMETER G(HALITE,PB:S,SE;0) 298.15 +3206.19881; 3600 N REF0 ! 189 | PARAMETER G(HALITE,PB:S,SE;1) 298.15 +590.16994; 6000 N REF0 ! 190 | PARAMETER G(HALITE,PB:S,SE,TE;0) 298.15 -4.5249796E+03; 3000 N REF0 ! 191 | PARAMETER G(HALITE,PB:S,SE,TE;1) 298.15 3.0769838E+04; 3000 N REF0 ! 192 | PARAMETER G(HALITE,PB:S,SE,TE;2) 298.15 -2.3660755E+04; 3000 N REF0 ! 193 | PARAMETER G(HALITE,PB:S,VA;0) 298.15 +38944.7237+4.62924619*T; 194 | 6000 N REF0 ! 195 | PARAMETER G(HALITE,PB,VA:SE;0) 298.15 +37953.5572; 6000 N REF0 ! 196 | PARAMETER G(HALITE,PB:SE,TE;0) 298.15 +10250.9638; 3600 N REF0 ! 197 | PARAMETER G(HALITE,PB:SE,TE;1) 298.15 +1348.60571; 3600 N REF0 ! 198 | PARAMETER G(HALITE,PB:SE,VA;0) 298.15 +45585.0811; 6000 N REF0 ! 199 | PARAMETER G(HALITE,PB,VA:TE;0) 298.15 +38070.478+2.39964988*T; 200 | 6000 N REF0 ! 201 | PARAMETER G(HALITE,PB:TE,VA;0) 298.15 +72780.5624-10.7972522*T; 202 | 6000 N REF0 ! 203 | PARAMETER G(HALITE,NA,PB:TE;0) 298.15 -8.2090710E+04 204 | +3.1004837E+01*T; 6000 N REF0 ! 205 | PARAMETER G(HALITE,NA:VA;0) 298.15 +GHSERNA#+GVA#; 6000 N REF0 ! 206 | PARAMETER G(HALITE,NA:TE;0) 298.15 +GHSERNA#+GHSERTE# 207 | -74873.8344; 6000 N REF0 ! 208 | PARAMETER G(HALITE,NA,PB:SE;0) 298.15 -7.84156946E+04; 6000 N REF0 ! 209 | PARAMETER G(HALITE,NA,PB:S;0) 298.15 -1.3192696E+05; 6000 N REF0 ! 
210 | PARAMETER G(HALITE,NA:S;0) 298.15 -61729.2889+GHSERNA# 211 | +GHSERS#; 6000 N REF0 ! 212 | PARAMETER G(HALITE,NA:SE;0) 298.15 -82803.3063+GHSERNA# 213 | +GHSERSE#; 6000 N REF0 ! 214 | 215 | PHASE HEXAGONAL_A8 % 1 1.0 ! 216 | CONSTITUENT HEXAGONAL_A8 :SE,TE : ! 217 | 218 | PARAMETER G(HEXAGONAL_A8,SE;0) 298.15 +GHSERSE#; 1600 N REF0 ! 219 | PARAMETER G(HEXAGONAL_A8,TE;0) 298.15 +GHSERTE#; 1600 N REF0 ! 220 | 221 | 222 | PHASE ORTHORHOMBIC_S % 1 1.0 ! 223 | CONSTITUENT ORTHORHOMBIC_S :S : ! 224 | 225 | PARAMETER G(ORTHORHOMBIC_S,S;0) 298.15 +GHSERS#; 1301 N REF0 ! 226 | 227 | TYPE_DEFINITION & GES A_P_D BCC_A2 MAGNETIC -1.0 4.00000E-01 ! 228 | PHASE BCC_A2 %& 2 1 3 ! 229 | CONSTITUENT BCC_A2 :NA%,PB : VA% : ! 230 | 231 | PARAMETER G(BCC_A2,NA:VA;0) 298.15 +GHSERNA#; 6000 N REF0 ! 232 | PARAMETER G(BCC_A2,PB:VA;0) 298.15 +GBCCPB#; 2100 N REF0 ! 233 | 234 | PHASE NA2TE % 2 2 1 ! 235 | CONSTITUENT NA2TE :NA : TE : ! 236 | 237 | PARAMETER G(NA2TE,NA:TE;0) 298.15 +2*GHSERNA#+GHSERTE#-330000 238 | +39.5*T; 6000 N REF0 ! 239 | 240 | 241 | PHASE NATE % 2 1 1 ! 242 | CONSTITUENT NATE :NA : TE : ! 243 | 244 | PARAMETER G(NATE,NA:TE;0) 298.15 +GHSERNA#+GHSERTE#-175000 245 | +25.6*T; 6000 N REF0 ! 246 | 247 | 248 | PHASE NATE3 % 2 1 3 ! 249 | CONSTITUENT NATE3 :NA : TE : ! 250 | 251 | PARAMETER G(NATE3,NA:TE;0) 298.15 +GHSERNA#+3*GHSERTE#-180000 252 | +7.9*T; 6000 N REF0 ! 253 | 254 | PHASE NA2SE % 2 2 1 ! 255 | CONSTITUENT NA2SE :NA : SE : ! 256 | 257 | PARAMETER G(NA2SE,NA:SE;0) 298.15 -332011.423+2*GHSERNA# 258 | +GHSERSE#; 6000 N REF0 ! 259 | 260 | 261 | PHASE NASE % 2 1 1 ! 262 | CONSTITUENT NASE :NA : SE : ! 263 | 264 | PARAMETER G(NASE,NA:SE;0) 298.15 -171553.708+GHSERSE# 265 | +GHSERNA#; 6000 N REF0 ! 266 | 267 | 268 | PHASE NASE2 % 2 1 2 ! 269 | CONSTITUENT NASE2 :NA : SE : ! 270 | 271 | PARAMETER G(NASE2,NA:SE;0) 298.15 -177728.87+2*GHSERSE# 272 | +GHSERNA#; 6000 N REF0 ! 273 | 274 | PHASE NA2S % 2 2 1 ! 275 | CONSTITUENT NA2S :NA : S : ! 
276 | 277 | PARAMETER G(NA2S,NA:S;0) 298.15 -360957.493+2*GHSERNA# 278 | +GHSERS#; 6000 N REF0 ! 279 | 280 | 281 | PHASE NAS % 2 1 1 ! 282 | CONSTITUENT NAS :NA : S : ! 283 | 284 | PARAMETER G(NAS,NA:S;0) 298.15 -199727.883+GHSERS# 285 | +GHSERNA#; 6000 N REF0 ! 286 | 287 | 288 | PHASE NAS2 % 2 1 2 ! 289 | CONSTITUENT NAS2 :NA : S : ! 290 | 291 | PARAMETER G(NAS2,NA:S;0) 298.15 -235910.47+2*GHSERS# 292 | +GHSERNA#; 6000 N REF0 ! 293 | 294 | 295 | LIST_OF_REFERENCES 296 | NUMBER SOURCE 297 | ! 298 | -------------------------------------------------------------------------------- /tests/data/cif/1548397.cif: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------ 2 | #$Date: 2017-12-06 04:31:51 +0200 (Wed, 06 Dec 2017) $ 3 | #$Revision: 204011 $ 4 | #$URL: file:///home/coder/svn-repositories/cod/cif/1/54/83/1548397.cif $ 5 | #------------------------------------------------------------------------------ 6 | # 7 | # This file is available in the Crystallography Open Database (COD), 8 | # http://www.crystallography.net/ 9 | # 10 | # All data on this site have been placed in the public domain by the 11 | # contributors. 12 | # 13 | data_1548397 14 | loop_ 15 | _publ_author_name 16 | 'Feng, Rui' 17 | 'Jia, Yan-Yuan' 18 | 'Li, Zhao-Yang' 19 | 'Chang, Ze' 20 | 'Bu, Xian-He' 21 | _publ_section_title 22 | ; 23 | Enhancing the stability and porosity of penetrated metal--organic 24 | frameworks through the insertion of coordination sites 25 | ; 26 | _journal_name_full 'Chemical Science' 27 | _journal_paper_doi 10.1039/C7SC04192F 28 | _journal_year 2018 29 | _chemical_formula_sum 'C36 H20 Co2 N6 O15' 30 | _chemical_formula_weight 894.44 31 | _chemical_name_systematic 32 | ; 33 | 34 | ? 
35 | 36 | ; 37 | _chemical_properties_physical Heat-sensitive 38 | _space_group_IT_number 227 39 | _symmetry_cell_setting cubic 40 | _symmetry_space_group_name_Hall '-F 4vw 2vw 3' 41 | _symmetry_space_group_name_H-M 'F d -3 m :2' 42 | _atom_sites_solution_hydrogens geom 43 | _atom_sites_solution_primary direct 44 | _atom_sites_solution_secondary difmap 45 | _audit_creation_method SHELXL-97 46 | _audit_update_record 47 | ; 48 | 2017-09-25 deposited with the CCDC. 49 | 2017-12-05 downloaded from the CCDC. 50 | ; 51 | _cell_angle_alpha 90.00 52 | _cell_angle_beta 90.00 53 | _cell_angle_gamma 90.00 54 | _cell_formula_units_Z 48 55 | _cell_length_a 46.6983(3) 56 | _cell_length_b 46.6983(3) 57 | _cell_length_c 46.6983(3) 58 | _cell_measurement_reflns_used 2483 59 | _cell_measurement_temperature 293(2) 60 | _cell_measurement_theta_max 61.2060 61 | _cell_measurement_theta_min 3.1350 62 | _cell_volume 101836.4(11) 63 | _computing_cell_refinement 'Bruker FRAMBO' 64 | _computing_data_collection 'Bruker FRAMBO' 65 | _computing_data_reduction 'Bruker SAINT' 66 | _computing_molecular_graphics 'Bruker SHELXTL' 67 | _computing_publication_material 'Bruker SHELXTL' 68 | _computing_structure_refinement 'SHELXL-97 (Sheldrick, 1997)' 69 | _computing_structure_solution 'SHELXS-97 (Sheldrick, 1990)' 70 | _diffrn_ambient_temperature 293(2) 71 | _diffrn_measured_fraction_theta_full 0.962 72 | _diffrn_measured_fraction_theta_max 0.962 73 | _diffrn_measurement_device_type 'multiwire proportional' 74 | _diffrn_measurement_method 'phi and omega scans' 75 | _diffrn_radiation_monochromator graphite 76 | _diffrn_radiation_source 'fine-focus sealed tube' 77 | _diffrn_radiation_type CuK\a 78 | _diffrn_radiation_wavelength 1.54178 79 | _diffrn_reflns_av_R_equivalents 0.1501 80 | _diffrn_reflns_av_sigmaI/netI 0.0721 81 | _diffrn_reflns_limit_h_max 43 82 | _diffrn_reflns_limit_h_min -38 83 | _diffrn_reflns_limit_k_max 20 84 | _diffrn_reflns_limit_k_min -49 85 | _diffrn_reflns_limit_l_max 36 86 | 
_diffrn_reflns_limit_l_min -39 87 | _diffrn_reflns_number 16582 88 | _diffrn_reflns_theta_full 54.94 89 | _diffrn_reflns_theta_max 54.94 90 | _diffrn_reflns_theta_min 3.79 91 | _exptl_absorpt_coefficient_mu 3.377 92 | _exptl_absorpt_correction_T_max 0.4409 93 | _exptl_absorpt_correction_T_min 0.3761 94 | _exptl_absorpt_correction_type multi-scan 95 | _exptl_absorpt_process_details SADABS 96 | _exptl_crystal_colour RED 97 | _exptl_crystal_density_diffrn 0.700 98 | _exptl_crystal_density_method 'not measured' 99 | _exptl_crystal_description BLOCK 100 | _exptl_crystal_F_000 21696 101 | _exptl_crystal_recrystallization_method 102 | 'Re-crystallisation from solvent: DMF, CH3CN and H2O' 103 | _exptl_crystal_size_max 0.36 104 | _exptl_crystal_size_mid 0.32 105 | _exptl_crystal_size_min 0.29 106 | _refine_diff_density_max 1.656 107 | _refine_diff_density_min -0.622 108 | _refine_diff_density_rms 0.156 109 | _refine_ls_extinction_coef 0.000072(17) 110 | _refine_ls_extinction_expression Fc^*^=kFc[1+0.001xFc^2^\l^3^/sin(2\q)]^-1/4^ 111 | _refine_ls_extinction_method SHELXL 112 | _refine_ls_goodness_of_fit_ref 1.059 113 | _refine_ls_hydrogen_treatment constr 114 | _refine_ls_matrix_type full 115 | _refine_ls_number_parameters 158 116 | _refine_ls_number_reflns 2913 117 | _refine_ls_number_restraints 105 118 | _refine_ls_restrained_S_all 1.081 119 | _refine_ls_R_factor_all 0.2094 120 | _refine_ls_R_factor_gt 0.1495 121 | _refine_ls_shift/su_max 0.002 122 | _refine_ls_shift/su_mean 0.000 123 | _refine_ls_structure_factor_coef Fsqd 124 | _refine_ls_weighting_details 125 | 'calc w=1/[\s^2^(Fo^2^)+(0.3300P)^2^+0.0000P] where P=(Fo^2^+2Fc^2^)/3' 126 | _refine_ls_weighting_scheme calc 127 | _refine_ls_wR_factor_gt 0.3603 128 | _refine_ls_wR_factor_ref 0.4440 129 | _reflns_number_gt 1649 130 | _reflns_number_total 2913 131 | _reflns_threshold_expression >2sigma(I) 132 | _cod_data_source_file c7sc04192f2.cif 133 | _cod_data_source_block aaa 134 | _cod_depositor_comments 135 | ; 136 | 
The following automatic conversions were performed: 137 | 138 | '_symmetry_cell_setting' value 'Cubic' changed to 'cubic' according 139 | to 140 | /home/data/users/saulius/crontab/automatic-downloads/rss-feeds/RSC/lib/dictionaries/cif_core.dic 141 | dictionary named 'cif_core.dic' version 2.4.2 from 2011-04-26. 142 | 143 | '_exptl_absorpt_correction_type' value 'MULTI-SCAN' changed to 144 | 'multi-scan' according to 145 | /home/data/users/saulius/crontab/automatic-downloads/rss-feeds/RSC/lib/dictionaries/cif_core.dic 146 | dictionary named 'cif_core.dic' version 2.4.2 from 2011-04-26. 147 | 148 | '_refine_ls_hydrogen_treatment' value 'CONSTR' changed to 'constr' 149 | according to 150 | /home/data/users/saulius/crontab/automatic-downloads/rss-feeds/RSC/lib/dictionaries/cif_core.dic 151 | dictionary named 'cif_core.dic' version 2.4.2 from 2011-04-26. 152 | 153 | Automatic conversion script 154 | Id: cif_fix_values 4973 2017-02-22 13:04:09Z antanas 155 | ; 156 | _cod_original_sg_symbol_H-M Fd-3m 157 | _cod_database_code 1548397 158 | loop_ 159 | _symmetry_equiv_pos_as_xyz 160 | 'x, y, z' 161 | '-x+3/4, -y+1/4, z+1/2' 162 | '-x+1/4, y+1/2, -z+3/4' 163 | 'x+1/2, -y+3/4, -z+1/4' 164 | 'z, x, y' 165 | 'z+1/2, -x+3/4, -y+1/4' 166 | '-z+3/4, -x+1/4, y+1/2' 167 | '-z+1/4, x+1/2, -y+3/4' 168 | 'y, z, x' 169 | '-y+1/4, z+1/2, -x+3/4' 170 | 'y+1/2, -z+3/4, -x+1/4' 171 | '-y+3/4, -z+1/4, x+1/2' 172 | 'y+3/4, x+1/4, -z+1/2' 173 | '-y, -x, -z' 174 | 'y+1/4, -x+1/2, z+3/4' 175 | '-y+1/2, x+3/4, z+1/4' 176 | 'x+3/4, z+1/4, -y+1/2' 177 | '-x+1/2, z+3/4, y+1/4' 178 | '-x, -z, -y' 179 | 'x+1/4, -z+1/2, y+3/4' 180 | 'z+3/4, y+1/4, -x+1/2' 181 | 'z+1/4, -y+1/2, x+3/4' 182 | '-z+1/2, y+3/4, x+1/4' 183 | '-z, -y, -x' 184 | 'x, y+1/2, z+1/2' 185 | '-x+3/4, -y+3/4, z+1' 186 | '-x+1/4, y+1, -z+5/4' 187 | 'x+1/2, -y+5/4, -z+3/4' 188 | 'z, x+1/2, y+1/2' 189 | 'z+1/2, -x+5/4, -y+3/4' 190 | '-z+3/4, -x+3/4, y+1' 191 | '-z+1/4, x+1, -y+5/4' 192 | 'y, z+1/2, x+1/2' 193 | '-y+1/4, z+1, -x+5/4' 194 
| 'y+1/2, -z+5/4, -x+3/4' 195 | '-y+3/4, -z+3/4, x+1' 196 | 'y+3/4, x+3/4, -z+1' 197 | '-y, -x+1/2, -z+1/2' 198 | 'y+1/4, -x+1, z+5/4' 199 | '-y+1/2, x+5/4, z+3/4' 200 | 'x+3/4, z+3/4, -y+1' 201 | '-x+1/2, z+5/4, y+3/4' 202 | '-x, -z+1/2, -y+1/2' 203 | 'x+1/4, -z+1, y+5/4' 204 | 'z+3/4, y+3/4, -x+1' 205 | 'z+1/4, -y+1, x+5/4' 206 | '-z+1/2, y+5/4, x+3/4' 207 | '-z, -y+1/2, -x+1/2' 208 | 'x+1/2, y, z+1/2' 209 | '-x+5/4, -y+1/4, z+1' 210 | '-x+3/4, y+1/2, -z+5/4' 211 | 'x+1, -y+3/4, -z+3/4' 212 | 'z+1/2, x, y+1/2' 213 | 'z+1, -x+3/4, -y+3/4' 214 | '-z+5/4, -x+1/4, y+1' 215 | '-z+3/4, x+1/2, -y+5/4' 216 | 'y+1/2, z, x+1/2' 217 | '-y+3/4, z+1/2, -x+5/4' 218 | 'y+1, -z+3/4, -x+3/4' 219 | '-y+5/4, -z+1/4, x+1' 220 | 'y+5/4, x+1/4, -z+1' 221 | '-y+1/2, -x, -z+1/2' 222 | 'y+3/4, -x+1/2, z+5/4' 223 | '-y+1, x+3/4, z+3/4' 224 | 'x+5/4, z+1/4, -y+1' 225 | '-x+1, z+3/4, y+3/4' 226 | '-x+1/2, -z, -y+1/2' 227 | 'x+3/4, -z+1/2, y+5/4' 228 | 'z+5/4, y+1/4, -x+1' 229 | 'z+3/4, -y+1/2, x+5/4' 230 | '-z+1, y+3/4, x+3/4' 231 | '-z+1/2, -y, -x+1/2' 232 | 'x+1/2, y+1/2, z' 233 | '-x+5/4, -y+3/4, z+1/2' 234 | '-x+3/4, y+1, -z+3/4' 235 | 'x+1, -y+5/4, -z+1/4' 236 | 'z+1/2, x+1/2, y' 237 | 'z+1, -x+5/4, -y+1/4' 238 | '-z+5/4, -x+3/4, y+1/2' 239 | '-z+3/4, x+1, -y+3/4' 240 | 'y+1/2, z+1/2, x' 241 | '-y+3/4, z+1, -x+3/4' 242 | 'y+1, -z+5/4, -x+1/4' 243 | '-y+5/4, -z+3/4, x+1/2' 244 | 'y+5/4, x+3/4, -z+1/2' 245 | '-y+1/2, -x+1/2, -z' 246 | 'y+3/4, -x+1, z+3/4' 247 | '-y+1, x+5/4, z+1/4' 248 | 'x+5/4, z+3/4, -y+1/2' 249 | '-x+1, z+5/4, y+1/4' 250 | '-x+1/2, -z+1/2, -y' 251 | 'x+3/4, -z+1, y+3/4' 252 | 'z+5/4, y+3/4, -x+1/2' 253 | 'z+3/4, -y+1, x+3/4' 254 | '-z+1, y+5/4, x+1/4' 255 | '-z+1/2, -y+1/2, -x' 256 | '-x, -y, -z' 257 | 'x-3/4, y-1/4, -z-1/2' 258 | 'x-1/4, -y-1/2, z-3/4' 259 | '-x-1/2, y-3/4, z-1/4' 260 | '-z, -x, -y' 261 | '-z-1/2, x-3/4, y-1/4' 262 | 'z-3/4, x-1/4, -y-1/2' 263 | 'z-1/4, -x-1/2, y-3/4' 264 | '-y, -z, -x' 265 | 'y-1/4, -z-1/2, x-3/4' 266 | '-y-1/2, z-3/4, x-1/4' 267 | 
'y-3/4, z-1/4, -x-1/2' 268 | '-y-3/4, -x-1/4, z-1/2' 269 | 'y, x, z' 270 | '-y-1/4, x-1/2, -z-3/4' 271 | 'y-1/2, -x-3/4, -z-1/4' 272 | '-x-3/4, -z-1/4, y-1/2' 273 | 'x-1/2, -z-3/4, -y-1/4' 274 | 'x, z, y' 275 | '-x-1/4, z-1/2, -y-3/4' 276 | '-z-3/4, -y-1/4, x-1/2' 277 | '-z-1/4, y-1/2, -x-3/4' 278 | 'z-1/2, -y-3/4, -x-1/4' 279 | 'z, y, x' 280 | '-x, -y+1/2, -z+1/2' 281 | 'x-3/4, y+1/4, -z' 282 | 'x-1/4, -y, z-1/4' 283 | '-x-1/2, y-1/4, z+1/4' 284 | '-z, -x+1/2, -y+1/2' 285 | '-z-1/2, x-1/4, y+1/4' 286 | 'z-3/4, x+1/4, -y' 287 | 'z-1/4, -x, y-1/4' 288 | '-y, -z+1/2, -x+1/2' 289 | 'y-1/4, -z, x-1/4' 290 | '-y-1/2, z-1/4, x+1/4' 291 | 'y-3/4, z+1/4, -x' 292 | '-y-3/4, -x+1/4, z' 293 | 'y, x+1/2, z+1/2' 294 | '-y-1/4, x, -z-1/4' 295 | 'y-1/2, -x-1/4, -z+1/4' 296 | '-x-3/4, -z+1/4, y' 297 | 'x-1/2, -z-1/4, -y+1/4' 298 | 'x, z+1/2, y+1/2' 299 | '-x-1/4, z, -y-1/4' 300 | '-z-3/4, -y+1/4, x' 301 | '-z-1/4, y, -x-1/4' 302 | 'z-1/2, -y-1/4, -x+1/4' 303 | 'z, y+1/2, x+1/2' 304 | '-x+1/2, -y, -z+1/2' 305 | 'x-1/4, y-1/4, -z' 306 | 'x+1/4, -y-1/2, z-1/4' 307 | '-x, y-3/4, z+1/4' 308 | '-z+1/2, -x, -y+1/2' 309 | '-z, x-3/4, y+1/4' 310 | 'z-1/4, x-1/4, -y' 311 | 'z+1/4, -x-1/2, y-1/4' 312 | '-y+1/2, -z, -x+1/2' 313 | 'y+1/4, -z-1/2, x-1/4' 314 | '-y, z-3/4, x+1/4' 315 | 'y-1/4, z-1/4, -x' 316 | '-y-1/4, -x-1/4, z' 317 | 'y+1/2, x, z+1/2' 318 | '-y+1/4, x-1/2, -z-1/4' 319 | 'y, -x-3/4, -z+1/4' 320 | '-x-1/4, -z-1/4, y' 321 | 'x, -z-3/4, -y+1/4' 322 | 'x+1/2, z, y+1/2' 323 | '-x+1/4, z-1/2, -y-1/4' 324 | '-z-1/4, -y-1/4, x' 325 | '-z+1/4, y-1/2, -x-1/4' 326 | 'z, -y-3/4, -x+1/4' 327 | 'z+1/2, y, x+1/2' 328 | '-x+1/2, -y+1/2, -z' 329 | 'x-1/4, y+1/4, -z-1/2' 330 | 'x+1/4, -y, z-3/4' 331 | '-x, y-1/4, z-1/4' 332 | '-z+1/2, -x+1/2, -y' 333 | '-z, x-1/4, y-1/4' 334 | 'z-1/4, x+1/4, -y-1/2' 335 | 'z+1/4, -x, y-3/4' 336 | '-y+1/2, -z+1/2, -x' 337 | 'y+1/4, -z, x-3/4' 338 | '-y, z-1/4, x-1/4' 339 | 'y-1/4, z+1/4, -x-1/2' 340 | '-y-1/4, -x+1/4, z-1/2' 341 | 'y+1/2, x+1/2, z' 342 | '-y+1/4, 
x, -z-3/4' 343 | 'y, -x-1/4, -z-1/4' 344 | '-x-1/4, -z+1/4, y-1/2' 345 | 'x, -z-1/4, -y-1/4' 346 | 'x+1/2, z+1/2, y' 347 | '-x+1/4, z, -y-3/4' 348 | '-z-1/4, -y+1/4, x-1/2' 349 | '-z+1/4, y, -x-3/4' 350 | 'z, -y-1/4, -x-1/4' 351 | 'z+1/2, y+1/2, x' 352 | loop_ 353 | _atom_site_label 354 | _atom_site_type_symbol 355 | _atom_site_fract_x 356 | _atom_site_fract_y 357 | _atom_site_fract_z 358 | _atom_site_U_iso_or_equiv 359 | _atom_site_adp_type 360 | _atom_site_occupancy 361 | _atom_site_symmetry_multiplicity 362 | _atom_site_calc_flag 363 | _atom_site_refinement_flags 364 | _atom_site_disorder_assembly 365 | _atom_site_disorder_group 366 | Co2 Co 0.09666(17) 0.15054(17) 0.93636(8) 0.0352(10) Uani 0.25 1 d P A -1 367 | O6 O 0.0730(7) 0.1833(7) 0.9152(4) 0.058(8) Uani 0.25 1 d PU A -1 368 | H1W H 0.0754 0.1749 0.8992 0.086 Uiso 0.50 2 d SPR A -1 369 | H2W H 0.0659 0.2000 0.9136 0.086 Uiso 0.25 1 d PR A -1 370 | Co1 Co 0.1250 0.1250 1.00490(5) 0.0352(10) Uani 1 4 d S . . 371 | O1 O 0.12508(15) 0.16855(13) 1.00669(12) 0.061(2) Uani 1 1 d . . . 372 | O2 O 0.1173(3) 0.1832(2) 0.9609(2) 0.122(3) Uani 1 1 d U . . 373 | O3 O 0.1505(3) 0.3241(3) 0.9864(3) 0.162(5) Uani 1 1 d DU A . 374 | O4 O 0.1250 0.1250 1.0507(2) 0.087(5) Uani 1 4 d S . . 375 | H3W H 0.1079 0.1266 1.0568 0.130 Uiso 0.50 1 d PR . . 376 | O5 O 0.1250 0.1250 0.95858(19) 0.034(2) Uani 1 4 d S . . 377 | H5A H 0.1418 0.1293 0.9532 0.051 Uiso 0.25 1 d PR A . 378 | H5B H 0.1220 0.1079 0.9532 0.051 Uiso 0.25 1 d PR . . 379 | N1 N 0.1543(3) 0.3709(3) 0.9106(3) 0.107(3) Uani 1 1 d U A . 380 | N2 N 0.1268(4) 0.2886(2) 0.9614(2) 0.106(4) Uani 1 2 d SDU . . 381 | C1 C 0.1225(3) 0.1869(2) 0.9874(2) 0.080(4) Uani 1 1 d . A . 382 | C2 C 0.1242(3) 0.22567(18) 1.02433(18) 0.056(4) Uani 1 2 d S . . 383 | H2 H 0.1238 0.2116 1.0384 0.067 Uiso 1 2 calc SR . . 384 | C3 C 0.1245(2) 0.21790(19) 0.99607(18) 0.066(3) Uani 1 1 d . . . 385 | C4 C 0.1239(3) 0.2393(2) 0.9750(2) 0.083(4) Uani 1 1 d . A . 
386 | H4A H 0.1219 0.2344 0.9558 0.099 Uiso 1 1 calc R . . 387 | C5 C 0.1263(4) 0.26725(17) 0.98275(17) 0.076(5) Uani 1 2 d SD . . 388 | C6 C 0.1375(4) 0.3154(3) 0.9647(4) 0.123(4) Uani 1 1 d DU . . 389 | C7 C 0.1331(4) 0.3342(3) 0.9384(3) 0.119(3) Uani 1 1 d U . . 390 | C8 C 0.1102(4) 0.3314(3) 0.9186(3) 0.101(6) Uani 1 2 d S . . 391 | H8 H 0.0921 0.3238 0.9262 0.121 Uiso 1 2 calc SR A . 392 | C9 C 0.1080(3) 0.3522(2) 0.8978(2) 0.077(5) Uani 1 2 d S . . 393 | H9 H 0.0887 0.3580 0.8920 0.092 Uiso 1 2 calc SR . . 394 | C10 C 0.1299(3) 0.3725(3) 0.8947(3) 0.107(4) Uani 1 1 d U . . 395 | C11 C 0.1554(4) 0.3522(3) 0.9324(4) 0.114(3) Uani 1 1 d U . . 396 | H11 H 0.1717 0.3514 0.9437 0.137 Uiso 1 1 calc R A . 397 | loop_ 398 | _atom_site_aniso_label 399 | _atom_site_aniso_U_11 400 | _atom_site_aniso_U_22 401 | _atom_site_aniso_U_33 402 | _atom_site_aniso_U_23 403 | _atom_site_aniso_U_13 404 | _atom_site_aniso_U_12 405 | Co2 0.039(6) 0.047(6) 0.0199(12) 0.004(4) -0.004(4) -0.0137(9) 406 | O6 0.061(12) 0.052(12) 0.060(9) 0.009(8) -0.002(8) 0.000(9) 407 | Co1 0.039(6) 0.047(6) 0.0199(12) 0.004(4) -0.004(4) -0.0137(9) 408 | O1 0.107(6) 0.043(4) 0.035(3) -0.010(3) 0.009(3) -0.016(3) 409 | O2 0.203(8) 0.085(5) 0.078(6) -0.001(4) 0.004(5) -0.035(5) 410 | O3 0.172(8) 0.171(8) 0.143(7) 0.015(6) -0.018(6) -0.034(6) 411 | O4 0.121(8) 0.121(8) 0.019(6) 0.000 0.000 -0.011(10) 412 | O5 0.038(4) 0.038(4) 0.027(5) 0.000 0.000 -0.001(4) 413 | N1 0.100(5) 0.112(6) 0.109(6) 0.021(4) 0.001(4) 0.001(5) 414 | N2 0.128(7) 0.095(4) 0.095(4) 0.016(5) 0.005(4) -0.005(4) 415 | C1 0.148(12) 0.069(7) 0.023(5) -0.010(5) 0.013(6) -0.032(7) 416 | C2 0.098(11) 0.035(5) 0.035(5) 0.009(5) 0.009(4) -0.009(4) 417 | C3 0.129(10) 0.037(5) 0.031(5) 0.002(4) 0.016(5) -0.020(5) 418 | C4 0.140(11) 0.066(7) 0.042(6) 0.003(5) 0.004(6) -0.023(7) 419 | C5 0.146(16) 0.041(5) 0.041(5) 0.011(6) 0.018(6) -0.018(6) 420 | C6 0.139(6) 0.115(6) 0.116(6) 0.013(5) 0.009(5) -0.014(5) 421 | C7 0.128(6) 0.114(6) 0.116(6) 0.014(5) 
0.014(5) -0.009(5) 422 | C8 0.072(11) 0.115(10) 0.115(10) 0.050(13) 0.024(7) -0.024(7) 423 | C9 0.051(9) 0.090(7) 0.090(7) 0.028(9) 0.014(5) -0.014(5) 424 | C10 0.098(6) 0.113(7) 0.110(7) 0.019(6) -0.004(5) -0.007(5) 425 | C11 0.116(6) 0.113(6) 0.113(6) 0.022(5) 0.002(5) -0.004(5) 426 | loop_ 427 | _atom_type_symbol 428 | _atom_type_description 429 | _atom_type_scat_dispersion_real 430 | _atom_type_scat_dispersion_imag 431 | _atom_type_scat_source 432 | C C 0.0181 0.0091 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 433 | H H 0.0000 0.0000 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 434 | N N 0.0311 0.0180 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 435 | O O 0.0492 0.0322 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 436 | Co Co -2.3653 3.6143 'International Tables Vol C Tables 4.2.6.8 and 6.1.1.4' 437 | loop_ 438 | _geom_angle_atom_site_label_1 439 | _geom_angle_atom_site_label_2 440 | _geom_angle_atom_site_label_3 441 | _geom_angle 442 | _geom_angle_site_symmetry_1 443 | _geom_angle_site_symmetry_3 444 | O2 Co2 O5 85.3(4) 133_655 . 445 | O2 Co2 N1 91.3(6) 133_655 71_545 446 | O5 Co2 N1 94.7(5) . 71_545 447 | O2 Co2 O6 101.8(10) 133_655 . 448 | O5 Co2 O6 169.1(13) . . 449 | N1 Co2 O6 93.4(9) 71_545 . 450 | O2 Co2 O2 107.7(6) 133_655 . 451 | O5 Co2 O2 81.6(4) . . 452 | N1 Co2 O2 160.1(6) 71_545 . 453 | O6 Co2 O2 88.3(11) . . 454 | O2 Co2 N1 166.7(6) 133_655 107_656 455 | O5 Co2 N1 91.4(5) . 107_656 456 | N1 Co2 N1 76.1(7) 71_545 107_656 457 | O6 Co2 N1 83.5(9) . 107_656 458 | O2 Co2 N1 84.4(6) . 107_656 459 | Co2 O6 H1W 90.8 . . 460 | Co2 O6 H2W 154.3 . . 461 | H1W O6 H2W 113.2 . . 462 | O1 Co1 O1 90.1(4) 110 50_454 463 | O1 Co1 O1 89.7(4) 110 . 464 | O1 Co1 O1 175.3(3) 50_454 . 465 | O1 Co1 O1 175.3(3) 110 133_655 466 | O1 Co1 O1 89.7(4) 50_454 133_655 467 | O1 Co1 O1 90.1(4) . 133_655 468 | O1 Co1 O4 87.64(16) 110 . 469 | O1 Co1 O4 87.64(16) 50_454 . 470 | O1 Co1 O4 87.64(16) . . 471 | O1 Co1 O4 87.64(16) 133_655 . 
472 | O1 Co1 O5 92.36(16) 110 . 473 | O1 Co1 O5 92.36(16) 50_454 . 474 | O1 Co1 O5 92.36(16) . . 475 | O1 Co1 O5 92.36(16) 133_655 . 476 | O4 Co1 O5 180.000(3) . . 477 | C1 O1 Co1 131.0(6) . . 478 | C1 O2 Co2 137.5(8) . 133_655 479 | C1 O2 Co2 134.9(8) . . 480 | Co2 O2 Co2 3.0(4) 133_655 . 481 | Co1 O4 H3W 109.5 . . 482 | Co2 O5 Co2 5.2(6) 50_454 110 483 | Co2 O5 Co2 119.6(5) 50_454 . 484 | Co2 O5 Co2 119.3(5) 110 . 485 | Co2 O5 Co2 119.3(5) 50_454 133_655 486 | Co2 O5 Co2 119.6(5) 110 133_655 487 | Co2 O5 Co2 5.2(6) . 133_655 488 | Co2 O5 Co1 120.2(2) 50_454 . 489 | Co2 O5 Co1 120.2(2) 110 . 490 | Co2 O5 Co1 120.2(2) . . 491 | Co2 O5 Co1 120.2(2) 133_655 . 492 | Co2 O5 H5A 52.8 50_454 . 493 | Co2 O5 H5A 57.9 110 . 494 | Co2 O5 H5A 107.9 . . 495 | Co2 O5 H5A 103.5 133_655 . 496 | Co1 O5 H5A 107.2 . . 497 | Co2 O5 H5B 54.1 50_454 . 498 | Co2 O5 H5B 49.0 110 . 499 | Co2 O5 H5B 106.7 . . 500 | Co2 O5 H5B 111.0 133_655 . 501 | Co1 O5 H5B 107.2 . . 502 | H5A O5 H5B 106.9 . . 503 | C11 N1 C10 118.7(13) . . 504 | C11 N1 Co2 122.2(11) . 69_355 505 | C10 N1 Co2 118.7(9) . 69_355 506 | C11 N1 Co2 126.1(11) . 152_466 507 | C10 N1 Co2 114.7(9) . 152_466 508 | Co2 N1 Co2 3.9(5) 69_355 152_466 509 | C6 N2 C6 94.2(17) . 162_576 510 | C6 N2 C5 125.3(11) . . 511 | C6 N2 C5 125.3(11) 162_576 . 512 | O1 C1 O2 128.6(10) . . 513 | O1 C1 C3 117.5(8) . . 514 | O2 C1 C3 113.9(10) . . 515 | C3 C2 C3 120.7(11) . 162_576 516 | C3 C2 H2 119.6 . . 517 | C3 C2 H2 119.6 162_576 . 518 | C2 C3 C4 119.1(9) . . 519 | C2 C3 C1 120.9(8) . . 520 | C4 C3 C1 119.7(8) . . 521 | C5 C4 C3 119.8(10) . . 522 | C5 C4 H4A 120.1 . . 523 | C3 C4 H4A 120.1 . . 524 | C4 C5 C4 120.3(12) 162_576 . 525 | C4 C5 N2 119.5(7) 162_576 . 526 | C4 C5 N2 119.5(6) . . 527 | O3 C6 N2 125.0(16) . . 528 | O3 C6 C7 122.0(14) . . 529 | N2 C6 C7 112.9(15) . . 530 | O3 C6 C6 143.6(11) . 162_576 531 | N2 C6 C6 42.9(9) . 162_576 532 | C7 C6 C6 80.6(8) . 162_576 533 | C11 C7 C8 119.8(16) . . 534 | C11 C7 C7 106.8(10) . 
162_576 535 | C8 C7 C7 58.3(8) . 162_576 536 | C11 C7 C6 114.7(16) . . 537 | C8 C7 C6 124.9(15) . . 538 | C7 C7 C6 99.4(8) 162_576 . 539 | C9 C8 C7 116.6(13) . 162_576 540 | C9 C8 C7 116.6(13) . . 541 | C7 C8 C7 63.3(15) 162_576 . 542 | C9 C8 H8 116.6 . . 543 | C7 C8 H8 116.6 162_576 . 544 | C7 C8 H8 116.6 . . 545 | C8 C9 C10 119.8(14) . . 546 | C8 C9 C10 119.8(14) . 162_576 547 | C10 C9 C10 48.0(12) . 162_576 548 | C8 C9 H9 117.1 . . 549 | C10 C9 H9 117.1 . . 550 | C10 C9 H9 117.1 162_576 . 551 | C10 C10 N1 110.1(8) 162_576 . 552 | C10 C10 C9 66.0(6) 162_576 . 553 | N1 C10 C9 121.1(13) . . 554 | C10 C10 C10 90.000(10) 162_576 139_545 555 | N1 C10 C10 115.1(7) . 139_545 556 | C9 C10 C10 123.4(9) . 139_545 557 | C10 C10 C10 52.1(8) 162_576 52_456 558 | N1 C10 C10 123.1(8) . 52_456 559 | C9 C10 C10 100.6(10) . 52_456 560 | C10 C10 C10 37.9(8) 139_545 52_456 561 | N1 C11 C7 121.9(16) . . 562 | N1 C11 H11 119.1 . . 563 | C7 C11 H11 119.1 . . 564 | loop_ 565 | _geom_bond_atom_site_label_1 566 | _geom_bond_atom_site_label_2 567 | _geom_bond_distance 568 | _geom_bond_site_symmetry_2 569 | Co2 O2 1.987(13) 133_655 570 | Co2 O5 2.062(6) . 571 | Co2 N1 2.067(14) 71_545 572 | Co2 O6 2.13(2) . 573 | Co2 O2 2.137(13) . 574 | Co2 N1 2.181(14) 107_656 575 | O6 H1W 0.8497 . 576 | O6 H2W 0.8501 . 577 | Co1 O1 2.035(6) 110 578 | Co1 O1 2.035(6) 50_454 579 | Co1 O1 2.035(6) . 580 | Co1 O1 2.035(6) 133_655 581 | Co1 O4 2.140(12) . 582 | Co1 O5 2.163(9) . 583 | O1 C1 1.250(12) . 584 | O2 C1 1.274(13) . 585 | O2 Co2 1.987(13) 133_655 586 | O3 C6 1.248(14) . 587 | O4 H3W 0.8499 . 588 | O5 Co2 2.062(6) 50_454 589 | O5 Co2 2.062(6) 110 590 | O5 Co2 2.062(6) 133_655 591 | O5 H5A 0.8501 . 592 | O5 H5B 0.8500 . 593 | N1 C11 1.344(17) . 594 | N1 C10 1.362(17) . 595 | N1 Co2 2.067(14) 69_355 596 | N1 Co2 2.181(14) 152_466 597 | N2 C6 1.357(14) . 598 | N2 C6 1.357(14) 162_576 599 | N2 C5 1.408(14) . 600 | C1 C3 1.504(14) . 601 | C2 C3 1.369(11) . 
602 | C2 C3 1.369(11) 162_576 603 | C2 H2 0.9300 . 604 | C3 C4 1.402(14) . 605 | C4 C5 1.359(13) . 606 | C4 H4A 0.9300 . 607 | C5 C4 1.359(13) 162_576 608 | C6 C7 1.52(2) . 609 | C6 C6 1.99(3) 162_576 610 | C7 C11 1.37(2) . 611 | C7 C8 1.42(2) . 612 | C7 C7 1.49(3) 162_576 613 | C8 C9 1.38(2) . 614 | C8 C7 1.42(2) 162_576 615 | C8 H8 0.9800 . 616 | C9 C10 1.401(17) . 617 | C9 C10 1.401(17) 162_576 618 | C9 H9 0.9800 . 619 | C10 C10 1.14(3) 162_576 620 | C10 C10 1.46(3) 139_545 621 | C10 C10 1.86(3) 52_456 622 | C11 H11 0.9300 . 623 | loop_ 624 | _geom_torsion_atom_site_label_1 625 | _geom_torsion_atom_site_label_2 626 | _geom_torsion_atom_site_label_3 627 | _geom_torsion_atom_site_label_4 628 | _geom_torsion 629 | _geom_torsion_site_symmetry_1 630 | _geom_torsion_site_symmetry_4 631 | O1 Co1 O1 C1 99.8(10) 110 . 632 | O1 Co1 O1 C1 -172.6(10) 50_454 . 633 | O1 Co1 O1 C1 -84.9(10) 133_655 . 634 | O4 Co1 O1 C1 -172.6(10) . . 635 | O5 Co1 O1 C1 7.4(10) . . 636 | O2 Co2 O2 C1 30.1(18) 133_655 . 637 | O5 Co2 O2 C1 -52.2(14) . . 638 | N1 Co2 O2 C1 -132.7(17) 71_545 . 639 | O6 Co2 O2 C1 132.0(16) . . 640 | N1 Co2 O2 C1 -144.4(15) 107_656 . 641 | O2 Co2 O2 Co2 180.0(5) 133_655 133_655 642 | O5 Co2 O2 Co2 97.7(6) . 133_655 643 | N1 Co2 O2 Co2 17.2(18) 71_545 133_655 644 | O6 Co2 O2 Co2 -78.1(9) . 133_655 645 | N1 Co2 O2 Co2 5.5(6) 107_656 133_655 646 | O2 Co2 O5 Co2 128.5(5) 133_655 50_454 647 | N1 Co2 O5 Co2 37.5(4) 71_545 50_454 648 | O6 Co2 O5 Co2 -100(4) . 50_454 649 | O2 Co2 O5 Co2 -122.8(4) . 50_454 650 | N1 Co2 O5 Co2 -38.7(3) 107_656 50_454 651 | O2 Co2 O5 Co2 122.6(5) 133_655 110 652 | N1 Co2 O5 Co2 31.6(8) 71_545 110 653 | O6 Co2 O5 Co2 -106(3) . 110 654 | O2 Co2 O5 Co2 -128.7(5) . 110 655 | N1 Co2 O5 Co2 -44.6(8) 107_656 110 656 | O2 Co2 O5 Co2 -143.0(4) 133_655 133_655 657 | N1 Co2 O5 Co2 126.0(4) 71_545 133_655 658 | O6 Co2 O5 Co2 -12(4) . 133_655 659 | O2 Co2 O5 Co2 -34.3(3) . 
133_655 660 | N1 Co2 O5 Co2 49.8(4) 107_656 133_655 661 | O2 Co2 O5 Co1 -51.5(5) 133_655 . 662 | N1 Co2 O5 Co1 -142.5(4) 71_545 . 663 | O6 Co2 O5 Co1 80(4) . . 664 | O2 Co2 O5 Co1 57.2(4) . . 665 | N1 Co2 O5 Co1 141.3(3) 107_656 . 666 | O1 Co1 O5 Co2 42.1(4) 110 50_454 667 | O1 Co1 O5 Co2 -48.1(4) 50_454 50_454 668 | O1 Co1 O5 Co2 131.9(4) . 50_454 669 | O1 Co1 O5 Co2 -137.9(4) 133_655 50_454 670 | O4 Co1 O5 Co2 -139(100) . 50_454 671 | O1 Co1 O5 Co2 48.1(4) 110 110 672 | O1 Co1 O5 Co2 -42.1(4) 50_454 110 673 | O1 Co1 O5 Co2 137.9(4) . 110 674 | O1 Co1 O5 Co2 -131.9(4) 133_655 110 675 | O4 Co1 O5 Co2 -133(100) . 110 676 | O1 Co1 O5 Co2 -137.9(4) 110 . 677 | O1 Co1 O5 Co2 131.9(4) 50_454 . 678 | O1 Co1 O5 Co2 -48.1(4) . . 679 | O1 Co1 O5 Co2 42.1(4) 133_655 . 680 | O4 Co1 O5 Co2 41(100) . . 681 | O1 Co1 O5 Co2 -131.9(4) 110 133_655 682 | O1 Co1 O5 Co2 137.9(4) 50_454 133_655 683 | O1 Co1 O5 Co2 -42.1(4) . 133_655 684 | O1 Co1 O5 Co2 48.1(4) 133_655 133_655 685 | O4 Co1 O5 Co2 47(100) . 133_655 686 | Co1 O1 C1 O2 5(2) . . 687 | Co1 O1 C1 C3 -178.1(7) . . 688 | Co2 O2 C1 O1 23(2) 133_655 . 689 | Co2 O2 C1 O1 25(2) . . 690 | Co2 O2 C1 C3 -154.3(11) 133_655 . 691 | Co2 O2 C1 C3 -152.0(10) . . 692 | C3 C2 C3 C4 -2(2) 162_576 . 693 | C3 C2 C3 C1 -176.5(9) 162_576 . 694 | O1 C1 C3 C2 -10.8(18) . . 695 | O2 C1 C3 C2 166.7(13) . . 696 | O1 C1 C3 C4 175.1(12) . . 697 | O2 C1 C3 C4 -7.4(17) . . 698 | C2 C3 C4 C5 7(2) . . 699 | C1 C3 C4 C5 -178.6(13) . . 700 | C3 C4 C5 C4 -12(3) . 162_576 701 | C3 C4 C5 N2 177.7(14) . . 702 | C6 N2 C5 C4 31(3) . 162_576 703 | C6 N2 C5 C4 158.6(16) 162_576 162_576 704 | C6 N2 C5 C4 -158.6(16) . . 705 | C6 N2 C5 C4 -31(3) 162_576 . 706 | C6 N2 C6 O3 -133.6(15) 162_576 . 707 | C5 N2 C6 O3 6(3) . . 708 | C6 N2 C6 C7 44(2) 162_576 . 709 | C5 N2 C6 C7 -176.0(16) . . 710 | C5 N2 C6 C6 140(3) . 162_576 711 | O3 C6 C7 C11 36(2) . . 712 | N2 C6 C7 C11 -142.4(16) . . 713 | C6 C6 C7 C11 -113.5(14) 162_576 . 714 | O3 C6 C7 C8 -153.1(17) . . 
715 | N2 C6 C7 C8 29(2) . . 716 | C6 C6 C7 C8 57.8(13) 162_576 . 717 | O3 C6 C7 C7 149.2(16) . 162_576 718 | N2 C6 C7 C7 -28.9(15) . 162_576 719 | C6 C6 C7 C7 0.000(6) 162_576 162_576 720 | C11 C7 C8 C9 -16(2) . . 721 | C7 C7 C8 C9 -108.0(12) 162_576 . 722 | C6 C7 C8 C9 173.4(13) . . 723 | C11 C7 C8 C7 92.2(15) . 162_576 724 | C6 C7 C8 C7 -78.6(16) . 162_576 725 | C7 C8 C9 C10 -63.9(12) 162_576 . 726 | C7 C8 C9 C10 8.0(15) . . 727 | C7 C8 C9 C10 -8.0(15) 162_576 162_576 728 | C7 C8 C9 C10 63.9(12) . 162_576 729 | C11 N1 C10 C10 -83.8(13) . 162_576 730 | Co2 N1 C10 C10 103.6(8) 69_355 162_576 731 | Co2 N1 C10 C10 103.2(7) 152_466 162_576 732 | C11 N1 C10 C9 -10(2) . . 733 | Co2 N1 C10 C9 177.0(9) 69_355 . 734 | Co2 N1 C10 C9 176.7(9) 152_466 . 735 | C11 N1 C10 C10 176.3(11) . 139_545 736 | Co2 N1 C10 C10 3.7(12) 69_355 139_545 737 | Co2 N1 C10 C10 3.4(11) 152_466 139_545 738 | C11 N1 C10 C10 -141.0(14) . 52_456 739 | Co2 N1 C10 C10 46.4(18) 69_355 52_456 740 | Co2 N1 C10 C10 46.1(18) 152_466 52_456 741 | C8 C9 C10 C10 104.8(10) . 162_576 742 | C8 C9 C10 N1 4.9(17) . . 743 | C10 C9 C10 N1 -99.8(13) 162_576 . 744 | C8 C9 C10 C10 177.7(5) . 139_545 745 | C10 C9 C10 C10 72.9(7) 162_576 139_545 746 | C8 C9 C10 C10 144.6(7) . 52_456 747 | C10 C9 C10 C10 39.8(10) 162_576 52_456 748 | C10 N1 C11 C7 2(2) . . 749 | Co2 N1 C11 C7 174.7(12) 69_355 . 750 | Co2 N1 C11 C7 174.4(12) 152_466 . 751 | C8 C7 C11 N1 11(2) . . 752 | C7 C7 C11 N1 73.6(16) 162_576 . 753 | C6 C7 C11 N1 -177.3(14) . . 754 | --------------------------------------------------------------------------------