├── tests ├── __init__.py ├── test_parser.py ├── test_get_namespaces.py └── test.xml ├── setup.py ├── .vscode └── settings.json ├── README.md ├── pyproject.toml ├── LICENSE ├── .github └── workflows │ └── publish_on_pipy.yml ├── .gitignore └── src └── schema_st4_parser └── __init__.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup,find_packages 2 | import os 3 | 4 | setup( 5 | version=os.environ.get("PACKAGE_VERSION","0.0.0"), 6 | package_dir={"":"src"}, 7 | packages=find_packages(where="./src", exclude=("*.tests", "*.tests.*", "tests.*", "tests")) 8 | ) -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.unittestArgs": [ 3 | "-v", 4 | "-s", 5 | "./tests", 6 | "-p", 7 | "test_*.py" 8 | ], 9 | "python.testing.pytestEnabled": true, 10 | "python.testing.unittestEnabled": false, 11 | "python.testing.pytestArgs": [ 12 | "tests" 13 | ] 14 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Schema ST4 Python Parser 2 | 3 | Parse [Schema ST4](https://www.quanos-content-solutions.com/en/software/schema-st4) XML files into simple, flat Python objects. 4 | 5 | ## Installation 6 | 7 | via pip: `pip install schema-st4-parser` 8 | 9 | ## Usage 10 | Simply pass the XML file to the `parse` function. A list of `St4Entry` objects will be returned.
11 | 12 | ```python 13 | from schema_st4_parser import parse, St4Entry 14 | 15 | entries = parse("MyFile.xml") 16 | print(entries[0]) 17 | ``` 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "schema_st4_parser" 7 | dynamic = ["version"] 8 | authors = [ 9 | { name="Lukas Kreussel"}, 10 | ] 11 | description = "Parse Schema ST4 XML files into Python objects" 12 | readme = "README.md" 13 | license = { file="LICENSE" } 14 | requires-python = ">=3.8" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | ] 20 | 21 | [project.urls] 22 | "Homepage" = "https://github.com/LLukas22/ST4-Python-Parser" 23 | "Bug Tracker" = "https://github.com/LLukas22/ST4-Python-Parser/issues" -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | from src.schema_st4_parser import St4Entry, parse, get_namespaces 2 | 3 | def test_can_parse_file(): 4 | results = parse("./tests/test.xml") 5 | assert len(results) == 1 6 | result = results[0] 7 | assert isinstance(result, St4Entry) 8 | assert result.label == "label" 9 | assert result.node_id == "nID" 10 | assert result.link_id == "lID" 11 | assert result.titles["en"] == "title_en" 12 | assert result.titles["de"] == "title_de" 13 | assert "en" in result.content 14 | assert "de" in result.content 15 | assert result.thumbnail == "thumbnail" 16 | assert "GraficResource" in result.type 17 | assert result.data_web["en"] == "data_web_en" 18 | assert result.data_web["de"] == "data_web_de" 19 | assert result.data_web_data["en"] == "data_web_data_en" 20
| assert result.data_web_data["de"] == "data_web_data_de" 21 | 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 LLukas22 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /.github/workflows/publish_on_pipy.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 
5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package to PyPi 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - name: checkout 25 | uses: actions/checkout@v3 26 | 27 | - id: get_version 28 | uses: battila7/get-version-action@v2 29 | 30 | - name: print version 31 | run: echo ${{ steps.get_version.outputs.version-without-v }} 32 | 33 | - name: update env 34 | run: echo "PACKAGE_VERSION=${{ steps.get_version.outputs.version-without-v }}" >> $GITHUB_ENV 35 | 36 | - name: print package version 37 | run: echo $PACKAGE_VERSION 38 | 39 | - name: Set up Python 40 | uses: actions/setup-python@v3 41 | with: 42 | python-version: '3.10' 43 | - name: Install dependencies 44 | run: | 45 | python -m pip install --upgrade pip 46 | pip install build 47 | - name: Build Package 48 | run: python -m build 49 | - name: Publish Package 50 | uses: pypa/gh-action-pypi-publish@v1.5.1 51 | with: 52 | user: __token__ 53 | password: ${{ secrets.PYPI_TOKEN }} -------------------------------------------------------------------------------- /tests/test_get_namespaces.py: -------------------------------------------------------------------------------- 1 | from src.schema_st4_parser import get_namespaces 2 | import io 3 | import pytest 4 | 5 | def test_should_parse_namespaces_from_valid_input(): 6 | valid_string = ''' 7 | 8 | 9 | ''' 10 | file = io.StringIO(valid_string) 11 | result = get_namespaces(file) 12 | assert "n" in result 13 | assert "l" in result 14 | assert "d" in result 15 | assert "xsi" in result 16 | 17 | def test_should_raise_exception_from_invalid_input(): 18 | invalid_string = ''' 19 | 20 | ''' 21 | file = io.StringIO(invalid_string) 22 | with pytest.raises(Exception): 23 | result = get_namespaces(file) 24 | 
-------------------------------------------------------------------------------- /tests/test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | title_en 9 | 10 | 11 | title_de 12 | 13 | 14 | 15 | 16 | 17 |

English Content

18 |
19 |
20 | 21 | 22 |

German Content

23 |
24 |
25 |
26 | thumbnail 27 | 28 | data_web_en 29 | data_web_de 30 | 31 | 32 | data_web_data_en 33 | data_web_data_de 34 | 35 |
36 |
37 |
38 |
39 |
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | schemas/ 2 | streams/ 3 | T450_Betriebsanleitung.xml 4 | main.py 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged 
"""Parse Schema ST4 XML exports into flat ``St4Entry`` objects."""
from typing import IO, Dict, List, Optional, Tuple, Union
from dataclasses import dataclass
import xml.etree.ElementTree as ET
import os

# Anything ``xml.etree.ElementTree.iterparse`` can read: a path or an open file.
_XmlSource = Union[str, os.PathLike, IO]

_MISSING_NAMESPACES_MESSAGE = "No namespaces found! Is this a valid ST4 file?"


@dataclass(frozen=True)
class St4Entry():
    """One info element extracted from an ST4 XML file.

    Attributes:
        type: Tag of the XML element the entry was read from (as reported by
            ElementTree, i.e. including the ``{namespace}`` prefix).
        label: Value of the element's ``l:Label`` attribute.
        node_id: Value of the element's ``n:Id`` attribute.
        link_id: Value of the element's ``l:Id`` attribute.
        titles: Text of the ``n:Entry`` below each ``n:Data-Title/n:Value``,
            keyed by that value's ``n:Aspect`` attribute.
        content: Serialized ``content`` element below each
            ``n:Data-Content/n:Value``, keyed by the ``n:Aspect`` attribute.
        thumbnail: Text of ``n:Data-Thumbnail`` or ``None`` if absent.
        data_web: Texts of ``n:Data-Web/n:Value`` keyed by ``n:Aspect``, or
            ``None`` if the element is absent.
        data_web_data: Texts of ``n:Data-Web.Data/n:Value`` keyed by
            ``n:Aspect``, or ``None`` if the element is absent.
    """
    type: str
    label: str
    node_id: str
    link_id: str
    titles: Dict[str, str]
    content: Dict[str, str]
    thumbnail: Optional[str] = None
    data_web: Optional[Dict[str, str]] = None
    data_web_data: Optional[Dict[str, str]] = None

    @property
    def languages(self) -> List[str]:
        """Return the keys of ``content``, or of ``titles`` if there is no content."""
        if not self.content:
            return list(self.titles)
        return list(self.content)


def _load_tree(xml_file: _XmlSource) -> Tuple[ET.Element, Dict[str, str]]:
    """Read *xml_file* once and return ``(root element, prefix -> namespace URI)``.

    ``iterparse`` builds the full tree while reporting namespace declarations,
    so a single pass yields both; this also allows already-open file objects,
    which cannot be read a second time.
    """
    namespaces: Dict[str, str] = {}
    events = ET.iterparse(xml_file, events=("start-ns",))
    for _event, (prefix, url) in events:
        namespaces[prefix] = url
    # ``root`` is only populated once the iterator has been exhausted.
    return events.root, namespaces


def _qualified(namespace_uri: str, name: str) -> str:
    """Return *name* in ElementTree's ``{uri}name`` notation."""
    return f"{{{namespace_uri}}}{name}"


def _extract_language_and_values(
    element: ET.Element,
    namespaces: Dict[str, str],
    with_entry: bool = False,
) -> Dict[str, str]:
    """Map the ``n:Aspect`` of every direct ``n:Value`` child to its text.

    With ``with_entry=True`` the text is taken from the first ``n:Entry``
    descendant of the value instead; values without such an entry are skipped.
    """
    aspect_key = _qualified(namespaces["n"], "Aspect")
    extracted = {}
    for value_element in element.findall("./n:Value", namespaces):
        language = value_element.attrib[aspect_key]
        if not with_entry:
            extracted[language] = value_element.text
            continue
        entry_element = value_element.find(".//n:Entry", namespaces)
        if entry_element is not None:
            extracted[language] = entry_element.text
    return extracted


def _extract_content(info_element: ET.Element, namespaces: Dict[str, str]) -> Dict[str, str]:
    """Return the serialized ``content`` element per language of ``n:Data-Content``."""
    content: Dict[str, str] = {}
    data_content_element = info_element.find(".//n:Data-Content", namespaces)
    if data_content_element is None:
        return content

    aspect_key = _qualified(namespaces["n"], "Aspect")
    for value_element in data_content_element.findall("./n:Value", namespaces):
        language = value_element.attrib[aspect_key]
        content_element = value_element.find(".//n:Entry//content", namespaces)
        if content_element is None:
            # A value without a content element has nothing to serialize;
            # ET.tostring(None) would raise, so skip this language instead.
            continue
        content[language] = ET.tostring(content_element, encoding="unicode")
    return content


def _build_entry(info_element: ET.Element, namespaces: Dict[str, str]) -> Optional[St4Entry]:
    """Build an ``St4Entry`` from *info_element*; ``None`` if it has no titles and no content."""
    # Label and ids are mandatory: a missing attribute raises KeyError.
    entry_type = info_element.tag
    label = info_element.attrib[_qualified(namespaces["l"], "Label")]
    node_id = info_element.attrib[_qualified(namespaces["n"], "Id")]
    link_id = info_element.attrib[_qualified(namespaces["l"], "Id")]

    title_element = info_element.find(".//n:Data-Title", namespaces)
    titles = _extract_language_and_values(title_element, namespaces, with_entry=True)
    content = _extract_content(info_element, namespaces)
    if not titles and not content:
        return None

    thumbnail = None
    thumbnail_element = info_element.find(".//n:Data-Thumbnail", namespaces)
    if thumbnail_element is not None:
        thumbnail = thumbnail_element.text

    data_web = None
    data_web_element = info_element.find(".//n:Data-Web", namespaces)
    if data_web_element is not None:
        data_web = _extract_language_and_values(data_web_element, namespaces)

    # "Data-Web.Data" is a distinct element name, not a child of Data-Web.
    data_web_data = None
    data_web_data_element = info_element.find(".//n:Data-Web.Data", namespaces)
    if data_web_data_element is not None:
        data_web_data = _extract_language_and_values(data_web_data_element, namespaces)

    return St4Entry(
        entry_type, label, node_id, link_id, titles, content,
        thumbnail, data_web, data_web_data,
    )


def get_namespaces(xml_file: _XmlSource) -> Dict[str, str]:
    """
    Extracts the namespaces from a schema st4 xml file

    Returns a dict mapping every declared prefix to its namespace URI.
    Malformed XML raises ``xml.etree.ElementTree.ParseError``.
    """
    return _load_tree(xml_file)[1]


def parse(xml_file: _XmlSource) -> List[St4Entry]:
    """
    Parses a schema st4 xml file and returns a list of St4Entry objects

    *xml_file* may be a path or an open file object. Every element that is the
    parent of an ``n:Data-Title`` below an ``n:SystemFolder`` becomes one entry;
    elements with neither titles nor content are skipped.

    Raises:
        AssertionError: if the ``n`` or ``l`` namespace prefix is not declared.
        xml.etree.ElementTree.ParseError: if the XML is malformed.
        KeyError: if an info element lacks its label/id attributes or a value
            lacks its ``n:Aspect`` attribute.
    """
    root, namespaces = _load_tree(xml_file)
    if "n" not in namespaces or "l" not in namespaces:
        # Raised explicitly (not via ``assert``) so the check survives ``python -O``.
        raise AssertionError(_MISSING_NAMESPACES_MESSAGE)

    extracted_entries = []
    for system_folder_element in root.findall(".//n:SystemFolder", namespaces):
        # Select the parents of Data-Title instead of hardcoding the info element's tag.
        for info_element in system_folder_element.findall(".//n:Data-Title/..", namespaces):
            entry = _build_entry(info_element, namespaces)
            if entry is not None:
                extracted_entries.append(entry)
    return extracted_entries