├── tests
├── __init__.py
├── test_parser.py
├── test_get_namespaces.py
└── test.xml
├── setup.py
├── .vscode
└── settings.json
├── README.md
├── pyproject.toml
├── LICENSE
├── .github
└── workflows
│ └── publish_on_pipy.yml
├── .gitignore
└── src
└── schema_st4_parser
└── __init__.py
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup,find_packages
2 | import os
3 |
# The version is taken from the PACKAGE_VERSION environment variable and falls
# back to "0.0.0" when that variable is not set.
package_version = os.environ.get("PACKAGE_VERSION", "0.0.0")

# All importable code lives below ./src, so the root package maps to that folder.
package_root = {"": "src"}

# Collect the packages below ./src, leaving out anything that looks like a test package.
excluded_patterns = ("*.tests", "*.tests.*", "tests.*", "tests")
source_packages = find_packages(where="./src", exclude=excluded_patterns)

setup(
    version=package_version,
    package_dir=package_root,
    packages=source_packages,
)
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.testing.unittestArgs": [
3 | "-v",
4 | "-s",
5 | "./tests",
6 | "-p",
7 | "test_*.py"
8 | ],
9 | "python.testing.pytestEnabled": true,
10 | "python.testing.unittestEnabled": false,
11 | "python.testing.pytestArgs": [
12 | "tests"
13 | ]
14 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Schema ST4 Python Parser
2 |
3 | Parse [Schema ST4](https://www.quanos-content-solutions.com/en/software/schema-st4) XML files into simple, flat Python objects.
4 |
5 | ## Installation
6 |
7 | via pip: `pip install schema-st4-parser`
8 |
9 | ## Usage
10 | Simply pass the XML file to the `parse` function. A list of `St4Entry` objects will be returned.
11 |
12 | ```python
13 | from schema_st4_parser import parse, St4Entry
14 |
15 | entries = parse("MyFile.xml")
16 | print(entries[0])
17 | ```
18 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "setuptools-scm"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "schema_st4_parser"
7 | dynamic = ["version"]
8 | authors = [
9 | { name="Lukas Kreussel"},
10 | ]
11 | description = "Parse Schema ST4 xml files into python objects"
12 | readme = "README.md"
13 | license = { file="LICENSE" }
14 | requires-python = ">=3.8"
15 | classifiers = [
16 | "Programming Language :: Python :: 3",
17 | "License :: OSI Approved :: MIT License",
18 | "Operating System :: OS Independent",
19 | ]
20 |
21 | [project.urls]
22 | "Homepage" = "https://github.com/LLukas22/ST4-Python-Parser"
23 | "Bug Tracker" = "https://github.com/LLukas22/ST4-Python-Parser/issues"
--------------------------------------------------------------------------------
/tests/test_parser.py:
--------------------------------------------------------------------------------
1 | from src.schema_st4_parser import St4Entry, parse, get_namespaces
2 |
def test_can_parse_file():
    """parse() turns the bundled test.xml fixture into exactly one St4Entry with the expected fields."""
    entries = parse("./tests/test.xml")
    assert len(entries) == 1

    entry = entries[0]
    assert isinstance(entry, St4Entry)

    # Identity of the entry
    assert entry.label == "label"
    assert entry.node_id == "nID"
    assert entry.link_id == "lID"
    assert entry.thumbnail == "thumbnail"
    assert "GraficResource" in entry.type

    # Every language dependent field must be present for both fixture languages
    for language in ("en", "de"):
        assert entry.titles[language] == f"title_{language}"
        assert language in entry.content
        assert entry.data_web[language] == f"data_web_{language}"
        assert entry.data_web_data[language] == f"data_web_data_{language}"
21 |
22 |
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 LLukas22
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/.github/workflows/publish_on_pipy.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package to PyPi
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - name: checkout
25 | uses: actions/checkout@v3
26 |
27 | - id: get_version
28 | uses: battila7/get-version-action@v2
29 |
30 | - name: print version
31 | run: echo ${{ steps.get_version.outputs.version-without-v }}
32 |
33 | - name: update env
34 | run: echo "PACKAGE_VERSION=${{ steps.get_version.outputs.version-without-v }}" >> $GITHUB_ENV
35 |
36 | - name: print package version
37 | run: echo $PACKAGE_VERSION
38 |
39 | - name: Set up Python
40 | uses: actions/setup-python@v3
41 | with:
42 | python-version: '3.10'
43 | - name: Install dependencies
44 | run: |
45 | python -m pip install --upgrade pip
46 | pip install build
47 | - name: Build Package
48 | run: python -m build
49 | - name: Publish Package
50 | uses: pypa/gh-action-pypi-publish@v1.5.1
51 | with:
52 | user: __token__
53 | password: ${{ secrets.PYPI_TOKEN }}
--------------------------------------------------------------------------------
/tests/test_get_namespaces.py:
--------------------------------------------------------------------------------
1 | from src.schema_st4_parser import get_namespaces
2 | import io
3 | import pytest
4 |
def test_should_parse_namespaces_from_valid_input():
    """get_namespaces() reports every namespace prefix declared in a well-formed document."""
    # NOTE(review): the namespace URLs below are placeholders, only the prefixes
    # are asserted. Replace them with the URLs of a real ST4 export if desired.
    valid_string = '''
    <n:ExportRoot xmlns:n="urn:example:st4:node" xmlns:l="urn:example:st4:link" xmlns:d="urn:example:st4:data" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    </n:ExportRoot>
    '''
    file = io.StringIO(valid_string)
    result = get_namespaces(file)
    for prefix in ("n", "l", "d", "xsi"):
        assert prefix in result
16 |
def test_should_raise_exception_from_invalid_input():
    """get_namespaces() must raise when the input is not a parsable xml document."""
    invalid_string = '''

'''
    stream = io.StringIO(invalid_string)
    with pytest.raises(Exception):
        get_namespaces(stream)
24 |
--------------------------------------------------------------------------------
/tests/test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | title_en
9 |
10 |
11 | title_de
12 |
13 |
14 |
15 |
16 |
17 | English Content
18 |
19 |
20 |
21 |
22 | German Content
23 |
24 |
25 |
26 | thumbnail
27 |
28 | data_web_en
29 | data_web_de
30 |
31 |
32 | data_web_data_en
33 | data_web_data_de
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | schemas/
2 | streams/
3 | T450_Betriebsanleitung.xml
4 | main.py
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | # .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # poetry
102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more
104 | # commonly ignored for libraries.
105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 |
108 | # pdm
109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | # in version control.
113 | # https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
166 |
--------------------------------------------------------------------------------
/src/schema_st4_parser/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Optional, Union
2 | from dataclasses import dataclass
3 | import xml.etree.ElementTree as ET
4 | import os
5 |
@dataclass(frozen=True)
class St4Entry:
    """
    Immutable record holding the data of one entry extracted from a schema st4 xml file
    """
    # Tag of the xml element the entry was read from
    type: str
    # Value of the element's 'l:Label' attribute
    label: str
    # Value of the element's 'n:Id' attribute
    node_id: str
    # Value of the element's 'l:Id' attribute
    link_id: str
    # Language -> title text
    titles: Dict[str, str]
    # Language -> serialized content xml
    content: Dict[str, str]
    # Text of the 'n:Data-Thumbnail' element, None if the entry has none
    thumbnail: Optional[str] = None
    # Language -> text of the 'n:Data-Web' values, None if the entry has none
    data_web: Optional[Dict[str, str]] = None
    # Language -> text of the 'n:Data-Web.Data' values, None if the entry has none
    data_web_data: Optional[Dict[str, str]] = None

    @property
    def languages(self) -> List[str]:
        """
        Languages of this entry: the keys of 'content', or the keys of 'titles' when there is no content
        """
        language_source = self.titles if len(self.content) == 0 else self.content
        return list(language_source)
23 |
24 |
def get_namespaces(xml_file:Union[str, os.PathLike])->Dict[str,str]:
    """
    Extracts the namespaces from a schema st4 xml file

    Args:
        xml_file: Path of the xml file. An already opened file-like object is
            handed to ElementTree unchanged and therefore works as well.

    Returns:
        Mapping of namespace prefix to namespace url. The default namespace, if
        declared, is stored under the empty string. When a prefix is declared
        more than once the last declaration wins.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed xml.
    """
    # Only namespace declarations are of interest here, so "start-ns" is the only
    # event requested; element events would be produced just to be thrown away.
    return {
        prefix: url
        for _, (prefix, url) in ET.iterparse(xml_file, events=("start-ns",))
    }
35 |
def _qualified_name(namespace_url:str, local_name:str)->str:
    """
    Builds the '{namespace-url}name' key under which ElementTree stores a namespaced attribute
    """
    return f"{{{namespace_url}}}{local_name}"


def _read_namespaces_and_root(xml_file):
    """
    Reads the xml source a single time and returns a (namespaces, root element) pair
    """
    # Only "start-ns" events are requested. The tree is still built completely and
    # is exposed through the iterator's 'root' attribute once it has been exhausted.
    events = ET.iterparse(xml_file, events=("start-ns",))
    namespaces = {prefix: url for _, (prefix, url) in events}
    return namespaces, events.root


def _extract_language_and_values(element:ET.Element, namespaces:Dict[str,str], with_entry:bool=False)->Dict[str,str]:
    """
    Maps the 'n:Aspect' (language) of every direct 'n:Value' child to its text

    With with_entry=True the text is taken from the first 'n:Entry' below the
    value instead; values without such an entry are left out.
    """
    aspect_key = _qualified_name(namespaces["n"], "Aspect")
    extracted = {}
    for value_element in element.findall("./n:Value", namespaces):
        language = value_element.attrib[aspect_key]
        if not with_entry:
            extracted[language] = value_element.text
            continue
        entry_element = value_element.find(".//n:Entry", namespaces)
        if entry_element is not None:
            extracted[language] = entry_element.text
    return extracted


def _extract_content(info_element:ET.Element, namespaces:Dict[str,str])->Dict[str,str]:
    """
    Maps each language of the 'n:Data-Content' element to its serialized 'content' xml
    """
    content = {}
    data_content_element = info_element.find(".//n:Data-Content", namespaces)
    if data_content_element is None:
        return content

    aspect_key = _qualified_name(namespaces["n"], "Aspect")
    for value_element in data_content_element.findall("./n:Value", namespaces):
        language = value_element.attrib[aspect_key]
        content_element = value_element.find(".//n:Entry//content", namespaces)
        # A value without a 'content' element has nothing to serialize; skip it
        # instead of failing inside ET.tostring.
        if content_element is not None:
            content[language] = ET.tostring(content_element, encoding='unicode')
    return content


def parse(xml_file:Union[str, os.PathLike])->List[St4Entry]:
    """
    Parses a schema st4 xml file and returns a list of St4Entry objects

    Every element that is the parent of an 'n:Data-Title' element and lies below
    an 'n:SystemFolder' is turned into one St4Entry. Elements that yield neither
    titles nor content are skipped.

    Args:
        xml_file: Path of the xml file. The source is read only once, so an
            already opened file-like object works as well.

    Returns:
        The extracted entries in document order.

    Raises:
        AssertionError: If the 'n' or 'l' namespace prefix is not declared.
        KeyError: If an element lacks a required 'l:Label', 'n:Id', 'l:Id' or
            'n:Aspect' attribute.
        xml.etree.ElementTree.ParseError: If the input is not well-formed xml.
    """
    namespaces, root = _read_namespaces_and_root(xml_file)
    # Raised explicitly (instead of using an assert statement) so the check also
    # runs under 'python -O'; the exception type is kept for existing callers.
    if "n" not in namespaces or "l" not in namespaces:
        raise AssertionError("No namespaces found! Is this a valid ST4 file?")

    label_key = _qualified_name(namespaces["l"], "Label")
    node_id_key = _qualified_name(namespaces["n"], "Id")
    link_id_key = _qualified_name(namespaces["l"], "Id")

    extracted_entries = []

    # NOTE(review): an element nested inside several 'n:SystemFolder' elements is
    # reported once per enclosing folder - confirm whether ST4 exports nest folders.
    for system_folder_element in root.findall(".//n:SystemFolder", namespaces):
        # The info elements have varying tag names, so they are located as the
        # parents of the 'n:Data-Title' elements rather than by tag.
        for info_element in system_folder_element.findall(".//n:Data-Title/..", namespaces):
            # extract type, label and ids
            entry_type = info_element.tag
            label = info_element.attrib[label_key]
            node_id = info_element.attrib[node_id_key]
            link_id = info_element.attrib[link_id_key]

            # extract the titles and the content in all languages
            title_element = info_element.find(".//n:Data-Title", namespaces)
            titles = _extract_language_and_values(title_element, namespaces, with_entry=True)
            content = _extract_content(info_element, namespaces)

            # without titles and content there is nothing worth reporting
            if not titles and not content:
                continue

            # get thumbnail if it exists
            thumbnail = None
            thumbnail_element = info_element.find(".//n:Data-Thumbnail", namespaces)
            if thumbnail_element is not None:
                thumbnail = thumbnail_element.text

            # get data web if it exists
            data_web = None
            data_web_element = info_element.find(".//n:Data-Web", namespaces)
            if data_web_element is not None:
                data_web = _extract_language_and_values(data_web_element, namespaces)

            # get data web.data if it exists
            data_web_data = None
            data_web_data_element = info_element.find(".//n:Data-Web.Data", namespaces)
            if data_web_data_element is not None:
                data_web_data = _extract_language_and_values(data_web_data_element, namespaces)

            extracted_entries.append(
                St4Entry(
                    entry_type,
                    label,
                    node_id,
                    link_id,
                    titles,
                    content,
                    thumbnail,
                    data_web,
                    data_web_data,
                )
            )

    return extracted_entries
120 |
121 |
122 |
123 |
124 |
125 |
126 |
--------------------------------------------------------------------------------