├── requirements.txt ├── tests ├── __init__.py ├── data │ └── docx_example.docx ├── test_doc_parser.py ├── test_utils.py ├── test_reader.py ├── test_parser.py └── test_xml_parser.py ├── .coveragerc ├── requirements-dev.txt ├── docparser ├── constants.py ├── document.py ├── enums.py ├── __init__.py ├── exceptions.py ├── utils.py ├── reader.py ├── parser.py └── xml_parser.py ├── README.md ├── .github └── workflows │ ├── pypi-publish.yml │ └── test.yml ├── LICENCE.txt ├── setup.py └── .gitignore /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = tests/* -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pylint 2 | black 3 | mypy 4 | coverage -------------------------------------------------------------------------------- /tests/data/docx_example.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/has-abi/docparser/HEAD/tests/data/docx_example.docx -------------------------------------------------------------------------------- /docparser/constants.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module contains the package constants 3 | """ 4 | 5 | 6 | DOCX_EXT = "docx" 7 | ALLOWED_EXTS = [DOCX_EXT] 8 | XML_BODY = "word/document.xml" 9 | XML_HEADER = "word/header[0-9]*.xml" 10 | XML_FOOTER = "word/footer[0-9]*.xml" 11 | NSMAP = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}" 12 | -------------------------------------------------------------------------------- /docparser/document.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This is dataclass module, namely :class:`Document`, which 3 | holds the parsed data from a document. 4 | 5 | Classes & methods 6 | ----------------- 7 | 8 | Below is listed the class within :py:mod:`docparser.parser` 9 | along with possessed methods. 10 | """ 11 | 12 | 13 | from dataclasses import dataclass 14 | from typing import Dict 15 | 16 | 17 | @dataclass 18 | class Document: 19 | name: str 20 | ext: str 21 | content: str 22 | divided_content: Dict[str, str] 23 | -------------------------------------------------------------------------------- /tests/test_doc_parser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | 4 | from docparser import parse 5 | 6 | DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx" 7 | 8 | 9 | class TestDocParser(unittest.TestCase): 10 | def test_parse_docx_file_str(self): 11 | document = parse(str(DOCX_FILE_PATH)) 12 | self.assertTrue(document.content) 13 | 14 | def test_parse_docx_file_binary(self): 15 | with open(DOCX_FILE_PATH, "rb") as docx_file: 16 | document = parse(docx_file) 17 | self.assertTrue(document.content) 18 | 19 | 20 | if __name__ == "__main__": 21 | unittest.main() 22 | -------------------------------------------------------------------------------- /docparser/enums.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module contains the package enums. 3 | 4 | Enums Classes 5 | ------------- 6 | 7 | Below is listed the enums classes within 8 | :py:mod:`docparser.enums` 9 | """ 10 | 11 | 12 | from enum import Enum 13 | 14 | import docparser.constants as CS 15 | 16 | 17 | class TagEnum(str, Enum): 18 | SPACE = CS.NSMAP + "t" 19 | TAB = CS.NSMAP + "tab" 20 | BREAK_LINE = CS.NSMAP + "br" 21 | CARRIAGE_RETURN = CS.NSMAP + "cr" 22 | PARAGRAPH = CS.NSMAP + "p" 23 | 24 | 25 | class LayoutEnum(str, Enum): 26 | TAB = "\t" 27 | BREAK_LINE = "\n" 28 | MAJ_BREAK_LINE = "\n\n" 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Python Version](https://img.shields.io/badge/python-3.7+-blue)](https://www.python.org/downloads/release/python-370/) 2 | ![Tests](https://github.com/has-abi/docparser/actions/workflows/test.yml/badge.svg) 3 | [![codecov](https://codecov.io/gh/has-abi/docparser/branch/main/graph/badge.svg?token=4AL385JEH9)](https://codecov.io/gh/has-abi/docparser) 4 | 5 | # What is docparser? 6 | docparser is python package that extract text form a DOCX document. 7 | 8 | ## Installation 9 | 10 | ```bash 11 | pip install python-docparser 12 | ``` 13 | 14 | ## Usage 15 | 16 | ```python 17 | from docparser import parse 18 | 19 | document = parse("your_docx_document") 20 | print(document.content) 21 | ``` 22 | -------------------------------------------------------------------------------- /docparser/__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This is the package entrypoint which exposes the 3 | `parse` method that handles the parsing process. 4 | """ 5 | 6 | 7 | from io import BufferedReader 8 | 9 | from docparser.document import Document 10 | from docparser.parser import Parser 11 | from docparser.reader import Reader 12 | from docparser.utils import get_file_name_and_ext 13 | from docparser.xml_parser import XMLParser 14 | 15 | 16 | def parse(input_file: str | BufferedReader) -> Document: 17 | file_name, file_ext = get_file_name_and_ext(input_file) 18 | reader = Reader(input_file, file_ext) 19 | file_parser = XMLParser(reader.zip_file) 20 | parser = Parser(file_parser, file_ext, file_name) 21 | reader.zip_file.close() 22 | return parser.document 23 | -------------------------------------------------------------------------------- /.github/workflows/pypi-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | deploy: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: '3.9' 20 | 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build 25 | 26 | - name: Build dist 27 | run: | 28 | python -m build --outdir dist/ 29 | 30 | - name: Publish package 31 | uses: pypa/gh-action-pypi-publish@release/v1 32 | with: 33 | password: ${{ secrets.PYPI_API_TOKEN }} 34 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from docparser import utils 4 | 5 | FILE_NAME_NO_PATH = "docx_example.docx" 6 | FILE_NAME_WITH_PATH = "/tests/docx_example.docx" 7 | 8 | 9 | class TestDocParser(unittest.TestCase): 10 | def test_get_file_name_no_path(self): 11 | self.assertEqual( 12 | utils.get_file_name(FILE_NAME_NO_PATH), 13 | FILE_NAME_NO_PATH, 14 | ) 15 | 16 | def test_get_file_name_with_path(self): 17 | self.assertEqual( 18 | utils.get_file_name(FILE_NAME_WITH_PATH), 19 | FILE_NAME_NO_PATH, 20 | ) 21 | 22 | def test_get_file_name_and_ext(self): 23 | self.assertEqual( 24 | utils.get_file_name_and_ext(FILE_NAME_WITH_PATH), 25 | (FILE_NAME_NO_PATH, "docx"), 26 | ) 27 | 28 | 29 | if __name__ == "__main__": 30 | unittest.main() 31 | -------------------------------------------------------------------------------- /docparser/exceptions.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module contains the package exceptions. 3 | 4 | Exceptions Classes 5 | ------------------ 6 | 7 | Below is listed the exceptions classes within 8 | :py:mod:`docparser.exception` 9 | """ 10 | 11 | 12 | class InvalidArgumentTypeError(Exception): 13 | def __init__(self, message: str) -> None: 14 | super().__init__(message) 15 | 16 | 17 | class UnsupportedFileFormatError(Exception): 18 | def __init__(self, file_format: str) -> None: 19 | super().__init__( 20 | f"{file_format} if not supported. supported formats are docx and doc." 21 | ) 22 | 23 | 24 | class MissingAttributeError(Exception): 25 | def __init__(self, message: str) -> None: 26 | super().__init__(message) 27 | 28 | 29 | class InvalidReturnValueError(Exception): 30 | def __init__(self, message: str) -> None: 31 | super().__init__(message) 32 | -------------------------------------------------------------------------------- /LICENCE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Hassane Abida 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tests/test_reader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | from zipfile import ZipFile 4 | 5 | from docparser.exceptions import ( 6 | InvalidArgumentTypeError, 7 | UnsupportedFileFormatError, 8 | ) 9 | from docparser.reader import Reader 10 | 11 | DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx" 12 | 13 | 14 | class TestReader(unittest.TestCase): 15 | def test_read_empty_file(self): 16 | with self.assertRaises(InvalidArgumentTypeError): 17 | reader = Reader(input_file=None, file_ext="") # type: ignore 18 | 19 | def test_read_unsupported_file_type(self): 20 | with self.assertRaises(UnsupportedFileFormatError): 21 | reader = Reader(input_file="file_example.pdf", file_ext="pdf") 22 | 23 | def test_read_missing_file(self): 24 | with self.assertRaises(FileNotFoundError): 25 | reader = Reader(input_file="missing_file.docx", file_ext="docx") 26 | 27 | def test_to_zip_str_file(self): 28 | test_reader = Reader(input_file=str(DOCX_FILE_PATH), file_ext="docx") 29 | zip_file = test_reader.to_zip() 30 | self.assertTrue(isinstance(zip_file, ZipFile)) 31 | 32 | 33 | if __name__ == "__main__": 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /docparser/utils.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module contains the package utils. 3 | 4 | Utils methods 5 | ------------- 6 | 7 | Below is listed the package util methos within 8 | :py:mod:`docparser.utils` 9 | """ 10 | 11 | 12 | import os 13 | from io import BufferedReader 14 | from typing import Tuple 15 | 16 | 17 | def get_file_name_and_ext(file_or_filepath: str | BufferedReader) -> Tuple[str, str]: 18 | """Extract the file extension and the file name 19 | from a file or a file name. 20 | 21 | Args: 22 | file_or_filepath (str | BufferedReader): File or file path. 23 | 24 | Returns: 25 | Tuple[str, str]: Tuple of file name and file extension 26 | """ 27 | filename = get_file_name(file_or_filepath) 28 | ext = filename.rsplit(".", 1)[1] 29 | return filename, ext.lower() 30 | 31 | 32 | def get_file_name(file_or_filepath: str | BufferedReader) -> str: 33 | """Extract the file name form a file or a file path. 34 | 35 | Args: 36 | file_or_filepath (str | BufferedReader): File or a file path. 37 | 38 | Returns: 39 | str: The extracted file name. 40 | """ 41 | if isinstance(file_or_filepath, BufferedReader): 42 | return file_or_filepath.name 43 | return os.path.basename(file_or_filepath) 44 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | CURRENT = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | 8 | def _open(subpath): 9 | path = os.path.join(CURRENT, subpath) 10 | return open(path, encoding="utf-8", errors="ignore") 11 | 12 | 13 | with _open("requirements.txt") as file: 14 | base_reqs = file.read().strip().split("\n") 15 | 16 | with _open("requirements-dev.txt") as file: 17 | dev_reqs = file.read().strip().split("\n") 18 | 19 | with _open("README.md") as f: 20 | readme = f.read() 21 | 22 | setup( 23 | name="python-docparser", 24 | version="1.1.0", 25 | author="Hassane Abida", 26 | author_email="abidahass.uca@gmail.com", 27 | url="https://github.com/has-abi/docparser", 28 | description="Extract text from your docx document.", 29 | long_description=readme, 30 | long_description_content_type="text/markdown", 31 | python_requires=">=3.7", 32 | tests_require=base_reqs + dev_reqs, 33 | install_requires=base_reqs, 34 | classifiers=[ 35 | "License :: OSI Approved :: MIT License", 36 | "Operating System :: OS Independent", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | "Programming Language :: Python :: 3.9", 40 | "Programming Language :: Python :: 3.10", 41 | "Programming Language :: Python :: 3.11", 42 | ], 43 | ) 44 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | lint: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v3 11 | 12 | - name: Set up Python 3.10 13 | uses: actions/setup-python@v3 14 | with: 15 | python-version: "3.10" 16 | 17 | - name: Install Python dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install -r requirements-dev.txt 21 | 22 | - name: Validate against psf/black 23 | run: python -m black --check docparser tests 24 | 25 | - name: Check type annotations via mypy 26 | run: python -m mypy --strict docparser 27 | 28 | test: 29 | needs: lint 30 | runs-on: ubuntu-latest 31 | strategy: 32 | fail-fast: false 33 | matrix: 34 | python-version: ["3.10", "3.11"] 35 | 36 | steps: 37 | - uses: actions/checkout@v3 38 | 39 | - name: Set up Python ${{ matrix.python-version }} 40 | uses: actions/setup-python@v3 41 | with: 42 | python-version: ${{ matrix.python-version }} 43 | 44 | - name: Install Python dependencies 45 | run: | 46 | python -m pip install --upgrade pip 47 | pip install -r requirements-dev.txt 48 | 49 | - name: Run tests 50 | run: | 51 | python -m coverage run -m unittest 52 | python -m coverage html 53 | 54 | - name: Upload code coverage 55 | uses: codecov/codecov-action@v3 56 | with: 57 | token: ${{ secrets.CODECOV_TOKEN }} 58 | 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | docs/build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # vscode cache 94 | .vscode 95 | .pytest_cache 96 | 97 | # html coverage 98 | htmlcov/ 99 | 100 | # Pycharm 101 | .idea -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import Mock 3 | 4 | from docparser.document import Document 5 | from docparser.exceptions import InvalidReturnValueError, MissingAttributeError 6 | from docparser.parser import Parser 7 | 8 | 9 | class TestParser(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls) -> None: 12 | file_parser = Mock() 13 | file_parser.extract_text = Mock( 14 | return_value={ 15 | "header": "xml header text", 16 | "body": "xml body text", 17 | "footer": "xml footer text", 18 | } 19 | ) 20 | cls.parser = Parser( 21 | file_parser=file_parser, 22 | file_ext="docx", 23 | file_name="file_name_example.docx", 24 | ) 25 | 26 | def test_parser_with_invalid_file_parser(self): 27 | test_file_parser = "" 28 | with self.assertRaises(MissingAttributeError): 29 | Parser(file_parser=test_file_parser, file_ext="", file_name="") 30 | 31 | def test_invalid_file_parser_extract_text_callable_return(self): 32 | test_file_parser = Mock() 33 | test_file_parser.extract_text = Mock(return_value=["list item"]) 34 | with self.assertRaises(InvalidReturnValueError): 35 | Parser(file_parser=test_file_parser, file_ext="", file_name="") 36 | 37 | def test_get_document(self): 38 | result_document = __class__.parser.document 39 | self.assertTrue(isinstance(result_document, Document)) 40 | self.assertEqual(result_document.name, "file_name_example.docx") 41 | self.assertEqual(result_document.ext, "docx") 42 | self.assertTrue(isinstance(result_document.divided_content, dict)) 43 | self.assertListEqual( 44 | list(result_document.divided_content.keys()), ["header", "body", "footer"] 45 | ) 46 | self.assertListEqual( 47 | list(result_document.divided_content.values()), 48 | ["xml header text", "xml body text", "xml footer text"], 49 | ) 50 | self.assertEqual( 51 | result_document.content, "xml header text xml body text xml footer text" 52 | ) 53 | 54 | 55 | if __name__ == "__main__": 56 | unittest.main() 57 | -------------------------------------------------------------------------------- /tests/test_xml_parser.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from pathlib import Path 3 | from zipfile import ZipFile 4 | 5 | from docparser.exceptions import InvalidArgumentTypeError 6 | from docparser.xml_parser import XMLParser 7 | 8 | DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx" 9 | 10 | XML_BODY = "word/document.xml" 11 | XML_HEADER = "word/header[0-9]*.xml" 12 | XML_FOOTER = "word/footer[0-9]*.xml" 13 | 14 | 15 | class TestXMLParser(unittest.TestCase): 16 | @classmethod 17 | def setUpClass(cls) -> None: 18 | cls.zip_file = ZipFile(DOCX_FILE_PATH) 19 | cls.xml_parser = XMLParser(cls.zip_file) 20 | 21 | def test_invalid_input_file(self): 22 | with self.assertRaises(InvalidArgumentTypeError): 23 | xml_parser = XMLParser(input_file="") # type: ignore 24 | 25 | def test_get_xml_part_by_pattern_header(self) -> None: 26 | test_result = __class__.xml_parser.get_xml_part_by_pattern(XML_HEADER) 27 | self.assertTrue(isinstance(test_result, list)) 28 | self.assertTrue(all(isinstance(result, bytes) for result in test_result)) 29 | 30 | def test_get_xml_part_by_pattern_body(self) -> None: 31 | test_result = __class__.xml_parser.get_xml_part_by_pattern(XML_BODY) 32 | self.assertTrue(isinstance(test_result, list)) 33 | self.assertTrue(all(isinstance(result, bytes) for result in test_result)) 34 | 35 | def test_get_xml_part_by_pattern_footer(self) -> None: 36 | test_result = __class__.xml_parser.get_xml_part_by_pattern(XML_FOOTER) 37 | self.assertTrue(isinstance(test_result, list)) 38 | self.assertTrue(all(isinstance(result, bytes) for result in test_result)) 39 | 40 | def test_to_xml(self) -> None: 41 | test_xml_components = __class__.xml_parser.to_xml() 42 | self.assertTrue( 43 | ["header", "body", "footer"] == list(test_xml_components.keys()) 44 | ) 45 | 46 | def test_xml2text(self) -> None: 47 | test_xml_body = __class__.xml_parser.get_xml_part_by_pattern(XML_BODY) 48 | text_result = " ".join( 49 | [self.xml_parser.xml2text(part) for part in test_xml_body] 50 | ) 51 | self.assertTrue(len(text_result) > 0) 52 | 53 | def test_extract_text(self) -> None: 54 | doc_content = __class__.xml_parser.extract_text() 55 | self.assertTrue(isinstance(doc_content, dict)) 56 | self.assertTrue(["header", "body", "footer"] == list(doc_content.keys())) 57 | self.assertTrue( 58 | ( 59 | (isinstance(component_content, str) and len(component_content) > 0) 60 | for component_content in doc_content.values() 61 | ) 62 | ) 63 | 64 | @classmethod 65 | def tearDownClass(cls) -> None: 66 | cls.zip_file.close() 67 | 68 | 69 | if __name__ == "__main__": 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /docparser/reader.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module is a single class, namely :class:`Reader`, which 3 | handles the file reading as a zip file. 4 | 5 | Classes & methods 6 | ----------------- 7 | 8 | Below is listed the class within :py:mod:`docparser.reader` 9 | along with possessed methods. 10 | """ 11 | 12 | 13 | import os 14 | from io import BufferedReader 15 | from zipfile import ZipFile 16 | 17 | import docparser.constants as CS 18 | from docparser.exceptions import ( 19 | InvalidArgumentTypeError, 20 | UnsupportedFileFormatError, 21 | ) 22 | 23 | 24 | class Reader: 25 | """Docparser `Reader` class that reads a docx file as a zip file. 26 | 27 | Args: 28 | input_file (str | BufferedReader): Input file that could be a file 29 | or a file path. 30 | file_ext (str): The input file extension. 31 | """ 32 | 33 | def __init__(self, input_file: str | BufferedReader, file_ext: str) -> None: 34 | """Docparser `Reader` class that reads a docx file as a zip file. 35 | 36 | Args: 37 | input_file (str | BufferedReader): Input file that could be a file 38 | or a file path. 39 | file_ext (str): The input file extension. 40 | """ 41 | self.__check(input_file, file_ext) 42 | self.input_file = input_file 43 | self.zip_file = self.to_zip() 44 | 45 | def __check(self, input_file: str | BufferedReader, file_ext: str) -> None: 46 | """Check the input arguments of the class constructor for invalid 47 | types or values. 48 | 49 | Args: 50 | input_file (str | BufferedReade): Input file that could be a file 51 | or a file path. 52 | file_ext (str): The input file extension. 53 | 54 | Raises: 55 | InvalidArgumentTypeError: Thrown if any argument has an invalid 56 | type. 57 | UnsupportedFileFormatError: Thrown if the input file has unsupported 58 | format. 59 | FileNotFoundError: Thrown if the input file don't exist in disque or 60 | not found. 61 | """ 62 | if not isinstance(input_file, (str, BufferedReader)): 63 | raise InvalidArgumentTypeError( 64 | "input_file must be a file path or a binary file." 65 | ) 66 | 67 | if file_ext not in CS.ALLOWED_EXTS: 68 | raise UnsupportedFileFormatError(file_ext) 69 | 70 | if isinstance(input_file, str) and not os.path.isfile(input_file): 71 | raise FileNotFoundError(f"File not found: {input_file}") 72 | 73 | def to_zip(self) -> ZipFile: 74 | """Convert the input file to a zip file. 75 | 76 | Returns: 77 | ZipFile: The converted zip file. 78 | """ 79 | zip_file = ZipFile(self.input_file) 80 | return zip_file 81 | -------------------------------------------------------------------------------- /docparser/parser.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module is a single class, namely :class:`Parser`, which 3 | is the end class that handles getting the parsing results from 4 | a file parser. 5 | 6 | Classes & methods 7 | ----------------- 8 | 9 | Below is listed the class within :py:mod:`docparser.parser` 10 | along with possessed methods. 11 | """ 12 | 13 | 14 | from typing import Any 15 | 16 | from docparser.document import Document 17 | from docparser.exceptions import InvalidReturnValueError, MissingAttributeError 18 | 19 | 20 | class Parser: 21 | """Docparser `Parser` class. 22 | 23 | Args: 24 | file_parser (Any): A file parser that has an `extract_text` method 25 | that returns the parsed document content as a dictionary. 26 | file_ext (str): The original file extension. 27 | file_name (str): The original file name. 28 | """ 29 | 30 | def __init__(self, file_parser: Any, file_ext: str, file_name: str) -> None: 31 | """Docparser `Parser` class. 32 | 33 | Args: 34 | file_parser (Any): A file parser that has an `extract_text` method 35 | that returns the parsed document content as a dictionary. 36 | file_ext (str): The original file extension. 37 | file_name (str): The original file name. 38 | """ 39 | self.__check(file_parser) 40 | self.file_parser = file_parser 41 | self.document = self.get_document(file_ext, file_name) 42 | 43 | def __check(self, file_parser: Any) -> None: 44 | """Checks if the `file_parser` has a callable with the name 45 | `extract_text`. 46 | 47 | Args: 48 | file_parser (Any): A file parser. 49 | 50 | Raises: 51 | MissingAttributeError: Thrown if the file parser don't have 52 | a callable `extract_text` 53 | """ 54 | if not ( 55 | hasattr(file_parser, "extract_text") and callable(file_parser.extract_text) 56 | ): 57 | raise MissingAttributeError( 58 | "Missing callable extract_text() from file_parser instance." 59 | ) 60 | 61 | def get_document(self, file_ext: str, file_name: str) -> Document: 62 | """Get the extracted document data. 63 | 64 | Args: 65 | file_ext (str): The original file extension 66 | file_name (str): The original file name. 67 | 68 | Raises: 69 | InvalidReturnValueError: throw if the file parser callable 70 | `extract_text` return value is not a dict. 71 | 72 | Returns: 73 | Document: A document object that represents the parsed results. 74 | """ 75 | divided_content = self.file_parser.extract_text() 76 | if not isinstance(divided_content, dict): 77 | raise InvalidReturnValueError( 78 | "The file parser extract_text callable return value must be a dict" 79 | ) 80 | content = " ".join(list(divided_content.values())) 81 | return Document( 82 | name=file_name, 83 | ext=file_ext, 84 | content=content, 85 | divided_content=divided_content, 86 | ) 87 | -------------------------------------------------------------------------------- /docparser/xml_parser.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This module is a single class, namely :class:`XMLParser`, which 3 | represents an XML parser that extracts text from different XML 4 | nodes. 5 | 6 | Classes & methods 7 | ----------------- 8 | 9 | Below is listed the class within :py:mod:`docparser.xml_parser` 10 | along with possessed methods. 11 | """ 12 | 13 | 14 | import re 15 | import xml.etree.ElementTree as ETree 16 | from typing import Dict, List 17 | from zipfile import ZipFile 18 | 19 | import docparser.constants as CS 20 | from docparser.enums import LayoutEnum, TagEnum 21 | from docparser.exceptions import InvalidArgumentTypeError 22 | 23 | 24 | XML_Type = Dict[str, bytes | List[bytes]] 25 | 26 | 27 | class XMLParser: 28 | """Docparser `XMLParser` class that parses the input zip file 29 | using the python package `xml`. 30 | 31 | Args: 32 | input_file (ZipFile): Zip file. 33 | """ 34 | 35 | def __init__(self, input_file: ZipFile) -> None: 36 | """Docparser `XMLParser` class that parses the input zip file 37 | using the python package `xml`. 38 | 39 | Args: 40 | input_file (ZipFile): Zip file. 41 | """ 42 | self.__check(input_file) 43 | self.__zip_file = input_file 44 | self.__name_list = self.__zip_file.namelist() 45 | 46 | def __check(self, input_file: ZipFile) -> None: 47 | """Check the input arguments of the class constructor for invalid 48 | types or values. 49 | 50 | Args: 51 | input_file (ZipFile): Zip file. 52 | 53 | Raises: 54 | InvalidArgumentTypeError: Thrown if the input file is not an 55 | instance of ZipFile. 56 | """ 57 | if not isinstance(input_file, ZipFile): 58 | raise InvalidArgumentTypeError("input file must of type ZipFile.") 59 | 60 | def extract_text(self) -> Dict[str, str]: 61 | """Extract text from the zip file using XML. 62 | 63 | Returns: 64 | Dict[str, str]: A dictionary containing the document 65 | XML parts [head, body, footer] and their text. 66 | """ 67 | doc_text: Dict[str, str] = {} 68 | xml_components = self.to_xml() 69 | for part_name, content in xml_components.items(): 70 | doc_text[part_name] = "" 71 | if isinstance(content, list): 72 | for sub_content in content: 73 | doc_text[part_name] += self.xml2text(sub_content) 74 | else: 75 | doc_text[part_name] += self.xml2text(content) 76 | return doc_text 77 | 78 | def xml2text(self, xml_part: bytes) -> str: 79 | """Extract text from xml component nodes. 80 | 81 | Args: 82 | xml_part (bytes): XML component. 83 | 84 | Returns: 85 | str: The extracted text. 86 | """ 87 | text = "" 88 | root = ETree.fromstring(xml_part) 89 | for child in root.iter(): 90 | if child.tag == TagEnum.SPACE: 91 | text += child.text if child.text is not None else "" 92 | elif child.tag == TagEnum.TAB: 93 | text += LayoutEnum.TAB 94 | elif child.tag in ( 95 | TagEnum.BREAK_LINE, 96 | TagEnum.CARRIAGE_RETURN, 97 | ): 98 | text += LayoutEnum.BREAK_LINE 99 | elif child.tag == TagEnum.PARAGRAPH: 100 | text += LayoutEnum.MAJ_BREAK_LINE 101 | return text 102 | 103 | def to_xml(self) -> XML_Type: 104 | """Convert a zip file to XML components header, body and footer. 105 | 106 | Returns: 107 | XML_Type: Dictionary containing 108 | the components content. 109 | """ 110 | xml_parts: XML_Type = { 111 | "header": self.get_xml_part_by_pattern(CS.XML_HEADER), 112 | "body": self.__zip_file.read(CS.XML_BODY), 113 | "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER), 114 | } 115 | return xml_parts 116 | 117 | def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]: 118 | """Get all XML component parts based on the input `pattern`. 119 | 120 | Args: 121 | pattern (str): The pattern of the component. 122 | 123 | Returns: 124 | List[bytes]: List of the components parts. 125 | """ 126 | xml_part: List[bytes] = [] 127 | for file_name in self.__name_list: 128 | if re.match(pattern, file_name): 129 | xml_part.append(self.__zip_file.read(file_name)) 130 | return xml_part 131 | --------------------------------------------------------------------------------