├── requirements.txt
├── tests
    ├── __init__.py
    ├── data
    │   └── docx_example.docx
    ├── test_doc_parser.py
    ├── test_utils.py
    ├── test_reader.py
    ├── test_parser.py
    └── test_xml_parser.py
├── .coveragerc
├── requirements-dev.txt
├── docparser
    ├── constants.py
    ├── document.py
    ├── enums.py
    ├── __init__.py
    ├── exceptions.py
    ├── utils.py
    ├── reader.py
    ├── parser.py
    └── xml_parser.py
├── README.md
├── .github
    └── workflows
    │   ├── pypi-publish.yml
    │   └── test.yml
├── LICENCE.txt
├── setup.py
└── .gitignore


/requirements.txt:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = tests/*


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pylint
2 | black
3 | mypy
4 | coverage


--------------------------------------------------------------------------------
/tests/data/docx_example.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/has-abi/docparser/HEAD/tests/data/docx_example.docx


--------------------------------------------------------------------------------
/docparser/constants.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This module contains the package constants
 3 | """
 4 | 
 5 | 
# File extension (without a leading dot) of the supported document type.
DOCX_EXT = "docx"
# Extensions accepted by the package; the reader rejects anything else.
ALLOWED_EXTS = [DOCX_EXT]
# Names of the archive members that hold each document part.
# NOTE(review): the header/footer values look like regular expressions
# (character class plus an unescaped ".") — confirm against the XML parser
# that consumes them.
XML_BODY = "word/document.xml"
XML_HEADER = "word/header[0-9]*.xml"
XML_FOOTER = "word/footer[0-9]*.xml"
# Namespace in "{uri}" form; the enums module prepends it to tag names.
NSMAP = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
12 | 


--------------------------------------------------------------------------------
/docparser/document.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This is dataclass module, namely :class:`Document`, which
 3 | holds the parsed data from a document.
 4 | 
 5 | Classes & methods
 6 | -----------------
 7 | 
 8 | Below is listed the class within :py:mod:`docparser.document`
 9 | along with possessed methods.
10 | """
11 | 
12 | 
13 | from dataclasses import dataclass
14 | from typing import Dict
15 | 
16 | 
@dataclass
class Document:
    """Container for the data extracted from a parsed document."""

    # File name of the source document.
    name: str
    # File extension of the source document.
    ext: str
    # The whole extracted text as a single string.
    content: str
    # Extracted text keyed by document part.
    divided_content: Dict[str, str]
23 | 


--------------------------------------------------------------------------------
/tests/test_doc_parser.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from pathlib import Path
 3 | 
 4 | from docparser import parse
 5 | 
 6 | DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx"
 7 | 
 8 | 
 9 | class TestDocParser(unittest.TestCase):
10 |     def test_parse_docx_file_str(self):
11 |         document = parse(str(DOCX_FILE_PATH))
12 |         self.assertTrue(document.content)
13 | 
14 |     def test_parse_docx_file_binary(self):
15 |         with open(DOCX_FILE_PATH, "rb") as docx_file:
16 |             document = parse(docx_file)
17 |             self.assertTrue(document.content)
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     unittest.main()
22 | 


--------------------------------------------------------------------------------
/docparser/enums.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This module contains the package enums.
 3 | 
 4 | Enums Classes
 5 | -------------
 6 | 
 7 | Below is listed the enums classes within 
 8 | :py:mod:`docparser.enums`
 9 | """
10 | 
11 | 
12 | from enum import Enum
13 | 
14 | import docparser.constants as CS
15 | 
16 | 
class TagEnum(str, Enum):
    """Namespace-qualified XML tag names.

    Each value is ``CS.NSMAP`` followed by the local tag name.
    """

    # NOTE(review): "t" is named SPACE here; presumably it is the text
    # element of a run — confirm against the XML parser.
    SPACE = CS.NSMAP + "t"
    TAB = CS.NSMAP + "tab"
    BREAK_LINE = CS.NSMAP + "br"
    CARRIAGE_RETURN = CS.NSMAP + "cr"
    PARAGRAPH = CS.NSMAP + "p"
23 | 
24 | 
class LayoutEnum(str, Enum):
    """Literal whitespace strings: a tab, one line break, two line breaks."""

    TAB = "\t"
    BREAK_LINE = "\n"
    MAJ_BREAK_LINE = "\n\n"
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Python Version](https://img.shields.io/badge/python-3.7+-blue)](https://www.python.org/downloads/release/python-370/)
 2 | ![Tests](https://github.com/has-abi/docparser/actions/workflows/test.yml/badge.svg)
 3 | [![codecov](https://codecov.io/gh/has-abi/docparser/branch/main/graph/badge.svg?token=4AL385JEH9)](https://codecov.io/gh/has-abi/docparser)
 4 | 
 5 | # What is docparser?
 6 | docparser is a Python package that extracts text from a DOCX document.
 7 | 
 8 | ## Installation
 9 | 
10 | ```bash
11 | pip install python-docparser
12 | ```
13 | 
14 | ## Usage
15 | 
16 | ```python
17 | from docparser import parse 
18 | 
19 | document = parse("your_docx_document")
20 | print(document.content)
21 | ```
22 | 


--------------------------------------------------------------------------------
/docparser/__init__.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This is the package entrypoint which exposes the
 3 | `parse` method that handles the parsing process.
 4 | """
 5 | 
 6 | 
 7 | from io import BufferedReader
 8 | 
 9 | from docparser.document import Document
10 | from docparser.parser import Parser
11 | from docparser.reader import Reader
12 | from docparser.utils import get_file_name_and_ext
13 | from docparser.xml_parser import XMLParser
14 | 
15 | 
def parse(input_file: str | BufferedReader) -> Document:
    """Parse a DOCX file and return its extracted content.

    Args:
        input_file (str | BufferedReader): Path to a DOCX file or an
            already opened binary file.

    Returns:
        Document: The parsed document.
    """
    file_name, file_ext = get_file_name_and_ext(input_file)
    reader = Reader(input_file, file_ext)
    try:
        file_parser = XMLParser(reader.zip_file)
        parser = Parser(file_parser, file_ext, file_name)
    finally:
        # Close the archive even when parsing raises, so the handle opened
        # by the reader is never leaked.
        reader.zip_file.close()
    return parser.document
23 | 


--------------------------------------------------------------------------------
/.github/workflows/pypi-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish to PyPI
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |      - '*'
 7 | 
 8 | jobs:
 9 |   deploy:
10 | 
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |     - uses: actions/checkout@v3
15 |     
16 |     - name: Set up Python
17 |       uses: actions/setup-python@v3
18 |       with:
19 |         python-version: '3.9'
20 |         
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install build
25 |         
26 |     - name: Build dist
27 |       run: |
28 |           python -m build --outdir dist/
29 |       
30 |     - name: Publish package
31 |       uses: pypa/gh-action-pypi-publish@release/v1
32 |       with:
33 |         password: ${{ secrets.PYPI_API_TOKEN }}
34 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from docparser import utils
 4 | 
 5 | FILE_NAME_NO_PATH = "docx_example.docx"
 6 | FILE_NAME_WITH_PATH = "/tests/docx_example.docx"
 7 | 
 8 | 
 9 | class TestDocParser(unittest.TestCase):
10 |     def test_get_file_name_no_path(self):
11 |         self.assertEqual(
12 |             utils.get_file_name(FILE_NAME_NO_PATH),
13 |             FILE_NAME_NO_PATH,
14 |         )
15 | 
16 |     def test_get_file_name_with_path(self):
17 |         self.assertEqual(
18 |             utils.get_file_name(FILE_NAME_WITH_PATH),
19 |             FILE_NAME_NO_PATH,
20 |         )
21 | 
22 |     def test_get_file_name_and_ext(self):
23 |         self.assertEqual(
24 |             utils.get_file_name_and_ext(FILE_NAME_WITH_PATH),
25 |             (FILE_NAME_NO_PATH, "docx"),
26 |         )
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     unittest.main()
31 | 


--------------------------------------------------------------------------------
/docparser/exceptions.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This module contains the package exceptions.
 3 | 
 4 | Exceptions Classes
 5 | ------------------
 6 | 
 7 | Below is listed the exceptions classes within 
 8 | :py:mod:`docparser.exceptions`
 9 | """
10 | 
11 | 
class InvalidArgumentTypeError(Exception):
    """Raised when an argument has an invalid type.

    Args:
        message (str): Human-readable description of the problem.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)
15 | 
16 | 
class UnsupportedFileFormatError(Exception):
    """Raised when a file's format is not supported by the package.

    Args:
        file_format (str): The rejected file format (extension).
    """

    def __init__(self, file_format: str) -> None:
        # Only docx is accepted (see ALLOWED_EXTS in the constants module),
        # so the message must not advertise doc support.
        super().__init__(
            f"{file_format} is not supported. The only supported format is docx."
        )
22 | 
23 | 
class MissingAttributeError(Exception):
    """Raised when an object lacks a required attribute.

    Args:
        message (str): Human-readable description of the problem.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)
27 | 
28 | 
class InvalidReturnValueError(Exception):
    """Raised when a callable returns a value of an unexpected kind.

    Args:
        message (str): Human-readable description of the problem.
    """

    def __init__(self, message: str) -> None:
        super().__init__(message)
32 | 


--------------------------------------------------------------------------------
/LICENCE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Hassane Abida
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/tests/test_reader.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from pathlib import Path
 3 | from zipfile import ZipFile
 4 | 
 5 | from docparser.exceptions import (
 6 |     InvalidArgumentTypeError,
 7 |     UnsupportedFileFormatError,
 8 | )
 9 | from docparser.reader import Reader
10 | 
11 | DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx"
12 | 
13 | 
class TestReader(unittest.TestCase):
    """Checks for ``Reader`` argument validation and zip conversion."""

    def test_read_empty_file(self):
        # A non-str, non-binary input must be rejected.
        with self.assertRaises(InvalidArgumentTypeError):
            Reader(input_file=None, file_ext="")  # type: ignore

    def test_read_unsupported_file_type(self):
        # Extensions outside the allowed list must be rejected.
        with self.assertRaises(UnsupportedFileFormatError):
            Reader(input_file="file_example.pdf", file_ext="pdf")

    def test_read_missing_file(self):
        # A path that does not exist on disk must be rejected.
        with self.assertRaises(FileNotFoundError):
            Reader(input_file="missing_file.docx", file_ext="docx")

    def test_to_zip_str_file(self):
        test_reader = Reader(input_file=str(DOCX_FILE_PATH), file_ext="docx")
        # The constructor already opened one archive and to_zip() opens a
        # second; register both for closing so no handle is leaked.
        self.addCleanup(test_reader.zip_file.close)
        zip_file = test_reader.to_zip()
        self.addCleanup(zip_file.close)
        self.assertIsInstance(zip_file, ZipFile)
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     unittest.main()
35 | 


--------------------------------------------------------------------------------
/docparser/utils.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This module contains the package utils.
 3 | 
 4 | Utils methods
 5 | -------------
 6 | 
 7 | Below is listed the package util methods within
 8 | :py:mod:`docparser.utils`
 9 | """
10 | 
11 | 
12 | import os
13 | from io import BufferedReader
14 | from typing import Tuple
15 | 
16 | 
def get_file_name_and_ext(file_or_filepath: str | BufferedReader) -> Tuple[str, str]:
    """Extract the file name and the lower-cased file extension
    from a file or a file path.

    Args:
        file_or_filepath (str | BufferedReader): File or file path.

    Returns:
        Tuple[str, str]: Tuple of file name and file extension. The
            extension is an empty string when the name contains no ".".
    """
    filename = get_file_name(file_or_filepath)
    # rpartition never raises: without a "." the separator comes back empty,
    # whereas the previous rsplit(".", 1)[1] raised IndexError.
    _, dot, ext = filename.rpartition(".")
    if not dot:
        return filename, ""
    return filename, ext.lower()
30 | 
31 | 
32 | def get_file_name(file_or_filepath: str | BufferedReader) -> str:
33 |     """Extract the file name form a file or a file path.
34 | 
35 |     Args:
36 |         file_or_filepath (str | BufferedReader): File or a file path.
37 | 
38 |     Returns:
39 |         str: The extracted file name.
40 |     """
41 |     if isinstance(file_or_filepath, BufferedReader):
42 |         return file_or_filepath.name
43 |     return os.path.basename(file_or_filepath)
44 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from setuptools import setup
 4 | 
 5 | CURRENT = os.path.abspath(os.path.dirname(__file__))
 6 | 
 7 | 
 8 | def _open(subpath):
 9 |     path = os.path.join(CURRENT, subpath)
10 |     return open(path, encoding="utf-8", errors="ignore")
11 | 
12 | 
# Keep only non-blank lines: requirements.txt is currently empty, and
# "".split("\n") would otherwise produce a single empty requirement string.
with _open("requirements.txt") as file:
    base_reqs = [line.strip() for line in file if line.strip()]

with _open("requirements-dev.txt") as file:
    dev_reqs = [line.strip() for line in file if line.strip()]

# The README doubles as the long description shown on the package index.
with _open("README.md") as f:
    readme = f.read()
21 | 
setup(
    name="python-docparser",
    version="1.1.0",
    author="Hassane Abida",
    author_email="abidahass.uca@gmail.com",
    url="https://github.com/has-abi/docparser",
    description="Extract text from your docx document.",
    long_description=readme,
    long_description_content_type="text/markdown",
    # The package evaluates ``X | Y`` union annotations at import time
    # (e.g. ``str | BufferedReader``), which raises TypeError before
    # Python 3.10, so older interpreters must not be allowed to install it.
    python_requires=">=3.10",
    tests_require=base_reqs + dev_reqs,
    install_requires=base_reqs,
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
)
44 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on: [push]
 4 | 
 5 | jobs:
 6 |   lint:
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     steps:
10 |     - uses: actions/checkout@v3
11 | 
12 |     - name: Set up Python 3.10
13 |       uses: actions/setup-python@v3
14 |       with:
15 |         python-version: "3.10"
16 |     
17 |     - name: Install Python dependencies
18 |       run: |
19 |         python -m pip install --upgrade pip
20 |         pip install -r requirements-dev.txt
21 | 
22 |     - name: Validate against psf/black
23 |       run: python -m black --check docparser tests
24 |       
25 |     - name: Check type annotations via mypy
26 |       run: python -m mypy --strict docparser
27 |     
28 |   test:
29 |     needs: lint
30 |     runs-on: ubuntu-latest
31 |     strategy:
32 |       fail-fast: false
33 |       matrix:
34 |         python-version: ["3.10", "3.11"]
35 | 
36 |     steps:
37 |     - uses: actions/checkout@v3
38 |     
39 |     - name: Set up Python ${{ matrix.python-version }}
40 |       uses: actions/setup-python@v3
41 |       with:
42 |         python-version: ${{ matrix.python-version }}
43 |         
44 |     - name: Install Python dependencies
45 |       run: |
46 |         python -m pip install --upgrade pip
47 |         pip install -r requirements-dev.txt
48 |         
49 |     - name: Run tests
50 |       run: |
51 |         python -m coverage run -m unittest
52 |         python -m coverage html
53 |         
54 |     - name: Upload code coverage
55 |       uses: codecov/codecov-action@v3
56 |       with:
57 |         token: ${{ secrets.CODECOV_TOKEN }}
58 |     
59 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | 
 27 | # PyInstaller
 28 | #  Usually these files are written by a python script from a template
 29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 30 | *.manifest
 31 | *.spec
 32 | 
 33 | # Installer logs
 34 | pip-log.txt
 35 | pip-delete-this-directory.txt
 36 | 
 37 | # Unit test / coverage reports
 38 | htmlcov/
 39 | .tox/
 40 | .coverage
 41 | .coverage.*
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | *,cover
 46 | .hypothesis/
 47 | 
 48 | # Translations
 49 | *.mo
 50 | *.pot
 51 | 
 52 | # Django stuff:
 53 | *.log
 54 | local_settings.py
 55 | 
 56 | # Flask stuff:
 57 | instance/
 58 | .webassets-cache
 59 | 
 60 | # Scrapy stuff:
 61 | .scrapy
 62 | 
 63 | # Sphinx documentation
 64 | docs/_build/
 65 | docs/build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # IPython Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # dotenv
 80 | .env
 81 | 
 82 | # virtualenv
 83 | venv/
 84 | ENV/
 85 | 
 86 | 
 87 | # Spyder project settings
 88 | .spyderproject
 89 | 
 90 | # Rope project settings
 91 | .ropeproject
 92 | 
 93 | # vscode cache
 94 | .vscode
 95 | .pytest_cache
 96 | 
 97 | # html coverage
 98 | htmlcov/
 99 | 
100 | # Pycharm
101 | .idea


--------------------------------------------------------------------------------
/tests/test_parser.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from unittest.mock import Mock
 3 | 
 4 | from docparser.document import Document
 5 | from docparser.exceptions import InvalidReturnValueError, MissingAttributeError
 6 | from docparser.parser import Parser
 7 | 
 8 | 
 9 | class TestParser(unittest.TestCase):
10 |     @classmethod
11 |     def setUpClass(cls) -> None:
12 |         file_parser = Mock()
13 |         file_parser.extract_text = Mock(
14 |             return_value={
15 |                 "header": "xml header text",
16 |                 "body": "xml body text",
17 |                 "footer": "xml footer text",
18 |             }
19 |         )
20 |         cls.parser = Parser(
21 |             file_parser=file_parser,
22 |             file_ext="docx",
23 |             file_name="file_name_example.docx",
24 |         )
25 | 
26 |     def test_parser_with_invalid_file_parser(self):
27 |         test_file_parser = ""
28 |         with self.assertRaises(MissingAttributeError):
29 |             Parser(file_parser=test_file_parser, file_ext="", file_name="")
30 | 
31 |     def test_invalid_file_parser_extract_text_callable_return(self):
32 |         test_file_parser = Mock()
33 |         test_file_parser.extract_text = Mock(return_value=["list item"])
34 |         with self.assertRaises(InvalidReturnValueError):
35 |             Parser(file_parser=test_file_parser, file_ext="", file_name="")
36 | 
37 |     def test_get_document(self):
38 |         result_document = __class__.parser.document
39 |         self.assertTrue(isinstance(result_document, Document))
40 |         self.assertEqual(result_document.name, "file_name_example.docx")
41 |         self.assertEqual(result_document.ext, "docx")
42 |         self.assertTrue(isinstance(result_document.divided_content, dict))
43 |         self.assertListEqual(
44 |             list(result_document.divided_content.keys()), ["header", "body", "footer"]
45 |         )
46 |         self.assertListEqual(
47 |             list(result_document.divided_content.values()),
48 |             ["xml header text", "xml body text", "xml footer text"],
49 |         )
50 |         self.assertEqual(
51 |             result_document.content, "xml header text xml body text xml footer text"
52 |         )
53 | 
54 | 
55 | if __name__ == "__main__":
56 |     unittest.main()
57 | 


--------------------------------------------------------------------------------
/tests/test_xml_parser.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from pathlib import Path
 3 | from zipfile import ZipFile
 4 | 
 5 | from docparser.exceptions import InvalidArgumentTypeError
 6 | from docparser.xml_parser import XMLParser
 7 | 
 8 | DOCX_FILE_PATH = Path(__file__).parent / "data" / "docx_example.docx"
 9 | 
10 | XML_BODY = "word/document.xml"
11 | XML_HEADER = "word/header[0-9]*.xml"
12 | XML_FOOTER = "word/footer[0-9]*.xml"
13 | 
14 | 
class TestXMLParser(unittest.TestCase):
    """Checks for ``XMLParser`` against the sample DOCX archive."""

    @classmethod
    def setUpClass(cls) -> None:
        # One archive and one parser are shared by all tests in the class.
        cls.zip_file = ZipFile(DOCX_FILE_PATH)
        cls.xml_parser = XMLParser(cls.zip_file)

    @classmethod
    def tearDownClass(cls) -> None:
        cls.zip_file.close()

    def test_invalid_input_file(self):
        with self.assertRaises(InvalidArgumentTypeError):
            XMLParser(input_file="")  # type: ignore

    def test_get_xml_part_by_pattern_header(self) -> None:
        test_result = self.xml_parser.get_xml_part_by_pattern(XML_HEADER)
        self.assertIsInstance(test_result, list)
        self.assertTrue(all(isinstance(result, bytes) for result in test_result))

    def test_get_xml_part_by_pattern_body(self) -> None:
        test_result = self.xml_parser.get_xml_part_by_pattern(XML_BODY)
        self.assertIsInstance(test_result, list)
        self.assertTrue(all(isinstance(result, bytes) for result in test_result))

    def test_get_xml_part_by_pattern_footer(self) -> None:
        test_result = self.xml_parser.get_xml_part_by_pattern(XML_FOOTER)
        self.assertIsInstance(test_result, list)
        self.assertTrue(all(isinstance(result, bytes) for result in test_result))

    def test_to_xml(self) -> None:
        test_xml_components = self.xml_parser.to_xml()
        self.assertListEqual(
            ["header", "body", "footer"], list(test_xml_components.keys())
        )

    def test_xml2text(self) -> None:
        test_xml_body = self.xml_parser.get_xml_part_by_pattern(XML_BODY)
        text_result = " ".join(
            [self.xml_parser.xml2text(part) for part in test_xml_body]
        )
        self.assertTrue(len(text_result) > 0)

    def test_extract_text(self) -> None:
        doc_content = self.xml_parser.extract_text()
        self.assertIsInstance(doc_content, dict)
        self.assertListEqual(["header", "body", "footer"], list(doc_content.keys()))
        # all() is required here: a bare generator object is always truthy,
        # so passing it straight to assertTrue checked nothing.
        self.assertTrue(
            all(
                isinstance(component_content, str) and len(component_content) > 0
                for component_content in doc_content.values()
            )
        )
67 | 
68 | 
69 | if __name__ == "__main__":
70 |     unittest.main()
71 | 


--------------------------------------------------------------------------------
/docparser/reader.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This module is a single class, namely :class:`Reader`, which
 3 | handles the file reading as a zip file.
 4 | 
 5 | Classes & methods
 6 | -----------------
 7 | 
 8 | Below is listed the class within :py:mod:`docparser.reader`
 9 | along with possessed methods.
10 | """
11 | 
12 | 
13 | import os
14 | from io import BufferedReader
15 | from zipfile import ZipFile
16 | 
17 | import docparser.constants as CS
18 | from docparser.exceptions import (
19 |     InvalidArgumentTypeError,
20 |     UnsupportedFileFormatError,
21 | )
22 | 
23 | 
class Reader:
    """Open a docx input as a zip archive after validating the arguments.

    Args:
        input_file (str | BufferedReader): A file path or an opened
            binary file.
        file_ext (str): Extension of the input file.
    """

    def __init__(self, input_file: str | BufferedReader, file_ext: str) -> None:
        """Validate the arguments, then open the input as a zip archive.

        Args:
            input_file (str | BufferedReader): A file path or an opened
                binary file.
            file_ext (str): Extension of the input file.
        """
        self.__check(input_file, file_ext)
        self.input_file = input_file
        self.zip_file = self.to_zip()

    def __check(self, input_file: str | BufferedReader, file_ext: str) -> None:
        """Reject constructor arguments with invalid types or values.

        The checks run in this order: input type, extension, file existence.

        Args:
            input_file (str | BufferedReader): A file path or an opened
                binary file.
            file_ext (str): Extension of the input file.

        Raises:
            InvalidArgumentTypeError: If ``input_file`` is neither a str nor
                a BufferedReader.
            UnsupportedFileFormatError: If ``file_ext`` is not in the allowed
                extensions.
            FileNotFoundError: If ``input_file`` is a path that is not an
                existing file.
        """
        type_ok = isinstance(input_file, (str, BufferedReader))
        if not type_ok:
            raise InvalidArgumentTypeError(
                "input_file must be a file path or a binary file."
            )

        ext_ok = file_ext in CS.ALLOWED_EXTS
        if not ext_ok:
            raise UnsupportedFileFormatError(file_ext)

        # Only string inputs are paths; opened files need no existence check.
        if isinstance(input_file, str):
            if not os.path.isfile(input_file):
                raise FileNotFoundError(f"File not found: {input_file}")

    def to_zip(self) -> ZipFile:
        """Open the stored input as a new zip archive.

        Returns:
            ZipFile: The archive opened on ``self.input_file``.
        """
        return ZipFile(self.input_file)
81 | 


--------------------------------------------------------------------------------
/docparser/parser.py:
--------------------------------------------------------------------------------
 1 | __doc__ = """
 2 | This module is a single class, namely :class:`Parser`, which
 3 | is the end class that handles getting the parsing results from
 4 | a file parser.
 5 | 
 6 | Classes & methods
 7 | -----------------
 8 | 
 9 | Below is listed the class within :py:mod:`docparser.parser`
10 | along with possessed methods.
11 | """
12 | 
13 | 
14 | from typing import Any
15 | 
16 | from docparser.document import Document
17 | from docparser.exceptions import InvalidReturnValueError, MissingAttributeError
18 | 
19 | 
class Parser:
    """Build a `Document` from the output of a file parser.

    Args:
        file_parser (Any): Object exposing a callable `extract_text` that
            returns the parsed content as a dictionary.
        file_ext (str): Extension of the original file.
        file_name (str): Name of the original file.
    """

    def __init__(self, file_parser: Any, file_ext: str, file_name: str) -> None:
        """Validate the file parser and build the document right away.

        Args:
            file_parser (Any): Object exposing a callable `extract_text` that
                returns the parsed content as a dictionary.
            file_ext (str): Extension of the original file.
            file_name (str): Name of the original file.
        """
        self.__check(file_parser)
        self.file_parser = file_parser
        self.document = self.get_document(file_ext, file_name)

    def __check(self, file_parser: Any) -> None:
        """Ensure `file_parser` exposes a callable named `extract_text`.

        Args:
            file_parser (Any): The file parser to validate.

        Raises:
            MissingAttributeError: If `extract_text` is missing or is not
                callable.
        """
        if not hasattr(file_parser, "extract_text") or not callable(
            file_parser.extract_text
        ):
            raise MissingAttributeError(
                "Missing callable extract_text() from file_parser instance."
            )

    def get_document(self, file_ext: str, file_name: str) -> Document:
        """Run the file parser and wrap its result in a `Document`.

        Args:
            file_ext (str): Extension of the original file.
            file_name (str): Name of the original file.

        Raises:
            InvalidReturnValueError: If `extract_text` returns something
                other than a dict.

        Returns:
            Document: The parsed results; `content` is the dict values
                joined by single spaces.
        """
        sections = self.file_parser.extract_text()
        if isinstance(sections, dict):
            joined_text = " ".join(sections.values())
            return Document(
                name=file_name,
                ext=file_ext,
                content=joined_text,
                divided_content=sections,
            )
        raise InvalidReturnValueError(
            "The file parser extract_text callable return value must be a dict"
        )
87 | 


--------------------------------------------------------------------------------
/docparser/xml_parser.py:
--------------------------------------------------------------------------------
  1 | __doc__ = """
  2 | This module is a single class, namely :class:`XMLParser`, which
  3 | represents an XML parser that extracts text from different XML
  4 | nodes.
  5 | 
  6 | Classes & methods
  7 | -----------------
  8 | 
  9 | Below is listed the class within :py:mod:`docparser.xml_parser`
 10 | along with possessed methods.
 11 | """
 12 | 
 13 | 
 14 | import re
 15 | import xml.etree.ElementTree as ETree
 16 | from typing import Dict, List
 17 | from zipfile import ZipFile
 18 | 
 19 | import docparser.constants as CS
 20 | from docparser.enums import LayoutEnum, TagEnum
 21 | from docparser.exceptions import InvalidArgumentTypeError
 22 | 
 23 | 
 24 | XML_Type = Dict[str, bytes | List[bytes]]  # bytes: body; List[bytes]: header/footer
 25 | 
 26 | 
 27 | class XMLParser:
 28 |     """Docparser `XMLParser` class that parses the input zip file
 29 |     using the python package `xml`.
 30 | 
 31 |     Args:
 32 |         input_file (ZipFile): Zip file.
 33 |     """
 34 | 
 35 |     def __init__(self, input_file: ZipFile) -> None:
 36 |         """Docparser `XMLParser` class that parses the input zip file
 37 |         using the python package `xml`.
 38 | 
 39 |         Args:
 40 |             input_file (ZipFile): Zip file.
 41 |         """
 42 |         self.__check(input_file)
 43 |         self.__zip_file = input_file
 44 |         self.__name_list = self.__zip_file.namelist()
 45 | 
 46 |     def __check(self, input_file: ZipFile) -> None:
 47 |         """Check the input arguments of the class constructor for invalid
 48 |         types or values.
 49 | 
 50 |         Args:
 51 |             input_file (ZipFile): Zip file.
 52 | 
 53 |         Raises:
 54 |             InvalidArgumentTypeError: Thrown if the input file is not an
 55 |                 instance of ZipFile.
 56 |         """
 57 |         if not isinstance(input_file, ZipFile):
 58 |             raise InvalidArgumentTypeError("input file must of type ZipFile.")
 59 | 
 60 |     def extract_text(self) -> Dict[str, str]:
 61 |         """Extract text from the zip file using XML.
 62 | 
 63 |         Returns:
 64 |             Dict[str, str]: A dictionary containing the document
 65 |                 XML parts [head, body, footer] and their text.
 66 |         """
 67 |         doc_text: Dict[str, str] = {}
 68 |         xml_components = self.to_xml()
 69 |         for part_name, content in xml_components.items():
 70 |             doc_text[part_name] = ""
 71 |             if isinstance(content, list):
 72 |                 for sub_content in content:
 73 |                     doc_text[part_name] += self.xml2text(sub_content)
 74 |             else:
 75 |                 doc_text[part_name] += self.xml2text(content)
 76 |         return doc_text
 77 | 
 78 |     def xml2text(self, xml_part: bytes) -> str:
 79 |         """Extract text from xml component nodes.
 80 | 
 81 |         Args:
 82 |             xml_part (bytes): XML component.
 83 | 
 84 |         Returns:
 85 |             str: The extracted text.
 86 |         """
 87 |         text = ""
 88 |         root = ETree.fromstring(xml_part)
 89 |         for child in root.iter():
 90 |             if child.tag == TagEnum.SPACE:
 91 |                 text += child.text if child.text is not None else ""
 92 |             elif child.tag == TagEnum.TAB:
 93 |                 text += LayoutEnum.TAB
 94 |             elif child.tag in (
 95 |                 TagEnum.BREAK_LINE,
 96 |                 TagEnum.CARRIAGE_RETURN,
 97 |             ):
 98 |                 text += LayoutEnum.BREAK_LINE
 99 |             elif child.tag == TagEnum.PARAGRAPH:
100 |                 text += LayoutEnum.MAJ_BREAK_LINE
101 |         return text
102 | 
103 |     def to_xml(self) -> XML_Type:
104 |         """Convert a zip file to XML components header, body and footer.
105 | 
106 |         Returns:
107 |             XML_Type: Dictionary containing
108 |                 the components content.
109 |         """
110 |         xml_parts: XML_Type = {
111 |             "header": self.get_xml_part_by_pattern(CS.XML_HEADER),
112 |             "body": self.__zip_file.read(CS.XML_BODY),
113 |             "footer": self.get_xml_part_by_pattern(CS.XML_FOOTER),
114 |         }
115 |         return xml_parts
116 | 
117 |     def get_xml_part_by_pattern(self, pattern: str) -> List[bytes]:
118 |         """Get all XML component parts based on the input `pattern`.
119 | 
120 |         Args:
121 |             pattern (str): The pattern of the component.
122 | 
123 |         Returns:
124 |             List[bytes]: List of the components parts.
125 |         """
126 |         xml_part: List[bytes] = []
127 |         for file_name in self.__name_list:
128 |             if re.match(pattern, file_name):
129 |                 xml_part.append(self.__zip_file.read(file_name))
130 |         return xml_part
131 | 


--------------------------------------------------------------------------------