├── .gitattributes ├── src └── rtfparse │ ├── __about__.py │ ├── renderers │ ├── __init__.py │ └── html_decapsulator.py │ ├── __init__.py │ ├── enums.py │ ├── minimal.py │ ├── utils.py │ ├── parser.py │ ├── logging_conf.py │ ├── re_patterns.py │ ├── cli.py │ └── entities.py ├── changelog.d └── changelog_template.jinja ├── LICENSE ├── ROADMAP.md ├── .gitignore ├── CHANGELOG.md ├── README.md └── pyproject.toml /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /src/rtfparse/__about__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | __version__ = "0.9.5" 5 | -------------------------------------------------------------------------------- /src/rtfparse/renderers/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | class Renderer: 5 | pass 6 | 7 | 8 | if __name__ == "__main__": 9 | pass 10 | -------------------------------------------------------------------------------- /src/rtfparse/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | # Towncrier needs version 5 | # from rtfparse.__about__ import __version__ 6 | __all__ = ["rtfparse.__about__.__version__"] 7 | 8 | if __name__ == "__main__": 9 | from rtfparse.cli import main 10 | 11 | main() 12 | -------------------------------------------------------------------------------- /src/rtfparse/enums.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | from enum import Enum, auto, unique 5 | 6 | 7 | @unique 8 | class Bytestring_Type(Enum): 9 | GROUP_START = auto() 10 | GROUP_END = auto() 11 | CONTROL_WORD = auto() 12 | CONTROL_SYMBOL = auto() 13 | PLAIN_TEXT = auto() 14 | 15 | 16 | 
if __name__ == "__main__": 17 | pass 18 | -------------------------------------------------------------------------------- /changelog.d/changelog_template.jinja: -------------------------------------------------------------------------------- 1 | {% if sections[""] %} 2 | {% for category, val in definitions.items() if category in sections[""] %} 3 | 4 | ### {{ definitions[category]['name'] }} 5 | 6 | {% for text, values in sections[""][category].items() %} 7 | - {{ text }} {{ values|join(', ') }} 8 | {% endfor %} 9 | 10 | {% endfor %} 11 | {% else %} 12 | No significant changes. 13 | 14 | 15 | {% endif %} 16 | -------------------------------------------------------------------------------- /src/rtfparse/minimal.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | """ 5 | A minimal example for a programatic use of the rtf parser and renderer 6 | """ 7 | 8 | from pathlib import Path 9 | 10 | from rtfparse.parser import Rtf_Parser 11 | from rtfparse.renderers.html_decapsulator import HTML_Decapsulator 12 | 13 | source_path = Path(r"D:\trace\Pre-Integration test report of carapp_orureleasenotes_1_22_104 Webapps on ID_S 5_0.rtf") 14 | target_path = Path(r"D:\trace\Pre-Integration test report of carapp_orureleasenotes_1_22_104 Webapps on ID_S 5_0.html") 15 | # Create parent directory of `target_path` if it does not already exist: 16 | target_path.parent.mkdir(parents=True, exist_ok=True) 17 | 18 | parser = Rtf_Parser(rtf_path=source_path) 19 | parsed = parser.parse_file() 20 | 21 | renderer = HTML_Decapsulator() 22 | 23 | with open(target_path, mode="w", encoding="utf-8") as html_file: 24 | renderer.render(parsed, html_file) 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Sven Siegmund 4 | 5 | Permission is hereby granted, free of 
charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | # Roadmap for rtfparse 2 | 3 | - Rework the CLI. The original reason I wrote rtfparse was to decapsulate HTML from MS Outlook email files. Much of the current CLI serves the purpose of extracting the email body and attachments. This introduced dependency with non-free license (yes, I consider GPL non-free) so that rtfparse currently has a license conflict. By modifying the CLI such that it expects an RTF file (rather than Outlook's .msg file) we shall get rid of that conflict. For extracting content out of Outlook messages, [msg-extractor][msg-extractor]'s own CLI shall be used in a separate step. 
# Setup logging
logger = logging.getLogger(__name__)


program_name = home_dir_name = "rtfparse"
dir_name = "".join((".", program_name))
configuration_file_name = f"{program_name}_configuration.ini"


def provide_dir(directory: pathlib.Path) -> pathlib.Path:
    """
    Makes sure that `directory` exists, creating it together with any
    missing parent directories.

    Returns `directory` itself so that the call can be used inline.
    If the path already exists but is not a directory, this is logged as
    a warning and the path is returned anyway; no exception is raised for
    that case.
    """
    if directory.is_dir():
        logger.debug(f"Found directory {str(directory)}")
        return directory
    try:
        # `parents=True` replaces the former retry loop which created the
        # parents one recursive call at a time.
        directory.mkdir(parents=True)
        logger.info(f"Created directory {str(directory)}")
    except FileExistsError:
        if directory.is_dir():
            # Somebody else created it between the check above and `mkdir`
            logger.debug(f"{directory} already exists")
        else:
            logger.warning(warn(f"{directory} already exists but is not a directory"))
    return directory


def warn(s: str) -> str:
    """
    Creates a string highlighted as warning in log output
    """
    return f"◊ {s}"


def what_is_being_parsed(file: Union[io.BufferedReader, io.BytesIO]) -> str:
    """
    Returns a printable description of the binary source being parsed.

    A buffered reader which knows its file name is described by that name,
    anything else (in-memory buffers, nameless readers, other file-like
    objects) by its `repr`. The result is always a string.
    """
    name = getattr(file, "name", None)
    if isinstance(file, io.BufferedReader) and name is not None:
        # `name` may be an integer file descriptor, hence the `str` call
        return str(name)
    return repr(file)


def twos_complement(val: int, nbits: int) -> int:
    """
    Compute the 2's complement of int value val, i.e. interpret `val` as a
    signed integer which is `nbits` wide.
    Credit: https://stackoverflow.com/a/37075643/9235421

    Raises ValueError if `nbits` is smaller than 1 or if `val` does not fit
    into `nbits` bits.
    """
    if nbits < 1:
        raise ValueError(f"Bit width must be at least 1, got {nbits}.")
    if val < 0:
        if (val + 1).bit_length() >= nbits:
            raise ValueError(f"Value {val} is out of range of {nbits}-bit value.")
        val = (1 << nbits) + val
    elif val.bit_length() > nbits:
        raise ValueError(f"Value {val} is out of range of {nbits}-bit value.")
    # If sign bit is set, compute negative value.
    if val & (1 << (nbits - 1)):
        val -= 1 << nbits
    return val
105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 111 | __pypackages__/ 112 | 113 | # Celery stuff 114 | celerybeat-schedule 115 | celerybeat.pid 116 | 117 | # SageMath parsed files 118 | *.sage.py 119 | 120 | # Environments 121 | .env 122 | .venv 123 | env/ 124 | venv/ 125 | ENV/ 126 | env.bak/ 127 | venv.bak/ 128 | 129 | # Spyder project settings 130 | .spyderproject 131 | .spyproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # mkdocs documentation 137 | /site 138 | 139 | # mypy 140 | .mypy_cache/ 141 | .dmypy.json 142 | dmypy.json 143 | 144 | # Pyre type checker 145 | .pyre/ 146 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 4 | 5 | ## 0.9.5 (2025-07-08) 6 | 7 | 8 | ### Documentation 9 | 10 | - add MIT header to LICENSE.txt [#46](https://github.com/fleetingbytes/rtfparse/issues/46) 11 | - use MIT SPDX identifier in pyproject.toml, use correct name in LICENSE.txt, update year in LICENSE.txt, rename LICENSE.txt to LICENSE [#47](https://github.com/fleetingbytes/rtfparse/issues/47) 12 | 13 | ## 0.9.4 (2024-11-10) 14 | 15 | 16 | ### Bugfixes 17 | 18 | - add missing import statement in `html_decapsulator.py` [#42](https://github.com/fleetingbytes/rtfparse/issues/42) 19 | 20 | 21 | ### Development Details 22 | 23 | - replace `black` and `isort` with `ruff` [#44](https://github.com/fleetingbytes/rtfparse/issues/44) 24 | 25 | ## 0.9.3 (2024-11-01) 26 | 27 | 28 | ### Bugfixes 29 | 30 | - Fixed double numbering of ordered and unordered lists [#38](https://github.com/fleetingbytes/rtfparse/issues/38) 31 | 32 | ## 0.9.2 
(2024-09-30) 33 | 34 | 35 | ### Bugfixes 36 | 37 | - Fixed `rtfparse --help`, correct entrypoint in `pyproject.toml` [#34](https://github.com/fleetingbytes/rtfparse/issues/34) 38 | 39 | ## 0.9.1 (2024-06-21) 40 | 41 | 42 | ### Documentation 43 | 44 | - Fix old naming in readme [#22](https://github.com/fleetingbytes/rtfparse/issues/22) 45 | - Add example how to programmatically extract HTML from MS Outlook message [#25](https://github.com/fleetingbytes/rtfparse/issues/25) 46 | 47 | 48 | ### Bugfixes 49 | 50 | - Don't setup log if not using the CLI [#24](https://github.com/fleetingbytes/rtfparse/issues/24) 51 | - Fix possible bug in error handling [#26](https://github.com/fleetingbytes/rtfparse/issues/26) 52 | 53 | ## 0.9.0 (2024-03-11) 54 | 55 | 56 | ### Bugfixes 57 | 58 | - Recognize control words with where the parameter's digital sequence is delimited by any character other than an ASCII digit [#18](https://github.com/fleetingbytes/rtfparse/issues/18) 59 | 60 | 61 | ### Development Details 62 | 63 | - Renamed a few things, improved readme [#17](https://github.com/fleetingbytes/rtfparse/issues/17) 64 | 65 | ## 0.8.2 (2024-03-05) 66 | 67 | 68 | ### Documentation 69 | 70 | - Update `README.md`: Create parent directories of `target_path` if they don't already exist. 
class Rtf_Parser:
    """
    Parses an RTF document into a tree of `entities` objects.

    The document is given either as a path (`rtf_path`), in which case
    `parse_file` opens and closes the file itself, or as an already opened
    binary file object (`rtf_file`), which is left open for the caller.
    """

    def __init__(self, rtf_path: Optional[pathlib.Path] = None, rtf_file: Optional[Union[io.BufferedReader, io.BytesIO]] = None) -> None:
        """
        Raises ValueError if neither `rtf_path` nor `rtf_file` is given.
        """
        self.rtf_path = rtf_path
        self.rtf_file = rtf_file
        if not (self.rtf_path or self.rtf_file):
            raise ValueError("Need `rtf_path` or `rtf_file` argument")
        self.ENCODING_PROBE = 48  # look for encoding information in the first 48 bytes of the file

    def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
        """
        Determines the name of the Python codec to decode `file` with.

        Only the first `ENCODING_PROBE` bytes are inspected; the file is
        rewound to its start afterwards. If the probe holds no usable
        character set declaration, "cp1252" is returned.
        """
        probed = file.read(self.ENCODING_PROBE)
        # Rewind immediately so the caller finds the file at its start
        # even if interpreting the probe fails below.
        file.seek(0)
        group = entities.Group("cp1252", io.BytesIO(probed))
        recognized_encodings = ("ansi", "ansicpg", "mac", "pc", "pca")
        # Gather all control words, which could define an encoding:
        names = tuple(item for item in group.structure if isinstance(item, entities.Control_Word) and item.control_name in recognized_encodings)
        # If any of these control words has a parameter, we assume that this is
        # the parameter of \ansicpg, i.e. the code page we are looking for.
        # `param` starts out as None so that it is defined even when nothing was found.
        param = None
        for item in names:
            if item.parameter:
                param = item.parameter
        if param:
            if param == 65001:
                logger.warning("Found encoding '65001', but often this is actually 'cp1252', so I'm taking that")
                return_value = "cp1252"
            else:
                return_value = f"cp{param}"
        else:
            # No code page given, fall back on the character set keyword
            charset = names[0].control_name if names else None
            if charset == "ansi":
                logger.warning("Found encoding 'ansi', but often this is actually 'cp1252', so I'm taking that")
            charset_codecs = {"ansi": "cp1252", "mac": "mac_roman", "pc": "cp437", "pca": "cp850"}
            return_value = charset_codecs.get(charset)
            if return_value is None:
                logger.warning("Found no usable character set declaration, assuming 'cp1252'")
                return_value = "cp1252"
        logger.info(f"recognized encoding {return_value}")
        return return_value

    def parse_file(self) -> entities.Group:
        """
        Parses the document and returns its root group, which is also
        stored in `self.parsed`.

        Errors raised while parsing are logged and not propagated; in that
        case the result is a placeholder object whose `structure` is an
        empty list. A file opened from `rtf_path` is always closed again.
        """
        owns_file = self.rtf_path is not None
        if owns_file:
            file = open(self.rtf_path, mode="rb")
        elif self.rtf_file is not None:
            file = self.rtf_file
        else:
            file = io.BytesIO(b"")
        parsed_object = utils.what_is_being_parsed(file)
        logger.info(f"Parsing the structure of {parsed_object}")
        try:
            encoding = self.read_encoding(file)
            self.parsed = entities.Group(encoding, file)
        except Exception as err:
            # Deliberate best effort: hand back an empty structure rather than crash
            logger.exception(err)
            self.parsed = Namespace(structure=[])
        else:
            logger.info(f"Structure of {parsed_object} parsed")
        finally:
            # Only close what was opened here; a file object passed in by the caller stays open
            if owns_file:
                logger.debug(f"Closing {parsed_object}")
                file.close()
        return self.parsed
class HTML_Decapsulator(Renderer):
    """
    Renderer which writes out the HTML encapsulated in a parsed RTF document.

    Output is suppressed while `ignore_rtf` is set; that flag is switched
    by the `htmlrtf` control word.
    """

    def __init__(self) -> None:
        super().__init__()
        self.ignore_rtf = False
        # Control words which produce output or change the renderer's state
        self.render_word_func = {
            "par": self.newline,
            "line": self.newline,
            "tab": self.tab,
            "fromhtml": self.check_fromhtml,
            "htmlrtf": self.ignore_rtf_toggle,
        }
        # Groups with these names are not rendered at all
        self.ignore_groups = ("fonttbl", "colortbl", "generator", "formatConverter", "pntext", "pntxta", "pntxtb")

    def ignore_rtf_toggle(self, cw: entities.Control_Word) -> str:
        """
        Switches output suppression on (no parameter or parameter 1)
        or off (parameter 0). Any other parameter leaves the state as it is.
        """
        if cw.parameter in ("", 1):
            self.ignore_rtf = True
        elif cw.parameter == 0:
            self.ignore_rtf = False
        return ""

    def check_fromhtml(self, cw: entities.Control_Word) -> str:
        """
        Logs whether the document claims to be generated from HTML.
        """
        if cw.parameter != 1:
            logger.warning(utils.warn("Encountered a part of RTF which was not generated from HTML"))
            logger.warning(utils.warn("This might not be the right renderer for it."))
        else:
            logger.info("This RTF was indeed generated from HTML")
        return ""

    def newline(self, cw: entities.Control_Word) -> str:
        """
        Line break, unless output is suppressed.
        """
        return "" if self.ignore_rtf else "\n"

    def tab(self, cw: entities.Control_Word) -> str:
        """
        Tabulator, unless output is suppressed.
        """
        return "" if self.ignore_rtf else "\t"

    def render_symbol(self, item: entities.Control_Symbol, file: io.TextIOWrapper) -> None:
        """
        Writes the text equivalent of a control symbol to `file`.
        """
        if self.ignore_rtf:
            return
        symbol = item.text
        # Nothing is written for these:
        # "|" obsolete formula character used by Word 5.1 for Macintosh
        # "-" optional hyphen
        # ":" subentry in an index entry
        if symbol == "|" or symbol == "-" or symbol == ":":
            return
        if symbol == "~":
            # Non-breaking space
            file.write("\u00a0")
        elif symbol == "_":
            # Non-breaking hyphen
            file.write("\u2011")
        elif symbol == "*":
            # Ignorable outside of Group
            logger.warning(utils.warn("Found an IGNORABLE control symbol which is not a group start!"))
        else:
            # Probably any symbol converted from a hex code: \'hh
            file.write(symbol)

    def render(self, parsed: entities.Group, file: io.TextIOWrapper) -> None:
        """
        Walks the structure of `parsed` recursively and writes the rendered text to `file`.
        """
        for element in parsed.structure:
            if isinstance(element, entities.Group):
                if element.name in self.ignore_groups:
                    continue
                self.render(element, file)
            elif isinstance(element, entities.Control_Word):
                # Control words without a registered handler are skipped
                handler = self.render_word_func.get(element.control_name)
                if handler is not None:
                    file.write(handler(element))
            elif isinstance(element, entities.Control_Symbol):
                self.render_symbol(element, file)
            elif isinstance(element, entities.Plain_Text) and not self.ignore_rtf:
                file.write(element.text)
def create_dict_config(directory: pathlib.Path, all_log: str, info_log: str, error_log: str) -> dict:
    """
    Builds a dictionary for `logging.config.dictConfig` in which the three
    log files `all_log`, `info_log` and `error_log` are located in `directory`.
    """

    def file_handler(level: str, file_name: str) -> dict:
        # All file handlers differ only in their threshold and target file
        return {
            "class": "logging.FileHandler",
            "level": level,
            "formatter": "file_formatter",
            "filename": directory / file_name,
            "mode": "w",
            "encoding": "utf-8",
        }

    formatters = {
        "file_formatter": {
            "format": "{message:<50s} {levelname:>9s} {asctime}.{msecs:03.0f} {module} {funcName} ",
            "style": "{",
            "datefmt": "%H:%M:%S",
        },
        "console_formatter": {
            "format": "{message}",
            "style": "{",
            "datefmt": "%a %H:%M:%S",
        },
    }

    handlers = {
        "root_console_handler": {
            "class": "logging.StreamHandler",
            "level": "INFO",
            "formatter": "console_formatter",
            "stream": "ext://sys.stdout",
        },
        "root_file_handler": file_handler("DEBUG", all_log),
        "custom_error_file_handler": file_handler("ERROR", error_log),
        "custom_info_file_handler": file_handler("INFO", info_log),
    }

    # The custom logger hands its records on to the root logger (`propagate`)
    loggers = {
        "custom_logger": {
            "propagate": True,
            "handlers": ["custom_error_file_handler", "custom_info_file_handler"],
            "level": "DEBUG",
        },
    }

    root = {
        "handlers": ["root_file_handler", "root_console_handler", "custom_error_file_handler", "custom_info_file_handler"],
        "level": "DEBUG",
    }

    return {
        "version": 1,
        "disable_existing_loggers": False,
        "formatters": formatters,
        "handlers": handlers,
        "loggers": loggers,
        "root": root,
        "incremental": False,
    }
bytes: 31 | return rb"(?:" + content + rb")" 32 | 33 | 34 | # Raw regular expression "strings"" (actually byte strings) 35 | 36 | 37 | _control_characters = rb"\\\{\}" 38 | _newline = b"\\" + rb"r" + b"\\" + rb"n" 39 | control_character = group(_control_characters) 40 | not_control_character = group(rb"^" + _control_characters) 41 | _control_characters_or_newline = _control_characters + _newline 42 | control_character_or_newline = group(_control_characters + _newline) 43 | not_control_character_or_newline = group(rb"^" + _control_characters_or_newline) 44 | rtf_backslash = named_regex_group("backslash", not_preceded_by(rb"\\", rb"\\")) 45 | unnamed_rtf_backslash = not_preceded_by(rb"\\", rb"\\") 46 | _letters = rb"a-zA-Z" 47 | ascii_letters = group(_letters) + rb"{1,32}" 48 | _digits = rb"0-9" 49 | _hdigits = rb"0-9a-f" 50 | ignorable = named_regex_group("ignorable", rb"\\\*") 51 | rtf_brace_open = named_regex_group("group_start", not_preceded_by(unnamed_rtf_backslash, rb"\{") + ignorable + rb"?") 52 | rtf_brace_close = named_regex_group("group_end", not_preceded_by(unnamed_rtf_backslash, rb"\}")) 53 | 54 | 55 | minus = named_regex_group("minus", rb"-?") 56 | digit = named_regex_group("digit", minus + group(_digits) + rb"{1,10}") 57 | hdigit = named_regex_group("hdigit", group(_hdigits)) 58 | parameter_pattern = named_regex_group("parameter", digit) 59 | space = named_regex_group("space", rb" ") 60 | newline = named_regex_group("newline", _newline) 61 | other = named_regex_group("other", group(rb"^" + _letters + _digits)) 62 | nothing = named_regex_group("nothing", group(rb"")) 63 | 64 | 65 | ascii_letter_sequence = named_regex_group("control_name", ascii_letters + parameter_pattern + rb"?") 66 | delimiter = named_regex_group("delimiter", rb"|".join((space, newline, other, nothing, rb"$"))) 67 | symbol = named_regex_group("symbol", other) 68 | control_word_pattern = named_regex_group("control_word", rtf_backslash + ascii_letter_sequence + delimiter) 69 | 
class Bytes_Regex:
    """
    Compiled regular expression for byte strings which remembers the raw
    pattern it was compiled from.

    `pattern` is the compiled pattern object, `match` is its `match` method
    and `regex101` prints the raw pattern so that it can be copy-pasted
    to regex101.com.
    """

    def __init__(self, Bytes: bytes, flags: re.RegexFlag = 0) -> None:
        compiled = re.compile(Bytes, flags)
        self.pattern_bytes = Bytes
        self.pattern = compiled
        # Shortcut, so that users can call `instance.match(...)` directly
        self.match = compiled.match

    def regex101(self) -> None:
        """
        Prints the raw pattern, decoded as ASCII.
        """
        print(str(self.pattern_bytes, "ascii"))
from rtfparse import logging_conf
from rtfparse.__about__ import __version__
from rtfparse.parser import Rtf_Parser
from rtfparse.renderers.html_decapsulator import HTML_Decapsulator


def setup_logger(directory: Path) -> logging.Logger:
    """
    Configures logging to write its log files into `directory` and returns
    the logger of this module.

    If `directory` cannot be provided because a regular file already occupies
    that path, a hint is printed and the logger is returned without the
    file-based configuration, instead of failing on an unbound configuration.
    """
    try:
        provide_dir(directory)
    except FileExistsError:
        print(f"Failed to create the directory `{str(directory)}` because it already exists as a file.")
        print(f"Please create the directory `{str(directory)}`")
    else:
        # Only build and apply the configuration when the log directory is available
        logger_config = logging_conf.create_dict_config(directory, "rtfparse.debug.log", "rtfparse.info.log", "rtfparse.errors.log")
        logging.config.dictConfig(logger_config)
    return logging.getLogger(__name__)


logger = setup_logger(Path.home() / "rtfparse")


def argument_parser() -> ArgumentParser:
    """
    Creates an argument parser for command line arguments
    """
    parser = ArgumentParser(description="RTF parser", prog="rtfparse")
    parser.add_argument("-v", "--version", action="version", version=" ".join(("%(prog)s", __version__)), help="print out rtfparse version and exit")
    parser.add_argument("-r", "--rtf-file", action="store", metavar="PATH", type=Path, help="path to the rtf file")
    parser.add_argument("-m", "--msg-file", action="store", metavar="PATH", type=Path, help="Parse RTF from MS Outlook's .msg file")
    parser.add_argument("-d", "--decapsulate-html", action="store_true", help="Decapsulate HTML from RTF")
    parser.add_argument("-i", "--embed-img", action="store_true", help="Embed images from email to HTML")
    parser.add_argument("-o", "--output-file", metavar="PATH", type=Path, help="path to the desired output file")
    parser.add_argument("-a", "--attachments-dir", metavar="PATH", type=Path, help="path to directory where to save email attachments")
    return parser

54 | 55 | def decapsulate(rp: Rtf_Parser, target_file: Path) -> None: 56 | renderer = HTML_Decapsulator() 57 | with open(target_file, mode="w", encoding="utf-8") as htmlfile: 58 | logger.info("Rendering the encapsulated HTML") 59 | renderer.render(rp.parsed, htmlfile) 60 | logger.info("Encapsulated HTML rendered") 61 | 62 | 63 | def run(cli_args: Namespace) -> None: 64 | if cli_args.rtf_file and cli_args.rtf_file.exists(): 65 | with open(cli_args.rtf_file, mode="rb") as rtf_file: 66 | rp = Rtf_Parser(rtf_file=rtf_file) 67 | rp.parse_file() 68 | elif cli_args.msg_file: 69 | msg = em.openMsg(f"{cli_args.msg_file}") 70 | if cli_args.attachments_dir: 71 | provide_dir(cli_args.attachments_dir) 72 | for attachment in msg.attachments: 73 | with open(cli_args.attachments_dir / f"{attachment.longFilename}", mode="wb") as att_file: 74 | att_file.write(attachment.data) 75 | decompressed_rtf = cr.decompress(msg.compressedRtf) 76 | with open(cli_args.msg_file.with_suffix(".rtf"), mode="wb") as email_rtf: 77 | email_rtf.write(decompressed_rtf) 78 | with io.BytesIO(decompressed_rtf) as rtf_file: 79 | rp = Rtf_Parser(rtf_file=rtf_file) 80 | rp.parse_file() 81 | if cli_args.decapsulate_html and cli_args.output_file: 82 | decapsulate(rp, cli_args.output_file.with_suffix(".html")) 83 | 84 | 85 | def main() -> None: 86 | """ 87 | Entry point for any component start from the commmand line 88 | """ 89 | logger.debug("rtfparse started") 90 | parser = argument_parser() 91 | argcomplete.autocomplete(parser) 92 | cli_args = parser.parse_args() 93 | logger.debug(f"Parsed arguments: {cli_args}") 94 | try: 95 | run(cli_args) 96 | except Exception as err: 97 | logger.exception(f"Uncaught exception {repr(err)} occurred.") 98 | logger.debug("rtfparse ended") 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rtfparse 2 | 3 | Parses Microsoft's Rich Text Format 
(RTF) documents. It creates an in-memory object which represents the tree structure of the RTF document. This object can in turn be rendered by using one of the renderers. 4 | So far, rtfparse provides only one renderer (`HTML_Decapsulator`) which liberates the HTML code encapsulated in RTF. This will come in handy, for example, if you ever need to extract the HTML from an HTML-formatted email message saved by Microsoft Outlook. 5 | 6 | MS Outlook also tends to use RTF compression, so the CLI of rtfparse can optionally decompress that, too. 7 | 8 | You can of course write your own renderers of parsed RTF documents and consider contributing them to this project. 9 | 10 | 11 | # Installation 12 | 13 | Install rtfparse from your local repository with pip: 14 | 15 | pip install rtfparse 16 | 17 | Installation creates an executable file `rtfparse` in your python scripts folder which should be in your `$PATH`. 18 | 19 | # Usage From Command Line 20 | 21 | Use the `rtfparse` executable from the command line. Read `rtfparse --help`. 22 | 23 | rtfparse writes logs into `~/rtfparse/` into these files: 24 | 25 | ``` 26 | rtfparse.debug.log 27 | rtfparse.info.log 28 | rtfparse.errors.log 29 | ``` 30 | 31 | ## Example: Decapsulate HTML from an uncompressed RTF file 32 | 33 | rtfparse --rtf-file "path/to/rtf_file.rtf" --decapsulate-html --output-file "path/to/extracted.html" 34 | 35 | ## Example: Decapsulate HTML from MS Outlook email file 36 | 37 | For this, the CLI of rtfparse uses [extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) and [compressed_rtf](https://github.com/delimitry/compressed_rtf). 
38 | 39 | rtfparse --msg-file "path/to/email.msg" --decapsulate-html --output-file "path/to/extracted.html" 40 | 41 | ## Example: Only decompress the RTF from MS Outlook email file 42 | 43 | rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf" 44 | 45 | ## Example: Decapsulate HTML from MS Outlook email file and save (and later embed) the attachments 46 | 47 | When extracting the RTF from the `.msg` file, you can save the attachments (which includes images embedded in the email text) in a directory: 48 | 49 | rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf" --attachments-dir "path/to/dir" 50 | 51 | In `rtfparse` version 1.x you will be able to embed these images in the decapsulated HTML. This functionality will be provided by the package [embedimg](https://github.com/fleetingbytes/embedimg). 52 | 53 | rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf" --attachments-dir "path/to/dir" --embed-img 54 | 55 | In the current version the option `--embed-img` does nothing. 
56 | 57 | # Programmatic usage in a Python module 58 | 59 | ## Decapsulate HTML from an uncompressed RTF file 60 | 61 | ```py 62 | from pathlib import Path 63 | from rtfparse.parser import Rtf_Parser 64 | from rtfparse.renderers.html_decapsulator import HTML_Decapsulator 65 | 66 | source_path = Path(r"path/to/your/rtf/document.rtf") 67 | target_path = Path(r"path/to/your/html/decapsulated.html") 68 | # Create parent directory of `target_path` if it does not already exist: 69 | target_path.parent.mkdir(parents=True, exist_ok=True) 70 | 71 | parser = Rtf_Parser(rtf_path=source_path) 72 | parsed = parser.parse_file() 73 | 74 | renderer = HTML_Decapsulator() 75 | 76 | with open(target_path, mode="w", encoding="utf-8") as html_file: 77 | renderer.render(parsed, html_file) 78 | ``` 79 | 80 | ## Decapsulate HTML from an MS Outlook msg file 81 | 82 | ```py 83 | from pathlib import Path 84 | from extract_msg import openMsg 85 | from compressed_rtf import decompress 86 | from io import BytesIO 87 | from rtfparse.parser import Rtf_Parser 88 | from rtfparse.renderers.html_decapsulator import HTML_Decapsulator 89 | 90 | 91 | source_file = Path("path/to/your/source.msg") 92 | target_file = Path(r"path/to/your/target.html") 93 | # Create parent directory of `target_file` if it does not already exist: 94 | target_file.parent.mkdir(parents=True, exist_ok=True) 95 | 96 | # Get a decompressed RTF bytes buffer from the MS Outlook message 97 | msg = openMsg(source_file) 98 | decompressed_rtf = decompress(msg.compressedRtf) 99 | rtf_buffer = BytesIO(decompressed_rtf) 100 | 101 | # Parse the rtf buffer 102 | parser = Rtf_Parser(rtf_file=rtf_buffer) 103 | parsed = parser.parse_file() 104 | 105 | # Decapsulate the HTML from the parsed RTF 106 | decapsulator = HTML_Decapsulator() 107 | with open(target_file, mode="w", encoding="utf-8") as html_file: 108 | decapsulator.render(parsed, html_file) 109 | ``` 110 | 111 | # RTF Specification Links 112 | 113 | * [RTF Informative 
References](https://learn.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxrtfcp/85c0b884-a960-4d1a-874e-53eeee527ca6) 114 | * [RTF Specification 1.9.1](https://go.microsoft.com/fwlink/?LinkId=120924) 115 | * [RTF Extensions, MS-OXRTFEX](https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxrtfex/411d0d58-49f7-496c-b8c3-5859b045f6cf) 116 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "hatchling>=1.27.0", 4 | "hatch-semver" 5 | ] 6 | build-backend = "hatchling.build" 7 | 8 | [project] 9 | name = "rtfparse" 10 | description = "Tool to parse Microsoft Rich Text Format (RTF)" 11 | readme = "README.md" 12 | license = "MIT" 13 | requires-python = ">=3.10" 14 | authors = [ 15 | { name = "Sven Siegmund", email = "sven.siegmund@iav.de" }, 16 | ] 17 | classifiers = [ 18 | #"Development Status :: 3 - Alpha", 19 | #"Development Status :: 4 - Beta", 20 | "Development Status :: 5 - Production/Stable", 21 | "Intended Audience :: Developers", 22 | "Environment :: Console", 23 | "Topic :: Software Development :: Testing", 24 | "Topic :: Utilities", 25 | "Natural Language :: English", 26 | "Programming Language :: Python :: 3.10", 27 | "Programming Language :: Python :: 3.11", 28 | "Operating System :: OS Independent", 29 | "Operating System :: Microsoft :: Windows", 30 | "Operating System :: POSIX :: Linux", 31 | "Operating System :: MacOS :: MacOS X", 32 | ] 33 | keywords = [ 34 | "rtf", 35 | "parse", 36 | ] 37 | dependencies = [ 38 | "argcomplete", 39 | "extract-msg", 40 | "compressed_rtf", 41 | "provide_dir", 42 | ] 43 | dynamic = ["version"] 44 | 45 | [project.urls] 46 | Documentation = "https://github.com/fleetingbytes/rtfparse#readme" 47 | Issues = "https://github.com/fleetingbytes/rtfparse/issues" 48 | Source = "https://github.com/fleetingbytes/rtfparse" 49 | 50 | 
[project.scripts] 51 | rtfparse = "rtfparse.cli:main" 52 | 53 | [tool.hatch.version] 54 | path = "src/rtfparse/__about__.py" 55 | validate-bump = true 56 | scheme = "semver" 57 | 58 | [tool.hatch.envs.default] 59 | dependencies = [ 60 | "pytest-cov", 61 | ] 62 | [tool.hatch.envs.default.scripts] 63 | cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=src/rtfparse --cov=tests {args}" 64 | no-cov = "cov --no-cov {args}" 65 | 66 | [tool.hatch.envs.style] 67 | dependencies = [ 68 | "ruff", 69 | ] 70 | 71 | [tool.hatch.envs.style.scripts] 72 | fmt = [ 73 | "ruff format", 74 | "ruff check", 75 | ] 76 | 77 | [tool.hatch.envs.tc] 78 | dependencies = [ 79 | "towncrier", 80 | ] 81 | 82 | [tool.hatch.envs.tc.scripts] 83 | draft = "towncrier build --draft" 84 | build = "towncrier build --yes" 85 | 86 | [tool.hatch.envs.docs] 87 | dependencies = [ 88 | "pdoc3" 89 | ] 90 | 91 | [[tool.hatch.envs.test.matrix]] 92 | python = ["311"] 93 | 94 | [tool.coverage.run] 95 | branch = true 96 | parallel = true 97 | omit = [ 98 | #"src/rtfparse/__about__.py", 99 | ] 100 | 101 | [tool.coverage.report] 102 | exclude_lines = [ 103 | "no cov", 104 | "if __name__ == .__main__.:", 105 | "if TYPE_CHECKING:", 106 | ] 107 | 108 | [tool.ruff] 109 | # Exclude a variety of commonly ignored directories. 110 | exclude = [ 111 | ".bzr", 112 | ".direnv", 113 | ".eggs", 114 | ".git", 115 | ".git-rewrite", 116 | ".hg", 117 | ".ipynb_checkpoints", 118 | ".mypy_cache", 119 | ".nox", 120 | ".pants.d", 121 | ".pyenv", 122 | ".pytest_cache", 123 | ".pytype", 124 | ".ruff_cache", 125 | ".svn", 126 | ".tox", 127 | ".venv", 128 | ".vscode", 129 | "__pypackages__", 130 | "_build", 131 | "buck-out", 132 | "build", 133 | "dist", 134 | "node_modules", 135 | "site-packages", 136 | "venv", 137 | ] 138 | 139 | # Same as Black. 
140 | line-length = 150 141 | indent-width = 4 142 | 143 | # Assume Python 3.10 144 | target-version = "py310" 145 | 146 | [tool.ruff.lint] 147 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 148 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 149 | # McCabe complexity (`C901`) by default. 150 | select = ["E4", "E7", "E9", "F"] 151 | ignore = [] 152 | 153 | # Allow fix for all enabled rules (when `--fix`) is provided. 154 | fixable = ["ALL"] 155 | unfixable = [] 156 | 157 | # Allow unused variables when underscore-prefixed. 158 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 159 | 160 | [tool.ruff.lint.pycodestyle] 161 | max-line-length = 150 162 | 163 | [tool.ruff.format] 164 | # Like Black, use double quotes for strings. 165 | quote-style = "double" 166 | 167 | # Like Black, indent with spaces, rather than tabs. 168 | indent-style = "space" 169 | 170 | # Like Black, respect magic trailing commas. 171 | skip-magic-trailing-comma = false 172 | 173 | # Like Black, automatically detect the appropriate line ending. 174 | line-ending = "auto" 175 | 176 | # Enable auto-formatting of code examples in docstrings. Markdown, 177 | # reStructuredText code/literal blocks and doctests are all supported. 178 | # 179 | # This is currently disabled by default, but it is planned for this 180 | # to be opt-out in the future. 181 | docstring-code-format = true 182 | 183 | # Set the line length limit used when formatting code snippets in 184 | # docstrings. 185 | # 186 | # This only has an effect when the `docstring-code-format` setting is 187 | # enabled. 
188 | docstring-code-line-length = "dynamic" 189 | 190 | [tool.towncrier] 191 | name = "rtfparse" 192 | package = "rtfparse" 193 | package_dir = "src" 194 | directory = "changelog.d" 195 | filename = "CHANGELOG.md" 196 | start_string = "\n" 197 | underlines = ["", "", ""] 198 | template = "changelog.d/changelog_template.jinja" 199 | #title_format = "## [{version}](https://github.com/fleetingbytes/rtfparse/{version}) - {project_date}" 200 | title_format = "## {version} ({project_date})" 201 | issue_format = "[#{issue}](https://github.com/fleetingbytes/rtfparse/issues/{issue})" 202 | orphan_prefix = "+" 203 | 204 | [tool.towncrier.fragment.doc] 205 | name = "Documentation" 206 | 207 | [tool.towncrier.fragment.feature] 208 | name = "New Features" 209 | 210 | [tool.towncrier.fragment.improved] 211 | name = "Improvements" 212 | 213 | [tool.towncrier.fragment.fixed] 214 | name = "Bugfixes" 215 | 216 | [tool.towncrier.fragment.unimportant] 217 | name = "Development Details" 218 | -------------------------------------------------------------------------------- /src/rtfparse/entities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import io 5 | import logging 6 | 7 | # Own modules 8 | from rtfparse import re_patterns, utils 9 | from rtfparse.enums import Bytestring_Type 10 | 11 | # Setup logging 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # Constants, number of bytes to read when creating entities 16 | CHARACTER = BACKSLASH = DELIMITER = MINUS = GROUP_END = len(b"\\") 17 | SYMBOL = IGNORABLE = BACKSLASH + CHARACTER 18 | GROUP_START = BACKSLASH + IGNORABLE 19 | MAX_CW_LETTERS = 32 # As specified in RTF Spec 20 | INTEGER_MAGNITUDE = 32 # As specified in RTF Spec 21 | PLAIN_TEXT = CONTROL_WORD = BACKSLASH + MAX_CW_LETTERS + MINUS + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER 22 | 23 | 24 | class Entity: 25 | def __init__(self) -> None: 26 | self.text = "" 27 | 28 | @classmethod 29 | 
def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Bytestring_Type: 30 | logger.debug(f"Probing file at position {file.tell()}") 31 | original_position = file.tell() 32 | while True: 33 | probed = file.read(len(re_patterns.probe_pattern)) 34 | logger.debug(f"{probed = }") 35 | file.seek(original_position) 36 | logger.debug(f"Probe returned to position {file.tell()}") 37 | if re_patterns.group_start.match(probed): 38 | result = Bytestring_Type.GROUP_START 39 | elif re_patterns.group_end.match(probed): 40 | result = Bytestring_Type.GROUP_END 41 | elif re_patterns.control_word.match(probed): 42 | result = Bytestring_Type.CONTROL_WORD 43 | elif re_patterns.control_symbol.match(probed): 44 | result = Bytestring_Type.CONTROL_SYMBOL 45 | elif re_patterns.plain_text.match(probed): 46 | result = Bytestring_Type.PLAIN_TEXT 47 | else: 48 | logger.debug("This does not match anything, it's probably a newline, moving on") 49 | original_position += 1 50 | file.seek(original_position) 51 | logger.debug(f"Probe moved to position {file.tell()}") 52 | if not probed: 53 | logger.debug("Reached unexpected end of file.") 54 | result = Bytestring_Type.GROUP_END 55 | break 56 | continue 57 | break 58 | logger.debug(f"Probe {result = }") 59 | logger.debug(f"Probe leaving file at position {file.tell()}") 60 | return result 61 | 62 | 63 | class Control_Word(Entity): 64 | def __init__(self, encoding: str, file: io.BufferedReader) -> None: 65 | super().__init__() 66 | self.encoding = encoding 67 | logger.debug(f"Reading Control Word at file position {file.tell()}") 68 | self.control_name = "missing" 69 | self.parameter = "" 70 | self.bindata = b"" 71 | self.start_position = file.tell() 72 | logger.debug(f"Starting at file position {self.start_position}") 73 | probe = file.read(CONTROL_WORD) 74 | if match := re_patterns.control_word.match(probe): 75 | self.control_name = match.group("control_name").decode(self.encoding) 76 | logger.debug(f"Preliminary 
{self.control_name = }") 77 | parameter = match.group("parameter") 78 | if parameter is not None: 79 | self.parameter = int(parameter.decode(self.encoding)) 80 | logger.debug(f"{self.parameter = }") 81 | self.control_name = self.control_name.removesuffix(str(self.parameter)) 82 | logger.debug(f"Final {self.control_name = }") 83 | target_position = self.start_position + match.span()[1] 84 | if match.group("other"): 85 | logger.debug(f"Delimiter is {match.group('other').decode(self.encoding)}, len: {len(match.group('delimiter'))}") 86 | target_position -= len(match.group("delimiter")) 87 | file.seek(target_position) 88 | # handle \binN: 89 | if self.control_name == "bin": 90 | self.bindata = file.read(utils.twos_complement(self.parameter, INTEGER_MAGNITUDE)) 91 | else: 92 | logger.warning("Missing Control Word") 93 | file.seek(self.start_position) 94 | 95 | def __repr__(self) -> str: 96 | return f"<{self.__class__.__name__}: {self.control_name}{self.parameter}>" 97 | 98 | 99 | class Control_Symbol(Entity): 100 | def __init__(self, encoding: str, file: io.BufferedReader) -> None: 101 | super().__init__() 102 | self.encoding = encoding 103 | self.start_position = file.tell() 104 | logger.debug(f"Reading Symbol at file position {self.start_position}") 105 | self.char = "" 106 | self.text = chr(file.read(SYMBOL)[-1]) 107 | if self.text == "'": 108 | self.char = file.read(SYMBOL).decode(self.encoding) 109 | self.text = bytes((int(self.char, base=16),)).decode(self.encoding) 110 | logger.debug(f"Encountered escaped ANSI character, read two more bytes: {self.char}, character: {self.text}") 111 | if self.text in "\\{}": 112 | file.seek(file.tell() - SYMBOL) 113 | 114 | def __repr__(self) -> str: 115 | return f"<{self.__class__.__name__}: {self.text}>" 116 | 117 | 118 | class Plain_Text(Entity): 119 | def __init__(self, encoding: str, file: io.BufferedReader) -> None: 120 | super().__init__() 121 | self.encoding = encoding 122 | self.text = "" 123 | logger.debug("Constructing 
Plain_Text") 124 | while True: 125 | self.start_position = file.tell() 126 | read = file.read(PLAIN_TEXT) 127 | logger.debug(f"Read file from {self.start_position} to position {file.tell()}, read: {read}") 128 | # see if we have read all the plain text there is: 129 | if match := re_patterns.plain_text.match(read): 130 | logger.debug("This matches the plain text pattern") 131 | _text = match.group("text").decode(self.encoding) 132 | logger.debug(f"{_text = }") 133 | self.text = "".join((self.text, _text)) 134 | logger.debug(f"{self.text = }") 135 | if len(_text) == PLAIN_TEXT: 136 | continue 137 | else: 138 | file.seek(self.start_position + len(_text)) 139 | break 140 | else: 141 | file.seek(self.start_position) 142 | break 143 | logger.debug(f"Returned to position {file.tell()}") 144 | 145 | def __repr__(self) -> str: 146 | return f"<{self.__class__.__name__}: {self.text}>" 147 | 148 | 149 | class Group(Entity): 150 | def __init__(self, encoding: str, file: io.BufferedReader) -> None: 151 | super().__init__() 152 | logger.debug("Group.__init__") 153 | self.encoding = encoding 154 | self.known = False 155 | self.name = "unknown" 156 | self.ignorable = False 157 | self.structure = list() 158 | parsed_object = utils.what_is_being_parsed(file) 159 | logger.debug(f"Creating destination group from {parsed_object}") 160 | self.start_position = file.tell() 161 | logger.debug(f"Starting at file position {self.start_position}") 162 | probe = file.read(GROUP_START) 163 | logger.debug(f"Read file up to position {file.tell()}, read {probe = }") 164 | if match := re_patterns.group_start.match(probe): 165 | self.known = bool(match.group("group_start")) 166 | self.ignorable = bool(match.group("ignorable")) 167 | if not self.ignorable: 168 | file.seek(self.start_position + GROUP_START - IGNORABLE) 169 | logger.debug(f"Returned to position {file.tell()}") 170 | else: 171 | logger.warning(utils.warn("Expected a group but found no group start. 
Creating unknown group")) 172 | file.seek(self.start_position) 173 | while True: 174 | probed = self.probe(re_patterns.probe, file) 175 | if probed is Bytestring_Type.CONTROL_WORD: 176 | self.structure.append(Control_Word(self.encoding, file)) 177 | elif probed is Bytestring_Type.GROUP_END: 178 | file.read(GROUP_END) 179 | break 180 | elif probed is Bytestring_Type.GROUP_START: 181 | self.structure.append(Group(self.encoding, file)) 182 | elif probed is Bytestring_Type.CONTROL_SYMBOL: 183 | self.structure.append(Control_Symbol(self.encoding, file)) 184 | else: 185 | self.structure.append(Plain_Text(self.encoding, file)) 186 | # name the group like its first Control Word 187 | # this way the renderer will be able to ignore entire groups based on their first control word 188 | try: 189 | if isinstance(self.structure[0], Control_Word): 190 | self.name = self.structure[0].control_name 191 | except IndexError: 192 | pass 193 | 194 | def __repr__(self) -> str: 195 | return f"" 196 | 197 | 198 | if __name__ == "__main__": 199 | pass 200 | --------------------------------------------------------------------------------