├── .gitattributes
├── src
    └── rtfparse
    │   ├── __about__.py
    │   ├── renderers
    │       ├── __init__.py
    │       └── html_decapsulator.py
    │   ├── __init__.py
    │   ├── enums.py
    │   ├── minimal.py
    │   ├── utils.py
    │   ├── parser.py
    │   ├── logging_conf.py
    │   ├── re_patterns.py
    │   ├── cli.py
    │   └── entities.py
├── changelog.d
    └── changelog_template.jinja
├── LICENSE
├── ROADMAP.md
├── .gitignore
├── CHANGELOG.md
├── README.md
└── pyproject.toml


/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto
2 | 


--------------------------------------------------------------------------------
/src/rtfparse/__about__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | 
4 | __version__ = "0.9.5"
5 | 


--------------------------------------------------------------------------------
/src/rtfparse/renderers/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | class Renderer:
 5 |     pass
 6 | 
 7 | 
 8 | if __name__ == "__main__":
 9 |     pass
10 | 


--------------------------------------------------------------------------------
/src/rtfparse/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | # Towncrier needs version
 5 | # from rtfparse.__about__ import __version__
 6 | __all__ = ["rtfparse.__about__.__version__"]
 7 | 
 8 | if __name__ == "__main__":
 9 |     from rtfparse.cli import main
10 | 
11 |     main()
12 | 


--------------------------------------------------------------------------------
/src/rtfparse/enums.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | from enum import Enum, auto, unique
 5 | 
 6 | 
 7 | @unique
 8 | class Bytestring_Type(Enum):
 9 |     GROUP_START = auto()
10 |     GROUP_END = auto()
11 |     CONTROL_WORD = auto()
12 |     CONTROL_SYMBOL = auto()
13 |     PLAIN_TEXT = auto()
14 | 
15 | 
16 | if __name__ == "__main__":
17 |     pass
18 | 


--------------------------------------------------------------------------------
/changelog.d/changelog_template.jinja:
--------------------------------------------------------------------------------
 1 | {% if sections[""] %}
 2 | {% for category, val in definitions.items() if category in sections[""] %}
 3 | 
 4 | ### {{ definitions[category]['name'] }}
 5 | 
 6 | {% for text, values in sections[""][category].items() %}
 7 | - {{ text }} {{ values|join(', ') }}
 8 | {% endfor %}
 9 | 
10 | {% endfor %}
11 | {% else %}
12 | No significant changes.
13 | 
14 | 
15 | {% endif %}
16 | 


--------------------------------------------------------------------------------
/src/rtfparse/minimal.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | """
 5 | A minimal example of programmatic use of the RTF parser and renderer
 6 | """
 7 | 
 8 | from pathlib import Path
 9 | 
10 | from rtfparse.parser import Rtf_Parser
11 | from rtfparse.renderers.html_decapsulator import HTML_Decapsulator
12 | 
# The RTF document to read and the HTML file to produce
source_path = Path(r"D:\trace\Pre-Integration test report of carapp_orureleasenotes_1_22_104 Webapps on ID_S 5_0.rtf")
target_path = Path(r"D:\trace\Pre-Integration test report of carapp_orureleasenotes_1_22_104 Webapps on ID_S 5_0.html")
# Make sure the directory which receives the HTML file exists:
target_path.parent.mkdir(parents=True, exist_ok=True)

# Parse the RTF document into its structure
parser = Rtf_Parser(rtf_path=source_path)
parsed = parser.parse_file()

# Render the parsed structure as HTML into the target file
renderer = HTML_Decapsulator()

with target_path.open(mode="w", encoding="utf-8") as html_file:
    renderer.render(parsed, html_file)
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Sven Siegmund
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ROADMAP.md:
--------------------------------------------------------------------------------
 1 | # Roadmap for rtfparse
 2 | 
 3 | - Rework the CLI. The original reason I wrote rtfparse was to decapsulate HTML from MS Outlook email files. Much of the current CLI serves the purpose of extracting the email body and attachments. This introduced a dependency with a non-free license (yes, I consider GPL non-free), so rtfparse currently has a license conflict. By modifying the CLI such that it expects an RTF file (rather than Outlook's .msg file) we shall get rid of that conflict. For extracting content out of Outlook messages, [msg-extractor][msg-extractor]'s own CLI shall be used in a separate step.
 4 | - Build solid test code
 5 |     - introduce end-to-end tests with [behave][behave]
 6 |     - bring in some good test material (call for test material)
 7 | - Once [human-regex][hr] works with Python 3.13, rewrite the _re_patterns_ module with human-regex as a dependency.
 8 | - Hand over the further development and maintenance of this project to somebody with more free time and investment in RTF than me. By migrating from Windows to FreeBSD, Outlook messages and RTFs have left my life. My incentive to work on a tool I'm not personally using is currently very low.
 9 | 
10 | [msg-extractor]: https://github.com/TeamMsgExtractor/msg-extractor
11 | [behave]: https://behave.readthedocs.io/en/stable/
12 | [hr]: https://github.com/fleetingbytes/human-regex
13 | 


--------------------------------------------------------------------------------
/src/rtfparse/utils.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | import io
 5 | import logging
 6 | import pathlib
 7 | 
 8 | # Typing
 9 | from typing import Union
10 | 
11 | # Setup logging
12 | logger = logging.getLogger(__name__)
13 | 
14 | 
15 | program_name = home_dir_name = "rtfparse"
16 | dir_name = "".join((".", program_name))
17 | configuration_file_name = f"{program_name}_configuration.ini"
18 | 
19 | 
def provide_dir(directory: pathlib.Path) -> pathlib.Path:
    """
    Makes sure that `directory` exists and returns it.

    If `directory` is not yet a directory, it is created. Missing parent
    directories are created first by calling this function recursively.
    A `FileExistsError` raised while creating `directory` is only logged
    on debug level.
    """
    if directory.is_dir():
        logger.debug(f"Found directory {str(directory)}")
        return directory
    created = False
    while not created:
        try:
            directory.mkdir()
        except FileNotFoundError:
            # The parent is missing: provide it, then retry
            provide_dir(directory.parent)
        except FileExistsError:
            logger.debug(f"{directory} already exists")
            break
        else:
            logger.info(f"Created directory {str(directory)}")
            created = True
    return directory
40 | 
41 | 
def warn(s: str) -> str:
    """
    Prefixes `s` with the marker "◊" and a space, so that the message
    stands out as a warning in the log output
    """
    return f"◊ {s}"
47 | 
48 | 
def what_is_being_parsed(file: Union[io.BufferedReader, io.BytesIO]) -> str:
    """
    Returns a human-readable description of `file` for log messages.

    An in-memory `io.BytesIO` is described by its `repr`. Any other file
    object is described by its `name` attribute, converted to `str`. If it
    has no usable `name` (for example a buffered reader wrapped around an
    in-memory stream), its `repr` is used, so a string is always returned.
    """
    if isinstance(file, io.BytesIO):
        return repr(file)
    # `getattr` with a default also covers readers whose `name` property
    # raises AttributeError because the underlying raw stream has no name.
    name = getattr(file, "name", None)
    if name is None:
        return repr(file)
    return str(name)
54 | 
55 | 
def twos_complement(val, nbits):
    """
    Convert between the signed and the unsigned reading of an `nbits`-bit
    two's complement number.

    A negative `val` is mapped to its unsigned representation. A non-negative
    `val` whose sign bit is set is mapped to the negative number it encodes;
    any other non-negative `val` is returned unchanged. Raises `ValueError`
    if `val` does not fit into `nbits` bits.
    Credit: https://stackoverflow.com/a/37075643/9235421
    """
    negative = val < 0
    # For a negative number the magnitude of `val + 1` decides whether it fits
    width = (val + 1).bit_length() if negative else val.bit_length()
    too_wide = width >= nbits if negative else width > nbits
    if too_wide:
        raise ValueError(f"Value {val} is out of range of {nbits}-bit value.")
    if negative:
        return (1 << nbits) + val
    sign_bit = 1 << (nbits - 1)
    if val & sign_bit:
        # The sign bit is set, so `val` encodes a negative number
        return val - (1 << nbits)
    return val
70 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Test files
  2 | target.html
  3 | extract.py
  4 | test.msg
  5 | 
  6 | # Byte-compiled / optimized / DLL files
  7 | __pycache__/
  8 | *.py[cod]
  9 | *$py.class
 10 | 
 11 | # PowerShell garbage
 12 | Out-Null
 13 | 
 14 | # Vim files
 15 | *~
 16 | *.swp
 17 | *.swo
 18 | 
 19 | # RTF
 20 | *.rtf
 21 | 
 22 | # C extensions
 23 | *.so
 24 | 
 25 | # Distribution / packaging
 26 | .Python
 27 | build/
 28 | develop-eggs/
 29 | dist/
 30 | downloads/
 31 | eggs/
 32 | .eggs/
 33 | lib/
 34 | lib64/
 35 | parts/
 36 | sdist/
 37 | var/
 38 | wheels/
 39 | pip-wheel-metadata/
 40 | share/python-wheels/
 41 | *.egg-info/
 42 | .installed.cfg
 43 | *.egg
 44 | MANIFEST
 45 | 
 46 | # PyInstaller
 47 | #  Usually these files are written by a python script from a template
 48 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 49 | *.manifest
 50 | *.spec
 51 | 
 52 | # Installer logs
 53 | pip-log.txt
 54 | pip-delete-this-directory.txt
 55 | 
 56 | # Unit test / coverage reports
 57 | htmlcov/
 58 | .tox/
 59 | .nox/
 60 | .coverage
 61 | .coverage.*
 62 | .cache
 63 | nosetests.xml
 64 | coverage.xml
 65 | *.cover
 66 | *.py,cover
 67 | .hypothesis/
 68 | .pytest_cache/
 69 | 
 70 | # Translations
 71 | *.mo
 72 | *.pot
 73 | 
 74 | # Django stuff:
 75 | *.log
 76 | local_settings.py
 77 | db.sqlite3
 78 | db.sqlite3-journal
 79 | 
 80 | # Flask stuff:
 81 | instance/
 82 | .webassets-cache
 83 | 
 84 | # Scrapy stuff:
 85 | .scrapy
 86 | 
 87 | # Sphinx documentation
 88 | docs/_build/
 89 | 
 90 | # PyBuilder
 91 | target/
 92 | 
 93 | # Jupyter Notebook
 94 | .ipynb_checkpoints
 95 | 
 96 | # IPython
 97 | profile_default/
 98 | ipython_config.py
 99 | 
100 | # pyenv
101 | .python-version
102 | 
103 | # pipenv
104 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
106 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
107 | #   install all needed dependencies.
108 | #Pipfile.lock
109 | 
110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
111 | __pypackages__/
112 | 
113 | # Celery stuff
114 | celerybeat-schedule
115 | celerybeat.pid
116 | 
117 | # SageMath parsed files
118 | *.sage.py
119 | 
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 | 
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 | 
133 | # Rope project settings
134 | .ropeproject
135 | 
136 | # mkdocs documentation
137 | /site
138 | 
139 | # mypy
140 | .mypy_cache/
141 | .dmypy.json
142 | dmypy.json
143 | 
144 | # Pyre type checker
145 | .pyre/
146 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | <!-- towncrier release notes start -->
 4 | 
 5 | ## 0.9.5 (2025-07-08)
 6 | 
 7 | 
 8 | ### Documentation
 9 | 
10 | - add MIT header to LICENSE.txt [#46](https://github.com/fleetingbytes/rtfparse/issues/46)
11 | - use MIT SPDX identifier in pyproject.toml, use correct name in LICENSE.txt, update year in LICENSE.txt, rename LICENSE.txt to LICENSE [#47](https://github.com/fleetingbytes/rtfparse/issues/47)
12 | 
13 | ## 0.9.4 (2024-11-10)
14 | 
15 | 
16 | ### Bugfixes
17 | 
18 | - add missing import statement in `html_decapsulator.py` [#42](https://github.com/fleetingbytes/rtfparse/issues/42)
19 | 
20 | 
21 | ### Development Details
22 | 
23 | - replace `black` and `isort` with `ruff` [#44](https://github.com/fleetingbytes/rtfparse/issues/44)
24 | 
25 | ## 0.9.3 (2024-11-01)
26 | 
27 | 
28 | ### Bugfixes
29 | 
30 | - Fixed double numbering of ordered and unordered lists [#38](https://github.com/fleetingbytes/rtfparse/issues/38)
31 | 
32 | ## 0.9.2 (2024-09-30)
33 | 
34 | 
35 | ### Bugfixes
36 | 
37 | - Fixed `rtfparse --help`, correct entrypoint in `pyproject.toml` [#34](https://github.com/fleetingbytes/rtfparse/issues/34)
38 | 
39 | ## 0.9.1 (2024-06-21)
40 | 
41 | 
42 | ### Documentation
43 | 
44 | - Fix old naming in readme [#22](https://github.com/fleetingbytes/rtfparse/issues/22)
45 | - Add example how to programmatically extract HTML from MS Outlook message [#25](https://github.com/fleetingbytes/rtfparse/issues/25)
46 | 
47 | 
48 | ### Bugfixes
49 | 
50 | - Don't setup log if not using the CLI [#24](https://github.com/fleetingbytes/rtfparse/issues/24)
51 | - Fix possible bug in error handling [#26](https://github.com/fleetingbytes/rtfparse/issues/26)
52 | 
53 | ## 0.9.0 (2024-03-11)
54 | 
55 | 
56 | ### Bugfixes
57 | 
58 | - Recognize control words where the parameter's digit sequence is delimited by any character other than an ASCII digit [#18](https://github.com/fleetingbytes/rtfparse/issues/18)
59 | 
60 | 
61 | ### Development Details
62 | 
63 | - Renamed a few things, improved readme [#17](https://github.com/fleetingbytes/rtfparse/issues/17)
64 | 
65 | ## 0.8.2 (2024-03-05)
66 | 
67 | 
68 | ### Documentation
69 | 
70 | - Update `README.md`: Create parent directories of `target_path` if they don't already exist. [#14](https://github.com/fleetingbytes/rtfparse/issues/14)
71 | 
72 | ## 0.8.1 (2023-08-07)
73 | 
74 | 
75 | ### Bugfixes
76 | 
77 | - Interpret ANSI encoding as CP1252, improve error handling [#11](https://github.com/fleetingbytes/rtfparse/issues/11)
78 | 
79 | 
80 | ## 0.8.0 (2023-06-29)
81 | 
82 | 
83 | ### Bugfixes
84 | 
85 | - Using `pyproject.toml` for installation with current pip versions [#1](https://github.com/fleetingbytes/rtfparse/issues/1)
86 | 
87 | 
88 | ### Development Details
89 | 
90 | - Fixed reference before assignment error [#3](https://github.com/fleetingbytes/rtfparse/issues/3)
91 | - Removed convoluted configurator [#5](https://github.com/fleetingbytes/rtfparse/issues/5)
92 | 


--------------------------------------------------------------------------------
/src/rtfparse/parser.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | import io
 5 | import logging
 6 | import pathlib
 7 | from argparse import Namespace
 8 | 
 9 | # Typing
10 | from typing import Optional, Union
11 | 
12 | # Own modules
13 | from rtfparse import entities, utils
14 | 
15 | # Setup logging
16 | logger = logging.getLogger(__name__)
17 | 
18 | 
class Rtf_Parser:
    """
    Parses an RTF document into a structure of `entities` objects.

    The document is given either as `rtf_path`, in which case `parse_file`
    opens and closes the file itself, or as `rtf_file`, an already opened
    binary file object which `parse_file` leaves open.
    """

    def __init__(self, rtf_path: Optional[pathlib.Path] = None, rtf_file: Optional[Union[io.BufferedReader, io.BytesIO]] = None) -> None:
        """
        Stores the source of the RTF document.

        Raises `ValueError` if neither `rtf_path` nor `rtf_file` is given.
        """
        self.rtf_path = rtf_path
        self.rtf_file = rtf_file
        if not (self.rtf_path or self.rtf_file):
            raise ValueError("Need `rtf_path` or `rtf_file` argument")
        self.ENCODING_PROBE = 48  # look for encoding information in the first 48 bytes of the file

    def read_encoding(self, file: Union[io.BufferedReader, io.BytesIO]) -> str:
        """
        Determines the text encoding of the RTF document in `file`.

        The first `ENCODING_PROBE` bytes are parsed and searched for the
        control words `ansi`, `ansicpg`, `mac`, `pc` and `pca`. Afterwards
        `file` is rewound to its beginning. If no usable encoding information
        is found, "cp1252" is returned, so the result is always a string.
        """
        probed = file.read(self.ENCODING_PROBE)
        group = entities.Group("cp1252", io.BytesIO(probed))
        recognized_encodings = ("ansi", "ansicpg", "mac", "pc", "pca")
        # Gather all control words, which could define an encoding:
        names = tuple(item for item in group.structure if isinstance(item, entities.Control_Word) and item.control_name in recognized_encodings)
        # If any of these control words has a parameter, we assume that it is
        # the parameter of \ansicpg, i.e. the code page we are looking for.
        # The first parameter found is used, so a later control word without
        # a parameter cannot discard it.
        param = next((item.parameter for item in names if item.parameter), None)
        encoding = None
        if param:
            if param == 65001:
                logger.warning("Found encoding '65001', but often this is actually 'cp1252', so I'm taking that")
                encoding = "cp1252"
            else:
                encoding = f"cp{param}"
        elif names:
            first_name = names[0].control_name
            if first_name == "ansi":
                logger.warning("Found encoding 'ansi', but often this is actually 'cp1252', so I'm taking that")
                encoding = "cp1252"
            elif first_name == "mac":
                encoding = "mac_roman"
            elif first_name == "pc":
                encoding = "cp437"
            elif first_name == "pca":
                encoding = "cp850"
        if encoding is None:
            # No control word in the probed bytes names an encoding
            logger.warning("Found no encoding information, so I'm taking 'cp1252'")
            encoding = "cp1252"
        file.seek(0)
        logger.info(f"recognized encoding {encoding}")
        return encoding

    def parse_file(self) -> entities.Group:
        """
        Parses the RTF document and returns the parsed structure.

        The result is also stored in `self.parsed`. If parsing fails with an
        `Exception`, the error is logged and an object with an empty
        `structure` list is returned instead. A file opened from `rtf_path`
        is always closed again.
        """
        if self.rtf_path is not None:
            file = open(self.rtf_path, mode="rb")
        elif self.rtf_file is not None:
            file = self.rtf_file
        else:
            file = io.BytesIO(b"")
        parsed_object = utils.what_is_being_parsed(file)
        logger.info(f"Parsing the structure of {parsed_object}")
        try:
            encoding = self.read_encoding(file)
            self.parsed = entities.Group(encoding, file)
        except Exception as err:
            logger.exception(err)
            self.parsed = Namespace()
            self.parsed.structure = list()
        finally:
            if self.rtf_path is not None:
                logger.debug(f"Closing {parsed_object}")
                file.close()
        # Returning after (not inside) the `finally` clause lets exceptions
        # which are not handled above, e.g. KeyboardInterrupt, propagate.
        logger.info(f"Structure of {parsed_object} parsed")
        return self.parsed
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     pass
87 | 


--------------------------------------------------------------------------------
/src/rtfparse/renderers/html_decapsulator.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | import io
 5 | import logging
 6 | 
 7 | from rtfparse import entities, utils
 8 | from rtfparse.renderers import Renderer
 9 | 
10 | # Setup logging
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
class HTML_Decapsulator(Renderer):
    """
    Renders a parsed RTF structure by writing its text to a text file object.

    Text is suppressed while `ignore_rtf` is set, which is toggled by the
    `htmlrtf` control word.
    """

    def __init__(self) -> None:
        super().__init__()
        self.ignore_rtf = False
        # Control words which have an effect on the output, and their handlers
        self.render_word_func = {
            "par": self.newline,
            "line": self.newline,
            "tab": self.tab,
            "fromhtml": self.check_fromhtml,
            "htmlrtf": self.ignore_rtf_toggle,
        }
        # Groups with these names are not rendered at all
        self.ignore_groups = ("fonttbl", "colortbl", "generator", "formatConverter", "pntext", "pntxta", "pntxtb")

    def ignore_rtf_toggle(self, cw: entities.Control_Word) -> str:
        """
        Switches suppression on for parameter "" or 1 and off for parameter 0.
        Any other parameter leaves the state unchanged. Renders nothing.
        """
        if cw.parameter in ("", 1):
            self.ignore_rtf = True
        elif cw.parameter == 0:
            self.ignore_rtf = False
        return ""

    def check_fromhtml(self, cw: entities.Control_Word) -> str:
        """
        Logs whether the control word's parameter is 1. Renders nothing.
        """
        if cw.parameter == 1:
            logger.info("This RTF was indeed generated from HTML")
            return ""
        logger.warning(utils.warn("Encountered a part of RTF which was not generated from HTML"))
        logger.warning(utils.warn("This might not be the right renderer for it."))
        return ""

    def newline(self, cw: entities.Control_Word) -> str:
        """
        Renders a line feed, unless output is currently suppressed.
        """
        return "" if self.ignore_rtf else "\n"

    def tab(self, cw: entities.Control_Word) -> str:
        """
        Renders a tab character, unless output is currently suppressed.
        """
        return "" if self.ignore_rtf else "\t"

    def render_symbol(self, item: entities.Control_Symbol, file: io.TextIOWrapper) -> None:
        """
        Writes the text which the control symbol `item` stands for to `file`,
        unless output is currently suppressed.
        """
        if self.ignore_rtf:
            return
        symbol = item.text
        # Not rendered: "|" (obsolete formula character used by Word 5.1 for
        # Macintosh), "-" (optional hyphen), ":" (subentry in an index entry)
        if symbol in ("|", "-", ":"):
            return
        if symbol == "~":
            # Non-breaking space
            file.write("\u00a0")
        elif symbol == "_":
            # Non-breaking hyphen
            file.write("\u2011")
        elif symbol == "*":
            # Ignorable outside of Group
            logger.warning(utils.warn("Found an IGNORABLE control symbol which is not a group start!"))
        else:
            # Probably any symbol converted from a hex code: \'hh
            file.write(symbol)

    def render(self, parsed: entities.Group, file: io.TextIOWrapper) -> None:
        """
        Writes the rendering of all items in `parsed.structure` to `file`,
        descending recursively into groups which are not ignored.
        """
        for entry in parsed.structure:
            if isinstance(entry, entities.Group):
                if entry.name in self.ignore_groups:
                    continue
                self.render(entry, file)
            elif isinstance(entry, entities.Control_Word):
                # Control words without a handler are skipped
                try:
                    file.write(self.render_word_func[entry.control_name](entry))
                except KeyError:
                    pass
            elif isinstance(entry, entities.Control_Symbol):
                self.render_symbol(entry, file)
            elif isinstance(entry, entities.Plain_Text) and not self.ignore_rtf:
                file.write(entry.text)
91 | 
92 | 
93 | if __name__ == "__main__":
94 |     pass
95 | 


--------------------------------------------------------------------------------
/src/rtfparse/logging_conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Logger Configuration module
  4 | # Import this for easy logger configuration
  5 | # See example in the comment of the set_logfile_path function below
  6 | 
  7 | # Author: Sven Siegmund
  8 | # Version 4
  9 | 
 10 | """
 11 | This is to easily set the logfile name for the root logger's
 12 | file handler from the module where logging_conf
 13 | is imported. Like this:
 14 | 
 15 |     import logging_conf
 16 |     logging.config.dictConfig(logging_conf.create_dict_config(pathlib.Path.home(), "debug.log", "info.log", "error.log"))
 17 |     logging.getLogger()
 18 | 
 19 | If you want an additional custom logger, get it like this:
 20 | 
 21 |     logger = logging.getLogger("custom_logger")
 22 | 
 23 | The custom logger is configured to propagate its log records to the root logger
 24 | """
 25 | 
 26 | import pathlib
 27 | 
 28 | 
 29 | def create_dict_config(directory: pathlib.Path, all_log: str, info_log: str, error_log: str) -> dict:
 30 |     """
 31 |     Creates a logging configuration with path to logfiles set as
 32 |     given by the arguments
 33 |     """
 34 |     file_formatter_conf = {
 35 |         "format": "{message:<50s} {levelname:>9s} {asctime}.{msecs:03.0f} {module} {funcName} ",
 36 |         "style": "{",
 37 |         # "datefmt": "%Y-%m-%d %H:%M:%S",
 38 |         "datefmt": "%H:%M:%S",
 39 |     }
 40 | 
 41 |     console_formatter_conf = {
 42 |         "format": "{message}",
 43 |         # "format": "{asctime},{msecs:03.0f} {levelname:>9s} {module} {funcName}: {message}",
 44 |         "style": "{",
 45 |         "datefmt": "%a %H:%M:%S",
 46 |     }
 47 | 
 48 |     formatters_dict = {"file_formatter": file_formatter_conf, "console_formatter": console_formatter_conf}
 49 | 
 50 |     root_console_handler_conf = {"class": "logging.StreamHandler", "level": "INFO", "formatter": "console_formatter", "stream": "ext://sys.stdout"}
 51 | 
 52 |     root_file_handler_conf = {
 53 |         "class": "logging.FileHandler",
 54 |         "level": "DEBUG",
 55 |         "formatter": "file_formatter",
 56 |         "filename": directory / all_log,
 57 |         "mode": "w",
 58 |         "encoding": "utf-8",
 59 |     }
 60 | 
 61 |     custom_error_file_handler_conf = {
 62 |         "class": "logging.FileHandler",
 63 |         "level": "ERROR",
 64 |         "formatter": "file_formatter",
 65 |         "filename": directory / error_log,
 66 |         "mode": "w",
 67 |         "encoding": "utf-8",
 68 |     }
 69 | 
 70 |     custom_info_file_handler_conf = {
 71 |         "class": "logging.FileHandler",
 72 |         "level": "INFO",
 73 |         "formatter": "file_formatter",
 74 |         "filename": directory / info_log,
 75 |         "mode": "w",
 76 |         "encoding": "utf-8",
 77 |     }
 78 | 
 79 |     handlers_dict = {
 80 |         "root_console_handler": root_console_handler_conf,
 81 |         "root_file_handler": root_file_handler_conf,
 82 |         "custom_error_file_handler": custom_error_file_handler_conf,
 83 |         "custom_info_file_handler": custom_info_file_handler_conf,
 84 |     }
 85 | 
 86 |     custom_logger_conf = {"propagate": True, "handlers": ["custom_error_file_handler", "custom_info_file_handler"], "level": "DEBUG"}
 87 | 
 88 |     root_logger_conf = {
 89 |         "handlers": ["root_file_handler", "root_console_handler", "custom_error_file_handler", "custom_info_file_handler"],
 90 |         "level": "DEBUG",
 91 |     }
 92 | 
 93 |     loggers_dict = {"custom_logger": custom_logger_conf}
 94 | 
 95 |     dict_config = {
 96 |         "version": 1,
 97 |         "disable_existing_loggers": False,
 98 |         "formatters": formatters_dict,
 99 |         "handlers": handlers_dict,
100 |         "loggers": loggers_dict,
101 |         "root": root_logger_conf,
102 |         "incremental": False,
103 |     }
104 |     return dict_config
105 | 


--------------------------------------------------------------------------------
/src/rtfparse/re_patterns.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | import re
  5 | 
  6 | # Helper functions to construct raw regular expressions "strings" (actually byte strings)
  7 | 
  8 | 
  9 | def group(content: bytes) -> bytes:
 10 |     if content:
 11 |         return rb"[" + content + rb"]"
 12 |     else:
 13 |         return b""
 14 | 
 15 | 
 16 | def named_regex_group(name: str, content: bytes) -> bytes:
 17 |     group_start = rb"(?P<" + name.encode("ascii") + rb">"
 18 |     group_end = rb")"
 19 |     return rb"".join((group_start, content, group_end))
 20 | 
 21 | 
 22 | def not_preceded_by(preceding: bytes, actual: bytes) -> bytes:
 23 |     return rb"(?<!" + preceding + rb")" + actual
 24 | 
 25 | 
 26 | def not_followed_by(preceding: bytes, actual: bytes) -> bytes:
 27 |     return rb"(?<!" + preceding + rb")" + actual
 28 | 
 29 | 
 30 | def no_capture(content: bytes) -> bytes:
 31 |     return rb"(?:" + content + rb")"
 32 | 
 33 | 
 34 | # Raw regular expression "strings" (actually byte strings)
 35 | 
 36 | 
# Backslash and both curly braces, escaped for use inside a character class
_control_characters = rb"\\\{\}"
# The regex escape text `\r\n` (four bytes), not literal CR and LF bytes
_newline = b"\\" + rb"r" + b"\\" + rb"n"
# Character classes of (and of everything but) the characters above
control_character = group(_control_characters)
not_control_character = group(rb"^" + _control_characters)
_control_characters_or_newline = _control_characters + _newline
control_character_or_newline = group(_control_characters + _newline)
not_control_character_or_newline = group(rb"^" + _control_characters_or_newline)
# A backslash which is not itself preceded by a backslash,
# once as the named group "backslash" and once without a group
rtf_backslash = named_regex_group("backslash", not_preceded_by(rb"\\", rb"\\"))
unnamed_rtf_backslash = not_preceded_by(rb"\\", rb"\\")
_letters = rb"a-zA-Z"
# 1 to 32 ASCII letters
ascii_letters = group(_letters) + rb"{1,32}"
_digits = rb"0-9"
# Hexadecimal digits; only the lowercase letters a-f are included
_hdigits = rb"0-9a-f"
# The two characters `\*`
ignorable = named_regex_group("ignorable", rb"\\\*")
# An opening brace not preceded by an `unnamed_rtf_backslash`,
# optionally followed by `\*`
rtf_brace_open = named_regex_group("group_start", not_preceded_by(unnamed_rtf_backslash, rb"\{") + ignorable + rb"?")
# A closing brace not preceded by an `unnamed_rtf_backslash`
rtf_brace_close = named_regex_group("group_end", not_preceded_by(unnamed_rtf_backslash, rb"\}"))


# An optional minus sign
minus = named_regex_group("minus", rb"-?")
# An optional minus sign followed by 1 to 10 decimal digits
digit = named_regex_group("digit", minus + group(_digits) + rb"{1,10}")
# A single character out of `_hdigits`
hdigit = named_regex_group("hdigit", group(_hdigits))
parameter_pattern = named_regex_group("parameter", digit)
space = named_regex_group("space", rb" ")
newline = named_regex_group("newline", _newline)
# A single character which is neither an ASCII letter nor a decimal digit
other = named_regex_group("other", group(rb"^" + _letters + _digits))
# `group(rb"")` is empty, so this is a named group which matches the empty string
nothing = named_regex_group("nothing", group(rb""))


# The letters of a control word with an optional numeric parameter
ascii_letter_sequence = named_regex_group("control_name", ascii_letters + parameter_pattern + rb"?")
# Alternatives tried, in this order, after a control word: space, newline,
# another non-alphanumeric character, the empty string, end of input
delimiter = named_regex_group("delimiter", rb"|".join((space, newline, other, nothing, rb"$")))
symbol = named_regex_group("symbol", other)
# Backslash, control name with optional parameter, delimiter
control_word_pattern = named_regex_group("control_word", rtf_backslash + ascii_letter_sequence + delimiter)
# Group start, group end or control word, as a non-capturing alternation
pcdata_delimiter = no_capture(rb"|".join((rtf_brace_open, rtf_brace_close, control_word_pattern)))
# One or more characters which are neither control characters nor CR/LF,
# followed by a control character, CR/LF or the end of input
plain_text_pattern = named_regex_group("text", not_control_character_or_newline + rb"+") + no_capture(
    rb"|".join((control_character_or_newline, rb"$"))
)
# Any two characters
probe_pattern = rb".."
 74 | 
 75 | 
 76 | class Bytes_Regex:
 77 |     """
 78 |     This wraps `re.pattern` objects and gives them a method `regex101` which
 79 |     prints out the pattern in such a manner that it can be copy-pasted
 80 |     to regex101.com.
 81 |     """
 82 | 
 83 |     def __init__(self, Bytes: bytes, flags: re.RegexFlag = 0) -> None:
 84 |         self.pattern_bytes = Bytes
 85 |         self.pattern = re.compile(Bytes, flags)
 86 |         self.match = self.pattern.match
 87 | 
 88 |     def regex101(self) -> None:
 89 |         print(self.pattern_bytes.decode("ascii"))
 90 | 
 91 | 
# Compiled patterns, built from the raw patterns defined above

# A backslash which is not preceded by another backslash
meaningful_bs = Bytes_Regex(rtf_backslash)
# Any two bytes; DOTALL lets `.` match newline bytes as well
probe = Bytes_Regex(named_regex_group("probe", probe_pattern), flags=re.DOTALL)
parameter = Bytes_Regex(parameter_pattern)
control_word = Bytes_Regex(control_word_pattern)
# A backslash followed by one non-alphanumeric character
control_symbol = Bytes_Regex(rtf_backslash + symbol)
group_start = Bytes_Regex(rtf_brace_open)
group_end = Bytes_Regex(rtf_brace_close)
plain_text = Bytes_Regex(plain_text_pattern)


# As few bytes as possible up to the next group start, group end or control word
raw_pcdata = Bytes_Regex(named_regex_group("pcdata", rb".*?") + pcdata_delimiter, flags=re.DOTALL)
# One or more lowercase hexadecimal digits, CR or LF
raw_sdata = Bytes_Regex(named_regex_group("sdata", group(_hdigits + rb"\r\n") + rb"+"), flags=re.DOTALL)
104 | 


--------------------------------------------------------------------------------
/src/rtfparse/cli.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # PYTHON_ARGCOMPLETE_OK
 3 | 
 4 | import io
 5 | import logging
 6 | import logging.config
 7 | from argparse import ArgumentParser, Namespace
 8 | from pathlib import Path
 9 | 
10 | import argcomplete
11 | import compressed_rtf as cr
12 | import extract_msg as em
13 | from provide_dir import provide_dir
14 | 
15 | from rtfparse import logging_conf
16 | from rtfparse.__about__ import __version__
17 | from rtfparse.parser import Rtf_Parser
18 | from rtfparse.renderers.html_decapsulator import HTML_Decapsulator
19 | 
20 | 
def setup_logger(directory: Path) -> logging.Logger:
    """
    Configures logging to use `directory` for the log files and returns
    this module's logger.

    If `directory` cannot be created because a file of that name already
    exists, a hint is printed and the logging configuration is left
    untouched; the returned logger then uses whatever logging setup is
    already in place.
    """
    try:
        provide_dir(directory)
        logger_config = logging_conf.create_dict_config(directory, "rtfparse.debug.log", "rtfparse.info.log", "rtfparse.errors.log")
    except FileExistsError:
        print(f"Failed to create the directory `{str(directory)}` because it already exists as a file.")
        print(f"Please create the directory `{str(directory)}`")
    else:
        # Apply the configuration only when it was actually built. Doing this
        # in a `finally` clause would reference an unbound `logger_config`
        # after a FileExistsError and hide the hint behind an UnboundLocalError.
        logging.config.dictConfig(logger_config)
    return logging.getLogger(__name__)
35 | 
36 | 
# Module-level logger. Note that this runs at import time: merely importing
# this module sets up logging with `~/rtfparse` as the log directory.
logger = setup_logger(Path.home() / "rtfparse")
38 | 
39 | 
def argument_parser() -> ArgumentParser:
    """
    Builds the `rtfparse` command line parser with all its options
    """
    # One entry per option, in the order in which they appear in `--help`
    option_specs = (
        (("-v", "--version"), {"action": "version", "version": f"%(prog)s {__version__}", "help": "print out rtfparse version and exit"}),
        (("-r", "--rtf-file"), {"action": "store", "metavar": "PATH", "type": Path, "help": "path to the rtf file"}),
        (("-m", "--msg-file"), {"action": "store", "metavar": "PATH", "type": Path, "help": "Parse RTF from MS Outlook's .msg file"}),
        (("-d", "--decapsulate-html"), {"action": "store_true", "help": "Decapsulate HTML from RTF"}),
        (("-i", "--embed-img"), {"action": "store_true", "help": "Embed images from email to HTML"}),
        (("-o", "--output-file"), {"metavar": "PATH", "type": Path, "help": "path to the desired output file"}),
        (("-a", "--attachments-dir"), {"metavar": "PATH", "type": Path, "help": "path to directory where to save email attachments"}),
    )
    parser = ArgumentParser(description="RTF parser", prog="rtfparse")
    for option_flags, option_settings in option_specs:
        parser.add_argument(*option_flags, **option_settings)
    return parser
53 | 
54 | 
def decapsulate(rp: Rtf_Parser, target_file: Path) -> None:
    """
    Renders the HTML encapsulated in the parsed RTF of `rp` into
    `target_file`, which is written as UTF-8 text
    """
    html_renderer = HTML_Decapsulator()
    with open(target_file, mode="w", encoding="utf-8") as html_output:
        logger.info("Rendering the encapsulated HTML")
        html_renderer.render(rp.parsed, html_output)
        logger.info("Encapsulated HTML rendered")
61 | 
62 | 
def run(cli_args: Namespace) -> None:
    """
    Parses the RTF input selected on the command line and optionally
    decapsulates its HTML.

    The input is either `--rtf-file` (if the file exists) or, failing that,
    the compressed RTF body of `--msg-file`. For a .msg file the attachments
    are saved into `--attachments-dir` if given, and the decompressed RTF is
    written next to the .msg file with the suffix `.rtf`.

    If both `--decapsulate-html` and `--output-file` are given, the
    encapsulated HTML is written to the output file with the suffix `.html`.
    """
    # Stays None when no input could be parsed, so that the decapsulation
    # step below can report this instead of failing on an unbound name.
    rp = None
    if cli_args.rtf_file and cli_args.rtf_file.exists():
        with open(cli_args.rtf_file, mode="rb") as rtf_file:
            rp = Rtf_Parser(rtf_file=rtf_file)
            rp.parse_file()
    elif cli_args.msg_file:
        msg = em.openMsg(f"{cli_args.msg_file}")
        if cli_args.attachments_dir:
            provide_dir(cli_args.attachments_dir)
            for attachment in msg.attachments:
                # NOTE(review): the attachment name comes from the email and is
                # used as-is; a name containing path separators could end up
                # outside of the attachments directory.
                with open(cli_args.attachments_dir / f"{attachment.longFilename}", mode="wb") as att_file:
                    att_file.write(attachment.data)
        decompressed_rtf = cr.decompress(msg.compressedRtf)
        with open(cli_args.msg_file.with_suffix(".rtf"), mode="wb") as email_rtf:
            email_rtf.write(decompressed_rtf)
        with io.BytesIO(decompressed_rtf) as rtf_file:
            rp = Rtf_Parser(rtf_file=rtf_file)
            rp.parse_file()
    elif cli_args.rtf_file:
        # An rtf file was requested but does not exist and there is no msg file to fall back to
        logger.error(f"RTF file `{cli_args.rtf_file}` does not exist.")
    if cli_args.decapsulate_html and cli_args.output_file:
        if rp is None:
            logger.error("Nothing to decapsulate: no RTF input was parsed. Provide an existing --rtf-file or a --msg-file.")
            return
        decapsulate(rp, cli_args.output_file.with_suffix(".html"))
83 | 
84 | 
def main() -> None:
    """
    Command line entry point: parses the arguments and hands them to `run`.

    Any exception escaping `run` is logged with its traceback and is not
    re-raised.
    """
    logger.debug("rtfparse started")
    cli_parser = argument_parser()
    # Shell completion has to be hooked in before the arguments are parsed
    argcomplete.autocomplete(cli_parser)
    cli_args = cli_parser.parse_args()
    logger.debug(f"Parsed arguments: {cli_args}")
    try:
        run(cli_args)
    except Exception as err:
        logger.exception(f"Uncaught exception {err!r} occurred.")
    logger.debug("rtfparse ended")
99 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # rtfparse
  2 | 
  3 | Parses Microsoft's Rich Text Format (RTF) documents. It creates an in-memory object which represents the tree structure of the RTF document. This object can in turn be rendered by using one of the renderers.
  4 | So far, rtfparse provides only one renderer (`HTML_Decapsulator`) which liberates the HTML code encapsulated in RTF. This will come in handy, for example, if you ever need to extract the HTML from an HTML-formatted email message saved by Microsoft Outlook.
  5 | 
  6 | MS Outlook also tends to use RTF compression, so the CLI of rtfparse can optionally decompress that, too.
  7 | 
  8 | You can of course write your own renderers of parsed RTF documents and consider contributing them to this project.
  9 | 
 10 | 
 11 | # Installation
 12 | 
 13 | Install rtfparse from your local repository with pip:
 14 | 
 15 |     pip install rtfparse
 16 | 
 17 | Installation creates an executable file `rtfparse` in your python scripts folder which should be in your `$PATH`.
 18 | 
 19 | # Usage From Command Line
 20 | 
 21 | Use the `rtfparse` executable from the command line. Read `rtfparse --help`.
 22 | 
 23 | rtfparse writes logs into `~/rtfparse/` into these files:
 24 | 
 25 | ```
 26 | rtfparse.debug.log
 27 | rtfparse.info.log
 28 | rtfparse.errors.log
 29 | ```
 30 | 
 31 | ## Example: Decapsulate HTML from an uncompressed RTF file
 32 | 
 33 |     rtfparse --rtf-file "path/to/rtf_file.rtf" --decapsulate-html --output-file "path/to/extracted.html"
 34 | 
 35 | ## Example: Decapsulate HTML from MS Outlook email file
 36 | 
 37 | For this, the CLI of rtfparse uses [extract_msg](https://github.com/TeamMsgExtractor/msg-extractor) and [compressed_rtf](https://github.com/delimitry/compressed_rtf).
 38 | 
 39 |     rtfparse --msg-file "path/to/email.msg" --decapsulate-html --output-file "path/to/extracted.html"
 40 | 
 41 | ## Example: Only decompress the RTF from MS Outlook email file
 42 | 
 43 |     rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf"
 44 | 
 45 | ## Example: Decapsulate HTML from MS Outlook email file and save (and later embed) the attachments
 46 | 
 47 | When extracting the RTF from the `.msg` file, you can save the attachments (which includes images embedded in the email text) in a directory:
 48 | 
 49 |     rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf" --attachments-dir "path/to/dir"
 50 | 
 51 | In `rtfparse` version 1.x you will be able to embed these images in the decapsulated HTML. This functionality will be provided by the package [embedimg](https://github.com/fleetingbytes/embedimg).
 52 | 
 53 |     rtfparse --msg-file "path/to/email.msg" --output-file "path/to/extracted.rtf" --attachments-dir "path/to/dir" --embed-img
 54 | 
 55 | In the current version the option `--embed-img` does nothing.
 56 | 
 57 | # Programmatic usage in a Python module
 58 | 
 59 | ## Decapsulate HTML from an uncompressed RTF file
 60 | 
 61 | ```py
 62 | from pathlib import Path
 63 | from rtfparse.parser import Rtf_Parser
 64 | from rtfparse.renderers.html_decapsulator import HTML_Decapsulator
 65 | 
 66 | source_path = Path(r"path/to/your/rtf/document.rtf")
 67 | target_path = Path(r"path/to/your/html/decapsulated.html")
 68 | # Create parent directory of `target_path` if it does not already exist:
 69 | target_path.parent.mkdir(parents=True, exist_ok=True)
 70 | 
 71 | parser = Rtf_Parser(rtf_path=source_path)
 72 | parsed = parser.parse_file()
 73 | 
 74 | renderer = HTML_Decapsulator()
 75 | 
 76 | with open(target_path, mode="w", encoding="utf-8") as html_file:
 77 |     renderer.render(parsed, html_file)
 78 | ```
 79 | 
 80 | ## Decapsulate HTML from an MS Outlook msg file
 81 | 
 82 | ```py
 83 | from pathlib import Path
 84 | from extract_msg import openMsg
 85 | from compressed_rtf import decompress
 86 | from io import BytesIO
 87 | from rtfparse.parser import Rtf_Parser
 88 | from rtfparse.renderers.html_decapsulator import HTML_Decapsulator
 89 | 
 90 | 
 91 | source_file = Path("path/to/your/source.msg")
 92 | target_file = Path(r"path/to/your/target.html")
 93 | # Create parent directory of `target_file` if it does not already exist:
 94 | target_file.parent.mkdir(parents=True, exist_ok=True)
 95 | 
 96 | # Get a decompressed RTF bytes buffer from the MS Outlook message
 97 | msg = openMsg(source_file)
 98 | decompressed_rtf = decompress(msg.compressedRtf)
 99 | rtf_buffer = BytesIO(decompressed_rtf)
100 | 
101 | # Parse the rtf buffer
102 | parser = Rtf_Parser(rtf_file=rtf_buffer)
103 | parsed = parser.parse_file()
104 | 
105 | # Decapsulate the HTML from the parsed RTF
106 | decapsulator = HTML_Decapsulator()
107 | with open(target_file, mode="w", encoding="utf-8") as html_file:
108 |     decapsulator.render(parsed, html_file)
109 | ```
110 | 
111 | # RTF Specification Links
112 | 
113 | * [RTF Informative References](https://learn.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxrtfcp/85c0b884-a960-4d1a-874e-53eeee527ca6)
114 | * [RTF Specification 1.9.1](https://go.microsoft.com/fwlink/?LinkId=120924)
115 | * [RTF Extensions, MS-OXRTFEX](https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxrtfex/411d0d58-49f7-496c-b8c3-5859b045f6cf)
116 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = [
  3 |     "hatchling>=1.27.0",
  4 |     "hatch-semver"
  5 | ]
  6 | build-backend = "hatchling.build"
  7 | 
  8 | [project]
  9 | name = "rtfparse"
 10 | description = "Tool to parse Microsoft Rich Text Format (RTF)"
 11 | readme = "README.md"
 12 | license = "MIT"
 13 | requires-python = ">=3.10"
 14 | authors = [
 15 |   { name = "Sven Siegmund", email = "sven.siegmund@iav.de" },
 16 | ]
 17 | classifiers = [
 18 |     #"Development Status :: 3 - Alpha",
 19 |     #"Development Status :: 4 - Beta",
 20 |     "Development Status :: 5 - Production/Stable",
 21 |     "Intended Audience :: Developers",
 22 |     "Environment :: Console",
 23 |     "Topic :: Software Development :: Testing",
 24 |     "Topic :: Utilities",
 25 |     "Natural Language :: English",
 26 |     "Programming Language :: Python :: 3.10",
 27 |     "Programming Language :: Python :: 3.11",
 28 |     "Operating System :: OS Independent",
 29 |     "Operating System :: Microsoft :: Windows",
 30 |     "Operating System :: POSIX :: Linux",
 31 |     "Operating System :: MacOS :: MacOS X",
 32 | ]
 33 | keywords = [
 34 |     "rtf",
 35 |     "parse",
 36 | ]
 37 | dependencies = [
 38 |     "argcomplete",
 39 |     "extract-msg",
 40 |     "compressed_rtf",
 41 |     "provide_dir",
 42 | ]
 43 | dynamic = ["version"]
 44 | 
 45 | [project.urls]
 46 | Documentation = "https://github.com/fleetingbytes/rtfparse#readme"
 47 | Issues = "https://github.com/fleetingbytes/rtfparse/issues"
 48 | Source = "https://github.com/fleetingbytes/rtfparse"
 49 | 
 50 | [project.scripts]
 51 | rtfparse = "rtfparse.cli:main"
 52 | 
 53 | [tool.hatch.version]
 54 | path = "src/rtfparse/__about__.py"
 55 | validate-bump = true
 56 | scheme = "semver"
 57 | 
 58 | [tool.hatch.envs.default]
 59 | dependencies = [
 60 |   "pytest-cov",
 61 | ]
 62 | [tool.hatch.envs.default.scripts]
 63 | cov = "pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=src/rtfparse --cov=tests {args}"
 64 | no-cov = "cov --no-cov {args}"
 65 | 
 66 | [tool.hatch.envs.style]
 67 | dependencies = [
 68 |     "ruff",
 69 | ]
 70 | 
 71 | [tool.hatch.envs.style.scripts]
 72 | fmt = [
 73 |     "ruff format",
 74 |     "ruff check",
 75 | ]
 76 | 
 77 | [tool.hatch.envs.tc]
 78 | dependencies = [
 79 |     "towncrier",
 80 | ]
 81 | 
 82 | [tool.hatch.envs.tc.scripts]
 83 | draft = "towncrier build --draft"
 84 | build = "towncrier build --yes"
 85 | 
 86 | [tool.hatch.envs.docs]
 87 | dependencies = [
 88 |     "pdoc3"
 89 | ]
 90 | 
 91 | [[tool.hatch.envs.test.matrix]]
 92 | python = ["311"]
 93 | 
 94 | [tool.coverage.run]
 95 | branch = true
 96 | parallel = true
 97 | omit = [
 98 |   #"src/rtfparse/__about__.py",
 99 | ]
100 | 
101 | [tool.coverage.report]
102 | exclude_lines = [
103 |   "no cov",
104 |   "if __name__ == .__main__.:",
105 |   "if TYPE_CHECKING:",
106 | ]
107 | 
108 | [tool.ruff]
109 | # Exclude a variety of commonly ignored directories.
110 | exclude = [
111 |     ".bzr",
112 |     ".direnv",
113 |     ".eggs",
114 |     ".git",
115 |     ".git-rewrite",
116 |     ".hg",
117 |     ".ipynb_checkpoints",
118 |     ".mypy_cache",
119 |     ".nox",
120 |     ".pants.d",
121 |     ".pyenv",
122 |     ".pytest_cache",
123 |     ".pytype",
124 |     ".ruff_cache",
125 |     ".svn",
126 |     ".tox",
127 |     ".venv",
128 |     ".vscode",
129 |     "__pypackages__",
130 |     "_build",
131 |     "buck-out",
132 |     "build",
133 |     "dist",
134 |     "node_modules",
135 |     "site-packages",
136 |     "venv",
137 | ]
138 | 
139 | # Longer than Black's default line length of 88.
140 | line-length = 150
141 | indent-width = 4
142 | 
143 | # Assume Python 3.10
144 | target-version = "py310"
145 | 
146 | [tool.ruff.lint]
147 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`)  codes by default.
148 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
149 | # McCabe complexity (`C901`) by default.
150 | select = ["E4", "E7", "E9", "F"]
151 | ignore = []
152 | 
153 | # Allow fix for all enabled rules (when `--fix` is provided).
154 | fixable = ["ALL"]
155 | unfixable = []
156 | 
157 | # Allow unused variables when underscore-prefixed.
158 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
159 | 
160 | [tool.ruff.lint.pycodestyle]
161 | max-line-length = 150
162 | 
163 | [tool.ruff.format]
164 | # Like Black, use double quotes for strings.
165 | quote-style = "double"
166 | 
167 | # Like Black, indent with spaces, rather than tabs.
168 | indent-style = "space"
169 | 
170 | # Like Black, respect magic trailing commas.
171 | skip-magic-trailing-comma = false
172 | 
173 | # Like Black, automatically detect the appropriate line ending.
174 | line-ending = "auto"
175 | 
176 | # Enable auto-formatting of code examples in docstrings. Markdown,
177 | # reStructuredText code/literal blocks and doctests are all supported.
178 | #
179 | # This is currently disabled by default, but it is planned for this
180 | # to be opt-out in the future.
181 | docstring-code-format = true
182 | 
183 | # Set the line length limit used when formatting code snippets in
184 | # docstrings.
185 | #
186 | # This only has an effect when the `docstring-code-format` setting is
187 | # enabled.
188 | docstring-code-line-length = "dynamic"
189 | 
190 | [tool.towncrier]
191 | name = "rtfparse"
192 | package = "rtfparse"
193 | package_dir = "src"
194 | directory = "changelog.d"
195 | filename = "CHANGELOG.md"
196 | start_string = "<!-- towncrier release notes start -->\n"
197 | underlines = ["", "", ""]
198 | template = "changelog.d/changelog_template.jinja"
199 | #title_format = "## [{version}](https://github.com/fleetingbytes/rtfparse/{version}) - {project_date}"
200 | title_format = "## {version} ({project_date})"
201 | issue_format = "[#{issue}](https://github.com/fleetingbytes/rtfparse/issues/{issue})"
202 | orphan_prefix = "+"
203 | 
204 | [tool.towncrier.fragment.doc]
205 | name = "Documentation"
206 | 
207 | [tool.towncrier.fragment.feature]
208 | name = "New Features"
209 | 
210 | [tool.towncrier.fragment.improved]
211 | name = "Improvements"
212 | 
213 | [tool.towncrier.fragment.fixed]
214 | name = "Bugfixes"
215 | 
216 | [tool.towncrier.fragment.unimportant]
217 | name = "Development Details"
218 | 


--------------------------------------------------------------------------------
/src/rtfparse/entities.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | 
  4 | import io
  5 | import logging
  6 | 
  7 | # Own modules
  8 | from rtfparse import re_patterns, utils
  9 | from rtfparse.enums import Bytestring_Type
 10 | 
 11 | # Setup logging
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | # Constants, number of bytes to read when creating entities
 16 | CHARACTER = BACKSLASH = DELIMITER = MINUS = GROUP_END = len(b"\\")
 17 | SYMBOL = IGNORABLE = BACKSLASH + CHARACTER
 18 | GROUP_START = BACKSLASH + IGNORABLE
 19 | MAX_CW_LETTERS = 32  # As specified in RTF Spec
 20 | INTEGER_MAGNITUDE = 32  # As specified in RTF Spec
 21 | PLAIN_TEXT = CONTROL_WORD = BACKSLASH + MAX_CW_LETTERS + MINUS + len(str((1 << INTEGER_MAGNITUDE) // 2)) + DELIMITER
 22 | 
 23 | 
 24 | class Entity:
 25 |     def __init__(self) -> None:
 26 |         self.text = ""
 27 | 
 28 |     @classmethod
 29 |     def probe(cls, pattern: re_patterns.Bytes_Regex, file: io.BufferedReader) -> Bytestring_Type:
 30 |         logger.debug(f"Probing file at position {file.tell()}")
 31 |         original_position = file.tell()
 32 |         while True:
 33 |             probed = file.read(len(re_patterns.probe_pattern))
 34 |             logger.debug(f"{probed = }")
 35 |             file.seek(original_position)
 36 |             logger.debug(f"Probe returned to position {file.tell()}")
 37 |             if re_patterns.group_start.match(probed):
 38 |                 result = Bytestring_Type.GROUP_START
 39 |             elif re_patterns.group_end.match(probed):
 40 |                 result = Bytestring_Type.GROUP_END
 41 |             elif re_patterns.control_word.match(probed):
 42 |                 result = Bytestring_Type.CONTROL_WORD
 43 |             elif re_patterns.control_symbol.match(probed):
 44 |                 result = Bytestring_Type.CONTROL_SYMBOL
 45 |             elif re_patterns.plain_text.match(probed):
 46 |                 result = Bytestring_Type.PLAIN_TEXT
 47 |             else:
 48 |                 logger.debug("This does not match anything, it's probably a newline, moving on")
 49 |                 original_position += 1
 50 |                 file.seek(original_position)
 51 |                 logger.debug(f"Probe moved to position {file.tell()}")
 52 |                 if not probed:
 53 |                     logger.debug("Reached unexpected end of file.")
 54 |                     result = Bytestring_Type.GROUP_END
 55 |                     break
 56 |                 continue
 57 |             break
 58 |         logger.debug(f"Probe {result = }")
 59 |         logger.debug(f"Probe leaving file at position {file.tell()}")
 60 |         return result
 61 | 
 62 | 
 63 | class Control_Word(Entity):
 64 |     def __init__(self, encoding: str, file: io.BufferedReader) -> None:
 65 |         super().__init__()
 66 |         self.encoding = encoding
 67 |         logger.debug(f"Reading Control Word at file position {file.tell()}")
 68 |         self.control_name = "missing"
 69 |         self.parameter = ""
 70 |         self.bindata = b""
 71 |         self.start_position = file.tell()
 72 |         logger.debug(f"Starting at file position {self.start_position}")
 73 |         probe = file.read(CONTROL_WORD)
 74 |         if match := re_patterns.control_word.match(probe):
 75 |             self.control_name = match.group("control_name").decode(self.encoding)
 76 |             logger.debug(f"Preliminary {self.control_name = }")
 77 |             parameter = match.group("parameter")
 78 |             if parameter is not None:
 79 |                 self.parameter = int(parameter.decode(self.encoding))
 80 |                 logger.debug(f"{self.parameter = }")
 81 |                 self.control_name = self.control_name.removesuffix(str(self.parameter))
 82 |                 logger.debug(f"Final {self.control_name = }")
 83 |             target_position = self.start_position + match.span()[1]
 84 |             if match.group("other"):
 85 |                 logger.debug(f"Delimiter is {match.group('other').decode(self.encoding)}, len: {len(match.group('delimiter'))}")
 86 |                 target_position -= len(match.group("delimiter"))
 87 |             file.seek(target_position)
 88 |             # handle \binN:
 89 |             if self.control_name == "bin":
 90 |                 self.bindata = file.read(utils.twos_complement(self.parameter, INTEGER_MAGNITUDE))
 91 |         else:
 92 |             logger.warning("Missing Control Word")
 93 |             file.seek(self.start_position)
 94 | 
 95 |     def __repr__(self) -> str:
 96 |         return f"<{self.__class__.__name__}: {self.control_name}{self.parameter}>"
 97 | 
 98 | 
 99 | class Control_Symbol(Entity):
100 |     def __init__(self, encoding: str, file: io.BufferedReader) -> None:
101 |         super().__init__()
102 |         self.encoding = encoding
103 |         self.start_position = file.tell()
104 |         logger.debug(f"Reading Symbol at file position {self.start_position}")
105 |         self.char = ""
106 |         self.text = chr(file.read(SYMBOL)[-1])
107 |         if self.text == "'":
108 |             self.char = file.read(SYMBOL).decode(self.encoding)
109 |             self.text = bytes((int(self.char, base=16),)).decode(self.encoding)
110 |             logger.debug(f"Encountered escaped ANSI character, read two more bytes: {self.char}, character: {self.text}")
111 |             if self.text in "\\{}":
112 |                 file.seek(file.tell() - SYMBOL)
113 | 
114 |     def __repr__(self) -> str:
115 |         return f"<{self.__class__.__name__}: {self.text}>"
116 | 
117 | 
class Plain_Text(Entity):
    """
    A run of plain text.

    The file is read in chunks of `PLAIN_TEXT` bytes for as long as the chunks
    consist entirely of plain text. Afterwards the file is positioned right
    after the last plain text byte and `text` holds the whole run decoded with
    `encoding`. `start_position` is the position where the last chunk started.
    """

    def __init__(self, encoding: str, file: io.BufferedReader) -> None:
        super().__init__()
        self.encoding = encoding
        self.text = ""
        logger.debug("Constructing Plain_Text")
        # The raw bytes are collected and decoded once at the end: a chunk
        # boundary may fall into the middle of a multi-byte character, so
        # decoding chunk by chunk could fail for multi-byte encodings.
        raw_chunks = []
        while True:
            self.start_position = file.tell()
            read = file.read(PLAIN_TEXT)
            logger.debug(f"Read file from {self.start_position} to position {file.tell()}, read: {read}")
            # see if we have read all the plain text there is:
            match = re_patterns.plain_text.match(read)
            if match is None:
                file.seek(self.start_position)
                break
            logger.debug("This matches the plain text pattern")
            raw_text = match.group("text")
            logger.debug(f"{raw_text = }")
            raw_chunks.append(raw_text)
            # File positions count bytes, so compare and seek by the number of
            # bytes matched, not by the number of decoded characters.
            if len(raw_text) == PLAIN_TEXT:
                continue
            file.seek(self.start_position + len(raw_text))
            break
        self.text = b"".join(raw_chunks).decode(self.encoding)
        logger.debug(f"{self.text = }")
        logger.debug(f"Returned to position {file.tell()}")

    def __repr__(self) -> str:
        return f"<{self.__class__.__name__}: {self.text}>"
147 | 
148 | 
class Group(Entity):
    """
    An RTF group together with everything it contains.

    Reads the group start, then keeps probing the file and appends every
    entity found (control words, control symbols, plain text and nested
    groups) to `structure` until the group end is reached. The group is
    named after its first entity if that is a control word, otherwise its
    name stays "unknown".
    """

    def __init__(self, encoding: str, file: io.BufferedReader) -> None:
        super().__init__()
        logger.debug("Group.__init__")
        self.encoding = encoding
        self.known = False
        self.name = "unknown"
        self.ignorable = False
        self.structure = list()
        parsed_object = utils.what_is_being_parsed(file)
        logger.debug(f"Creating destination group from {parsed_object}")
        self.start_position = file.tell()
        logger.debug(f"Starting at file position {self.start_position}")
        probe = file.read(GROUP_START)
        logger.debug(f"Read file up to position {file.tell()}, read {probe = }")
        group_opening = re_patterns.group_start.match(probe)
        if group_opening is None:
            logger.warning(utils.warn("Expected a group but found no group start. Creating unknown group"))
            file.seek(self.start_position)
        else:
            self.known = bool(group_opening.group("group_start"))
            self.ignorable = bool(group_opening.group("ignorable"))
            if not self.ignorable:
                # Only the opening brace belongs to the group start, give the rest back
                file.seek(self.start_position + GROUP_START - IGNORABLE)
                logger.debug(f"Returned to position {file.tell()}")
        # Anything which the probe does not report as one of these is read as plain text
        entity_types = {
            Bytestring_Type.CONTROL_WORD: Control_Word,
            Bytestring_Type.GROUP_START: Group,
            Bytestring_Type.CONTROL_SYMBOL: Control_Symbol,
        }
        while (probed := self.probe(re_patterns.probe, file)) is not Bytestring_Type.GROUP_END:
            entity_type = entity_types.get(probed, Plain_Text)
            self.structure.append(entity_type(self.encoding, file))
        # consume the group end
        file.read(GROUP_END)
        # name the group like its first Control Word
        # this way the renderer will be able to ignore entire groups based on their first control word
        if self.structure and isinstance(self.structure[0], Control_Word):
            self.name = self.structure[0].control_name

    def __repr__(self) -> str:
        return f"<Group {self.name}>"
196 | 
197 | 
# This module does nothing when run as a script
if __name__ == "__main__":
    pass
200 | 


--------------------------------------------------------------------------------