├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── 0. Introduction.md ├── 1. Segment and Span.md ├── 2. In Text Object.md ├── 3. Visualization.md ├── 4. Arborform.md ├── 5. Traversal & Query.md ├── 6. Xml.md ├── 7. NLP.md ├── 8. Serialization.md ├── Pawpaw Cookbook.md ├── Using Pawpaw with nltk.md └── demos │ ├── Q&A │ ├── description.md │ └── solution.py │ ├── class_grades │ ├── description.md │ ├── input.txt │ ├── parser_compact.py │ ├── parser_verbose.py │ └── solution.py │ ├── compounds │ ├── compound_1.txt │ ├── compound_2.txt │ ├── description.md │ └── solution.py │ ├── gettysburg_address │ └── gettysburg_address.txt │ ├── us_constitution │ ├── description.md │ ├── us_constitution.py │ └── us_constitution.txt │ └── xpath_recursion_depth │ ├── description.md │ ├── solution_1.py │ └── solution_2.py ├── images └── pawpaw.png ├── pawpaw ├── __init__.py ├── _type_magic.py ├── _version.py ├── arborform │ ├── __init__.py │ ├── itorator │ │ ├── __init__.py │ │ ├── desc.py │ │ ├── extract.py │ │ ├── filter.py │ │ ├── invert.py │ │ ├── itorator.py │ │ ├── nuco.py │ │ ├── reflect.py │ │ ├── regex_itorator.py │ │ ├── split.py │ │ └── value_func.py │ └── postorator │ │ ├── __init__.py │ │ ├── postorator.py │ │ ├── stacked_reduce.py │ │ └── windowed_join.py ├── errors.py ├── infix.py ├── ito.py ├── nlp │ ├── __init__.py │ └── nlp.py ├── ontology │ ├── __init__.py │ ├── _query.py │ └── ontology.py ├── query │ ├── __init__.py │ └── _query.py ├── span.py ├── table │ ├── __init__.py │ ├── styles │ │ ├── __init__.py │ │ └── styles.py │ └── table.py ├── util.py ├── visualization │ ├── __init__.py │ ├── ascii_box.py │ ├── highlighter.py │ ├── pepo │ │ ├── __init__.py │ │ └── pepo.py │ └── sgr │ │ ├── __init__.py │ │ ├── palettes │ │ ├── __init__.py │ │ └── palettes.py │ │ └── sgr.py └── xml │ ├── __init__.py │ ├── descriptors.py │ ├── xml_helper.py │ └── xml_parser.py ├── pyproject.toml └── tests ├── __init__.py ├── arborform ├── __init__.py ├── test_connectors.py ├── test_invert.py ├── test_itorator.py ├── test_itorator_desc.py ├── test_itorator_extract.py ├── test_itorator_filter.py ├── test_itorator_reflect.py ├── test_itorator_split.py ├── test_itorator_value_func.py ├── test_nuco.py ├── test_postorator.py └── test_postorator_windowed_join.py ├── ito ├── __init__.py ├── test_child_itos.py ├── test_ito.py ├── test_ito_ctor.py ├── test_ito_descend.py ├── test_ito_regex_equivalence_methods.py ├── test_ito_serialization.py ├── test_ito_str_equivalence_methods.py └── test_ito_utility_methods.py ├── nlp ├── __init__.py └── test_nlp.py ├── ontology ├── __init__.py ├── test_keyed_list.py └── test_ontology.py ├── query ├── __init__.py └── test_query_and_traversal.py ├── table ├── __init__.py └── test_table.py ├── test_group_keys.py ├── test_invoke_func.py ├── test_span.py ├── test_type_magic.py ├── test_util.py ├── test_version.py ├── test_xml_helper.py ├── test_xml_parser.py ├── util.py └── visualization ├── __init__.py ├── test_sgr.py └── test_visualization_ascii_box.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 
12 | 13 | **To Reproduce** 14 | Provide a short, concise script that shows how to reproduce the behavior via: 15 | - Python 16 | - Jupyter Notebook 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Additional Context** 22 | Please indicate the Python and regex versions you are using, e.g. "Python 10.0.4" 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Please indicate the Python and regex versions you are using, e.g. "Python 10.0.4" 21 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * Public or private harassment 31 | * Trolling, insulting or derogatory comments, and personal or political attacks 32 | * Publishing others' private information, such as a physical or email 33 | address, without their explicit permission 34 | * Other conduct which could reasonably be considered inappropriate in a 35 | professional setting 36 | 37 | ## Enforcement Responsibilities 38 | 39 | Community leaders are responsible for clarifying and enforcing our standards of 40 | acceptable behavior and will take appropriate and fair corrective action in 41 | response to any behavior that they deem inappropriate, threatening, offensive, 42 | or harmful. 
43 | 44 | Community leaders have the right and responsibility to remove, edit, or reject 45 | comments, commits, code, wiki edits, issues, and other contributions that are 46 | not aligned to this Code of Conduct, and will communicate reasons for moderation 47 | decisions when appropriate. 48 | 49 | ## Scope 50 | 51 | This Code of Conduct applies within all community spaces, and also applies when 52 | an individual is officially representing the community in public spaces. 53 | Examples of representing our community include using an official e-mail address, 54 | posting via an official social media account, or acting as an appointed 55 | representative at an online or offline event. 56 | 57 | ## Enforcement 58 | 59 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 60 | reported to the community leaders responsible for enforcement at 61 | . 62 | All complaints will be reviewed and investigated promptly and fairly. 63 | 64 | All community leaders are obligated to respect the privacy and security of the 65 | reporter of any incident. 66 | 67 | ## Enforcement Guidelines 68 | 69 | Community leaders will follow these Community Impact Guidelines in determining 70 | the consequences for any action they deem in violation of this Code of Conduct: 71 | 72 | ### 1. Correction 73 | 74 | **Community Impact**: Use of inappropriate language or other behavior deemed 75 | unprofessional or unwelcome in the community. 76 | 77 | **Consequence**: A private, written warning from community leaders, providing 78 | clarity around the nature of the violation and an explanation of why the 79 | behavior was inappropriate. A public apology may be requested. 80 | 81 | ### 2. Warning 82 | 83 | **Community Impact**: A violation through a single incident or series 84 | of actions. 85 | 86 | **Consequence**: A warning with consequences for continued behavior. No 87 | interaction with the people involved, including unsolicited interaction with 88 | those enforcing the Code of Conduct, for a specified period of time. This 89 | includes avoiding interactions in community spaces as well as external channels 90 | like social media. Violating these terms may lead to a temporary or 91 | permanent ban. 92 | 93 | ### 3. Temporary Ban 94 | 95 | **Community Impact**: A serious violation of community standards, including 96 | sustained inappropriate behavior. 97 | 98 | **Consequence**: A temporary ban from any sort of interaction or public 99 | communication with the community for a specified period of time. No public or 100 | private interaction with the people involved, including unsolicited interaction 101 | with those enforcing the Code of Conduct, is allowed during this period. 102 | Violating these terms may lead to a permanent ban. 103 | 104 | ### 4. Permanent Ban 105 | 106 | **Community Impact**: Demonstrating a pattern of violation of community 107 | standards, including sustained inappropriate behavior, harassment of an 108 | individual, or aggression toward or disparagement of classes of individuals. 109 | 110 | **Consequence**: A permanent ban from any sort of public interaction within 111 | the community. 112 | 113 | ## Attribution 114 | 115 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 116 | version 2.0, available at 117 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 118 | 119 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 120 | enforcement ladder](https://github.com/mozilla/diversity). 
121 | 122 | [homepage]: https://www.contributor-covenant.org 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | https://www.contributor-covenant.org/faq. Translations are available at 126 | https://www.contributor-covenant.org/translations. 127 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | [![Python][Python-shield]][Python-url] 6 | [![Contributors][contributors-shield]][contributors-url] 7 | [![Watchers][watchers-shield]][watchers-url] 8 | [![Forks][forks-shield]][forks-url] 9 | [![MIT License][license-shield]][license-url] 10 | [![Stargazers][stars-social]][stars-url] 11 |
12 | 13 | ## How to contribute to Pawpaw 14 | 15 | #### **Did you find a bug?** 16 | 17 | * **Do not open up a GitHub issue if the bug is a security vulnerability in Pawpaw**, and instead to refer to the [security policy](https://rubyonrails.org/security). 18 | 19 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/rlayers/pawpaw/issues). 20 | 21 | * If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/rlayers/pawpaw/issues/new). Be sure to include a **title and clear description**, as much relevant information as possible, and a **code sample** or an **executable test case** demonstrating the expected behavior that is not occurring. 22 | 23 | #### **Did you write a patch that fixes a bug?** 24 | 25 | * Open a new GitHub pull request with your patch. 26 | 27 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 28 | 29 | #### **Did you write an enhancement to PawPaw?** 30 | 31 | * If you have a suggestion that would make this better, please fork the repo and create a pull request. 32 | 33 | * You can also simply open an issue with the tag "enhancement". 34 | 35 | #### **Did you fix whitespace, format code, or make a purely cosmetic patch?** 36 | 37 | Changes that are cosmetic in nature and do not add anything substantial to the stability, functionality, or testability of Rails will generally not be accepted. 38 | 39 | #### **Do you have questions about the source code?** 40 | 41 | * Ask Robert any question about how to use PawPaw via a.nov.guy@gmail.com. 42 | 43 | #### **Do you want to contribute to the PawPaw documentation?** 44 | 45 | * I'd love to hear feedback on expanding or improving documents. 46 | 47 | Thanks! 
:heart: :heart: :heart: 48 | 49 | Robert 50 | 51 | 52 | 53 | 54 | 55 | 60 | 61 | [repo]: https://github.com/rlayers/pawpaw 62 | 63 | [byline-img]: https://img.shields.io/badge/-High%20Performance%20Text%20Segmentation%20Framework-FFFFFF 64 | 65 | [byline2-img]: https://readme-typing-svg.demolab.com?font=Fira+Code&weight=800&duration=500&pause=1500&color=533E30&vCenter=true&width=375&height=25&lines=High+Performance+Text+Segmentation 66 | 67 | [Python-shield]: https://img.shields.io/badge/python-≥3.10-517D3D.svg?style=flat 68 | [Python-url]: https://www.python.org 69 | 70 | [contributors-shield]: https://img.shields.io/github/contributors/rlayers/pawpaw.svg?color=90C246&style=flat 71 | [contributors-url]: https://github.com/rlayers/pawpaw/graphs/contributors 72 | 73 | [watchers-shield]: https://img.shields.io/github/watchers/rlayers/pawpaw.svg?color=E4D1AE&style=flat 74 | [watchers-url]: https://github.com/rlayers/pawpaw/watchers 75 | 76 | [issues-shield]: https://img.shields.io/github/issues/rlayers/pawpaw.svg?style=flat 77 | [issues-url]: https://github.com/rlayers/pawpaw/issues 78 | 79 | [forks-social]: https://img.shields.io/github/forks/rlayers/pawpaw.svg?style=social 80 | [forks-shield]: https://img.shields.io/github/forks/rlayers/pawpaw.svg?color=D2AC70&style=flat 81 | [forks-url]: https://github.com/rlayers/pawpaw/network/members 82 | 83 | [license-shield]: https://img.shields.io/github/license/rlayers/pawpaw.svg?color=533E30&style=flat 84 | [license-url]: https://github.com/rlayers/pawpaw/blob/master/LICENSE 85 | 86 | [stars-social]: https://img.shields.io/github/stars/rlayers/pawpaw.svg?style=social 87 | [stars-shield]: https://img.shields.io/github/stars/rlayers/pawpaw.svg?style=flat 88 | [stars-url]: https://github.com/rlayers/pawpaw/stargazers 89 | 90 | [PyCharm-shield]: https://img.shields.io/badge/PyCharm-000000.svg?&style=flat&logo=PyCharm&logoColor=white 91 | [PyCharm-url]: https://www.jetbrains.com/pycharm/ 92 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Robert L. Ayers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | 13 | 14 | | Version | Supported | 15 | | ------- | ------------------ | 16 | | 1.0.x | :white_check_mark: | 17 | 18 | ## Reporting a Vulnerability 19 | 20 | To report a discovered vulnerability, e-mail me directly at a.nov.guy@gmail.com. 21 | -------------------------------------------------------------------------------- /docs/1. Segment and Span.md: -------------------------------------------------------------------------------- 1 | # Segments and Span 2 | 3 | ## Definitions 4 | 5 | **string:** 6 | an ordered sequence of **n** characters; the *length* of a string is equivalent to **n**: 7 | 8 | $$0 \leq n \equiv length$$ 9 | 10 | **substring:** 11 | an ordered sequence of **m** characters *contained in* or *equal to* a reference string of length **n**; may exist in more than one *location* within a string: 12 | 13 | $$m \leq n$$ 14 | 15 | **proper substring:** 16 | a substring of **m** characters that is *not equal to* a reference string of length **n**: 17 | 18 | $$m < n$$ 19 | 20 | **segment:** 21 | a *specific* substring of **m** characters identified by its *inclusive* **start** location, a zero-based index; the sum of **start** and **m** is equivalent to the *exclusive* **stop** location: 22 | 23 | $$start + m ≡ stop$$ 24 | 25 | as a result of this relationship, a segment can be uniquely identified using *any two* values from **start**, **stop**, or **m**; **stop** must be less than or equal to **n**: 26 | 27 | $$0 \leq start \leq stop \leq n$$ 28 | 29 | A *segment* is frequently identified by its *inclusive* **start** and *exclusive* **stop** locations within a reference string. Two integers can identify a valid segment for a string of length **n** if they are: 30 | 31 | 1. ordered 32 | 2. between zero and **n** inclusive 33 | 34 | .. sidebar:: Segment as a vector 35 | 36 | A segment can be thought of as a one dimensional vector having a location at ``start``, and a length of ``stop-start``. 37 | 38 | A segment having a non-zero **start** value *-or-* a **stop** value less than **n** defines a *proper substring*. 39 | 40 | It is possible for a segment to have identical **start** and **stop** values, in which case it defines an *empty substring* (i.e., zero length) at a *specific location*. 41 | 42 | In Python, a ``str`` is immutable. And because since substrings are themselves ``str`` objects, they too are immutable. This attribute extends to segments, which because they define a substring, are also immutable. 43 | 44 | .. admonition:: Key Concept 45 | 46 | Segments are immutable 47 | 48 | Span 49 | ==== 50 | 51 | A segment can be represented in Python with a 2-tuple of ``int`` values. Pawpaw offers a class named ``Span``[^span_name] for this purpose. ``Span`` is derived from Python's `NamedTuple `_, which results in a tuple-like object that: 52 | 53 | - has fields accessible by attribute lookup 54 | - is indexable and iterable 55 | - requires no more memory than regular tuples 56 | - has immutable elements. 57 | 58 | Because they are tuples, ``Span`` objects are themselves immutable. This ensures that immutable representation for segments within Pawpaw. 
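As a minimal illustration of this guarantee (construction itself is covered in the next example), reassigning a field of an existing ``Span`` raises ``AttributeError``, exactly as it would for any other named tuple:

```python
>>> from pawpaw import Span
>>> span = Span(0, 3)
>>> try:
...     span.start = 1  # named-tuple fields are read-only
... except AttributeError:
...     print('immutable')
immutable
```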
59 | 60 | Creating a ``Span`` only requires a pair of ``int`` value for ``start`` and ``stop``: 61 | 62 | ```python 63 | >>> from pawpaw import Span 64 | >>> Span(0, 3) 65 | Span(start=0, stop=3) 66 | ``` 67 | 68 | As a named tuple, ``Span`` can be used as direct replacement for a tuple: 69 | 70 | ```python 71 | >>> 'The vals are %d and %d.' % Span(2, 5) 72 | 'The vals are 2 and 5' 73 | ``` 74 | 75 | A ``Span`` can also be unpacked using the ``*`` operator. For example, many ``str`` methods feature 76 | ``start`` and ``end`` parameters, which can be supplied via unpacking a ``Span``: 77 | 78 | ```python 79 | >>> s = '3. This sentence has "3" as a prefix.' 80 | >>> span = Span(1, len(s)) 81 | >>> s.find('3', *span) 82 | 22 83 | ``` 84 | 85 | Slice Coordination 86 | ================== 87 | 88 | A ``Span`` can be easily converted to a Python ``slice`` via unpacking: 89 | 90 | ```python 91 | >>> s = ' leading and trailing spaces ' 92 | >>> span = Span(1, len(s) - 1) 93 | >>> _slice = slice(*span) 94 | >>> s[_slice] # strip leading and trailing chars 95 | 'leading and trailing spaces' 96 | ``` 97 | 98 | However, a ``Span`` and ``slice`` are **not** equivalent. A Python's ``slice`` constructor has ``start`` and ``stop`` parameters[^slice_step], but they are *Python-style indices*, which allow negative values. So although a ``Span`` can always be converted to a ``slice``, the reverse is not true: 99 | 100 | ```python 101 | >>> slice(-10, 1) 102 | slice(-10, 1, None) 103 | >>> slice(3) 104 | slice(None, 3, None) 105 | ``` 106 | 107 | To convert from ``slice`` to ``Span``, the indices must be *normalized* to valid locations within the reference string. For example, given a ``str`` of length ``n`` and a ``slice(1, -1)``, the associated ``Span`` would be ``Span(1, n - 1)`` 108 | 109 | The ``Span`` class offers a static constructor method ``.from_indices`` that performs normalization for you: 110 | 111 | ```python 112 | >>> s = 'abcd' 113 | >>> Span.from_indices(s, 1, -1) 114 | Span(start=1, stop=3) 115 | >>> Span.from_indices(s, -1) 116 | Span(start=3, stop=4) 117 | >>> Span.from_indices(s, stop=-2) 118 | Span(start=0, stop=2) 119 | ``` 120 | 121 | The ``.from_indices`` constructor only uses the length of the reference ``str``. The first parameter, ``basis``, accepts an ``int`` or any ``Sized``[^sized] type. 122 | 123 | [^span_name]: The choice of *Span* for this type name instead of *Segment* is based on the extensive use of *span* in the ``re`` and ``regex`` modules. 124 | 125 | [^slice_step]: Python's ``slice`` constructor also features a ``step`` parameter, which defaults to 1. Slicing a ``str`` with ``step`` values other than 1 does not result in a *proper substring*, i.e., the resulting ``str`` is not contained within the starting ``str``. 126 | 127 | [^sized]: Python's ``Sized`` type supports the ``len`` keyword via a ``__len__`` method, which is used to supply a length by ``.from_indices``. 128 | -------------------------------------------------------------------------------- /docs/7. NLP.md: -------------------------------------------------------------------------------- 1 | # NLP 2 | 3 | ## Introduction 4 | 5 | Pawpaw is well suited for Natural Language Processing (NLP) software development. NLP is a deep topic, and it can sometimes be difficult to select which state of the art (SoA) approaches are best suited for your particular data and needs. 6 | 7 | The intent of Pawpaw's ``nlp`` module is not serve as a replacement for the excellent work that has been done in this field. 
Rather, ``nlp`` is available as a simple and easy to use toolbox that offers an excellent balance of features and performance for English language data. 8 | 9 | ## Architecture 10 | 11 | The ``nlp`` contains a variety of useful lexical data marker collections for Python unicode strings: 12 | 13 | * ``byte_order_controls`` 14 | * ``unicode_white_space_LF_FF`` 15 | * ``unicode_white_space_eol`` 16 | * ``unicode_white_space_other`` 17 | * ``unicode_single_quote_marks`` 18 | * ``unicode_double_quote_marks`` 19 | * ``unicode_bullets`` 20 | 21 | NLP methods are split among a class hierarchy, whose base class is ``NlpComponent``: 22 | 23 | ```mermaid 24 | classDiagram 25 | class NlpComponent{ 26 | <> 27 | +re regex.Pattern* 28 | +get_itor() pawpaw.arborform.Itorator* 29 | } 30 | 31 | NlpComponent <|-- SimpleNlp 32 | NlpComponent <|-- Paragraph 33 | NlpComponent <|-- Sentence 34 | NlpComponent <|-- Number 35 | ``` 36 | 37 | ## Introduction 38 | 39 | A complete, paragraph → Sentence → Word extraction can be achieved using the the class ``SimpleNlp``: 40 | 41 | ```python 42 | >>> import pawpaw 43 | >>> tom_sawyer = '''“Tom!” 44 | ... 45 | ... No answer. 46 | ... 47 | ... “TOM!” 48 | ... 49 | ... No answer. 50 | ... 51 | ... “What’s gone with that boy, I wonder? You TOM!” 52 | ... 53 | ... No answer.''' 54 | >>> nlp = pawpaw.nlp.SimpleNlp() 55 | >>> result = nlp.from_text(tom_sawyer) 56 | >>> tree_vis = pawpaw.visualization.pepo.Tree() 57 | >>> print(tree_vis.dumps(result)) 58 | (0, 100) 'Document' : '“Tom!”\n\nNo answer.…TOM!”\n\nNo answer.' 59 | ├──(0, 6) 'paragraph' : '“Tom!”' 60 | │ └──(0, 6) 'sentence' : '“Tom!”' 61 | │ └──(1, 4) 'word' : 'Tom' 62 | ├──(8, 18) 'paragraph' : 'No answer.' 63 | │ └──(8, 18) 'sentence' : 'No answer.' 64 | │ ├──(8, 10) 'word' : 'No' 65 | │ └──(11, 17) 'word' : 'answer' 66 | ├──(20, 26) 'paragraph' : '“TOM!”' 67 | │ └──(20, 26) 'sentence' : '“TOM!”' 68 | │ └──(21, 24) 'word' : 'TOM' 69 | ├──(28, 38) 'paragraph' : 'No answer.' 70 | │ └──(28, 38) 'sentence' : 'No answer.' 71 | │ ├──(28, 30) 'word' : 'No' 72 | │ └──(31, 37) 'word' : 'answer' 73 | ├──(40, 88) 'paragraph' : '“What’s gone with th…I wonder? You TOM!”' 74 | │ ├──(40, 78) 'sentence' : '“What’s gone with that boy, I wonder?' 75 | │ │ ├──(41, 47) 'word' : 'What’s' 76 | │ │ ├──(48, 52) 'word' : 'gone' 77 | │ │ ├──(53, 57) 'word' : 'with' 78 | │ │ ├──(58, 62) 'word' : 'that' 79 | │ │ ├──(63, 66) 'word' : 'boy' 80 | │ │ ├──(69, 70) 'word' : 'I' 81 | │ │ └──(71, 77) 'word' : 'wonder' 82 | │ └──(79, 88) 'sentence' : 'You TOM!”' 83 | │ ├──(79, 82) 'word' : 'You' 84 | │ └──(83, 86) 'word' : 'TOM' 85 | └──(90, 100) 'paragraph' : 'No answer.' 86 | └──(90, 100) 'sentence' : 'No answer.' 87 | ├──(90, 92) 'word' : 'No' 88 | └──(93, 99) 'word' : 'answer' 89 | ``` 90 | 91 | ``SimpleNlp`` creates an aborform pipeline using the classes ``Paragraph`` and ``Sentence``. 92 | 93 | *More coming soon...* 94 | -------------------------------------------------------------------------------- /docs/8. Serialization.md: -------------------------------------------------------------------------------- 1 | # SERIALIZATION 2 | 3 | ## Introduction 4 | 5 | Serialization and deserialization of ``Ito`` hierarchies is easy to accomplish in Pawpaw, which offers native support for both: 6 | 7 | * Pickling 8 | * JSON 9 | 10 | In either case, support for any dynamically ascribed ``.value`` methods are not serializable[^lambda_pickling]. 
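The underlying restriction comes from plain Python rather than from Pawpaw itself: ``pickle`` serializes functions by reference, so an anonymous lambda (typically rejected with a ``PicklingError``) cannot be round-tripped. The following minimal sketch uses only the standard library and a purely illustrative lambda to show the behavior:

```python
>>> import pickle
>>> try:
...     pickle.dumps(lambda ito: str(ito).upper())
... except Exception:
...     print('lambdas cannot be pickled')
lambdas cannot be pickled
```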
11 | 12 | ## Pickling 13 | 14 | ```python 15 | >>> import pickle 16 | >>> from pawpaw import Ito 17 | >>> s = 'See Jack run.' 18 | >>> i = Ito(s, desc='my desc') 19 | >>> i.children.add(*i.str_split()) 20 | >>> pickle_data = pickle.dumps(i) 21 | >>> j = pickle.loads(pickle_data) 22 | >>> len(j.children) 23 | 3 24 | ``` 25 | 26 | ## JSON 27 | 28 | Pawpaw offers two JSON serialization encoder options: 29 | 30 | 1. ``Ito.JsonEncoder``: **Does** serialize ``.string`` 31 | 2. ``Ito.JsonEncoderStringless``: Does **not** serialize ``.string`` 32 | 33 | ### Ito.JsonEncoder 34 | 35 | For serialization that includes string data, use the normal the python json ``.dump(s)`` 36 | and ``.load(s)`` methods, passing the ``Ito.JsonEncoder`` class and ``Ito.json_decoder`` 37 | method to each respectively: 38 | 39 | ```python 40 | >>> import json 41 | >>> json_data = json.dumps(i, cls=Ito.JsonEncoder) 42 | >>> s in json_data # verify s is present in JSON 43 | True 44 | >>> j = json.loads(json_data, object_hook=Ito.json_decoder) 45 | >>> len(j.children) 46 | 3 47 | ``` 48 | 49 | The resulting output conserves memory by saving the string data for a hierarchy once, 50 | since a given ``Ito`` and its children all share the same value. 51 | 52 | ### Ito.JsonEncoderStringless 53 | 54 | For stringless serialization, use the normal the python json ``.dump(s)`` 55 | methods and passing the ``Ito.JsonEncoderStringless`` class to them. For 56 | deserialization, use the static ``Ito.json_decode_stringless`` method, which 57 | has its inputs both the string and json data being de-serialized: 58 | 59 | ```python 60 | >>> json_data = json.dumps(i, cls=Ito.JsonEncoderStringless) 61 | >>> s in json_data # verify s not present in JSON 62 | False 63 | >>> j = Ito.json_decode_stringless(s, json_data) 64 | >>> j 65 | Ito(span=(0, 13), desc='my desc', substr='See Jack run.') 66 | ``` 67 | 68 | [^lambda_pickling]: The python pickle library supports neither lambdas nor methods not-defined at the top level of a module. See `Python pickle docs 69 | ` for more info. 70 | -------------------------------------------------------------------------------- /docs/Using Pawpaw with nltk.md: -------------------------------------------------------------------------------- 1 | # Using Pawpaw with ``nltk`` 2 | 3 | ## Tokenization 4 | 5 | ### Convert nlkt tokenizer output to Ito 6 | 7 | ```python 8 | >>> import nltk 9 | >>> from pawpaw import Ito 10 | >>> from nltk.tokenize import WhitespaceTokenizer 11 | >>> s = 'The quick brown fox.' 12 | >>> ws_tok = nltk.tokenize.WhitespaceTokenizer() 13 | >>> tokens = [Ito(s, *span, 'token') for span in ws_tok.span_tokenize(s)] 14 | >>> [str(i) for i in tokens] 15 | ['The', 'quick', 'brown', 'fox.'] 16 | ``` 17 | 18 | ### Use nltk tokenizer with split 19 | 20 | ```python 21 | >>> import nltk 22 | >>> import regex 23 | >>> from pawpaw import Ito, arborform 24 | >>> ws_tok = nltk.tokenize.WhitespaceTokenizer() 25 | >>> splitter = arborform.Split(regex.compile(ws_tok._pattern, ws_tok._flags)) 26 | >>> i = Ito('The quick brown fox.') 27 | >>> [str(i) for i in splitter(i)] 28 | ['The', 'quick', 'brown', 'fox.'] 29 | ``` 30 | 31 | ### Chaining NLP 32 | 33 | ```python 34 | >>> from pawpaw import Ito, arborform, visualization 35 | >>> s = 'Here is one sentence. Here is another.' 
36 | >>> i = Ito(s) 37 | >>> 38 | >>> nltk_tok = nltk.tokenize 39 | >>> sent_itor = arborform.Itorator.wrap(lambda ito: ito.from_substrings(ito, *nltk_tok.sent_tokenize(str(ito)))) 40 | >>> 41 | >>> word_itor = arborform.Itorator.wrap(lambda ito: ito.from_substrings(ito, *nltk_tok.word_tokenize(str(ito)))) 42 | >>> sent_itor.itor_children = word_itor 43 | >>> 44 | >>> i.children.add(*sent_itor(i)) 45 | >>> vis_tree = visualization.pepo.Tree() 46 | >>> print(vis_tree.dumps(i)) 47 | (0, 39) 'None' : 'Here is one sentence. Here is another.' 48 | ├──(0, 21) 'None' : 'Here is one sentence.' 49 | │ ├──(0, 4) 'None' : 'Here' 50 | │ ├──(5, 7) 'None' : 'is' 51 | │ ├──(8, 11) 'None' : 'one' 52 | │ ├──(12, 20) 'None' : 'sentence' 53 | │ └──(20, 21) 'None' : '.' 54 | └──(23, 39) 'None' : 'Here is another.' 55 | ├──(23, 27) 'None' : 'Here' 56 | ├──(28, 30) 'None' : 'is' 57 | ├──(31, 38) 'None' : 'another' 58 | └──(38, 39) 'None' : '.' 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/demos/Q&A/description.md: -------------------------------------------------------------------------------- 1 | ## From: 2 | 3 | * [stackoverflow question 75394318](https://stackoverflow.com/questions/75394318/python-text-parsing-to-split-list-into-chunks-including-preceding-delimiters) 4 | 5 | ## Description 6 | 7 | Given the text: 8 | 9 | ```text 10 | \na\n\nQ So I do first want to bring up exhibit No. 46, which is in the binder 11 | in front of\nyou.\n\nAnd that is a letter [to] Alston\n& Bird... 12 | \n\nIs that correct?\n\nA This is correct.\n\nQ Okay 13 | ``` 14 | 15 | Split it into separate questions and answers. 16 | 17 | * Each Question or Answer starts with ``'\nQ '``, ``'\nA '``, ``'\nQ_'`` or ``'\nA_'``. 18 | * Sometimes the first item in the list may be neither a Question nor Answer, but just random text before the first ``'\Q'`` delimiter. 19 | -------------------------------------------------------------------------------- /docs/demos/Q&A/solution.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from pawpaw import Ito, arborform, visualization 3 | 4 | # INPUT 5 | text = """\na\n\nQ So I do first want to bring up exhibit No. 46, which is in the binder 6 | in front of\nyou.\n\nAnd that is a letter [to] Alston\n& Bird... 7 | \n\nIs that correct?\n\nA This is correct.\n\nQ Okay.""" 8 | 9 | # BUILD PARSER 10 | itor_split = arborform.Split(regex.compile(r'\n+(?=Q_? )', regex.DOTALL), desc='Q/A tuple') 11 | 12 | itor_filt = arborform.Filter(lambda i: i.str_startswith('Q')) # toss "random text" stuff 13 | con = arborform.Connectors.Delegate(itor_filt) 14 | itor_split.connections.append(con) 15 | 16 | # Assumes only one answer per question 17 | itor_qa_split = arborform.Split(regex.compile(r'\n+(?=A_? )', regex.DOTALL), limit=1) 18 | con = arborform.Connectors.Children.Add(itor_qa_split) 19 | itor_filt.connections.append(con) 20 | 21 | itor_extract = arborform.Extract( 22 | regex.compile(r'([QA])_? 
(?.+)', regex.DOTALL), 23 | desc=lambda match, group: match.group(1)) 24 | con = arborform.Connectors.Children.Add(itor_extract) 25 | itor_qa_split.connections.append(con) 26 | 27 | # OUTPUT TREE 28 | root = Ito(text) 29 | tree_vis = visualization.pepo.Tree() 30 | for i in itor_split(root): 31 | print(tree_vis.dumps(i)) 32 | print() 33 | 34 | # OUTPUT TUPLE 35 | for i, tup in enumerate(itor_split(root)): 36 | print(f'{tup:%desc} {i:,}:') 37 | for qa in tup.children: 38 | print(f'\t{qa:%desc% : %substr!r}') 39 | print() 40 | -------------------------------------------------------------------------------- /docs/demos/class_grades/description.md: -------------------------------------------------------------------------------- 1 | ## From 2 | 3 | * [stackoverflow question 47982949](https://stackoverflow.com/questions/47982949/how-to-parse-complex-text-files-using-python) 4 | 5 | * [codereview question 183668](https://codereview.stackexchange.com/questions/183668/parse-complex-text-files-using-python) 6 | 7 | ## Description 8 | 9 | Given the a file containing this text: 10 | 11 | ```text 12 | School = Riverdale High 13 | Grade = 1 14 | Student number, Name 15 | 0, Phoebe 16 | 1, Rachel 17 | 18 | Student number, Score 19 | 0, 3 20 | 1, 7 21 | 22 | Grade = 2 23 | Student number, Name 24 | 0, Angela 25 | 1, Tristan 26 | 2, Aurora 27 | 28 | Student number, Score 29 | 0, 6 30 | 1, 3 31 | 2, 9 32 | 33 | School = Hogwarts 34 | Grade = 1 35 | Student number, Name 36 | 0, Ginny 37 | 1, Luna 38 | 39 | Student number, Score 40 | 0, 8 41 | 1, 7 42 | 43 | Grade = 2 44 | Student number, Name 45 | 0, Harry 46 | 1, Hermione 47 | 48 | Student number, Score 49 | 0, 5 50 | 1, 10 51 | 52 | Grade = 3 53 | Student number, Name 54 | 0, Fred 55 | 1, George 56 | 57 | Student number, Score 58 | 0, 0 59 | 1, 0 60 | ``` 61 | 62 | Parset the file and create a pandas DataFrame whose output is as follows: 63 | 64 | ```text 65 | Name Score 66 | School Grade Student number 67 | Hogwarts 1 0 Ginny 8 68 | 1 Luna 7 69 | 2 0 Harry 5 70 | 1 Hermione 10 71 | 3 0 Fred 0 72 | 1 George 0 73 | Riverdale High 1 0 Phoebe 3 74 | 1 Rachel 7 75 | 2 0 Angela 6 76 | 1 Tristan 3 77 | 2 Aurora 9 78 | ``` -------------------------------------------------------------------------------- /docs/demos/class_grades/input.txt: -------------------------------------------------------------------------------- 1 | School = Riverdale High 2 | Grade = 1 3 | Student number, Name 4 | 0, Phoebe 5 | 1, Rachel 6 | 7 | Student number, Score 8 | 0, 3 9 | 1, 7 10 | 11 | Grade = 2 12 | Student number, Name 13 | 0, Angela 14 | 1, Tristan 15 | 2, Aurora 16 | 17 | Student number, Score 18 | 0, 6 19 | 1, 3 20 | 2, 9 21 | 22 | School = Hogwarts 23 | Grade = 1 24 | Student number, Name 25 | 0, Ginny 26 | 1, Luna 27 | 28 | Student number, Score 29 | 0, 8 30 | 1, 7 31 | 32 | Grade = 2 33 | Student number, Name 34 | 0, Harry 35 | 1, Hermione 36 | 37 | Student number, Score 38 | 0, 5 39 | 1, 10 40 | 41 | Grade = 3 42 | Student number, Name 43 | 0, Fred 44 | 1, George 45 | 46 | Student number, Score 47 | 0, 0 48 | 1, 0 -------------------------------------------------------------------------------- /docs/demos/class_grades/parser_compact.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from pawpaw import arborform 3 | 4 | def get_parser() -> arborform.Itorator: 5 | return arborform.Extract( 6 | regex.compile( 7 | r'(?School = (?.+?)\n' 8 | r'(?Grade = (?\d+)\n' 9 | r'Student number, Name\n(?P(?:(?P\d+), (?P.+?)\n)+)\n' 10 
| r'Student number, Score\n(?P(?:(?P\d+), (?P\d+)(?:$|\n))+)(?:$|\n)' 11 | r')+)+', 12 | regex.DOTALL 13 | ) 14 | ) 15 | -------------------------------------------------------------------------------- /docs/demos/class_grades/parser_verbose.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from pawpaw import arborform 3 | 4 | def get_parser() -> arborform.Itorator: 5 | school_splitter = arborform.Split( 6 | regex.compile(r'(?<=(?:^|\n))(?=School =)', regex.DOTALL), 7 | desc='school', 8 | tag='school splitter') 9 | 10 | name_grades = arborform.Extract( 11 | regex.compile(r'School = (?.+?)\n(?.+)(?:$|\n)', regex.DOTALL), 12 | tag='school name & grades') 13 | con = arborform.Connectors.Children.Add(name_grades) 14 | school_splitter.connections.append(con) 15 | 16 | grade_splitter = arborform.Split( 17 | regex.compile(r'(?<=\n)(?=Grade =)', regex.DOTALL), 18 | desc='grade', 19 | tag='grade splitter') 20 | con = arborform.Connectors.Delegate(grade_splitter, lambda ito: ito.desc == 'grades') 21 | name_grades.connections.append(con) 22 | 23 | grade = arborform.Extract( 24 | regex.compile(r'Grade = (?\d+)\nStudent number, Name\n(?.+?)\nStudent number, Score\n(?.+)', regex.DOTALL), 25 | tag='grade & stu_num/name * stu_num/score') 26 | con = arborform.Connectors.Children.Add(grade) 27 | grade_splitter.connections.append(con) 28 | 29 | stu_num_names = arborform.Extract( 30 | regex.compile(r'(?\d+), (?.+?)\n', regex.DOTALL), 31 | tag='stu num/name pairs') 32 | con = arborform.Connectors.Children.Add(stu_num_names, lambda ito: ito.desc == 'stu_num_names') 33 | grade.connections.append(con) 34 | 35 | stu_num_scores = arborform.Extract( 36 | regex.compile(r'(?\d+), (?\d+)(?:$|\n)', regex.DOTALL), 37 | tag='stu num/score pairs') 38 | con = arborform.Connectors.Children.Add(stu_num_scores) 39 | grade.connections.append(con) 40 | 41 | return school_splitter 42 | -------------------------------------------------------------------------------- /docs/demos/class_grades/solution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | 4 | import regex 5 | from pawpaw import Ito, visualization 6 | import pandas as pd 7 | 8 | while ((answer := input('Select (C)ompact or (V)erbose parser: ').casefold()) not in 'cv'): 9 | pass 10 | 11 | # read file 12 | with open(os.path.join(sys.path[0], 'input.txt')) as f: 13 | ito = Ito(f.read(), desc='all') 14 | 15 | # parse 16 | if answer == 'c': 17 | from parser_compact import get_parser 18 | else: 19 | from parser_verbose import get_parser 20 | parser = get_parser() 21 | ito.children.add(*parser(ito)) 22 | 23 | # display Pawpaw tree 24 | tree_vis = visualization.pepo.Tree() 25 | print(tree_vis.dumps(ito)) 26 | 27 | # build pandas DataFrame 28 | d = [] 29 | for school in ito.find_all('*[d:school]'): 30 | school_name = str(school.find('*[d:name]')) 31 | for grade in school.find_all('**[d:grade]'): 32 | grade_key = int(str(grade.find('*[d:key]'))) 33 | for stu_num in grade.find_all('*[d:stu_num_names]/*[d:stu_num]'): 34 | stu_name = str(stu_num.find('>[d:name]')) 35 | stu_num = str(stu_num) 36 | stu_score = int(str(grade.find('*[d:stu_num_scores]/*[d:stu_num]&[s:' + stu_num + ']/>[d:score]'))) 37 | d.append({'School': school_name, 'Grade': grade_key, 'Student number': stu_num, 'Name': stu_name, 'Score': stu_score}) 38 | data = pd.DataFrame(d) 39 | data.set_index(['School', 'Grade', 'Student number'], inplace=True) 40 | data = 
data.groupby(level=data.index.names).first() 41 | 42 | # display pandas DataFrame 43 | print(data) 44 | -------------------------------------------------------------------------------- /docs/demos/compounds/compound_1.txt: -------------------------------------------------------------------------------- 1 | MODEL 1 2 | REMARK minimizedAffinity -7.11687565 3 | REMARK CNNscore 0.573647082 4 | REMARK CNNaffinity 5.82644749 5 | REMARK 11 active torsions: 6 | # Lots of text here 7 | # Lots of text here 8 | # Lots of text here 9 | docs/demos/class_grades/input.txt 10 | docs/demos/class_grades/input.txt 11 | MODEL 2 12 | REMARK minimizedAffinity -6.61898327 13 | REMARK CNNscore 0.55260396 14 | REMARK CNNaffinity 5.86855984 15 | REMARK 11 active torsions: 16 | # Lots of text here 17 | # Lots of text here 18 | # Lots of text here -------------------------------------------------------------------------------- /docs/demos/compounds/compound_2.txt: -------------------------------------------------------------------------------- 1 | MODEL 1 2 | REMARK minimizedAffinity -7.11687565 3 | REMARK CNNscore 0.573647082 4 | REMARK CNNaffinity 5.82644749 5 | REMARK 11 active torsions: 6 | # Lots of text here 7 | # Lots of text here 8 | # Lots of text here 9 | docs/demos/class_grades/input.txt 10 | docs/demos/class_grades/input.txt 11 | MODEL 2 12 | REMARK minimizedAffinity -6.61898327 13 | REMARK CNNscore 0.55260396 14 | REMARK CNNaffinity 5.86855984 15 | REMARK 11 active torsions: 16 | # Lots of text here 17 | # Lots of text here 18 | # Lots of text here -------------------------------------------------------------------------------- /docs/demos/compounds/description.md: -------------------------------------------------------------------------------- 1 | ## From 2 | 3 | * [stackoverflow question 76453312](https://stackoverflow.com/questions/76453312/extract-information-from-a-list-of-files-and-write-into-a-log-file) 4 | 5 | ## Description 6 | 7 | Given one or more files that look like this: 8 | 9 | ```text 10 | MODEL 1 11 | REMARK minimizedAffinity -7.11687565 12 | REMARK CNNscore 0.573647082 13 | REMARK CNNaffinity 5.82644749 14 | REMARK 11 active torsions: 15 | #Lots of text here 16 | MODEL 2 17 | REMARK minimizedAffinity -6.61898327 18 | REMARK CNNscore 0.55260396 19 | REMARK CNNaffinity 5.86855984 20 | REMARK 11 active torsions: 21 | ``` 22 | 23 | Generate output as log file containing "MODEL", "minimizedAffinity", "CNNscore", and "CNNaffinity" of each and every compound in the folder in a delimited text file: 24 | 25 | ```text 26 | Compound Model minimizedAffinity CNNscore CNNaffinity 27 | 1 1 -7.11687565 0.573647082 5.82644749 28 | 1 2 -6.61898327 0.55260396 5.86855984 29 | ``` 30 | -------------------------------------------------------------------------------- /docs/demos/compounds/solution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | import fnmatch 4 | import typing 5 | 6 | import regex 7 | import pawpaw 8 | 9 | # Build pawpaw parser 10 | re = regex.compile(r'(?<=^|\n)(?=MODEL \d+)', regex.DOTALL) 11 | splitter = pawpaw.arborform.Split(re) 12 | 13 | pat = r""" 14 | (?P 15 | MODEL\ 16 | (?\d+) 17 | (?:\n 18 | (? 19 | REMARK\ 20 | (?[^\s]+)\ 21 | (?[^\n]+) 22 | ) 23 | )+ 24 | (?:\n 25 | (?>!=REMARK) 26 | (?.+) 27 | )? 
28 | )+ 29 | """ 30 | re = regex.compile(pat, regex.VERBOSE | regex.DOTALL) 31 | extractor = pawpaw.arborform.Extract(re) 32 | con = pawpaw.arborform.Connectors.Delegate(extractor) 33 | splitter.connections.append(con) 34 | 35 | # Prints using fixed-width for visibility: change to delimited if needed 36 | def dump_row(cols: list) -> None: 37 | print(*(f'{v: <18}' for v in cols)) 38 | 39 | # Select desired remark columns 40 | desired_remarks = ['minimizedAffinity', 'CNNscore', 'CNNaffinity'] 41 | 42 | # Headers 43 | headers = ['Compound', 'Model'] 44 | headers.extend(desired_remarks) 45 | dump_row(headers) 46 | 47 | # Create rows from compound file 48 | def compound_vals(compound: str, ito: pawpaw.Ito) -> typing.Iterable[list[str]]: 49 | for model in ito.children: 50 | vals = [compound] 51 | vals.append(str(model.find('*[d:tag]'))) 52 | for dr in desired_remarks: 53 | vals.append(str(model.find(f'*[d:remark]/*[d:tag]&[s:{dr}]/>[d:value]'))) 54 | yield vals 55 | 56 | # Read files and dump contents of each 57 | for path in os.scandir(os.path.join(sys.path[0])): 58 | if path.is_file() and fnmatch.fnmatch(path.name, 'compound_*.txt'): 59 | compound = path.name.split('_', 1)[-1].split('.', 1)[0] # compound number 60 | with open(os.path.join(sys.path[0], path)) as f: 61 | ito = pawpaw.Ito(f.read(), desc='all') 62 | ito.children.add(*splitter(ito)) 63 | for vals in compound_vals(compound, ito): 64 | dump_row(vals) 65 | -------------------------------------------------------------------------------- /docs/demos/gettysburg_address/gettysburg_address.txt: -------------------------------------------------------------------------------- 1 | Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. 2 | 3 | Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. 4 | 5 | But, in a larger sense, we can not dedicate — we can not consecrate — we can not hallow — this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. 6 | 7 | The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us — that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion — that we here highly resolve that these dead shall not have died in vain — that this nation, under God, shall have a new birth of freedom — and that government of the people, by the people, for the people, shall not perish from the earth. 
-------------------------------------------------------------------------------- /docs/demos/us_constitution/description.md: -------------------------------------------------------------------------------- 1 | ## From: 2 | 3 | * [stackoverflow question 75394318](https://stackoverflow.com/questions/75394318/python-text-parsing-to-split-list-into-chunks-including-preceding-delimiters) 4 | 5 | ## Description 6 | 7 | Given the text of the U.S. Constitution, which can be found [here](https://www.archives.gov/founding-docs/constitution-transcript), perfrom a full segmentation starting with high level parts such as articles, sections, etc. down to words. 8 | 9 | . 10 | -------------------------------------------------------------------------------- /docs/demos/us_constitution/us_constitution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | 4 | import regex 5 | import pawpaw 6 | from pawpaw import arborform 7 | 8 | """ 9 | DEMO: US CONSTITUTION 10 | 11 | This demo shows an example of how to parse, visualize, and query the US Constitution using Pawpaw. 12 | 13 | Note: The text for the constitution was taken from https://www.archives.gov/founding-docs/constitution-transcript 14 | """ 15 | 16 | def get_parser() -> arborform.Itorator: 17 | # Article: could be preamble 18 | a_splitter = arborform.Split( 19 | regex.compile(r'(?<=\n+)(?=Article\.)', regex.DOTALL), 20 | boundary_retention=arborform.Split.BoundaryRetention.NONE, 21 | tag='article splitter') 22 | 23 | a_desc = arborform.Desc( 24 | desc=lambda ito: 'article' if ito.str_startswith('Article.') else 'preamble', 25 | tag='article desc') 26 | con = arborform.Connectors.Delegate(a_desc) 27 | a_splitter.connections.append(con) 28 | 29 | con = arborform.Connectors.Children.Add(pawpaw.nlp.SimpleNlp().itor, lambda ito: ito.desc == 'preamble') 30 | a_desc.connections.append(con) 31 | 32 | a_extractor = arborform.Extract( 33 | regex.compile(r'Article\. (?[A-Z]+)\.\n(?.+)', regex.DOTALL), 34 | tag='article extractor') 35 | con = arborform.Connectors.Children.Add(a_extractor, lambda ito: ito.desc == 'article') 36 | a_desc.connections.append(con) 37 | 38 | # Section: only some articles have sections 39 | s_splitter = arborform.Split( 40 | regex.compile(r'(?<=\n+)(?=Section\.)', regex.DOTALL), 41 | boundary_retention=arborform.Split.BoundaryRetention.LEADING, 42 | desc='section', 43 | tag='section splitter') 44 | con = arborform.Connectors.Children.Add(s_splitter, lambda ito: ito.desc == 'value' and ito.str_startswith('Section.')) 45 | a_extractor.connections.append(con) 46 | con = arborform.Connectors.Children.Add(pawpaw.nlp.SimpleNlp().itor, lambda ito: ito.desc == 'value' and not ito.str_startswith('Section.')) 47 | a_extractor.connections.append(con) 48 | 49 | s_extractor = arborform.Extract(regex.compile(r'Section\. 
(?\d+)\.\n(?.+)', regex.DOTALL)) 50 | con = arborform.Connectors.Children.Add(s_extractor) 51 | s_splitter.connections.append(con) 52 | con = arborform.Connectors.Children.Add(pawpaw.nlp.SimpleNlp().itor, lambda ito: ito.desc == 'value') 53 | s_extractor.connections.append(con) 54 | 55 | return a_splitter 56 | 57 | 58 | def get_text() -> pawpaw.Ito: 59 | with open(os.path.join(sys.path[0], 'us_constitution.txt')) as f: 60 | return pawpaw.Ito(f.read(), desc='constitution') 61 | 62 | 63 | # Visualize 64 | print(f'\nVISUALIZE:\n') 65 | i = get_text() 66 | tree_vis = pawpaw.visualization.pepo.Tree() 67 | parser = get_parser() 68 | i.children.add(*parser(i)) 69 | print(tree_vis.dumps(i)) 70 | 71 | # Query 72 | print(f'\nQUERY:\n') 73 | print(f'\tGoal: Find sections containing words \'power\' or \'right\'\n') 74 | query = '**[d:section]{**[d:word] & [lcs:power,right]}' 75 | print(f'\tPlumule Query: {query}\n') 76 | print(f'\tResults:\n') 77 | for i, section in enumerate(i.find_all(query)): 78 | article_key = section.find('..[d:article]/*[d:key]') 79 | section_key = section.find('*[d:key]') 80 | section_value = section.find('*[d:value]') 81 | print(f'\t\tMatch {i}: Article {article_key}, Section {section_key}') 82 | print(f'\t\t\t{section_value:%substr:45…}') 83 | -------------------------------------------------------------------------------- /docs/demos/xpath_recursion_depth/description.md: -------------------------------------------------------------------------------- 1 | ## From 2 | 3 | * [stackoverflow question 51034706](https://stackoverflow.com/questions/51034706/breaking-the-lxml-etree-html-xpath-max-parsing-depth-limit) 4 | 5 | ## Description 6 | 7 | The XPATH parser for lxml.etree has a max depth limit, which can be seen with the following code: 8 | 9 | ```python 10 | import lxml.etree as etree 11 | 12 | # Setup HTML tabs 13 | x = "" 14 | x_ = "" 15 | 16 | # Set recursion depth to 255 17 | depth = 255 18 | 19 | # Fails with depth >= 255: 20 | print(etree.HTML(x * depth + "
<p>text to be extracted</p>
" + x_* depth).xpath("//p//text()")) 21 | ``` 22 | -------------------------------------------------------------------------------- /docs/demos/xpath_recursion_depth/solution_1.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | # Setup HTML tabs 4 | x = "" 5 | x_ = "" 6 | 7 | # Set recursion depth to 255 8 | depth = 300 9 | 10 | xml_text = f'{x * depth}
<p>text to be extracted</p>
{x_ * depth}' 11 | 12 | root = ET.fromstring(xml_text) 13 | print(root.find(".//p").text) 14 | -------------------------------------------------------------------------------- /docs/demos/xpath_recursion_depth/solution_2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.modules['_elementtree'] = None 3 | import xml.etree.ElementTree as ET 4 | 5 | from pawpaw import xml 6 | 7 | 8 | # Setup HTML tabs 9 | x = "" 10 | x_ = "" 11 | 12 | # Set recursion depth to 255 13 | depth = 300 14 | 15 | xml_text = f'{x * depth}
<p>text to be extracted</p>
{x_ * depth}' 16 | 17 | root = ET.fromstring(xml_text, parser=xml.XmlParser()) 18 | node = root.ito.find('**[d:element]{*[d:start_tag]/**[d:name]&[s:p]}/*[d:text]') 19 | print(str(node)) 20 | 21 | -------------------------------------------------------------------------------- /images/pawpaw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/images/pawpaw.png -------------------------------------------------------------------------------- /pawpaw/__init__.py: -------------------------------------------------------------------------------- 1 | from pawpaw._version import __version__, Version 2 | # del _version 3 | 4 | from pawpaw.infix import Infix 5 | del infix 6 | 7 | from pawpaw.errors import Errors 8 | del errors 9 | 10 | import pawpaw._type_magic as type_magic 11 | del _type_magic 12 | 13 | from pawpaw.span import Span 14 | del span 15 | 16 | from pawpaw.ito import nuco, GroupKeys, Ito, ChildItos, Types 17 | del ito 18 | 19 | from pawpaw.util import find_unescaped, split_unescaped, find_balanced 20 | del util 21 | 22 | import pawpaw.arborform 23 | import pawpaw.query 24 | import pawpaw.xml 25 | import pawpaw.nlp 26 | import pawpaw.table 27 | import pawpaw.visualization 28 | 29 | del pawpaw 30 | -------------------------------------------------------------------------------- /pawpaw/_type_magic.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import inspect 3 | import types 4 | import typing 5 | 6 | from pawpaw.errors import Errors 7 | 8 | 9 | CALLABLE_TYPE_OR_GENERIC = typing._CallableType | typing._CallableGenericAlias 10 | 11 | def is_callable_type_or_generic(obj: typing.Any) -> bool: 12 | ''' 13 | Returns True if obj is typing._Callable or typing._CallableGenericAlias 14 | ''' 15 | return isinstance(obj, CALLABLE_TYPE_OR_GENERIC) 16 | 17 | 18 | def is_functoid(obj: typing.Any): 19 | ''' 20 | Python's builtin callable method iscallable() returns true for all callable types (e.g., def, lambda, 21 | instance/class/builtin method, etc.) However, it also returns True for typing.Callable 22 | objects. E.g.: 23 | 24 | >>> MY_FUNC_ALIAS = typing.Callable[[int], str] 25 | >>> callable(MY_FUNC_ALIAS) 26 | True 27 | 28 | This method returns False in such cases 29 | 30 | The terms 'Function', 'Method', 'Callable', etc. all have established meanings in Python. 31 | It would be confusing to label this method 'callable', and the terms 'Function', 'Method', 32 | etc. already have established meantings in pythong. 
So instead, I'm using 'functoid' 33 | ''' 34 | return callable(obj) and not is_callable_type_or_generic(obj) 35 | 36 | 37 | _LAMBDA_OBJ_NAME = (lambda: True).__name__ # Use this instead of string literal in case Python changes 38 | 39 | 40 | # Note: Guido van Rossum uses 'def' and 'lambda' for these two concepts (see: 41 | # https://stackoverflow.com/questions/62479608/lambdatype-vs-functiontype), so I'll 42 | # use the same naming convention here 43 | 44 | def is_def(obj: typing.Any) -> bool: 45 | ''' 46 | Returns True if obj is def (defined function) 47 | ''' 48 | return isinstance(obj, types.FunctionType) and obj.__name__ != _LAMBDA_OBJ_NAME 49 | 50 | 51 | def is_lambda(obj: typing.Any) -> bool: 52 | ''' 53 | Returns True if obj is lambda 54 | ''' 55 | return isinstance(obj, types.FunctionType) and obj.__name__ == _LAMBDA_OBJ_NAME 56 | 57 | 58 | TYPE_OR_UNION = typing.Type | types.UnionType 59 | 60 | 61 | def unpack(t: TYPE_OR_UNION) -> typing.List[type]: 62 | rv = list[typing.Type]() 63 | 64 | if (origin := typing.get_origin(t)) is types.UnionType: 65 | for i in typing.get_args(t): 66 | rv.extend(unpack(i)) 67 | else: 68 | rv.append(t) 69 | 70 | return rv 71 | 72 | 73 | def isinstance_ex(obj: object, type_or_union: TYPE_OR_UNION) -> bool: 74 | ''' 75 | Although Python >= 3.10 now allows Union as 2nd parameter to isinstance method, it doesn't 76 | allow _parameterized_ types. This function performs weak checking for any supplied 77 | parameterized types. 78 | 79 | Tuples are not allowed for 2nd parameter because... why support them now that you can pass a Union? 80 | ''' 81 | for t in unpack(type_or_union): 82 | if (origin := typing.get_origin(t)) is not None: 83 | # Could expand this for various generic types 84 | if isinstance(obj, origin): 85 | return True 86 | elif issubclass(type(obj), t): 87 | return True 88 | return False 89 | 90 | 91 | def issubclass_ex(_cls, type_or_union: TYPE_OR_UNION) -> bool: 92 | cls_types = [t if (origin := typing.get_origin(t)) is None else origin for t in unpack(_cls)] 93 | tou_types = [t if (origin := typing.get_origin(t)) is None else origin for t in unpack(type_or_union)] 94 | for cls_type in cls_types: 95 | if any(issubclass(cls_type, tou_type) for tou_type in tou_types): 96 | return True 97 | return False 98 | 99 | 100 | class Functoid: 101 | """Marker object""" 102 | 103 | 104 | def _annotation_or_type_hint_matches_type( 105 | annotation: TYPE_OR_UNION | str | inspect.Signature.empty, 106 | type_hint: typing.Any or None, 107 | _type: TYPE_OR_UNION 108 | ) -> bool: 109 | t = annotation 110 | if not isinstance(t, TYPE_OR_UNION) or (isinstance(t, type) and issubclass(t, inspect.Signature.empty)): 111 | t = type_hint 112 | if t is not None: 113 | if _type is typing.Any: 114 | return True 115 | elif not issubclass_ex(t, _type): 116 | return False 117 | 118 | return True 119 | 120 | 121 | def functoid_isinstance(functoid: typing.Callable, callable_type_or_generic: CALLABLE_TYPE_OR_GENERIC) -> bool: 122 | ''' 123 | There is no good way to type hint for functoid, so falling back to 'typing.Callable' 124 | ''' 125 | 126 | if not is_callable_type_or_generic(callable_type_or_generic): 127 | raise Errors.parameter_invalid_type('callable_type_or_generic', callable_type_or_generic, CALLABLE_TYPE_OR_GENERIC) 128 | 129 | if not is_functoid(functoid): 130 | return False 131 | 132 | # This has guaranteed entries for the ret_val and all params, however, the types _may_ be 133 | # strings if "from __future__ import annotations" is used. 
134 |     func_sig = inspect.signature(functoid)
135 | 
136 |     # This has proper types, even when "from __future__ import annotations" used. However:
137 |     # if the ret-val or param lacks a type hint, it is missing from this dict
138 |     func_type_hints = typing.get_type_hints(functoid)
139 | 
140 |     ts_params, ts_ret_val = typing.get_args(callable_type_or_generic)
141 | 
142 |     if not _annotation_or_type_hint_matches_type(func_sig.return_annotation, func_type_hints.get('return', None), ts_ret_val):
143 |         return False
144 | 
145 |     if len(func_sig.parameters) != len(ts_params):
146 |         return False
147 | 
148 |     for func_p, ts_p in zip(func_sig.parameters.items(), ts_params):
149 |         func_n, func_p = func_p
150 |         if not _annotation_or_type_hint_matches_type(func_p.annotation, func_type_hints.get(func_n, None), ts_p):
151 |             return False
152 | 
153 |     return True
154 | 
155 | 
156 | def invoke_func(func: typing.Any, *vals: typing.Any) -> typing.Any:
157 |     """Wire and fire
158 | 
159 |     Args:
160 |         func: The functoid to invoke
161 |         *vals: Candidate argument values; each is paired to a parameter whose annotation matches its type
162 | 
163 |     Returns:
164 |         Invokes func and returns its return value
165 |     """
166 | 
167 |     if is_lambda(func):
168 |         return func(*vals)  # No type hints on lambdas, so this is the best we can do
169 | 
170 |     unpaired: typing.List[typing.Any] = list(vals)
171 | 
172 |     arg_spec = inspect.getfullargspec(func)
173 |     arg_spec.annotations.pop('return', None)
174 | 
175 |     p_args: typing.List[typing.Any] = []
176 |     for arg in arg_spec.args:
177 |         for val in unpaired:
178 |             val_type = type(val)
179 |             if issubclass_ex(val_type, arg_spec.annotations[arg]):
180 |                 p_args.append(val)
181 |                 unpaired.remove(val)
182 |                 break
183 | 
184 |     p_kwonlyargs: typing.Dict[str, typing.Any] = {}
185 |     for arg in arg_spec.kwonlyargs:
186 |         for val in unpaired:
187 |             val_type = type(val)
188 |             if issubclass_ex(val_type, arg_spec.annotations[arg]):
189 |                 p_kwonlyargs[arg] = val
190 |                 unpaired.remove(val)
191 |                 break
192 | 
193 |     p_vargs: typing.List[typing.Any] = []
194 |     if len(unpaired) > 0 and arg_spec.varargs is not None:
195 |         p_vargs.extend(unpaired)
196 | 
197 |     return func(*p_args, *p_vargs, **p_kwonlyargs)
198 | 
--------------------------------------------------------------------------------
/pawpaw/_version.py:
--------------------------------------------------------------------------------
 1 | import string
 2 | import typing
 3 | 
 4 | import regex
 5 | 
 6 | __version__ = '1.0.1'
 7 | """The str literal that build, setup, documentation, and other tools typically want."""
 8 | 
 9 | class Version:
10 |     _canonical_re = regex.compile(r'^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\.post(0|[1-9][0-9]*))?(\.dev(0|[1-9][0-9]*))?(?:\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*)?$')
11 |     """This pattern taken from https://peps.python.org/pep-0440/#appendix-b-parsing-version-strings-with-regular-expressions
12 |     and expanded to support optional "local version identifier" (see https://peps.python.org/pep-0440/#local-version-identifiers)."""
13 | 
14 |     @classmethod
15 |     def is_canonical(cls, version: str) -> bool:
16 |         return cls._canonical_re.match(version) is not None
17 | 
18 |     _parse_pat = r"""
19 |         v?
20 |         (?:
21 |             (?:(?P<epoch>[0-9]+)!)?                    # epoch
22 |             (?P<release>[0-9]+(?:\.[0-9]+)*)           # release segment
23 |         (?P<pre>                                       # pre-release
24 |             [-_\.]?
25 |             (?P<pre_l>a|b|c|rc|alpha|beta|pre|preview)
26 |             [-_\.]?
27 |             (?P<pre_n>[0-9]+)?
28 |         )?
29 |         (?P<post>                                      # post release
30 |             (?:-(?P<post_n1>[0-9]+))
31 |             |
32 |             (?:
33 |                 [-_\.]?
34 |                 (?P<post_l>post|rev|r)
35 |                 [-_\.]?
36 |                 (?P<post_n2>[0-9]+)?
37 |             )
38 |         )?
39 |         (?P<dev>                                       # dev release
40 |             [-_\.]?
41 |             (?P<dev_l>dev)
42 |             [-_\.]?
43 |             (?P<dev_n>[0-9]+)?
44 |         )?
45 |     )
46 |     (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
47 | """
48 |     """Taken from https://peps.python.org/pep-0440/#appendix-b-parsing-version-strings-with-regular-expressions and
49 |     corrected so that group pre_l has no sub-group and behaves like post_l and dev_l groups"""
50 | 
51 |     parse_re = regex.compile(r"^\s*" + _parse_pat + r"\s*$", regex.VERBOSE | regex.IGNORECASE)
52 |     """regex that could be used by pawpaw to create a parse tree for a version str"""
53 | 
54 | if not Version.is_canonical(__version__):
55 |     raise ValueError(f'__version__ value {__version__!r} is not canonical per PEP 440')
56 | 
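
Illustrative sketch (not part of the module above) showing how the two patterns can be used; it relies only on the Version class as defined here:

    from pawpaw import Version

    print(Version.is_canonical('1.0.1'))         # True
    print(Version.is_canonical('1.0.1.banana'))  # False

    m = Version.parse_re.match('1.2.3rc1.post2.dev3+local.7')
    if m is not None:
        print(m.group('release'))                     # 1.2.3
        print(m.group('pre_l'), m.group('pre_n'))     # rc 1
        print(m.group('post_l'), m.group('post_n2'))  # post 2
        print(m.group('dev_n'), m.group('local'))     # 3 local.7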


--------------------------------------------------------------------------------
/pawpaw/arborform/__init__.py:
--------------------------------------------------------------------------------
1 | from pawpaw.arborform.itorator import *
2 | 
3 | from pawpaw.arborform.postorator import *
4 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/__init__.py:
--------------------------------------------------------------------------------
 1 | from .itorator import Connectors, Itorator
 2 | del itorator
 3 | 
 4 | from .regex_itorator import RegexItorator
 5 | del regex_itorator
 6 | 
 7 | from .reflect import Reflect
 8 | del reflect
 9 | 
10 | from .desc import Desc
11 | del desc
12 | 
13 | from .value_func import ValueFunc
14 | del value_func
15 | 
16 | from .filter import Filter
17 | del filter
18 | 
19 | from .extract import Extract
20 | del extract
21 | 
22 | from .split import Split
23 | del split
24 | 
25 | from .invert import Invert
26 | del invert
27 | 
28 | from .nuco import Nuco
29 | del nuco
30 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/desc.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Desc(Itorator):
 8 |     def __init__(self, desc: str | Types.F_ITO_2_DESC, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         if isinstance(desc, str):
11 |             self._desc_func = lambda ito: desc
12 |         elif type_magic.functoid_isinstance(desc, Types.F_ITO_2_DESC):
13 |             self._desc_func = desc
14 |         else:
15 |             raise Errors.parameter_invalid_type('desc', desc, str | Types.F_ITO_2_DESC)
16 | 
17 |     def clone(self, tag: str | None = None) -> Desc:
18 |         return type(self)(self._desc_func, self.tag if tag is None else tag)
19 | 
20 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
21 |         ito.desc = self._desc_func(ito)
22 |         yield ito
23 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/extract.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import collections
 3 | import types
 4 | import typing
 5 | 
 6 | import regex
 7 | from pawpaw import Ito, Types, Errors, type_magic
 8 | from pawpaw.arborform.itorator import RegexItorator
 9 | 
10 | 
11 | class Extract(RegexItorator):
12 |     def __init__(self,
13 |                  re: regex.Pattern,
14 |                  limit: int | None = None,
15 |                  desc: str | Types.F_M_GK_2_DESC = lambda m, gk: str(gk),
16 |                  group_filter: collections.abc.Container[Types.C_GK] | Types.P_M_GK = lambda m, gk: str(gk) != '0',
17 |                  tag: str | None = None):
18 |         super().__init__(re, group_filter, tag)
19 | 
20 |         if not isinstance(limit, (int, type(None))):
21 |             raise Errors.parameter_invalid_type('limit', limit, int, types.NoneType)
22 |         self.limit = limit
23 | 
24 |         if isinstance(desc, str):
25 |             self.desc = lambda m, gk: desc
26 |         elif type_magic.functoid_isinstance(desc, Types.F_M_GK_2_DESC):
27 |             self.desc = desc
28 |         else:
29 |             raise Errors.parameter_invalid_type('desc', desc, str,  Types.F_M_GK_2_DESC)
30 |     
31 |     def clone(self, tag: str | None = None) -> Extract:
32 |         return type(self)(self._re, self.limit, self.desc, self._group_filter, self.tag if tag is None else tag)
33 | 
34 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
35 |         return [*ito.from_re(self._re, ito, self.group_filter, self.desc, self.limit)]
36 | 
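
A minimal usage sketch for Extract (illustrative only, not part of the file above); it assumes an Ito can be constructed directly from a str, and that Ito.from_re yields itos for the surviving groups with each desc defaulting to its group key:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract

    # Named groups become itos; group '0' (the whole match) is filtered out by default.
    itor = Extract(regex.compile(r'(?P<key>\w+):\s*(?P<value>[^,]+)'))
    for i in itor(Ito('name: pawpaw, version: 1.0.1')):
        print(i.desc, '->', str(i))   # e.g.  key -> name,  value -> pawpaw, ...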


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/filter.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Filter(Itorator):
 8 |     def __init__(self, filter_: Types.P_ITO, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         if not (filter_ is None or type_magic.functoid_isinstance(filter_, Types.P_ITO)):
11 |             raise Errors.parameter_invalid_type('filter', filter_, Types.P_ITO)
12 |         self._filter = filter_
13 | 
14 |     def clone(self, tag: str | None = None) -> Filter:
15 |         return type(self)(self._filter, self.tag if tag is None else tag)
16 | 
17 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
18 |         if self._filter(ito):
19 |             yield ito
20 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/invert.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Invert(Itorator):
 8 |     def __init__(
 9 |             self,
10 |             itorator: Itorator,
11 |             desc: str | None = None,
12 |             tag: str | None = None):
13 |         super().__init__(tag)
14 |         self.itorator = itorator
15 |         self.desc = desc
16 | 
17 |     def clone(self, tag: str | None = None) -> Invert:
18 |         return type(self)(self.itorator, self.desc, self.tag if tag is None else tag)
19 | 
20 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
21 |         start = ito.start
22 |         for i in self.itorator(ito):
23 |             if start < i.start:
24 |                 yield ito.clone(start, i.start, desc=self.desc)
25 |             start = i.stop
26 | 
27 |         if start == ito.start:
28 |             yield ito.clone(desc=self.desc)
29 |         elif i.stop < ito.stop:
30 |             yield ito.clone(i.stop, ito.stop, self.desc)
31 | 
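
A small sketch of Invert (illustrative, not part of the file above), assuming Ito and Extract behave as in the preceding files; Invert yields the gaps that its wrapped itorator does not cover:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract, Invert

    digits = Extract(regex.compile(r'(?P<num>\d+)'))
    gaps = Invert(digits, desc='gap')
    print([str(i) for i in gaps(Ito('ab12cd34ef'))])  # expected: ['ab', 'cd', 'ef']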


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/itorator.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | from abc import ABC, abstractmethod
  3 | import itertools
  4 | import types
  5 | import typing
  6 | 
  7 | from pawpaw import Types, Errors, Ito, type_magic
  8 | from pawpaw.arborform.postorator.postorator import Postorator
  9 | 
 10 | 
 11 | class Connector(ABC):
 12 |     def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 13 |         if not isinstance(itorator, Itorator):
 14 |             raise Errors.parameter_invalid_type('itorator', itorator, Itorator)
 15 |         self.itorator = itorator
 16 | 
 17 |         if type_magic.functoid_isinstance(predicate, Types.P_ITO):
 18 |             self.predicate = predicate
 19 |         elif isinstance(predicate, str):
 20 |             self.predicate = lambda ito: ito.desc == predicate
 21 |         elif predicate is None:
 22 |             self.predicate = lambda ito: ito.desc is None
 23 |         else:
 24 |             raise Errors.parameter_invalid_type('predicate', predicate, Types.P_ITO, str, None)
 25 | 
 26 | 
 27 | class ChildrenConnector(Connector, ABC):
 28 |     def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 29 |         super().__init__(itorator, predicate)
 30 | 
 31 | 
 32 | class Connectors:
 33 |     # yield from f(cur)
 34 |     # break
 35 |     class Delegate(Connector):
 36 |         def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 37 |             super().__init__(itorator, predicate)
 38 | 
 39 |     # cur(s) ~= f(cur)
 40 |     class Recurse(Connector):
 41 |         def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 42 |             super().__init__(itorator, predicate)
 43 | 
 44 |     # f(cur)
 45 |     class Subroutine(Connector):
 46 |         def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 47 |             super().__init__(itorator, predicate)
 48 | 
 49 |     class Children:
 50 |         # cur.children.add(*f(cur))
 51 |         class Add(ChildrenConnector):
 52 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 53 |                 super().__init__(itorator, predicate)
 54 | 
 55 |         # cur.children.add_hierarchical(*f(cur))
 56 |         class AddHierarchical(ChildrenConnector):
 57 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 58 |                 super().__init__(itorator, predicate)
 59 | 
 60 |         # cur.children.clear
 61 |         # cur.children.add(*f(cur))
 62 |         class Replace(ChildrenConnector):
 63 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 64 |                 super().__init__(itorator, predicate)
 65 | 
 66 |         # for c in f(cur):
 67 |         #   cur.children.remove(c)
 68 |         class Delete(ChildrenConnector):  # REMOVE
 69 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 70 |                 super().__init__(itorator, predicate)
 71 | 
 72 | 
 73 | class Itorator(ABC):
 74 |     @classmethod
 75 |     def __exhaust_iterator(cls, it: typing.Iterator):
 76 |         if not isinstance(it, typing.Iterator):
 77 |             raise Errors.parameter_invalid_type('it', it, typing.Iterator)
 78 |         
 79 |         while True:
 80 |             try:
 81 |                 next(it)
 82 |             except StopIteration:
 83 |                 break
 84 | 
 85 |     @classmethod
 86 |     def wrap(cls, src: Types.F_ITO_2_IT_ITOS, tag: str | None = None):
 87 |         if type_magic.functoid_isinstance(src, Types.F_ITO_2_IT_ITOS):
 88 |             return _WrappedItoratorEx(src, tag)
 89 | 
 90 |         raise Errors.parameter_invalid_type('src', src, Types.F_ITO_2_IT_ITOS)
 91 | 
 92 |     def __init__(self, tag: str | None = None):
 93 |         if tag is not None and not isinstance(tag, str):
 94 |             raise Errors.parameter_invalid_type('tag', tag, str)
 95 |         self._connections = list[Connector]()
 96 |         self.tag: str | None = tag
 97 |         self._postorator: Postorator | Types.F_ITOS_2_ITOS | None = None
 98 | 
 99 |     @abstractmethod
100 |     def clone(self, tag: str | None = None) -> Itorator:
101 |         ...
102 | 
103 |     @property
104 |     def connections(self) -> list[Connector]:
105 |         return self._connections
106 | 
107 |     @property
108 |     def postorator(self) -> Postorator | Types.F_ITOS_2_ITOS | None:
109 |         return self._postorator
110 | 
111 |     @postorator.setter
112 |     def postorator(self, val: Postorator | Types.F_ITOS_2_ITOS | None):
113 |         if val is None or isinstance(val, Postorator):
114 |             self._postorator = val
115 |         else:
116 |             raise Errors.parameter_invalid_type('val', val, Postorator, Types.F_ITOS_2_ITOS, types.NoneType)
117 | 
118 |     @abstractmethod
119 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
120 |         pass
121 | 
122 |     # postorator
123 |     def _post(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
124 |         if self._postorator is None:
125 |             yield from itos
126 |         else:
127 |             yield from self._postorator(itos)                
128 | 
129 |     # pipeline flow
130 |     def _flow(self, ito: Ito, con_idx: int) -> Types.C_IT_ITOS:
131 |         if con_idx >= len(self.connections):
132 |             yield ito
133 | 
134 |         else:
135 |             con = self._connections[con_idx]
136 |             if con.predicate(ito):
137 |                 if isinstance(con, Connectors.Delegate):
138 |                     yield from con.itorator._traverse(ito)
139 | 
140 |                 elif isinstance(con, ChildrenConnector):
141 |                     children = [*con.itorator._traverse(ito)]
142 | 
143 |                     if isinstance(con, Connectors.Children.Replace):
144 |                         ito.children.clear()
145 | 
146 |                     if isinstance(con, (Connectors.Children.Add, Connectors.Children.Replace)):
147 |                         ito.children.add(*children)
148 |                     elif isinstance(con, Connectors.Children.AddHierarchical):
149 |                         ito.children.add_hierarchical(*children)
150 |                     else:  # Connectors.Children.Delete
151 |                         for c in children:
152 |                             ito.children.remove(c)
153 | 
154 |                     yield from self._flow(ito, con_idx + 1)
155 | 
156 |                 elif isinstance(con, Connectors.Recurse):
157 |                     for sub in con.itorator._traverse(ito):
158 |                         yield from self._flow(sub, con_idx + 1)
159 | 
160 |                 elif isinstance(con, Connectors.Subroutine):
161 |                     self.__exhaust_iterator(con.itorator._traverse(ito))
162 |                     yield from self._flow(ito, con_idx + 1)
163 | 
164 |                 else:
165 |                     raise TypeError(f'Invalid connector: {con}')
166 | 
167 |             else:
168 |                 yield from self._flow(ito, con_idx + 1)
169 | 
170 |     # soup to nuts
171 |     def _traverse(self, ito: Ito) -> Types.C_IT_ITOS:
172 |         yield from self._post(itertools.chain.from_iterable(self._flow(i, 0) for i in self._transform(ito)))
173 | 
174 |     def __call__(self, ito: Ito) -> Types.C_IT_ITOS:
175 |         if not isinstance(ito, Ito):
176 |             raise Errors.parameter_invalid_type('ito', ito, Ito)
177 |         yield from self._traverse(ito.clone())
178 | 
179 | 
180 | class _WrappedItoratorEx(Itorator):
181 |     def __init__(self, f: Types.F_ITO_2_IT_ITOS, tag: str | None = None):
182 |         super().__init__(tag)
183 |         self.__f = f
184 | 
185 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
186 |         yield from self.__f(ito)
187 | 
188 |     def clone(self, tag: str | None = None) -> _WrappedItoratorEx:
189 |         return type(self)(self.__f, self.tag if tag is None else tag)
190 | 
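
A connection sketch (illustrative only, not from the repo): an Extract produces word itos, and a Children.Add connection runs a second Extract against each word and attaches the results as children. It assumes Ito construction from a str and that an ito's children can be iterated:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Connectors, Extract

    words = Extract(regex.compile(r'(?P<word>[a-z]+)'))
    vowels = Extract(regex.compile(r'(?P<vowel>[aeiou])'))
    words.connections.append(Connectors.Children.Add(vowels, 'word'))  # predicate: desc == 'word'

    for tok in words(Ito('pawpaw is a parser')):
        print(str(tok), [str(c) for c in tok.children])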


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/nuco.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import typing
 3 | 
 4 | from pawpaw import Ito, Types
 5 | from pawpaw.arborform.itorator import Itorator
 6 | 
 7 | class Nuco(Itorator):
 8 |     def __init__(self, *itorators: Itorator, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         self._itorators = list(itorators)
11 | 
12 |     def clone(self, tag: str | None = None) -> Nuco:
13 |         return type(self)(*self._itorators, tag=self.tag if tag is None else tag)
14 | 
15 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
16 |         for itor in self._itorators:
17 |             it = itor(ito)
18 |             try:
19 |                 yield next(it)
20 |                 yield from it
21 |                 break
22 |             except StopIteration:
23 |                 pass
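
Nuco behaves like a null-coalescing operator over itorators: it yields from the first one that produces output. A sketch (illustrative, not part of the file above), assuming Ito and Extract as before:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract, Nuco

    numbers = Extract(regex.compile(r'(?P<number>\d+)'))
    words = Extract(regex.compile(r'(?P<word>[a-z]+)'))
    first = Nuco(numbers, words)

    print([str(i) for i in first(Ito('abc 123'))])  # ['123']          (numbers win)
    print([str(i) for i in first(Ito('abc def'))])  # ['abc', 'def']   (fallback)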


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/reflect.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Reflect(Itorator):
 8 |     def __init__(self, tag: str | None = None):
 9 |         super().__init__(tag)
10 | 
11 |     def clone(self, tag: str | None = None) -> Reflect:
12 |         return type(self)(self.tag if tag is None else tag)
13 | 
14 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
15 |         yield ito
16 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/regex_itorator.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | from abc import abstractmethod
 3 | import collections
 4 | import typing
 5 | import types
 6 | 
 7 | import regex
 8 | from pawpaw import GroupKeys, Ito, Types, Errors, type_magic
 9 | from pawpaw.arborform.itorator import Itorator
10 | 
11 | 
12 | class RegexItorator(Itorator):
13 |     def __init__(self,
14 |                  re: regex.Pattern,
15 |                  group_filter: collections.abc.Container[Types.C_GK] | Types.P_M_GK = lambda m, gk: True,
16 |                  tag: str | None = None):
17 |         super().__init__(tag)
18 |         
19 |         self._group_keys: list[Types.C_GK]
20 |         self.re = re  # sets ._group_keys
21 |         self.group_filter = group_filter
22 | 
23 |     @property
24 |     def re(self) -> regex.Pattern:
25 |         return self._re
26 | 
27 |     @re.setter
28 |     def re(self, re: regex.Pattern) -> None:
29 |         if not isinstance(re, regex.Pattern):
30 |             raise Errors.parameter_invalid_type('re', re, regex.Pattern)
31 |         self._re = re
32 |         self._group_keys = GroupKeys.preferred(re)
33 | 
34 |     @property
35 |     def group_filter(self) -> collections.abc.Container[Types.C_GK] | Types.P_M_GK:
36 |         return self._group_filter
37 | 
38 |     @group_filter.setter
39 |     def group_filter(self, group_filter: collections.abc.Container[Types.C_GK] | Types.P_M_GK) -> None:
40 |         if type_magic.isinstance_ex(group_filter, collections.abc.Container[Types.C_GK]):
41 |             GroupKeys.validate(self._re, group_filter)
42 |             self._group_filter = group_filter
43 |         elif type_magic.functoid_isinstance(group_filter, Types.P_M_GK):
44 |             self._group_filter = group_filter
45 |         else:
46 |             raise Errors.parameter_invalid_type('group_filter', group_filter, collections.abc.Container[Types.C_GK], Types.P_M_GK)
47 | 
48 |     def clone(self, tag: str | None = None) -> RegexItorator:
49 |         return type(self)(self._re, self._group_filter, self.tag if tag is None else tag)
50 | 
51 |     @abstractmethod
52 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
53 |         pass
54 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/split.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import enum
  3 | import types
  4 | import typing
  5 | import itertools
  6 | 
  7 | import regex
  8 | from pawpaw import Errors, Span, Ito, Types, type_magic
  9 | from pawpaw.arborform.itorator import Itorator, Extract
 10 | 
 11 | 
 12 | class Split(Itorator):
 13 |     @enum.unique
 14 |     class BoundaryRetention(enum.Enum):
 15 |         NONE = 0
 16 |         LEADING = 1
 17 |         TRAILING = 2
 18 |         ALL = 3
 19 | 
 20 |     def __init__(
 21 |             self,
 22 |             splitter: Itorator | regex.Pattern,
 23 |             limit: int | None = None,
 24 |             boundary_retention: BoundaryRetention = BoundaryRetention.NONE,
 25 |             return_zero_split: bool = True,
 26 |             desc: str | None = None,
 27 |             tag: str | None = None
 28 |     ):
 29 |         """Given P-O-O-S where P is prefix, - is boundary, O is/are middle segments(s), and S is suffix, the
 30 |         behavior is as follows:
 31 | 
 32 |           * BoundaryRetention.NONE -> P O O S : boundaries are discarded (this is an 'ordinary' split operation)
 33 | 
 34 |           * BoundaryRetention.LEADING -> -O -O -S : boundaries kept as prefixes, leading P is discarded
 35 | 
 36 |           * BoundaryRetention.TRAILING -> P- O- O- : boundaries kept as suffixes, trailing S is discarded
 37 | 
 38 |           * BoundaryRetention.ALL -> P - O - O - S : all non-zero-length boundaries are kept as distinct itos
 39 | 
 40 |         Zero-length boundaries are allowable, and any resulting empty Ito's are discarded
 41 | 
 42 |        Args:
 43 |         splitter: An Itorator used to generate boundaries; if a regex.Pattern is supplied,
 44 |           splitter is set to a pawpaw.itorator.Extract as follows:
 45 | 
 46 |             splitter = pawpaw.arborform.Extract(
 47 |                 re,
 48 |                 desc = lambda match, group_key: None,
 49 |                 group_filter = lambda m, gk: gk == 0,
 50 |                 tag = f'generated Split for \\{re.pattern}\\'
 51 |             )
 52 | 
 53 |         limit: The maximum number of boundaries used for splitting; None (the default)
 54 |           means no limit
 55 | 
 56 | 
 57 | 
 58 |         boundary_retention: A rule used to determine if boundaries are discarded, or else how they are kept
 59 | 
 60 |         return_zero_split: Indicates how to handle the zero-split condition; when True and no splits occur,
 61 |           returns a list containing a clone of the input Ito; when False and no splits occur, returns an
 62 |           empty list
 63 | 
 64 |         desc: Value used for the .desc of any yielded non-boundary Itos.
 65 |         """
 66 |         super().__init__(tag)
 67 |         
 68 |         if isinstance(splitter, Itorator):
 69 |             self.splitter = splitter
 70 |         elif isinstance(splitter, regex.Pattern):
 71 |             self.splitter = Extract(
 72 |                 splitter,
 73 |                 desc = lambda match, group_key: None,
 74 |                 group_filter = lambda m, gk: gk == 0,
 75 |                 tag = f'generated Split for \\{splitter.pattern}\\'
 76 |             )
 77 |         else:
 78 |             raise Errors.parameter_invalid_type('splitter', splitter, Itorator, regex.Pattern)
 79 |         
 80 |         if not isinstance(limit, (int, type(None))):
 81 |             raise Errors.parameter_invalid_type('limit', limit, int, types.NoneType)
 82 |         self.limit = limit
 83 | 
 84 |         if not isinstance(boundary_retention, self.BoundaryRetention):
 85 |             raise Errors.parameter_invalid_type('boundary_retention', boundary_retention, self.BoundaryRetention)
 86 |         self.boundary_retention = boundary_retention
 87 | 
 88 |         if not isinstance(return_zero_split, bool):
 89 |             raise Errors.parameter_invalid_type('return_zero_split', return_zero_split, bool)
 90 |         self.return_zero_split = return_zero_split
 91 | 
 92 |         if not isinstance(desc, (str, type(None))):
 93 |             raise Errors.parameter_invalid_type('desc', desc, str, types.NoneType)
 94 |         self.desc = desc
 95 | 
 96 |     def clone(self, tag: str | None = None) -> Split:
 97 |         return type(self)(
 98 |             self.splitter,
 99 |             self.limit,
100 |             self.boundary_retention,
101 |             self.return_zero_split,
102 |             self.desc,
103 |             self.tag if tag is None else tag)
104 | 
105 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
106 |         if self.limit == 0 and self.return_zero_split:
107 |             return ito.clone(desc=self.desc, clone_children=False),
108 | 
109 |         rv: typing.List[Ito] = []
110 |         
111 |         count = 0
112 |         prior: Span | None = None
113 |         for cur in itertools.takewhile(lambda i: self.limit is None or count < self.limit, self.splitter(ito)):
114 |             if prior is None:
115 |                 if self.boundary_retention == self.BoundaryRetention.LEADING:
116 |                     start = stop = 0
117 |                 else:
118 |                     start = ito.start
119 |                     if self.boundary_retention in (self.BoundaryRetention.NONE, self.BoundaryRetention.ALL):
120 |                         stop = cur.start
121 |                     else:  # TRAILING
122 |                         stop = cur.stop
123 |             else:
124 |                 if self.boundary_retention in (self.BoundaryRetention.NONE, self.BoundaryRetention.ALL):
125 |                     start = prior.stop
126 |                     stop = cur.start
127 |                 elif self.boundary_retention == self.BoundaryRetention.LEADING:
128 |                     start = prior.start
129 |                     stop = cur.start
130 |                 else:  # TRAILING
131 |                     start = prior.stop
132 |                     stop = cur.stop
133 | 
134 |             count += 1
135 | 
136 |             if start != stop:
137 |                 rv.append(ito.clone(start, stop, self.desc, False))
138 | 
139 |             if self.boundary_retention == self.BoundaryRetention.ALL and (cur.start < cur.stop):
140 |                 rv.append(cur)
141 | 
142 |             prior = cur
143 | 
144 |         if prior is not None and self.boundary_retention != self.BoundaryRetention.TRAILING:
145 |             if self.boundary_retention in (self.BoundaryRetention.NONE, self.BoundaryRetention.ALL):
146 |                 start = prior.stop
147 |             else:  # LEADING
148 |                 start = prior.start
149 |             stop = ito.stop
150 |             if start != stop:
151 |                 rv.append(ito.clone(start, stop, self.desc, False))
152 | 
153 |         if prior is None and len(rv) == 0 and self.return_zero_split:
154 |             rv.append(ito.clone(desc=self.desc, clone_children=False))
155 | 
156 |         return rv
157 | 
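
A sketch of Split's boundary handling (illustrative only, not from the repo), assuming Ito construction from a str:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Split

    ito = Ito('a, b, c')
    sep = regex.compile(r',\s*')

    print([str(i) for i in Split(sep)(ito)])
    # ['a', 'b', 'c']    boundaries discarded (BoundaryRetention.NONE)

    trailing = Split(sep, boundary_retention=Split.BoundaryRetention.TRAILING)
    print([str(i) for i in trailing(ito)])
    # ['a, ', 'b, ']     boundaries kept as suffixes; trailing segment discarded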


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/value_func.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class ValueFunc(Itorator):
 8 |     def __init__(self, f: Types.F_ITO_2_VAL | None, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         if not (f is None or type_magic.functoid_isinstance(f, Types.F_ITO_2_VAL)):
11 |             raise Errors.parameter_invalid_type('f', f, Types.F_ITO_2_VAL, None)
12 |         self.f = f
13 | 
14 |     def clone(self, tag: str | None = None) -> ValueFunc:
15 |         return type(self)(self.f, self.tag if tag is None else tag)
16 | 
17 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
18 |         ito.value_func = self.f
19 |         yield ito
20 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/__init__.py:
--------------------------------------------------------------------------------
1 | from .postorator import Postorator
2 | del postorator
3 | 
4 | from .windowed_join import WindowedJoin
5 | del windowed_join
6 | 
7 | from .stacked_reduce import StackedReduce
8 | del stacked_reduce


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/postorator.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | from pawpaw import Types, type_magic, Errors
 4 | 
 5 | 
 6 | class Postorator(ABC):
 7 |     @classmethod
 8 |     def wrap(cls, func: Types.F_ITOS_2_ITOS, tag: str | None = None):
 9 |         if type_magic.functoid_isinstance(func, Types.F_ITOS_2_ITOS):
10 |             return _WrappedPostorator(func, tag)
11 | 
12 |         raise Errors.parameter_invalid_type('func', func, Types.F_ITOS_2_ITOS)        
13 | 
14 |     def __init__(self, tag: str | None = None):
15 |         if tag is not None and not isinstance(tag, str):
16 |             raise Errors.parameter_invalid_type('tag', tag, str)
17 |         self.tag = tag
18 | 
19 |     @abstractmethod
20 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
21 |         ...
22 | 
23 |     def __call__(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
24 |         yield from self._transform(itos)
25 | 
26 | 
27 | class _WrappedPostorator(Postorator):
28 |     def __init__(self, f: Types.F_ITOS_2_ITOS, tag: str | None = None):
29 |         super().__init__(tag)
30 |         self.__f = f
31 | 
32 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
33 |         yield from self.__f(itos)
34 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/stacked_reduce.py:
--------------------------------------------------------------------------------
 1 | import typing
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.postorator import Postorator
 5 | 
 6 |        
 7 | class StackedReduce(Postorator):
 8 |     F_SQ_ITOS_2_ITO = typing.Callable[[Types.C_SQ_ITOS], Ito]
 9 |     P_SQ_ITOS_ITO = typing.Callable[[Types.C_SQ_ITOS, Ito], bool]
10 |     
11 |     def __init__(
12 |             self,
13 |             reduce_func: F_SQ_ITOS_2_ITO,
14 |             push_predicate: P_SQ_ITOS_ITO,
15 |             pop_predicate: P_SQ_ITOS_ITO | None = None,
16 |             tag: str | None = None
17 |     ):
18 |         super().__init__(tag)
19 |         if not type_magic.functoid_isinstance(reduce_func, self.F_SQ_ITOS_2_ITO):
20 |             raise Errors.parameter_invalid_type('reduce_func', reduce_func, self.F_SQ_ITOS_2_ITO)
21 |         self.reduce_func = reduce_func
22 | 
23 |         if not type_magic.functoid_isinstance(push_predicate, self.P_SQ_ITOS_ITO):
24 |             raise Errors.parameter_invalid_type('push_predicate', push_predicate, self.P_SQ_ITOS_ITO)
25 |         self.push_predicate = push_predicate
26 | 
27 |         if pop_predicate is None or type_magic.functoid_isinstance(pop_predicate, self.P_SQ_ITOS_ITO):
28 |             self.pop_predicate = pop_predicate
29 |         else:
30 |             raise Errors.parameter_invalid_type('pop_predicate', pop_predicate, self.P_SQ_ITOS_ITO, None)
31 | 
32 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
33 |         stack: typing.List[Ito] = []
34 |         for ito in itos:
35 |             if len(stack) > 0:
36 |                 if self.pop_predicate is not None and self.pop_predicate(stack, ito):
37 |                     yield self.reduce_func(stack)
38 |                     stack.clear()
39 |                 else:
40 |                     stack.append(ito)
41 | 
42 |             if len(stack) == 0:
43 |                 if self.push_predicate(stack, ito):
44 |                     stack.append(ito)
45 |                 else:
46 |                     yield ito
47 | 
48 |         if len(stack) > 0:
49 |             yield self.reduce_func(stack)
50 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/windowed_join.py:
--------------------------------------------------------------------------------
 1 | import typing
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.postorator import Postorator
 5 | 
 6 | 
 7 | class WindowedJoin(Postorator):
 8 |     F_SQ_ITOS_2_B = typing.Callable[[Types.C_SQ_ITOS], bool]
 9 |     
10 |     def __init__(
11 |             self,
12 |             window_size: int,
13 |             predicate: F_SQ_ITOS_2_B,
14 |             ito_class: Ito = Ito,
15 |             desc: str | None = None,
16 |             tag: str | None = None
17 |     ):
18 |         super().__init__(tag)
19 |         if not isinstance(window_size, int):
20 |             raise Errors.parameter_invalid_type('window_size', window_size, int)
21 |         if window_size < 2:
22 |             raise ValueError(f'parameter \'window_size\' has value '
23 |                              f'{window_size:,}, but must be greater than or equal to 2')
24 |         self.window_size = window_size
25 | 
26 |         if not type_magic.functoid_isinstance(predicate, self.F_SQ_ITOS_2_B):
27 |             raise Errors.parameter_invalid_type('predicate', predicate, self.F_SQ_ITOS_2_B)
28 |         self.predicate = predicate
29 | 
30 |         if not issubclass(ito_class, Ito):
31 |             raise ValueError(f'parameter \'ito_class\' ({ito_class}) is not an \'{Ito}\' or subclass.')
32 |         self.ito_class = ito_class
33 | 
34 |         self.desc = desc
35 | 
36 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
37 |         window: typing.List[Ito] = []
38 |         for ito in itos:
39 |             window.append(ito)
40 |             if len(window) == self.window_size:
41 |                 if self.predicate(window):
42 |                     yield self.ito_class.join(*window, desc=self.desc)
43 |                     window.clear()
44 |                 else:
45 |                     yield window.pop(0)
46 | 
47 |         yield from window
48 | 
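
A sketch of WindowedJoin used as a postorator (illustrative, not part of the file above): word itos are buffered two at a time and joined whenever the second word is 'york'. It assumes Ito construction from a str:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract, WindowedJoin

    words = Extract(regex.compile(r'(?P<word>[a-z]+)'))
    words.postorator = WindowedJoin(2, lambda w: str(w[1]) == 'york')

    print([str(i) for i in words(Ito('i flew to new york yesterday'))])
    # expected: ['i', 'flew', 'to', 'new york', 'yesterday']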


--------------------------------------------------------------------------------
/pawpaw/errors.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import inspect
 3 | import types
 4 | import typing
 5 | import enum
 6 | 
 7 | 
 8 | class Errors:
 9 |     @classmethod
10 |     def parameter_not_none(cls, name: str) -> ValueError:
11 |         return ValueError(f'parameter \'{name}\' can not be None')
12 | 
13 |     @classmethod
14 |     def parameter_neither_none_nor_empty(cls, name: str) -> ValueError:
15 |         return ValueError(f'parameter \'{name}\' can be neither None nor empty')
16 | 
17 |     @classmethod
18 |     def parameter_enum_not_in(cls, name: str, value: typing.Any, enum_: enum.Enum) -> ValueError:
19 |         return ValueError(f'parameter \'{name}\' is not a valid {enum_.__name__}')
20 | 
21 |     @classmethod
22 |     def _get_type_strs(cls, *allowed) -> typing.Iterable[str]:
23 |         for t in allowed:
24 |             if hasattr(t, '__qualname__'):
25 |                 if t.__qualname__ == 'Callable':
26 |                     yield str(t)
27 |                 else:
28 |                     yield t.__qualname__
29 |             elif hasattr(t, '__bound__'):
30 |                 yield from cls._get_type_strs(t.__bound__)
31 |             elif typing.get_origin(t) is types.UnionType:
32 |                 args = typing.get_args(t)
33 |                 yield from cls._get_type_strs(*args)
34 |             elif t is None:
35 |                 yield 'None'
36 |             else:
37 |                 yield repr(t)
38 | 
39 |     @classmethod
40 |     def _build_types_str(cls, *allowed: typing.Type) -> str:
41 |         return ' or '.join(cls._get_type_strs(*allowed))
42 | 
43 |     @classmethod
44 |     def parameter_invalid_type(cls, name: str, value: typing.Any, *allowed: typing.Type) -> TypeError:
45 |         actual = str(inspect.signature(value)) if callable(value) else repr(value)
46 |         return TypeError(f'parameter \'{name}\' must be type {cls._build_types_str(*allowed)}, not {actual}')
47 | 
48 |     @classmethod
49 |     def parameter_iterable_contains_invalid_type(cls, name: str, value: typing.Any, *allowed: typing.Type) -> TypeError:
50 |         actual = str(inspect.signature(value)) if callable(value) else repr(value)
51 |         return TypeError(f'parameter \'{name}\' must contain elements of type {cls._build_types_str(*allowed)}, however, it contains an element of type {actual}: {value}')
52 | 


--------------------------------------------------------------------------------
/pawpaw/infix.py:
--------------------------------------------------------------------------------
 1 | """Infix operator class recipe from https://code.activestate.com/recipes/384122
 2 | 
 3 | Returns:
 4 |     An infix that can be called using either:
 5 |     
 6 |         x |op| y
 7 |         or
 8 |         x <<op>> y
 9 | """
10 | class Infix:
11 |     def __init__(self, function):
12 |         self.function = function
13 | 
14 |     def __ror__(self, other):
15 |         return Infix(lambda x: self.function(other, x))
16 | 
17 |     def __or__(self, other):
18 |         return self.function(other)
19 | 
20 |     def __rlshift__(self, other):
21 |         return Infix(lambda x, self=self, other=other: self.function(other, x))
22 | 
23 |     def __rshift__(self, other):
24 |         return self.function(other)
25 | 
26 |     def __call__(self, value1, value2):
27 |         return self.function(value1, value2)
28 | 
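
A hypothetical operator built with Infix (illustrative, not part of the library; the name 'coalesce' is made up for this sketch):

    from pawpaw import Infix

    coalesce = Infix(lambda a, b: a if a is not None else b)

    print(None |coalesce| 'fallback')     # fallback
    print('value' |coalesce| 'fallback')  # value
    print(3 <<coalesce>> 5)               # 3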


--------------------------------------------------------------------------------
/pawpaw/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | from .nlp import byte_order_controls, unicode_white_space_other, unicode_single_quote_marks, unicode_double_quote_marks, unicode_bullets, trimmable_ws
2 | 
3 | from .nlp import Number, KeyedPrefix, Paragraph, Sentence, SimpleNlp
4 | del nlp
5 | 


--------------------------------------------------------------------------------
/pawpaw/ontology/__init__.py:
--------------------------------------------------------------------------------
1 | from .ontology import Discoveries, Ontology
2 | del ontology
3 | 
4 | from ._query import OPERATORS, MUST_ESCAPE_CHARS, escape, descape, Query, compile, find_all, find
5 | del _query


--------------------------------------------------------------------------------
/pawpaw/ontology/ontology.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import itertools
 3 | import typing
 4 | 
 5 | from pawpaw import Ito, Types
 6 | from pawpaw.arborform import Itorator
 7 | import regex
 8 | 
 9 | 
10 | class Discoveries(dict):
11 |     def __init__(self, *args, **kwargs):
12 |         self._itos: list[Ito] = list(kwargs.pop('itos', tuple()))
13 |         dict.__init__(self, *args, **kwargs )
14 | 
15 |     @property
16 |     def itos(self) -> list[Ito]:
17 |         return self._itos   
18 |     
19 |     def __str__(self):
20 |         c = ', '.join(f'{k}: {str(v)}' for k, v in self.items())
21 |         return f'{{itos: {[str(i) for i in self._itos]}, {c}}}'
22 |     
23 |     def _flatten(self, filter_empties: bool = True, path: Types.C_OPATH = tuple()) -> dict[Types.C_OPATH, list[Ito]]:
24 |         rv = {} if len(self.itos) == 0 and filter_empties else {tuple(path): self.itos}
25 |         for key in self.keys():
26 |             rv |= self[key]._flatten(filter_empties, path + (key,))
27 |         return rv
28 | 
29 |     def flatten(self, filter_empties: bool = True) -> dict[Types.C_OPATH, list[Ito]]:
30 |         return self._flatten(filter_empties, )
31 |     
32 |     def walk(self) -> Types.C_IT_ITOS:
33 |         yield from self._itos
34 |         for child in self.values():
35 |             yield from child.walk()
36 | 
37 | 
38 | class Ontology(dict):
39 |     def __missing__(self, key):
40 |         if isinstance(key, typing.Sequence) and (lk := len(key)) > 0 and not isinstance(key, str):
41 |             rv = self[key[0]]
42 |             if lk > 1:
43 |                 rv = rv[key[1:]]
44 |             return rv
45 |         else:
46 |             raise KeyError(key)
47 | 
48 |     def __init__(self, *args, **kwargs):
49 |         self._rules: list[Types.C_ORULE] = kwargs.pop('rules', [])
50 |         dict.__init__(self, *args, **kwargs )
51 | 
52 |     @property
53 |     def rules(self) -> list[Types.C_ORULE]:
54 |         return self._rules
55 | 
56 |     def __str__(self):
57 |         c = ', '.join(f'{k}: {str(v)}' for k, v in self.items())
58 |         return f'{{rules: {self._rules}, {c}}}'   
59 |     
60 |     def discover(self, *itos: Ito) -> Discoveries:
61 |         rv = Discoveries()
62 | 
63 |         for rule in self._rules:
64 |             for i in itos:
65 |                 rv.itos.extend(rule(i))
66 | 
67 |         for k, v in self.items():
68 |             rv[k] = v.discover(*itos)
69 | 
70 |         return rv
71 | 
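
A discovery sketch (illustrative only, not from the repo): ontology rules are callables that take an Ito and return matching itos, so an Extract itorator can serve directly as a rule. Assumes Ito construction from a str:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract
    from pawpaw.ontology import Ontology

    ontology = Ontology(
        vehicle=Ontology(
            car=Ontology(rules=[Extract(regex.compile(r'(?P<car>sedan|coupe)'))]),
            truck=Ontology(rules=[Extract(regex.compile(r'(?P<truck>pickup)'))]),
        )
    )

    discoveries = ontology.discover(Ito('a coupe and a pickup'))
    for path, itos in discoveries.flatten().items():
        print(path, [str(i) for i in itos])   # ('vehicle', 'car') ['coupe'] ...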


--------------------------------------------------------------------------------
/pawpaw/query/__init__.py:
--------------------------------------------------------------------------------
1 | from ._query import OPERATORS, FILTER_KEYS, MUST_ESCAPE_CHARS, escape, descape, Query, compile, find_all, find
2 | del _query
3 | 


--------------------------------------------------------------------------------
/pawpaw/span.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import collections.abc
 3 | import types
 4 | import typing
 5 | 
 6 | from pawpaw.errors import Errors
 7 | 
 8 | 
 9 | class Span(typing.NamedTuple):
10 |     start: int
11 |     stop: int
12 |         
13 |     @classmethod
14 |     def from_indices(
15 |         cls,
16 |         basis: int | collections.abc.Sized,
17 |         start: int | None = None,
18 |         stop: int | None = None
19 |     ) -> Span:
20 |         if isinstance(basis, int):
21 |             length = basis
22 |         elif isinstance(basis, collections.abc.Sized):
23 |             length = len(basis)
24 |         else:
25 |             raise Errors.parameter_invalid_type('basis', basis, int, collections.abc.Sized)
26 | 
27 |         if start is None:
28 |             start = 0
29 |         elif not isinstance(start, int):
30 |             raise Errors.parameter_invalid_type('start', start, int, types.NoneType)
31 |         else:
32 |             start = min(length, start) if start >= 0 else max(0, length + start)
33 | 
34 |         if stop is None:
35 |             stop = length
36 |         elif not isinstance(stop, int):
37 |             raise Errors.parameter_invalid_type('stop', stop, int, types.NoneType)
38 |         else:
39 |             stop = min(length, stop) if stop >= 0 else max(0, length + stop)
40 |             
41 |         stop = max(start, stop)
42 | 
43 |         return Span(start, stop)
44 | 
45 |     def offset(self, i: int) -> Span:
46 |         if not isinstance(i, int):
47 |             raise Errors.parameter_invalid_type('i', i, int)
48 |             
49 |         if i == 0:
50 |             return self
51 |         
52 |         rv = Span(self.start + i, self.stop + i)
53 |         if rv.start < 0 or rv.stop < 0:
54 |             raise ValueError(f'offsetting by {i:,} results in negative indices')
55 |         
56 |         return rv
57 | 
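
Span.from_indices normalizes start/stop the way str slicing does; a short sketch using only what is defined above:

    from pawpaw import Span

    s = 'pawpaw'
    print(Span.from_indices(s))          # Span(start=0, stop=6)
    print(Span.from_indices(s, -3))      # Span(start=3, stop=6)
    print(Span.from_indices(10, 2, -2))  # Span(start=2, stop=8)
    print(Span(2, 5).offset(3))          # Span(start=5, stop=8)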


--------------------------------------------------------------------------------
/pawpaw/table/__init__.py:
--------------------------------------------------------------------------------
1 | from .table import *
2 | del table
3 | 
4 | import pawpaw.table.styles


--------------------------------------------------------------------------------
/pawpaw/table/styles/__init__.py:
--------------------------------------------------------------------------------
1 | from .styles import *
2 | del styles
3 | 


--------------------------------------------------------------------------------
/pawpaw/table/styles/styles.py:
--------------------------------------------------------------------------------
  1 | from pawpaw.table import TableStyle
  2 | 
  3 | """
  4 | Notes:
  5 | 
  6 | MUST HAVE CHARACTERISTICS:
  7 | 
  8 |     - Must be able to determine start and stop in order to identify within larger
  9 |         unstructured text
 10 | 
 11 |     - Have a way to distinguish columns and rows (i.e., a table represented with tabs
 12 |         doesn't allow for row delineations)
 13 | 
 14 |     - Optionally has a header row(s)
 15 | """
 16 | 
 17 | """
 18 | Style 1 [Unnamed]
 19 | 
 20 | -----+-----+-----
 21 |   A  |  B  |  C
 22 | -----+-----+-----      
 23 |  aaa | bbb | ccc
 24 | -----+-----+-----      
 25 | """
 26 | 
 27 | p = r'(?:-{2,}(?:\+-+)+)'
 28 | TYPE_1 = TableStyle(
 29 |     table_start_pat = p,
 30 |     row_sep_pat = p,
 31 |     equi_distant_indent=False
 32 | )
 33 | del p
 34 | 
 35 | 
 36 | """
 37 | Style 2 [Unnamed]
 38 | 
 39 | -------------------
 40 | |  A  |  B  |  C  |
 41 | |-----------------|
 42 | | aaa | bbb | ccc |
 43 | -------------------     
 44 | """
 45 | 
 46 | p = r'-{2,}'
 47 | TYPE_2 = TableStyle(
 48 |     table_start_pat = p,
 49 |     row_sep_pat = r'\|(?:-+\|)+',
 50 |     table_end_pat = p,
 51 |     equi_distant_indent=True
 52 | )
 53 | del p
 54 | 
 55 | 
 56 | """
 57 | markdown
 58 | 
 59 |     | A | B | C |
 60 |     |---|:-:|--:|
 61 |     | a | b | c |
 62 |     | d | e | f |
 63 | """
 64 | 
 65 | """
 66 | reStructuredText
 67 | 
 68 |     2.a rst Simple Table
 69 | 
 70 |     =====  =====  =======
 71 |     A      B      A and B
 72 |     =====  =====  =======
 73 |     False  False  False
 74 |     True   False  False
 75 |     False  True   False
 76 |     True   True   True
 77 |     =====  =====  =======
 78 | 
 79 |     2.b rst Grid Table
 80 | 
 81 |     +------------+------------+-----------+
 82 |     | Header 1   | Header 2   | Header 3  |
 83 |     +============+============+===========+
 84 |     | body row 1 | column 2   | column 3  |
 85 |     +------------+------------+-----------+
 86 |     | body row 2 | Cells may span columns.|
 87 |     +------------+------------+-----------+
 88 |     | body row 3 | Cells may  | - Cells   |
 89 |     +------------+ span rows. | - contain |
 90 |     | body row 4 |            | - blocks. |
 91 |     +------------+------------+-----------+
 92 | """
 93 | 
 94 | """
 95 | ASCII doc
 96 | 
 97 |     [cols="e,m,^,>s",width="25%"]
 98 |     |============================
 99 |     |1 >s|2 |3 |4
100 |     ^|5 2.2+^.^|6 .3+<.>m|7
101 |     ^|8
102 |     |9 2+>|10
103 |     |============================
104 | """
105 |     
106 | """
107 | ASCII Misc
108 | 
109 |     pipe, hypen, plus
110 |     
111 |     +---+---+---+
112 |     | A | B | C |
113 |     +---+---+---+
114 |     | a | b | c |
115 |     +---+---+---+
116 |     | d | e | f |
117 |     +---+---+---+
118 | 
119 |     pipe, em-dash, plus
120 | 
121 |     +———+———+———+
122 |     | A | B | C |
123 |     +———+———+———+
124 |     | a | b | c |
125 |     +———+———+———+
126 |     | d | e | f |
127 |     +———+———+———+
128 | 
129 |     misc ascii box drawing line styles
130 |     
131 |     ┌───┬───┬───┐
132 |     │ A │ B │ C │
133 |     ├───┼───┼───┤
134 |     │ a │ b │ c │
135 |     ├───┼───┼───┤
136 |     │ d │ e │ f │
137 |     └───┴───┴───┘    
138 | 
139 |     ┏━━━┳━━━┳━━━┓
140 |     ┃ A ┃ B ┃ C ┃
141 |     ┣━━━╋━━━╋━━━┫
142 |     ┃ a ┃ b ┃ c ┃
143 |     ┣━━━╋━━━╋━━━┫
144 |     ┃ d ┃ e ┃ f ┃
145 |     ┗━━━┻━━━┻━━━┛
146 | 
147 |     ┏━━━┳━━━┳━━━┓
148 |     ┃ A ┃ B ┃ C ┃
149 |     ┡━━━╇━━━╇━━━┩
150 |     │ a │ b │ c │
151 |     ├───┼───┼───┤
152 |     │ d │ e │ f │
153 |     └───┴───┴───┘
154 |         
155 |     ╔═══╦═══╦═══╗
156 |     ║ A ║ B ║ C ║
157 |     ╠═══╬═══╬═══╣
158 |     ║ a ║ b ║ c ║
159 |     ╠═══╬═══╬═══╣
160 |     ║ d ║ e ║ f ║
161 |     ╚═══╩═══╩═══╝    
162 | 
163 |     ╔═══╤═══╤═══╗
164 |     ║ A │ B │ C ║
165 |     ╟───┼───┼───╢
166 |     ║ a │ b │ c ║
167 |     ╟───┼───┼───╢
168 |     ║ d │ e │ f ║
169 |     ╚═══╧═══╧═══╝    
170 | """
171 | 
172 | 


--------------------------------------------------------------------------------
/pawpaw/table/table.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod, abstractproperty
 2 | import dataclasses
 3 | 
 4 | import regex
 5 | import pawpaw
 6 | 
 7 | 
 8 | class Table(ABC):
 9 |     @property
10 |     @abstractmethod
11 |     def re(self) -> regex.Pattern:
12 |         ...
13 | 
14 |     @abstractmethod
15 |     def get_itor(self) -> pawpaw.arborform.Itorator:
16 |         ...
17 | 
18 | 
19 | @dataclasses.dataclass
20 | class TableStyle:
21 |     pre_caption_pat: str | None = None
22 |     table_start_pat: str = ''
23 |     header_row_end_pat: str | None = None
24 |     row_sep_pat: str = ''
25 |     table_end_pat: str | None = None
26 |     post_caption_pat: str | None = None
27 |     equi_distant_indent: bool = True
28 | 
29 | 
30 | class StyledTable(Table):
31 |     # finds equidistant indentation (zero or more spaces or tabs) chunks
32 |     _pat_indent = r'[ \t]*'
 33 |     _re_equi_ident = regex.compile(rf'(?<=^|\n)(?P<chunk>(?P<indent>{_pat_indent})[^ \t][^\n]+?\n(?:(?P=indent)[^ \t][^\n]+?(?:\n|$))+)', regex.DOTALL)
34 | 
35 |     @classmethod
36 |     def _build_re(cls, style: TableStyle) -> regex.Pattern:
37 |         re = r'(?<=^|\n)'
38 | 
39 |         if style.equi_distant_indent:
 40 |             re = rf'(?P<indent>{cls._pat_indent})'
41 |             pat_indent = r'(?P=indent)'
42 |         else:
43 |             pat_indent = r''
44 | 
 45 |         re += r'(?<table>'
46 | 
47 |         if style.pre_caption_pat is not None:
 48 |             re += rf'(?:(?<pre_caption>{style.pre_caption_pat})\n{pat_indent})?'
49 | 
50 |         re += rf'{style.table_start_pat}'
51 | 
52 |         if style.header_row_end_pat is not None:
 53 |             re += rf'(?:\n{pat_indent}(?<header_row>.+?)\n{pat_indent}{style.header_row_end_pat})?'
54 |             
55 |         if style.table_end_pat is None:
 56 |             re += rf'(?:\n{pat_indent}(?<row>.+?)\n{pat_indent}{style.row_sep_pat})+'
57 |         else:
 58 |             re += rf'(?:\n{pat_indent}(?<row>.+?)\n{pat_indent}{style.row_sep_pat})*\n{pat_indent}(?<row>.+?)'
59 |             re += rf'\n{pat_indent}{style.table_end_pat}'
60 |             
61 |         if style.post_caption_pat is not None:
 62 |             re += rf'\n{pat_indent}(?<post_caption>{style.post_caption_pat})(?=\n|$)'
63 | 
64 |         re += r')(?=$|\n)'
65 | 
66 |         return regex.compile(re, regex.DOTALL)
67 | 
68 |     def __init__(self, style: TableStyle, tag: str | None = None):
69 |         self.style = style
70 |         self._re = self._build_re(style)
71 |         self.tag = tag
72 | 
73 |     @property
74 |     def re(self) -> regex.Pattern:
75 |         return self._re
76 | 
77 |     def get_itor(self) -> pawpaw.arborform.Itorator:
78 |         itor_table = pawpaw.arborform.Extract(self._re, tag=self.tag, group_filter=lambda m, gk: gk in ('pre_caption', 'table', 'header_row', 'row', 'post_caption'))
79 |         if not self.style.equi_distant_indent:
80 |             return itor_table
81 | 
82 |         itor_equi_ident = pawpaw.arborform.Extract(self._re_equi_ident, tag='equidistant indentation', group_filter=('chunk',))
83 |         con = pawpaw.arborform.Connectors.Delegate(itor_table, 'chunk')
84 |         itor_equi_ident.connections.append(con)
85 |         return itor_equi_ident
86 | 
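
A sketch wiring a TableStyle into StyledTable (illustrative only, not from the repo; TYPE_1 comes from pawpaw.table.styles, and the exact itos yielded depend on Extract/Ito.from_re):

    import pawpaw
    from pawpaw.table import StyledTable
    from pawpaw.table.styles import TYPE_1

    # Build the sample text line by line so the table rows start at column 0
    text = '\n'.join([
        'intro text',
        '-----+-----+-----',
        '  A  |  B  |  C',
        '-----+-----+-----',
        ' aaa | bbb | ccc',
        '-----+-----+-----',
        'trailing text',
    ])

    itor = StyledTable(TYPE_1, tag='type 1 table').get_itor()
    for i in itor(pawpaw.Ito(text)):
        print(i.desc, repr(str(i)))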


--------------------------------------------------------------------------------
/pawpaw/util.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import typing
  3 | 
  4 | import pawpaw
  5 | 
  6 | 
  7 | # Finds indices of non-doubled escape chars
  8 | def find_escapes(
  9 |     src: str | pawpaw.Ito,
 10 |     escape: str = '\\',
 11 |     start: int | None = None,
 12 |     stop: int | None = None
 13 | ) -> typing.Iterable[int]:
 14 |     if isinstance(src, str):
 15 |         s = src
 16 |         offset = 0
 17 |     elif isinstance(src, pawpaw.Ito):
 18 |         s = src.string
 19 |         offset = src.start
 20 |     else:
 21 |         raise pawpaw.Errors.parameter_invalid_type('src', src, str, pawpaw.Ito)
 22 | 
 23 |     span = pawpaw.Span.from_indices(src, start, stop).offset(offset)
 24 | 
 25 |     if not isinstance(escape, str):
 26 |         raise pawpaw.Errors.parameter_invalid_type('escape', escape, str)
 27 |     elif len(escape) != 1:
 28 |         raise ValueError('parameter \'escape\' must have length 1')
 29 | 
 30 |     esc = False
 31 |     for i in range(span.start, span.stop):
 32 |         c = s[i]
 33 |         if c == escape:
 34 |             esc = not esc
 35 |         elif esc:
 36 |             yield i - offset - 1
 37 |             esc = False
 38 | 
 39 | 
 40 | def find_unescaped(
 41 |     src: str | pawpaw.Ito,
 42 |     chars: str,
 43 |     escape: str = '\\',
 44 |     start: int | None = None,
 45 |     stop: int | None = None
 46 | ) -> typing.Iterable[int]:
 47 |     if isinstance(src, str):
 48 |         s = src
 49 |         offset = 0
 50 |     elif isinstance(src, pawpaw.Ito):
 51 |         s = src.string
 52 |         offset = src.start
 53 |     else:
 54 |         raise pawpaw.Errors.parameter_invalid_type('src', src, str, pawpaw.Ito)
 55 | 
 56 |     span = pawpaw.Span.from_indices(src, start, stop).offset(offset)
 57 | 
 58 |     if not isinstance(chars, str):
 59 |         raise pawpaw.Errors.parameter_invalid_type('chars', chars, str)
 60 |     elif len(chars) == 0:
 61 |         raise ValueError('parameter \'chars\' must have non-zero length')
 62 | 
 63 |     if not isinstance(escape, str):
 64 |         raise pawpaw.Errors.parameter_invalid_type('escape', escape, str)
 65 |     elif len(escape) != 1:
 66 |         raise ValueError('parameter \'escape\' must have length 1')
 67 | 
 68 |     esc = False
 69 |     for i in range(span.start, span.stop):
 70 |         c = s[i]
 71 |         if esc:
 72 |             esc = False
 73 |         elif c == escape:
 74 |             esc = True
 75 |         elif c in chars:
 76 |             yield i - offset
 77 | 
 78 |     if esc:
 79 |         raise ValueError(f'parameter \'src\' ends with un-followed escape char \'{escape}\'')
 80 | 
 81 | 
 82 | def split_unescaped(
 83 |     src: str | pawpaw.Ito,
 84 |     char: str,
 85 |     escape: str = '\\',
 86 |     start: int | None = None,
 87 |     stop: int | None = None
 88 | ) -> typing.Iterable[str] | typing.Iterable[pawpaw.Ito]:
 89 |     cur = 0
 90 |     for i in find_unescaped(src, char, escape, start, stop):
 91 |         yield src[cur:i]
 92 |         cur = i + 1
 93 |     yield src[cur:]
 94 | 
 95 | 
 96 | def find_balanced(
 97 |     src: str | pawpaw.Ito,
 98 |     lchar: str | pawpaw.Ito,
 99 |     rchar: str | pawpaw.Ito,
100 |     escape: str = '\\',
101 |     start: int | None = None,
102 |     stop: int | None = None
103 | ) -> typing.Iterable[str] | typing.Iterable[pawpaw.Ito]:
104 |     if isinstance(src, str):
105 |         s = src
106 |         offset = 0
107 |     elif isinstance(src, pawpaw.Ito):
108 |         s = src.string
109 |         offset = src.start
110 |     else:
111 |         raise pawpaw.Errors.parameter_invalid_type('src', src, str, pawpaw.Ito)
112 | 
113 |     if not (isinstance(lchar, str) or isinstance(lchar, pawpaw.Ito)):
114 |         raise pawpaw.Errors.parameter_invalid_type('left', lchar, str, pawpaw.Ito)
115 |     elif len(lchar) != 1:
116 |         raise ValueError('parameter \'left\' must have length 1')
117 |     lchar = str(lchar)
118 | 
119 |     if not (isinstance(rchar, str) or isinstance(rchar, pawpaw.Ito)):
120 |         raise pawpaw.Errors.parameter_invalid_type('right', rchar, str, pawpaw.Ito)
121 |     elif len(rchar) != 1:
122 |         raise ValueError('parameter \'right\' must have length 1')
123 |     rchar = str(rchar)
124 | 
125 |     lefts = []
126 |     for i in find_unescaped(src, lchar + rchar, escape, start, stop):
127 |         c = s[offset + i]
128 |         if c == lchar and (lchar != rchar or len(lefts) == 0):
129 |             lefts.append(i)
130 |         else:
131 |             len_lefts = len(lefts)
132 |             if len_lefts > 1:
133 |                 lefts.pop()
134 |             elif len_lefts == 1:
135 |                 yield src[lefts.pop():i+1]
136 |             else:
137 |                 raise ValueError(f'unbalanced right char {rchar} found at index {i}')
138 |         
139 |     if len(lefts) != 0:
140 |         raise ValueError(f'unbalanced left char {lchar} found at index {lefts.pop()}')
141 | 
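The helpers above are escape-aware building blocks: find_unescaped locates delimiter characters that are not protected by a backslash, split_unescaped splits on them, and find_balanced yields only the outermost balanced bracket regions. A minimal usage sketch (the sample strings and variable names below are illustrative only, not from the library):

    from pawpaw.util import split_unescaped, find_balanced

    s = r'key=a\;b;flags=on;done'
    # Split on ';' only where it is not escaped -> ['key=a\;b', 'flags=on', 'done']
    fields = [*split_unescaped(s, ';')]

    s = r'f(a, g(b), \(c\)) + h(d)'
    # Yield only the outermost balanced regions; escaped parens are ignored
    # -> ['(a, g(b), \(c\))', '(d)']
    regions = [*find_balanced(s, '(', ')')]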


--------------------------------------------------------------------------------
/pawpaw/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | import pawpaw.visualization.sgr
2 | 
3 | from .highlighter import Highlighter
4 | del highlighter
5 | 
6 | import pawpaw.visualization.ascii_box
7 | 
8 | import pawpaw.visualization.pepo
9 | 


--------------------------------------------------------------------------------
/pawpaw/visualization/highlighter.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import typing
 3 | 
 4 | import pawpaw
 5 | from pawpaw.visualization import sgr
 6 | 
 7 | 
 8 | class Highlighter:
 9 |     '''
10 |     - Guarantees differing colors across all Ito boundaries
11 |     - A parent Ito's sub-spans might not all receive the same color, because doing so is not always possible.
12 |       Consider a two-color palette with this nesting:
13 |         A-------------A     Prefix and suffix get color 1
14 |             B------B        Assign color 2 so that boundaries AB and BA are visible
15 |             C---C           If color 1, boundary AC is invisible; if color 2, boundary CB is invisible
16 |     '''
17 | 
18 |     def __init__(self, palette: sgr.C_PALETTE):
19 |         self._backs = tuple(sgr.Back.from_color(col) for col in palette)
20 | 
21 |     def _compose(self, predicate: pawpaw.Types.P_ITO, it_back: typing.Iterator[sgr.Back], ito: pawpaw.Ito, str_slice: slice | None = None):
22 |         if predicate(ito):
23 |             prefix = f'{next(it_back)}'
24 |             suffix = f'{sgr.Back.RESET}'
25 |         else:
26 |             prefix = suffix = ''
27 | 
28 |         if str_slice is None:
29 |             s = f'{ito}'
30 |         else:
31 |             s = f'{ito.string[str_slice]}'
32 | 
33 |         return f'{prefix}{s}{suffix}'
34 |             
35 |     def _print(self, ito: pawpaw.Types.P_ITO, predicate: pawpaw.Types.P_ITO, it_back: typing.Iterator[sgr.Back]):
36 |         if len(ito.children) == 0:
37 |             if len(ito) > 0:
38 |                 print(self._compose(predicate, it_back, ito), end='')
39 |             return
40 | 
41 |         last = ito.start
42 |         for child in ito.children:
43 |             if last < child.start:
44 |                 print(self._compose(predicate, it_back, ito, slice(last, child.start)), end='')
45 |             self._print(child, predicate, it_back)
46 |             last = child.stop
47 |         if last < ito.stop:
48 |             print(self._compose(predicate, it_back, ito, slice(last, ito.stop)), end='')
49 | 
50 |     def print(self, ito: pawpaw.Ito, predicate: pawpaw.Types.P_ITO = lambda ito: True) -> None:
51 |         self._print(ito, predicate, itertools.cycle(self._backs))
52 | 
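A minimal sketch of using Highlighter with one of the bundled palettes (the sample sentence and the 'phrase' desc are illustrative only):

    import pawpaw
    from pawpaw.visualization import Highlighter, sgr

    root = pawpaw.Ito('The quick brown fox', desc='phrase')
    root.children.add(*root.str_split())  # one child Ito per word

    # Each Ito receives the next background color from the palette, so adjacent
    # itos (and the parent's gaps between them) stay visually distinguishable.
    Highlighter(sgr.PAWPAW).print(root)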


--------------------------------------------------------------------------------
/pawpaw/visualization/pepo/__init__.py:
--------------------------------------------------------------------------------
1 | from .pepo import Pepo, Compact, Tree, Xml, Json
2 | del pepo


--------------------------------------------------------------------------------
/pawpaw/visualization/pepo/pepo.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | # Force the pure-Python XML parser (not the faster C version) so that its methods can be hooked
  3 | sys.modules['_elementtree'] = None
  4 | import abc
  5 | import json
  6 | import io
  7 | import os
  8 | import typing
  9 | from xml.sax.saxutils import escape as xml_escape
 10 | 
 11 | import pawpaw
 12 | from pawpaw.visualization import ascii_box
 13 | 
 14 | 
 15 | class Pepo(abc.ABC):
 16 |     def __init__(self, indent: str = '    ', children: bool = True):
 17 |         self.linesep: str = os.linesep
 18 |         self.indent: str = indent
 19 |         self.children = children
 20 | 
 21 | 
 22 |     @abc.abstractmethod
 23 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
 24 |         ...
 25 | 
 26 |     def dumps(self, *itos: pawpaw.Ito) -> str:
 27 |         with io.StringIO() as fs:
 28 |             self.dump(fs, *itos)
 29 |             fs.seek(0)
 30 |             return fs.read()
 31 | 
 32 | 
 33 | class _PepoFstr(Pepo):
 34 |     def __init__(self, indent: str = '    ', children: bool = True, fstr: str = '%desc'):
 35 |         super().__init__(indent, children)
 36 |         self.fstr = fstr
 37 | 
 38 | 
 39 | class Compact(_PepoFstr):
 40 |     def __init__(self, indent: str = '    ', children: bool = True):
 41 |         super().__init__(indent, children, '%span %desc!r : \'%substr!1r1:40…% \'')
 42 |         self.children = children
 43 | 
 44 |     def _dump(self, fs: typing.IO, ei: pawpaw.Types.C_EITO, level: int = 0) -> None:
 45 |         fs.write(f'{self.indent * level}{ei.index:,}: {ei.ito:{self.fstr}}{self.linesep}')
 46 | 
 47 |         if self.children:
 48 |             level += 1
 49 |             for eic in (pawpaw.Types.C_EITO(i, ito) for i, ito in enumerate(ei.ito.children, start=1)):
 50 |                 self._dump(fs, eic, level)
 51 | 
 52 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
 53 |         for ei in (pawpaw.Types.C_EITO(i, ito) for i, ito in enumerate(itos, start=1)):
 54 |             if not isinstance(ei.ito, pawpaw.Ito):
 55 |                 raise pawpaw.Errors.parameter_iterable_contains_invalid_type('itos', ei.ito, pawpaw.Ito)
 56 |             self._dump(fs, ei)
 57 | 
 58 | 
 59 | class Tree(_PepoFstr):
 60 |     HORZ = ascii_box.BoxDrawingChar.from_char('─')
 61 |     VERT = ascii_box.BoxDrawingChar.from_char('│')
 62 |     TEE = ascii_box.BoxDrawingChar.from_char('├')
 63 |     ELBOW = ascii_box.BoxDrawingChar.from_char('└')
 64 | 
 65 |     def __init__(self, indent: str = '  ', children: bool = True):
 66 |         super().__init__(indent, children, '%span %desc!r : \'%substr!1r1:^40…% \'')
 67 |         self.children = False
 68 | 
 69 |     def _dump_children(self, fs: typing.IO, ito: pawpaw.Ito, prefix: str = '') -> None:
 70 |         for child in ito.children[:-1]:
 71 |             fs.write(f'{prefix}'
 72 |                      f'{self.TEE}'
 73 |                      f'{self.HORZ.char * len(self.indent)}'
 74 |                      f'{child:{self.fstr}}'
 75 |                      f'{self.linesep}')
 76 |             self._dump_children(fs, child, prefix + f'{self.VERT}{self.indent}')
 77 | 
 78 |         if len(ito.children) > 0:
 79 |             child = ito.children[-1]
 80 |             fs.write(f'{prefix}'
 81 |                      f'{self.ELBOW}'
 82 |                      f'{self.HORZ.char * len(self.indent)}'
 83 |                      f'{child:{self.fstr}}'
 84 |                      f'{self.linesep}')
 85 |             self._dump_children(fs, child, prefix + f' {self.indent}')
 86 | 
 87 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
 88 |         for ito in itos:
 89 |             if not isinstance(ito, pawpaw.Ito):
 90 |                 raise pawpaw.Errors.parameter_invalid_type('*itos', ito, pawpaw.Ito)
 91 |             fs.write(f'{ito:{self.fstr}}{self.linesep}')
 92 |             self._dump_children(fs, ito)
 93 | 
 94 | 
 95 | class Xml(Pepo):
 96 |     def __init__(self, indent: str = '    ', children: bool = True):
 97 |         super().__init__(indent, children)
 98 | 
 99 |     def _dump(self, fs: typing.IO, ei: pawpaw.Types.C_EITO, level: int = 0) -> None:
100 |         fs.write(f'{level * self.indent}')
101 |         desc = '' if ei.ito.desc is None else xml_escape(ei.ito.desc)
102 |         fs.write(f'<ito start="{ei.ito.start}" stop="{ei.ito.stop}" desc="{desc}">')
103 |         level += 1
104 |         fs.write(self.linesep)
105 | 
106 |         fs.write(f'{level * self.indent}')
107 |         fs.write(xml_escape(str(ei.ito)))
108 |         fs.write(f'{self.linesep}')
109 |         if self.children and len(ei.ito.children) > 0:
110 |             fs.write(f'{level * self.indent}<children>{self.linesep}')
111 | 
112 |             level += 1
113 |             for i, ito in enumerate(ei.ito.children):
114 |                 child = pawpaw.Types.C_EITO(i, ito)
115 |                 self._dump(fs, child, level)
116 | 
117 |             level -= 1
118 |             fs.write(f'{level * self.indent}</children>{self.linesep}')
119 | 
120 |         level -= 1
121 |         fs.write(f'{level * self.indent}</ito>{self.linesep}')
122 | 
123 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
124 |         fs.write(f'<?xml version="1.0" encoding="UTF-8"?>{self.linesep}')
125 |         fs.write(f'<itos>{self.linesep}')
126 |         for ito in itos:
127 |             if not isinstance(ito, pawpaw.Ito):
128 |                 raise pawpaw.Errors.parameter_iterable_contains_invalid_type('itos', ito, pawpaw.Ito)
129 |             self._dump(fs, pawpaw.Types.C_EITO(0, ito), 1)
130 |         fs.write(f'</itos>{self.linesep}')
131 | 
132 | 
133 | class Json(Pepo):
134 |     def __init__(self, indent: str = '    ', children: bool = True):
135 |         super().__init__(indent, children)
136 | 
137 |     def _dump(self, fs: typing.IO, ei: pawpaw.Types.C_EITO, level: int = 0) -> None:
138 |         fs.write(level * self.indent + '{' + self.linesep)
139 | 
140 |         level += 1
141 |         fs.write(f'{level * self.indent}"start": {ei.ito.start},{self.linesep}')
142 |         fs.write(f'{level * self.indent}"stop": {ei.ito.stop},{self.linesep}')
143 |         if ei.ito.desc is None:
144 |             desc = "null"
145 |         else:
146 |             desc = json.encoder.encode_basestring(ei.ito.desc)
147 |         fs.write(f'{level * self.indent}"desc": {desc},{self.linesep}')
148 |         substr = json.encoder.encode_basestring(str(ei.ito))
149 |         fs.write(f'{level * self.indent}"substr": {substr},{self.linesep}')
150 |         if self.children:
151 |             fs.write(f'{level * self.indent}"children": [')
152 |             if len(ei.ito.children) == 0:
153 |                 fs.write(f']{self.linesep}')
154 |             else:
155 |                 fs.write(self.linesep)
156 | 
157 |                 level += 1
158 |                 for i, ito in enumerate(ei.ito.children):
159 |                     child = pawpaw.Types.C_EITO(i, ito)
160 |                     self._dump(fs, child, level)
161 |                     if i < len(ei.ito.children) - 1:
162 |                         fs.write(',')
163 |                     fs.write(self.linesep)
164 | 
165 |                 level -= 1
166 |                 fs.write(f'{level * self.indent}]{self.linesep}')
167 | 
168 |         level -= 1
169 |         fs.write(level * self.indent + '}')
170 | 
171 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
172 |         fs.write('{' + self.linesep)
173 | 
174 |         fs.write(f'{self.indent}"itos": [')
175 | 
176 |         comma_needed = False
177 |         for ito in itos:
178 |             if not isinstance(ito, pawpaw.Ito):
179 |                 raise pawpaw.Errors.parameter_invalid_type('*itos', ito, pawpaw.Ito)
180 |             if comma_needed:
181 |                 fs.write(',')
182 |             fs.write(self.linesep)
183 |             self._dump(fs, pawpaw.Types.C_EITO(0, ito), 2)
184 |             comma_needed = True
185 |         fs.write(self.linesep)
186 | 
187 |         fs.write(self.indent + ']' + self.linesep)
188 | 
189 |         fs.write('}' + self.linesep)
190 | 
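A minimal sketch of the Pepo dumpers above (the sample text is illustrative; output shapes are approximate):

    import pawpaw
    from pawpaw.visualization import pepo

    root = pawpaw.Ito('See Jack run.', desc='phrase')
    root.children.add(*root.str_split())

    print(pepo.Compact().dumps(root))  # indented, one line per Ito
    print(pepo.Tree().dumps(root))     # box-drawing tree of the Ito hierarchy
    print(pepo.Json().dumps(root))     # start/stop/desc/substr/children as JSON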


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/__init__.py:
--------------------------------------------------------------------------------
1 | from .sgr import encode, RESET_ALL, Intensity, Italic, Underline, Blink, Invert, Conceal, Strike, Font
2 | from .sgr import C_COLOR, C_PALETTE, Colors, Fore, Back
3 | del sgr
4 | 
5 | from pawpaw.visualization.sgr.palettes import *
6 | 


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/palettes/__init__.py:
--------------------------------------------------------------------------------
1 | from .palettes import *
2 | del palettes


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/palettes/palettes.py:
--------------------------------------------------------------------------------
 1 | from pawpaw.visualization.sgr import C_PALETTE, Colors
 2 | 
 3 | 
 4 | AIR_FORCE_ONE: C_PALETTE = (
 5 |     Colors.Rgb.from_24_bit(0x3C79B4),  # Dark Blue
 6 |     Colors.Rgb.from_24_bit(0xC9ECF5),  # Light Blue
 7 |     Colors.Rgb.from_24_bit(0xF6F6F6),  # Off White
 8 |     Colors.Rgb.from_24_bit(0xB7986C),  # Brown
 9 | )
10 | 
11 | OLD_GLORY: C_PALETTE = (
12 |     Colors.Rgb.from_24_bit(0xB31942),  # Red
13 |     Colors.Rgb.from_24_bit(0xFFFFFF),  # White
14 |     Colors.Rgb.from_24_bit(0x0A3161),  # Blue
15 | )
16 | 
17 | PAWPAW: C_PALETTE = (
18 |     Colors.Rgb.from_24_bit(0x533E30),  # Royal Brown
19 |     Colors.Rgb.from_24_bit(0xD2AC70),  # Light French Beige
20 |     Colors.Rgb.from_24_bit(0xE4D1AE),  # Desert Sand
21 |     Colors.Rgb.from_24_bit(0x517D3D),  # Fern Green
22 |     Colors.Rgb.from_24_bit(0x90C246),  # Android Green
23 | )
24 | """
25 |   "Oriental Beauty"
26 |   https://www.schemecolor.com/oriental-beauty-color-combination.php
27 | """
28 | 
29 | TULIP_FIELD: C_PALETTE = (
30 |     Colors.Rgb.from_24_bit(0xFF6C98),  # Dark Pink
31 |     Colors.Rgb.from_24_bit(0xFEAA6D),  # Orange 
32 |     Colors.Rgb.from_24_bit(0xF7BACB),  # Light Pink
33 |     Colors.Rgb.from_24_bit(0xD879A2),  # Purple
34 |     Colors.Rgb.from_24_bit(0xF9E841),  # Yellow
35 |     Colors.Rgb.from_24_bit(0xE53F5D),  # Red
36 | )
37 | 
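A palette is simply a sequence of colors, so additional palettes can be declared the same way; a small sketch (MY_PALETTE is not part of the library) mixing the three color types:

    from pawpaw.visualization import sgr

    MY_PALETTE: sgr.C_PALETTE = (
        sgr.Colors.Named.BRIGHT_CYAN,          # 4-bit named color
        sgr.Colors.Rgb.from_24_bit(0x336699),  # 24-bit RGB
        sgr.Colors.EightBit(208),              # 256-color index
    )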


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/sgr.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | from dataclasses import dataclass
  3 | import enum
  4 | import typing
  5 | 
  6 | from pawpaw import Errors
  7 | 
  8 | 
  9 | """
 10 | SGR (Select Graphic Rendition) - see https://en.wikipedia.org/wiki/ANSI_escape_code
 11 | """
 12 | def encode(*n: int) -> str:
 13 |     if len(n) == 0:
 14 |         n = '0'
 15 |     else:
 16 |         n = ';'.join(str(i) for i in n)
 17 |     return f'\033[{n}m'
 18 | 
 19 | RESET_ALL: str = encode(0)
 20 | 
 21 |     
 22 | @dataclass(frozen=True)
 23 | class _Sgr:
 24 |     RESET : str
 25 | 
 26 | 
 27 | @dataclass(frozen=True)
 28 | class _Intensity(_Sgr):
 29 |     BOLD : str = encode(1)
 30 |     DIM  : str = encode(2)
 31 |     RESET: str = encode(22)
 32 | 
 33 | 
 34 | Intensity = _Intensity()
 35 | 
 36 | 
 37 | @dataclass(frozen=True)
 38 | class _Italic(_Sgr):
 39 |     ON   : str = encode(3)
 40 |     RESET: str = encode(23)
 41 | 
 42 | 
 43 | Italic = _Italic()
 44 | 
 45 | 
 46 | @dataclass(frozen=True)
 47 | class _Underline(_Sgr):
 48 |     SINGLE: str = encode(4)
 49 |     DOUBLE: str = encode(21)
 50 |     RESET : str = encode(24)
 51 | 
 52 | 
 53 | Underline = _Underline()
 54 | 
 55 | 
 56 | @dataclass(frozen=True)
 57 | class _Blink(_Sgr):
 58 |     SLOW : str = encode(5)
 59 |     RAPID: str = encode(6)
 60 |     RESET: str = encode(25)
 61 | 
 62 | 
 63 | Blink = _Blink()
 64 | 
 65 | 
 66 | @dataclass(frozen=True)
 67 | class _Invert(_Sgr):
 68 |     ON   : str = encode(7)
 69 |     RESET: str = encode(27)
 70 | 
 71 | 
 72 | Invert = _Invert()
 73 | 
 74 | 
 75 | @dataclass(frozen=True)
 76 | class _Conceal(_Sgr):
 77 |     ON   : str = encode(8)
 78 |     RESET: str = encode(28)
 79 | 
 80 | 
 81 | Conceal = _Conceal()
 82 | 
 83 | 
 84 | @dataclass(frozen=True)
 85 | class _Strike(_Sgr):
 86 |     ON   : str = encode(9)
 87 |     RESET: str = encode(29)
 88 | 
 89 | 
 90 | Strike = _Strike()
 91 | 
 92 | 
 93 | @dataclass(frozen=True)
 94 | class _Font(_Sgr):
 95 |     ALT_1: str = encode(11)
 96 |     ALT_2: str = encode(12)
 97 |     ALT_3: str = encode(13)
 98 |     ALT_4: str = encode(14)
 99 |     ALT_5: str = encode(15)
100 |     ALT_6: str = encode(16)
101 |     ALT_7: str = encode(17)
102 |     ALT_8: str = encode(18)
103 |     ALT_9: str = encode(19)
104 |     RESET: str = encode(10)
105 | 
106 | 
107 | Font = _Font()
108 | 
109 | 
110 | @dataclass
111 | class _Colors:
112 |     class Named(enum.IntEnum):
113 |         BLACK  : int = 0
114 |         RED    : int = 1
115 |         GREEN  : int = 2
116 |         YELLOW : int = 3
117 |         BLUE   : int = 4
118 |         MAGENTA: int = 5
119 |         CYAN   : int = 6
120 |         WHITE  : int = 7
121 | 
122 |         BRIGHT_BLACK  : int  = 60
123 |         BRIGHT_RED    : int  = 61
124 |         BRIGHT_GREEN  : int  = 62
125 |         BRIGHT_YELLOW : int  = 63
126 |         BRIGHT_BLUE   : int  = 64
127 |         BRIGHT_MAGENTA: int  = 65
128 |         BRIGHT_CYAN   : int  = 66
129 |         BRIGHT_WHITE  : int  = 67
130 | 
131 | 
132 |     class Rgb(typing.NamedTuple):
133 |         red: int
134 |         green: int
135 |         blue: int
136 | 
137 |         @classmethod
138 |         def from_24_bit(cls, val: int) -> _Colors.Rgb:
139 |             return cls(val >> 16, (val >> 8) & 0xFF, val & 0xFF)
140 | 
141 | 
142 |     class EightBit(int):
143 |         """
144 |             0-  7:  standard colors (as in ESC [ 30–37 m)
145 |             8- 15:  high intensity colors (as in ESC [ 90–97 m)
146 |             16-231:  6 × 6 × 6 cube (216 colors): 16 + 36 × r + 6 × g + b (0 ≤ r, g, b ≤ 5)
147 |             232-255:  grayscale from dark to light in 24 steps
148 |         """
149 |         pass
150 | 
151 | 
152 | Colors = _Colors()
153 | 
154 | C_COLOR = Colors.Named | Colors.Rgb | Colors.EightBit
155 | C_PALETTE = typing.Sequence[C_COLOR]
156 | 
157 | 
158 | @dataclass(frozen=True)
159 | class Fore(_Sgr):
160 |     _NAMED_OFFSET: int = 30
161 |     _BY_IDX      : int = 38
162 |     RESET        : str = encode(39)
163 | 
164 |     @classmethod
165 |     def from_color(cls, src: C_COLOR) -> str:
166 |         if isinstance(src, Colors.Named):
167 |             nc = getattr(Colors.Named, src.name)
168 |             return encode(nc.value + cls._NAMED_OFFSET)
169 |         elif isinstance(src, Colors.Rgb):
170 |             return encode(cls._BY_IDX, 2, *src)
171 |         elif isinstance(src, Colors.EightBit):
172 |             return encode(cls._BY_IDX, 5, src)
173 |         else:
174 |             raise Errors.parameter_invalid_type('src', src, Colors.Named, Colors.Rgb, Colors.EightBit)
175 |         
176 |     def __init__(self, src: C_COLOR):
177 |         object.__setattr__(self, '_value', self.from_color(src))
178 |         
179 |     def __str__(self) -> str:
180 |         return self._value
181 | 
182 | 
183 | @dataclass(frozen=True)
184 | class Back(Fore):
185 |     _NAMED_OFFSET: int = Fore._NAMED_OFFSET + 10
186 |     _BY_IDX      : int = Fore._BY_IDX + 10
187 |     RESET        : str = encode(49)
188 | 
189 |     def __init__(self, src: C_COLOR):
190 |         super().__init__(src)
191 | 
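A minimal sketch of emitting SGR sequences directly with the types above (the sample text is illustrative only):

    from pawpaw.visualization import sgr

    fore = sgr.Fore.from_color(sgr.Colors.Named.BRIGHT_YELLOW)
    back = sgr.Back.from_color(sgr.Colors.Rgb.from_24_bit(0x0A3161))
    print(f'{fore}{back}highlighted{sgr.RESET_ALL} plain')

    # Attribute groups reset independently of colors
    print(f'{sgr.Intensity.BOLD}bold{sgr.Intensity.RESET} and {sgr.Underline.SINGLE}underlined{sgr.Underline.RESET}')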


--------------------------------------------------------------------------------
/pawpaw/xml/__init__.py:
--------------------------------------------------------------------------------
1 | from pawpaw.xml import descriptors
2 | 
3 | from .xml_helper import QualifiedName, EtName, XmlErrors, XmlHelper
4 | del xml_helper
5 | 
6 | from .xml_parser import XmlParser
7 | del xml_parser
8 | 
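A rough sketch of one way the exported XmlParser might be wired in, assuming it acts as a drop-in xml.etree.ElementTree parser (the sample document is illustrative, and this usage is an assumption, not confirmed by this file):

    import xml.etree.ElementTree as ET
    from pawpaw.xml import XmlParser

    text = '<root><child id="1">hello</child></root>'
    root = ET.fromstring(text, parser=XmlParser())
    # If parsing succeeds, the resulting elements are expected to carry
    # Ito-based offsets back into the original text.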


--------------------------------------------------------------------------------
/pawpaw/xml/descriptors.py:
--------------------------------------------------------------------------------
 1 | ATTRIBUTES: str = 'attributes'
 2 | ATTRIBUTE: str = 'attribute'
 3 | COMMENT: str = 'comment'
 4 | ELEMENT: str = 'element'
 5 | END_TAG: str = 'end_tag'
 6 | NAME: str = 'name'
 7 | NAMESPACE: str = 'namespace'
 8 | PI: str = 'pi'
 9 | START_TAG: str = 'start_tag'
10 | TAG: str = 'tag'
11 | TEXT: str = 'text'
12 | VALUE: str = 'value'
13 | 
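These constants appear to be the desc values used when building XML-derived Ito trees, so they can be referenced in traversal queries instead of hard-coded strings; a small sketch (ito_tree is an assumed, already-parsed Ito, and the query uses the same '**[d:...]' form seen in the tests):

    from pawpaw.xml import descriptors

    # Find the first start tag anywhere beneath the root
    first_start_tag = ito_tree.find(f'**[d:{descriptors.START_TAG}]')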


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "pawpaw"
 7 | dynamic = ["version"]
 8 | authors = [
 9 |   { name="Robert L. Ayers", email="rlayers@yahoo.com" },
10 | ]
11 | description = "High Performance Text Processing & Segmentation Framework"
12 | readme = "README.md"
13 | requires-python = ">=3.10"
14 | license = "MIT"
15 | license-files = { paths = ["LICENSE"] }
16 | dependencies = [
17 |   "regex >= 2023.8.8",
18 | ]
19 | keywords = [
20 |   "nlp",
21 |   "information-extraction",
22 |   "text-processing",
23 |   "text-segmentation",
24 |   "hierarchical-text-segmentation",
25 |   "python",
26 |   "xml-parser",
27 |   "extract-text",
28 |   "knowledge-graph",
29 | ]
30 | classifiers = [
31 |     # See https://pypi.org/classifiers/
32 |     "Programming Language :: Python :: 3.10",
33 |     "Programming Language :: Python :: 3.11",
34 |     "License :: OSI Approved :: MIT License",
35 |     "Operating System :: OS Independent",
36 |     "Development Status :: 3 - Alpha",
37 |     "Intended Audience :: Developers",
38 |     "Topic :: Software Development :: Libraries :: Python Modules",
39 |     "Topic :: Text Processing"
40 | ]
41 | 
42 | [project.urls]
43 | "Homepage" = "https://github.com/rlayers/pawpaw"
44 | "Bug Tracker" = "https://github.com/rlayers/pawpaw/issues"
45 | 
46 | [tool.hatch.version]
47 | path = "pawpaw/_version.py"
48 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/__init__.py


--------------------------------------------------------------------------------
/tests/arborform/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/arborform/__init__.py


--------------------------------------------------------------------------------
/tests/arborform/test_invert.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import Extract, Invert
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestInvert(_TestIto):
 8 |     def test_transform(self):
 9 |         s = ' a1b2c '
10 |         root = Ito(s)
11 | 
12 |         non_gap_desc = 'nongap'
13 |         gap_desc = 'gap'
14 | 
15 |         extract_res = [
16 |             regex.compile(r'.', regex.DOTALL),
17 |             regex.compile(r'\s', regex.DOTALL),
18 |             regex.compile(r'[a-z]', regex.DOTALL),
19 |             regex.compile(r'\d', regex.DOTALL),
20 |             regex.compile(r'\S', regex.DOTALL),
21 |             regex.compile(r'_', regex.DOTALL),
22 |         ]
23 | 
24 |         for re in extract_res:
25 |             with self.subTest(re=re.pattern):
26 |                 itor_extract = Extract(re, desc=lambda match, gk: non_gap_desc)
27 |                 non_gaps = [*itor_extract(root)]
28 |                 expected = [*Ito.from_gaps(root, non_gaps, gap_desc)]
29 | 
30 |                 itor_gaps = Invert(itor_extract, desc=gap_desc)
31 |                 self.assertSequenceEqual(expected, [*itor_gaps(root)])
32 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_desc.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import Desc
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestDesc(_TestIto):
 8 |     def test_transform(self):
 9 |         s = ' abc '
10 |         root = Ito(s, 1, -1)
11 |         self.assertIsNone(root.desc)
12 | 
13 |         desc = 'changed'
14 |         itor = Desc(desc)
15 |         rv = [*itor._transform(root)]
16 |         self.assertEqual(1, len(rv))
17 | 
18 |         rv = rv[0]
19 |         self.assertIs(root, rv)
20 |         self.assertEqual(desc, rv.desc)
21 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_filter.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import Itorator, Filter, Connectors
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestFilter(_TestIto):
 8 |     def test_traverse_partial(self):
 9 |         s = '1a2b3c'
10 |         root = Ito(s)
11 | 
12 |         split_chr = Itorator.wrap(lambda ito: ito)
13 |         rv = [*split_chr(root)]
14 |         self.assertEqual(len(s), len(rv))
15 | 
16 |         for ft, f in [('None', lambda ito: False), ('All', lambda ito: True), ('Partial', Ito.str_isnumeric)]:
17 |             with self.subTest(filter_type=ft):
18 |                 split_chr = split_chr.clone()
19 |                 filter = Filter(f)
20 |                 con = Connectors.Delegate(filter)
21 |                 split_chr.connections.append(con)
22 |                 expected = [i for i in root if f(i)]
23 |                 actual = [*split_chr(root)]
24 |                 self.assertSequenceEqual(expected, actual)
25 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_reflect.py:
--------------------------------------------------------------------------------
 1 | from pawpaw import Ito
 2 | from pawpaw.arborform import Reflect
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestReflect(_TestIto):
 7 |     def test_transform(self):
 8 |         s = 'abc'
 9 |         root = Ito(s)
10 |         self.add_chars_as_children(root, 'Child')
11 | 
12 |         reflect = Reflect()
13 |         rv = [*reflect._transform(root)]
14 |         self.assertEqual(1, len(rv))
15 |         self.assertIs(root, rv[0])
16 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_split.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import typing
  3 | 
  4 | import regex
  5 | from pawpaw import Ito
  6 | from pawpaw.arborform import Itorator, Split
  7 | from tests.util import _TestIto
  8 | 
  9 | 
 10 | class TestSplit(_TestIto):
 11 |     PREFIX = 'PRE'
 12 |     MIDDLE = 'MID'
 13 |     SUFFIX = 'SUF'
 14 |   
 15 |     @classmethod
 16 |     def str_from(cls, sep: str) -> str:
 17 |         return sep.join((cls.PREFIX, cls.MIDDLE, cls.MIDDLE, cls.SUFFIX))
 18 |     
 19 |     SEP_DESC = 'sep'
 20 | 
 21 |     @classmethod
 22 |     def re_from(cls, sep: str) -> regex.Pattern:
 23 |         return regex.compile(regex.escape(sep), regex.DOTALL)
 24 |     
 25 |     @classmethod
 26 |     def expected_from(cls, s: str, sep: str, brt: Split.BoundaryRetention) -> typing.List[str]:
 27 |         if sep == '':
 28 |             return [c for c in s]
 29 |       
 30 |         rv = s.split(sep)
 31 |         if brt == Split.BoundaryRetention.LEADING:
 32 |             del rv[0]
 33 |             for i, s in enumerate(rv):
 34 |                 rv[i] = sep + s
 35 |         elif brt == Split.BoundaryRetention.TRAILING:
 36 |             del rv[-1]
 37 |             for i, s in enumerate(rv):
 38 |                 rv[i] += sep
 39 |         elif brt == Split.BoundaryRetention.ALL:
 40 |             rv = rv[:1] + list(itertools.chain.from_iterable((sep, i) for i in rv[1:]))
 41 | 
 42 |         return rv
 43 |     
 44 |     valid_ctor_params = {
 45 |         'splitter': [Itorator.wrap(lambda ito: ito.str_split()), regex.compile(r'\s+', regex.DOTALL)],
 46 |         'limit': [-1, -0, 1, None],
 47 |         'boundary_retention': list(Split.BoundaryRetention),
 48 |         'return_zero_split': [False, True],
 49 |         'desc': ['abc', None],
 50 |         'tag': ['abc', None],
 51 |     }
 52 | 
 53 |     def test_ctor_valid(self):
 54 |         keys, values = zip(*self.valid_ctor_params.items())
 55 |         for kwargs in [dict(zip(keys, v)) for v in itertools.product(*values)]:
 56 |             with self.subTest(**kwargs):
 57 |                 itor = Split(**kwargs)
 58 | 
 59 |     invalid_ctor_params = {
 60 |         'splitter': [None, True, 1, 'abc'],
 61 |         'limit': [1.0, 'abc'],
 62 |         'boundary_retention': [None, True, 1, 'abc'],
 63 |         'return_zero_split': [None, 1, 'abc'],
 64 |         'desc': [True, 1],
 65 |         'tag': [True, 1.3],
 66 |     }
 67 | 
 68 |     def test_ctor_invalid(self):
 69 |         valids = {k: v[0] for k, v in self.valid_ctor_params.items()}
 70 |         for k, vs in self.invalid_ctor_params.items():
 71 |             invalids = dict(**valids)
 72 |             for v in vs:
 73 |                 invalids[k] = v
 74 |                 with self.subTest(**invalids):
 75 |                     with self.assertRaises(TypeError):
 76 |                         itor = Split(**invalids)
 77 | 
 78 |     def test_iter_simple(self):
 79 |         for sep in ' ', '-':  # '', ' ', '-':
 80 |             s = self.str_from(sep)
 81 |             ito = Ito(s, desc='root')
 82 |             re = self.re_from(sep)
 83 |             for brt in Split.BoundaryRetention:
 84 |                 with self.subTest(string=s, separator=sep, boundary_retention=brt):
 85 |                     expected = self.expected_from(s, sep, brt)
 86 |                     non_sep_desc = 'split'
 87 |                     split = Split(re, boundary_retention=brt, desc=non_sep_desc)
 88 |                     actual = [*split._transform(ito)]
 89 |                     self.assertListEqual(expected, [str(i) for i in actual])
 90 |                     self.assertTrue(all(i.desc in (None, non_sep_desc) for i in actual))
 91 | 
 92 |     def test_iter_sep_not_present(self):
 93 |         sep = 'XXX'
 94 |         s = self.str_from(' ')
 95 |         ito = Ito(s)
 96 |         re = regex.compile(regex.escape(sep))
 97 |         desc='post-split'
 98 |         for brt in Split.BoundaryRetention:
 99 |             for return_zero_split in True, False:
100 |                 with self.subTest(string=s, separator=sep, boundary_retention=brt, return_zero_split=return_zero_split, desc=desc):
101 |                     expected = [ito.clone(desc=desc)] if return_zero_split else []
102 |                     split = Split(re, boundary_retention=brt, return_zero_split=return_zero_split, desc=desc)
103 |                     actual = [*split._transform(ito)]
104 |                     self.assertListEqual(expected, actual)
105 | 
106 |     @classmethod
107 |     def zero_width_patterns(cls, sep: str) -> typing.Iterable[regex.Pattern]:
108 |         esc_sep = regex.escape(sep)
109 |         yield r'(?<=' + esc_sep + r')'  # look behind
110 |         yield r'(?=' + esc_sep + r')'  # look ahead
111 |     
112 |     def test_iter_zero_width_matches(self):
113 |         sep = '.'
114 |         s = self.str_from(sep)
115 |         ito = Ito(s, desc='root')
116 |         for pat in self.zero_width_patterns(sep):
117 |             re = regex.compile(pat)
118 |             for brt in Split.BoundaryRetention:
119 |                 with self.subTest(string=s, pattern=pat, boundary_retention=brt):
120 |                     expected = re.split(s)
121 |                     if brt == Split.BoundaryRetention.LEADING:
122 |                         del expected[0]
123 |                     elif brt == Split.BoundaryRetention.TRAILING:
124 |                         del expected[-1]
125 |                     desc = 'split'
126 |                     split = Split(re, boundary_retention=brt, desc=desc)
127 |                     actual = [*split._transform(ito)]
128 |                     self.assertListEqual(expected, [str(i) for i in actual])
129 |                     self.assertTrue(all(i.desc == desc for i in actual))
130 | 
131 |     def test_limit(self):
132 |         s = 'abc'
133 |         root = Ito(s)
134 |         
135 |         re = regex.compile('(?=.)')
136 |         for limit in None, *range(0, len(s)):
137 |             with self.subTest(re=re.pattern, limit=limit):
138 |                 splitter = Split(re, limit=limit)
139 |                 rv = [*splitter(root)]
140 |                 expected = []
141 |                 if limit is None:
142 |                     expected.extend(root)
143 |                 elif limit == 0:
144 |                     expected.append(root)
145 |                 else:
146 |                     expected.extend(i for i in root[:limit-1] if len(i) > 0)  # split parts
147 |                     expected.append(root.clone(limit-1))  # remaining part
148 |                 self.assertSequenceEqual(expected, rv)
149 | 
150 |         re = regex.compile('(?<=.)')
151 |         for limit in None, *range(0, len(s)):
152 |             with self.subTest(re=re.pattern, limit=limit):
153 |                 splitter = Split(re, limit=limit)
154 |                 rv = [*splitter(root)]
155 |                 expected = []
156 |                 if limit is None:
157 |                     expected.extend(root)
158 |                 elif limit == 0:
159 |                     expected.append(root)
160 |                 else:
161 |                     expected.extend(i for i in root[:limit] if len(i) > 0)  # split parts
162 |                     expected.append(root.clone(limit))  # remaining part
163 |                 self.assertSequenceEqual(expected, rv)
164 | 
165 |         re = regex.compile('b')
166 |         for limit in None, *range(0, len(s)):
167 |             with self.subTest(re=re.pattern, limit=limit):
168 |                 splitter = Split(re, limit=limit)
169 |                 rv = [*splitter(root)]
170 |                 expected = []
171 |                 if limit is None or limit > 0:
172 |                     expected.extend(root.str_split('b'))
173 |                 else:
174 |                     expected.append(root)
175 |                 self.assertSequenceEqual(expected, rv)
176 |                 
177 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_value_func.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import ValueFunc
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestValueFunc(_TestIto):
 8 |     def test_transform(self):
 9 |         s = '123'
10 |         root = Ito(s)
11 |         self.assertEqual(str(root), root.value())
12 | 
13 |         f = lambda i: int(str(i))
14 |         itor = ValueFunc(f)
15 |         rv = [*itor._transform(root)]
16 |         self.assertEqual(1, len(rv))
17 | 
18 |         rv = rv[0]
19 |         self.assertIs(root, rv)
20 |         self.assertEqual(f(rv), rv.value())
21 | 


--------------------------------------------------------------------------------
/tests/arborform/test_nuco.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito, arborform
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestNuco(_TestIto):
 7 |     def test_init(self):
 8 |         itor_a = arborform.Reflect()
 9 |         itor_b = arborform.Desc('123')
10 |         tag = 'abc'
11 |         nuco = arborform.Nuco(itor_a, itor_b, tag=tag)
12 |         self.assertListEqual([itor_a, itor_b], nuco._itorators)
13 |         self.assertEqual(tag, nuco.tag)
14 | 
15 |     def test_transform(self):
16 |         s = 'She bought 12 eggs'
17 |         root = Ito(s)
18 | 
19 |         itor_split = arborform.Itorator.wrap(lambda ito: ito.str_split())
20 | 
21 |         itor_num = arborform.Extract(regex.compile(r'(?P<number>\d+)'))
22 |         itor_word = arborform.Desc('word')
23 |         
24 |         itor_nuco = arborform.Nuco(itor_num, itor_word)
25 |         con = arborform.Connectors.Delegate(itor_nuco)
26 |         itor_split.connections.append(con)
27 | 
28 |         root.children.add(*itor_split(root))
29 |         self.assertEqual(len(s.split()), len(root.children))
30 |         for tok in root.children:
31 |             expected = 'number' if tok.str_isdecimal() else 'word'
32 |             self.assertEqual(expected, tok.desc)
33 | 


--------------------------------------------------------------------------------
/tests/arborform/test_postorator.py:
--------------------------------------------------------------------------------
 1 | from itertools import tee
 2 | 
 3 | import regex
 4 | from pawpaw import Ito, Types
 5 | from pawpaw.arborform import Split
 6 | from pawpaw.arborform.postorator import Postorator
 7 | from tests.util import _TestIto
 8 | 
 9 | 
10 | class TestPostorator(_TestIto):
11 |     post_desc = 'joined'
12 | 
13 |     @classmethod
14 |     def simple(cls, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
15 |         yield Ito.join(*itos)
16 | 
17 |     def test_traverse(self):
18 |         for s in 'One', 'One Two', 'One Two Three', 'One Two Three Four':
19 |             itos = Ito(s).str_split()
20 |             with self.subTest(string=s, itos=itos, desc=self.post_desc):
21 |                 wrapped = Postorator.wrap(self.simple)
22 |                 expected = [*self.simple(itos)]
23 |                 actual = [*wrapped(itos)]
24 |                 self.assertListEqual(expected, actual)
25 | 
26 |     def test_post(self):
27 |         for s in 'One', 'One Two', 'One Two Three', 'One Two Three Four':
28 |             root = Ito(s, desc='root')
29 |             splitter = Split(regex.compile(r'\s+'), desc=root.desc)
30 | 
31 |             rv = [*splitter(root)]
32 |             self.assertListEqual(root.str_split(), rv)
33 | 
34 |             splitter.postorator = Postorator.wrap(self.simple)
35 |             expected = [Ito(s)]
36 |             actual = [*splitter(root)]
37 |             self.assertListEqual(expected, actual)
38 | 


--------------------------------------------------------------------------------
/tests/arborform/test_postorator_windowed_join.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito, Types
 3 | from pawpaw.arborform.postorator import WindowedJoin
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestWindowedJoin(_TestIto):
 8 |     def test_window_size(self):
 9 |         func = lambda itos: True
10 |         for window_size in -1, 0, 1, 2:
11 |           with self.subTest(window_size=window_size):
12 |               if window_size < 2:
13 |                   with self.assertRaises(ValueError):
14 |                       WindowedJoin(window_size, func)
15 |               else:
16 |                   WindowedJoin(window_size, func)
17 | 
18 |     def test_traverse_tautology(self):
19 |         func = lambda itos: True
20 |         re = regex.compile(r'\s')
21 |         for s in '', 'One', 'One Two', 'One Two Three', 'One Two Three Four':
22 |             root = Ito(s, desc='root')
23 |             itos = root.split(re)
24 |             desc = 'merged'
25 |             for window_size in 2, 3, 4:
26 |                 with self.subTest(string=s, window_size=window_size, desc=desc):
27 |                     wj = WindowedJoin(window_size, func, desc=desc)
28 |                     actual = [*wj(itos)]
29 |                     if len(itos) < window_size:
30 |                         self.assertListEqual(itos, actual)
31 |                     else:
32 |                         joined_count = len(itos) // window_size
33 |                         unjoined_count = len(itos) % window_size
34 |                         self.assertEqual(joined_count + unjoined_count, len(actual))
35 | 
36 |                         for i in range(0, joined_count):
37 |                             expected = Ito.join(*itos[i * window_size:i * window_size + window_size], desc=desc)
38 |                             self.assertEqual(expected, actual[i])
39 | 
40 |                         if unjoined_count > 0:
41 |                             tail = itos[-unjoined_count:]
42 |                             self.assertListEqual(tail, actual[-unjoined_count:])
43 | 
44 |     def test_traverse_non_tautology(self):
45 |         s = 'One Two Three Four'
46 |         root = Ito(s, desc='root')
47 |         itos = root.str_split()
48 | 
49 |         window_size = 2
50 |         func = lambda itos: all(i.str_startswith('T') for i in itos)
51 |         desc = 'merged'
52 | 
53 |         wj = WindowedJoin(window_size, func, desc=desc)
54 |         actual = [*wj(itos)]
55 |         self.assertEqual(3, len(actual))
56 |         self.assertEqual(itos[0], actual[0])
57 |         self.assertEqual(itos[-1], actual[-1])
58 |         self.assertEqual('Two Three', str(actual[1]))
59 | 


--------------------------------------------------------------------------------
/tests/ito/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/ito/__init__.py


--------------------------------------------------------------------------------
/tests/ito/test_ito_descend.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | 
 3 | import regex
 4 | from pawpaw import Ito
 5 | from tests.util import _TestIto
 6 | 
 7 | class TestItoDescend(_TestIto):
 8 |     def test_descends_from(self):
 9 |         root = Ito('abcde')
10 |         root.children.add(c := Ito(root, 1, -1))
11 |         c.children.add(gc := Ito(c, 1, -1))
12 | 
13 |         for desc, basis, expected in (
14 |             ('root', root, False),
15 |             ('child', c, True),
16 |             ('grandchild', gc, True),
17 |         ):
18 |             with self.subTest(desc=f'{desc}.descends_from(root) is {expected}'):
19 |                 self.assertEqual(expected, basis.descends_from(root))
20 | 
21 |             with self.subTest(desc=f'{desc}.clone().descends_from(root) is False'):
22 |                 self.assertFalse(basis.clone().descends_from(root))
23 | 
24 |     def test_has_descendant(self):
25 |         root = Ito('abcde')
26 |         root.children.add(c := Ito(root, 1, -1))
27 |         c.children.add(gc := Ito(c, 1, -1))
28 | 
29 |         for desc, basis, expected in (
30 |             ('root', root, False),
31 |             ('child', c, True),
32 |             ('grandchild', gc, True),
33 |         ):
34 |             with self.subTest(desc=f'root.has_descendant({desc}) is {expected}'):
35 |                 self.assertEqual(expected, root.has_descendant(basis))
36 | 
37 |             with self.subTest(desc=f'root.has_descendant({desc}.clone) is False'):
38 |                 self.assertFalse(root.has_descendant(basis.clone()))                
39 | 


--------------------------------------------------------------------------------
/tests/ito/test_ito_regex_equivalence_methods.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestItoRegexEquivalenceMethods(_TestIto):
 7 |     def test_regex_finditer(self):
 8 |         strings = '', 'A', 'Here are some words.'
 9 |         paddings = '', ' ', '_'
10 |         for string in strings:
11 |             for padding in paddings:
12 |                 s = f'{padding}{string}{padding}'
13 |                 pad_slice = slice(len(padding), -len(padding))
14 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
15 |                 for re_str in r' ', r'\w+', r'(?P<word>\w+)':
16 |                     re = regex.compile(re_str, regex.DOTALL)
17 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
18 |                         expected = [*re.finditer(s, pos=pad_slice.start, endpos=pad_slice.stop)]
19 |                         actual = [*ito.regex_finditer(re)]
20 |                         self.assertEqual(len(expected), len(actual))
21 |                         for e, a in zip(expected, actual):
22 |                             self.assertEqual(e, a)
23 | 
24 |     def test_regex_match(self):
25 |         strings = '', 'A', 'Here are some words.'
26 |         paddings = '', ' ', '_'
27 |         for string in strings:
28 |             for padding in paddings:
29 |                 s = f'{padding}{string}{padding}'
30 |                 pad_slice = slice(len(padding), -len(padding))
31 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
32 |                 for re_str in r' ', r'\w+', r'(?P<word>\w+)':
33 |                     re = regex.compile(re_str, regex.DOTALL)
34 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
35 |                         expected = re.match(s, pos=pad_slice.start, endpos=pad_slice.stop)
36 |                         actual = ito.regex_match(re)
37 |                         self.assertEqual(expected, actual)
38 | 
39 |     def test_regex_search(self):
40 |         strings = '', 'A', 'Here are some words.'
41 |         paddings = '', ' ', '_'
42 |         for string in strings:
43 |             for padding in paddings:
44 |                 s = f'{padding}{string}{padding}'
45 |                 pad_slice = slice(len(padding), -len(padding))
46 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
47 |                 for re_str in r' ', r'\w+', r'(?P<word>\w+)':
48 |                     re = regex.compile(re_str, regex.DOTALL)
49 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
50 |                         expected = re.search(s, pos=pad_slice.start, endpos=pad_slice.stop)
51 |                         actual = ito.regex_search(re)
52 |                         self.assertEqual(expected, actual)
53 | 
54 |     def test_regex_split_simple(self):
55 |         strings = '', 'A', 'Here are some words.'
56 |         separators = ' ', '\n', '\r\n'
57 |         paddings = '', ' ', '_'
58 |         for string in strings:
59 |             for sep in separators:
60 |                 s = string.replace(' ', sep)
61 |                 for padding in paddings:
62 |                     s = f'{padding}{s}{padding}'
63 |                     pad_slice = slice(len(padding), -len(padding))
64 |                     ito = Ito(s, pad_slice.start, pad_slice.stop)
65 |                     re = regex.compile(regex.escape(sep), regex.DOTALL)
66 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
67 |                         expected = re.split(s[pad_slice])
68 |                         actual = ito.regex_split(re)
69 |                         self.assertListEqual(expected, [str(i) for i in actual])
70 | 
71 |     def test_regex_split_sep_not_present(self):
72 |         strings = '', 'A', 'Here are some words.'
73 |         separator = 'XXX'
74 |         paddings = '', ' ', '_'
75 |         for string in strings:
76 |             for padding in paddings:
77 |                 s = f'{padding}{string}{padding}'
78 |                 pad_slice = slice(len(padding), -len(padding))
79 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
80 |                 re = regex.compile(regex.escape(separator), regex.DOTALL)
81 |                 with self.subTest(string=s, ito=ito, pattern=re.pattern):
82 |                     expected = re.split(s[pad_slice])
83 |                     actual = ito.regex_split(re)
84 |                     self.assertListEqual(expected, [str(i) for i in actual])
85 | 


--------------------------------------------------------------------------------
/tests/ito/test_ito_serialization.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pickle
 3 | 
 4 | from pawpaw import Ito
 5 | from tests.util import _TestIto
 6 | 
 7 | 
 8 | class TestItoSerialization(_TestIto):
 9 |     def setUp(self) -> None:
10 |         super().setUp()
11 | 
12 |         s = 'See Jack run.'
13 |         self.h_ito = Ito(s, desc='Phrase')
14 |         self.h_ito.children.add(*self.h_ito.str_split())
15 |         for c in self.h_ito.children:
16 |             c.desc = 'Word'
17 |             self.add_chars_as_children(c, 'Char')
18 | 
19 |     def test_pickle_serialize(self):
20 |         word = self.h_ito.find('**[d:Word]')
21 |         pickle_data = pickle.dumps(word)
22 |         self.assertLess(0, len(pickle_data))
23 |       
24 |     def test_pickle_deserialize(self):
25 |         w_orig = self.h_ito.find('**[d:Word]')
26 |         pickle_data = pickle.dumps(w_orig)
27 |         w_deser = pickle.loads(pickle_data)
28 |         self.assertEqual(w_orig, w_deser)
29 | 
30 |     def test_json_serialize(self):
31 |         word = self.h_ito.find('**[d:Word]')
32 |         js_data = json.dumps(word, cls=Ito.JsonEncoder)
33 |         expected_prefix = '{"__type__": "typing.Tuple[str, Ito]", "string": "' + \
34 |             word.string + \
35 |             '", "ito": {"__type__": "Ito", "span": ' + \
36 |             str(list(word.span)) + \
37 |             ', "desc": "' + \
38 |             word.desc + \
39 |             '"'
40 |         self.assertTrue(js_data.startswith(expected_prefix))      
41 | 
42 |     def test_json_deserialize(self):
43 |         w_orig = self.h_ito.find('**[d:Word]')
44 |         js_data = json.dumps(w_orig, cls=Ito.JsonEncoder)
45 |         w_deser = json.loads(js_data, object_hook=Ito.json_decoder)
46 |         self.assertIsNot(w_orig, w_deser)
47 |         self.assertEqual(w_orig, w_deser)
48 |         
49 |     def test_json_stringless_serialize(self):
50 |         word = self.h_ito.find('**[d:Word]')
51 |         js_data = json.dumps(word, cls=Ito.JsonEncoderStringless)
52 |         expected_prefix = '{"__type__": "Ito", "span": ' + \
53 |             str(list(word.span)) + \
54 |             ', "desc": "' + \
55 |             word.desc + \
56 |             '"'
57 |         self.assertTrue(js_data.startswith(expected_prefix))
58 | 
59 |     def test_json_stringless_deserialize(self):
60 |         w_orig = self.h_ito.find('**[d:Word]')
61 |         js_data = json.dumps(w_orig, cls=Ito.JsonEncoderStringless)
62 |         w_deser = Ito.json_decode_stringless(w_orig.string, js_data)
63 |         self.assertIsNot(w_orig, w_deser)
64 |         self.assertEqual(w_orig, w_deser)
65 | 


--------------------------------------------------------------------------------
/tests/ito/test_ito_utility_methods.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | 
 3 | from pawpaw import Ito
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestItoUtilityMethods(_TestIto):
 8 | 
 9 |     def test_to_line_str_col_empty(self):
10 |         s = 'a\nb\nc'
11 |         line = 1
12 |         col = 0
13 |         for i in range(0, len(s)):
14 |             col += 1
15 |             ito = Ito(s, i, i + 1)
16 |             with self.subTest(string=s, ito=ito.span):
17 |                 expected = line, col
18 |                 actual = ito.to_line_col('\n')
19 |                 self.assertEqual(expected, actual)
20 |                 if s[i] == '\n':
21 |                     line += 1
22 |                     col = 0
23 | 
24 |     def test_to_line_str_non_empty(self):
25 |         s = 'a\r\nb\r\nc'
26 |         line = 1
27 |         col = 0
28 |         for i in range(0, len(s) - 1):
29 |             col += 1
30 |             ito = Ito(s, i, i + 2)
31 |             with self.subTest(string=s, ito=ito.span):
32 |                 expected = line, col
33 |                 actual = ito.to_line_col('\r\n')
34 |                 self.assertEqual(expected, actual)
35 |                 if s[i] == '\n':
36 |                     line += 1
37 |                     col = 0
38 | 
39 |     def test_to_line_regex_non_empty(self):
40 |         string = 'abc\r\ndef\nghi'
41 |         eol = regex.compile(r'\r?\n', regex.DOTALL)
42 |         matches = eol.findall(string)
43 |         for i, ito in enumerate(Ito.from_gaps(string, Ito.from_re(eol, string)), 1):
44 |             for sub in ito:
45 |                 with self.subTest(ito=sub):
46 |                     expected = i, 1 + sub.start - ito.start
47 |                     self.assertEqual(expected, sub.to_line_col(eol))
48 | 


--------------------------------------------------------------------------------
/tests/nlp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/nlp/__init__.py


--------------------------------------------------------------------------------
/tests/ontology/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/ontology/__init__.py


--------------------------------------------------------------------------------
/tests/ontology/test_keyed_list.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | 
 3 | import regex
 4 | import pawpaw
 5 | from tests.util import _TestIto
 6 | 
 7 | 
 8 | class TestKeyedList(_TestIto):
 9 |     _sample_list_keys = [
10 |         ['1', '2', '3'],
11 |         ['1.1', '1.2', '1.3'],
12 |         ['A', 'B', 'C'],
13 |     ]
14 | 
15 |     _sample_list_values = [
16 |         'First line.',
17 |         'Second line.',
18 |         'Third line.',
19 |     ]
20 | 
21 |     # def test_itorator(self) -> None:
22 |     #     itor = pawpaw.nlp.KeyedList().get_itor()
23 |     #
24 |     #     for sks in self._sample_list_keys:
25 |     #         for key_sep in ['.', ')', ':']:
26 |     #             list_lines = [f'{k}{key_sep} {val}' for k, val in zip(sks, self._sample_list_values)]
27 |     #             for line_sep in ['\n', '\n\r']:
28 |     #                 _list = pawpaw.Ito(line_sep.join(list_lines))
29 |     #                 with self.subTest(_list=_list):
30 |     #                     rv = [*itor(_list)]
31 |     #                     self.assertEqual(len(sks), len(rv))
32 | 


--------------------------------------------------------------------------------
/tests/ontology/test_ontology.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import typing
  3 | 
  4 | import regex
  5 | import pawpaw
  6 | from pawpaw.ontology import Ontology
  7 | from tests.util import _TestIto
  8 | 
  9 | 
 10 | class TestOntology(_TestIto):
 11 |     def setUp(self) -> None:
 12 |         super().setUp()
 13 | 
 14 |         self.ontology = Ontology(
 15 |             {
 16 |                 'vehicle': Ontology(
 17 |                     {
 18 |                         'car': Ontology(
 19 |                             {
 20 |                                 'Ford': Ontology(
 21 |                                     rules=[
 22 |                                         pawpaw.arborform.Extract(
 23 |                                             regex.compile(
 24 |                                                 r'(?P<mustang>(?:Ford\s+)?Mustang(?:(?:-|\s+)\L<subtypes>)?)',
 25 |                                                 regex.IGNORECASE | regex.DOTALL,
 26 |                                                 subtypes=['EcoBoost', 'LX', 'GT', 'GT350', 'GT500', 'Mach-E', 'Dark Horse']
 27 |                                             )
 28 |                                         ),
 29 |                                         pawpaw.arborform.Extract(
 30 |                                             regex.compile(
 31 |                                                 r'(?P<f_series>F(?:ord)?-(?:150(?:\s+Lightning)?|[3-7]50|600))',
 32 |                                                 regex.IGNORECASE | regex.DOTALL
 33 |                                             )
 34 |                                         ),
 35 |                                     ]
 36 |                                 )
 37 |                             }
 38 |                         ),
 39 |                         'airplane': Ontology(
 40 |                             {
 41 |                                 'Cessna': Ontology(
 42 |                                     rules=[
 43 |                                         pawpaw.arborform.Extract(
 44 |                                             regex.compile(
 45 |                                                 r'(?P<skyhawk>Cessna\s+172(?:\s+Skyhawk)?|(?:Cessna\s+)?172\s+Skyhawk)',
 46 |                                                 regex.IGNORECASE | regex.DOTALL
 47 |                                             )
 48 |                                         ),
 49 |                                         pawpaw.arborform.Extract(
 50 |                                             regex.compile(
 51 |                                                 r'(?P<skylane>Cessna\s+182(?:\s+Skylane)?|(?:Cessna\s+)?182\s+Skylane)',
 52 |                                                 regex.IGNORECASE | regex.DOTALL
 53 |                                             )
 54 |                                         ),
 55 |                                         pawpaw.arborform.Extract(
 56 |                                             regex.compile(
 57 |                                                 r'(?P<stationair>Cessna\s+206(?:\s+Stationair)?|(?:Cessna\s+)?206\s+Stationair)',
 58 |                                                 regex.IGNORECASE | regex.DOTALL
 59 |                                             )
 60 |                                         ),
 61 |                                         pawpaw.arborform.Extract(
 62 |                                             regex.compile(
 63 |                                                 r'(?P<caravan>Cessna\s+208(?:\s+Caravan)?|(?:Cessna\s+)?208\s+Caravan)',
 64 |                                                 regex.IGNORECASE | regex.DOTALL
 65 |                                             )
 66 |                                         ),
 67 |                                     ]
 68 |                                 )
 69 |                             }
 70 |                         ),                        
 71 |                     },
 72 |                     rules=[pawpaw.arborform.Extract(regex.compile(r'(?P<vehicle>vehicles?)', regex.IGNORECASE))]
 73 |                 )
 74 |             }
 75 |         )
 76 | 
 77 |     def test_ctor(self):
 78 |         for rules in [], [pawpaw.arborform.Extract(regex.compile(r'abc'))]:
 79 |             for items in {}, {'a': Ontology()}, {'a': Ontology(), 'rules': Ontology()}:
 80 |                 for b in None, Ontology():
 81 |                     with self.subTest(rules=rules, items=items, b=b):
 82 |                         args = []
 83 |                         if len(items) > 0:
 84 |                             args.append(items)
 85 |                         
 86 |                         kwargs = {}
 87 |                         if b is not None:
 88 |                             kwargs['b'] = b
 89 |                         items_expected = (items | kwargs).items()
 90 |                         
 91 |                         if len(rules) > 0:
 92 |                             kwargs['rules'] = rules
 93 |                         
 94 |                         ont = Ontology(*args, **kwargs)
 95 |                         
 96 |                         self.assertSequenceEqual(items_expected, ont.items())
 97 |                         self.assertSequenceEqual(rules, ont.rules)
 98 | 
 99 |     def test_path_index_access(self):
100 |         paths = [
101 |             ('vehicle', ),
102 |             ('vehicle', 'car'),
103 |             ('vehicle', 'car', 'Ford'),
104 |             ('vehicle', 'airplane', 'Cessna'),
105 |         ]
106 |         for path in paths:
107 |             with self.subTest(path=path):
108 |                 expected = self.ontology
109 |                 for s in path:
110 |                     expected = expected[s]
111 |                 self.assertIs(expected, self.ontology[path])
112 | 
113 |     def test_discover(self):
114 |         s = 'The vehicle John loves to drive most is his F-150, not his Cessna 172.'
115 |         ito = pawpaw.Ito(s)
116 | 
117 |         discoveries = self.ontology.discover(ito)
118 | 
119 |         vehicles = [*itertools.chain.from_iterable(rule(ito) for rule in self.ontology['vehicle'].rules)]
120 |         self.assertLess(0, len(vehicles))
121 |         self.assertSequenceEqual(vehicles, discoveries['vehicle'].itos)
122 | 
123 |         fords = [*itertools.chain.from_iterable(rule(ito) for rule in self.ontology['vehicle']['car']['Ford'].rules)]
124 |         self.assertLess(0, len(fords))
125 |         self.assertSequenceEqual(fords, discoveries['vehicle']['car']['Ford'].itos)
126 | 
127 |         cessnas = [*itertools.chain.from_iterable(rule(ito) for rule in self.ontology['vehicle']['airplane']['Cessna'].rules)]
128 |         self.assertLess(0, len(cessnas))
129 |         self.assertSequenceEqual(cessnas, discoveries['vehicle']['airplane']['Cessna'].itos)
130 | 


--------------------------------------------------------------------------------
/tests/query/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/query/__init__.py


--------------------------------------------------------------------------------
/tests/table/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/table/__init__.py


--------------------------------------------------------------------------------
/tests/table/test_table.py:
--------------------------------------------------------------------------------
 1 | import pawpaw
 2 | from tests.util import _TestIto
 3 | 
 4 | import regex
 5 | 
 6 | class TestTable(_TestIto):
 7 |     _test_style_data: list[tuple[str, str]]  = [
 8 |         (
 9 |             'TYPE_1',
10 | """-----+-----+-----
11 |   A  |  B  |  C
12 | -----+-----+-----
13 |  aaa | bbb | ccc
14 | -----+-----+-----"""
15 |         ),
16 |         (
17 |             'TYPE_2',
18 | """-------------------
19 | |  A  |  B  |  C  |
20 | |-----------------|
21 | | aaa | bbb | ccc |
22 | -------------------"""
23 |         ),
24 |     ]
25 | 
26 | 
27 |     def test_named_styles(self) -> None:
28 |         for style_name, data in self._test_style_data:
29 |             style: pawpaw.table.StyledTable = getattr(pawpaw.table.styles, style_name)
30 |             table = pawpaw.table.StyledTable(style)
31 |             for leading_trailing_crs in (False, True):
32 |                 indents = ['']
33 |                 if style.equi_distant_indent:
34 |                     indents.extend((' ', '\t', '  ', '\t '))
35 | 
36 |                 for indent in indents:
37 |                     with self.subTest(style=style_name, leading_trailing_crs=leading_trailing_crs, indent=indent):
38 |                         indented_data = '\n'.join([indent + line for line in data.split('\n')])
39 |                         indented_data = pawpaw.Ito(indented_data)
40 | 
41 |                         if indent == '':
42 |                             self.assertEqual(data, str(indented_data))
43 | 
44 |                         if style.equi_distant_indent:
45 |                             ed = pawpaw.table.StyledTable._re_equi_ident
46 |                             itor = pawpaw.arborform.Extract(ed)
47 |                             edr = [*itor(indented_data)]
48 |                             self.assertEqual(1, len(edr))
49 |                             self.assertEqual(str(indented_data), str(edr[0]))
50 |                         else:
51 |                             self.assertEqual(data, str(indented_data))
52 | 
53 |                         crd_data = pawpaw.Ito(f'\n{indented_data}\n') if leading_trailing_crs else indented_data
54 | 
55 |                         itor = table.get_itor()
56 |                         itos = list(itor(crd_data))
57 |                         self.assertIsNotNone(itos)
58 |                         self.assertEqual(1, len(itos))
59 | 
60 |                         ito = itos[0]
61 |                         self.assertEqual('table', ito.desc)
62 | 
63 |                         rows = [*ito.find_all('*[d:row]')]
64 |                         self.assertEqual(2, len(rows))
65 |                         self.assertTrue(all(i.desc == 'row' for i in rows))
66 | 


--------------------------------------------------------------------------------
/tests/test_group_keys.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import GroupKeys
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestGroupKeys(_TestIto):
 7 |     def test_preferred(self):
 8 |         for pat in r'.', r'(?P<key1>.).(?P<key2>.)':
 9 |             with self.subTest(re=pat):
10 |                 re = regex.compile(pat)
11 |                 pgks = GroupKeys.preferred(re)
12 |                 self.assertEqual(re.groups + 1, len(pgks))
13 |                 for i, gk in enumerate(pgks):
14 |                     if isinstance(gk, str):
15 |                         self.assertEqual(i, re.groupindex[gk])
16 |                     else:
17 |                         self.assertEqual(i, gk)
18 | 
19 |     def test_validate(self):
20 |         re = regex.compile(r'(?P<key1>.).(?P<key2>.)')
21 |         
22 |         for valid_gks in [[0], [0, 1, 2], ['key1'], ['key1', 'key2'], [0, 'key2'], GroupKeys.preferred(re)]:
23 |             with self.subTest(group_keys=valid_gks):
24 |                 GroupKeys.validate(re, valid_gks)
25 | 
26 |         for invalid_gks in [[-1], [0, 3], ['xyz'], ['key1', 'key1'], [1, 'key1'], ['key1', 1]]:
27 |             with self.subTest(group_keys=invalid_gks):
28 |                 with self.assertRaises(ValueError):
29 |                     GroupKeys.validate(re, invalid_gks)
30 | 


--------------------------------------------------------------------------------
/tests/test_invoke_func.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | import typing
 3 | 
 4 | import pawpaw
 5 | from tests.util import _TestIto
 6 | 
 7 | 
 8 | def arg_only_func(a: bool, b: int) -> typing.Dict[str, typing.Any]:
 9 |     return {'a': a, 'b': b}
10 | 
11 | 
12 | def arg_kwonlyargs_func(a: bool, b: int = 1, c: float = 1.0, d: str = 'd param val') -> typing.Dict[str, typing.Any]:
13 |     return {'a': a, 'b': b, 'c': c, 'd': d}
14 | 
15 | 
16 | def big_func(a: bool, b: int = 1, *args, c: float = 1.0, d: str = 'd param val', **kwargs) -> typing.Dict[str, typing.Any]:
17 |     return {'a': a, 'b': b, '*args': args, 'c': c, 'd': d, '**kwargs': kwargs}
18 | 
19 | 
20 | class TestDescFunc(_TestIto):
21 |     def test_arg_only_func(self):
22 |         for vars in {'a': False, 'b': -1}, {'a': False, 'b': -1, 'c': 1.234}:
23 |             with self.subTest(vars=vars):
24 |                 rv = pawpaw.type_magic.invoke_func(arg_only_func, *vars.values())
25 |                 for k, v in vars.items():
26 |                     if k in ('a', 'b'):
27 |                         self.assertEqual(v, rv[k])
28 |                     else:
29 |                         self.assertNotIn(k, rv.keys())
30 | 
31 |     def test_arg_kwonlyargs_func(self):
32 |         for vars in {'a': True}, \
33 |                     {'a': False, 'b': -1}, \
34 |                     {'a': False, 'b': -1, 'c': 1.234}, \
35 |                     {'a': False, 'b': -1, 'c': 1.234, 'd': 'd-value'}:
36 |             with self.subTest(vars=vars):
37 |                 rv = pawpaw.type_magic.invoke_func(arg_kwonlyargs_func, *vars.values())
38 |                 for k, v in vars.items():
39 |                     self.assertEqual(v, rv[k])
40 | 
41 |     def test_args(self):
42 |         for vars in {'a': True}, {'a': False, 'b': -1}:
43 |             with self.subTest(vars=vars):
44 |                 rv = pawpaw.type_magic.invoke_func(big_func, *vars.values())
45 |                 for k, v in vars.items():
46 |                     self.assertEqual(v, rv[k])
47 | 
48 |     def test_args_kwargs(self):
49 |         for vars in {'a': True, 'c': 1.234}, {'a': False, 'b': -1, 'c': 5.678}:
50 |             with self.subTest(vars=vars):
51 |                 rv = pawpaw.type_magic.invoke_func(big_func, *vars.values())
52 |                 for k, v in vars.items():
53 |                     self.assertEqual(v, rv[k])
54 | 


--------------------------------------------------------------------------------
/tests/test_span.py:
--------------------------------------------------------------------------------
 1 | from pawpaw import Span, Ito
 2 | from tests.util import _TestIto
 3 | 
 4 | 
 5 | class TestSpan(_TestIto):
 6 |     def test_from_indices_valid(self):
 7 |         for s in '', ' ', ' abc ':
 8 |             for start in (-100, -1, None, 0, 1, 100):
 9 |                 for stop in (-100, -1, None, 0, 1, 100):
10 |                     if len(s) == 0:
11 |                         ito = Ito(s)
12 |                     elif len(s) == 1:
13 |                         ito = Ito(s, 1)
14 |                     else:
15 |                         ito = Ito(s, 1, -1)
16 | 
17 |                     for basis in s, ito:
18 |                         with self.subTest(basis=basis, start=start, stop=stop):
19 |                             _slice = slice(start, stop)
20 |                             expected = basis[_slice]
21 |                             span = Span.from_indices(basis, start, stop)
22 |                             actual = basis[slice(*span)]
23 |                             self.assertEqual(expected, actual)
24 | 
25 |     def test_from_indices_invalid_base(self):
26 |         for basis in [None, 1.0]:
27 |             with self.subTest(basis=basis):
28 |                 with self.assertRaises(TypeError):
29 |                     Span.from_indices(basis)
30 | 
31 |     def test_from_indices_invalid_indices(self):
32 |         s = 'abc'
33 |         for k, v in {'start': 1.0, 'stop': 1.0}.items():
34 |             with self.subTest(basis=s, **{k: v}):
35 |                 with self.assertRaises(TypeError):
36 |                     Span.from_indices(s, **{k: v})
37 |                     
38 |     def test_offset(self):
39 |         s = 'abc'
40 |         for basis in s, Ito(s, 1, -1):
41 |             for i in -100, -1, 0, 1, 100:
42 |                 with self.subTest(basis=basis, i=i):
43 |                     span = Span.from_indices(basis)
44 |                     if (span.start + i < 0) or (span.stop + i < 0):
45 |                         with self.assertRaises(ValueError):
46 |                             span.offset(i)
47 |                     else:
48 |                         rv = span.offset(i)
49 |                         self.assertEqual(span.start + i, rv.start)
50 |                         self.assertEqual(span.stop + i, rv.stop)
51 | 


--------------------------------------------------------------------------------
/tests/test_type_magic.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | import typing
  3 | 
  4 | import pawpaw
  5 | from tests.util import _TestIto
  6 | 
  7 | 
  8 | class Foo:
  9 |     ...
 10 |     
 11 | 
 12 | class FooDerived(Foo):
 13 |     ...
 14 | 
 15 | 
 16 | T_RET = bool
 17 | 
 18 | T_P1 = str
 19 | T_P2 = int | None
 20 | T_P3 = Foo
 21 | T_P4 = list[int]
 22 | 
 23 | F_EXACT = typing.Callable[[T_P1, T_P2, T_P3, T_P4], T_RET]
 24 | F_UNION_ELEMENT = typing.Callable[[T_P1, int, T_P3, T_P4], T_RET]
 25 | F_SUBTYPE = typing.Callable[[T_P1, T_P2, FooDerived, T_P4], T_RET]
 26 | F_NON_GENERIC = typing.Callable[[T_P1, T_P2, T_P3, list], T_RET]
 27 | F_INVALID_GENERIC = typing.Callable[[T_P1, T_P2, T_P3, list[str]], T_RET]
 28 | F_TOO_FEW = typing.Callable[[T_P1, T_P2, T_P3], T_RET]
 29 | F_TOO_MANY = typing.Callable[[T_P1, T_P2, T_P3, T_P4, bool], T_RET]
 30 | F_WRONG_RET = typing.Callable[[T_P1, T_P2, T_P3, T_P4], str]
 31 | 
 32 | 
 33 | def def_dir_w_type_hints(a: str, b: int | None, c: Foo, d: list[int]) -> bool:
 34 |     return True
 35 | 
 36 | 
 37 | def def_dir_wo_type_hints(a, b, c, d):
 38 |     return True
 39 | 
 40 | 
 41 | def def_indir_w_type_hints(a: T_P1, b: T_P2, c: T_P3, d: T_P4) -> T_RET:
 42 |     return True
 43 | 
 44 | 
 45 | def def_dir_subtype_w_type_hints(a: T_P1, b: T_P2, c: FooDerived, d: T_P4) -> T_RET:
 46 |     return True
 47 | 
 48 | 
 49 | @dataclass
 50 | class _TestData:
 51 |     name: str
 52 |     type_hints: bool
 53 |     subtype: bool
 54 |     functoid: typing.Callable
 55 |         
 56 |         
 57 | class TestTypeMagic(_TestIto):
 58 |     @classmethod
 59 |     def cls_m_w_type_hints(cls, a: T_P1, b: T_P2, c: T_P3, d: T_P4) -> T_RET:
 60 |         return True
 61 |     
 62 |     @classmethod
 63 |     def cls_m_wo_type_hints(cls, a, b, c, d):
 64 |         return True
 65 |     
 66 |     def inst_m_w_type_hints(self, a: T_P1, b: T_P2, c: T_P3, d: T_P4) -> T_RET:
 67 |         return self is not None
 68 |     
 69 |     def inst_m_wo_type_hints(self, a, b, c, d):
 70 |         return self is not None
 71 | 
 72 |     def setUp(self) -> None:
 73 |         super().setUp()
 74 | 
 75 |         lam_w_type_hints: typing.Callable[[T_P1, T_P2, T_P3, T_P4], T_RET] = lambda a, b, c, d: True
 76 | 
 77 |         lam_wo_type_hints = lambda a, b, c, d: True
 78 | 
 79 |         self.test_data = [
 80 |             _TestData('def direct', True, False, def_dir_w_type_hints),
 81 |             _TestData('def direct', False, False, def_dir_wo_type_hints),
 82 |             _TestData('def indirect', True, False, def_indir_w_type_hints),
 83 |             _TestData('def indirect subtype', True, True, def_dir_subtype_w_type_hints),
 84 |             _TestData('class method', True, False, TestTypeMagic.cls_m_w_type_hints),
 85 |             _TestData('class method', False, False, TestTypeMagic.cls_m_wo_type_hints),
 86 |             _TestData('instance method', True, False, self.inst_m_w_type_hints),
 87 |             _TestData('instance method', False, False, self.inst_m_wo_type_hints),
 88 |             _TestData('lambda', True, False, lam_w_type_hints),
 89 |             _TestData('lambda', False, False, lam_wo_type_hints),
 90 |         ]
 91 | 
 92 |     def test_is_callable_type_or_generic(self):
 93 |         for t in T_RET, T_P1, T_P2, T_P3, T_P4:
 94 |             with self.subTest(type=t):
 95 |                 self.assertFalse(pawpaw.type_magic.is_callable_type_or_generic(t))
 96 | 
 97 |         for t in F_EXACT, F_UNION_ELEMENT, F_SUBTYPE, F_NON_GENERIC, F_INVALID_GENERIC, F_TOO_FEW, F_TOO_MANY, F_WRONG_RET:
 98 |             with self.subTest(type=t):
 99 |                 self.assertTrue(pawpaw.type_magic.is_callable_type_or_generic(t))
100 |     
101 |         for ti in self.test_data:
102 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
103 |                 self.assertFalse(pawpaw.type_magic.is_callable_type_or_generic(ti.functoid))
104 |             
105 |     def test_is_functoid(self):
106 |         for ti in self.test_data:
107 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
108 |                 self.assertTrue(pawpaw.type_magic.is_functoid(ti.functoid))
109 | 
110 |     def test_is_def(self):
111 |         for ti in self.test_data:
112 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
113 |                 self.assertEqual(ti.name.startswith('def'), pawpaw.type_magic.is_def(ti.functoid))
114 | 
115 |     def test_is_lambda(self):
116 |         for ti in self.test_data:
117 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
118 |                 self.assertEqual(ti.name.startswith('lambda'), pawpaw.type_magic.is_lambda(ti.functoid))
119 | 
120 |     def test_is_callable_exact(self):
121 |         for ti in self.test_data:
122 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
123 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_EXACT))
124 | 
125 |     def test_is_callable_union_element(self):
126 |         for ti in self.test_data:
127 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
128 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_UNION_ELEMENT))
129 | 
130 |     def test_is_callable_subtype(self):
131 |         for ti in self.test_data:
132 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
133 |                 expected = not ti.type_hints or ti.name.startswith('lambda') or ti.subtype
134 |                 self.assertEqual(expected, pawpaw.type_magic.functoid_isinstance(ti.functoid, F_SUBTYPE))
135 | 
136 |     def test_is_callable_non_generic(self):
137 |         for ti in self.test_data:
138 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
139 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_NON_GENERIC))
140 | 
141 |     def test_is_callable_invalid_generic(self):
142 |         for ti in self.test_data:
143 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
144 |                 # Only compares origin (list to list), so everything will pass
145 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_INVALID_GENERIC))
146 | 
147 |     def test_is_callable_wrong_count(self):
148 |         for ti in self.test_data:
149 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
150 |                 self.assertFalse(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_TOO_FEW))
151 |                 self.assertFalse(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_TOO_MANY))
152 | 
153 |     def test_is_callable_wrong_ret(self):
154 |         for ti in self.test_data:
155 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
156 |                 expected = not ti.type_hints or ti.name.startswith('lambda')  # type hints for lambdas don't show in annotations
157 |                 actual = pawpaw.type_magic.functoid_isinstance(ti.functoid, F_WRONG_RET)
158 |                 self.assertEqual(expected, actual)
159 | 
160 |         F_INT_2_NONE = typing.Callable[[int], None]
161 | 
162 |         def no_ret_val_w_type_hints(i: int) -> None:
163 |             return
164 | 
165 |         with self.subTest(type=no_ret_val_w_type_hints.__name__, type_hints=True, subtype=False):
166 |             actual = pawpaw.type_magic.functoid_isinstance(no_ret_val_w_type_hints, F_INT_2_NONE)
167 |             self.assertTrue(actual)
168 | 
169 |         def no_ret_val_wo_type_hints(i):
170 |             return
171 | 
172 |         with self.subTest(type=no_ret_val_wo_type_hints.__name__, type_hints=False, subtype=False):
173 |             actual = pawpaw.type_magic.functoid_isinstance(no_ret_val_wo_type_hints, F_INT_2_NONE)
174 |             self.assertTrue(actual)
175 | 
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
  1 | import pawpaw
  2 | 
  3 | from tests.util import _TestIto
  4 | 
  5 | 
  6 | class TestFindUnescaped(_TestIto):
  7 |     def test_find_unescaped_invalid(self):
  8 |         s = ' abc '
  9 |         for src in s, pawpaw.Ito(s, 1, -1):
 10 |             for chars in [None, '']:
 11 |                 with self.subTest(src=src, chars=chars):
 12 |                     with self.assertRaises((TypeError, ValueError)):
 13 |                         next(pawpaw.find_unescaped(src, chars))
 14 | 
 15 |         for chars in 'a', 'ab':
 16 |             for src in s, pawpaw.Ito(s, 1, -1):
 17 |                 for escape in [None, '', '\\\\']:
 18 |                     with self.subTest(src=src, chars=chars, escape=escape):
 19 |                         with self.assertRaises((TypeError, ValueError)):
 20 |                             next(pawpaw.find_unescaped(src, chars, escape))
 21 | 
 22 |     def test_find_unescaped_trailing_escape(self):
 23 |         chars = 'a'
 24 |         src = 'a\\'
 25 |         with self.subTest(src=src, chars=chars):
 26 |             with self.assertRaises(ValueError):
 27 |                 [*pawpaw.find_unescaped(src, chars)]
 28 |                 
 29 |     def test_find_unescaped_empty_src(self):
 30 |         s = ''
 31 |         chars = 'a'
 32 |         for src in s, pawpaw.Ito(s), pawpaw.Ito('ab', 1, -1):
 33 |             with self.subTest(src=src, chars=chars):
 34 |                 i = next(pawpaw.find_unescaped(src, chars), None)
 35 |                 self.assertIsNone(i)
 36 | 
 37 |     def test_find_unescaped_not_present(self):
 38 |         s = ' abc '
 39 |         chars = 'z'
 40 |         for src in s, pawpaw.Ito(s), pawpaw.Ito(s, 1, -1):
 41 |             with self.subTest(src=src, chars=chars):
 42 |                 i = next(pawpaw.find_unescaped(src, chars), None)
 43 |                 self.assertIsNone(i)
 44 | 
 45 |     def test_find_unescaped_multiple(self):
 46 |         for string in 'a', 'b', 'ab', 'abc', 'bac':
 47 |             for chars in 'a', 'ab', 'cb':
 48 |                 with self.subTest(src=string, chars=chars):
 49 |                     expected = [i for i, c in enumerate(string) if c in chars]
 50 |                     actual = [*pawpaw.find_unescaped(string, chars)]
 51 |                     self.assertListEqual(expected, actual)
 52 | 
 53 |     def test_find_unescaped_simple(self):
 54 |         for string in 'a', 'b':
 55 |             chars = 'a'
 56 |             for pre in range(1, 5):
 57 |                 s = '\\' * pre + string
 58 |                 with self.subTest(src=s, chars=chars):
 59 |                     if pre & 1:  # odd
 60 |                         expected = []
 61 |                     else:        # even
 62 |                         expected = s.find(chars)
 63 |                         expected = [] if expected == -1 else [expected]
 64 |                     actual = [*pawpaw.find_unescaped(s, chars)]
 65 |                     self.assertListEqual(expected, actual)
 66 |     
 67 |     def test_find_unescaped_complex(self):
 68 |         s = ' a&b&&c '
 69 |         escape = '&'
 70 |         for src in s, pawpaw.Ito(s, 1, -1):
 71 |             for chars in 'a', 'b', 'c':
 72 |                 with self.subTest(src=src, chars=chars, escape=escape):
 73 |                     if chars == 'b':
 74 |                         expected = []
 75 |                     else:
 76 |                         expected = [str(src).find(chars)]
 77 |                     actual = [*pawpaw.find_unescaped(src, chars, escape)]
 78 |                     self.assertListEqual(expected, actual)
 79 | 
 80 | 
 81 | class TestSplitUnescaped(_TestIto):
 82 |     def test_split_unescaped_complex(self):
 83 |         s = ' a&b&&c '
 84 |         escape = '&'
 85 |         for src in s, pawpaw.Ito(s, 1, -1):
 86 |             for chars in 'a', 'b', 'c':
 87 |                 with self.subTest(src=src, chars=chars, escape=escape):
 88 |                     if chars == 'b':
 89 |                         expected = [src]
 90 |                     else:
 91 |                         i = str(src).index(chars)
 92 |                         expected = [src[:i], src[i + 1:]]
 93 |                     actual = [*pawpaw.split_unescaped(src, chars, escape)]
 94 |                     self.assertListEqual(expected, actual)
 95 | 
 96 |     def test_split_unescaped_prefix_suffix(self):
 97 |         s = 'aba'
 98 |         for src in s, pawpaw.Ito(s):
 99 |             for chars in 'a', 'b', 'c':
100 |                 with self.subTest(src=src, chars=chars):
101 |                     if isinstance(src, str):
102 |                         expected = src.split(chars)
103 |                     elif chars == 'a':
104 |                         expected = [src[0:0], src[1:1+1], src[3:3]]
105 |                     elif chars == 'b':
106 |                         expected = [src[0:1], src[2:3]]
107 |                     else:  # chars == 'c'
108 |                         expected = [src]
109 |                     actual = [*pawpaw.split_unescaped(src, chars)]
110 |                     self.assertListEqual(expected, actual)
111 | 
112 | 
113 | class TestFindBalanced(_TestIto):
114 |     def test_find_balanced_differing(self):
115 |         lchar = '('
116 |         rchar = ')'
117 |         balanced_segments = [r'(\))', r'(\()', '()', '(a)', '(a(b))', '()', '(123(abc)(def)456)']
118 | 
119 |         for b in [*balanced_segments]:
120 |             with self.subTest(src=b, lchar=lchar, rchar=rchar):
121 |                 actual = next(pawpaw.find_balanced(b, lchar, rchar))
122 |                 self.assertEqual(b, actual)
123 | 
124 |             b = pawpaw.Ito(b)
125 |             lcito = pawpaw.Ito(lchar)
126 |             rcito = pawpaw.Ito(rchar)
127 |             with self.subTest(src=b, lchar=lcito, rchar=rcito):
128 |                 actual = next(pawpaw.find_balanced(b, lcito, rcito))
129 |                 self.assertEqual(b, actual)
130 | 
131 |             b = pawpaw.Ito(f'({b})')
132 |             lcito = pawpaw.Ito(lchar)
133 |             rcito = pawpaw.Ito(rchar)
134 |             with self.subTest(src=b, lchar=lcito, rchar=rcito, start=1, stop=-1):
135 |                 actual = next(pawpaw.find_balanced(b, lcito, rcito, start=1, stop=-1))
136 |                 self.assertEqual(b[1:-1], actual)
137 | 
138 |         b = ''.join(balanced_segments)
139 |         with self.subTest(src=b, lchar=lchar, rchar=rchar):
140 |             actual = [*pawpaw.find_balanced(b, lchar, rchar)]
141 |             self.assertListEqual(balanced_segments, actual)
142 | 
143 |     def test_find_balanced_homogenous(self):
144 |         lchar = '"'
145 |         rchar = '"'
146 |         tokens = ("A", "B", "C")
147 |         src = ' '.join(f'"{t}"' for t in tokens)
148 | 
149 |         with self.subTest(src=src, lchar=lchar, rchar=rchar):
150 |             actual = [*pawpaw.find_balanced(src, lchar, rchar)]
151 |             self.assertListEqual([f'"{t}"' for t in tokens], [str(i) for i in actual])
152 | 


--------------------------------------------------------------------------------
/tests/test_version.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pawpaw
 4 | 
 5 | 
 6 | class TestVersion(unittest.TestCase):
 7 | 
 8 |     _valid_versions = [
 9 |         '1.dev0',
10 |         '1.0.dev456',
11 |         '1.0a1',
12 |         '1.0a2.dev456',
13 |         '1.0a12.dev456',
14 |         '1.0a12',
15 |         '1.0b1.dev456',
16 |         '1.0b2',
17 |         '1.0b2.post345.dev456',
18 |         '1.0b2.post345',
19 |         '1.0rc1.dev456',
20 |         '1.0rc1',
21 |         '1.0',
22 |         '1.0+abc.5',
23 |         '1.0+abc.7',
24 |         '1.0+5',
25 |         '1.0.post456.dev34',
26 |         '1.0.post456',
27 |         '1.0.15',
28 |         '1.1.dev1',
29 |     ]
30 |     # Taken from https://peps.python.org/pep-0440/#summary-of-permitted-suffixes-and-relative-ordering
31 | 
32 |     def test_is_canonical_valid(self):
33 |         for v in self._valid_versions:
34 |             with self.subTest(version=v):
35 |                 self.assertTrue(pawpaw.Version.is_canonical(v))
36 | 
37 |     _invalid_versions = [
38 |         '1.0a',
39 |         '1.0dev0',
40 |         '1.0post',
41 |         '1.0d1',
42 |     ]
43 | 
44 |     def test_is_canonical_invalid(self):
45 |         for v in self._invalid_versions:
46 |             with self.subTest(version=v):
47 |                 self.assertFalse(pawpaw.Version.is_canonical(v))
48 | 
49 |     def test_version_parse_re(self):
50 |         v = '1.2a34.dev567+xyz.8'
51 |         m = pawpaw.Version.parse_re.fullmatch(v)
52 |         self.assertIsNotNone(m)
53 |         ito = pawpaw.Ito.from_match(m)[0]
54 | 
55 |         tests: list[tuple[str]] = [
56 |             ('release', '*[d:release]', '1.2'),
57 | 
58 |             ('pre', '*[d:pre]', 'a34'),
59 |             ('pre_l', '*[d:pre]/*[d:pre_l]', 'a'),
60 |             ('pre_n', '*[d:pre]/*[d:pre_n]', '34'),
61 | 
62 |             ('dev', '*[d:dev]', '.dev567'),
63 |             ('dev_l', '*[d:dev]/*[d:dev_l]', 'dev'),
64 |             ('dev_n', '*[d:dev]/*[d:dev_n]', '567'),
65 | 
66 |             ('local', '*[d:local]', 'xyz.8'),
67 |         ]
68 | 
69 |         for name, path, expected in tests:
70 |             with self.subTest(component=name):
71 |                 val = ito.find(path)
72 |                 self.assertEqual(expected, str(val))
73 | 


--------------------------------------------------------------------------------
/tests/test_xml_helper.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | # Force Python XML parser, not faster C version so that we can hook methods
  3 | sys.modules['_elementtree'] = None
  4 | import xml.etree.ElementTree as ET
  5 | 
  6 | import pawpaw
  7 | import pawpaw.xml as xml
  8 | from tests.util import _TestIto, XML_TEST_SAMPLES
  9 | 
 10 | class TestQualifiedName(_TestIto):
 11 |     def test_from_src(self):
 12 |         for s in 'a', 'a:b':
 13 |             ito = pawpaw.Ito(s)
 14 |             parts = ito.str_split(':')
 15 |             if len(parts) == 1:
 16 |                 parts.insert(0, None)
 17 |             expected = xml.QualifiedName(*parts)
 18 | 
 19 |             with self.subTest(src=ito):
 20 |                 actual = xml.QualifiedName.from_src(ito)
 21 |                 self.assertEqual(expected, actual)
 22 | 
 23 |             with self.subTest(src=s):
 24 |                 actual = xml.QualifiedName.from_src(s)
 25 |                 self.assertEqual(expected, actual)
 26 | 
 27 | 
 28 | class TestXmlHelper(_TestIto):
 29 |     def test_get_qualified_name(self):
 30 |         pass
 31 | 
 32 |     def test_get_xmlns(self):
 33 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 34 |             with self.subTest(xml_sample_index=sample_index):
 35 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 36 |                 xmlns = xml.XmlHelper.get_xmlns(root)
 37 |                 xmlns = {str(k.local_part): str(v) for k, v in xmlns.items()}
 38 |                 if sample.default_namespace is None:
 39 |                     self.assertIsNone(xmlns.get('xmlns'))
 40 |                 else:
 41 |                     self.assertLessEqual({'xmlns': sample.default_namespace[1:-1]}.items(), xmlns.items())
 42 |                 self.assertLessEqual(sample.root_prefix_map.items(), xmlns.items())
 43 | 
 44 |     def test_get_prefix_map_root(self):
 45 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 46 |             with self.subTest(xml_sample_index=sample_index):
 47 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 48 |                 self.assertDictEqual(sample.root_prefix_map, xml.XmlHelper.get_prefix_map(root))
 49 | 
 50 |     def test_get_prefix_map_composite(self):
 51 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 52 |             with self.subTest(xml_sample_index=sample_index):
 53 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 54 |                 actual = xml.XmlHelper.get_prefix_map(root)
 55 |                 self.assertEqual(sample.root_prefix_map, actual)
 56 | 
 57 |                 actual = {}
 58 |                 for e in root.findall('.//'):
 59 |                     actual |= xml.XmlHelper.get_prefix_map(e)
 60 |                 self.assertDictEqual(sample.descendants_composite_prefix_map, actual)
 61 | 
 62 |     def test_get_default_namespace(self):
 63 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 64 |             depth = 0
 65 |             element = ET.fromstring(sample.xml, xml.XmlParser())
 66 |             while element is not None:
 67 |                 with self.subTest(xml_sample_index=sample_index, depth=depth):
 68 |                     if sample.default_namespace is None:
 69 |                         self.assertIsNone(xml.XmlHelper.get_default_namespace(element))
 70 |                     else:
 71 |                         self.assertEqual(sample.default_namespace, str(xml.XmlHelper.get_default_namespace(element)))
 72 |                 depth += 1
 73 |                 element = element.find('*')
 74 | 
 75 |     def test_get_element_text_if_found(self):
 76 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 77 |             path = sample.text_containing_descendant_path
 78 |             with self.subTest(xml_sample_index=sample_index, path=path):
 79 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 80 |                 expected = root.find(path).text
 81 |                 actual = xml.XmlHelper.get_element_text_if_found(root, path)
 82 |                 self.assertEqual(expected, actual)
 83 | 
 84 |             invalid_path = path + '/.[tag=""]'  # ensures path returns nothing
 85 |             with self.subTest(xml_sample_index=sample_index, path=invalid_path):
 86 |                 actual = xml.XmlHelper.get_element_text_if_found(root, invalid_path)
 87 |                 self.assertIsNone(actual)
 88 | 
 89 |     def test_get_parent_element(self):
 90 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 91 |             root = ET.fromstring(sample.xml, xml.XmlParser())
 92 |             
 93 |             depth = 0
 94 |             with self.subTest(xml_sample_index=sample_index, depth=depth):
 95 |                 self.assertIsNone(xml.XmlHelper.get_parent_element(root))
 96 | 
 97 |             parent = root
 98 |             while (child := parent.find('*')) is not None:
 99 |                 depth += 1
100 |                 with self.subTest(xml_sample_index=sample_index, depth=depth):
101 |                     actual = xml.XmlHelper.get_parent_element(child)
102 |                     self.assertIs(parent, actual)
103 |                 parent = child
104 | 
105 |     def test_reverse_find(self):
106 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
107 |             desc_path, anc_pred = sample.descendant_path_with_ancestor_predicate
108 |             with self.subTest(xml_sample_index=sample_index, descendant_path=desc_path, ancestor_predicate=anc_pred):
109 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
110 | 
111 |                 desc = root.find(desc_path)
112 |                 self.assertIsNotNone(desc)
113 | 
114 |                 actual = xml.XmlHelper.reverse_find(desc, anc_pred)
115 |                 self.assertIsNotNone(actual)
116 | 


--------------------------------------------------------------------------------
/tests/test_xml_parser.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | # Force Python XML parser, not faster C version so that we can hook methods
  3 | sys.modules['_elementtree'] = None
  4 | import xml.etree.ElementTree as ET
  5 | import html
  6 | import itertools
  7 | 
  8 | from pawpaw import Ito, Span, xml
  9 | from tests.util import _TestIto, XML_TEST_SAMPLES
 10 | 
 11 | 
 12 | class TestXmlParser(_TestIto):
 13 |     def test_basic(self):
 14 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 15 |             with self.subTest(xml_sample_index=sample_index):
 16 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 17 |                 for e in itertools.chain([root_e], root_e.iterfind('**')):
 18 |                     with self.subTest(element=e):
 19 |                         self.assertTrue(hasattr(e, 'ito'))
 20 |                         i = e.ito
 21 |                         self.assertEqual(xml.descriptors.ELEMENT, i.desc)
 22 |                         self.assertIs(e, i.value())
 23 | 
 24 |     def test_attributes(self):
 25 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 26 |             with self.subTest(xml_sample_index=sample_index):
 27 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 28 |                 for e in itertools.chain([root_e], root_e.iterfind('**')):
 29 |                     with self.subTest(element=e):
 30 |                         i = e.ito.find(f'*[d:{xml.descriptors.START_TAG}]/*[d:{xml.descriptors.ATTRIBUTES}]')
 31 |                         if i is None:
 32 |                             self.assertEqual(0, len(e.attrib.keys()))
 33 |                         else:
 34 |                             self.assertIs(e.attrib, i.value())
 35 |                             xmlns_attrs = xml.XmlHelper.get_xmlns(e)
 36 |                             non_xmlns_attrs_count = len(i.children) - len(xmlns_attrs.keys())
 37 |                             self.assertEqual(len(e.attrib.keys()), non_xmlns_attrs_count)
 38 | 
 39 |     def test_namespace(self):
 40 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 41 |             with self.subTest(xml_sample_index=sample_index):
 42 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 43 | 
 44 |                 start_tag = root_e.ito.find(f'**[d:' + xml.descriptors.START_TAG + ']')
 45 |                 self.assertIsNotNone(start_tag)
 46 | 
 47 |                 for attr in start_tag.find_all(f'**[d:' + xml.descriptors.ATTRIBUTE + ']'):
 48 |                     if attr is not None:
 49 |                         expected = [xml.descriptors.TAG, xml.descriptors.VALUE]
 50 |                         self.assertListEqual(expected, [i.desc for i in attr.children])
 51 | 
 52 |                         expected = [xml.descriptors.NAME]
 53 |                         tag = attr.children[0]
 54 |                         if tag.str_find(':') >= 0:
 55 |                             expected.insert(0, xml.descriptors.NAMESPACE)
 56 |                         self.assertListEqual(expected, [i.desc for i in tag.children])
 57 | 
 58 |     def test_hierarchical(self):
 59 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 60 |             with self.subTest(xml_sample_index=sample_index):
 61 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 62 | 
 63 |                 root_i: Ito = root_e.ito
 64 |                 self.assertIsNotNone(root_i)
 65 |                 self.assertIs(root_e, root_i.value())
 66 | 
 67 |                 for child_e in root_e.findall('.//'):
 68 |                     child_i = child_e.ito
 69 |                     self.assertIsNotNone(child_i)
 70 |                     self.assertIs(child_e, child_i.value())
 71 | 
 72 |     def test_values(self):
 73 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 74 |             with self.subTest(xml_sample_index=sample_index):
 75 |                 root = ET.fromstring(sample.xml, parser=xml.XmlParser()).ito
 76 |                 for ito in root.find_all('**!![d:' + ','.join((xml.descriptors.ELEMENT, xml.descriptors.TAG)) + ']'):
 77 |                     desc = ito.desc
 78 |                     with self.subTest(ito_desc=desc, ito_span=ito.span):
 79 |                         if desc == xml.descriptors.ELEMENT:
 80 |                             self.assertIsInstance(ito.value(), ET.Element)
 81 |                         elif desc == xml.descriptors.TAG:
 82 |                             self.assertIsInstance(ito.value(), xml.QualifiedName)
 83 | 
 84 |     def test_tails(self):
 85 |         # xml fragment taken from https://docs.python.org/3/library/xml.etree.elementtree.html
 86 |         fragment = '<a><b>1<c>2<d/>3</c></b>4</a>'
 87 |         root = ET.fromstring(fragment, parser=xml.XmlParser())
 88 |         for descendant in root.findall('.//'):
 89 |             next_sibling = descendant.ito.find('>')
 90 |             if descendant.tail is None:
 91 |                 self.assertTrue(next_sibling is None or next_sibling.desc != xml.descriptors.TEXT)
 92 |             else:
 93 |                 self.assertEqual(descendant.tail, str(next_sibling))
 94 | 
 95 |     def test_self_closings(self):
 96 |         fragment = '<td><div><span></span><br/></div></td>'
 97 |         root = ET.fromstring(fragment, parser=xml.XmlParser()).ito
 98 | 
 99 |         start_tags = list(root.find_all('**!![d:' + xml.descriptors.START_TAG + ']/*[d:' + xml.descriptors.TAG + ']'))
100 |         self.assertSequenceEqual(('td', 'div', 'span', 'br'), tuple(str(st) for st in start_tags))
101 | 
102 |         end_tags = list(root.find_all('**!![d:' + xml.descriptors.END_TAG + ']/*[d:' + xml.descriptors.TAG + ']'))
103 |         self.assertSequenceEqual(('span', 'div', 'td'), tuple(str(et) for et in end_tags))
104 |         
105 |         span = root.find('**![s:<\/span>]')
106 |         self.assertIsNotNone(span)
107 |         self.assertEqual(xml.descriptors.END_TAG, span.desc)
108 | 
109 |     def test_xml_entity_references(self):
110 |         # Ensure that entity references (e.g., "&amp;") don't cause issues with span computations and Ito construction
111 |         sample = \
112 | """
113 | <root>
114 |     beans &amp; franks
115 |     <a>1 &lt; 2</a>
116 |     <b type="R&amp;B"/>
117 |     Q&amp;A
118 | </root>"""
119 |         root = ET.fromstring(sample, parser=xml.XmlParser())
120 | 
121 |         # First make sure our xml looks correct with de-escaped references for its text & tails
122 |         self.assertEqual(root.text.strip(), html.unescape('beans & franks'))
123 |         self.assertEqual(root[0].text, html.unescape('1 < 2'))
124 |         self.assertEqual(root[-1].attrib['type'], html.unescape('R&B'))
125 |         self.assertEqual(root[-1].tail.strip(), html.unescape('Q&A'))
126 | 
127 |         # Now compare html escaped xml text & tails to corresponding Itos
128 |         self.assertEqual(html.escape(root.text), root.ito.find(f'*[d:{xml.descriptors.TEXT}]').__str__())
129 |         self.assertEqual(html.escape(root[0].text), root[0].ito.find(f'*[d:{xml.descriptors.TEXT}]').__str__())
130 |         self.assertEqual(html.escape(root[-1].attrib['type']), root[-1].ito.find(f'**[d:{xml.descriptors.ATTRIBUTE}]/*[d:{xml.descriptors.VALUE}]').__str__())
131 |         self.assertEqual(html.escape(root[-1].tail), root.ito.find(f'-*[d:{xml.descriptors.TEXT}]').__str__())
132 | 
133 |     def test_xml_comments(self):
134 |         # Ensure that encoded text (e.g., "&amp;") doesn't cause problems with span computations
135 |         comment = '<!-- comment with encoded text: &amp; -->'
136 |         text = 'Here is some text'
137 |         sample = '<a>' + comment + text + '</a>'
138 | 
139 |         # root = ET.fromstring(sample, parser=ET.XMLParser(target=ET.TreeBuilder(insert_comments=True)))
140 |         root = ET.fromstring(sample, parser=xml.XmlParser())
141 |         self.assertEqual(root.text, text)
142 | 
143 |         text_ito = root.ito.find(f'*[d:{xml.descriptors.TEXT}]')
144 |         self.assertIsNotNone(text_ito)
145 |         self.assertEqual(comment + text, str(text_ito))
146 | 


--------------------------------------------------------------------------------
/tests/util.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import collections.abc
  3 | import random
  4 | import typing
  5 | from unittest import TestCase
  6 | 
  7 | import regex
  8 | from pawpaw import Span, Ito, Types
  9 | 
 10 | 
 11 | class IntIto(Ito):  # Used for derived class tests
 12 |     def value(self) -> typing.Any:
 13 |         return int(str(self))
 14 |     
 15 | 
 16 | class _TestIto(TestCase):
 17 |     @classmethod
 18 |     def add_chars_as_children(cls, ito: Ito, desc: str | None) -> None:
 19 |         ito.children.add(*(ito.clone(i, i + 1, desc) for i in range(*ito.span)))
 20 | 
 21 |     def matches_equal(self, first: regex.Match, second: regex.Match, msg: typing.Any = ...) -> None:
 22 |         if first is second:
 23 |             return
 24 |         
 25 |         self.assertListEqual([*first.regs], [*second.regs])
 26 |         self.assertEqual(first.group(0), second.group(0))
 27 |         self.assertSequenceEqual(first.groupdict().keys(), second.groupdict().keys())
 28 |         for v1, v2 in zip(first.groupdict().values(), second.groupdict().values()):
 29 |             self.assertEqual(v1, v2)
 30 |             
 31 |     def setUp(self) -> None:
 32 |         self.addTypeEqualityFunc(regex.Match, self.matches_equal)
 33 | 
 34 |         
 35 | class RandSpans:
 36 |     def __init__(
 37 |             self,
 38 |             size: Span = (1, 1),
 39 |             gap: Span = (0, 0),
 40 |     ):
 41 |         if not (isinstance(size, tuple) and len(size) == 2 and all(isinstance(i, int) for i in size)):
 42 |             raise TypeError('invalid \'size\'')
 43 |         if size[0] < 0 or size[1] < 1 or size[0] > size[1]:
 44 |             raise ValueError('invalid \'size\'')
 45 |         self.size = size
 46 | 
 47 |         if not (isinstance(gap, tuple) and len(gap) == 2 and all(isinstance(i, int) for i in gap)):
 48 |             raise TypeError('invalid \'gap\'')
 49 |         if (gap[0] < 0 and abs(gap[0]) >= size[0]) or (gap[1] < 0 and abs(gap[1]) >= size[0]):
 50 |             raise ValueError('invalid \'gap\'')
 51 |         self.gap = gap
 52 | 
 53 |     def generate(
 54 |             self,
 55 |             basis: int | collections.abc.Sized,
 56 |             start: int | None = None,
 57 |             stop: int | None = None
 58 |     ) -> typing.Iterable[Span]:
 59 |         i, stop = Span.from_indices(basis, start, stop)
 60 |         while i < stop:
 61 |             k = i + random.randint(*self.size)
 62 |             k = min(k, stop)
 63 |             yield Span(i, k)
 64 |             if k == stop:
 65 |                 break
 66 |             i = k + random.randint(*self.gap)
 67 | 
 68 | 
 69 | class RandSubstrings(RandSpans):
 70 |     def __init__(
 71 |             self,
 72 |             size: Span = Span(1, 1),
 73 |             gap: Span = Span(0, 0),
 74 |     ):
 75 |         super().__init__(size, gap)
 76 | 
 77 |     def generate(self, string: str, start: int | None = None, stop: int | None = None) -> typing.Iterable[str]:
 78 |         for span in super().generate(string, start, stop):
 79 |             yield string[slice(*span)]
 80 | 
 81 | 
 82 | class XmlTestSample(typing.NamedTuple):
 83 |     source: str
 84 |     
 85 |     default_namespace: None | str
 86 |     
 87 |     # prefix_map of root node
 88 |     root_prefix_map: typing.Dict[str, str]
 89 |     
 90 |     # combined prefix_map of all non-root nodes
 91 |     descendants_composite_prefix_map: typing.Dict[str, str]
 92 |     
 93 |     # path to arbitrary descendant that has non-empty .text
 94 |     text_containing_descendant_path: str  
 95 |     
 96 |     # path to arbitrary descendant that can be reverse-searched to find an ancestor that matches the predicate
 97 |     descendant_path_with_ancestor_predicate: typing.Tuple[str, str]
 98 | 
 99 |     xml: str
100 | 
101 | 
102 | XML_TEST_SAMPLES: typing.List[XmlTestSample] = [
103 |     XmlTestSample(
104 |         source='https://docs.python.org/3/library/xml.etree.elementtree.html',
105 |         default_namespace=None,
106 |         root_prefix_map={},
107 |         descendants_composite_prefix_map={},
108 |         text_containing_descendant_path='.//year',
109 |         descendant_path_with_ancestor_predicate=('.//gdppc', 'rank'),
110 |         xml=
111 | """
112 | <data>
113 |     <country name="Liechtenstein">
114 |         <rank>1</rank>
115 |         <year>2008</year>
116 |         <gdppc>141100</gdppc>
117 |         <neighbor name="Austria" direction="E"/>
118 |         <neighbor name="Switzerland" direction="W"/>
119 |     </country>
120 |     <country name="Singapore">
121 |         <rank>4</rank>
122 |         <year>2011</year>
123 |         <gdppc>59900</gdppc>
124 |         <neighbor name="Malaysia" direction="N"/>
125 |     </country>
126 |     <country name="Panama">
127 |         <rank>68</rank>
128 |         <year>2011</year>
129 |         <gdppc>13600</gdppc>
130 |         <neighbor name="Costa Rica" direction="W"/>
131 |         <neighbor name="Colombia" direction="E"/>
132 |     </country>
133 | </data>"""
134 |     ),
135 | 
136 |     XmlTestSample(
137 |         source='https://docs.python.org/3/library/xml.etree.elementtree.html',
138 |         default_namespace='{http://people.example.com}',
139 |         root_prefix_map={'fictional': 'http://characters.example.com'},
140 |         descendants_composite_prefix_map={},
141 |         text_containing_descendant_path='.//{http://people.example.com}name',
142 |         descendant_path_with_ancestor_predicate=('.//{http://characters.example.com}character', '{http://people.example.com}actor'),
143 |         xml=
144 | """
145 | <actors xmlns:fictional="http://characters.example.com"
146 |         xmlns="http://people.example.com">
147 |     <actor>
148 |         <name>John Cleese</name>
149 |         <fictional:character>Lancelot</fictional:character>
150 |         <fictional:character>Archie Leach</fictional:character>
151 |     </actor>
152 |     <actor>
153 |         <name>Eric Idle</name>
154 |         <fictional:character>Sir Robin</fictional:character>
155 |         <fictional:character>Gunther</fictional:character>
156 |         <fictional:character>Commander Clement</fictional:character>
157 |     </actor>
158 | </actors>"""
159 |     ),
160 | 
161 |     XmlTestSample(
162 |         source='https://www.xml.com/pub/a/1999/01/namespaces.html',
163 |         default_namespace=None,
164 |         root_prefix_map={'xdc': 'http://www.xml.com/books', 'h': 'http://www.w3.org/HTML/1998/html4'},
165 |         descendants_composite_prefix_map={},
166 |         text_containing_descendant_path='.//{http://www.xml.com/books}author',
167 |         descendant_path_with_ancestor_predicate=('.//{http://www.xml.com/books}date', '@align'),
168 |         xml='''
169 | <h:html xmlns:xdc="http://www.xml.com/books"
170 |         xmlns:h="http://www.w3.org/HTML/1998/html4">
171 |  <h:head><h:title>Book Review</h:title></h:head>
172 |  <h:body>
173 |   <xdc:bookreview>
174 |    <xdc:title>XML: A Primer</xdc:title>
175 |    <h:table>
176 |     <h:tr align="center">
177 |      <h:td>Author</h:td><h:td>Price</h:td>
178 |      <h:td>Pages</h:td><h:td>Date</h:td></h:tr>
179 |     <h:tr align="left">
180 |      <h:td><xdc:author>Simon St. Laurent</xdc:author></h:td>
181 |      <h:td><xdc:price>31.98</xdc:price></h:td>
182 |      <h:td><xdc:pages>352</xdc:pages></h:td>
183 |      <h:td><xdc:date>1998/01</xdc:date></h:td>
184 |     </h:tr>
185 |    </h:table>
186 |   </xdc:bookreview>
187 |  </h:body>
188 | </h:html>'''
189 |     ),
190 | 
191 |     XmlTestSample(
192 |         source='https://www.w3schools.com/xml/xml_namespaces.asp',
193 |         default_namespace=None,
194 |         root_prefix_map={},
195 |         descendants_composite_prefix_map={'h': 'http://www.w3.org/TR/html4/', 'f': 'https://www.w3schools.com/furniture'},
196 |         text_containing_descendant_path='.//{http://www.w3.org/TR/html4/}td',
197 |         descendant_path_with_ancestor_predicate=('.//{https://www.w3schools.com/furniture}length', '{https://www.w3schools.com/furniture}name'),
198 |         xml='''
199 | <root>
200 | 
201 | <h:table xmlns:h="http://www.w3.org/TR/html4/">
202 |   <h:tr>
203 |     <h:td>Apples</h:td>
204 |     <h:td>Bananas</h:td>
205 |   </h:tr>
206 | </h:table>
207 | 
208 | <f:table xmlns:f="https://www.w3schools.com/furniture">
209 |   <f:name>African Coffee Table</f:name>
210 |   <f:width>80</f:width>
211 |   <f:length>120</f:length>
212 | </f:table>
213 | 
214 | </root>'''
215 |     ),
216 | ]
217 | 
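
A hedged usage sketch of the helpers defined above. It assumes pawpaw exports Span at the top level and that this module is importable as tests.util; the sample string and seed are arbitrary illustrations, not part of the test suite.

import random

from pawpaw import Span
from tests.util import RandSubstrings, XML_TEST_SAMPLES

random.seed(0)  # make the randomized output repeatable for this sketch

# Each yielded chunk is 2-4 characters long; consecutive chunks skip 0-1
# characters of the source string (the size/gap Spans feed random.randint).
rs = RandSubstrings(size=Span(2, 4), gap=Span(0, 1))
print(list(rs.generate('The quick brown fox')))

The XML samples can be consumed the way the XML tests presumably do, for example with the standard library's ElementTree:

import xml.etree.ElementTree as ET

sample = XML_TEST_SAMPLES[0]
root = ET.fromstring(sample.xml)
# './/year' for the first sample; the first match is Liechtenstein's year.
print(root.find(sample.text_containing_descendant_path).text)  # 2008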


--------------------------------------------------------------------------------
/tests/visualization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/visualization/__init__.py


--------------------------------------------------------------------------------
/tests/visualization/test_sgr.py:
--------------------------------------------------------------------------------
 1 | from pawpaw.visualization.sgr import *
 2 | from tests.util import _TestIto
 3 | 
 4 | 
 5 | class TestSgr(_TestIto):
 6 |     def test_sgr_reset_all(self):
 7 |         self.assertEqual('\033[0m', RESET_ALL)
 8 |         
 9 |     def test_sgr_encode(self):
10 |         for vals in (0,), (1,), (1,2,3):
11 |             with self.subTest(value=vals):
12 |                 vals_str = ';'.join(str(v) for v in vals)
13 |                 expected = f'\033[{vals_str}m'
14 |                 actual = encode(*vals)
15 |                 self.assertEqual(expected, actual)
16 |                 self.assertFalse(actual.isprintable())
17 | 
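
For context on the values asserted above: SGR sequences are ANSI escape codes of the form ESC [ parameters m. The following is a minimal standalone sketch (it does not use pawpaw) that builds the same strings the test expects:

def sgr_escape(*vals: int) -> str:
    # '\033[' is the CSI introducer; parameters are ';'-separated; 'm' ends the SGR sequence.
    return '\033[' + ';'.join(str(v) for v in vals) + 'm'

assert sgr_escape(0) == '\033[0m'            # reset all attributes (RESET_ALL)
assert sgr_escape(1, 2, 3) == '\033[1;2;3m'  # matches the (1, 2, 3) subtest above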


--------------------------------------------------------------------------------
/tests/visualization/test_visualization_ascii_box.py:
--------------------------------------------------------------------------------
  1 | import typing
  2 | 
  3 | import pawpaw
  4 | import pawpaw.visualization.ascii_box as box
  5 | from tests.util import _TestIto
  6 | 
  7 | directions: typing.List[box.Direction] = [
  8 |     box.Direction.N,
  9 |     box.Direction.NE,
 10 |     box.Direction.E,
 11 |     box.Direction.SE,
 12 |     box.Direction.S,
 13 |     box.Direction.SW,
 14 |     box.Direction.W,
 15 |     box.Direction.NW,
 16 | ]
 17 | 
 18 | class TestDirection(_TestIto):
 19 |     @classmethod
 20 |     def setUpClass(cls) -> None:
 21 |         cls.directions = directions
 22 | 
 23 |     def test_values(self):
 24 |         for i, d in enumerate(self.directions):
 25 |             with self.subTest(direction=d):
 26 |                 self.assertEqual(i * 45, d.value)
 27 | 
 28 |     def test_from_degrees(self):
 29 |         for direction in self.directions:
 30 |             for degrees in -720, -360, 0, 1, 44, 360, 720:
 31 |                 with self.subTest(direction=direction, degrees=degrees):
 32 |                     self.assertEqual(direction, direction.rotate(degrees))
 33 | 
 34 |             for degrees in -1, 45:
 35 |                 with self.subTest(direction=direction, degrees=degrees):
 36 |                     self.assertNotEqual(direction, direction.rotate(degrees))
 37 | 
 38 |     def test_rotate(self):
 39 |         for i, direction in enumerate(self.directions):
 40 |             for degrees in range(0, 360 + 45, 45):
 41 |                 with self.subTest(direction=direction, degrees=degrees):
 42 |                     self.assertEqual(
 43 |                         box.Direction.from_degrees(direction.value + degrees),
 44 |                         direction.rotate(degrees)
 45 |                     )
 46 | 
 47 |     def test_reflect(self):
 48 |         for direction in self.directions:
 49 |             for surface in direction, direction.rotate(180):
 50 |                 with self.subTest(direction=direction, surface=surface):
 51 |                     self.assertEqual(direction, direction.reflect(surface))
 52 | 
 53 |             for surface in direction.rotate(90), direction.rotate(-90):
 54 |                 with self.subTest(direction=direction, surface=surface):
 55 |                     self.assertEqual(direction.rotate(180), direction.reflect(surface))
 56 | 
 57 |             for delta in 45, -45:
 58 |                 surface = direction.rotate(delta)
 59 |                 with self.subTest(direction=direction, surface=surface):
 60 |                     rot = 90 if delta > 0 else -90
 61 |                     self.assertEqual(direction.rotate(rot), direction.reflect(surface))
 62 | 
 63 |             for delta in 135, -135:
 64 |                 surface = direction.rotate(delta)
 65 |                 with self.subTest(direction=direction, surface=surface):
 66 |                     rot = -90 if delta > 0 else 90
 67 |                     self.assertEqual(direction.rotate(rot), direction.reflect(surface))
 68 | 
 69 | 
 70 | class TestAsciiBoxDrawing(_TestIto):
 71 |     @classmethod
 72 |     def setUpClass(cls) -> None:
 73 |         cls.directions = directions
 74 | 
 75 |         rotations = [
 76 |             ['┃', '━', '┃', '━'],
 77 |             ['┍', '┒', '┙', '┖'],
 78 |             ['┡', '┲', '┪', '┹'],
 79 |             ['╔', '╗', '╝', '╚'],
 80 |         ]
 81 |         cls.ninety_degree_rotations: typing.List[typing.List[box.BoxDrawingChar]] = [
 82 |             [box.BoxDrawingChar.from_char(c) for c in rots] for rots in rotations
 83 |         ]
 84 | 
 85 |     def test_chars_unique(self):
 86 |         chars = set(c.char for c in box.BoxDrawingChar._instances)
 87 |         self.assertEqual(len(box.BoxDrawingChar._instances), len(chars))
 88 | 
 89 |     def test_direction_styles_unique(self):
 90 |         dss = set(frozenset((ds.direction, ds.style.weight, ds.style.count, ds.style.dash, ds.style.path) for ds in c.direction_styles) for c in box.BoxDrawingChar._instances)
 91 |         self.assertEqual(len(box.BoxDrawingChar._instances), len(dss))
 92 | 
 93 |     @classmethod
 94 |     def is_corner(cls, bdc: box.BoxDrawingChar) -> bool:
 95 |         if len(bdc.direction_styles) != 2:
 96 |             return False
 97 | 
 98 |         dirs = tuple(ds.direction for ds in bdc.direction_styles)
 99 |         if dirs[0] == box.Direction.N:
100 |             return dirs[1] in (box.Direction.W, box.Direction.E)
101 |         elif dirs[1] == box.Direction.S:
102 |             return dirs[0] in (box.Direction.W, box.Direction.E)
103 |         else:
104 |             return False
105 | 
106 |     def test_from_corners_single_valid(self):
107 |         for bdc in (box.BoxDrawingChar.from_char('╭'),):  # box.BoxDrawingChar._instances:
108 |             with self.subTest(box_drawing_char=bdc):
109 |                 if self.is_corner(bdc):
110 |                     boxer = box.from_corners(bdc.char)
111 |                     boxer = box.from_corners(bdc)
112 |                 else:
113 |                     with self.assertRaises(ValueError):
114 |                         boxer = box.from_corners(bdc.char)
115 |                     with self.assertRaises(ValueError):
116 |                         boxer = box.from_corners(bdc)
117 | 
118 |     def test_corner_combos(self):
119 |         in_outs = (
120 |             (('╔', '╯'), ('╔', '╕', '╙', '╯')),
121 |             (('╚', '╮'), ('╓', '╮', '╚', '╛'))
122 |         )
123 | 
124 |         for ins, output_corners in in_outs:
125 |             for input_corners in ins, ins[::-1]:
126 |                 with self.subTest(input_corners=input_corners):
127 |                     input_corners = [box.BoxDrawingChar.from_char(c) for c in input_corners]
128 |                     boxer = box.from_corners(*input_corners)
129 |                     lines = list(boxer.from_srcs(' '))
130 |                     self.assertEqual(3, len(lines))
131 |                     self.assertEqual(output_corners[0], lines[0][0])
132 |                     self.assertEqual(output_corners[1], lines[0][-1])
133 |                     self.assertEqual(output_corners[2], lines[-1][0])
134 |                     self.assertEqual(output_corners[3], lines[-1][-1])
135 | 
136 |     def test_rotate(self):
137 |         for rots in self.ninety_degree_rotations:
138 |             for i, bdc in enumerate(rots):
139 |                 with self.subTest(box_drawing_char=bdc):
140 |                     j = (i + 1) % 4
141 |                     self.assertEqual(rots[j], bdc.rotate(90))
142 | 
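
A usage sketch reconstructed from test_corner_combos above; treat the call shapes as inferred from the test's expectations rather than documented API:

import pawpaw.visualization.ascii_box as box

# Build a boxer from two opposite corners; from_corners merges their line
# styles, so the remaining corners are synthesized ('╕' and '╙' here, per the
# expected output_corners in test_corner_combos).
corners = [box.BoxDrawingChar.from_char(c) for c in ('╔', '╯')]
boxer = box.from_corners(*corners)

# Drawing around a one-character source yields exactly three lines of text.
for line in boxer.from_srcs(' '):
    print(line)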


--------------------------------------------------------------------------------