├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SECURITY.md ├── docs ├── 0. Introduction.md ├── 1. Segment and Span.md ├── 2. In Text Object.md ├── 3. Visualization.md ├── 4. Arborform.md ├── 5. Traversal & Query.md ├── 6. Xml.md ├── 7. NLP.md ├── 8. Serialization.md ├── Pawpaw Cookbook.md ├── Using Pawpaw with nltk.md └── demos │ ├── Q&A │ ├── description.md │ └── solution.py │ ├── class_grades │ ├── description.md │ ├── input.txt │ ├── parser_compact.py │ ├── parser_verbose.py │ └── solution.py │ ├── compounds │ ├── compound_1.txt │ ├── compound_2.txt │ ├── description.md │ └── solution.py │ ├── gettysburg_address │ └── gettysburg_address.txt │ ├── us_constitution │ ├── description.md │ ├── us_constitution.py │ └── us_constitution.txt │ └── xpath_recursion_depth │ ├── description.md │ ├── solution_1.py │ └── solution_2.py ├── images └── pawpaw.png ├── pawpaw ├── __init__.py ├── _type_magic.py ├── _version.py ├── arborform │ ├── __init__.py │ ├── itorator │ │ ├── __init__.py │ │ ├── desc.py │ │ ├── extract.py │ │ ├── filter.py │ │ ├── invert.py │ │ ├── itorator.py │ │ ├── nuco.py │ │ ├── reflect.py │ │ ├── regex_itorator.py │ │ ├── split.py │ │ └── value_func.py │ └── postorator │ │ ├── __init__.py │ │ ├── postorator.py │ │ ├── stacked_reduce.py │ │ └── windowed_join.py ├── errors.py ├── infix.py ├── ito.py ├── nlp │ ├── __init__.py │ └── nlp.py ├── ontology │ ├── __init__.py │ ├── _query.py │ └── ontology.py ├── query │ ├── __init__.py │ └── _query.py ├── span.py ├── table │ ├── __init__.py │ ├── styles │ │ ├── __init__.py │ │ └── styles.py │ └── table.py ├── util.py ├── visualization │ ├── __init__.py │ ├── ascii_box.py │ ├── highlighter.py │ ├── pepo │ │ ├── __init__.py │ │ └── pepo.py │ └── sgr │ │ ├── __init__.py │ │ ├── palettes │ │ ├── __init__.py │ │ └── palettes.py │ │ └── sgr.py └── xml │ ├── __init__.py │ ├── descriptors.py │ ├── xml_helper.py │ └── xml_parser.py ├── pyproject.toml └── tests ├── __init__.py ├── arborform ├── __init__.py ├── test_connectors.py ├── test_invert.py ├── test_itorator.py ├── test_itorator_desc.py ├── test_itorator_extract.py ├── test_itorator_filter.py ├── test_itorator_reflect.py ├── test_itorator_split.py ├── test_itorator_value_func.py ├── test_nuco.py ├── test_postorator.py └── test_postorator_windowed_join.py ├── ito ├── __init__.py ├── test_child_itos.py ├── test_ito.py ├── test_ito_ctor.py ├── test_ito_descend.py ├── test_ito_regex_equivalence_methods.py ├── test_ito_serialization.py ├── test_ito_str_equivalence_methods.py └── test_ito_utility_methods.py ├── nlp ├── __init__.py └── test_nlp.py ├── ontology ├── __init__.py ├── test_keyed_list.py └── test_ontology.py ├── query ├── __init__.py └── test_query_and_traversal.py ├── table ├── __init__.py └── test_table.py ├── test_group_keys.py ├── test_invoke_func.py ├── test_span.py ├── test_type_magic.py ├── test_util.py ├── test_version.py ├── test_xml_helper.py ├── test_xml_parser.py ├── util.py └── visualization ├── __init__.py ├── test_sgr.py └── test_visualization_ascii_box.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 
12 | 13 | **To Reproduce** 14 | Provide a short, concise script that shows how to reproduce the behavior via: 15 | - Python 16 | - Jupyter Notebook 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Additional Context** 22 | Please indicate the Python and regex versions you are using, e.g. "Python 10.0.4" 23 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Please indicate the Python and regex versions you are using, e.g. "Python 10.0.4" 21 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * Public or private harassment 31 | * Trolling, insulting or derogatory comments, and personal or political attacks 32 | * Publishing others' private information, such as a physical or email 33 | address, without their explicit permission 34 | * Other conduct which could reasonably be considered inappropriate in a 35 | professional setting 36 | 37 | ## Enforcement Responsibilities 38 | 39 | Community leaders are responsible for clarifying and enforcing our standards of 40 | acceptable behavior and will take appropriate and fair corrective action in 41 | response to any behavior that they deem inappropriate, threatening, offensive, 42 | or harmful. 
43 | 44 | Community leaders have the right and responsibility to remove, edit, or reject 45 | comments, commits, code, wiki edits, issues, and other contributions that are 46 | not aligned to this Code of Conduct, and will communicate reasons for moderation 47 | decisions when appropriate. 48 | 49 | ## Scope 50 | 51 | This Code of Conduct applies within all community spaces, and also applies when 52 | an individual is officially representing the community in public spaces. 53 | Examples of representing our community include using an official e-mail address, 54 | posting via an official social media account, or acting as an appointed 55 | representative at an online or offline event. 56 | 57 | ## Enforcement 58 | 59 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 60 | reported to the community leaders responsible for enforcement at 61 | . 62 | All complaints will be reviewed and investigated promptly and fairly. 63 | 64 | All community leaders are obligated to respect the privacy and security of the 65 | reporter of any incident. 66 | 67 | ## Enforcement Guidelines 68 | 69 | Community leaders will follow these Community Impact Guidelines in determining 70 | the consequences for any action they deem in violation of this Code of Conduct: 71 | 72 | ### 1. Correction 73 | 74 | **Community Impact**: Use of inappropriate language or other behavior deemed 75 | unprofessional or unwelcome in the community. 76 | 77 | **Consequence**: A private, written warning from community leaders, providing 78 | clarity around the nature of the violation and an explanation of why the 79 | behavior was inappropriate. A public apology may be requested. 80 | 81 | ### 2. Warning 82 | 83 | **Community Impact**: A violation through a single incident or series 84 | of actions. 85 | 86 | **Consequence**: A warning with consequences for continued behavior. No 87 | interaction with the people involved, including unsolicited interaction with 88 | those enforcing the Code of Conduct, for a specified period of time. This 89 | includes avoiding interactions in community spaces as well as external channels 90 | like social media. Violating these terms may lead to a temporary or 91 | permanent ban. 92 | 93 | ### 3. Temporary Ban 94 | 95 | **Community Impact**: A serious violation of community standards, including 96 | sustained inappropriate behavior. 97 | 98 | **Consequence**: A temporary ban from any sort of interaction or public 99 | communication with the community for a specified period of time. No public or 100 | private interaction with the people involved, including unsolicited interaction 101 | with those enforcing the Code of Conduct, is allowed during this period. 102 | Violating these terms may lead to a permanent ban. 103 | 104 | ### 4. Permanent Ban 105 | 106 | **Community Impact**: Demonstrating a pattern of violation of community 107 | standards, including sustained inappropriate behavior, harassment of an 108 | individual, or aggression toward or disparagement of classes of individuals. 109 | 110 | **Consequence**: A permanent ban from any sort of public interaction within 111 | the community. 112 | 113 | ## Attribution 114 | 115 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 116 | version 2.0, available at 117 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 118 | 119 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 120 | enforcement ladder](https://github.com/mozilla/diversity). 
121 | 122 | [homepage]: https://www.contributor-covenant.org 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | https://www.contributor-covenant.org/faq. Translations are available at 126 | https://www.contributor-covenant.org/translations. 127 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | [![Python][Python-shield]][Python-url] 6 | [![Contributors][contributors-shield]][contributors-url] 7 | [![Watchers][watchers-shield]][watchers-url] 8 | [![Forks][forks-shield]][forks-url] 9 | [![MIT License][license-shield]][license-url] 10 | [![Stargazers][stars-social]][stars-url] 11 |
12 | 13 | ## How to contribute to Pawpaw 14 | 15 | #### **Did you find a bug?** 16 | 17 | * **Do not open up a GitHub issue if the bug is a security vulnerability in Pawpaw**, and instead to refer to the [security policy](https://rubyonrails.org/security). 18 | 19 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/rlayers/pawpaw/issues). 20 | 21 | * If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/rlayers/pawpaw/issues/new). Be sure to include a **title and clear description**, as much relevant information as possible, and a **code sample** or an **executable test case** demonstrating the expected behavior that is not occurring. 22 | 23 | #### **Did you write a patch that fixes a bug?** 24 | 25 | * Open a new GitHub pull request with your patch. 26 | 27 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable. 28 | 29 | #### **Did you write an enhancement to PawPaw?** 30 | 31 | * If you have a suggestion that would make this better, please fork the repo and create a pull request. 32 | 33 | * You can also simply open an issue with the tag "enhancement". 34 | 35 | #### **Did you fix whitespace, format code, or make a purely cosmetic patch?** 36 | 37 | Changes that are cosmetic in nature and do not add anything substantial to the stability, functionality, or testability of Rails will generally not be accepted. 38 | 39 | #### **Do you have questions about the source code?** 40 | 41 | * Ask Robert any question about how to use PawPaw via a.nov.guy@gmail.com. 42 | 43 | #### **Do you want to contribute to the PawPaw documentation?** 44 | 45 | * I'd love to hear feedback on expanding or improving documents. 46 | 47 | Thanks! 
:heart: :heart: :heart: 48 | 49 | Robert 50 | 51 | 52 | 53 | 54 | 55 | 60 | 61 | [repo]: https://github.com/rlayers/pawpaw 62 | 63 | [byline-img]: https://img.shields.io/badge/-High%20Performance%20Text%20Segmentation%20Framework-FFFFFF 64 | 65 | [byline2-img]: https://readme-typing-svg.demolab.com?font=Fira+Code&weight=800&duration=500&pause=1500&color=533E30&vCenter=true&width=375&height=25&lines=High+Performance+Text+Segmentation 66 | 67 | [Python-shield]: https://img.shields.io/badge/python-≥3.10-517D3D.svg?style=flat 68 | [Python-url]: https://www.python.org 69 | 70 | [contributors-shield]: https://img.shields.io/github/contributors/rlayers/pawpaw.svg?color=90C246&style=flat 71 | [contributors-url]: https://github.com/rlayers/pawpaw/graphs/contributors 72 | 73 | [watchers-shield]: https://img.shields.io/github/watchers/rlayers/pawpaw.svg?color=E4D1AE&style=flat 74 | [watchers-url]: https://github.com/rlayers/pawpaw/watchers 75 | 76 | [issues-shield]: https://img.shields.io/github/issues/rlayers/pawpaw.svg?style=flat 77 | [issues-url]: https://github.com/rlayers/pawpaw/issues 78 | 79 | [forks-social]: https://img.shields.io/github/forks/rlayers/pawpaw.svg?style=social 80 | [forks-shield]: https://img.shields.io/github/forks/rlayers/pawpaw.svg?color=D2AC70&style=flat 81 | [forks-url]: https://github.com/rlayers/pawpaw/network/members 82 | 83 | [license-shield]: https://img.shields.io/github/license/rlayers/pawpaw.svg?color=533E30&style=flat 84 | [license-url]: https://github.com/rlayers/pawpaw/blob/master/LICENSE 85 | 86 | [stars-social]: https://img.shields.io/github/stars/rlayers/pawpaw.svg?style=social 87 | [stars-shield]: https://img.shields.io/github/stars/rlayers/pawpaw.svg?style=flat 88 | [stars-url]: https://github.com/rlayers/pawpaw/stargazers 89 | 90 | [PyCharm-shield]: https://img.shields.io/badge/PyCharm-000000.svg?&style=flat&logo=PyCharm&logoColor=white 91 | [PyCharm-url]: https://www.jetbrains.com/pycharm/ 92 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Robert L. Ayers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Supported Versions 4 | 5 | 13 | 14 | | Version | Supported | 15 | | ------- | ------------------ | 16 | | 1.0.x | :white_check_mark: | 17 | 18 | ## Reporting a Vulnerability 19 | 20 | To report a discovered vulnerability, e-mail me directly at a.nov.guy@gmail.com. 21 | -------------------------------------------------------------------------------- /docs/1. Segment and Span.md: -------------------------------------------------------------------------------- 1 | # Segments and Span 2 | 3 | ## Definitions 4 | 5 | **string:** 6 | an ordered sequence of **n** characters; the *length* of a string is equivalent to **n**: 7 | 8 | $$0 \leq n \equiv length$$ 9 | 10 | **substring:** 11 | an ordered sequence of **m** characters *contained in* or *equal to* a reference string of length **n**; may exist in more than one *location* within a string: 12 | 13 | $$m \leq n$$ 14 | 15 | **proper substring:** 16 | a substring of **m** characters that is *not equal to* a reference string of length **n**: 17 | 18 | $$m < n$$ 19 | 20 | **segment:** 21 | a *specific* substring of **m** characters identified by its *inclusive* **start** location, a zero-based index; the sum of **start** and **m** is equivalent to the *exclusive* **stop** location: 22 | 23 | $$start + m ≡ stop$$ 24 | 25 | as a result of this relationship, a segment can be uniquely identified using *any two* values from **start**, **stop**, or **m**; **stop** must be less than or equal to **n**: 26 | 27 | $$0 \leq start \leq stop \leq n$$ 28 | 29 | A *segment* is frequently identified by its *inclusive* **start** and *exclusive* **stop** locations within a reference string. Two integers can identify a valid segment for a string of length **n** if they are: 30 | 31 | 1. ordered 32 | 2. between zero and **n** inclusive 33 | 34 | .. sidebar:: Segment as a vector 35 | 36 | A segment can be thought of as a one dimensional vector having a location at ``start``, and a length of ``stop-start``. 37 | 38 | A segment having a non-zero **start** value *-or-* a **stop** value less than **n** defines a *proper substring*. 39 | 40 | It is possible for a segment to have identical **start** and **stop** values, in which case it defines an *empty substring* (i.e., zero length) at a *specific location*. 41 | 42 | In Python, a ``str`` is immutable. And because since substrings are themselves ``str`` objects, they too are immutable. This attribute extends to segments, which because they define a substring, are also immutable. 43 | 44 | .. admonition:: Key Concept 45 | 46 | Segments are immutable 47 | 48 | Span 49 | ==== 50 | 51 | A segment can be represented in Python with a 2-tuple of ``int`` values. Pawpaw offers a class named ``Span``[^span_name] for this purpose. ``Span`` is derived from Python's `NamedTuple `_, which results in a tuple-like object that: 52 | 53 | - has fields accessible by attribute lookup 54 | - is indexable and iterable 55 | - requires no more memory than regular tuples 56 | - has immutable elements. 57 | 58 | Because they are tuples, ``Span`` objects are themselves immutable. This ensures that immutable representation for segments within Pawpaw. 
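As a minimal illustration of this guarantee (construction itself is covered in the next example), reassigning a field of an existing ``Span`` raises ``AttributeError``, exactly as it would for any other named tuple:

```python
>>> from pawpaw import Span
>>> span = Span(0, 3)
>>> try:
...     span.start = 1  # named-tuple fields are read-only
... except AttributeError:
...     print('immutable')
immutable
```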
59 | 60 | Creating a ``Span`` only requires a pair of ``int`` value for ``start`` and ``stop``: 61 | 62 | ```python 63 | >>> from pawpaw import Span 64 | >>> Span(0, 3) 65 | Span(start=0, stop=3) 66 | ``` 67 | 68 | As a named tuple, ``Span`` can be used as direct replacement for a tuple: 69 | 70 | ```python 71 | >>> 'The vals are %d and %d.' % Span(2, 5) 72 | 'The vals are 2 and 5' 73 | ``` 74 | 75 | A ``Span`` can also be unpacked using the ``*`` operator. For example, many ``str`` methods feature 76 | ``start`` and ``end`` parameters, which can be supplied via unpacking a ``Span``: 77 | 78 | ```python 79 | >>> s = '3. This sentence has "3" as a prefix.' 80 | >>> span = Span(1, len(s)) 81 | >>> s.find('3', *span) 82 | 22 83 | ``` 84 | 85 | Slice Coordination 86 | ================== 87 | 88 | A ``Span`` can be easily converted to a Python ``slice`` via unpacking: 89 | 90 | ```python 91 | >>> s = ' leading and trailing spaces ' 92 | >>> span = Span(1, len(s) - 1) 93 | >>> _slice = slice(*span) 94 | >>> s[_slice] # strip leading and trailing chars 95 | 'leading and trailing spaces' 96 | ``` 97 | 98 | However, a ``Span`` and ``slice`` are **not** equivalent. A Python's ``slice`` constructor has ``start`` and ``stop`` parameters[^slice_step], but they are *Python-style indices*, which allow negative values. So although a ``Span`` can always be converted to a ``slice``, the reverse is not true: 99 | 100 | ```python 101 | >>> slice(-10, 1) 102 | slice(-10, 1, None) 103 | >>> slice(3) 104 | slice(None, 3, None) 105 | ``` 106 | 107 | To convert from ``slice`` to ``Span``, the indices must be *normalized* to valid locations within the reference string. For example, given a ``str`` of length ``n`` and a ``slice(1, -1)``, the associated ``Span`` would be ``Span(1, n - 1)`` 108 | 109 | The ``Span`` class offers a static constructor method ``.from_indices`` that performs normalization for you: 110 | 111 | ```python 112 | >>> s = 'abcd' 113 | >>> Span.from_indices(s, 1, -1) 114 | Span(start=1, stop=3) 115 | >>> Span.from_indices(s, -1) 116 | Span(start=3, stop=4) 117 | >>> Span.from_indices(s, stop=-2) 118 | Span(start=0, stop=2) 119 | ``` 120 | 121 | The ``.from_indices`` constructor only uses the length of the reference ``str``. The first parameter, ``basis``, accepts an ``int`` or any ``Sized``[^sized] type. 122 | 123 | [^span_name]: The choice of *Span* for this type name instead of *Segment* is based on the extensive use of *span* in the ``re`` and ``regex`` modules. 124 | 125 | [^slice_step]: Python's ``slice`` constructor also features a ``step`` parameter, which defaults to 1. Slicing a ``str`` with ``step`` values other than 1 does not result in a *proper substring*, i.e., the resulting ``str`` is not contained within the starting ``str``. 126 | 127 | [^sized]: Python's ``Sized`` type supports the ``len`` keyword via a ``__len__`` method, which is used to supply a length by ``.from_indices``. 128 | -------------------------------------------------------------------------------- /docs/7. NLP.md: -------------------------------------------------------------------------------- 1 | # NLP 2 | 3 | ## Introduction 4 | 5 | Pawpaw is well suited for Natural Language Processing (NLP) software development. NLP is a deep topic, and it can sometimes be difficult to select which state of the art (SoA) approaches are best suited for your particular data and needs. 6 | 7 | The intent of Pawpaw's ``nlp`` module is not serve as a replacement for the excellent work that has been done in this field. 
Rather, ``nlp`` is available as a simple and easy to use toolbox that offers an excellent balance of features and performance for English language data. 8 | 9 | ## Architecture 10 | 11 | The ``nlp`` contains a variety of useful lexical data marker collections for Python unicode strings: 12 | 13 | * ``byte_order_controls`` 14 | * ``unicode_white_space_LF_FF`` 15 | * ``unicode_white_space_eol`` 16 | * ``unicode_white_space_other`` 17 | * ``unicode_single_quote_marks`` 18 | * ``unicode_double_quote_marks`` 19 | * ``unicode_bullets`` 20 | 21 | NLP methods are split among a class hierarchy, whose base class is ``NlpComponent``: 22 | 23 | ```mermaid 24 | classDiagram 25 | class NlpComponent{ 26 | <> 27 | +re regex.Pattern* 28 | +get_itor() pawpaw.arborform.Itorator* 29 | } 30 | 31 | NlpComponent <|-- SimpleNlp 32 | NlpComponent <|-- Paragraph 33 | NlpComponent <|-- Sentence 34 | NlpComponent <|-- Number 35 | ``` 36 | 37 | ## Introduction 38 | 39 | A complete, paragraph → Sentence → Word extraction can be achieved using the the class ``SimpleNlp``: 40 | 41 | ```python 42 | >>> import pawpaw 43 | >>> tom_sawyer = '''“Tom!” 44 | ... 45 | ... No answer. 46 | ... 47 | ... “TOM!” 48 | ... 49 | ... No answer. 50 | ... 51 | ... “What’s gone with that boy, I wonder? You TOM!” 52 | ... 53 | ... No answer.''' 54 | >>> nlp = pawpaw.nlp.SimpleNlp() 55 | >>> result = nlp.from_text(tom_sawyer) 56 | >>> tree_vis = pawpaw.visualization.pepo.Tree() 57 | >>> print(tree_vis.dumps(result)) 58 | (0, 100) 'Document' : '“Tom!”\n\nNo answer.…TOM!”\n\nNo answer.' 59 | ├──(0, 6) 'paragraph' : '“Tom!”' 60 | │ └──(0, 6) 'sentence' : '“Tom!”' 61 | │ └──(1, 4) 'word' : 'Tom' 62 | ├──(8, 18) 'paragraph' : 'No answer.' 63 | │ └──(8, 18) 'sentence' : 'No answer.' 64 | │ ├──(8, 10) 'word' : 'No' 65 | │ └──(11, 17) 'word' : 'answer' 66 | ├──(20, 26) 'paragraph' : '“TOM!”' 67 | │ └──(20, 26) 'sentence' : '“TOM!”' 68 | │ └──(21, 24) 'word' : 'TOM' 69 | ├──(28, 38) 'paragraph' : 'No answer.' 70 | │ └──(28, 38) 'sentence' : 'No answer.' 71 | │ ├──(28, 30) 'word' : 'No' 72 | │ └──(31, 37) 'word' : 'answer' 73 | ├──(40, 88) 'paragraph' : '“What’s gone with th…I wonder? You TOM!”' 74 | │ ├──(40, 78) 'sentence' : '“What’s gone with that boy, I wonder?' 75 | │ │ ├──(41, 47) 'word' : 'What’s' 76 | │ │ ├──(48, 52) 'word' : 'gone' 77 | │ │ ├──(53, 57) 'word' : 'with' 78 | │ │ ├──(58, 62) 'word' : 'that' 79 | │ │ ├──(63, 66) 'word' : 'boy' 80 | │ │ ├──(69, 70) 'word' : 'I' 81 | │ │ └──(71, 77) 'word' : 'wonder' 82 | │ └──(79, 88) 'sentence' : 'You TOM!”' 83 | │ ├──(79, 82) 'word' : 'You' 84 | │ └──(83, 86) 'word' : 'TOM' 85 | └──(90, 100) 'paragraph' : 'No answer.' 86 | └──(90, 100) 'sentence' : 'No answer.' 87 | ├──(90, 92) 'word' : 'No' 88 | └──(93, 99) 'word' : 'answer' 89 | ``` 90 | 91 | ``SimpleNlp`` creates an aborform pipeline using the classes ``Paragraph`` and ``Sentence``. 92 | 93 | *More coming soon...* 94 | -------------------------------------------------------------------------------- /docs/8. Serialization.md: -------------------------------------------------------------------------------- 1 | # SERIALIZATION 2 | 3 | ## Introduction 4 | 5 | Serialization and deserialization of ``Ito`` hierarchies is easy to accomplish in Pawpaw, which offers native support for both: 6 | 7 | * Pickling 8 | * JSON 9 | 10 | In either case, support for any dynamically ascribed ``.value`` methods are not serializable[^lambda_pickling]. 
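The underlying restriction comes from plain Python rather than from Pawpaw itself: ``pickle`` serializes functions by reference, so an anonymous lambda (typically rejected with a ``PicklingError``) cannot be round-tripped. The following minimal sketch uses only the standard library and a purely illustrative lambda to show the behavior:

```python
>>> import pickle
>>> try:
...     pickle.dumps(lambda ito: str(ito).upper())
... except Exception:
...     print('lambdas cannot be pickled')
lambdas cannot be pickled
```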
11 | 12 | ## Pickling 13 | 14 | ```python 15 | >>> import pickle 16 | >>> from pawpaw import Ito 17 | >>> s = 'See Jack run.' 18 | >>> i = Ito(s, desc='my desc') 19 | >>> i.children.add(*i.str_split()) 20 | >>> pickle_data = pickle.dumps(i) 21 | >>> j = pickle.loads(pickle_data) 22 | >>> len(j.children) 23 | 3 24 | ``` 25 | 26 | ## JSON 27 | 28 | Pawpaw offers two JSON serialization encoder options: 29 | 30 | 1. ``Ito.JsonEncoder``: **Does** serialize ``.string`` 31 | 2. ``Ito.JsonEncoderStringless``: Does **not** serialize ``.string`` 32 | 33 | ### Ito.JsonEncoder 34 | 35 | For serialization that includes string data, use the normal the python json ``.dump(s)`` 36 | and ``.load(s)`` methods, passing the ``Ito.JsonEncoder`` class and ``Ito.json_decoder`` 37 | method to each respectively: 38 | 39 | ```python 40 | >>> import json 41 | >>> json_data = json.dumps(i, cls=Ito.JsonEncoder) 42 | >>> s in json_data # verify s is present in JSON 43 | True 44 | >>> j = json.loads(json_data, object_hook=Ito.json_decoder) 45 | >>> len(j.children) 46 | 3 47 | ``` 48 | 49 | The resulting output conserves memory by saving the string data for a hierarchy once, 50 | since a given ``Ito`` and its children all share the same value. 51 | 52 | ### Ito.JsonEncoderStringless 53 | 54 | For stringless serialization, use the normal the python json ``.dump(s)`` 55 | methods and passing the ``Ito.JsonEncoderStringless`` class to them. For 56 | deserialization, use the static ``Ito.json_decode_stringless`` method, which 57 | has its inputs both the string and json data being de-serialized: 58 | 59 | ```python 60 | >>> json_data = json.dumps(i, cls=Ito.JsonEncoderStringless) 61 | >>> s in json_data # verify s not present in JSON 62 | False 63 | >>> j = Ito.json_decode_stringless(s, json_data) 64 | >>> j 65 | Ito(span=(0, 13), desc='my desc', substr='See Jack run.') 66 | ``` 67 | 68 | [^lambda_pickling]: The python pickle library supports neither lambdas nor methods not-defined at the top level of a module. See `Python pickle docs 69 | ` for more info. 70 | -------------------------------------------------------------------------------- /docs/Using Pawpaw with nltk.md: -------------------------------------------------------------------------------- 1 | # Using Pawpaw with ``nltk`` 2 | 3 | ## Tokenization 4 | 5 | ### Convert nlkt tokenizer output to Ito 6 | 7 | ```python 8 | >>> import nltk 9 | >>> from pawpaw import Ito 10 | >>> from nltk.tokenize import WhitespaceTokenizer 11 | >>> s = 'The quick brown fox.' 12 | >>> ws_tok = nltk.tokenize.WhitespaceTokenizer() 13 | >>> tokens = [Ito(s, *span, 'token') for span in ws_tok.span_tokenize(s)] 14 | >>> [str(i) for i in tokens] 15 | ['The', 'quick', 'brown', 'fox.'] 16 | ``` 17 | 18 | ### Use nltk tokenizer with split 19 | 20 | ```python 21 | >>> import nltk 22 | >>> import regex 23 | >>> from pawpaw import Ito, arborform 24 | >>> ws_tok = nltk.tokenize.WhitespaceTokenizer() 25 | >>> splitter = arborform.Split(regex.compile(ws_tok._pattern, ws_tok._flags)) 26 | >>> i = Ito('The quick brown fox.') 27 | >>> [str(i) for i in splitter(i)] 28 | ['The', 'quick', 'brown', 'fox.'] 29 | ``` 30 | 31 | ### Chaining NLP 32 | 33 | ```python 34 | >>> from pawpaw import Ito, arborform, visualization 35 | >>> s = 'Here is one sentence. Here is another.' 
36 | >>> i = Ito(s) 37 | >>> 38 | >>> nltk_tok = nltk.tokenize 39 | >>> sent_itor = arborform.Itorator.wrap(lambda ito: ito.from_substrings(ito, *nltk_tok.sent_tokenize(str(ito)))) 40 | >>> 41 | >>> word_itor = arborform.Itorator.wrap(lambda ito: ito.from_substrings(ito, *nltk_tok.word_tokenize(str(ito)))) 42 | >>> sent_itor.itor_children = word_itor 43 | >>> 44 | >>> i.children.add(*sent_itor(i)) 45 | >>> vis_tree = visualization.pepo.Tree() 46 | >>> print(vis_tree.dumps(i)) 47 | (0, 39) 'None' : 'Here is one sentence. Here is another.' 48 | ├──(0, 21) 'None' : 'Here is one sentence.' 49 | │ ├──(0, 4) 'None' : 'Here' 50 | │ ├──(5, 7) 'None' : 'is' 51 | │ ├──(8, 11) 'None' : 'one' 52 | │ ├──(12, 20) 'None' : 'sentence' 53 | │ └──(20, 21) 'None' : '.' 54 | └──(23, 39) 'None' : 'Here is another.' 55 | ├──(23, 27) 'None' : 'Here' 56 | ├──(28, 30) 'None' : 'is' 57 | ├──(31, 38) 'None' : 'another' 58 | └──(38, 39) 'None' : '.' 59 | ``` 60 | -------------------------------------------------------------------------------- /docs/demos/Q&A/description.md: -------------------------------------------------------------------------------- 1 | ## From: 2 | 3 | * [stackoverflow question 75394318](https://stackoverflow.com/questions/75394318/python-text-parsing-to-split-list-into-chunks-including-preceding-delimiters) 4 | 5 | ## Description 6 | 7 | Given the text: 8 | 9 | ```text 10 | \na\n\nQ So I do first want to bring up exhibit No. 46, which is in the binder 11 | in front of\nyou.\n\nAnd that is a letter [to] Alston\n& Bird... 12 | \n\nIs that correct?\n\nA This is correct.\n\nQ Okay 13 | ``` 14 | 15 | Split it into separate questions and answers. 16 | 17 | * Each Question or Answer starts with ``'\nQ '``, ``'\nA '``, ``'\nQ_'`` or ``'\nA_'``. 18 | * Sometimes the first item in the list may be neither a Question nor Answer, but just random text before the first ``'\Q'`` delimiter. 19 | -------------------------------------------------------------------------------- /docs/demos/Q&A/solution.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from pawpaw import Ito, arborform, visualization 3 | 4 | # INPUT 5 | text = """\na\n\nQ So I do first want to bring up exhibit No. 46, which is in the binder 6 | in front of\nyou.\n\nAnd that is a letter [to] Alston\n& Bird... 7 | \n\nIs that correct?\n\nA This is correct.\n\nQ Okay.""" 8 | 9 | # BUILD PARSER 10 | itor_split = arborform.Split(regex.compile(r'\n+(?=Q_? )', regex.DOTALL), desc='Q/A tuple') 11 | 12 | itor_filt = arborform.Filter(lambda i: i.str_startswith('Q')) # toss "random text" stuff 13 | con = arborform.Connectors.Delegate(itor_filt) 14 | itor_split.connections.append(con) 15 | 16 | # Assumes only one answer per question 17 | itor_qa_split = arborform.Split(regex.compile(r'\n+(?=A_? )', regex.DOTALL), limit=1) 18 | con = arborform.Connectors.Children.Add(itor_qa_split) 19 | itor_filt.connections.append(con) 20 | 21 | itor_extract = arborform.Extract( 22 | regex.compile(r'([QA])_? 
(?.+)', regex.DOTALL), 23 | desc=lambda match, group: match.group(1)) 24 | con = arborform.Connectors.Children.Add(itor_extract) 25 | itor_qa_split.connections.append(con) 26 | 27 | # OUTPUT TREE 28 | root = Ito(text) 29 | tree_vis = visualization.pepo.Tree() 30 | for i in itor_split(root): 31 | print(tree_vis.dumps(i)) 32 | print() 33 | 34 | # OUTPUT TUPLE 35 | for i, tup in enumerate(itor_split(root)): 36 | print(f'{tup:%desc} {i:,}:') 37 | for qa in tup.children: 38 | print(f'\t{qa:%desc% : %substr!r}') 39 | print() 40 | -------------------------------------------------------------------------------- /docs/demos/class_grades/description.md: -------------------------------------------------------------------------------- 1 | ## From 2 | 3 | * [stackoverflow question 47982949](https://stackoverflow.com/questions/47982949/how-to-parse-complex-text-files-using-python) 4 | 5 | * [codereview question 183668](https://codereview.stackexchange.com/questions/183668/parse-complex-text-files-using-python) 6 | 7 | ## Description 8 | 9 | Given the a file containing this text: 10 | 11 | ```text 12 | School = Riverdale High 13 | Grade = 1 14 | Student number, Name 15 | 0, Phoebe 16 | 1, Rachel 17 | 18 | Student number, Score 19 | 0, 3 20 | 1, 7 21 | 22 | Grade = 2 23 | Student number, Name 24 | 0, Angela 25 | 1, Tristan 26 | 2, Aurora 27 | 28 | Student number, Score 29 | 0, 6 30 | 1, 3 31 | 2, 9 32 | 33 | School = Hogwarts 34 | Grade = 1 35 | Student number, Name 36 | 0, Ginny 37 | 1, Luna 38 | 39 | Student number, Score 40 | 0, 8 41 | 1, 7 42 | 43 | Grade = 2 44 | Student number, Name 45 | 0, Harry 46 | 1, Hermione 47 | 48 | Student number, Score 49 | 0, 5 50 | 1, 10 51 | 52 | Grade = 3 53 | Student number, Name 54 | 0, Fred 55 | 1, George 56 | 57 | Student number, Score 58 | 0, 0 59 | 1, 0 60 | ``` 61 | 62 | Parset the file and create a pandas DataFrame whose output is as follows: 63 | 64 | ```text 65 | Name Score 66 | School Grade Student number 67 | Hogwarts 1 0 Ginny 8 68 | 1 Luna 7 69 | 2 0 Harry 5 70 | 1 Hermione 10 71 | 3 0 Fred 0 72 | 1 George 0 73 | Riverdale High 1 0 Phoebe 3 74 | 1 Rachel 7 75 | 2 0 Angela 6 76 | 1 Tristan 3 77 | 2 Aurora 9 78 | ``` -------------------------------------------------------------------------------- /docs/demos/class_grades/input.txt: -------------------------------------------------------------------------------- 1 | School = Riverdale High 2 | Grade = 1 3 | Student number, Name 4 | 0, Phoebe 5 | 1, Rachel 6 | 7 | Student number, Score 8 | 0, 3 9 | 1, 7 10 | 11 | Grade = 2 12 | Student number, Name 13 | 0, Angela 14 | 1, Tristan 15 | 2, Aurora 16 | 17 | Student number, Score 18 | 0, 6 19 | 1, 3 20 | 2, 9 21 | 22 | School = Hogwarts 23 | Grade = 1 24 | Student number, Name 25 | 0, Ginny 26 | 1, Luna 27 | 28 | Student number, Score 29 | 0, 8 30 | 1, 7 31 | 32 | Grade = 2 33 | Student number, Name 34 | 0, Harry 35 | 1, Hermione 36 | 37 | Student number, Score 38 | 0, 5 39 | 1, 10 40 | 41 | Grade = 3 42 | Student number, Name 43 | 0, Fred 44 | 1, George 45 | 46 | Student number, Score 47 | 0, 0 48 | 1, 0 -------------------------------------------------------------------------------- /docs/demos/class_grades/parser_compact.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from pawpaw import arborform 3 | 4 | def get_parser() -> arborform.Itorator: 5 | return arborform.Extract( 6 | regex.compile( 7 | r'(?School = (?.+?)\n' 8 | r'(?Grade = (?\d+)\n' 9 | r'Student number, Name\n(?P(?:(?P\d+), (?P.+?)\n)+)\n' 10 
| r'Student number, Score\n(?P(?:(?P\d+), (?P\d+)(?:$|\n))+)(?:$|\n)' 11 | r')+)+', 12 | regex.DOTALL 13 | ) 14 | ) 15 | -------------------------------------------------------------------------------- /docs/demos/class_grades/parser_verbose.py: -------------------------------------------------------------------------------- 1 | import regex 2 | from pawpaw import arborform 3 | 4 | def get_parser() -> arborform.Itorator: 5 | school_splitter = arborform.Split( 6 | regex.compile(r'(?<=(?:^|\n))(?=School =)', regex.DOTALL), 7 | desc='school', 8 | tag='school splitter') 9 | 10 | name_grades = arborform.Extract( 11 | regex.compile(r'School = (?.+?)\n(?.+)(?:$|\n)', regex.DOTALL), 12 | tag='school name & grades') 13 | con = arborform.Connectors.Children.Add(name_grades) 14 | school_splitter.connections.append(con) 15 | 16 | grade_splitter = arborform.Split( 17 | regex.compile(r'(?<=\n)(?=Grade =)', regex.DOTALL), 18 | desc='grade', 19 | tag='grade splitter') 20 | con = arborform.Connectors.Delegate(grade_splitter, lambda ito: ito.desc == 'grades') 21 | name_grades.connections.append(con) 22 | 23 | grade = arborform.Extract( 24 | regex.compile(r'Grade = (?\d+)\nStudent number, Name\n(?.+?)\nStudent number, Score\n(?.+)', regex.DOTALL), 25 | tag='grade & stu_num/name * stu_num/score') 26 | con = arborform.Connectors.Children.Add(grade) 27 | grade_splitter.connections.append(con) 28 | 29 | stu_num_names = arborform.Extract( 30 | regex.compile(r'(?\d+), (?.+?)\n', regex.DOTALL), 31 | tag='stu num/name pairs') 32 | con = arborform.Connectors.Children.Add(stu_num_names, lambda ito: ito.desc == 'stu_num_names') 33 | grade.connections.append(con) 34 | 35 | stu_num_scores = arborform.Extract( 36 | regex.compile(r'(?\d+), (?\d+)(?:$|\n)', regex.DOTALL), 37 | tag='stu num/score pairs') 38 | con = arborform.Connectors.Children.Add(stu_num_scores) 39 | grade.connections.append(con) 40 | 41 | return school_splitter 42 | -------------------------------------------------------------------------------- /docs/demos/class_grades/solution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | 4 | import regex 5 | from pawpaw import Ito, visualization 6 | import pandas as pd 7 | 8 | while ((answer := input('Select (C)ompact or (V)erbose parser: ').casefold()) not in 'cv'): 9 | pass 10 | 11 | # read file 12 | with open(os.path.join(sys.path[0], 'input.txt')) as f: 13 | ito = Ito(f.read(), desc='all') 14 | 15 | # parse 16 | if answer == 'c': 17 | from parser_compact import get_parser 18 | else: 19 | from parser_verbose import get_parser 20 | parser = get_parser() 21 | ito.children.add(*parser(ito)) 22 | 23 | # display Pawpaw tree 24 | tree_vis = visualization.pepo.Tree() 25 | print(tree_vis.dumps(ito)) 26 | 27 | # build pandas DataFrame 28 | d = [] 29 | for school in ito.find_all('*[d:school]'): 30 | school_name = str(school.find('*[d:name]')) 31 | for grade in school.find_all('**[d:grade]'): 32 | grade_key = int(str(grade.find('*[d:key]'))) 33 | for stu_num in grade.find_all('*[d:stu_num_names]/*[d:stu_num]'): 34 | stu_name = str(stu_num.find('>[d:name]')) 35 | stu_num = str(stu_num) 36 | stu_score = int(str(grade.find('*[d:stu_num_scores]/*[d:stu_num]&[s:' + stu_num + ']/>[d:score]'))) 37 | d.append({'School': school_name, 'Grade': grade_key, 'Student number': stu_num, 'Name': stu_name, 'Score': stu_score}) 38 | data = pd.DataFrame(d) 39 | data.set_index(['School', 'Grade', 'Student number'], inplace=True) 40 | data = 
data.groupby(level=data.index.names).first() 41 | 42 | # display pandas DataFrame 43 | print(data) 44 | -------------------------------------------------------------------------------- /docs/demos/compounds/compound_1.txt: -------------------------------------------------------------------------------- 1 | MODEL 1 2 | REMARK minimizedAffinity -7.11687565 3 | REMARK CNNscore 0.573647082 4 | REMARK CNNaffinity 5.82644749 5 | REMARK 11 active torsions: 6 | # Lots of text here 7 | # Lots of text here 8 | # Lots of text here 9 | docs/demos/class_grades/input.txt 10 | docs/demos/class_grades/input.txt 11 | MODEL 2 12 | REMARK minimizedAffinity -6.61898327 13 | REMARK CNNscore 0.55260396 14 | REMARK CNNaffinity 5.86855984 15 | REMARK 11 active torsions: 16 | # Lots of text here 17 | # Lots of text here 18 | # Lots of text here -------------------------------------------------------------------------------- /docs/demos/compounds/compound_2.txt: -------------------------------------------------------------------------------- 1 | MODEL 1 2 | REMARK minimizedAffinity -7.11687565 3 | REMARK CNNscore 0.573647082 4 | REMARK CNNaffinity 5.82644749 5 | REMARK 11 active torsions: 6 | # Lots of text here 7 | # Lots of text here 8 | # Lots of text here 9 | docs/demos/class_grades/input.txt 10 | docs/demos/class_grades/input.txt 11 | MODEL 2 12 | REMARK minimizedAffinity -6.61898327 13 | REMARK CNNscore 0.55260396 14 | REMARK CNNaffinity 5.86855984 15 | REMARK 11 active torsions: 16 | # Lots of text here 17 | # Lots of text here 18 | # Lots of text here -------------------------------------------------------------------------------- /docs/demos/compounds/description.md: -------------------------------------------------------------------------------- 1 | ## From 2 | 3 | * [stackoverflow question 76453312](https://stackoverflow.com/questions/76453312/extract-information-from-a-list-of-files-and-write-into-a-log-file) 4 | 5 | ## Description 6 | 7 | Given one or more files that look like this: 8 | 9 | ```text 10 | MODEL 1 11 | REMARK minimizedAffinity -7.11687565 12 | REMARK CNNscore 0.573647082 13 | REMARK CNNaffinity 5.82644749 14 | REMARK 11 active torsions: 15 | #Lots of text here 16 | MODEL 2 17 | REMARK minimizedAffinity -6.61898327 18 | REMARK CNNscore 0.55260396 19 | REMARK CNNaffinity 5.86855984 20 | REMARK 11 active torsions: 21 | ``` 22 | 23 | Generate output as log file containing "MODEL", "minimizedAffinity", "CNNscore", and "CNNaffinity" of each and every compound in the folder in a delimited text file: 24 | 25 | ```text 26 | Compound Model minimizedAffinity CNNscore CNNaffinity 27 | 1 1 -7.11687565 0.573647082 5.82644749 28 | 1 2 -6.61898327 0.55260396 5.86855984 29 | ``` 30 | -------------------------------------------------------------------------------- /docs/demos/compounds/solution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | import fnmatch 4 | import typing 5 | 6 | import regex 7 | import pawpaw 8 | 9 | # Build pawpaw parser 10 | re = regex.compile(r'(?<=^|\n)(?=MODEL \d+)', regex.DOTALL) 11 | splitter = pawpaw.arborform.Split(re) 12 | 13 | pat = r""" 14 | (?P 15 | MODEL\ 16 | (?\d+) 17 | (?:\n 18 | (? 19 | REMARK\ 20 | (?[^\s]+)\ 21 | (?[^\n]+) 22 | ) 23 | )+ 24 | (?:\n 25 | (?>!=REMARK) 26 | (?.+) 27 | )? 
28 | )+ 29 | """ 30 | re = regex.compile(pat, regex.VERBOSE | regex.DOTALL) 31 | extractor = pawpaw.arborform.Extract(re) 32 | con = pawpaw.arborform.Connectors.Delegate(extractor) 33 | splitter.connections.append(con) 34 | 35 | # Prints using fixed-width for visibility: change to delimited if needed 36 | def dump_row(cols: list) -> None: 37 | print(*(f'{v: <18}' for v in cols)) 38 | 39 | # Select desired remark columns 40 | desired_remarks = ['minimizedAffinity', 'CNNscore', 'CNNaffinity'] 41 | 42 | # Headers 43 | headers = ['Compound', 'Model'] 44 | headers.extend(desired_remarks) 45 | dump_row(headers) 46 | 47 | # Create rows from compound file 48 | def compound_vals(compound: str, ito: pawpaw.Ito) -> typing.Iterable[list[str]]: 49 | for model in ito.children: 50 | vals = [compound] 51 | vals.append(str(model.find('*[d:tag]'))) 52 | for dr in desired_remarks: 53 | vals.append(str(model.find(f'*[d:remark]/*[d:tag]&[s:{dr}]/>[d:value]'))) 54 | yield vals 55 | 56 | # Read files and dump contents of each 57 | for path in os.scandir(os.path.join(sys.path[0])): 58 | if path.is_file() and fnmatch.fnmatch(path.name, 'compound_*.txt'): 59 | compound = path.name.split('_', 1)[-1].split('.', 1)[0] # compound number 60 | with open(os.path.join(sys.path[0], path)) as f: 61 | ito = pawpaw.Ito(f.read(), desc='all') 62 | ito.children.add(*splitter(ito)) 63 | for vals in compound_vals(compound, ito): 64 | dump_row(vals) 65 | -------------------------------------------------------------------------------- /docs/demos/gettysburg_address/gettysburg_address.txt: -------------------------------------------------------------------------------- 1 | Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal. 2 | 3 | Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this. 4 | 5 | But, in a larger sense, we can not dedicate — we can not consecrate — we can not hallow — this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. 6 | 7 | The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us — that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion — that we here highly resolve that these dead shall not have died in vain — that this nation, under God, shall have a new birth of freedom — and that government of the people, by the people, for the people, shall not perish from the earth. 
-------------------------------------------------------------------------------- /docs/demos/us_constitution/description.md: -------------------------------------------------------------------------------- 1 | ## From: 2 | 3 | * [stackoverflow question 75394318](https://stackoverflow.com/questions/75394318/python-text-parsing-to-split-list-into-chunks-including-preceding-delimiters) 4 | 5 | ## Description 6 | 7 | Given the text of the U.S. Constitution, which can be found [here](https://www.archives.gov/founding-docs/constitution-transcript), perfrom a full segmentation starting with high level parts such as articles, sections, etc. down to words. 8 | 9 | . 10 | -------------------------------------------------------------------------------- /docs/demos/us_constitution/us_constitution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os.path 3 | 4 | import regex 5 | import pawpaw 6 | from pawpaw import arborform 7 | 8 | """ 9 | DEMO: US CONSTITUTION 10 | 11 | This demo shows an example of how to parse, visualize, and query the US Constitution using Pawpaw. 12 | 13 | Note: The text for the constitution was taken from https://www.archives.gov/founding-docs/constitution-transcript 14 | """ 15 | 16 | def get_parser() -> arborform.Itorator: 17 | # Article: could be preamble 18 | a_splitter = arborform.Split( 19 | regex.compile(r'(?<=\n+)(?=Article\.)', regex.DOTALL), 20 | boundary_retention=arborform.Split.BoundaryRetention.NONE, 21 | tag='article splitter') 22 | 23 | a_desc = arborform.Desc( 24 | desc=lambda ito: 'article' if ito.str_startswith('Article.') else 'preamble', 25 | tag='article desc') 26 | con = arborform.Connectors.Delegate(a_desc) 27 | a_splitter.connections.append(con) 28 | 29 | con = arborform.Connectors.Children.Add(pawpaw.nlp.SimpleNlp().itor, lambda ito: ito.desc == 'preamble') 30 | a_desc.connections.append(con) 31 | 32 | a_extractor = arborform.Extract( 33 | regex.compile(r'Article\. (?[A-Z]+)\.\n(?.+)', regex.DOTALL), 34 | tag='article extractor') 35 | con = arborform.Connectors.Children.Add(a_extractor, lambda ito: ito.desc == 'article') 36 | a_desc.connections.append(con) 37 | 38 | # Section: only some articles have sections 39 | s_splitter = arborform.Split( 40 | regex.compile(r'(?<=\n+)(?=Section\.)', regex.DOTALL), 41 | boundary_retention=arborform.Split.BoundaryRetention.LEADING, 42 | desc='section', 43 | tag='section splitter') 44 | con = arborform.Connectors.Children.Add(s_splitter, lambda ito: ito.desc == 'value' and ito.str_startswith('Section.')) 45 | a_extractor.connections.append(con) 46 | con = arborform.Connectors.Children.Add(pawpaw.nlp.SimpleNlp().itor, lambda ito: ito.desc == 'value' and not ito.str_startswith('Section.')) 47 | a_extractor.connections.append(con) 48 | 49 | s_extractor = arborform.Extract(regex.compile(r'Section\. 
(?\d+)\.\n(?.+)', regex.DOTALL)) 50 | con = arborform.Connectors.Children.Add(s_extractor) 51 | s_splitter.connections.append(con) 52 | con = arborform.Connectors.Children.Add(pawpaw.nlp.SimpleNlp().itor, lambda ito: ito.desc == 'value') 53 | s_extractor.connections.append(con) 54 | 55 | return a_splitter 56 | 57 | 58 | def get_text() -> pawpaw.Ito: 59 | with open(os.path.join(sys.path[0], 'us_constitution.txt')) as f: 60 | return pawpaw.Ito(f.read(), desc='constitution') 61 | 62 | 63 | # Visualize 64 | print(f'\nVISUALIZE:\n') 65 | i = get_text() 66 | tree_vis = pawpaw.visualization.pepo.Tree() 67 | parser = get_parser() 68 | i.children.add(*parser(i)) 69 | print(tree_vis.dumps(i)) 70 | 71 | # Query 72 | print(f'\nQUERY:\n') 73 | print(f'\tGoal: Find sections containing words \'power\' or \'right\'\n') 74 | query = '**[d:section]{**[d:word] & [lcs:power,right]}' 75 | print(f'\tPlumule Query: {query}\n') 76 | print(f'\tResults:\n') 77 | for i, section in enumerate(i.find_all(query)): 78 | article_key = section.find('..[d:article]/*[d:key]') 79 | section_key = section.find('*[d:key]') 80 | section_value = section.find('*[d:value]') 81 | print(f'\t\tMatch {i}: Article {article_key}, Section {section_key}') 82 | print(f'\t\t\t{section_value:%substr:45…}') 83 | -------------------------------------------------------------------------------- /docs/demos/xpath_recursion_depth/description.md: -------------------------------------------------------------------------------- 1 | ## From 2 | 3 | * [stackoverflow question 51034706](https://stackoverflow.com/questions/51034706/breaking-the-lxml-etree-html-xpath-max-parsing-depth-limit) 4 | 5 | ## Description 6 | 7 | The XPATH parser for lxml.etree has a max depth limit, which can be seen with the following code: 8 | 9 | ```python 10 | import lxml.etree as etree 11 | 12 | # Setup HTML tabs 13 | x = "" 14 | x_ = "" 15 | 16 | # Set recursion depth to 255 17 | depth = 255 18 | 19 | # Fails with depth >= 255: 20 | print(etree.HTML(x * depth + "
<p>text to be extracted</p>
" + x_* depth).xpath("//p//text()")) 21 | ``` 22 | -------------------------------------------------------------------------------- /docs/demos/xpath_recursion_depth/solution_1.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | 3 | # Setup HTML tabs 4 | x = "" 5 | x_ = "" 6 | 7 | # Set recursion depth to 255 8 | depth = 300 9 | 10 | xml_text = f'{x * depth}
<p>text to be extracted</p>
{x_ * depth}' 11 | 12 | root = ET.fromstring(xml_text) 13 | print(root.find(".//p").text) 14 | -------------------------------------------------------------------------------- /docs/demos/xpath_recursion_depth/solution_2.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.modules['_elementtree'] = None 3 | import xml.etree.ElementTree as ET 4 | 5 | from pawpaw import xml 6 | 7 | 8 | # Setup HTML tabs 9 | x = "" 10 | x_ = "" 11 | 12 | # Set recursion depth to 255 13 | depth = 300 14 | 15 | xml_text = f'{x * depth}
<p>text to be extracted</p>
{x_ * depth}' 16 | 17 | root = ET.fromstring(xml_text, parser=xml.XmlParser()) 18 | node = root.ito.find('**[d:element]{*[d:start_tag]/**[d:name]&[s:p]}/*[d:text]') 19 | print(str(node)) 20 | 21 | -------------------------------------------------------------------------------- /images/pawpaw.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/images/pawpaw.png -------------------------------------------------------------------------------- /pawpaw/__init__.py: -------------------------------------------------------------------------------- 1 | from pawpaw._version import __version__, Version 2 | # del _version 3 | 4 | from pawpaw.infix import Infix 5 | del infix 6 | 7 | from pawpaw.errors import Errors 8 | del errors 9 | 10 | import pawpaw._type_magic as type_magic 11 | del _type_magic 12 | 13 | from pawpaw.span import Span 14 | del span 15 | 16 | from pawpaw.ito import nuco, GroupKeys, Ito, ChildItos, Types 17 | del ito 18 | 19 | from pawpaw.util import find_unescaped, split_unescaped, find_balanced 20 | del util 21 | 22 | import pawpaw.arborform 23 | import pawpaw.query 24 | import pawpaw.xml 25 | import pawpaw.nlp 26 | import pawpaw.table 27 | import pawpaw.visualization 28 | 29 | del pawpaw 30 | -------------------------------------------------------------------------------- /pawpaw/_type_magic.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import inspect 3 | import types 4 | import typing 5 | 6 | from pawpaw.errors import Errors 7 | 8 | 9 | CALLABLE_TYPE_OR_GENERIC = typing._CallableType | typing._CallableGenericAlias 10 | 11 | def is_callable_type_or_generic(obj: typing.Any) -> bool: 12 | ''' 13 | Returns True if obj is typing._Callable or typing._CallableGenericAlias 14 | ''' 15 | return isinstance(obj, CALLABLE_TYPE_OR_GENERIC) 16 | 17 | 18 | def is_functoid(obj: typing.Any): 19 | ''' 20 | Python's builtin callable method iscallable() returns true for all callable types (e.g., def, lambda, 21 | instance/class/builtin method, etc.) However, it also returns True for typing.Callable 22 | objects. E.g.: 23 | 24 | >>> MY_FUNC_ALIAS = typing.Callable[[int], str] 25 | >>> callable(MY_FUNC_ALIAS) 26 | True 27 | 28 | This method returns False in such cases 29 | 30 | The terms 'Function', 'Method', 'Callable', etc. all have established meanings in Python. 31 | It would be confusing to label this method 'callable', and the terms 'Function', 'Method', 32 | etc. already have established meantings in pythong. 
So instead, I'm using 'functoid' 33 | ''' 34 | return callable(obj) and not is_callable_type_or_generic(obj) 35 | 36 | 37 | _LAMBDA_OBJ_NAME = (lambda: True).__name__ # Use this instead of string literal in case Python changes 38 | 39 | 40 | # Note: Guido van Rossum uses 'def' and 'lambda' for these two concepts (see: 41 | # https://stackoverflow.com/questions/62479608/lambdatype-vs-functiontype), so I'll 42 | # use the same naming convention here 43 | 44 | def is_def(obj: typing.Any) -> bool: 45 | ''' 46 | Returns True if obj is def (defined function) 47 | ''' 48 | return isinstance(obj, types.FunctionType) and obj.__name__ != _LAMBDA_OBJ_NAME 49 | 50 | 51 | def is_lambda(obj: typing.Any) -> bool: 52 | ''' 53 | Returns True if obj is lambda 54 | ''' 55 | return isinstance(obj, types.FunctionType) and obj.__name__ == _LAMBDA_OBJ_NAME 56 | 57 | 58 | TYPE_OR_UNION = typing.Type | types.UnionType 59 | 60 | 61 | def unpack(t: TYPE_OR_UNION) -> typing.List[type]: 62 | rv = list[typing.Type]() 63 | 64 | if (origin := typing.get_origin(t)) is types.UnionType: 65 | for i in typing.get_args(t): 66 | rv.extend(unpack(i)) 67 | else: 68 | rv.append(t) 69 | 70 | return rv 71 | 72 | 73 | def isinstance_ex(obj: object, type_or_union: TYPE_OR_UNION) -> bool: 74 | ''' 75 | Although Python >= 3.10 now allows Union as 2nd parameter to isinstance method, it doesn't 76 | allow _parameterized_ types. This function performs weak checking for any supplied 77 | parameterized types. 78 | 79 | Tuples are not allowed for 2nd parameter because... why support them now that you can pass a Union? 80 | ''' 81 | for t in unpack(type_or_union): 82 | if (origin := typing.get_origin(t)) is not None: 83 | # Could expand this for various generic types 84 | if isinstance(obj, origin): 85 | return True 86 | elif issubclass(type(obj), t): 87 | return True 88 | return False 89 | 90 | 91 | def issubclass_ex(_cls, type_or_union: TYPE_OR_UNION) -> bool: 92 | cls_types = [t if (origin := typing.get_origin(t)) is None else origin for t in unpack(_cls)] 93 | tou_types = [t if (origin := typing.get_origin(t)) is None else origin for t in unpack(type_or_union)] 94 | for cls_type in cls_types: 95 | if any(issubclass(cls_type, tou_type) for tou_type in tou_types): 96 | return True 97 | return False 98 | 99 | 100 | class Functoid: 101 | """Marker object""" 102 | 103 | 104 | def _annotation_or_type_hint_matches_type( 105 | annotation: TYPE_OR_UNION | str | inspect.Signature.empty, 106 | type_hint: typing.Any or None, 107 | _type: TYPE_OR_UNION 108 | ) -> bool: 109 | t = annotation 110 | if not isinstance(t, TYPE_OR_UNION) or (isinstance(t, type) and issubclass(t, inspect.Signature.empty)): 111 | t = type_hint 112 | if t is not None: 113 | if _type is typing.Any: 114 | return True 115 | elif not issubclass_ex(t, _type): 116 | return False 117 | 118 | return True 119 | 120 | 121 | def functoid_isinstance(functoid: typing.Callable, callable_type_or_generic: CALLABLE_TYPE_OR_GENERIC) -> bool: 122 | ''' 123 | There is no good way to type hint for functoid, so falling back to 'typing.Callable' 124 | ''' 125 | 126 | if not is_callable_type_or_generic(callable_type_or_generic): 127 | raise Errors.parameter_invalid_type('callable_type_or_generic', callable_type_or_generic, CALLABLE_TYPE_OR_GENERIC) 128 | 129 | if not is_functoid(functoid): 130 | return False 131 | 132 | # This has guaranteed entries for the ret_val and all params, however, the types _may_ be 133 | # strings if "from __future__ import annotations" is used. 
134 |     func_sig = inspect.signature(functoid)
135 | 
136 |     # This has proper types, even when "from __future__ import annotations" used. However:
137 |     # if the ret-val or param lacks a type hint, it is missing from this dict
138 |     func_type_hints = typing.get_type_hints(functoid)
139 | 
140 |     ts_params, ts_ret_val = typing.get_args(callable_type_or_generic)
141 | 
142 |     if not _annotation_or_type_hint_matches_type(func_sig.return_annotation, func_type_hints.get('return', None), ts_ret_val):
143 |         return False
144 | 
145 |     if len(func_sig.parameters) != len(ts_params):
146 |         return False
147 | 
148 |     for func_p, ts_p in zip(func_sig.parameters.items(), ts_params):
149 |         func_n, func_p = func_p
150 |         if not _annotation_or_type_hint_matches_type(func_p.annotation, func_type_hints.get(func_n, None), ts_p):
151 |             return False
152 | 
153 |     return True
154 | 
155 | 
156 | def invoke_func(func: typing.Any, *vals: typing.Any) -> typing.Any:
157 |     """Wire and fire
158 | 
159 |     Args:
160 |         func: The functoid to invoke
161 |         *vals: Candidate argument values; each is paired to a parameter whose annotation matches its type
162 | 
163 |     Returns:
164 |         Invokes func and returns its return value
165 |     """
166 | 
167 |     if is_lambda(func):
168 |         return func(*vals)  # No type hints on lambdas, so this is the best we can do
169 | 
170 |     unpaired: typing.List[typing.Any] = list(vals)
171 | 
172 |     arg_spec = inspect.getfullargspec(func)
173 |     arg_spec.annotations.pop('return', None)
174 | 
175 |     p_args: typing.List[typing.Any] = []
176 |     for arg in arg_spec.args:
177 |         for val in unpaired:
178 |             val_type = type(val)
179 |             if issubclass_ex(val_type, arg_spec.annotations[arg]):
180 |                 p_args.append(val)
181 |                 unpaired.remove(val)
182 |                 break
183 | 
184 |     p_kwonlyargs: typing.Dict[str, typing.Any] = {}
185 |     for arg in arg_spec.kwonlyargs:
186 |         for val in unpaired:
187 |             val_type = type(val)
188 |             if issubclass_ex(val_type, arg_spec.annotations[arg]):
189 |                 p_kwonlyargs[arg] = val
190 |                 unpaired.remove(val)
191 |                 break
192 | 
193 |     p_vargs: typing.List[typing.Any] = []
194 |     if len(unpaired) > 0 and arg_spec.varargs is not None:
195 |         p_vargs.extend(unpaired)
196 | 
197 |     return func(*p_args, *p_vargs, **p_kwonlyargs)
198 | 
--------------------------------------------------------------------------------
/pawpaw/_version.py:
--------------------------------------------------------------------------------
 1 | import string
 2 | import typing
 3 | 
 4 | import regex
 5 | 
 6 | __version__ = '1.0.1'
 7 | """The str literal that build, setup, documentation, and other tools typically want."""
 8 | 
 9 | class Version:
10 |     _canonical_re = regex.compile(r'^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\.post(0|[1-9][0-9]*))?(\.dev(0|[1-9][0-9]*))?(?:\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*)?$')
11 |     """This pattern taken from https://peps.python.org/pep-0440/#appendix-b-parsing-version-strings-with-regular-expressions
12 |     and expanded to support optional "local version identifier" (see https://peps.python.org/pep-0440/#local-version-identifiers)."""
13 | 
14 |     @classmethod
15 |     def is_canonical(cls, version: str) -> bool:
16 |         return cls._canonical_re.match(version) is not None
17 | 
18 |     _parse_pat = r"""
19 |         v?
20 |         (?:
21 |             (?:(?P<epoch>[0-9]+)!)?                    # epoch
22 |             (?P<release>[0-9]+(?:\.[0-9]+)*)           # release segment
23 |         (?P<pre>                                       # pre-release
24 |             [-_\.]?
25 |             (?P<pre_l>a|b|c|rc|alpha|beta|pre|preview)
26 |             [-_\.]?
27 |             (?P<pre_n>[0-9]+)?
28 |         )?
29 |         (?P<post>                                      # post release
30 |             (?:-(?P<post_n1>[0-9]+))
31 |             |
32 |             (?:
33 |                 [-_\.]?
34 |                 (?P<post_l>post|rev|r)
35 |                 [-_\.]?
36 |                 (?P<post_n2>[0-9]+)?
37 |             )
38 |         )?
39 |         (?P<dev>                                       # dev release
40 |             [-_\.]?
41 |             (?P<dev_l>dev)
42 |             [-_\.]?
43 |             (?P<dev_n>[0-9]+)?
44 |         )?
45 |     )
46 |     (?:\+(?P<local>[a-z0-9]+(?:[-_\.][a-z0-9]+)*))?       # local version
47 | """
48 |     """Taken from https://peps.python.org/pep-0440/#appendix-b-parsing-version-strings-with-regular-expressions and
49 |     corrected so that group pre_l has no sub-group and behaves like post_l and dev_l groups"""
50 | 
51 |     parse_re = regex.compile(r"^\s*" + _parse_pat + r"\s*$", regex.VERBOSE | regex.IGNORECASE)
52 |     """regex that could be used by pawpaw to create a parse tree for a version str"""
53 | 
54 | if not Version.is_canonical(__version__):
55 |     raise ValueError(f'__version__ value {__version__!r} is not canonical per PEP 440')
56 | 
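
Illustrative sketch (not part of the module above) showing how the two patterns can be used; it relies only on the Version class as defined here:

    from pawpaw import Version

    print(Version.is_canonical('1.0.1'))         # True
    print(Version.is_canonical('1.0.1.banana'))  # False

    m = Version.parse_re.match('1.2.3rc1.post2.dev3+local.7')
    if m is not None:
        print(m.group('release'))                     # 1.2.3
        print(m.group('pre_l'), m.group('pre_n'))     # rc 1
        print(m.group('post_l'), m.group('post_n2'))  # post 2
        print(m.group('dev_n'), m.group('local'))     # 3 local.7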


--------------------------------------------------------------------------------
/pawpaw/arborform/__init__.py:
--------------------------------------------------------------------------------
1 | from pawpaw.arborform.itorator import *
2 | 
3 | from pawpaw.arborform.postorator import *
4 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/__init__.py:
--------------------------------------------------------------------------------
 1 | from .itorator import Connectors, Itorator
 2 | del itorator
 3 | 
 4 | from .regex_itorator import RegexItorator
 5 | del regex_itorator
 6 | 
 7 | from .reflect import Reflect
 8 | del reflect
 9 | 
10 | from .desc import Desc
11 | del desc
12 | 
13 | from .value_func import ValueFunc
14 | del value_func
15 | 
16 | from .filter import Filter
17 | del filter
18 | 
19 | from .extract import Extract
20 | del extract
21 | 
22 | from .split import Split
23 | del split
24 | 
25 | from .invert import Invert
26 | del invert
27 | 
28 | from .nuco import Nuco
29 | del nuco
30 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/desc.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Desc(Itorator):
 8 |     def __init__(self, desc: str | Types.F_ITO_2_DESC, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         if isinstance(desc, str):
11 |             self._desc_func = lambda ito: desc
12 |         elif type_magic.functoid_isinstance(desc, Types.F_ITO_2_DESC):
13 |             self._desc_func = desc
14 |         else:
15 |             raise Errors.parameter_invalid_type('desc', desc, str | Types.F_ITO_2_DESC)
16 | 
17 |     def clone(self, tag: str | None = None) -> Desc:
18 |         return type(self)(self._desc_func, self.tag if tag is None else tag)
19 | 
20 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
21 |         ito.desc = self._desc_func(ito)
22 |         yield ito
23 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/extract.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import collections
 3 | import types
 4 | import typing
 5 | 
 6 | import regex
 7 | from pawpaw import Ito, Types, Errors, type_magic
 8 | from pawpaw.arborform.itorator import RegexItorator
 9 | 
10 | 
11 | class Extract(RegexItorator):
12 |     def __init__(self,
13 |                  re: regex.Pattern,
14 |                  limit: int | None = None,
15 |                  desc: str | Types.F_M_GK_2_DESC = lambda m, gk: str(gk),
16 |                  group_filter: collections.abc.Container[Types.C_GK] | Types.P_M_GK = lambda m, gk: str(gk) != '0',
17 |                  tag: str | None = None):
18 |         super().__init__(re, group_filter, tag)
19 | 
20 |         if not isinstance(limit, (int, type(None))):
21 |             raise Errors.parameter_invalid_type('limit', limit, int, types.NoneType)
22 |         self.limit = limit
23 | 
24 |         if isinstance(desc, str):
25 |             self.desc = lambda m, gk: desc
26 |         elif type_magic.functoid_isinstance(desc, Types.F_M_GK_2_DESC):
27 |             self.desc = desc
28 |         else:
29 |             raise Errors.parameter_invalid_type('desc', desc, str,  Types.F_M_GK_2_DESC)
30 |     
31 |     def clone(self, tag: str | None = None) -> Extract:
32 |         return type(self)(self._re, self.limit, self.desc, self._group_filter, self.tag if tag is None else tag)
33 | 
34 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
35 |         return [*ito.from_re(self._re, ito, self.group_filter, self.desc, self.limit)]
36 | 
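
A minimal usage sketch for Extract (illustrative only, not part of the file above); it assumes an Ito can be constructed directly from a str, and that Ito.from_re yields itos for the surviving groups with each desc defaulting to its group key:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract

    # Named groups become itos; group '0' (the whole match) is filtered out by default.
    itor = Extract(regex.compile(r'(?P<key>\w+):\s*(?P<value>[^,]+)'))
    for i in itor(Ito('name: pawpaw, version: 1.0.1')):
        print(i.desc, '->', str(i))   # e.g.  key -> name,  value -> pawpaw, ...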


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/filter.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Filter(Itorator):
 8 |     def __init__(self, filter_: Types.P_ITO, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         if not (filter_ is None or type_magic.functoid_isinstance(filter_, Types.P_ITO)):
11 |             raise Errors.parameter_invalid_type('filter', filter_, Types.P_ITO)
12 |         self._filter = filter_
13 | 
14 |     def clone(self, tag: str | None = None) -> Filter:
15 |         return type(self)(self._filter, self.tag if tag is None else tag)
16 | 
17 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
18 |         if self._filter(ito):
19 |             yield ito
20 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/invert.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Invert(Itorator):
 8 |     def __init__(
 9 |             self,
10 |             itorator: Itorator,
11 |             desc: str | None = None,
12 |             tag: str | None = None):
13 |         super().__init__(tag)
14 |         self.itorator = itorator
15 |         self.desc = desc
16 | 
17 |     def clone(self, tag: str | None = None) -> Invert:
18 |         return type(self)(self.itorator, self.desc, self.tag if tag is None else tag)
19 | 
20 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
21 |         start = ito.start
22 |         for i in self.itorator(ito):
23 |             if start < i.start:
24 |                 yield ito.clone(start, i.start, desc=self.desc)
25 |             start = i.stop
26 | 
27 |         if start == ito.start:
28 |             yield ito.clone(desc=self.desc)
29 |         elif i.stop < ito.stop:
30 |             yield ito.clone(i.stop, ito.stop, self.desc)
31 | 
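
A small sketch of Invert (illustrative, not part of the file above), assuming Ito and Extract behave as in the preceding files; Invert yields the gaps that its wrapped itorator does not cover:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract, Invert

    digits = Extract(regex.compile(r'(?P<num>\d+)'))
    gaps = Invert(digits, desc='gap')
    print([str(i) for i in gaps(Ito('ab12cd34ef'))])  # expected: ['ab', 'cd', 'ef']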


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/itorator.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | from abc import ABC, abstractmethod
  3 | import itertools
  4 | import types
  5 | import typing
  6 | 
  7 | from pawpaw import Types, Errors, Ito, type_magic
  8 | from pawpaw.arborform.postorator.postorator import Postorator
  9 | 
 10 | 
 11 | class Connector(ABC):
 12 |     def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 13 |         if not isinstance(itorator, Itorator):
 14 |             raise Errors.parameter_invalid_type('itorator', itorator, Itorator)
 15 |         self.itorator = itorator
 16 | 
 17 |         if type_magic.functoid_isinstance(predicate, Types.P_ITO):
 18 |             self.predicate = predicate
 19 |         elif isinstance(predicate, str):
 20 |             self.predicate = lambda ito: ito.desc == predicate
 21 |         elif predicate is None:
 22 |             self.predicate = lambda ito: ito.desc is None
 23 |         else:
 24 |             raise Errors.parameter_invalid_type('predicate', predicate, Types.P_ITO, str, None)
 25 | 
 26 | 
 27 | class ChildrenConnector(Connector, ABC):
 28 |     def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 29 |         super().__init__(itorator, predicate)
 30 | 
 31 | 
 32 | class Connectors:
 33 |     # yield from f(cur)
 34 |     # break
 35 |     class Delegate(Connector):
 36 |         def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 37 |             super().__init__(itorator, predicate)
 38 | 
 39 |     # cur(s) ~= f(cur)
 40 |     class Recurse(Connector):
 41 |         def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 42 |             super().__init__(itorator, predicate)
 43 | 
 44 |     # f(cur)
 45 |     class Subroutine(Connector):
 46 |         def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 47 |             super().__init__(itorator, predicate)
 48 | 
 49 |     class Children:
 50 |         # cur.children.add(*f(cur))
 51 |         class Add(ChildrenConnector):
 52 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 53 |                 super().__init__(itorator, predicate)
 54 | 
 55 |         # cur.children.add_hierarchical(*f(cur))
 56 |         class AddHierarchical(ChildrenConnector):
 57 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 58 |                 super().__init__(itorator, predicate)
 59 | 
 60 |         # cur.children.clear
 61 |         # cur.children.add(*f(cur))
 62 |         class Replace(ChildrenConnector):
 63 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 64 |                 super().__init__(itorator, predicate)
 65 | 
 66 |         # for c in f(cur):
 67 |         #   cur.children.remove(c)
 68 |         class Delete(ChildrenConnector):  # REMOVE
 69 |             def __init__(self, itorator: Itorator, predicate: Types.P_ITO | str | None = lambda ito: True):
 70 |                 super().__init__(itorator, predicate)
 71 | 
 72 | 
 73 | class Itorator(ABC):
 74 |     @classmethod
 75 |     def __exhaust_iterator(cls, it: typing.Iterator):
 76 |         if not isinstance(it, typing.Iterator):
 77 |             raise Errors.parameter_invalid_type('it', it, typing.Iterator)
 78 |         
 79 |         while True:
 80 |             try:
 81 |                 next(it)
 82 |             except StopIteration:
 83 |                 break
 84 | 
 85 |     @classmethod
 86 |     def wrap(cls, src: Types.F_ITO_2_IT_ITOS, tag: str | None = None):
 87 |         if type_magic.functoid_isinstance(src, Types.F_ITO_2_IT_ITOS):
 88 |             return _WrappedItoratorEx(src, tag)
 89 | 
 90 |         raise Errors.parameter_invalid_type('src', src, Types.F_ITO_2_IT_ITOS)
 91 | 
 92 |     def __init__(self, tag: str | None = None):
 93 |         if tag is not None and not isinstance(tag, str):
 94 |             raise Errors.parameter_invalid_type('tag', tag, str)
 95 |         self._connections = list[Connector]()
 96 |         self.tag: str | None = tag
 97 |         self._postorator: Postorator | Types.F_ITOS_2_ITOS | None = None
 98 | 
 99 |     @abstractmethod
100 |     def clone(self, tag: str | None = None) -> Itorator:
101 |         ...
102 | 
103 |     @property
104 |     def connections(self) -> list[Connector]:
105 |         return self._connections
106 | 
107 |     @property
108 |     def postorator(self) -> Postorator | Types.F_ITOS_2_ITOS | None:
109 |         return self._postorator
110 | 
111 |     @postorator.setter
112 |     def postorator(self, val: Postorator | Types.F_ITOS_2_ITOS | None):
113 |         if val is None or isinstance(val, Postorator):
114 |             self._postorator = val
115 |         else:
116 |             raise Errors.parameter_invalid_type('val', val, Postorator, Types.F_ITOS_2_ITOS, types.NoneType)
117 | 
118 |     @abstractmethod
119 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
120 |         pass
121 | 
122 |     # postorator
123 |     def _post(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
124 |         if self._postorator is None:
125 |             yield from itos
126 |         else:
127 |             yield from self._postorator(itos)                
128 | 
129 |     # pipeline flow
130 |     def _flow(self, ito: Ito, con_idx: int) -> Types.C_IT_ITOS:
131 |         if con_idx >= len(self.connections):
132 |             yield ito
133 | 
134 |         else:
135 |             con = self._connections[con_idx]
136 |             if con.predicate(ito):
137 |                 if isinstance(con, Connectors.Delegate):
138 |                     yield from con.itorator._traverse(ito)
139 | 
140 |                 elif isinstance(con, ChildrenConnector):
141 |                     children = [*con.itorator._traverse(ito)]
142 | 
143 |                     if isinstance(con, Connectors.Children.Replace):
144 |                         ito.children.clear()
145 | 
146 |                     if isinstance(con, (Connectors.Children.Add, Connectors.Children.Replace)):
147 |                         ito.children.add(*children)
148 |                     elif isinstance(con, Connectors.Children.AddHierarchical):
149 |                         ito.children.add_hierarchical(*children)
150 |                     else:  # Connectors.Children.Delete
151 |                         for c in children:
152 |                             ito.children.remove(c)
153 | 
154 |                     yield from self._flow(ito, con_idx + 1)
155 | 
156 |                 elif isinstance(con, Connectors.Recurse):
157 |                     for sub in con.itorator._traverse(ito):
158 |                         yield from self._flow(sub, con_idx + 1)
159 | 
160 |                 elif isinstance(con, Connectors.Subroutine):
161 |                     self.__exhaust_iterator(con.itorator._traverse(ito))
162 |                     yield from self._flow(ito, con_idx + 1)
163 | 
164 |                 else:
165 |                     raise TypeError(f'Invalid connector: {con}')
166 | 
167 |             else:
168 |                 yield from self._flow(ito, con_idx + 1)
169 | 
170 |     # soup to nuts
171 |     def _traverse(self, ito: Ito) -> Types.C_IT_ITOS:
172 |         yield from self._post(itertools.chain.from_iterable(self._flow(i, 0) for i in self._transform(ito)))
173 | 
174 |     def __call__(self, ito: Ito) -> Types.C_IT_ITOS:
175 |         if not isinstance(ito, Ito):
176 |             raise Errors.parameter_invalid_type('ito', ito, Ito)
177 |         yield from self._traverse(ito.clone())
178 | 
179 | 
180 | class _WrappedItoratorEx(Itorator):
181 |     def __init__(self, f: Types.F_ITO_2_IT_ITOS, tag: str | None = None):
182 |         super().__init__(tag)
183 |         self.__f = f
184 | 
185 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
186 |         yield from self.__f(ito)
187 | 
188 |     def clone(self, tag: str | None = None) -> _WrappedItoratorEx:
189 |         return type(self)(self.__f, self.tag if tag is None else tag)
190 | 
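
A connection sketch (illustrative only, not from the repo): an Extract produces word itos, and a Children.Add connection runs a second Extract against each word and attaches the results as children. It assumes Ito construction from a str and that an ito's children can be iterated:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Connectors, Extract

    words = Extract(regex.compile(r'(?P<word>[a-z]+)'))
    vowels = Extract(regex.compile(r'(?P<vowel>[aeiou])'))
    words.connections.append(Connectors.Children.Add(vowels, 'word'))  # predicate: desc == 'word'

    for tok in words(Ito('pawpaw is a parser')):
        print(str(tok), [str(c) for c in tok.children])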


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/nuco.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import typing
 3 | 
 4 | from pawpaw import Ito, Types
 5 | from pawpaw.arborform.itorator import Itorator
 6 | 
 7 | class Nuco(Itorator):
 8 |     def __init__(self, *itorators: Itorator, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         self._itorators = list(itorators)
11 | 
12 |     def clone(self, tag: str | None = None) -> Nuco:
13 |         return type(self)(*self._itorators, tag=self.tag if tag is None else tag)
14 | 
15 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
16 |         for itor in self._itorators:
17 |             it = itor(ito)
18 |             try:
19 |                 yield next(it)
20 |                 yield from it
21 |                 break
22 |             except StopIteration:
23 |                 pass
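
Nuco behaves like a null-coalescing operator over itorators: it yields from the first one that produces output. A sketch (illustrative, not part of the file above), assuming Ito and Extract as before:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract, Nuco

    numbers = Extract(regex.compile(r'(?P<number>\d+)'))
    words = Extract(regex.compile(r'(?P<word>[a-z]+)'))
    first = Nuco(numbers, words)

    print([str(i) for i in first(Ito('abc 123'))])  # ['123']          (numbers win)
    print([str(i) for i in first(Ito('abc def'))])  # ['abc', 'def']   (fallback)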


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/reflect.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class Reflect(Itorator):
 8 |     def __init__(self, tag: str | None = None):
 9 |         super().__init__(tag)
10 | 
11 |     def clone(self, tag: str | None = None) -> Reflect:
12 |         return type(self)(self.tag if tag is None else tag)
13 | 
14 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
15 |         yield ito
16 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/regex_itorator.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | from abc import abstractmethod
 3 | import collections
 4 | import typing
 5 | import types
 6 | 
 7 | import regex
 8 | from pawpaw import GroupKeys, Ito, Types, Errors, type_magic
 9 | from pawpaw.arborform.itorator import Itorator
10 | 
11 | 
12 | class RegexItorator(Itorator):
13 |     def __init__(self,
14 |                  re: regex.Pattern,
15 |                  group_filter: collections.abc.Container[Types.C_GK] | Types.P_M_GK = lambda m, gk: True,
16 |                  tag: str | None = None):
17 |         super().__init__(tag)
18 |         
19 |         self._group_keys: list[Types.C_GK]
20 |         self.re = re  # sets ._group_keys
21 |         self.group_filter = group_filter
22 | 
23 |     @property
24 |     def re(self) -> regex.Pattern:
25 |         return self._re
26 | 
27 |     @re.setter
28 |     def re(self, re: regex.Pattern) -> None:
29 |         if not isinstance(re, regex.Pattern):
30 |             raise Errors.parameter_invalid_type('re', re, regex.Pattern)
31 |         self._re = re
32 |         self._group_keys = GroupKeys.preferred(re)
33 | 
34 |     @property
35 |     def group_filter(self) -> collections.abc.Container[Types.C_GK] | Types.P_M_GK:
36 |         return self._group_filter
37 | 
38 |     @group_filter.setter
39 |     def group_filter(self, group_filter: collections.abc.Container[Types.C_GK] | Types.P_M_GK) -> None:
40 |         if type_magic.isinstance_ex(group_filter, collections.abc.Container[Types.C_GK]):
41 |             GroupKeys.validate(self._re, group_filter)
42 |             self._group_filter = group_filter
43 |         elif type_magic.functoid_isinstance(group_filter, Types.P_M_GK):
44 |             self._group_filter = group_filter
45 |         else:
46 |             raise Errors.parameter_invalid_type('group_filter', group_filter, collections.abc.Container[Types.C_GK], Types.P_M_GK)
47 | 
48 |     def clone(self, tag: str | None = None) -> RegexItorator:
49 |         return type(self)(self._re, self._group_filter, self.tag if tag is None else tag)
50 | 
51 |     @abstractmethod
52 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
53 |         pass
54 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/split.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import enum
  3 | import types
  4 | import typing
  5 | import itertools
  6 | 
  7 | import regex
  8 | from pawpaw import Errors, Span, Ito, Types, type_magic
  9 | from pawpaw.arborform.itorator import Itorator, Extract
 10 | 
 11 | 
 12 | class Split(Itorator):
 13 |     @enum.unique
 14 |     class BoundaryRetention(enum.Enum):
 15 |         NONE = 0
 16 |         LEADING = 1
 17 |         TRAILING = 2
 18 |         ALL = 3
 19 | 
 20 |     def __init__(
 21 |             self,
 22 |             splitter: Itorator | regex.Pattern,
 23 |             limit: int | None = None,
 24 |             boundary_retention: BoundaryRetention = BoundaryRetention.NONE,
 25 |             return_zero_split: bool = True,
 26 |             desc: str | None = None,
 27 |             tag: str | None = None
 28 |     ):
 29 |         """Given P-O-O-S where P is prefix, - is boundary, O is/are middle segments(s), and S is suffix, the
 30 |         behavior is as follows:
 31 | 
 32 |           * BoundaryRetention.NONE -> P O O S : boundaries are discarded (this is an 'ordinary' split operation)
 33 | 
 34 |           * BoundaryRetention.LEADING -> -O -O -S : boundaries kept as prefixes, leading P is discarded
 35 | 
 36 |           * BoundaryRetention.TRAILING -> P- O- O- : boundaries kept as suffixes, trailing S is discarded
 37 | 
 38 |           * BoundaryRetention.ALL -> P - O - O - S : all non-zero-length boundaries are kept as distinct itos
 39 | 
 40 |         Zero-length boundaries are allowable, and any resulting empty Ito's are discarded
 41 | 
 42 |        Args:
 43 |         splitter: An Itorator used to generate boundaries; if a regex.Pattern is supplied,
 44 |           splitter is set to a pawpaw.itorator.Extract as follows:
 45 | 
 46 |             splitter = pawpaw.arborform.Extract(
 47 |                 re,
 48 |                 desc = lambda match, group_key: None,
 49 |                 group_filter = lambda m, gk: gk == 0,
 50 |                 tag = f'generated Split for \\{re.pattern}\\'
 51 |             )
 52 | 
 53 |         limit: The maximum number of boundaries used for splitting; None (the default)
 54 |           means no limit
 55 | 
 56 | 
 57 | 
 58 |         boundary_retention: A rule used to determine if boundaries are discarded, or else how they are kept
 59 | 
 60 |         return_zero_split: Indicates how to handle the zero-split condition; when True and no splits occur,
 61 |           returns a list containing a clone of the input Ito; when False and no splits occur, returns an
 62 |           empty list
 63 | 
 64 |         desc: Value used for the .desc of any yielded non-boundary Itos.
 65 |         """
 66 |         super().__init__(tag)
 67 |         
 68 |         if isinstance(splitter, Itorator):
 69 |             self.splitter = splitter
 70 |         elif isinstance(splitter, regex.Pattern):
 71 |             self.splitter = Extract(
 72 |                 splitter,
 73 |                 desc = lambda match, group_key: None,
 74 |                 group_filter = lambda m, gk: gk == 0,
 75 |                 tag = f'generated Split for \\{splitter.pattern}\\'
 76 |             )
 77 |         else:
 78 |             raise Errors.parameter_invalid_type('splitter', splitter, Itorator, regex.Pattern)
 79 |         
 80 |         if not isinstance(limit, (int, type(None))):
 81 |             raise Errors.parameter_invalid_type('limit', limit, int, types.NoneType)
 82 |         self.limit = limit
 83 | 
 84 |         if not isinstance(boundary_retention, self.BoundaryRetention):
 85 |             raise Errors.parameter_invalid_type('boundary_retention', boundary_retention, self.BoundaryRetention)
 86 |         self.boundary_retention = boundary_retention
 87 | 
 88 |         if not isinstance(return_zero_split, bool):
 89 |             raise Errors.parameter_invalid_type('return_zero_split', return_zero_split, bool)
 90 |         self.return_zero_split = return_zero_split
 91 | 
 92 |         if not isinstance(desc, (str, type(None))):
 93 |             raise Errors.parameter_invalid_type('desc', desc, str, types.NoneType)
 94 |         self.desc = desc
 95 | 
 96 |     def clone(self, tag: str | None = None) -> Split:
 97 |         return type(self)(
 98 |             self.splitter,
 99 |             self.limit,
100 |             self.boundary_retention,
101 |             self.return_zero_split,
102 |             self.desc,
103 |             self.tag if tag is None else tag)
104 | 
105 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
106 |         if self.limit == 0 and self.return_zero_split:
107 |             return ito.clone(desc=self.desc, clone_children=False),
108 | 
109 |         rv: typing.List[Ito] = []
110 |         
111 |         count = 0
112 |         prior: Span | None = None
113 |         for cur in itertools.takewhile(lambda i: self.limit is None or count < self.limit, self.splitter(ito)):
114 |             if prior is None:
115 |                 if self.boundary_retention == self.BoundaryRetention.LEADING:
116 |                     start = stop = 0
117 |                 else:
118 |                     start = ito.start
119 |                     if self.boundary_retention in (self.BoundaryRetention.NONE, self.BoundaryRetention.ALL):
120 |                         stop = cur.start
121 |                     else:  # TRAILING
122 |                         stop = cur.stop
123 |             else:
124 |                 if self.boundary_retention in (self.BoundaryRetention.NONE, self.BoundaryRetention.ALL):
125 |                     start = prior.stop
126 |                     stop = cur.start
127 |                 elif self.boundary_retention == self.BoundaryRetention.LEADING:
128 |                     start = prior.start
129 |                     stop = cur.start
130 |                 else:  # TRAILING
131 |                     start = prior.stop
132 |                     stop = cur.stop
133 | 
134 |             count += 1
135 | 
136 |             if start != stop:
137 |                 rv.append(ito.clone(start, stop, self.desc, False))
138 | 
139 |             if self.boundary_retention == self.BoundaryRetention.ALL and (cur.start < cur.stop):
140 |                 rv.append(cur)
141 | 
142 |             prior = cur
143 | 
144 |         if prior is not None and self.boundary_retention != self.BoundaryRetention.TRAILING:
145 |             if self.boundary_retention in (self.BoundaryRetention.NONE, self.BoundaryRetention.ALL):
146 |                 start = prior.stop
147 |             else:  # LEADING
148 |                 start = prior.start
149 |             stop = ito.stop
150 |             if start != stop:
151 |                 rv.append(ito.clone(start, stop, self.desc, False))
152 | 
153 |         if prior is None and len(rv) == 0 and self.return_zero_split:
154 |             rv.append(ito.clone(desc=self.desc, clone_children=False))
155 | 
156 |         return rv
157 | 
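
A sketch of Split's boundary handling (illustrative only, not from the repo), assuming Ito construction from a str:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Split

    ito = Ito('a, b, c')
    sep = regex.compile(r',\s*')

    print([str(i) for i in Split(sep)(ito)])
    # ['a', 'b', 'c']    boundaries discarded (BoundaryRetention.NONE)

    trailing = Split(sep, boundary_retention=Split.BoundaryRetention.TRAILING)
    print([str(i) for i in trailing(ito)])
    # ['a, ', 'b, ']     boundaries kept as suffixes; trailing segment discarded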


--------------------------------------------------------------------------------
/pawpaw/arborform/itorator/value_func.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.itorator import Itorator
 5 | 
 6 | 
 7 | class ValueFunc(Itorator):
 8 |     def __init__(self, f: Types.F_ITO_2_VAL | None, tag: str | None = None):
 9 |         super().__init__(tag)
10 |         if not (f is None or type_magic.functoid_isinstance(f, Types.F_ITO_2_VAL)):
11 |             raise Errors.parameter_invalid_type('f', f, Types.F_ITO_2_VAL, None)
12 |         self.f = f
13 | 
14 |     def clone(self, tag: str | None = None) -> ValueFunc:
15 |         return type(self)(self.f, self.tag if tag is None else tag)
16 | 
17 |     def _transform(self, ito: Ito) -> Types.C_IT_ITOS:
18 |         ito.value_func = self.f
19 |         yield ito
20 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/__init__.py:
--------------------------------------------------------------------------------
1 | from .postorator import Postorator
2 | del postorator
3 | 
4 | from .windowed_join import WindowedJoin
5 | del windowed_join
6 | 
7 | from .stacked_reduce import StackedReduce
8 | del stacked_reduce


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/postorator.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | from pawpaw import Types, type_magic, Errors
 4 | 
 5 | 
 6 | class Postorator(ABC):
 7 |     @classmethod
 8 |     def wrap(cls, func: Types.F_ITOS_2_ITOS, tag: str | None = None):
 9 |         if type_magic.functoid_isinstance(func, Types.F_ITOS_2_ITOS):
10 |             return _WrappedPostorator(func, tag)
11 | 
12 |         raise Errors.parameter_invalid_type('func', func, Types.F_ITOS_2_ITOS)        
13 | 
14 |     def __init__(self, tag: str | None = None):
15 |         if tag is not None and not isinstance(tag, str):
16 |             raise Errors.parameter_invalid_type('tag', tag, str)
17 |         self.tag = tag
18 | 
19 |     @abstractmethod
20 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
21 |         ...
22 | 
23 |     def __call__(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
24 |         yield from self._transform(itos)
25 | 
26 | 
27 | class _WrappedPostorator(Postorator):
28 |     def __init__(self, f: Types.F_ITOS_2_ITOS, tag: str | None = None):
29 |         super().__init__(tag)
30 |         self.__f = f
31 | 
32 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
33 |         yield from self.__f(itos)
34 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/stacked_reduce.py:
--------------------------------------------------------------------------------
 1 | import typing
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.postorator import Postorator
 5 | 
 6 |        
 7 | class StackedReduce(Postorator):
 8 |     F_SQ_ITOS_2_ITO = typing.Callable[[Types.C_SQ_ITOS], Ito]
 9 |     P_SQ_ITOS_ITO = typing.Callable[[Types.C_SQ_ITOS, Ito], bool]
10 |     
11 |     def __init__(
12 |             self,
13 |             reduce_func: F_SQ_ITOS_2_ITO,
14 |             push_predicate: P_SQ_ITOS_ITO,
15 |             pop_predicate: P_SQ_ITOS_ITO | None = None,
16 |             tag: str | None = None
17 |     ):
18 |         super().__init__(tag)
19 |         if not type_magic.functoid_isinstance(reduce_func, self.F_SQ_ITOS_2_ITO):
20 |             raise Errors.parameter_invalid_type('reduce_func', reduce_func, self.F_SQ_ITOS_2_ITO)
21 |         self.reduce_func = reduce_func
22 | 
23 |         if not type_magic.functoid_isinstance(push_predicate, self.P_SQ_ITOS_ITO):
24 |             raise Errors.parameter_invalid_type('push_predicate', push_predicate, self.P_SQ_ITOS_ITO)
25 |         self.push_predicate = push_predicate
26 | 
27 |         if pop_predicate is None or type_magic.functoid_isinstance(pop_predicate, self.P_SQ_ITOS_ITO):
28 |             self.pop_predicate = pop_predicate
29 |         else:
30 |             raise Errors.parameter_invalid_type('pop_predicate', pop_predicate, self.P_SQ_ITOS_ITO, None)
31 | 
32 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
33 |         stack: typing.List[Ito] = []
34 |         for ito in itos:
35 |             if len(stack) > 0:
36 |                 if self.pop_predicate is not None and self.pop_predicate(stack, ito):
37 |                     yield self.reduce_func(stack)
38 |                     stack.clear()
39 |                 else:
40 |                     stack.append(ito)
41 | 
42 |             if len(stack) == 0:
43 |                 if self.push_predicate(stack, ito):
44 |                     stack.append(ito)
45 |                 else:
46 |                     yield ito
47 | 
48 |         if len(stack) > 0:
49 |             yield self.reduce_func(stack)
50 | 


--------------------------------------------------------------------------------
/pawpaw/arborform/postorator/windowed_join.py:
--------------------------------------------------------------------------------
 1 | import typing
 2 | 
 3 | from pawpaw import Ito, Types, Errors, type_magic
 4 | from pawpaw.arborform.postorator import Postorator
 5 | 
 6 | 
 7 | class WindowedJoin(Postorator):
 8 |     F_SQ_ITOS_2_B = typing.Callable[[Types.C_SQ_ITOS], bool]
 9 |     
10 |     def __init__(
11 |             self,
12 |             window_size: int,
13 |             predicate: F_SQ_ITOS_2_B,
14 |             ito_class: Ito = Ito,
15 |             desc: str | None = None,
16 |             tag: str | None = None
17 |     ):
18 |         super().__init__(tag)
19 |         if not isinstance(window_size, int):
20 |             raise Errors.parameter_invalid_type('window_size', window_size, int)
21 |         if window_size < 2:
22 |             raise ValueError(f'parameter \'window_size\' has value '
23 |                              f'{window_size:,}, but must be greater than or equal to 2')
24 |         self.window_size = window_size
25 | 
26 |         if not type_magic.functoid_isinstance(predicate, self.F_SQ_ITOS_2_B):
27 |             raise Errors.parameter_invalid_type('predicate', predicate, self.F_SQ_ITOS_2_B)
28 |         self.predicate = predicate
29 | 
30 |         if not issubclass(ito_class, Ito):
31 |             raise ValueError(f'parameter \'ito_class\' ({ito_class}) is not an \'{Ito}\' or subclass.')
32 |         self.ito_class = ito_class
33 | 
34 |         self.desc = desc
35 | 
36 |     def _transform(self, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
37 |         window: typing.List[Ito] = []
38 |         for ito in itos:
39 |             window.append(ito)
40 |             if len(window) == self.window_size:
41 |                 if self.predicate(window):
42 |                     yield self.ito_class.join(*window, desc=self.desc)
43 |                     window.clear()
44 |                 else:
45 |                     yield window.pop(0)
46 | 
47 |         yield from window
48 | 
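
A sketch of WindowedJoin used as a postorator (illustrative, not part of the file above): word itos are buffered two at a time and joined whenever the second word is 'york'. It assumes Ito construction from a str:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract, WindowedJoin

    words = Extract(regex.compile(r'(?P<word>[a-z]+)'))
    words.postorator = WindowedJoin(2, lambda w: str(w[1]) == 'york')

    print([str(i) for i in words(Ito('i flew to new york yesterday'))])
    # expected: ['i', 'flew', 'to', 'new york', 'yesterday']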


--------------------------------------------------------------------------------
/pawpaw/errors.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import inspect
 3 | import types
 4 | import typing
 5 | import enum
 6 | 
 7 | 
 8 | class Errors:
 9 |     @classmethod
10 |     def parameter_not_none(cls, name: str) -> ValueError:
11 |         return ValueError(f'parameter \'{name}\' can not be None')
12 | 
13 |     @classmethod
14 |     def parameter_neither_none_nor_empty(cls, name: str) -> ValueError:
15 |         return ValueError(f'parameter \'{name}\' can be neither None nor empty')
16 | 
17 |     @classmethod
18 |     def parameter_enum_not_in(cls, name: str, value: typing.Any, enum_: enum.Enum) -> ValueError:
19 |         return ValueError(f'parameter \'{name}\' is not a valid {enum_.__name__}')
20 | 
21 |     @classmethod
22 |     def _get_type_strs(cls, *allowed) -> typing.Iterable[str]:
23 |         for t in allowed:
24 |             if hasattr(t, '__qualname__'):
25 |                 if t.__qualname__ == 'Callable':
26 |                     yield str(t)
27 |                 else:
28 |                     yield t.__qualname__
29 |             elif hasattr(t, '__bound__'):
30 |                 yield from cls._get_type_strs(t.__bound__)
31 |             elif typing.get_origin(t) is types.UnionType:
32 |                 args = typing.get_args(t)
33 |                 yield from cls._get_type_strs(*args)
34 |             elif t is None:
35 |                 yield 'None'
36 |             else:
37 |                 yield repr(t)
38 | 
39 |     @classmethod
40 |     def _build_types_str(cls, *allowed: typing.Type) -> str:
41 |         return ' or '.join(cls._get_type_strs(*allowed))
42 | 
43 |     @classmethod
44 |     def parameter_invalid_type(cls, name: str, value: typing.Any, *allowed: typing.Type) -> TypeError:
45 |         actual = str(inspect.signature(value)) if callable(value) else repr(value)
46 |         return TypeError(f'parameter \'{name}\' must be type {cls._build_types_str(*allowed)}, not {actual}')
47 | 
48 |     @classmethod
49 |     def parameter_iterable_contains_invalid_type(cls, name: str, value: typing.Any, *allowed: typing.Type) -> TypeError:
50 |         actual = str(inspect.signature(value)) if callable(value) else repr(value)
51 |         return TypeError(f'parameter \'{name}\' must contain elements of type {cls._build_types_str(*allowed)}, however, it contains an element of type {actual}: {value}')
52 | 


--------------------------------------------------------------------------------
/pawpaw/infix.py:
--------------------------------------------------------------------------------
 1 | """Infix operator class recipe from https://code.activestate.com/recipes/384122
 2 | 
 3 | Returns:
 4 |     An infix that can be called using either:
 5 |     
 6 |         x |op| y
 7 |         or
 8 |         x <<op>> y
 9 | """
10 | class Infix:
11 |     def __init__(self, function):
12 |         self.function = function
13 | 
14 |     def __ror__(self, other):
15 |         return Infix(lambda x: self.function(other, x))
16 | 
17 |     def __or__(self, other):
18 |         return self.function(other)
19 | 
20 |     def __rlshift__(self, other):
21 |         return Infix(lambda x, self=self, other=other: self.function(other, x))
22 | 
23 |     def __rshift__(self, other):
24 |         return self.function(other)
25 | 
26 |     def __call__(self, value1, value2):
27 |         return self.function(value1, value2)
28 | 
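
A hypothetical operator built with Infix (illustrative, not part of the library; the name 'coalesce' is made up for this sketch):

    from pawpaw import Infix

    coalesce = Infix(lambda a, b: a if a is not None else b)

    print(None |coalesce| 'fallback')     # fallback
    print('value' |coalesce| 'fallback')  # value
    print(3 <<coalesce>> 5)               # 3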


--------------------------------------------------------------------------------
/pawpaw/nlp/__init__.py:
--------------------------------------------------------------------------------
1 | from .nlp import byte_order_controls, unicode_white_space_other, unicode_single_quote_marks, unicode_double_quote_marks, unicode_bullets, trimmable_ws
2 | 
3 | from .nlp import Number, KeyedPrefix, Paragraph, Sentence, SimpleNlp
4 | del nlp
5 | 


--------------------------------------------------------------------------------
/pawpaw/ontology/__init__.py:
--------------------------------------------------------------------------------
1 | from .ontology import Discoveries, Ontology
2 | del ontology
3 | 
4 | from ._query import OPERATORS, MUST_ESCAPE_CHARS, escape, descape, Query, compile, find_all, find
5 | del _query


--------------------------------------------------------------------------------
/pawpaw/ontology/ontology.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import itertools
 3 | import typing
 4 | 
 5 | from pawpaw import Ito, Types
 6 | from pawpaw.arborform import Itorator
 7 | import regex
 8 | 
 9 | 
10 | class Discoveries(dict):
11 |     def __init__(self, *args, **kwargs):
12 |         self._itos: list[Ito] = list(kwargs.pop('itos', tuple()))
13 |         dict.__init__(self, *args, **kwargs )
14 | 
15 |     @property
16 |     def itos(self) -> list[Ito]:
17 |         return self._itos   
18 |     
19 |     def __str__(self):
20 |         c = ', '.join(f'{k}: {str(v)}' for k, v in self.items())
21 |         return f'{{itos: {[str(i) for i in self._itos]}, {c}}}'
22 |     
23 |     def _flatten(self, filter_empties: bool = True, path: Types.C_OPATH = tuple()) -> dict[Types.C_OPATH, list[Ito]]:
24 |         rv = {} if len(self.itos) == 0 and filter_empties else {tuple(path): self.itos}
25 |         for key in self.keys():
26 |             rv |= self[key]._flatten(filter_empties, path + (key,))
27 |         return rv
28 | 
29 |     def flatten(self, filter_empties: bool = True) -> dict[Types.C_OPATH, list[Ito]]:
30 |         return self._flatten(filter_empties, )
31 |     
32 |     def walk(self) -> Types.C_IT_ITOS:
33 |         yield from self._itos
34 |         for child in self.values():
35 |             yield from child.walk()
36 | 
37 | 
38 | class Ontology(dict):
39 |     def __missing__(self, key):
40 |         if isinstance(key, typing.Sequence) and (lk := len(key)) > 0 and not isinstance(key, str):
41 |             rv = self[key[0]]
42 |             if lk > 1:
43 |                 rv = rv[key[1:]]
44 |             return rv
45 |         else:
46 |             raise KeyError(key)
47 | 
48 |     def __init__(self, *args, **kwargs):
49 |         self._rules: list[Types.C_ORULE] = kwargs.pop('rules', [])
50 |         dict.__init__(self, *args, **kwargs )
51 | 
52 |     @property
53 |     def rules(self) -> list[Types.C_ORULE]:
54 |         return self._rules
55 | 
56 |     def __str__(self):
57 |         c = ', '.join(f'{k}: {str(v)}' for k, v in self.items())
58 |         return f'{{rules: {self._rules}, {c}}}'   
59 |     
60 |     def discover(self, *itos: Ito) -> Discoveries:
61 |         rv = Discoveries()
62 | 
63 |         for rule in self._rules:
64 |             for i in itos:
65 |                 rv.itos.extend(rule(i))
66 | 
67 |         for k, v in self.items():
68 |             rv[k] = v.discover(*itos)
69 | 
70 |         return rv
71 | 
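
A discovery sketch (illustrative only, not from the repo): ontology rules are callables that take an Ito and return matching itos, so an Extract itorator can serve directly as a rule. Assumes Ito construction from a str:

    import regex
    from pawpaw import Ito
    from pawpaw.arborform import Extract
    from pawpaw.ontology import Ontology

    ontology = Ontology(
        vehicle=Ontology(
            car=Ontology(rules=[Extract(regex.compile(r'(?P<car>sedan|coupe)'))]),
            truck=Ontology(rules=[Extract(regex.compile(r'(?P<truck>pickup)'))]),
        )
    )

    discoveries = ontology.discover(Ito('a coupe and a pickup'))
    for path, itos in discoveries.flatten().items():
        print(path, [str(i) for i in itos])   # ('vehicle', 'car') ['coupe'] ...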


--------------------------------------------------------------------------------
/pawpaw/query/__init__.py:
--------------------------------------------------------------------------------
1 | from ._query import OPERATORS, FILTER_KEYS, MUST_ESCAPE_CHARS, escape, descape, Query, compile, find_all, find
2 | del _query
3 | 


--------------------------------------------------------------------------------
/pawpaw/span.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | import collections.abc
 3 | import types
 4 | import typing
 5 | 
 6 | from pawpaw.errors import Errors
 7 | 
 8 | 
 9 | class Span(typing.NamedTuple):
10 |     start: int
11 |     stop: int
12 |         
13 |     @classmethod
14 |     def from_indices(
15 |         cls,
16 |         basis: int | collections.abc.Sized,
17 |         start: int | None = None,
18 |         stop: int | None = None
19 |     ) -> Span:
20 |         if isinstance(basis, int):
21 |             length = basis
22 |         elif isinstance(basis, collections.abc.Sized):
23 |             length = len(basis)
24 |         else:
25 |             raise Errors.parameter_invalid_type('basis', basis, int, collections.abc.Sized)
26 | 
27 |         if start is None:
28 |             start = 0
29 |         elif not isinstance(start, int):
30 |             raise Errors.parameter_invalid_type('start', start, int, types.NoneType)
31 |         else:
32 |             start = min(length, start) if start >= 0 else max(0, length + start)
33 | 
34 |         if stop is None:
35 |             stop = length
36 |         elif not isinstance(stop, int):
37 |             raise Errors.parameter_invalid_type('stop', stop, int, types.NoneType)
38 |         else:
39 |             stop = min(length, stop) if stop >= 0 else max(0, length + stop)
40 |             
41 |         stop = max(start, stop)
42 | 
43 |         return Span(start, stop)
44 | 
45 |     def offset(self, i: int) -> Span:
46 |         if not isinstance(i, int):
47 |             raise Errors.parameter_invalid_type('i', i, int)
48 |             
49 |         if i == 0:
50 |             return self
51 |         
52 |         rv = Span(self.start + i, self.stop + i)
53 |         if rv.start < 0 or rv.stop < 0:
54 |             raise ValueError(f'offsetting by {i:,} results in negative indices')
55 |         
56 |         return rv
57 | 
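
Span.from_indices normalizes start/stop the way str slicing does; a short sketch using only what is defined above:

    from pawpaw import Span

    s = 'pawpaw'
    print(Span.from_indices(s))          # Span(start=0, stop=6)
    print(Span.from_indices(s, -3))      # Span(start=3, stop=6)
    print(Span.from_indices(10, 2, -2))  # Span(start=2, stop=8)
    print(Span(2, 5).offset(3))          # Span(start=5, stop=8)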


--------------------------------------------------------------------------------
/pawpaw/table/__init__.py:
--------------------------------------------------------------------------------
1 | from .table import *
2 | del table
3 | 
4 | import pawpaw.table.styles


--------------------------------------------------------------------------------
/pawpaw/table/styles/__init__.py:
--------------------------------------------------------------------------------
1 | from .styles import *
2 | del styles
3 | 


--------------------------------------------------------------------------------
/pawpaw/table/styles/styles.py:
--------------------------------------------------------------------------------
  1 | from pawpaw.table import TableStyle
  2 | 
  3 | """
  4 | Notes:
  5 | 
  6 | MUST HAVE CHARACTERISTICS:
  7 | 
  8 |     - Must be able to determine start and stop in order to identify within larger
  9 |         unstructured text
 10 | 
 11 |     - Have a way to distinguish columns and rows (i.e., a table represented with tabs
 12 |         doesn't allow for row delineations)
 13 | 
 14 |     - Optionally has a header row(s)
 15 | """
 16 | 
 17 | """
 18 | Style 1 [Unnamed]
 19 | 
 20 | -----+-----+-----
 21 |   A  |  B  |  C
 22 | -----+-----+-----      
 23 |  aaa | bbb | ccc
 24 | -----+-----+-----      
 25 | """
 26 | 
 27 | p = r'(?:-{2,}(?:\+-+)+)'
 28 | TYPE_1 = TableStyle(
 29 |     table_start_pat = p,
 30 |     row_sep_pat = p,
 31 |     equi_distant_indent=False
 32 | )
 33 | del p
 34 | 
 35 | 
 36 | """
 37 | Style 2 [Unnamed]
 38 | 
 39 | -------------------
 40 | |  A  |  B  |  C  |
 41 | |-----------------|
 42 | | aaa | bbb | ccc |
 43 | -------------------     
 44 | """
 45 | 
 46 | p = r'-{2,}'
 47 | TYPE_2 = TableStyle(
 48 |     table_start_pat = p,
 49 |     row_sep_pat = r'\|(?:-+\|)+',
 50 |     table_end_pat = p,
 51 |     equi_distant_indent=True
 52 | )
 53 | del p
 54 | 
 55 | 
 56 | """
 57 | markdown
 58 | 
 59 |     | A | B | C |
 60 |     |---|:-:|--:|
 61 |     | a | b | c |
 62 |     | d | e | f |
 63 | """
 64 | 
 65 | """
 66 | reStructuredText
 67 | 
 68 |     2.a rst Simple Table
 69 | 
 70 |     =====  =====  =======
 71 |     A      B      A and B
 72 |     =====  =====  =======
 73 |     False  False  False
 74 |     True   False  False
 75 |     False  True   False
 76 |     True   True   True
 77 |     =====  =====  =======
 78 | 
 79 |     2.b rst Grid Table
 80 | 
 81 |     +------------+------------+-----------+
 82 |     | Header 1   | Header 2   | Header 3  |
 83 |     +============+============+===========+
 84 |     | body row 1 | column 2   | column 3  |
 85 |     +------------+------------+-----------+
 86 |     | body row 2 | Cells may span columns.|
 87 |     +------------+------------+-----------+
 88 |     | body row 3 | Cells may  | - Cells   |
 89 |     +------------+ span rows. | - contain |
 90 |     | body row 4 |            | - blocks. |
 91 |     +------------+------------+-----------+
 92 | """
 93 | 
 94 | """
 95 | ASCII doc
 96 | 
 97 |     [cols="e,m,^,>s",width="25%"]
 98 |     |============================
 99 |     |1 >s|2 |3 |4
100 |     ^|5 2.2+^.^|6 .3+<.>m|7
101 |     ^|8
102 |     |9 2+>|10
103 |     |============================
104 | """
105 |     
106 | """
107 | ASCII Misc
108 | 
109 |     pipe, hypen, plus
110 |     
111 |     +---+---+---+
112 |     | A | B | C |
113 |     +---+---+---+
114 |     | a | b | c |
115 |     +---+---+---+
116 |     | d | e | f |
117 |     +---+---+---+
118 | 
119 |     pipe, em-dash, plus
120 | 
121 |     +———+———+———+
122 |     | A | B | C |
123 |     +———+———+———+
124 |     | a | b | c |
125 |     +———+———+———+
126 |     | d | e | f |
127 |     +———+———+———+
128 | 
129 |     misc ascii box drawing line styles
130 |     
131 |     ┌───┬───┬───┐
132 |     │ A │ B │ C │
133 |     ├───┼───┼───┤
134 |     │ a │ b │ c │
135 |     ├───┼───┼───┤
136 |     │ d │ e │ f │
137 |     └───┴───┴───┘    
138 | 
139 |     ┏━━━┳━━━┳━━━┓
140 |     ┃ A ┃ B ┃ C ┃
141 |     ┣━━━╋━━━╋━━━┫
142 |     ┃ a ┃ b ┃ c ┃
143 |     ┣━━━╋━━━╋━━━┫
144 |     ┃ d ┃ e ┃ f ┃
145 |     ┗━━━┻━━━┻━━━┛
146 | 
147 |     ┏━━━┳━━━┳━━━┓
148 |     ┃ A ┃ B ┃ C ┃
149 |     ┡━━━╇━━━╇━━━┩
150 |     │ a │ b │ c │
151 |     ├───┼───┼───┤
152 |     │ d │ e │ f │
153 |     └───┴───┴───┘
154 |         
155 |     ╔═══╦═══╦═══╗
156 |     ║ A ║ B ║ C ║
157 |     ╠═══╬═══╬═══╣
158 |     ║ a ║ b ║ c ║
159 |     ╠═══╬═══╬═══╣
160 |     ║ d ║ e ║ f ║
161 |     ╚═══╩═══╩═══╝    
162 | 
163 |     ╔═══╤═══╤═══╗
164 |     ║ A │ B │ C ║
165 |     ╟───┼───┼───╢
166 |     ║ a │ b │ c ║
167 |     ╟───┼───┼───╢
168 |     ║ d │ e │ f ║
169 |     ╚═══╧═══╧═══╝    
170 | """
171 | 
172 | 


--------------------------------------------------------------------------------
/pawpaw/table/table.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod, abstractproperty
 2 | import dataclasses
 3 | 
 4 | import regex
 5 | import pawpaw
 6 | 
 7 | 
 8 | class Table(ABC):
 9 |     @property
10 |     @abstractmethod
11 |     def re(self) -> regex.Pattern:
12 |         ...
13 | 
14 |     @abstractmethod
15 |     def get_itor(self) -> pawpaw.arborform.Itorator:
16 |         ...
17 | 
18 | 
19 | @dataclasses.dataclass
20 | class TableStyle:
21 |     pre_caption_pat: str | None = None
22 |     table_start_pat: str = ''
23 |     header_row_end_pat: str | None = None
24 |     row_sep_pat: str = ''
25 |     table_end_pat: str | None = None
26 |     post_caption_pat: str | None = None
27 |     equi_distant_indent: bool = True
28 | 
29 | 
30 | class StyledTable(Table):
31 |     # finds equidistant indentation (zero or more spaces or tabs) chunks
32 |     _pat_indent = r'[ \t]*'
 33 |     _re_equi_ident = regex.compile(rf'(?<=^|\n)(?P<chunk>(?P<indent>{_pat_indent})[^ \t][^\n]+?\n(?:(?P=indent)[^ \t][^\n]+?(?:\n|$))+)', regex.DOTALL)
34 | 
35 |     @classmethod
36 |     def _build_re(cls, style: TableStyle) -> regex.Pattern:
37 |         re = r'(?<=^|\n)'
38 | 
39 |         if style.equi_distant_indent:
 40 |             re = rf'(?P<indent>{cls._pat_indent})'
41 |             pat_indent = r'(?P=indent)'
42 |         else:
43 |             pat_indent = r''
44 | 
 45 |         re += r'(?<table>'
46 | 
47 |         if style.pre_caption_pat is not None:
 48 |             re += rf'(?:(?<pre_caption>{style.pre_caption_pat})\n{pat_indent})?'
49 | 
50 |         re += rf'{style.table_start_pat}'
51 | 
52 |         if style.header_row_end_pat is not None:
 53 |             re += rf'(?:\n{pat_indent}(?<header_row>.+?)\n{pat_indent}{style.header_row_end_pat})?'
54 |             
55 |         if style.table_end_pat is None:
 56 |             re += rf'(?:\n{pat_indent}(?<row>.+?)\n{pat_indent}{style.row_sep_pat})+'
57 |         else:
 58 |             re += rf'(?:\n{pat_indent}(?<row>.+?)\n{pat_indent}{style.row_sep_pat})*\n{pat_indent}(?<row>.+?)'
59 |             re += rf'\n{pat_indent}{style.table_end_pat}'
60 |             
61 |         if style.post_caption_pat is not None:
 62 |             re += rf'\n{pat_indent}(?<post_caption>{style.post_caption_pat})(?=\n|$)'
63 | 
64 |         re += r')(?=$|\n)'
65 | 
66 |         return regex.compile(re, regex.DOTALL)
67 | 
68 |     def __init__(self, style: TableStyle, tag: str | None = None):
69 |         self.style = style
70 |         self._re = self._build_re(style)
71 |         self.tag = tag
72 | 
73 |     @property
74 |     def re(self) -> regex.Pattern:
75 |         return self._re
76 | 
77 |     def get_itor(self) -> pawpaw.arborform.Itorator:
78 |         itor_table = pawpaw.arborform.Extract(self._re, tag=self.tag, group_filter=lambda m, gk: gk in ('pre_caption', 'table', 'header_row', 'row', 'post_caption'))
79 |         if not self.style.equi_distant_indent:
80 |             return itor_table
81 | 
82 |         itor_equi_ident = pawpaw.arborform.Extract(self._re_equi_ident, tag='equidistant indentation', group_filter=('chunk',))
83 |         con = pawpaw.arborform.Connectors.Delegate(itor_table, 'chunk')
84 |         itor_equi_ident.connections.append(con)
85 |         return itor_equi_ident
86 | 
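
A sketch wiring a TableStyle into StyledTable (illustrative only, not from the repo; TYPE_1 comes from pawpaw.table.styles, and the exact itos yielded depend on Extract/Ito.from_re):

    import pawpaw
    from pawpaw.table import StyledTable
    from pawpaw.table.styles import TYPE_1

    # Build the sample text line by line so the table rows start at column 0
    text = '\n'.join([
        'intro text',
        '-----+-----+-----',
        '  A  |  B  |  C',
        '-----+-----+-----',
        ' aaa | bbb | ccc',
        '-----+-----+-----',
        'trailing text',
    ])

    itor = StyledTable(TYPE_1, tag='type 1 table').get_itor()
    for i in itor(pawpaw.Ito(text)):
        print(i.desc, repr(str(i)))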


--------------------------------------------------------------------------------
/pawpaw/util.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import typing
  3 | 
  4 | import pawpaw
  5 | 
  6 | 
  7 | # Finds indices of non-doubled escape chars
  8 | def find_escapes(
  9 |     src: str | pawpaw.Ito,
 10 |     escape: str = '\\',
 11 |     start: int | None = None,
 12 |     stop: int | None = None
 13 | ) -> typing.Iterable[int]:
 14 |     if isinstance(src, str):
 15 |         s = src
 16 |         offset = 0
 17 |     elif isinstance(src, pawpaw.Ito):
 18 |         s = src.string
 19 |         offset = src.start
 20 |     else:
 21 |         raise pawpaw.Errors.parameter_invalid_type('src', src, str, pawpaw.Ito)
 22 | 
 23 |     span = pawpaw.Span.from_indices(src, start, stop).offset(offset)
 24 | 
 25 |     if not isinstance(escape, str):
 26 |         raise pawpaw.Errors.parameter_invalid_type('escape', escape, str)
 27 |     elif len(escape) != 1:
 28 |         raise ValueError('parameter \'escape\' must have length 1')
 29 | 
 30 |     esc = False
 31 |     for i in range(span.start, span.stop):
 32 |         c = s[i]
 33 |         if c == escape:
 34 |             esc = not esc
 35 |         elif esc:
 36 |             yield i - offset - 1
 37 |             esc = False
 38 | 
 39 | 
 40 | def find_unescaped(
 41 |     src: str | pawpaw.Ito,
 42 |     chars: str,
 43 |     escape: str = '\\',
 44 |     start: int | None = None,
 45 |     stop: int | None = None
 46 | ) -> typing.Iterable[int]:
 47 |     if isinstance(src, str):
 48 |         s = src
 49 |         offset = 0
 50 |     elif isinstance(src, pawpaw.Ito):
 51 |         s = src.string
 52 |         offset = src.start
 53 |     else:
 54 |         raise pawpaw.Errors.parameter_invalid_type('src', src, str, pawpaw.Ito)
 55 | 
 56 |     span = pawpaw.Span.from_indices(src, start, stop).offset(offset)
 57 | 
 58 |     if not isinstance(chars, str):
 59 |         raise pawpaw.Errors.parameter_invalid_type('chars', chars, str)
 60 |     elif len(chars) == 0:
 61 |         raise ValueError('parameter \'chars\' must have non-zero length')
 62 | 
 63 |     if not isinstance(escape, str):
 64 |         raise pawpaw.Errors.parameter_invalid_type('escape', escape, str)
 65 |     elif len(escape) != 1:
 66 |         raise ValueError('parameter \'escape\' must have length 1')
 67 | 
 68 |     esc = False
 69 |     for i in range(span.start, span.stop):
 70 |         c = s[i]
 71 |         if esc:
 72 |             esc = False
 73 |         elif c == escape:
 74 |             esc = True
 75 |         elif c in chars:
 76 |             yield i - offset
 77 | 
 78 |     if esc:
 79 |         raise ValueError(f'parameter \'src\' ends with un-followed escape char \'{escape}\'')
 80 | 
 81 | 
 82 | def split_unescaped(
 83 |     src: str | pawpaw.Ito,
 84 |     char: str,
 85 |     escape: str = '\\',
 86 |     start: int | None = None,
 87 |     stop: int | None = None
 88 | ) -> typing.Iterable[str] | typing.Iterable[pawpaw.Ito]:
 89 |     cur = 0
 90 |     for i in find_unescaped(src, char, escape, start, stop):
 91 |         yield src[cur:i]
 92 |         cur = i + 1
 93 |     yield src[cur:]
 94 | 
 95 | 
 96 | def find_balanced(
 97 |     src: str | pawpaw.Ito,
 98 |     lchar: str | pawpaw.Ito,
 99 |     rchar: str | pawpaw.Ito,
100 |     escape: str = '\\',
101 |     start: int | None = None,
102 |     stop: int | None = None
103 | ) -> typing.Iterable[str] | typing.Iterable[pawpaw.Ito]:
104 |     if isinstance(src, str):
105 |         s = src
106 |         offset = 0
107 |     elif isinstance(src, pawpaw.Ito):
108 |         s = src.string
109 |         offset = src.start
110 |     else:
111 |         raise pawpaw.Errors.parameter_invalid_type('src', src, str, pawpaw.Ito)
112 | 
113 |     if not (isinstance(lchar, str) or isinstance(lchar, pawpaw.Ito)):
114 |         raise pawpaw.Errors.parameter_invalid_type('left', lchar, str, pawpaw.Ito)
115 |     elif len(lchar) != 1:
116 |         raise ValueError('parameter \'left\' must have length 1')
117 |     lchar = str(lchar)
118 | 
119 |     if not (isinstance(rchar, str) or isinstance(rchar, pawpaw.Ito)):
120 |         raise pawpaw.Errors.parameter_invalid_type('right', rchar, str, pawpaw.Ito)
121 |     elif len(rchar) != 1:
122 |         raise ValueError('parameter \'right\' must have length 1')
123 |     rchar = str(rchar)
124 | 
125 |     lefts = []
126 |     for i in find_unescaped(src, lchar + rchar, escape, start, stop):
127 |         c = s[offset + i]
128 |         if c == lchar and (lchar != rchar or len(lefts) == 0):
129 |             lefts.append(i)
130 |         else:
131 |             len_lefts = len(lefts)
132 |             if len_lefts > 1:
133 |                 lefts.pop()
134 |             elif len_lefts == 1:
135 |                 yield src[lefts.pop():i+1]
136 |             else:
137 |                 raise ValueError(f'unbalanced right char {rchar} found at index {i}')
138 |         
139 |     if len(lefts) != 0:
140 |         raise ValueError(f'unbalanced left char {lchar} found at index {lefts.pop()}')
141 | 
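The helpers above are escape-aware building blocks: find_unescaped locates delimiter characters that are not protected by a backslash, split_unescaped splits on them, and find_balanced yields only the outermost balanced bracket regions. A minimal usage sketch (the sample strings and variable names below are illustrative only, not from the library):

    from pawpaw.util import split_unescaped, find_balanced

    s = r'key=a\;b;flags=on;done'
    # Split on ';' only where it is not escaped -> ['key=a\;b', 'flags=on', 'done']
    fields = [*split_unescaped(s, ';')]

    s = r'f(a, g(b), \(c\)) + h(d)'
    # Yield only the outermost balanced regions; escaped parens are ignored
    # -> ['(a, g(b), \(c\))', '(d)']
    regions = [*find_balanced(s, '(', ')')]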


--------------------------------------------------------------------------------
/pawpaw/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | import pawpaw.visualization.sgr
2 | 
3 | from .highlighter import Highlighter
4 | del highlighter
5 | 
6 | import pawpaw.visualization.ascii_box
7 | 
8 | import pawpaw.visualization.pepo
9 | 


--------------------------------------------------------------------------------
/pawpaw/visualization/highlighter.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | import typing
 3 | 
 4 | import pawpaw
 5 | from pawpaw.visualization import sgr
 6 | 
 7 | 
 8 | class Highlighter:
 9 |     '''
10 |     - Guarantees differing colors across all Ito boundaries
11 |     - A parent Ito's sub-spans might not all receive the same color, because doing so is not always possible.
12 |       Consider a two-color palette with this nesting:
13 |         A-------------A     Prefix and suffix get color 1
14 |             B------B        Assign color 2 so that boundaries AB and BA are visible
15 |             C---C           If color 1, boundary AC is invisible; if color 2, boundary CB is invisible
16 |     '''
17 | 
18 |     def __init__(self, palette: sgr.C_PALETTE):
19 |         self._backs = tuple(sgr.Back.from_color(col) for col in palette)
20 | 
21 |     def _compose(self, predicate: pawpaw.Types.P_ITO, it_back: typing.Iterator[sgr.Back], ito: pawpaw.Ito, str_slice: slice | None = None):
22 |         if predicate(ito):
23 |             prefix = f'{next(it_back)}'
24 |             suffix = f'{sgr.Back.RESET}'
25 |         else:
26 |             prefix = suffix = ''
27 | 
28 |         if str_slice is None:
29 |             s = f'{ito}'
30 |         else:
31 |             s = f'{ito.string[str_slice]}'
32 | 
33 |         return f'{prefix}{s}{suffix}'
34 |             
35 |     def _print(self, ito: pawpaw.Types.P_ITO, predicate: pawpaw.Types.P_ITO, it_back: typing.Iterator[sgr.Back]):
36 |         if len(ito.children) == 0:
37 |             if len(ito) > 0:
38 |                 print(self._compose(predicate, it_back, ito), end='')
39 |             return
40 | 
41 |         last = ito.start
42 |         for child in ito.children:
43 |             if last < child.start:
44 |                 print(self._compose(predicate, it_back, ito, slice(last, child.start)), end='')
45 |             self._print(child, predicate, it_back)
46 |             last = child.stop
47 |         if last < ito.stop:
48 |             print(self._compose(predicate, it_back, ito, slice(last, ito.stop)), end='')
49 | 
50 |     def print(self, ito: pawpaw.Ito, predicate: pawpaw.Types.P_ITO = lambda ito: True) -> None:
51 |         self._print(ito, predicate, itertools.cycle(self._backs))
52 | 
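A minimal sketch of using Highlighter with one of the bundled palettes (the sample sentence and the 'phrase' desc are illustrative only):

    import pawpaw
    from pawpaw.visualization import Highlighter, sgr

    root = pawpaw.Ito('The quick brown fox', desc='phrase')
    root.children.add(*root.str_split())  # one child Ito per word

    # Each Ito receives the next background color from the palette, so adjacent
    # itos (and the parent's gaps between them) stay visually distinguishable.
    Highlighter(sgr.PAWPAW).print(root)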


--------------------------------------------------------------------------------
/pawpaw/visualization/pepo/__init__.py:
--------------------------------------------------------------------------------
1 | from .pepo import Pepo, Compact, Tree, Xml, Json
2 | del pepo


--------------------------------------------------------------------------------
/pawpaw/visualization/pepo/pepo.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | # Force the pure-Python XML parser (not the faster C version) so that its methods can be hooked
  3 | sys.modules['_elementtree'] = None
  4 | import abc
  5 | import json
  6 | import io
  7 | import os
  8 | import typing
  9 | from xml.sax.saxutils import escape as xml_escape
 10 | 
 11 | import pawpaw
 12 | from pawpaw.visualization import ascii_box
 13 | 
 14 | 
 15 | class Pepo(abc.ABC):
 16 |     def __init__(self, indent: str = '    ', children: bool = True):
 17 |         self.linesep: str = os.linesep
 18 |         self.indent: str = indent
 19 |         self.children = children
 20 | 
 21 | 
 22 |     @abc.abstractmethod
 23 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
 24 |         ...
 25 | 
 26 |     def dumps(self, *itos: pawpaw.Ito) -> str:
 27 |         with io.StringIO() as fs:
 28 |             self.dump(fs, *itos)
 29 |             fs.seek(0)
 30 |             return fs.read()
 31 | 
 32 | 
 33 | class _PepoFstr(Pepo):
 34 |     def __init__(self, indent: str = '    ', children: bool = True, fstr: str = '%desc'):
 35 |         super().__init__(indent, children)
 36 |         self.fstr = fstr
 37 | 
 38 | 
 39 | class Compact(_PepoFstr):
 40 |     def __init__(self, indent: str = '    ', children: bool = True):
 41 |         super().__init__(indent, children, '%span %desc!r : \'%substr!1r1:40…% \'')
 42 |         self.children = children
 43 | 
 44 |     def _dump(self, fs: typing.IO, ei: pawpaw.Types.C_EITO, level: int = 0) -> None:
 45 |         fs.write(f'{self.indent * level}{ei.index:,}: {ei.ito:{self.fstr}}{self.linesep}')
 46 | 
 47 |         if self.children:
 48 |             level += 1
 49 |             for eic in (pawpaw.Types.C_EITO(i, ito) for i, ito in enumerate(ei.ito.children, start=1)):
 50 |                 self._dump(fs, eic, level)
 51 | 
 52 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
 53 |         for ei in (pawpaw.Types.C_EITO(i, ito) for i, ito in enumerate(itos, start=1)):
 54 |             if not isinstance(ei.ito, pawpaw.Ito):
 55 |                 raise pawpaw.Errors.parameter_iterable_contains_invalid_type('itos', ei.ito, pawpaw.Ito)
 56 |             self._dump(fs, ei)
 57 | 
 58 | 
 59 | class Tree(_PepoFstr):
 60 |     HORZ = ascii_box.BoxDrawingChar.from_char('─')
 61 |     VERT = ascii_box.BoxDrawingChar.from_char('│')
 62 |     TEE = ascii_box.BoxDrawingChar.from_char('├')
 63 |     ELBOW = ascii_box.BoxDrawingChar.from_char('└')
 64 | 
 65 |     def __init__(self, indent: str = '  ', children: bool = True):
 66 |         super().__init__(indent, children, '%span %desc!r : \'%substr!1r1:^40…% \'')
 67 |         self.children = False
 68 | 
 69 |     def _dump_children(self, fs: typing.IO, ito: pawpaw.Ito, prefix: str = '') -> None:
 70 |         for child in ito.children[:-1]:
 71 |             fs.write(f'{prefix}'
 72 |                      f'{self.TEE}'
 73 |                      f'{self.HORZ.char * len(self.indent)}'
 74 |                      f'{child:{self.fstr}}'
 75 |                      f'{self.linesep}')
 76 |             self._dump_children(fs, child, prefix + f'{self.VERT}{self.indent}')
 77 | 
 78 |         if len(ito.children) > 0:
 79 |             child = ito.children[-1]
 80 |             fs.write(f'{prefix}'
 81 |                      f'{self.ELBOW}'
 82 |                      f'{self.HORZ.char * len(self.indent)}'
 83 |                      f'{child:{self.fstr}}'
 84 |                      f'{self.linesep}')
 85 |             self._dump_children(fs, child, prefix + f' {self.indent}')
 86 | 
 87 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
 88 |         for ito in itos:
 89 |             if not isinstance(ito, pawpaw.Ito):
 90 |                 raise pawpaw.Errors.parameter_invalid_type('*itos', ito, pawpaw.Ito)
 91 |             fs.write(f'{ito:{self.fstr}}{self.linesep}')
 92 |             self._dump_children(fs, ito)
 93 | 
 94 | 
 95 | class Xml(Pepo):
 96 |     def __init__(self, indent: str = '    ', children: bool = True):
 97 |         super().__init__(indent, children)
 98 | 
 99 |     def _dump(self, fs: typing.IO, ei: pawpaw.Types.C_EITO, level: int = 0) -> None:
100 |         fs.write(f'{level * self.indent}')
101 |         desc = '' if ei.ito.desc is None else xml_escape(ei.ito.desc)
102 |         fs.write(f'<ito start="{ei.ito.start}" stop="{ei.ito.stop}" desc="{desc}">')
103 |         level += 1
104 |         fs.write(self.linesep)
105 | 
106 |         fs.write(f'{level * self.indent}')
107 |         fs.write(xml_escape(str(ei.ito)))
108 |         fs.write(f'{self.linesep}')
109 |         if self.children and len(ei.ito.children) > 0:
110 |             fs.write(f'{level * self.indent}<children>{self.linesep}')
111 | 
112 |             level += 1
113 |             for i, ito in enumerate(ei.ito.children):
114 |                 child = pawpaw.Types.C_EITO(i, ito)
115 |                 self._dump(fs, child, level)
116 | 
117 |             level -= 1
118 |             fs.write(f'{level * self.indent}</children>{self.linesep}')
119 | 
120 |         level -= 1
121 |         fs.write(f'{level * self.indent}</ito>{self.linesep}')
122 | 
123 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
124 |         fs.write(f'<?xml version="1.0" encoding="UTF-8"?>{self.linesep}')
125 |         fs.write(f'<itos>{self.linesep}')
126 |         for ito in itos:
127 |             if not isinstance(ito, pawpaw.Ito):
128 |                 raise pawpaw.Errors.parameter_iterable_contains_invalid_type('itos', ito, pawpaw.Ito)
129 |             self._dump(fs, pawpaw.Types.C_EITO(0, ito), 1)
130 |         fs.write(f'</itos>{self.linesep}')
131 | 
132 | 
133 | class Json(Pepo):
134 |     def __init__(self, indent: str = '    ', children: bool = True):
135 |         super().__init__(indent, children)
136 | 
137 |     def _dump(self, fs: typing.IO, ei: pawpaw.Types.C_EITO, level: int = 0) -> None:
138 |         fs.write(level * self.indent + '{' + self.linesep)
139 | 
140 |         level += 1
141 |         fs.write(f'{level * self.indent}"start": {ei.ito.start},{self.linesep}')
142 |         fs.write(f'{level * self.indent}"stop": {ei.ito.stop},{self.linesep}')
143 |         if ei.ito.desc is None:
144 |             desc = "null"
145 |         else:
146 |             desc = json.encoder.encode_basestring(ei.ito.desc)
147 |         fs.write(f'{level * self.indent}"desc": {desc},{self.linesep}')
148 |         substr = json.encoder.encode_basestring(str(ei.ito))
149 |         fs.write(f'{level * self.indent}"substr": {substr},{self.linesep}')
150 |         if self.children:
151 |             fs.write(f'{level * self.indent}"children": [')
152 |             if len(ei.ito.children) == 0:
153 |                 fs.write(f']{self.linesep}')
154 |             else:
155 |                 fs.write(self.linesep)
156 | 
157 |                 level += 1
158 |                 for i, ito in enumerate(ei.ito.children):
159 |                     child = pawpaw.Types.C_EITO(i, ito)
160 |                     self._dump(fs, child, level)
161 |                     if i < len(ei.ito.children) - 1:
162 |                         fs.write(',')
163 |                     fs.write(self.linesep)
164 | 
165 |                 level -= 1
166 |                 fs.write(f'{level * self.indent}]{self.linesep}')
167 | 
168 |         level -= 1
169 |         fs.write(level * self.indent + '}')
170 | 
171 |     def dump(self, fs: typing.IO, *itos: pawpaw.Ito) -> None:
172 |         fs.write('{' + self.linesep)
173 | 
174 |         fs.write(f'{self.indent}"itos": [')
175 | 
176 |         comma_needed = False
177 |         for ito in itos:
178 |             if not isinstance(ito, pawpaw.Ito):
179 |                 raise pawpaw.Errors.parameter_invalid_type('*itos', ito, pawpaw.Ito)
180 |             if comma_needed:
181 |                 fs.write(',')
182 |             fs.write(self.linesep)
183 |             self._dump(fs, pawpaw.Types.C_EITO(0, ito), 2)
184 |             comma_needed = True
185 |         fs.write(self.linesep)
186 | 
187 |         fs.write(self.indent + ']' + self.linesep)
188 | 
189 |         fs.write('}' + self.linesep)
190 | 
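A minimal sketch of the Pepo dumpers above (the sample text is illustrative; output shapes are approximate):

    import pawpaw
    from pawpaw.visualization import pepo

    root = pawpaw.Ito('See Jack run.', desc='phrase')
    root.children.add(*root.str_split())

    print(pepo.Compact().dumps(root))  # indented, one line per Ito
    print(pepo.Tree().dumps(root))     # box-drawing tree of the Ito hierarchy
    print(pepo.Json().dumps(root))     # start/stop/desc/substr/children as JSON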


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/__init__.py:
--------------------------------------------------------------------------------
1 | from .sgr import encode, RESET_ALL, Intensity, Italic, Underline, Blink, Invert, Conceal, Strike, Font
2 | from .sgr import C_COLOR, C_PALETTE, Colors, Fore, Back
3 | del sgr
4 | 
5 | from pawpaw.visualization.sgr.palettes import *
6 | 


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/palettes/__init__.py:
--------------------------------------------------------------------------------
1 | from .palettes import *
2 | del palettes


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/palettes/palettes.py:
--------------------------------------------------------------------------------
 1 | from pawpaw.visualization.sgr import C_PALETTE, Colors
 2 | 
 3 | 
 4 | AIR_FORCE_ONE: C_PALETTE = (
 5 |     Colors.Rgb.from_24_bit(0x3C79B4),  # Dark Blue
 6 |     Colors.Rgb.from_24_bit(0xC9ECF5),  # Light Blue
 7 |     Colors.Rgb.from_24_bit(0xF6F6F6),  # Off White
 8 |     Colors.Rgb.from_24_bit(0xB7986C),  # Brown
 9 | )
10 | 
11 | OLD_GLORY: C_PALETTE = (
12 |     Colors.Rgb.from_24_bit(0xB31942),  # Red
13 |     Colors.Rgb.from_24_bit(0xFFFFFF),  # White
14 |     Colors.Rgb.from_24_bit(0x0A3161),  # Blue
15 | )
16 | 
17 | PAWPAW: C_PALETTE = (
18 |     Colors.Rgb.from_24_bit(0x533E30),  # Royal Brown
19 |     Colors.Rgb.from_24_bit(0xD2AC70),  # Light French Beige
20 |     Colors.Rgb.from_24_bit(0xE4D1AE),  # Desert Sand
21 |     Colors.Rgb.from_24_bit(0x517D3D),  # Fern Green
22 |     Colors.Rgb.from_24_bit(0x90C246),  # Android Green
23 | )
24 | """
25 |   "Oriental Beauty"
26 |   https://www.schemecolor.com/oriental-beauty-color-combination.php
27 | """
28 | 
29 | TULIP_FIELD: C_PALETTE = (
30 |     Colors.Rgb.from_24_bit(0xFF6C98),  # Dark Pink
31 |     Colors.Rgb.from_24_bit(0xFEAA6D),  # Orange 
32 |     Colors.Rgb.from_24_bit(0xF7BACB),  # Light Pink
33 |     Colors.Rgb.from_24_bit(0xD879A2),  # Purple
34 |     Colors.Rgb.from_24_bit(0xF9E841),  # Yellow
35 |     Colors.Rgb.from_24_bit(0xE53F5D),  # Red
36 | )
37 | 
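A palette is simply a sequence of colors, so additional palettes can be declared the same way; a small sketch (MY_PALETTE is not part of the library) mixing the three color types:

    from pawpaw.visualization import sgr

    MY_PALETTE: sgr.C_PALETTE = (
        sgr.Colors.Named.BRIGHT_CYAN,          # 4-bit named color
        sgr.Colors.Rgb.from_24_bit(0x336699),  # 24-bit RGB
        sgr.Colors.EightBit(208),              # 256-color index
    )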


--------------------------------------------------------------------------------
/pawpaw/visualization/sgr/sgr.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | from dataclasses import dataclass
  3 | import enum
  4 | import typing
  5 | 
  6 | from pawpaw import Errors
  7 | 
  8 | 
  9 | """
 10 | SGR (Select Graphic Rendition) - see https://en.wikipedia.org/wiki/ANSI_escape_code
 11 | """
 12 | def encode(*n: int) -> str:
 13 |     if len(n) == 0:
 14 |         n = '0'
 15 |     else:
 16 |         n = ';'.join(str(i) for i in n)
 17 |     return f'\033[{n}m'
 18 | 
 19 | RESET_ALL: str = encode(0)
 20 | 
 21 |     
 22 | @dataclass(frozen=True)
 23 | class _Sgr:
 24 |     RESET : str
 25 | 
 26 | 
 27 | @dataclass(frozen=True)
 28 | class _Intensity(_Sgr):
 29 |     BOLD : str = encode(1)
 30 |     DIM  : str = encode(2)
 31 |     RESET: str = encode(22)
 32 | 
 33 | 
 34 | Intensity = _Intensity()
 35 | 
 36 | 
 37 | @dataclass(frozen=True)
 38 | class _Italic(_Sgr):
 39 |     ON   : str = encode(3)
 40 |     RESET: str = encode(23)
 41 | 
 42 | 
 43 | Italic = _Italic()
 44 | 
 45 | 
 46 | @dataclass(frozen=True)
 47 | class _Underline(_Sgr):
 48 |     SINGLE: str = encode(4)
 49 |     DOUBLE: str = encode(21)
 50 |     RESET : str = encode(24)
 51 | 
 52 | 
 53 | Underline = _Underline()
 54 | 
 55 | 
 56 | @dataclass(frozen=True)
 57 | class _Blink(_Sgr):
 58 |     SLOW : str = encode(5)
 59 |     RAPID: str = encode(6)
 60 |     RESET: str = encode(25)
 61 | 
 62 | 
 63 | Blink = _Blink()
 64 | 
 65 | 
 66 | @dataclass(frozen=True)
 67 | class _Invert(_Sgr):
 68 |     ON   : str = encode(7)
 69 |     RESET: str = encode(27)
 70 | 
 71 | 
 72 | Invert = _Invert()
 73 | 
 74 | 
 75 | @dataclass(frozen=True)
 76 | class _Conceal(_Sgr):
 77 |     ON   : str = encode(8)
 78 |     RESET: str = encode(28)
 79 | 
 80 | 
 81 | Conceal = _Conceal()
 82 | 
 83 | 
 84 | @dataclass(frozen=True)
 85 | class _Strike(_Sgr):
 86 |     ON   : str = encode(9)
 87 |     RESET: str = encode(29)
 88 | 
 89 | 
 90 | Strike = _Strike()
 91 | 
 92 | 
 93 | @dataclass(frozen=True)
 94 | class _Font(_Sgr):
 95 |     ALT_1: str = encode(11)
 96 |     ALT_2: str = encode(12)
 97 |     ALT_3: str = encode(13)
 98 |     ALT_4: str = encode(14)
 99 |     ALT_5: str = encode(15)
100 |     ALT_6: str = encode(16)
101 |     ALT_7: str = encode(17)
102 |     ALT_8: str = encode(18)
103 |     ALT_9: str = encode(19)
104 |     RESET: str = encode(10)
105 | 
106 | 
107 | Font = _Font()
108 | 
109 | 
110 | @dataclass
111 | class _Colors:
112 |     class Named(enum.IntEnum):
113 |         BLACK  : int = 0
114 |         RED    : int = 1
115 |         GREEN  : int = 2
116 |         YELLOW : int = 3
117 |         BLUE   : int = 4
118 |         MAGENTA: int = 5
119 |         CYAN   : int = 6
120 |         WHITE  : int = 7
121 | 
122 |         BRIGHT_BLACK  : int  = 60
123 |         BRIGHT_RED    : int  = 61
124 |         BRIGHT_GREEN  : int  = 62
125 |         BRIGHT_YELLOW : int  = 63
126 |         BRIGHT_BLUE   : int  = 64
127 |         BRIGHT_MAGENTA: int  = 65
128 |         BRIGHT_CYAN   : int  = 66
129 |         BRIGHT_WHITE  : int  = 67
130 | 
131 | 
132 |     class Rgb(typing.NamedTuple):
133 |         red: int
134 |         green: int
135 |         blue: int
136 | 
137 |         @classmethod
138 |         def from_24_bit(cls, val: int) -> _Colors.Rgb:
139 |             return cls(val >> 16, (val >> 8) & 0xFF, val & 0xFF)
140 | 
141 | 
142 |     class EightBit(int):
143 |         """
144 |             0-  7:  standard colors (as in ESC [ 30–37 m)
145 |             8- 15:  high intensity colors (as in ESC [ 90–97 m)
146 |             16-231:  6 × 6 × 6 cube (216 colors): 16 + 36 × r + 6 × g + b (0 ≤ r, g, b ≤ 5)
147 |             232-255:  grayscale from dark to light in 24 steps
148 |         """
149 |         pass
150 | 
151 | 
152 | Colors = _Colors()
153 | 
154 | C_COLOR = Colors.Named | Colors.Rgb | Colors.EightBit
155 | C_PALETTE = typing.Sequence[C_COLOR]
156 | 
157 | 
158 | @dataclass(frozen=True)
159 | class Fore(_Sgr):
160 |     _NAMED_OFFSET: int = 30
161 |     _BY_IDX      : int = 38
162 |     RESET        : str = encode(39)
163 | 
164 |     @classmethod
165 |     def from_color(cls, src: C_COLOR) -> str:
166 |         if isinstance(src, Colors.Named):
167 |             nc = getattr(Colors.Named, src.name)
168 |             return encode(nc.value + cls._NAMED_OFFSET)
169 |         elif isinstance(src, Colors.Rgb):
170 |             return encode(cls._BY_IDX, 2, *src)
171 |         elif isinstance(src, Colors.EightBit):
172 |             return encode(cls._BY_IDX, 5, src)
173 |         else:
174 |             raise Errors.parameter_invalid_type('src', src, Colors.Named, Colors.Rgb, Colors.EightBit)
175 |         
176 |     def __init__(self, src: C_COLOR):
177 |         object.__setattr__(self, '_value', self.from_color(src))
178 |         
179 |     def __str__(self) -> str:
180 |         return self._value
181 | 
182 | 
183 | @dataclass(frozen=True)
184 | class Back(Fore):
185 |     _NAMED_OFFSET: int = Fore._NAMED_OFFSET + 10
186 |     _BY_IDX      : int = Fore._BY_IDX + 10
187 |     RESET        : str = encode(49)
188 | 
189 |     def __init__(self, src: C_COLOR):
190 |         super().__init__(src)
191 | 
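A minimal sketch of emitting SGR sequences directly with the types above (the sample text is illustrative only):

    from pawpaw.visualization import sgr

    fore = sgr.Fore.from_color(sgr.Colors.Named.BRIGHT_YELLOW)
    back = sgr.Back.from_color(sgr.Colors.Rgb.from_24_bit(0x0A3161))
    print(f'{fore}{back}highlighted{sgr.RESET_ALL} plain')

    # Attribute groups reset independently of colors
    print(f'{sgr.Intensity.BOLD}bold{sgr.Intensity.RESET} and {sgr.Underline.SINGLE}underlined{sgr.Underline.RESET}')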


--------------------------------------------------------------------------------
/pawpaw/xml/__init__.py:
--------------------------------------------------------------------------------
1 | from pawpaw.xml import descriptors
2 | 
3 | from .xml_helper import QualifiedName, EtName, XmlErrors, XmlHelper
4 | del xml_helper
5 | 
6 | from .xml_parser import XmlParser
7 | del xml_parser
8 | 
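A rough sketch of one way the exported XmlParser might be wired in, assuming it acts as a drop-in xml.etree.ElementTree parser (the sample document is illustrative, and this usage is an assumption, not confirmed by this file):

    import xml.etree.ElementTree as ET
    from pawpaw.xml import XmlParser

    text = '<root><child id="1">hello</child></root>'
    root = ET.fromstring(text, parser=XmlParser())
    # If parsing succeeds, the resulting elements are expected to carry
    # Ito-based offsets back into the original text.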


--------------------------------------------------------------------------------
/pawpaw/xml/descriptors.py:
--------------------------------------------------------------------------------
 1 | ATTRIBUTES: str = 'attributes'
 2 | ATTRIBUTE: str = 'attribute'
 3 | COMMENT: str = 'comment'
 4 | ELEMENT: str = 'element'
 5 | END_TAG: str = 'end_tag'
 6 | NAME: str = 'name'
 7 | NAMESPACE: str = 'namespace'
 8 | PI: str = 'pi'
 9 | START_TAG: str = 'start_tag'
10 | TAG: str = 'tag'
11 | TEXT: str = 'text'
12 | VALUE: str = 'value'
13 | 
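These constants appear to be the desc values used when building XML-derived Ito trees, so they can be referenced in traversal queries instead of hard-coded strings; a small sketch (ito_tree is an assumed, already-parsed Ito, and the query uses the same '**[d:...]' form seen in the tests):

    from pawpaw.xml import descriptors

    # Find the first start tag anywhere beneath the root
    first_start_tag = ito_tree.find(f'**[d:{descriptors.START_TAG}]')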


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "pawpaw"
 7 | dynamic = ["version"]
 8 | authors = [
 9 |   { name="Robert L. Ayers", email="rlayers@yahoo.com" },
10 | ]
11 | description = "High Performance Text Processing & Segmentation Framework"
12 | readme = "README.md"
13 | requires-python = ">=3.10"
14 | license = "MIT"
15 | license-files = { paths = ["LICENSE"] }
16 | dependencies = [
17 |   "regex >= 2023.8.8",
18 | ]
19 | keywords = [
20 |   "nlp",
21 |   "information-extraction",
22 |   "text-processing",
23 |   "text-segmentation",
24 |   "hierarchical-text-segmentation",
25 |   "python",
26 |   "xml-parser",
27 |   "extract-text",
28 |   "knowledge-graph",
29 | ]
30 | classifiers = [
31 |     # See https://pypi.org/classifiers/
32 |     "Programming Language :: Python :: 3.10",
33 |     "Programming Language :: Python :: 3.11",
34 |     "License :: OSI Approved :: MIT License",
35 |     "Operating System :: OS Independent",
36 |     "Development Status :: 3 - Alpha",
37 |     "Intended Audience :: Developers",
38 |     "Topic :: Software Development :: Libraries :: Python Modules",
39 |     "Topic :: Text Processing"
40 | ]
41 | 
42 | [project.urls]
43 | "Homepage" = "https://github.com/rlayers/pawpaw"
44 | "Bug Tracker" = "https://github.com/rlayers/pawpaw/issues"
45 | 
46 | [tool.hatch.version]
47 | path = "pawpaw/_version.py"
48 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/__init__.py


--------------------------------------------------------------------------------
/tests/arborform/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/arborform/__init__.py


--------------------------------------------------------------------------------
/tests/arborform/test_invert.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import Extract, Invert
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestInvert(_TestIto):
 8 |     def test_transform(self):
 9 |         s = ' a1b2c '
10 |         root = Ito(s)
11 | 
12 |         non_gap_desc = 'nongap'
13 |         gap_desc = 'gap'
14 | 
15 |         extract_res = [
16 |             regex.compile(r'.', regex.DOTALL),
17 |             regex.compile(r'\s', regex.DOTALL),
18 |             regex.compile(r'[a-z]', regex.DOTALL),
19 |             regex.compile(r'\d', regex.DOTALL),
20 |             regex.compile(r'\S', regex.DOTALL),
21 |             regex.compile(r'_', regex.DOTALL),
22 |         ]
23 | 
24 |         for re in extract_res:
25 |             with self.subTest(re=re.pattern):
26 |                 itor_extract = Extract(re, desc=lambda match, gk: non_gap_desc)
27 |                 non_gaps = [*itor_extract(root)]
28 |                 expected = [*Ito.from_gaps(root, non_gaps, gap_desc)]
29 | 
30 |                 itor_gaps = Invert(itor_extract, desc=gap_desc)
31 |                 self.assertSequenceEqual(expected, [*itor_gaps(root)])
32 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_desc.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import Desc
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestDesc(_TestIto):
 8 |     def test_transform(self):
 9 |         s = ' abc '
10 |         root = Ito(s, 1, -1)
11 |         self.assertIsNone(root.desc)
12 | 
13 |         desc = 'changed'
14 |         itor = Desc(desc)
15 |         rv = [*itor._transform(root)]
16 |         self.assertEqual(1, len(rv))
17 | 
18 |         rv = rv[0]
19 |         self.assertIs(root, rv)
20 |         self.assertEqual(desc, rv.desc)
21 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_filter.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import Itorator, Filter, Connectors
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestFilter(_TestIto):
 8 |     def test_traverse_partial(self):
 9 |         s = '1a2b3c'
10 |         root = Ito(s)
11 | 
12 |         split_chr = Itorator.wrap(lambda ito: ito)
13 |         rv = [*split_chr(root)]
14 |         self.assertEqual(len(s), len(rv))
15 | 
16 |         for ft, f in [('None', lambda ito: False), ('All', lambda ito: True), ('Partial', Ito.str_isnumeric)]:
17 |             with self.subTest(filter_type=ft):
18 |                 split_chr = split_chr.clone()
19 |                 filter = Filter(f)
20 |                 con = Connectors.Delegate(filter)
21 |                 split_chr.connections.append(con)
22 |                 expected = [i for i in root if f(i)]
23 |                 actual = [*split_chr(root)]
24 |                 self.assertSequenceEqual(expected, actual)
25 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_reflect.py:
--------------------------------------------------------------------------------
 1 | from pawpaw import Ito
 2 | from pawpaw.arborform import Reflect
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestReflect(_TestIto):
 7 |     def test_transform(self):
 8 |         s = 'abc'
 9 |         root = Ito(s)
10 |         self.add_chars_as_children(root, 'Child')
11 | 
12 |         reflect = Reflect()
13 |         rv = [*reflect._transform(root)]
14 |         self.assertEqual(1, len(rv))
15 |         self.assertIs(root, rv[0])
16 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_split.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import typing
  3 | 
  4 | import regex
  5 | from pawpaw import Ito
  6 | from pawpaw.arborform import Itorator, Split
  7 | from tests.util import _TestIto
  8 | 
  9 | 
 10 | class TestSplit(_TestIto):
 11 |     PREFIX = 'PRE'
 12 |     MIDDLE = 'MID'
 13 |     SUFFIX = 'SUF'
 14 |   
 15 |     @classmethod
 16 |     def str_from(cls, sep: str) -> str:
 17 |         return sep.join((cls.PREFIX, cls.MIDDLE, cls.MIDDLE, cls.SUFFIX))
 18 |     
 19 |     SEP_DESC = 'sep'
 20 | 
 21 |     @classmethod
 22 |     def re_from(cls, sep: str) -> regex.Pattern:
 23 |         return regex.compile(regex.escape(sep), regex.DOTALL)
 24 |     
 25 |     @classmethod
 26 |     def expected_from(cls, s: str, sep: str, brt: Split.BoundaryRetention) -> typing.List[str]:
 27 |         if sep == '':
 28 |             return [c for c in s]
 29 |       
 30 |         rv = s.split(sep)
 31 |         if brt == Split.BoundaryRetention.LEADING:
 32 |             del rv[0]
 33 |             for i, s in enumerate(rv):
 34 |                 rv[i] = sep + s
 35 |         elif brt == Split.BoundaryRetention.TRAILING:
 36 |             del rv[-1]
 37 |             for i, s in enumerate(rv):
 38 |                 rv[i] += sep
 39 |         elif brt == Split.BoundaryRetention.ALL:
 40 |             rv = rv[:1] + list(itertools.chain.from_iterable((sep, i) for i in rv[1:]))
 41 | 
 42 |         return rv
 43 |     
 44 |     valid_ctor_params = {
 45 |         'splitter': [Itorator.wrap(lambda ito: ito.str_split()), regex.compile(r'\s+', regex.DOTALL)],
 46 |         'limit': [-1, -0, 1, None],
 47 |         'boundary_retention': list(Split.BoundaryRetention),
 48 |         'return_zero_split': [False, True],
 49 |         'desc': ['abc', None],
 50 |         'tag': ['abc', None],
 51 |     }
 52 | 
 53 |     def test_ctor_valid(self):
 54 |         keys, values = zip(*self.valid_ctor_params.items())
 55 |         for kwargs in [dict(zip(keys, v)) for v in itertools.product(*values)]:
 56 |             with self.subTest(**kwargs):
 57 |                 itor = Split(**kwargs)
 58 | 
 59 |     invalid_ctor_params = {
 60 |         'splitter': [None, True, 1, 'abc'],
 61 |         'limit': [1.0, 'abc'],
 62 |         'boundary_retention': [None, True, 1, 'abc'],
 63 |         'return_zero_split': [None, 1, 'abc'],
 64 |         'desc': [True, 1],
 65 |         'tag': [True, 1.3],
 66 |     }
 67 | 
 68 |     def test_ctor_invalid(self):
 69 |         valids = {k: v[0] for k, v in self.valid_ctor_params.items()}
 70 |         for k, vs in self.invalid_ctor_params.items():
 71 |             invalids = dict(**valids)
 72 |             for v in vs:
 73 |                 invalids[k] = v
 74 |                 with self.subTest(**invalids):
 75 |                     with self.assertRaises(TypeError):
 76 |                         itor = Split(**invalids)
 77 | 
 78 |     def test_iter_simple(self):
 79 |         for sep in ' ', '-':  # '', ' ', '-':
 80 |             s = self.str_from(sep)
 81 |             ito = Ito(s, desc='root')
 82 |             re = self.re_from(sep)
 83 |             for brt in Split.BoundaryRetention:
 84 |                 with self.subTest(string=s, separator=sep, boundary_retention=brt):
 85 |                     expected = self.expected_from(s, sep, brt)
 86 |                     non_sep_desc = 'split'
 87 |                     split = Split(re, boundary_retention=brt, desc=non_sep_desc)
 88 |                     actual = [*split._transform(ito)]
 89 |                     self.assertListEqual(expected, [str(i) for i in actual])
 90 |                     self.assertTrue(all(i.desc in (None, non_sep_desc) for i in actual))
 91 | 
 92 |     def test_iter_sep_not_present(self):
 93 |         sep = 'XXX'
 94 |         s = self.str_from(' ')
 95 |         ito = Ito(s)
 96 |         re = regex.compile(regex.escape(sep))
 97 |         desc='post-split'
 98 |         for brt in Split.BoundaryRetention:
 99 |             for return_zero_split in True, False:
100 |                 with self.subTest(string=s, separator=sep, boundary_retention=brt, return_zero_split=return_zero_split, desc=desc):
101 |                     expected = [ito.clone(desc=desc)] if return_zero_split else []
102 |                     split = Split(re, boundary_retention=brt, return_zero_split=return_zero_split, desc=desc)
103 |                     actual = [*split._transform(ito)]
104 |                     self.assertListEqual(expected, actual)
105 | 
106 |     @classmethod
107 |     def zero_width_patterns(cls, sep: str) -> typing.Iterable[regex.Pattern]:
108 |         esc_sep = regex.escape(sep)
109 |         yield r'(?<=' + esc_sep + r')'  # look behind
110 |         yield r'(?=' + esc_sep + r')'  # look ahead
111 |     
112 |     def test_iter_zero_width_matches(self):
113 |         sep = '.'
114 |         s = self.str_from(sep)
115 |         ito = Ito(s, desc='root')
116 |         for pat in self.zero_width_patterns(sep):
117 |             re = regex.compile(pat)
118 |             for brt in Split.BoundaryRetention:
119 |                 with self.subTest(string=s, pattern=pat, boundary_retention=brt):
120 |                     expected = re.split(s)
121 |                     if brt == Split.BoundaryRetention.LEADING:
122 |                         del expected[0]
123 |                     elif brt == Split.BoundaryRetention.TRAILING:
124 |                         del expected[-1]
125 |                     desc = 'split'
126 |                     split = Split(re, boundary_retention=brt, desc=desc)
127 |                     actual = [*split._transform(ito)]
128 |                     self.assertListEqual(expected, [str(i) for i in actual])
129 |                     self.assertTrue(all(i.desc == desc for i in actual))
130 | 
131 |     def test_limit(self):
132 |         s = 'abc'
133 |         root = Ito(s)
134 |         
135 |         re = regex.compile('(?=.)')
136 |         for limit in None, *range(0, len(s)):
137 |             with self.subTest(re=re.pattern, limit=limit):
138 |                 splitter = Split(re, limit=limit)
139 |                 rv = [*splitter(root)]
140 |                 expected = []
141 |                 if limit is None:
142 |                     expected.extend(root)
143 |                 elif limit == 0:
144 |                     expected.append(root)
145 |                 else:
146 |                     expected.extend(i for i in root[:limit-1] if len(i) > 0)  # split parts
147 |                     expected.append(root.clone(limit-1))  # remaining part
148 |                 self.assertSequenceEqual(expected, rv)
149 | 
150 |         re = regex.compile('(?<=.)')
151 |         for limit in None, *range(0, len(s)):
152 |             with self.subTest(re=re.pattern, limit=limit):
153 |                 splitter = Split(re, limit=limit)
154 |                 rv = [*splitter(root)]
155 |                 expected = []
156 |                 if limit is None:
157 |                     expected.extend(root)
158 |                 elif limit == 0:
159 |                     expected.append(root)
160 |                 else:
161 |                     expected.extend(i for i in root[:limit] if len(i) > 0)  # split parts
162 |                     expected.append(root.clone(limit))  # remaining part
163 |                 self.assertSequenceEqual(expected, rv)
164 | 
165 |         re = regex.compile('b')
166 |         for limit in None, *range(0, len(s)):
167 |             with self.subTest(re=re.pattern, limit=limit):
168 |                 splitter = Split(re, limit=limit)
169 |                 rv = [*splitter(root)]
170 |                 expected = []
171 |                 if limit is None or limit > 0:
172 |                     expected.extend(root.str_split('b'))
173 |                 else:
174 |                     expected.append(root)
175 |                 self.assertSequenceEqual(expected, rv)
176 |                 
177 | 


--------------------------------------------------------------------------------
/tests/arborform/test_itorator_value_func.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from pawpaw.arborform import ValueFunc
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestValueFunc(_TestIto):
 8 |     def test_transform(self):
 9 |         s = '123'
10 |         root = Ito(s)
11 |         self.assertEqual(str(root), root.value())
12 | 
13 |         f = lambda i: int(str(i))
14 |         itor = ValueFunc(f)
15 |         rv = [*itor._transform(root)]
16 |         self.assertEqual(1, len(rv))
17 | 
18 |         rv = rv[0]
19 |         self.assertIs(root, rv)
20 |         self.assertEqual(f(rv), rv.value())
21 | 


--------------------------------------------------------------------------------
/tests/arborform/test_nuco.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito, arborform
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestNuco(_TestIto):
 7 |     def test_init(self):
 8 |         itor_a = arborform.Reflect()
 9 |         itor_b = arborform.Desc('123')
10 |         tag = 'abc'
11 |         nuco = arborform.Nuco(itor_a, itor_b, tag=tag)
12 |         self.assertListEqual([itor_a, itor_b], nuco._itorators)
13 |         self.assertEqual(tag, nuco.tag)
14 | 
15 |     def test_transform(self):
16 |         s = 'She bought 12 eggs'
17 |         root = Ito(s)
18 | 
19 |         itor_split = arborform.Itorator.wrap(lambda ito: ito.str_split())
20 | 
21 |         itor_num = arborform.Extract(regex.compile(r'(?P<number>\d+)'))
22 |         itor_word = arborform.Desc('word')
23 |         
24 |         itor_nuco = arborform.Nuco(itor_num, itor_word)
25 |         con = arborform.Connectors.Delegate(itor_nuco)
26 |         itor_split.connections.append(con)
27 | 
28 |         root.children.add(*itor_split(root))
29 |         self.assertEqual(len(s.split()), len(root.children))
30 |         for tok in root.children:
31 |             expected = 'number' if tok.str_isdecimal() else 'word'
32 |             self.assertEqual(expected, tok.desc)
33 | 


--------------------------------------------------------------------------------
/tests/arborform/test_postorator.py:
--------------------------------------------------------------------------------
 1 | from itertools import tee
 2 | 
 3 | import regex
 4 | from pawpaw import Ito, Types
 5 | from pawpaw.arborform import Split
 6 | from pawpaw.arborform.postorator import Postorator
 7 | from tests.util import _TestIto
 8 | 
 9 | 
10 | class TestPostorator(_TestIto):
11 |     post_desc = 'joined'
12 | 
13 |     @classmethod
14 |     def simple(cls, itos: Types.C_IT_ITOS) -> Types.C_IT_ITOS:
15 |         yield Ito.join(*itos)
16 | 
17 |     def test_traverse(self):
18 |         for s in 'One', 'One Two', 'One Two Three', 'One Two Three Four':
19 |             itos = Ito(s).str_split()
20 |             with self.subTest(string=s, itos=itos, desc=self.post_desc):
21 |                 wrapped = Postorator.wrap(self.simple)
22 |                 expected = [*self.simple(itos)]
23 |                 actual = [*wrapped(itos)]
24 |                 self.assertListEqual(expected, actual)
25 | 
26 |     def test_post(self):
27 |         for s in 'One', 'One Two', 'One Two Three', 'One Two Three Four':
28 |             root = Ito(s, desc='root')
29 |             splitter = Split(regex.compile(r'\s+'), desc=root.desc)
30 | 
31 |             rv = [*splitter(root)]
32 |             self.assertListEqual(root.str_split(), rv)
33 | 
34 |             splitter.postorator = Postorator.wrap(self.simple)
35 |             expected = [Ito(s)]
36 |             actual = [*splitter(root)]
37 |             self.assertListEqual(expected, actual)
38 | 


--------------------------------------------------------------------------------
/tests/arborform/test_postorator_windowed_join.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito, Types
 3 | from pawpaw.arborform.postorator import WindowedJoin
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestWindowedJoin(_TestIto):
 8 |     def test_window_size(self):
 9 |         func = lambda itos: True
10 |         for window_size in -1, 0, 1, 2:
11 |           with self.subTest(window_size=window_size):
12 |               if window_size < 2:
13 |                   with self.assertRaises(ValueError):
14 |                       WindowedJoin(window_size, func)
15 |               else:
16 |                   WindowedJoin(window_size, func)
17 | 
18 |     def test_traverse_tautology(self):
19 |         func = lambda itos: True
20 |         re = regex.compile(r'\s')
21 |         for s in '', 'One', 'One Two', 'One Two Three', 'One Two Three Four':
22 |             root = Ito(s, desc='root')
23 |             itos = root.split(re)
24 |             desc = 'merged'
25 |             for window_size in 2, 3, 4:
26 |                 with self.subTest(string=s, window_size=window_size, desc=desc):
27 |                     wj = WindowedJoin(window_size, func, desc=desc)
28 |                     actual = [*wj(itos)]
29 |                     if len(itos) < window_size:
30 |                         self.assertListEqual(itos, actual)
31 |                     else:
32 |                         joined_count = len(itos) // window_size
33 |                         unjoined_count = len(itos) % window_size
34 |                         self.assertEqual(joined_count + unjoined_count, len(actual))
35 | 
36 |                         for i in range(0, joined_count):
37 |                             expected = Ito.join(*itos[i * window_size:i * window_size + window_size], desc=desc)
38 |                             self.assertEqual(expected, actual[i])
39 | 
40 |                         if unjoined_count > 0:
41 |                             tail = itos[-unjoined_count:]
42 |                             self.assertListEqual(tail, actual[-unjoined_count:])
43 | 
44 |     def test_traverse_non_tautology(self):
45 |         s = 'One Two Three Four'
46 |         root = Ito(s, desc='root')
47 |         itos = root.str_split()
48 | 
49 |         window_size = 2
50 |         func = lambda itos: all(i.str_startswith('T') for i in itos)
51 |         desc = 'merged'
52 | 
53 |         wj = WindowedJoin(window_size, func, desc=desc)
54 |         actual = [*wj(itos)]
55 |         self.assertEqual(3, len(actual))
56 |         self.assertEqual(itos[0], actual[0])
57 |         self.assertEqual(itos[-1], actual[-1])
58 |         self.assertEqual('Two Three', str(actual[1]))
59 | 


--------------------------------------------------------------------------------
/tests/ito/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/ito/__init__.py


--------------------------------------------------------------------------------
/tests/ito/test_ito_descend.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | 
 3 | import regex
 4 | from pawpaw import Ito
 5 | from tests.util import _TestIto
 6 | 
 7 | class TestItoDescend(_TestIto):
 8 |     def test_descends_from(self):
 9 |         root = Ito('abcde')
10 |         root.children.add(c := Ito(root, 1, -1))
11 |         c.children.add(gc := Ito(c, 1, -1))
12 | 
13 |         for desc, basis, expected in (
14 |             ('root', root, False),
15 |             ('child', c, True),
16 |             ('grandchild', gc, True),
17 |         ):
18 |             with self.subTest(desc=f'{desc}.descends_from(root) is {expected}'):
19 |                 self.assertEqual(expected, basis.descends_from(root))
20 | 
21 |             with self.subTest(desc=f'{desc}.clone().descends_from(root) is False'):
22 |                 self.assertFalse(basis.clone().descends_from(root))
23 | 
24 |     def test_has_descendant(self):
25 |         root = Ito('abcde')
26 |         root.children.add(c := Ito(root, 1, -1))
27 |         c.children.add(gc := Ito(c, 1, -1))
28 | 
29 |         for desc, basis, expected in (
30 |             ('root', root, False),
31 |             ('child', c, True),
32 |             ('grandchild', gc, True),
33 |         ):
34 |             with self.subTest(desc=f'root.has_descendant({desc}) is {expected}'):
35 |                 self.assertEqual(expected, root.has_descendant(basis))
36 | 
37 |             with self.subTest(desc=f'root.has_descendant({desc}.clone) is False'):
38 |                 self.assertFalse(root.has_descendant(basis.clone()))                
39 | 


--------------------------------------------------------------------------------
/tests/ito/test_ito_regex_equivalence_methods.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import Ito
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestItoRegexEquivalenceMethods(_TestIto):
 7 |     def test_regex_finditer(self):
 8 |         strings = '', 'A', 'Here are some words.'
 9 |         paddings = '', ' ', '_'
10 |         for string in strings:
11 |             for padding in paddings:
12 |                 s = f'{padding}{string}{padding}'
13 |                 pad_slice = slice(len(padding), -len(padding))
14 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
15 |                 for re_str in r' ', r'\w+', r'(?P<word>\w+)':
16 |                     re = regex.compile(re_str, regex.DOTALL)
17 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
18 |                         expected = [*re.finditer(s, pos=pad_slice.start, endpos=pad_slice.stop)]
19 |                         actual = [*ito.regex_finditer(re)]
20 |                         self.assertEqual(len(expected), len(actual))
21 |                         for e, a in zip(expected, actual):
22 |                             self.assertEqual(e, a)
23 | 
24 |     def test_regex_match(self):
25 |         strings = '', 'A', 'Here are some words.'
26 |         paddings = '', ' ', '_'
27 |         for string in strings:
28 |             for padding in paddings:
29 |                 s = f'{padding}{string}{padding}'
30 |                 pad_slice = slice(len(padding), -len(padding))
31 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
32 |                 for re_str in r' ', r'\w+', r'(?P<word>\w+)':
33 |                     re = regex.compile(re_str, regex.DOTALL)
34 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
35 |                         expected = re.match(s, pos=pad_slice.start, endpos=pad_slice.stop)
36 |                         actual = ito.regex_match(re)
37 |                         self.assertEqual(expected, actual)
38 | 
39 |     def test_regex_search(self):
40 |         strings = '', 'A', 'Here are some words.'
41 |         paddings = '', ' ', '_'
42 |         for string in strings:
43 |             for padding in paddings:
44 |                 s = f'{padding}{string}{padding}'
45 |                 pad_slice = slice(len(padding), -len(padding))
46 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
47 |                 for re_str in r' ', r'\w+', r'(?P<word>\w+)':
48 |                     re = regex.compile(re_str, regex.DOTALL)
49 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
50 |                         expected = re.search(s, pos=pad_slice.start, endpos=pad_slice.stop)
51 |                         actual = ito.regex_search(re)
52 |                         self.assertEqual(expected, actual)
53 | 
54 |     def test_regex_split_simple(self):
55 |         strings = '', 'A', 'Here are some words.'
56 |         separators = ' ', '\n', '\r\n'
57 |         paddings = '', ' ', '_'
58 |         for string in strings:
59 |             for sep in separators:
60 |                 s = string.replace(' ', sep)
61 |                 for padding in paddings:
62 |                     s = f'{padding}{s}{padding}'
63 |                     pad_slice = slice(len(padding), -len(padding))
64 |                     ito = Ito(s, pad_slice.start, pad_slice.stop)
65 |                     re = regex.compile(regex.escape(sep), regex.DOTALL)
66 |                     with self.subTest(string=s, ito=ito, pattern=re.pattern):
67 |                         expected = re.split(s[pad_slice])
68 |                         actual = ito.regex_split(re)
69 |                         self.assertListEqual(expected, [str(i) for i in actual])
70 | 
71 |     def test_regex_split_sep_not_present(self):
72 |         strings = '', 'A', 'Here are some words.'
73 |         separator = 'XXX'
74 |         paddings = '', ' ', '_'
75 |         for string in strings:
76 |             for padding in paddings:
77 |                 s = f'{padding}{string}{padding}'
78 |                 pad_slice = slice(len(padding), -len(padding))
79 |                 ito = Ito(s, pad_slice.start, pad_slice.stop)
80 |                 re = regex.compile(regex.escape(separator), regex.DOTALL)
81 |                 with self.subTest(string=s, ito=ito, pattern=re.pattern):
82 |                     expected = re.split(s[pad_slice])
83 |                     actual = ito.regex_split(re)
84 |                     self.assertListEqual(expected, [str(i) for i in actual])
85 | 


--------------------------------------------------------------------------------
/tests/ito/test_ito_serialization.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pickle
 3 | 
 4 | from pawpaw import Ito
 5 | from tests.util import _TestIto
 6 | 
 7 | 
 8 | class TestItoSerialization(_TestIto):
 9 |     def setUp(self) -> None:
10 |         super().setUp()
11 | 
12 |         s = 'See Jack run.'
13 |         self.h_ito = Ito(s, desc='Phrase')
14 |         self.h_ito.children.add(*self.h_ito.str_split())
15 |         for c in self.h_ito.children:
16 |             c.desc = 'Word'
17 |             self.add_chars_as_children(c, 'Char')
18 | 
19 |     def test_pickle_serialize(self):
20 |         word = self.h_ito.find('**[d:Word]')
21 |         pickle_data = pickle.dumps(word)
22 |         self.assertLess(0, len(pickle_data))
23 |       
24 |     def test_pickle_deserialize(self):
25 |         w_orig = self.h_ito.find('**[d:Word]')
26 |         pickle_data = pickle.dumps(w_orig)
27 |         w_deser = pickle.loads(pickle_data)
28 |         self.assertEqual(w_orig, w_deser)
29 | 
30 |     def test_json_serialize(self):
31 |         word = self.h_ito.find('**[d:Word]')
32 |         js_data = json.dumps(word, cls=Ito.JsonEncoder)
33 |         expected_prefix = '{"__type__": "typing.Tuple[str, Ito]", "string": "' + \
34 |             word.string + \
35 |             '", "ito": {"__type__": "Ito", "span": ' + \
36 |             str(list(word.span)) + \
37 |             ', "desc": "' + \
38 |             word.desc + \
39 |             '"'
40 |         self.assertTrue(js_data.startswith(expected_prefix))      
41 | 
42 |     def test_json_deserialize(self):
43 |         w_orig = self.h_ito.find('**[d:Word]')
44 |         js_data = json.dumps(w_orig, cls=Ito.JsonEncoder)
45 |         w_deser = json.loads(js_data, object_hook=Ito.json_decoder)
46 |         self.assertIsNot(w_orig, w_deser)
47 |         self.assertEqual(w_orig, w_deser)
48 |         
49 |     def test_json_stringless_serialize(self):
50 |         word = self.h_ito.find('**[d:Word]')
51 |         js_data = json.dumps(word, cls=Ito.JsonEncoderStringless)
52 |         expected_prefix = '{"__type__": "Ito", "span": ' + \
53 |             str(list(word.span)) + \
54 |             ', "desc": "' + \
55 |             word.desc + \
56 |             '"'
57 |         self.assertTrue(js_data.startswith(expected_prefix))
58 | 
59 |     def test_json_stringless_deserialize(self):
60 |         w_orig = self.h_ito.find('**[d:Word]')
61 |         js_data = json.dumps(w_orig, cls=Ito.JsonEncoderStringless)
62 |         w_deser = Ito.json_decode_stringless(w_orig.string, js_data)
63 |         self.assertIsNot(w_orig, w_deser)
64 |         self.assertEqual(w_orig, w_deser)
65 | 


--------------------------------------------------------------------------------
/tests/ito/test_ito_utility_methods.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | 
 3 | from pawpaw import Ito
 4 | from tests.util import _TestIto
 5 | 
 6 | 
 7 | class TestItoUtilityMethods(_TestIto):
 8 | 
 9 |     def test_to_line_str_col_empty(self):
10 |         s = 'a\nb\nc'
11 |         line = 1
12 |         col = 0
13 |         for i in range(0, len(s)):
14 |             col += 1
15 |             ito = Ito(s, i, i + 1)
16 |             with self.subTest(string=s, ito=ito.span):
17 |                 expected = line, col
18 |                 actual = ito.to_line_col('\n')
19 |                 self.assertEqual(expected, actual)
20 |                 if s[i] == '\n':
21 |                     line += 1
22 |                     col = 0
23 | 
24 |     def test_to_line_str_non_empty(self):
25 |         s = 'a\r\nb\r\nc'
26 |         line = 1
27 |         col = 0
28 |         for i in range(0, len(s) - 1):
29 |             col += 1
30 |             ito = Ito(s, i, i + 2)
31 |             with self.subTest(string=s, ito=ito.span):
32 |                 expected = line, col
33 |                 actual = ito.to_line_col('\r\n')
34 |                 self.assertEqual(expected, actual)
35 |                 if s[i] == '\n':
36 |                     line += 1
37 |                     col = 0
38 | 
39 |     def test_to_line_regex_non_empty(self):
40 |         string = 'abc\r\ndef\nghi'
41 |         eol = regex.compile(r'\r?\n', regex.DOTALL)
42 |         matches = eol.findall(string)
43 |         for i, ito in enumerate(Ito.from_gaps(string, Ito.from_re(eol, string)), 1):
44 |             for sub in ito:
45 |                 with self.subTest(ito=sub):
46 |                     expected = i, 1 + sub.start - ito.start
47 |                     self.assertEqual(expected, sub.to_line_col(eol))
48 | 


--------------------------------------------------------------------------------
/tests/nlp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/nlp/__init__.py


--------------------------------------------------------------------------------
/tests/ontology/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/ontology/__init__.py


--------------------------------------------------------------------------------
/tests/ontology/test_keyed_list.py:
--------------------------------------------------------------------------------
 1 | import itertools
 2 | 
 3 | import regex
 4 | import pawpaw
 5 | from tests.util import _TestIto
 6 | 
 7 | 
 8 | class TestKeyedList(_TestIto):
 9 |     _sample_list_keys = [
10 |         ['1', '2', '3'],
11 |         ['1.1', '1.2', '1.3'],
12 |         ['A', 'B', 'C'],
13 |     ]
14 | 
15 |     _sample_list_values = [
16 |         'First line.',
17 |         'Second line.',
18 |         'Third line.',
19 |     ]
20 | 
21 |     # def test_itorator(self) -> None:
22 |     #     itor = pawpaw.nlp.KeyedList().get_itor()
23 |     #
24 |     #     for sks in self._sample_list_keys:
25 |     #         for key_sep in ['.', ')', ':']:
26 |     #             list_lines = [f'{k}{key_sep} {val}' for k, val in zip(sks, self._sample_list_values)]
27 |     #             for line_sep in ['\n', '\n\r']:
28 |     #                 _list = pawpaw.Ito(line_sep.join(list_lines))
29 |     #                 with self.subTest(_list=_list):
30 |     #                     rv = [*itor(_list)]
31 |     #                     self.assertEqual(len(sks), len(rv))
32 | 


--------------------------------------------------------------------------------
/tests/ontology/test_ontology.py:
--------------------------------------------------------------------------------
  1 | import itertools
  2 | import typing
  3 | 
  4 | import regex
  5 | import pawpaw
  6 | from pawpaw.ontology import Ontology
  7 | from tests.util import _TestIto
  8 | 
  9 | 
 10 | class TestOntology(_TestIto):
 11 |     def setUp(self) -> None:
 12 |         super().setUp()
 13 | 
 14 |         self.ontology = Ontology(
 15 |             {
 16 |                 'vehicle': Ontology(
 17 |                     {
 18 |                         'car': Ontology(
 19 |                             {
 20 |                                 'Ford': Ontology(
 21 |                                     rules=[
 22 |                                         pawpaw.arborform.Extract(
 23 |                                             regex.compile(
 24 |                                                 r'(?P<mustang>(?:Ford\s+)?Mustang(?:(?:-|\s+)\L<subtypes>)?)',
 25 |                                                 regex.IGNORECASE | regex.DOTALL,
 26 |                                                 subtypes=['EcoBoost', 'LX', 'GT', 'GT350', 'GT500', 'Mach-E', 'Dark Horse']
 27 |                                             )
 28 |                                         ),
 29 |                                         pawpaw.arborform.Extract(
 30 |                                             regex.compile(
 31 |                                                 r'(?P<f_series>F(?:ord)?-(?:150(?:\s+Lightning)?|[3-7]50|600))',
 32 |                                                 regex.IGNORECASE | regex.DOTALL
 33 |                                             )
 34 |                                         ),
 35 |                                     ]
 36 |                                 )
 37 |                             }
 38 |                         ),
 39 |                         'airplane': Ontology(
 40 |                             {
 41 |                                 'Cessna': Ontology(
 42 |                                     rules=[
 43 |                                         pawpaw.arborform.Extract(
 44 |                                             regex.compile(
 45 |                                                 r'(?P<skyhawk>Cessna\s+172(?:\s+Skyhawk)?|(?:Cessna\s+)?172\s+Skyhawk)',
 46 |                                                 regex.IGNORECASE | regex.DOTALL
 47 |                                             )
 48 |                                         ),
 49 |                                         pawpaw.arborform.Extract(
 50 |                                             regex.compile(
 51 |                                                 r'(?P<skylane>Cessna\s+182(?:\s+Skylane)?|(?:Cessna\s+)?182\s+Skylane)',
 52 |                                                 regex.IGNORECASE | regex.DOTALL
 53 |                                             )
 54 |                                         ),
 55 |                                         pawpaw.arborform.Extract(
 56 |                                             regex.compile(
 57 |                                                 r'(?P<stationair>Cessna\s+206(?:\s+Stationair)?|(?:Cessna\s+)?206\s+Stationair)',
 58 |                                                 regex.IGNORECASE | regex.DOTALL
 59 |                                             )
 60 |                                         ),
 61 |                                         pawpaw.arborform.Extract(
 62 |                                             regex.compile(
 63 |                                                 r'(?P<caravan>Cessna\s+208(?:\s+Caravan)?|(?:Cessna\s+)?208\s+Caravan)',
 64 |                                                 regex.IGNORECASE | regex.DOTALL
 65 |                                             )
 66 |                                         ),
 67 |                                     ]
 68 |                                 )
 69 |                             }
 70 |                         ),                        
 71 |                     },
 72 |                     rules=[pawpaw.arborform.Extract(regex.compile(r'(?P<vehicle>vehicles?)', regex.IGNORECASE))]
 73 |                 )
 74 |             }
 75 |         )
 76 | 
 77 |     def test_ctor(self):
 78 |         for rules in [], [pawpaw.arborform.Extract(regex.compile(r'abc'))]:
 79 |             for items in {}, {'a': Ontology()}, {'a': Ontology(), 'rules': Ontology()}:
 80 |                 for b in None, Ontology():
 81 |                     with self.subTest(rules=rules, items=items, b=b):
 82 |                         args = []
 83 |                         if len(items) > 0:
 84 |                             args.append(items)
 85 |                         
 86 |                         kwargs = {}
 87 |                         if b is not None:
 88 |                             kwargs['b'] = b
 89 |                         items_expected = (items | kwargs).items()
 90 |                         
 91 |                         if len(rules) > 0:
 92 |                             kwargs['rules'] = rules
 93 |                         
 94 |                         ont = Ontology(*args, **kwargs)
 95 |                         
 96 |                         self.assertSequenceEqual(items_expected, ont.items())
 97 |                         self.assertSequenceEqual(rules, ont.rules)
 98 | 
 99 |     def test_path_index_access(self):
100 |         paths = [
101 |             ('vehicle', ),
102 |             ('vehicle', 'car'),
103 |             ('vehicle', 'car', 'Ford'),
104 |             ('vehicle', 'airplane', 'Cessna'),
105 |         ]
106 |         for path in paths:
107 |             with self.subTest(path=path):
108 |                 expected = self.ontology
109 |                 for s in path:
110 |                     expected = expected[s]
111 |                 self.assertIs(expected, self.ontology[path])
112 | 
113 |     def test_discover(self):
114 |         s = 'The vehicle John loves to drive most is his F-150, not his Cessna 172.'
115 |         ito = pawpaw.Ito(s)
116 | 
117 |         discoveries = self.ontology.discover(ito)
118 | 
119 |         vehicles = [*itertools.chain.from_iterable(rule(ito) for rule in self.ontology['vehicle'].rules)]
120 |         self.assertLess(0, len(vehicles))
121 |         self.assertSequenceEqual(vehicles, discoveries['vehicle'].itos)
122 | 
123 |         fords = [*itertools.chain.from_iterable(rule(ito) for rule in self.ontology['vehicle']['car']['Ford'].rules)]
124 |         self.assertLess(0, len(fords))
125 |         self.assertSequenceEqual(fords, discoveries['vehicle']['car']['Ford'].itos)
126 | 
127 |         cessnas = [*itertools.chain.from_iterable(rule(ito) for rule in self.ontology['vehicle']['airplane']['Cessna'].rules)]
128 |         self.assertLess(0, len(cessnas))
129 |         self.assertSequenceEqual(cessnas, discoveries['vehicle']['airplane']['Cessna'].itos)
130 | 


--------------------------------------------------------------------------------
/tests/query/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/query/__init__.py


--------------------------------------------------------------------------------
/tests/table/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/table/__init__.py


--------------------------------------------------------------------------------
/tests/table/test_table.py:
--------------------------------------------------------------------------------
 1 | import pawpaw
 2 | from tests.util import _TestIto
 3 | 
 4 | import regex
 5 | 
 6 | class TestTable(_TestIto):
 7 |     _test_style_data: list[tuple[str, str]]  = [
 8 |         (
 9 |             'TYPE_1',
10 | """-----+-----+-----
11 |   A  |  B  |  C
12 | -----+-----+-----
13 |  aaa | bbb | ccc
14 | -----+-----+-----"""
15 |         ),
16 |         (
17 |             'TYPE_2',
18 | """-------------------
19 | |  A  |  B  |  C  |
20 | |-----------------|
21 | | aaa | bbb | ccc |
22 | -------------------"""
23 |         ),
24 |     ]
25 | 
26 | 
27 |     def test_named_styles(self) -> None:
28 |         for style_name, data in self._test_style_data:
29 |             style: pawpaw.table.StyledTable = getattr(pawpaw.table.styles, style_name)
30 |             table = pawpaw.table.StyledTable(style)
31 |             for leading_trailing_crs in (False, True):
32 |                 indents = ['']
33 |                 if style.equi_distant_indent:
34 |                     indents.extend((' ', '\t', '  ', '\t '))
35 | 
36 |                 for indent in indents:
37 |                     with self.subTest(style=style_name, leading_trailing_crs=leading_trailing_crs, indent=indent):
38 |                         indented_data = '\n'.join([indent + line for line in data.split('\n')])
39 |                         indented_data = pawpaw.Ito(indented_data)
40 | 
41 |                         if indent == '':
42 |                             self.assertEqual(data, str(indented_data))
43 | 
44 |                         if style.equi_distant_indent:
45 |                             ed = pawpaw.table.StyledTable._re_equi_ident
46 |                             itor = pawpaw.arborform.Extract(ed)
47 |                             edr = [*itor(indented_data)]
48 |                             self.assertEqual(1, len(edr))
49 |                             self.assertEqual(str(indented_data), str(edr[0]))
50 |                         else:
51 |                             self.assertEqual(data, str(indented_data))
52 | 
53 |                         crd_data = pawpaw.Ito(f'\n{indented_data}\n') if leading_trailing_crs else indented_data
54 | 
55 |                         itor = table.get_itor()
56 |                         itos = list(itor(crd_data))
57 |                         self.assertIsNotNone(itos)
58 |                         self.assertEqual(1, len(itos))
59 | 
60 |                         ito = itos[0]
61 |                         self.assertEqual('table', ito.desc)
62 | 
63 |                         rows = [*ito.find_all('*[d:row]')]
64 |                         self.assertEqual(2, len(rows))
65 |                         self.assertTrue(all(i.desc == 'row' for i in rows))
66 | 


--------------------------------------------------------------------------------
/tests/test_group_keys.py:
--------------------------------------------------------------------------------
 1 | import regex
 2 | from pawpaw import GroupKeys
 3 | from tests.util import _TestIto
 4 | 
 5 | 
 6 | class TestGroupKeys(_TestIto):
 7 |     def test_preferred(self):
 8 |         for pat in r'.', r'(?P<key1>.).(?P<key2>.)':
 9 |             with self.subTest(re=pat):
10 |                 re = regex.compile(pat)
11 |                 pgks = GroupKeys.preferred(re)
12 |                 self.assertEqual(re.groups + 1, len(pgks))
13 |                 for i, gk in enumerate(pgks):
14 |                     if isinstance(gk, str):
15 |                         self.assertEqual(i, re.groupindex[gk])
16 |                     else:
17 |                         self.assertEqual(i, gk)
18 | 
19 |     def test_validate(self):
20 |         re = regex.compile(r'(?P<key1>.).(?P<key2>.)')
21 |         
22 |         for valid_gks in [[0], [0, 1, 2], ['key1'], ['key1', 'key2'], [0, 'key2'], GroupKeys.preferred(re)]:
23 |             with self.subTest(group_keys=valid_gks):
24 |                 GroupKeys.validate(re, valid_gks)
25 | 
26 |         for invalid_gks in [[-1], [0, 3], ['xyz'], ['key1', 'key1'], [1, 'key1'], ['key1', 1]]:
27 |             with self.subTest(group_keys=invalid_gks):
28 |                 with self.assertRaises(ValueError):
29 |                     GroupKeys.validate(re, invalid_gks)
30 | 


--------------------------------------------------------------------------------
/tests/test_invoke_func.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | import typing
 3 | 
 4 | import pawpaw
 5 | from tests.util import _TestIto
 6 | 
 7 | 
 8 | def arg_only_func(a: bool, b: int) -> typing.Dict[str, typing.Any]:
 9 |     return {'a': a, 'b': b}
10 | 
11 | 
12 | def arg_kwonlyargs_func(a: bool, b: int = 1, c: float = 1.0, d: str = 'd param val') -> typing.Dict[str, typing.Any]:
13 |     return {'a': a, 'b': b, 'c': c, 'd': d}
14 | 
15 | 
16 | def big_func(a: bool, b: int = 1, *args, c: float = 1.0, d: str = 'd param val', **kwargs) -> typing.Dict[str, typing.Any]:
17 |     return {'a': a, 'b': b, '*args': args, 'c': c, 'd': d, '**kwargs': kwargs}
18 | 
19 | 
20 | class TestDescFunc(_TestIto):
21 |     def test_arg_only_func(self):
22 |         for vars in {'a': False, 'b': -1}, {'a': False, 'b': -1, 'c': 1.234}:
23 |             with self.subTest(vars=vars):
24 |                 rv = pawpaw.type_magic.invoke_func(arg_only_func, *vars.values())
25 |                 for k, v in vars.items():
26 |                     if k in ('a', 'b'):
27 |                         self.assertEqual(v, rv[k])
28 |                     else:
29 |                         self.assertNotIn(k, rv.keys())
30 | 
31 |     def test_arg_kwonlyargs_func(self):
32 |         for vars in {'a': True}, \
33 |                     {'a': False, 'b': -1}, \
34 |                     {'a': False, 'b': -1, 'c': 1.234}, \
35 |                     {'a': False, 'b': -1, 'c': 1.234, 'd': 'd-value'}:
36 |             with self.subTest(vars=vars):
37 |                 rv = pawpaw.type_magic.invoke_func(arg_kwonlyargs_func, *vars.values())
38 |                 for k, v in vars.items():
39 |                     self.assertEqual(v, rv[k])
40 | 
41 |     def test_args(self):
42 |         for vars in {'a': True}, {'a': False, 'b': -1}:
43 |             with self.subTest(vars=vars):
44 |                 rv = pawpaw.type_magic.invoke_func(big_func, *vars.values())
45 |                 for k, v in vars.items():
46 |                     self.assertEqual(v, rv[k])
47 | 
48 |     def test_args_kwargs(self):
49 |         for vars in {'a': True, 'c': 1.234}, {'a': False, 'b': -1, 'c': 5.678}:
50 |             with self.subTest(vars=vars):
51 |                 rv = pawpaw.type_magic.invoke_func(big_func, *vars.values())
52 |                 for k, v in vars.items():
53 |                     self.assertEqual(v, rv[k])
54 | 


--------------------------------------------------------------------------------
/tests/test_span.py:
--------------------------------------------------------------------------------
 1 | from pawpaw import Span, Ito
 2 | from tests.util import _TestIto
 3 | 
 4 | 
 5 | class TestSpan(_TestIto):
 6 |     def test_from_indices_valid(self):
 7 |         for s in '', ' ', ' abc ':
 8 |             for start in (-100, -1, None, 0, 1, 100):
 9 |                 for stop in (-100, -1, None, 0, 1, 100):
10 |                     if len(s) == 0:
11 |                         ito = Ito(s)
12 |                     elif len(s) == 1:
13 |                         ito = Ito(s, 1)
14 |                     else:
15 |                         ito = Ito(s, 1, -1)
16 | 
17 |                     for basis in s, ito:
18 |                         with self.subTest(basis=basis, start=start, stop=stop):
19 |                             _slice = slice(start, stop)
20 |                             expected = basis[_slice]
21 |                             span = Span.from_indices(basis, start, stop)
22 |                             actual = basis[slice(*span)]
23 |                             self.assertEqual(expected, actual)
24 | 
25 |     def test_from_indices_invalid_base(self):
26 |         for basis in [None, 1.0]:
27 |             with self.subTest(basis=basis):
28 |                 with self.assertRaises(TypeError):
29 |                     Span.from_indices(basis)
30 | 
31 |     def test_from_indices_invalid_indices(self):
32 |         s = 'abc'
33 |         for k, v in {'start': 1.0, 'stop': 1.0}.items():
34 |             with self.subTest(basis=s, **{k: v}):
35 |                 with self.assertRaises(TypeError):
36 |                     Span.from_indices(s, **{k: v})
37 |                     
38 |     def test_offset(self):
39 |         s = 'abc'
40 |         for basis in s, Ito(s, 1, -1):
41 |             for i in -100, -1, 0, 1, 100:
42 |                 with self.subTest(basis=basis, i=i):
43 |                     span = Span.from_indices(basis)
44 |                     if (span.start + i < 0) or (span.stop + i < 0):
45 |                         with self.assertRaises(ValueError):
46 |                             span.offset(i)
47 |                     else:
48 |                         rv = span.offset(i)
49 |                         self.assertEqual(span.start + i, rv.start)
50 |                         self.assertEqual(span.stop + i, rv.stop)
51 | 


--------------------------------------------------------------------------------
/tests/test_type_magic.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | import typing
  3 | 
  4 | import pawpaw
  5 | from tests.util import _TestIto
  6 | 
  7 | 
  8 | class Foo:
  9 |     ...
 10 |     
 11 | 
 12 | class FooDerived(Foo):
 13 |     ...
 14 | 
 15 | 
 16 | T_RET = bool
 17 | 
 18 | T_P1 = str
 19 | T_P2 = int | None
 20 | T_P3 = Foo
 21 | T_P4 = list[int]
 22 | 
 23 | F_EXACT = typing.Callable[[T_P1, T_P2, T_P3, T_P4], T_RET]
 24 | F_UNION_ELEMENT = typing.Callable[[T_P1, int, T_P3, T_P4], T_RET]
 25 | F_SUBTYPE = typing.Callable[[T_P1, T_P2, FooDerived, T_P4], T_RET]
 26 | F_NON_GENERIC = typing.Callable[[T_P1, T_P2, T_P3, list], T_RET]
 27 | F_INVALID_GENERIC = typing.Callable[[T_P1, T_P2, T_P3, list[str]], T_RET]
 28 | F_TOO_FEW = typing.Callable[[T_P1, T_P2, T_P3], T_RET]
 29 | F_TOO_MANY = typing.Callable[[T_P1, T_P2, T_P3, T_P4, bool], T_RET]
 30 | F_WRONG_RET = typing.Callable[[T_P1, T_P2, T_P3, T_P4], str]
 31 | 
 32 | 
 33 | def def_dir_w_type_hints(a: str, b: int | None, c: Foo, d: list[int]) -> bool:
 34 |     return True
 35 | 
 36 | 
 37 | def def_dir_wo_type_hints(a, b, c, d):
 38 |     return True
 39 | 
 40 | 
 41 | def def_indir_w_type_hints(a: T_P1, b: T_P2, c: T_P3, d: T_P4) -> T_RET:
 42 |     return True
 43 | 
 44 | 
 45 | def def_dir_subtype_w_type_hints(a: T_P1, b: T_P2, c: FooDerived, d: T_P4) -> T_RET:
 46 |     return True
 47 | 
 48 | 
 49 | @dataclass
 50 | class _TestData:
 51 |     name: str
 52 |     type_hints: bool
 53 |     subtype: bool
 54 |     functoid: typing.Callable
 55 |         
 56 |         
 57 | class TestTypeMagic(_TestIto):
 58 |     @classmethod
 59 |     def cls_m_w_type_hints(cls, a: T_P1, b: T_P2, c: T_P3, d: T_P4) -> T_RET:
 60 |         return True
 61 |     
 62 |     @classmethod
 63 |     def cls_m_wo_type_hints(cls, a, b, c, d):
 64 |         return True
 65 |     
 66 |     def inst_m_w_type_hints(self, a: T_P1, b: T_P2, c: T_P3, d: T_P4) -> T_RET:
 67 |         return self is not None
 68 |     
 69 |     def inst_m_wo_type_hints(self, a, b, c, d):
 70 |         return self is not None
 71 | 
 72 |     def setUp(self) -> None:
 73 |         super().setUp()
 74 | 
 75 |         lam_w_type_hints: typing.Callable[[T_P1, T_P2, T_P3, T_P4], T_RET] = lambda a, b, c, d: True
 76 | 
 77 |         lam_wo_type_hints = lambda a, b, c, d: True
 78 | 
 79 |         self.test_data = [
 80 |             _TestData('def direct', True, False, def_dir_w_type_hints),
 81 |             _TestData('def direct', False, False, def_dir_wo_type_hints),
 82 |             _TestData('def indirect', True, False, def_indir_w_type_hints),
 83 |             _TestData('def indirect subtype', True, True, def_dir_subtype_w_type_hints),
 84 |             _TestData('class method', True, False, TestTypeMagic.cls_m_w_type_hints),
 85 |             _TestData('class method', False, False, TestTypeMagic.cls_m_wo_type_hints),
 86 |             _TestData('instance method', True, False, self.inst_m_w_type_hints),
 87 |             _TestData('instance method', False, False, self.inst_m_wo_type_hints),
 88 |             _TestData('lambda', True, False, lam_w_type_hints),
 89 |             _TestData('lambda', False, False, lam_wo_type_hints),
 90 |         ]
 91 | 
 92 |     def test_is_callable_type_or_generic(self):
 93 |         for t in T_RET, T_P1, T_P2, T_P3, T_P4:
 94 |             with self.subTest(type=t):
 95 |                 self.assertFalse(pawpaw.type_magic.is_callable_type_or_generic(t))
 96 | 
 97 |         for t in F_EXACT, F_UNION_ELEMENT, F_SUBTYPE, F_NON_GENERIC, F_INVALID_GENERIC, F_TOO_FEW, F_TOO_MANY, F_WRONG_RET:
 98 |             with self.subTest(type=t):
 99 |                 self.assertTrue(pawpaw.type_magic.is_callable_type_or_generic(t))
100 |     
101 |         for ti in self.test_data:
102 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
103 |                 self.assertFalse(pawpaw.type_magic.is_callable_type_or_generic(ti.functoid))
104 |             
105 |     def test_is_functoid(self):
106 |         for ti in self.test_data:
107 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
108 |                 self.assertTrue(pawpaw.type_magic.is_functoid(ti.functoid))
109 | 
110 |     def test_is_def(self):
111 |         for ti in self.test_data:
112 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
113 |                 self.assertEqual(ti.name.startswith('def'), pawpaw.type_magic.is_def(ti.functoid))
114 | 
115 |     def test_is_lambda(self):
116 |         for ti in self.test_data:
117 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
118 |                 self.assertEqual(ti.name.startswith('lambda'), pawpaw.type_magic.is_lambda(ti.functoid))
119 | 
120 |     def test_is_callable_exact(self):
121 |         for ti in self.test_data:
122 |             with self.subTest(type=ti.name, type_hints=ti.type_hints):
123 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_EXACT))
124 | 
125 |     def test_is_callable_union_element(self):
126 |         for ti in self.test_data:
127 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
128 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_UNION_ELEMENT))
129 | 
130 |     def test_is_callable_subtype(self):
131 |         for ti in self.test_data:
132 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
133 |                 expected = not ti.type_hints or ti.name.startswith('lambda') or ti.subtype
134 |                 self.assertEqual(expected, pawpaw.type_magic.functoid_isinstance(ti.functoid, F_SUBTYPE))
135 | 
136 |     def test_is_callable_non_generic(self):
137 |         for ti in self.test_data:
138 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
139 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_NON_GENERIC))
140 | 
141 |     def test_is_callable_invalid_generic(self):
142 |         for ti in self.test_data:
143 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
144 |                 # Only compares origin (list to list), so everything will pass
145 |                 self.assertTrue(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_INVALID_GENERIC))
146 | 
147 |     def test_is_callable_wrong_count(self):
148 |         for ti in self.test_data:
149 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
150 |                 self.assertFalse(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_TOO_FEW))
151 |                 self.assertFalse(pawpaw.type_magic.functoid_isinstance(ti.functoid, F_TOO_MANY))
152 | 
153 |     def test_is_callable_wrong_ret(self):
154 |         for ti in self.test_data:
155 |             with self.subTest(type=ti.name, type_hints=ti.type_hints, subtype=ti.subtype):
156 |                 expected = not ti.type_hints or ti.name.startswith('lambda')  # type hints for lambdas don't show in annotations
157 |                 actual = pawpaw.type_magic.functoid_isinstance(ti.functoid, F_WRONG_RET)
158 |                 self.assertEqual(expected, actual)
159 | 
160 |         F_INT_2_NONE = typing.Callable[[int], None]
161 | 
162 |         def no_ret_val_w_type_hints(i: int) -> None:
163 |             return
164 | 
165 |         with self.subTest(type=no_ret_val_w_type_hints.__name__, type_hints=True, subtype=False):
166 |             actual = pawpaw.type_magic.functoid_isinstance(no_ret_val_w_type_hints, F_INT_2_NONE)
167 |             self.assertTrue(actual)
168 | 
169 |         def no_ret_val_wo_type_hints(i):
170 |             return
171 | 
172 |         with self.subTest(type=no_ret_val_wo_type_hints.__name__, type_hints=False, subtype=False):
173 |             actual = pawpaw.type_magic.functoid_isinstance(no_ret_val_wo_type_hints, F_INT_2_NONE)
174 |             self.assertTrue(actual)
175 | 
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/tests/test_util.py:
--------------------------------------------------------------------------------
  1 | import pawpaw
  2 | 
  3 | from tests.util import _TestIto
  4 | 
  5 | 
  6 | class TestFindUnescaped(_TestIto):
  7 |     def test_find_unescaped_invalid(self):
  8 |         s = ' abc '
  9 |         for src in s, pawpaw.Ito(s, 1, -1):
 10 |             for chars in [None, '']:
 11 |                 with self.subTest(src=src, chars=chars):
 12 |                     with self.assertRaises((TypeError, ValueError)):
 13 |                         next(pawpaw.find_unescaped(src, chars))
 14 | 
 15 |         for chars in 'a', 'ab':
 16 |             for src in s, pawpaw.Ito(s, 1, -1):
 17 |                 for escape in [None, '', '\\\\']:
 18 |                     with self.subTest(src=src, chars=chars, escape=escape):
 19 |                         with self.assertRaises((TypeError, ValueError)):
 20 |                             next(pawpaw.find_unescaped(src, chars, escape))
 21 | 
 22 |     def test_find_unescaped_trailing_escape(self):
 23 |         chars = 'a'
 24 |         src = 'a\\'
 25 |         with self.subTest(src=src, chars=chars):
 26 |             with self.assertRaises(ValueError):
 27 |                 [*pawpaw.find_unescaped(src, chars)]
 28 |                 
 29 |     def test_find_unescaped_empty_src(self):
 30 |         s = ''
 31 |         chars = 'a'
 32 |         for src in s, pawpaw.Ito(s), pawpaw.Ito('ab', 1, -1):
 33 |             with self.subTest(src=src, chars=chars):
 34 |                 i = next(pawpaw.find_unescaped(src, chars), None)
 35 |                 self.assertIsNone(i)
 36 | 
 37 |     def test_find_unescaped_not_present(self):
 38 |         s = ' abc '
 39 |         chars = 'z'
 40 |         for src in s, pawpaw.Ito(s), pawpaw.Ito(s, 1, -1):
 41 |             with self.subTest(src=src, chars=chars):
 42 |                 i = next(pawpaw.find_unescaped(src, chars), None)
 43 |                 self.assertIsNone(i)
 44 | 
 45 |     def test_find_unescaped_multiple(self):
 46 |         for string in 'a', 'b', 'ab', 'abc', 'bac':
 47 |             for chars in 'a', 'ab', 'cb':
 48 |                 with self.subTest(src=string, chars=chars):
 49 |                     expected = [i for i, c in enumerate(string) if c in chars]
 50 |                     actual = [*pawpaw.find_unescaped(string, chars)]
 51 |                     self.assertListEqual(expected, actual)
 52 | 
 53 |     def test_find_unescaped_simple(self):
 54 |         for string in 'a', 'b':
 55 |             chars = 'a'
 56 |             for pre in range(1, 5):
 57 |                 s = '\\' * pre + string
 58 |                 with self.subTest(src=s, chars=chars):
 59 |                     if pre & 1:  # odd
 60 |                         expected = []
 61 |                     else:        # even
 62 |                         expected = s.find(chars)
 63 |                         expected = [] if expected == -1 else [expected]
 64 |                     actual = [*pawpaw.find_unescaped(s, chars)]
 65 |                     self.assertListEqual(expected, actual)
 66 |     
 67 |     def test_find_unescaped_complex(self):
 68 |         s = ' a&b&&c '
 69 |         escape = '&'
 70 |         for src in s, pawpaw.Ito(s, 1, -1):
 71 |             for chars in 'a', 'b', 'c':
 72 |                 with self.subTest(src=src, chars=chars, escape=escape):
 73 |                     if chars == 'b':
 74 |                         expected = []
 75 |                     else:
 76 |                         expected = [str(src).find(chars)]
 77 |                     actual = [*pawpaw.find_unescaped(src, chars, escape)]
 78 |                     self.assertListEqual(expected, actual)
 79 | 
 80 | 
 81 | class TestSplitUnescaped(_TestIto):
 82 |     def test_split_unescaped_complex(self):
 83 |         s = ' a&b&&c '
 84 |         escape = '&'
 85 |         for src in s, pawpaw.Ito(s, 1, -1):
 86 |             for chars in 'a', 'b', 'c':
 87 |                 with self.subTest(src=src, chars=chars, escape=escape):
 88 |                     if chars == 'b':
 89 |                         expected = [src]
 90 |                     else:
 91 |                         i = str(src).index(chars)
 92 |                         expected = [src[:i], src[i + 1:]]
 93 |                     actual = [*pawpaw.split_unescaped(src, chars, escape)]
 94 |                     self.assertListEqual(expected, actual)
 95 | 
 96 |     def test_split_unescaped_prefix_suffix(self):
 97 |         s = 'aba'
 98 |         for src in s, pawpaw.Ito(s):
 99 |             for chars in 'a', 'b', 'c':
100 |                 with self.subTest(src=src, chars=chars):
101 |                     if isinstance(src, str):
102 |                         expected = src.split(chars)
103 |                     elif chars == 'a':
104 |                         expected = [src[0:0], src[1:1+1], src[3:3]]
105 |                     elif chars == 'b':
106 |                         expected = [src[0:1], src[2:3]]
107 |                     else:  # chars == 'c'
108 |                         expected = [src]
109 |                     actual = [*pawpaw.split_unescaped(src, chars)]
110 |                     self.assertListEqual(expected, actual)
111 | 
112 | 
113 | class TestFindBalanced(_TestIto):
114 |     def test_find_balanced_differing(self):
115 |         lchar = '('
116 |         rchar = ')'
117 |         balanced_segments = [r'(\))', r'(\()', '()', '(a)', '(a(b))', '()', '(123(abc)(def)456)']
118 | 
119 |         for b in [*balanced_segments]:
120 |             with self.subTest(src=b, lchar=lchar, rchar=rchar):
121 |                 actual = next(pawpaw.find_balanced(b, lchar, rchar))
122 |                 self.assertEqual(b, actual)
123 | 
124 |             b = pawpaw.Ito(b)
125 |             lcito = pawpaw.Ito(lchar)
126 |             rcito = pawpaw.Ito(rchar)
127 |             with self.subTest(src=b, lchar=lcito, rchar=rcito):
128 |                 actual = next(pawpaw.find_balanced(b, lcito, rcito))
129 |                 self.assertEqual(b, actual)
130 | 
131 |             b = pawpaw.Ito(f'({b})')
132 |             lcito = pawpaw.Ito(lchar)
133 |             rcito = pawpaw.Ito(rchar)
134 |             with self.subTest(src=b, lchar=lcito, rchar=rcito, start=1, stop=-1):
135 |                 actual = next(pawpaw.find_balanced(b, lcito, rcito, start=1, stop=-1))
136 |                 self.assertEqual(b[1:-1], actual)
137 | 
138 |         b = ''.join(balanced_segments)
139 |         with self.subTest(src=b, lchar=lchar, rchar=rchar):
140 |             actual = [*pawpaw.find_balanced(b, lchar, rchar)]
141 |             self.assertListEqual(balanced_segments, actual)
142 | 
143 |     def test_find_balanced_homogenous(self):
144 |         lchar = '"'
145 |         rchar = '"'
146 |         tokens = ("A", "B", "C")
147 |         src = ' '.join(f'"{t}"' for t in tokens)
148 | 
149 |         with self.subTest(src=src, lchar=lchar, rchar=rchar):
150 |             actual = [*pawpaw.find_balanced(src, lchar, rchar)]
151 |             self.assertListEqual([f'"{t}"' for t in tokens], [str(i) for i in actual])
152 | 


--------------------------------------------------------------------------------
/tests/test_version.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import pawpaw
 4 | 
 5 | 
 6 | class TestVersion(unittest.TestCase):
 7 | 
 8 |     _valid_versions = [
 9 |         '1.dev0',
10 |         '1.0.dev456',
11 |         '1.0a1',
12 |         '1.0a2.dev456',
13 |         '1.0a12.dev456',
14 |         '1.0a12',
15 |         '1.0b1.dev456',
16 |         '1.0b2',
17 |         '1.0b2.post345.dev456',
18 |         '1.0b2.post345',
19 |         '1.0rc1.dev456',
20 |         '1.0rc1',
21 |         '1.0',
22 |         '1.0+abc.5',
23 |         '1.0+abc.7',
24 |         '1.0+5',
25 |         '1.0.post456.dev34',
26 |         '1.0.post456',
27 |         '1.0.15',
28 |         '1.1.dev1',
29 |     ]
30 |     # Taken from https://peps.python.org/pep-0440/#summary-of-permitted-suffixes-and-relative-ordering
31 | 
32 |     def test_is_canonical_valid(self):
33 |         for v in self._valid_versions:
34 |             with self.subTest(version=v):
35 |                 self.assertTrue(pawpaw.Version.is_canonical(v))
36 | 
37 |     _invalid_versions = [
38 |         '1.0a',
39 |         '1.0dev0',
40 |         '1.0post',
41 |         '1.0d1',
42 |     ]
43 | 
44 |     def test_is_canonical_invalid(self):
45 |         for v in self._invalid_versions:
46 |             with self.subTest(version=v):
47 |                 self.assertFalse(pawpaw.Version.is_canonical(v))
48 | 
49 |     def test_version_parse_re(self):
50 |         v = '1.2a34.dev567+xyz.8'
51 |         m = pawpaw.Version.parse_re.fullmatch(v)
52 |         self.assertIsNotNone(m)
53 |         ito = pawpaw.Ito.from_match(m)[0]
54 | 
55 |         tests: list[tuple[str]] = [
56 |             ('release', '*[d:release]', '1.2'),
57 | 
58 |             ('pre', '*[d:pre]', 'a34'),
59 |             ('pre_l', '*[d:pre]/*[d:pre_l]', 'a'),
60 |             ('pre_n', '*[d:pre]/*[d:pre_n]', '34'),
61 | 
62 |             ('dev', '*[d:dev]', '.dev567'),
63 |             ('dev_l', '*[d:dev]/*[d:dev_l]', 'dev'),
64 |             ('dev_n', '*[d:dev]/*[d:dev_n]', '567'),
65 | 
66 |             ('local', '*[d:local]', 'xyz.8'),
67 |         ]
68 | 
69 |         for name, path, expected in tests:
70 |             with self.subTest(component=name):
71 |                 val = ito.find(path)
72 |                 self.assertEqual(expected, str(val))
73 | 


--------------------------------------------------------------------------------
/tests/test_xml_helper.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | # Force Python XML parser, not faster C version so that we can hook methods
  3 | sys.modules['_elementtree'] = None
  4 | import xml.etree.ElementTree as ET
  5 | 
  6 | import pawpaw
  7 | import pawpaw.xml as xml
  8 | from tests.util import _TestIto, XML_TEST_SAMPLES
  9 | 
 10 | class TestQualifiedName(_TestIto):
 11 |     def test_from_src(self):
 12 |         for s in 'a', 'a:b':
 13 |             ito = pawpaw.Ito(s)
 14 |             parts = ito.str_split(':')
 15 |             if len(parts) == 1:
 16 |                 parts.insert(0, None)
 17 |             expected = xml.QualifiedName(*parts)
 18 | 
 19 |             with self.subTest(src=ito):
 20 |                 actual = xml.QualifiedName.from_src(ito)
 21 |                 self.assertEqual(expected, actual)
 22 | 
 23 |             with self.subTest(src=s):
 24 |                 actual = xml.QualifiedName.from_src(s)
 25 |                 self.assertEqual(expected, actual)
 26 | 
 27 | 
 28 | class TestXmlHelper(_TestIto):
 29 |     def test_get_qualified_name(self):
 30 |         pass
 31 | 
 32 |     def test_get_xmlns(self):
 33 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 34 |             with self.subTest(xml_sample_index=sample_index):
 35 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 36 |                 xmlns = xml.XmlHelper.get_xmlns(root)
 37 |                 xmlns = {str(k.local_part): str(v) for k, v in xmlns.items()}
 38 |                 if sample.default_namespace is None:
 39 |                     self.assertIsNone(xmlns.get('xmlns'))
 40 |                 else:
 41 |                     self.assertLessEqual({'xmlns': sample.default_namespace[1:-1]}.items(), xmlns.items())
 42 |                 self.assertLessEqual(sample.root_prefix_map.items(), xmlns.items())
 43 | 
 44 |     def test_get_prefix_map_root(self):
 45 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 46 |             with self.subTest(xml_sample_index=sample_index):
 47 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 48 |                 self.assertDictEqual(sample.root_prefix_map, xml.XmlHelper.get_prefix_map(root))
 49 | 
 50 |     def test_get_prefix_map_composite(self):
 51 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 52 |             with self.subTest(xml_sample_index=sample_index):
 53 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 54 |                 actual = xml.XmlHelper.get_prefix_map(root)
 55 |                 self.assertEqual(sample.root_prefix_map, actual)
 56 | 
 57 |                 actual = {}
 58 |                 for e in root.findall('.//'):
 59 |                     actual |= xml.XmlHelper.get_prefix_map(e)
 60 |                 self.assertDictEqual(sample.descendants_composite_prefix_map, actual)
 61 | 
 62 |     def test_get_default_namespace(self):
 63 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 64 |             depth = 0
 65 |             element = ET.fromstring(sample.xml, xml.XmlParser())
 66 |             while element is not None:
 67 |                 with self.subTest(xml_sample_index=sample_index, depth=depth):
 68 |                     if sample.default_namespace is None:
 69 |                         self.assertIsNone(xml.XmlHelper.get_default_namespace(element))
 70 |                     else:
 71 |                         self.assertEqual(sample.default_namespace, str(xml.XmlHelper.get_default_namespace(element)))
 72 |                 depth += 1
 73 |                 element = element.find('*')
 74 | 
 75 |     def test_get_element_text_if_found(self):
 76 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 77 |             path = sample.text_containing_descendant_path
 78 |             with self.subTest(xml_sample_index=sample_index, path=path):
 79 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
 80 |                 expected = root.find(path).text
 81 |                 actual = xml.XmlHelper.get_element_text_if_found(root, path)
 82 |                 self.assertEqual(expected, actual)
 83 | 
 84 |             invalid_path = path + '/.[tag=""]'  # ensures path returns nothing
 85 |             with self.subTest(xml_sample_index=sample_index, path=invalid_path):
 86 |                 actual = xml.XmlHelper.get_element_text_if_found(root, invalid_path)
 87 |                 self.assertIsNone(actual)
 88 | 
 89 |     def test_get_parent_element(self):
 90 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 91 |             root = ET.fromstring(sample.xml, xml.XmlParser())
 92 |             
 93 |             depth = 0
 94 |             with self.subTest(xml_sample_index=sample_index, depth=depth):
 95 |                 self.assertIsNone(xml.XmlHelper.get_parent_element(root))
 96 | 
 97 |             parent = root
 98 |             while (child := parent.find('*')) is not None:
 99 |                 depth += 1
100 |                 with self.subTest(xml_sample_index=sample_index, depth=depth):
101 |                     actual = xml.XmlHelper.get_parent_element(child)
102 |                     self.assertIs(parent, actual)
103 |                 parent = child
104 | 
105 |     def test_reverse_find(self):
106 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
107 |             desc_path, anc_pred = sample.descendant_path_with_ancestor_predicate
108 |             with self.subTest(xml_sample_index=sample_index, descendant_path=desc_path, ancestor_predicate=anc_pred):
109 |                 root = ET.fromstring(sample.xml, xml.XmlParser())
110 | 
111 |                 desc = root.find(desc_path)
112 |                 self.assertIsNotNone(desc)
113 | 
114 |                 actual = xml.XmlHelper.reverse_find(desc, anc_pred)
115 |                 self.assertIsNotNone(actual)
116 | 


--------------------------------------------------------------------------------
/tests/test_xml_parser.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | # Force Python XML parser, not faster C version so that we can hook methods
  3 | sys.modules['_elementtree'] = None
  4 | import xml.etree.ElementTree as ET
  5 | import html
  6 | import itertools
  7 | 
  8 | from pawpaw import Ito, Span, xml
  9 | from tests.util import _TestIto, XML_TEST_SAMPLES
 10 | 
 11 | 
 12 | class TestXmlParser(_TestIto):
 13 |     def test_basic(self):
 14 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 15 |             with self.subTest(xml_sample_index=sample_index):
 16 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 17 |                 for e in itertools.chain([root_e], root_e.iterfind('**')):
 18 |                     with self.subTest(element=e):
 19 |                         self.assertTrue(hasattr(e, 'ito'))
 20 |                         i = e.ito
 21 |                         self.assertEqual(xml.descriptors.ELEMENT, i.desc)
 22 |                         self.assertIs(e, i.value())
 23 | 
 24 |     def test_attributes(self):
 25 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 26 |             with self.subTest(xml_sample_index=sample_index):
 27 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 28 |                 for e in itertools.chain([root_e], root_e.iterfind('**')):
 29 |                     with self.subTest(element=e):
 30 |                         i = e.ito.find(f'*[d:{xml.descriptors.START_TAG}]/*[d:{xml.descriptors.ATTRIBUTES}]')
 31 |                         if i is None:
 32 |                             self.assertEqual(0, len(e.attrib.keys()))
 33 |                         else:
 34 |                             self.assertIs(e.attrib, i.value())
 35 |                             xmlns_attrs = xml.XmlHelper.get_xmlns(e)
 36 |                             non_xmlns_attrs_count = len(i.children) - len(xmlns_attrs.keys())
 37 |                             self.assertEqual(len(e.attrib.keys()), non_xmlns_attrs_count)
 38 | 
 39 |     def test_namespace(self):
 40 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 41 |             with self.subTest(xml_sample_index=sample_index):
 42 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 43 | 
 44 |                 start_tag = root_e.ito.find(f'**[d:' + xml.descriptors.START_TAG + ']')
 45 |                 self.assertIsNotNone(start_tag)
 46 | 
 47 |                 for attr in start_tag.find_all(f'**[d:' + xml.descriptors.ATTRIBUTE + ']'):
 48 |                     if attr is not None:
 49 |                         expected = [xml.descriptors.TAG, xml.descriptors.VALUE]
 50 |                         self.assertListEqual(expected, [i.desc for i in attr.children])
 51 | 
 52 |                         expected = [xml.descriptors.NAME]
 53 |                         tag = attr.children[0]
 54 |                         if tag.str_find(':') >= 0:
 55 |                             expected.insert(0, xml.descriptors.NAMESPACE)
 56 |                         self.assertListEqual(expected, [i.desc for i in tag.children])
 57 | 
 58 |     def test_hierarchical(self):
 59 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 60 |             with self.subTest(xml_sample_index=sample_index):
 61 |                 root_e = ET.fromstring(sample.xml, parser=xml.XmlParser())
 62 | 
 63 |                 root_i: Ito = root_e.ito
 64 |                 self.assertIsNotNone(root_i)
 65 |                 self.assertIs(root_e, root_i.value())
 66 | 
 67 |                 for child_e in root_e.findall('.//'):
 68 |                     child_i = child_e.ito
 69 |                     self.assertIsNotNone(child_i)
 70 |                     self.assertIs(child_e, child_i.value())
 71 | 
 72 |     def test_values(self):
 73 |         for sample_index, sample in enumerate(XML_TEST_SAMPLES):
 74 |             with self.subTest(xml_sample_index=sample_index):
 75 |                 root = ET.fromstring(sample.xml, parser=xml.XmlParser()).ito
 76 |                 for ito in root.find_all('**!![d:' + ','.join((xml.descriptors.ELEMENT, xml.descriptors.TAG)) + ']'):
 77 |                     desc = ito.desc
 78 |                     with self.subTest(ito_desc=desc, ito_span=ito.span):
 79 |                         if desc == xml.descriptors.ELEMENT:
 80 |                             self.assertIsInstance(ito.value(), ET.Element)
 81 |                         elif desc == xml.descriptors.TAG:
 82 |                             self.assertIsInstance(ito.value(), xml.QualifiedName)
 83 | 
 84 |     def test_tails(self):
 85 |         # xml fragment taken from https://docs.python.org/3/library/xml.etree.elementtree.html
 86 |         fragment = '<a><b>1<c>2<d/>3</c></b>4</a>'
 87 |         root = ET.fromstring(fragment, parser=xml.XmlParser())
 88 |         for descendant in root.findall('.//'):
 89 |             next_sibling = descendant.ito.find('>')
 90 |             if descendant.tail is None:
 91 |                 self.assertTrue(next_sibling is None or next_sibling.desc != xml.descriptors.TEXT)
 92 |             else:
 93 |                 self.assertEqual(descendant.tail, str(next_sibling))
 94 | 
 95 |     def test_self_closings(self):
 96 |         fragment = '<td><div><span></span><br/></div></td>'
 97 |         root = ET.fromstring(fragment, parser=xml.XmlParser()).ito
 98 | 
 99 |         start_tags = list(root.find_all('**!![d:' + xml.descriptors.START_TAG + ']/*[d:' + xml.descriptors.TAG + ']'))
100 |         self.assertSequenceEqual(('td', 'div', 'span', 'br'), tuple(str(st) for st in start_tags))
101 | 
102 |         end_tags = list(root.find_all('**!![d:' + xml.descriptors.END_TAG + ']/*[d:' + xml.descriptors.TAG + ']'))
103 |         self.assertSequenceEqual(('span', 'div', 'td'), tuple(str(et) for et in end_tags))
104 |         
105 |         span = root.find('**![s:<\/span>]')
106 |         self.assertIsNotNone(span)
107 |         self.assertEqual(xml.descriptors.END_TAG, span.desc)
108 | 
109 |     def test_xml_entity_references(self):
110 |         # Ensure that entity references (e.g., "&amp;") don't cause issues with span computations and Ito construction
111 |         sample = \
112 | """
113 | <root>
114 |     beans &amp; franks
115 |     <a>1 &lt; 2</a>
116 |     <b type="R&amp;B"/>
117 |     Q&amp;A
118 | </root>"""
119 |         root = ET.fromstring(sample, parser=xml.XmlParser())
120 | 
121 |         # First make sure our xml looks correct with de-escaped references for its text & tails
122 |         self.assertEqual(root.text.strip(), html.unescape('beans & franks'))
123 |         self.assertEqual(root[0].text, html.unescape('1 < 2'))
124 |         self.assertEqual(root[-1].attrib['type'], html.unescape('R&B'))
125 |         self.assertEqual(root[-1].tail.strip(), html.unescape('Q&A'))
126 | 
127 |         # Now compare html escaped xml text & tails to corresponding Itos
128 |         self.assertEqual(html.escape(root.text), root.ito.find(f'*[d:{xml.descriptors.TEXT}]').__str__())
129 |         self.assertEqual(html.escape(root[0].text), root[0].ito.find(f'*[d:{xml.descriptors.TEXT}]').__str__())
130 |         self.assertEqual(html.escape(root[-1].attrib['type']), root[-1].ito.find(f'**[d:{xml.descriptors.ATTRIBUTE}]/*[d:{xml.descriptors.VALUE}]').__str__())
131 |         self.assertEqual(html.escape(root[-1].tail), root.ito.find(f'-*[d:{xml.descriptors.TEXT}]').__str__())
132 | 
133 |     def test_xml_comments(self):
134 |         # Ensure that encoded text (e.g., "&amp;") doesn't cause problems with span computations
135 |         comment = '<!-- comment with encoded text: &amp; -->'
136 |         text = 'Here is some text'
137 |         sample = '<a>' + comment + text + '</a>'
138 | 
139 |         # root = ET.fromstring(sample, parser=ET.XMLParser(target=ET.TreeBuilder(insert_comments=True)))
140 |         root = ET.fromstring(sample, parser=xml.XmlParser())
141 |         self.assertEqual(root.text, text)
142 | 
143 |         text_ito = root.ito.find(f'*[d:{xml.descriptors.TEXT}]')
144 |         self.assertIsNotNone(text_ito)
145 |         self.assertEqual(comment + text, str(text_ito))
146 | 


--------------------------------------------------------------------------------
/tests/util.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import collections.abc
  3 | import random
  4 | import typing
  5 | from unittest import TestCase
  6 | 
  7 | import regex
  8 | from pawpaw import Span, Ito, Types
  9 | 
 10 | 
 11 | class IntIto(Ito):  # Used for derived class tests
 12 |     def value(self) -> typing.Any:
 13 |         return int(str(self))
 14 |     
 15 | 
 16 | class _TestIto(TestCase):
 17 |     @classmethod
 18 |     def add_chars_as_children(cls, ito: Ito, desc: str | None) -> None:
 19 |         ito.children.add(*(ito.clone(i, i + 1, desc) for i in range(*ito.span)))
 20 | 
 21 |     def matches_equal(self, first: regex.Match, second: regex.Match, msg: typing.Any = ...) -> None:
 22 |         if first is second:
 23 |             return
 24 |         
 25 |         self.assertListEqual([*first.regs], [*second.regs])
 26 |         self.assertEqual(first.group(0), second.group(0))
 27 |         self.assertSequenceEqual(first.groupdict().keys(), second.groupdict().keys())
 28 |         for v1, v2 in zip(first.groupdict().values(), second.groupdict().values()):
 29 |             self.assertEqual(v1, v2)
 30 |             
 31 |     def setUp(self) -> None:
 32 |         self.addTypeEqualityFunc(regex.Match, self.matches_equal)
 33 | 
 34 |         
 35 | class RandSpans:
 36 |     def __init__(
 37 |             self,
 38 |             size: Span = (1, 1),
 39 |             gap: Span = (0, 0),
 40 |     ):
 41 |         if not (isinstance(size, tuple) and len(size) == 2 and all(isinstance(i, int) for i in size)):
 42 |             raise TypeError('invalid \'size\'')
 43 |         if size[0] < 0 or size[1] < 1 or size[0] > size[1]:
 44 |             raise ValueError('invalid \'size\'')
 45 |         self.size = size
 46 | 
 47 |         if not (isinstance(gap, tuple) and len(gap) == 2 and all(isinstance(i, int) for i in gap)):
 48 |             raise TypeError('invalid \'gap\'')
 49 |         if (gap[0] < 0 and abs(gap[0]) >= size[0]) or (gap[1] < 0 and abs(gap[1]) >= size[0]):
 50 |             raise ValueError('invalid \'gap\'')
 51 |         self.gap = gap
 52 | 
 53 |     def generate(
 54 |             self,
 55 |             basis: int | collections.abc.Sized,
 56 |             start: int | None = None,
 57 |             stop: int | None = None
 58 |     ) -> typing.Iterable[Span]:
 59 |         i, stop = Span.from_indices(basis, start, stop)
 60 |         while i < stop:
 61 |             k = i + random.randint(*self.size)
 62 |             k = min(k, stop)
 63 |             yield Span(i, k)
 64 |             if k == stop:
 65 |                 break
 66 |             i = k + random.randint(*self.gap)
 67 | 
 68 | 
 69 | class RandSubstrings(RandSpans):
 70 |     def __init__(
 71 |             self,
 72 |             size: Span = Span(1, 1),
 73 |             gap: Span = Span(0, 0),
 74 |     ):
 75 |         super().__init__(size, gap)
 76 | 
 77 |     def generate(self, string: str, start: int | None = None, stop: int | None = None) -> typing.Iterable[str]:
 78 |         for span in super().generate(string, start, stop):
 79 |             yield string[slice(*span)]
 80 | 
 81 | 
 82 | class XmlTestSample(typing.NamedTuple):
 83 |     source: str
 84 |     
 85 |     default_namespace: None | str
 86 |     
 87 |     # prefix_map of root node
 88 |     root_prefix_map: typing.Dict[str, str]
 89 |     
 90 |     # combined prefix_map of all non-root nodes
 91 |     descendants_composite_prefix_map: typing.Dict[str, str]
 92 |     
 93 |     # path to arbitrary descendant that has non-empty .text
 94 |     text_containing_descendant_path: str  
 95 |     
 96 |     # path to arbitrary descendant that can be reverse-searched to find an ancestor that matches the predicate
 97 |     descendant_path_with_ancestor_predicate: typing.Tuple[str, str]
 98 | 
 99 |     xml: str
100 | 
101 | 
102 | XML_TEST_SAMPLES: typing.List[XmlTestSample] = [
103 |     XmlTestSample(
104 |         source='https://docs.python.org/3/library/xml.etree.elementtree.html',
105 |         default_namespace=None,
106 |         root_prefix_map={},
107 |         descendants_composite_prefix_map={},
108 |         text_containing_descendant_path='.//year',
109 |         descendant_path_with_ancestor_predicate=('.//gdppc', 'rank'),
110 |         xml=
111 | """
112 | <data>
113 |     <country name="Liechtenstein">
114 |         <rank>1</rank>
115 |         <year>2008</year>
116 |         <gdppc>141100</gdppc>
117 |         <neighbor name="Austria" direction="E"/>
118 |         <neighbor name="Switzerland" direction="W"/>
119 |     </country>
120 |     <country name="Singapore">
121 |         <rank>4</rank>
122 |         <year>2011</year>
123 |         <gdppc>59900</gdppc>
124 |         <neighbor name="Malaysia" direction="N"/>
125 |     </country>
126 |     <country name="Panama">
127 |         <rank>68</rank>
128 |         <year>2011</year>
129 |         <gdppc>13600</gdppc>
130 |         <neighbor name="Costa Rica" direction="W"/>
131 |         <neighbor name="Colombia" direction="E"/>
132 |     </country>
133 | </data>"""
134 |     ),
135 | 
136 |     XmlTestSample(
137 |         source='https://docs.python.org/3/library/xml.etree.elementtree.html',
138 |         default_namespace='{http://people.example.com}',
139 |         root_prefix_map={'fictional': 'http://characters.example.com'},
140 |         descendants_composite_prefix_map={},
141 |         text_containing_descendant_path='.//{http://people.example.com}name',
142 |         descendant_path_with_ancestor_predicate=('.//{http://characters.example.com}character', '{http://people.example.com}actor'),
143 |         xml=
144 | """
145 | <actors xmlns:fictional="http://characters.example.com"
146 |         xmlns="http://people.example.com">
147 |     <actor>
148 |         <name>John Cleese</name>
149 |         <fictional:character>Lancelot</fictional:character>
150 |         <fictional:character>Archie Leach</fictional:character>
151 |     </actor>
152 |     <actor>
153 |         <name>Eric Idle</name>
154 |         <fictional:character>Sir Robin</fictional:character>
155 |         <fictional:character>Gunther</fictional:character>
156 |         <fictional:character>Commander Clement</fictional:character>
157 |     </actor>
158 | </actors>"""
159 |     ),
160 | 
161 |     XmlTestSample(
162 |         source='https://www.xml.com/pub/a/1999/01/namespaces.html',
163 |         default_namespace=None,
164 |         root_prefix_map={'xdc': 'http://www.xml.com/books', 'h': 'http://www.w3.org/HTML/1998/html4'},
165 |         descendants_composite_prefix_map={},
166 |         text_containing_descendant_path='.//{http://www.xml.com/books}author',
167 |         descendant_path_with_ancestor_predicate=('.//{http://www.xml.com/books}date', '@align'),
168 |         xml='''
169 | <h:html xmlns:xdc="http://www.xml.com/books"
170 |         xmlns:h="http://www.w3.org/HTML/1998/html4">
171 |  <h:head><h:title>Book Review</h:title></h:head>
172 |  <h:body>
173 |   <xdc:bookreview>
174 |    <xdc:title>XML: A Primer</xdc:title>
175 |    <h:table>
176 |     <h:tr align="center">
177 |      <h:td>Author</h:td><h:td>Price</h:td>
178 |      <h:td>Pages</h:td><h:td>Date</h:td></h:tr>
179 |     <h:tr align="left">
180 |      <h:td><xdc:author>Simon St. Laurent</xdc:author></h:td>
181 |      <h:td><xdc:price>31.98</xdc:price></h:td>
182 |      <h:td><xdc:pages>352</xdc:pages></h:td>
183 |      <h:td><xdc:date>1998/01</xdc:date></h:td>
184 |     </h:tr>
185 |    </h:table>
186 |   </xdc:bookreview>
187 |  </h:body>
188 | </h:html>'''
189 |     ),
190 | 
191 |     XmlTestSample(
192 |         source='https://www.w3schools.com/xml/xml_namespaces.asp',
193 |         default_namespace=None,
194 |         root_prefix_map={},
195 |         descendants_composite_prefix_map={'h': 'http://www.w3.org/TR/html4/', 'f': 'https://www.w3schools.com/furniture'},
196 |         text_containing_descendant_path='.//{http://www.w3.org/TR/html4/}td',
197 |         descendant_path_with_ancestor_predicate=('.//{https://www.w3schools.com/furniture}length', '{https://www.w3schools.com/furniture}name'),
198 |         xml='''
199 | <root>
200 | 
201 | <h:table xmlns:h="http://www.w3.org/TR/html4/">
202 |   <h:tr>
203 |     <h:td>Apples</h:td>
204 |     <h:td>Bananas</h:td>
205 |   </h:tr>
206 | </h:table>
207 | 
208 | <f:table xmlns:f="https://www.w3schools.com/furniture">
209 |   <f:name>African Coffee Table</f:name>
210 |   <f:width>80</f:width>
211 |   <f:length>120</f:length>
212 | </f:table>
213 | 
214 | </root>'''
215 |     ),
216 | ]
217 | 
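
A hedged usage sketch of the helpers defined above. It assumes pawpaw exports Span at the top level and that this module is importable as tests.util; the sample string and seed are arbitrary illustrations, not part of the test suite.

import random

from pawpaw import Span
from tests.util import RandSubstrings, XML_TEST_SAMPLES

random.seed(0)  # make the randomized output repeatable for this sketch

# Each yielded chunk is 2-4 characters long; consecutive chunks skip 0-1
# characters of the source string (the size/gap Spans feed random.randint).
rs = RandSubstrings(size=Span(2, 4), gap=Span(0, 1))
print(list(rs.generate('The quick brown fox')))

The XML samples can be consumed the way the XML tests presumably do, for example with the standard library's ElementTree:

import xml.etree.ElementTree as ET

sample = XML_TEST_SAMPLES[0]
root = ET.fromstring(sample.xml)
# './/year' for the first sample; the first match is Liechtenstein's year.
print(root.find(sample.text_containing_descendant_path).text)  # 2008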


--------------------------------------------------------------------------------
/tests/visualization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rlayers/pawpaw/bd0a1881e7b33a7f752238c267e6672557452394/tests/visualization/__init__.py


--------------------------------------------------------------------------------
/tests/visualization/test_sgr.py:
--------------------------------------------------------------------------------
 1 | from pawpaw.visualization.sgr import *
 2 | from tests.util import _TestIto
 3 | 
 4 | 
 5 | class TestSgr(_TestIto):
 6 |     def test_sgr_reset_all(self):
 7 |         self.assertEqual('\033[0m', RESET_ALL)
 8 |         
 9 |     def test_sgr_encode(self):
10 |         for vals in (0,), (1,), (1,2,3):
11 |             with self.subTest(value=vals):
12 |                 vals_str = ';'.join(str(v) for v in vals)
13 |                 expected = f'\033[{vals_str}m'
14 |                 actual = encode(*vals)
15 |                 self.assertEqual(expected, actual)
16 |                 self.assertFalse(actual.isprintable())
17 | 
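
For context on the values asserted above: SGR sequences are ANSI escape codes of the form ESC [ parameters m. The following is a minimal standalone sketch (it does not use pawpaw) that builds the same strings the test expects:

def sgr_escape(*vals: int) -> str:
    # '\033[' is the CSI introducer; parameters are ';'-separated; 'm' ends the SGR sequence.
    return '\033[' + ';'.join(str(v) for v in vals) + 'm'

assert sgr_escape(0) == '\033[0m'            # reset all attributes (RESET_ALL)
assert sgr_escape(1, 2, 3) == '\033[1;2;3m'  # matches the (1, 2, 3) subtest above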


--------------------------------------------------------------------------------
/tests/visualization/test_visualization_ascii_box.py:
--------------------------------------------------------------------------------
  1 | import typing
  2 | 
  3 | import pawpaw
  4 | import pawpaw.visualization.ascii_box as box
  5 | from tests.util import _TestIto
  6 | 
  7 | directions: typing.List[box.Direction] = [
  8 |     box.Direction.N,
  9 |     box.Direction.NE,
 10 |     box.Direction.E,
 11 |     box.Direction.SE,
 12 |     box.Direction.S,
 13 |     box.Direction.SW,
 14 |     box.Direction.W,
 15 |     box.Direction.NW,
 16 | ]
 17 | 
 18 | class TestDirection(_TestIto):
 19 |     @classmethod
 20 |     def setUpClass(cls) -> None:
 21 |         cls.directions = directions
 22 | 
 23 |     def test_values(self):
 24 |         for i, d in enumerate(self.directions):
 25 |             with self.subTest(direction=d):
 26 |                 self.assertEqual(i * 45, d.value)
 27 | 
 28 |     def test_from_degrees(self):
 29 |         for direction in self.directions:
 30 |             for degrees in -720, -360, 0, 1, 44, 360, 720:
 31 |                 with self.subTest(direction=direction, degrees=degrees):
 32 |                     self.assertEqual(direction, direction.rotate(degrees))
 33 | 
 34 |             for degrees in -1, 45:
 35 |                 with self.subTest(direction=direction, degrees=degrees):
 36 |                     self.assertNotEqual(direction, direction.rotate(degrees))
 37 | 
 38 |     def test_rotate(self):
 39 |         for i, direction in enumerate(self.directions):
 40 |             for degrees in range(0, 360 + 45, 45):
 41 |                 with self.subTest(direction=direction, degrees=degrees):
 42 |                     self.assertEqual(
 43 |                         box.Direction.from_degrees(direction.value + degrees),
 44 |                         direction.rotate(degrees)
 45 |                     )
 46 | 
 47 |     def test_reflect(self):
 48 |         for direction in self.directions:
 49 |             for surface in direction, direction.rotate(180):
 50 |                 with self.subTest(direction=direction, surface=surface):
 51 |                     self.assertEqual(direction, direction.reflect(surface))
 52 | 
 53 |             for surface in direction.rotate(90), direction.rotate(-90):
 54 |                 with self.subTest(direction=direction, surface=surface):
 55 |                     self.assertEqual(direction.rotate(180), direction.reflect(surface))
 56 | 
 57 |             for delta in 45, -45:
 58 |                 surface = direction.rotate(delta)
 59 |                 with self.subTest(direction=direction, surface=surface):
 60 |                     rot = 90 if delta > 0 else -90
 61 |                     self.assertEqual(direction.rotate(rot), direction.reflect(surface))
 62 | 
 63 |             for delta in 135, -135:
 64 |                 surface = direction.rotate(delta)
 65 |                 with self.subTest(direction=direction, surface=surface):
 66 |                     rot = -90 if delta > 0 else 90
 67 |                     self.assertEqual(direction.rotate(rot), direction.reflect(surface))
 68 | 
 69 | 
 70 | class TestAsciiBoxDrawing(_TestIto):
 71 |     @classmethod
 72 |     def setUpClass(cls) -> None:
 73 |         cls.directions = directions
 74 | 
 75 |         rotations = [
 76 |             ['┃', '━', '┃', '━'],
 77 |             ['┍', '┒', '┙', '┖'],
 78 |             ['┡', '┲', '┪', '┹'],
 79 |             ['╔', '╗', '╝', '╚'],
 80 |         ]
 81 |         cls.ninety_degree_rotations: typing.List[typing.List[box.BoxDrawingChar]] = [
 82 |             [box.BoxDrawingChar.from_char(c) for c in rots] for rots in rotations
 83 |         ]
 84 | 
 85 |     def test_chars_unique(self):
 86 |         chars = set(c.char for c in box.BoxDrawingChar._instances)
 87 |         self.assertEqual(len(box.BoxDrawingChar._instances), len(chars))
 88 | 
 89 |     def test_direction_styles_unique(self):
 90 |         dss = set(frozenset((ds.direction, ds.style.weight, ds.style.count, ds.style.dash, ds.style.path) for ds in c.direction_styles) for c in box.BoxDrawingChar._instances)
 91 |         self.assertEqual(len(box.BoxDrawingChar._instances), len(dss))
 92 | 
 93 |     @classmethod
 94 |     def is_corner(cls, bdc: box.BoxDrawingChar) -> bool:
 95 |         if len(bdc.direction_styles) != 2:
 96 |             return False
 97 | 
 98 |         dirs = tuple(ds.direction for ds in bdc.direction_styles)
 99 |         if dirs[0] == box.Direction.N:
100 |             return dirs[1] in (box.Direction.W, box.Direction.E)
101 |         elif dirs[1] == box.Direction.S:
102 |             return dirs[0] in (box.Direction.W, box.Direction.E)
103 |         else:
104 |             return False
105 | 
106 |     def test_from_corners_single_valid(self):
107 |         for bdc in (box.BoxDrawingChar.from_char('╭'),):  # box.BoxDrawingChar._instances:
108 |             with self.subTest(box_drawing_char=bdc):
109 |                 if self.is_corner(bdc):
110 |                     boxer = box.from_corners(bdc.char)
111 |                     boxer = box.from_corners(bdc)
112 |                 else:
113 |                     with self.assertRaises(ValueError):
114 |                         boxer = box.from_corners(bdc.char)
115 |                     with self.assertRaises(ValueError):
116 |                         boxer = box.from_corners(bdc)
117 | 
118 |     def test_corner_combos(self):
119 |         in_outs = (
120 |             (('╔', '╯'), ('╔', '╕', '╙', '╯')),
121 |             (('╚', '╮'), ('╓', '╮', '╚', '╛'))
122 |         )
123 | 
124 |         for ins, output_corners in in_outs:
125 |             for input_corners in ins, ins[::-1]:
126 |                 with self.subTest(input_corners=input_corners):
127 |                     input_corners = [box.BoxDrawingChar.from_char(c) for c in input_corners]
128 |                     boxer = box.from_corners(*input_corners)
129 |                     lines = list(boxer.from_srcs(' '))
130 |                     self.assertEqual(3, len(lines))
131 |                     self.assertEqual(output_corners[0], lines[0][0])
132 |                     self.assertEqual(output_corners[1], lines[0][-1])
133 |                     self.assertEqual(output_corners[2], lines[-1][0])
134 |                     self.assertEqual(output_corners[3], lines[-1][-1])
135 | 
136 |     def test_rotate(self):
137 |         for rots in self.ninety_degree_rotations:
138 |             for i, bdc in enumerate(rots):
139 |                 with self.subTest(box_drawing_char=bdc):
140 |                     j = (i + 1) % 4
141 |                     self.assertEqual(rots[j], bdc.rotate(90))
142 | 
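
A usage sketch reconstructed from test_corner_combos above; treat the call shapes as inferred from the test's expectations rather than documented API:

import pawpaw.visualization.ascii_box as box

# Build a boxer from two opposite corners; from_corners merges their line
# styles, so the remaining corners are synthesized ('╕' and '╙' here, per the
# expected output_corners in test_corner_combos).
corners = [box.BoxDrawingChar.from_char(c) for c in ('╔', '╯')]
boxer = box.from_corners(*corners)

# Drawing around a one-character source yields exactly three lines of text.
for line in boxer.from_srcs(' '):
    print(line)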


--------------------------------------------------------------------------------