├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── user-story.md └── workflows │ ├── codeql.yml │ ├── python-app.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── tests ├── branch_layer_test.py ├── conftest.py ├── symbol_layer_test.py ├── token_layer_test.py └── xtructure_test.py ├── tox.ini └── xsystem.py /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | **Is your feature request related to a problem? If so, please describe.** 13 | 14 | 15 | 16 | **Describe your proposed solution** 17 | 18 | 19 | 20 | **Describe alternatives you have considered** 21 | 22 | 23 | 24 | **Additional context** 25 | 26 | 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/user-story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: User story 3 | about: Create a user story for this project. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Overview 11 | 12 | 16 | 17 | ### Acceptance Criteria 18 | 19 | 25 | 26 | ### Questions 27 | 28 | 32 | 33 | ### Assumptions 34 | 35 | 40 | 41 | ### Reference 42 | 43 | 48 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | schedule: 9 | - cron: '45 17 * * 5' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'python' ] 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v3 28 | 29 | - name: Initialize CodeQL 30 | uses: github/codeql-action/init@v2 31 | with: 32 | languages: ${{ matrix.language }} 33 | 34 | - name: Autobuild 35 | uses: github/codeql-action/autobuild@v2 36 | 37 | - name: Perform CodeQL Analysis 38 | uses: github/codeql-action/analyze@v2 39 | with: 40 | category: "/language:${{matrix.language}}" 41 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - 
uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dev dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; else pip install flake8 pytest; fi 31 | - name: Install package 32 | run: pip install . 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest 42 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | push: 15 | branches: 16 | - main 17 | pull_request: 18 | branches: 19 | - main 20 | 21 | permissions: 22 | contents: read 23 | 24 | jobs: 25 | deploy: 26 | 27 | runs-on: ubuntu-latest 28 | 29 | steps: 30 | - uses: actions/checkout@v3 31 | - name: Set up Python 32 | uses: actions/setup-python@v3 33 | with: 34 | python-version: '3.x' 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | pip install build 39 | - name: Build package 40 | run: python -m build 41 | - name: Publish package 42 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 43 | with: 44 | user: __token__ 45 | password: ${{ secrets.PYPI_API_TOKEN }} 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dependencies 2 | 3 | # testing 4 | 5 | # production 6 | 7 | # misc 8 | .DS_Store 9 | .env.local 10 | .env.development.local 11 | .env.test.local 12 | .env.production.local 13 | 14 | npm-debug.log* 15 | yarn-debug.log* 16 | yarn-error.log* 17 | 18 | .venv 19 | .vscode 20 | 21 | .env.prod 22 | .env.dev 23 | __pycache__ 24 | *.pyc 25 | *.egg-info 26 | .tox 27 | .coverage 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regex-learner 2 | 3 | This project provides a tool/library implementing an automated regular expression building mechanism. 4 | 5 | This project takes inspiration from the paper by Ilyas, et al [1] 6 | 7 | [Ilyas, Andrew, M. F. da Trindade, Joana, Castro Fernandez, Raul and Madden, Samuel. 2018. "Extracting Syntactical Patterns from Databases."](https://hdl.handle.net/1721.1/137774) 8 | 9 | This repository contains code and examples to assist in the execution of regular expression learning from the columns of data. 10 | 11 | This is a basic readme. It will be completed as the prototype grows. 12 | 13 | # Installation 14 | 15 | The project can be installed via pip: 16 | ```bash 17 | pip install regex-learner 18 | ``` 19 | 20 | # Examples of usage 21 | 22 | Example of learning a date pattern from 100 examples of randomly sampled dates in the format DD-MM-YYYY. 
23 | 24 | ```python 25 | from xsystem import XTructure 26 | from faker import Faker 27 | 28 | fake = Faker() 29 | x = XTructure() # Create basic XTructure class 30 | 31 | for _ in range(100): 32 | d = fake.date(pattern=r"%d-%m-%Y") # Create example of data - date in the format DD-MM-YYYY 33 | x.learn_new_word(d) # Add example to XSystem and learn new features 34 | 35 | print(str(x)) # ([0312][0-9])(-)([01][891652073])(-)([21][09][078912][0-9]) 36 | ``` 37 | 38 | Similarly, the tool can be used directly from the command line using the `regex-learner` CLI provided by the installation of the package. 39 | 40 | The tool has several options, as described by the help message: 41 | 42 | ``` 43 | > regex-learner -h 44 | usage: regex-learner [-h] [-i INPUT] [-o OUTPUT] [--max-branch MAX_BRANCH] [--alpha ALPHA] [--branch-threshold BRANCH_THRESHOLD] 45 | 46 | A simple tool to learn human readable a regular expression from examples 47 | 48 | options: 49 | -h, --help show this help message and exit 50 | -i INPUT, --input INPUT 51 | Path to the input source, defaults to stdin 52 | -o OUTPUT, --output OUTPUT 53 | Path to the output file, defaults to stdout 54 | --max-branch MAX_BRANCH 55 | Maximum number of branches allowed, defaults to 8 56 | --alpha ALPHA Weight for fitting tuples, defaults to 1/5 57 | --branch-threshold BRANCH_THRESHOLD 58 | Branching threshold, defaults to 0.85, relative to the fitting score alpha 59 | ``` 60 | 61 | Assuming a data file containing the examples to learn from is called `EXAMPLE_FILE`, and assuming one is interested in a very simple regular expression, the tool can be used as follows: 62 | 63 | ```bash 64 | cat EXAMPLE_FILE | regex-learner --max-branch 2 65 | ``` 66 | 67 | ## Note 68 | Note that this project is not based on the actual implementation of the paper as presented in [2] 69 | 70 | ## References 71 | 1. Ilyas, Andrew, et al. "Extracting syntactical patterns from databases." 
2018 IEEE 34th International Conference on Data Engineering (ICDE). IEEE, 2018. 72 | 2. https://github.com/mitdbg/XSystem 73 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | covdefaults 2 | coverage 3 | pytest 4 | faker -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = regex-learner 3 | version = 0.0.4 4 | description = The project provides a tool/library implementing an automated regular expression building mechanism. 5 | 6 | author = Stefano Braghin, Liubov Nedoshivina 7 | author_email = "Liubov Nedoshivia" 8 | long_description = long_description 9 | long_description_content_type = text/markdown 10 | url = https://github.com/IBM/regex-learner 11 | license = Apache License 2.0 12 | [options] 13 | py_modules = xsystem 14 | python_requires = >=3.8 15 | 16 | [options.entry_points] 17 | console_scripts = 18 | regex-learner = xsystem:main 19 | 20 | [bdist_wheel] 21 | universal = True 22 | 23 | [mypy] 24 | check_untyped_defs = true 25 | disallow_any_generics = true 26 | disallow_incomplete_defs = true 27 | disallow_untyped_defs = true 28 | warn_redundant_casts = true 29 | warn_unused_ignores = true 30 | 31 | [mypy-tests.*] 32 | disallow_untyped_defs = false 33 | 34 | [flake8] 35 | ignore = E265,E501,W504 36 | 37 | [bandit] 38 | ignore = B101 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from setuptools import setup # type: ignore 4 | from pathlib import Path 5 | this_directory = Path(__file__).parent 6 | long_description = (this_directory / "README.md").read_text() 7 | 8 | 
setup(long_description=long_description) 9 | -------------------------------------------------------------------------------- /tests/branch_layer_test.py: -------------------------------------------------------------------------------- 1 | from xsystem import Branch 2 | from xsystem import Token 3 | from xsystem import Symbol 4 | from xsystem import AsciiClass 5 | 6 | 7 | def test_add(): 8 | # examples: 1234 9 | 10 | branch = Branch( 11 | tokens=[ 12 | Token( 13 | symbols=[ 14 | Symbol.build("1"), 15 | Symbol.build("2"), 16 | Symbol.build("3"), 17 | Symbol.build("4"), 18 | ] 19 | ) 20 | ] 21 | ) 22 | 23 | branch.add( 24 | "2234" 25 | ) 26 | 27 | assert len(branch.tokens) == 1 28 | assert len(branch.tokens[0].symbols) == 4 29 | 30 | for i, symbol in enumerate(branch.tokens[0].symbols): 31 | assert symbol is not None 32 | assert not symbol.is_class 33 | assert symbol.a_class == AsciiClass.DIGIT 34 | 35 | if i != 0: 36 | assert len(symbol.chars) == 1 37 | 38 | assert len(branch.tokens[0].symbols[0].chars) == 2 39 | 40 | 41 | def test_fit_score_simmetric(): 42 | b1 = Branch.build("ABC") 43 | b2 = Branch.build("CDE") 44 | 45 | assert b1.fit(b2) == b2.fit(b1) 46 | 47 | 48 | def test_fit_score_same(): 49 | b1 = Branch.build("ABC") 50 | b1_same = Branch.build("ABC") 51 | 52 | assert b1.fit(b1_same) == 0 53 | 54 | 55 | def test_fit_score_of_similar_is_not_inf(): 56 | b1 = Branch.build("ABC") 57 | b2 = Branch.build("123") 58 | 59 | assert b1.fit(b2) == 3 60 | 61 | b3 = Branch.build("AB1") 62 | 63 | assert b1.fit(b3) == 1 64 | 65 | 66 | def test_merge_similar_length(): 67 | b1 = Branch.build("ABC") 68 | b2 = Branch.build("123") 69 | 70 | b_merged = b1.merge(b2) 71 | 72 | assert b_merged is not None 73 | assert len(b_merged.tokens) == 1 74 | assert len(b_merged.tokens[0].symbols) == 3 75 | 76 | 77 | def test_merge_different_length(): 78 | b1 = Branch.build("AB") 79 | b2 = Branch.build("ABD") 80 | 81 | b_merged = b1.merge(b2) 82 | 83 | assert b_merged is not None 84 | assert 
len(b_merged.tokens) == 1 85 | assert len(b_merged.tokens[0].symbols) == 3 86 | assert not b_merged.tokens[0].symbols[0].is_optional 87 | assert not b_merged.tokens[0].symbols[1].is_optional 88 | assert b_merged.tokens[0].symbols[2].is_optional 89 | 90 | 91 | def test_merge_different_token_numbers(): 92 | b1 = Branch.build("a-b-c") 93 | b2 = Branch.build("a-b") 94 | 95 | b_merged = b1.merge(b2) 96 | 97 | assert b_merged is not None 98 | assert len(b_merged.tokens) == 5 99 | 100 | assert not b_merged.tokens[0].optional 101 | assert not b_merged.tokens[1].optional 102 | assert not b_merged.tokens[2].optional 103 | assert b_merged.tokens[3].optional 104 | assert b_merged.tokens[4].optional 105 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session", autouse=True) 5 | def faker_session_locale(): 6 | return ['it_IT', 'en_US'] 7 | -------------------------------------------------------------------------------- /tests/symbol_layer_test.py: -------------------------------------------------------------------------------- 1 | import math 2 | import string 3 | 4 | from xsystem import AsciiClass 5 | from xsystem import Symbol 6 | 7 | 8 | def test_get_ascii_class(): 9 | for c in string.printable: 10 | ascii_class = AsciiClass.get_ascii_class(c) 11 | 12 | assert ascii_class is not None 13 | assert ascii_class in AsciiClass 14 | 15 | 16 | def test_symbol_creation(): 17 | symbol = Symbol.build("5") 18 | 19 | assert symbol 20 | 21 | assert 0 == symbol.fit_score("5", math.inf) 22 | 23 | 24 | def test_symbols_character_letters(): 25 | for letter in string.ascii_letters: 26 | l_class = AsciiClass.get_ascii_class(letter) 27 | 28 | assert l_class in { 29 | AsciiClass.UPPER, AsciiClass.LOWER, AsciiClass.ALPHA 30 | } 31 | 32 | assert letter in AsciiClass.get_class_characters(l_class) 33 | 34 | 35 | def 
test_symbols_charater_digits(): 36 | for d in string.digits: 37 | d_class = AsciiClass.get_ascii_class(d) 38 | 39 | assert d_class in { 40 | AsciiClass.DIGIT 41 | } 42 | 43 | assert d in AsciiClass.get_class_characters(d_class) 44 | 45 | 46 | def test_symbol_merge_same_class(): 47 | symbol = Symbol( 48 | chars={"a"}, 49 | a_class=AsciiClass.LOWER, 50 | is_class=False 51 | ) 52 | 53 | merged = symbol.merge(Symbol.build("b")) 54 | 55 | assert merged is not None 56 | assert not merged.is_class 57 | assert len(merged.chars) == 2 58 | assert merged.a_class == AsciiClass.LOWER 59 | 60 | 61 | def test_symbol_merge_different_class(): 62 | symbol = Symbol( 63 | chars={"a"}, 64 | a_class=AsciiClass.LOWER, 65 | is_class=False 66 | ) 67 | 68 | merged = symbol.merge(Symbol.build("1")) 69 | 70 | assert merged is not None 71 | assert not merged.is_class 72 | assert len(merged.chars) == 2 73 | assert merged.a_class == AsciiClass.ALNUM 74 | 75 | 76 | def test_symbol_merge_to_class(): 77 | symbol = Symbol( 78 | chars=set([s for s in AsciiClass.get_class_characters(AsciiClass.LOWER) if s != "c"]), 79 | a_class=AsciiClass.LOWER, 80 | is_class=False 81 | ) 82 | 83 | assert len(symbol.chars) == len(AsciiClass.get_class_characters(AsciiClass.LOWER)) - 1 84 | 85 | merged = symbol.merge(Symbol.build("c")) 86 | 87 | assert merged.is_class 88 | assert len(merged.chars) == len(AsciiClass.get_class_characters(AsciiClass.LOWER)) 89 | assert merged.a_class == AsciiClass.LOWER 90 | -------------------------------------------------------------------------------- /tests/token_layer_test.py: -------------------------------------------------------------------------------- 1 | from xsystem import Token 2 | from xsystem import Branch 3 | 4 | 5 | def test_token_fit_score(): 6 | pass 7 | 8 | 9 | def test_tokenization_one_item(): 10 | tokens = list(Branch.get_tokens_in_tuple("abcd")) 11 | 12 | assert tokens is not None 13 | assert len(tokens) == 1 14 | assert tokens[0] == "abcd" 15 | 16 | 17 | def 
test_tokenization_function(): 18 | example = "2023-10-11" 19 | 20 | tokens = list(Branch.get_tokens_in_tuple(example)) 21 | 22 | assert tokens is not None 23 | assert len(tokens) == 5 24 | assert tokens[0] == "2023" 25 | assert tokens[1] == "-" 26 | assert tokens[2] == "10" 27 | assert tokens[3] == "-" 28 | assert tokens[4] == "11" 29 | 30 | 31 | def test_date_tokenization(): 32 | example = "12/10/1998" 33 | 34 | tokens = list(Branch.get_tokens_in_tuple(example)) 35 | 36 | assert len(tokens) == 5 37 | 38 | 39 | def test_token_createion(): 40 | token = Token.build("2023") 41 | 42 | assert token 43 | assert len(token.symbols) == 4 44 | -------------------------------------------------------------------------------- /tests/xtructure_test.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import pytest 3 | from xsystem import XTructure 4 | import pkg_resources # type: ignore 5 | 6 | import re 7 | import random 8 | 9 | 10 | def test_working_example_single_branch(faker): 11 | x = XTructure() 12 | 13 | for _ in range(100): 14 | d = faker.date(pattern=r"%d-%m-%Y") 15 | 16 | x.learn_new_word(d) 17 | 18 | assert x 19 | assert len(x.branches) == 1 20 | assert len(x.branches[0].tokens) == 5 21 | assert len(x.branches[0].tokens[0].symbols) == 2 22 | assert len(x.branches[0].tokens[1].symbols) == 1 23 | assert len(x.branches[0].tokens[2].symbols) == 2 24 | assert len(x.branches[0].tokens[3].symbols) == 1 25 | assert len(x.branches[0].tokens[4].symbols) == 4 26 | 27 | 28 | def test_working_example_multiple_branch(): 29 | x = XTructure() 30 | 31 | x.learn_new_word("2022-12-25") 32 | x.learn_new_word("N/A") 33 | 34 | assert x 35 | assert len(x.branches) == 2 36 | 37 | 38 | def test_learnt_pattern(faker): 39 | dataset = [ 40 | date for date in faker.date(pattern=r"%d-%m-%Y") 41 | ] 42 | 43 | x = XTructure() 44 | 45 | list(map(x.learn_new_word, dataset)) 46 | 47 | assert str(x) 48 | 49 | pattern = re.compile(str(x)) 50 | 51 | 
for date in dataset: 52 | assert pattern.match(date), date 53 | 54 | 55 | def test_ssn(faker): 56 | dataset = [ 57 | faker.ssn() for _ in range(100) 58 | ] 59 | 60 | x = XTructure() 61 | 62 | list(map(x.learn_new_word, dataset)) 63 | 64 | assert str(x) 65 | 66 | pattern = re.compile(str(x)) 67 | 68 | for ssn in dataset: 69 | assert pattern.match(ssn), ssn 70 | 71 | 72 | def test_optional_characters(): 73 | x = XTructure() 74 | 75 | x.learn_new_word("ABCDE") 76 | x.learn_new_word("ABDE") 77 | 78 | assert len(x.branches) == 2 79 | 80 | 81 | @pytest.mark.skip 82 | def test_file_atc(): 83 | x = XTructure() 84 | 85 | with open("common/atc.csv") as input: 86 | for line in input: 87 | line = line.strip() 88 | if len(line): 89 | x.learn_new_word(line) 90 | 91 | s = str(x) 92 | 93 | assert len(s) 94 | 95 | assert x 96 | 97 | 98 | @pytest.mark.skip 99 | def test_realistic_data_account_id(): 100 | with pkg_resources.resource_stream(__name__, "csv files/account.csv") as io_stream: 101 | data = codecs.getreader("utf8")(io_stream).readlines() 102 | 103 | assert len(data) == 2615 104 | 105 | rows = [line.strip().split(",") for line in data] 106 | 107 | lengths = [len(rows) for row in rows] 108 | 109 | assert min(lengths) == max(lengths) 110 | 111 | for i in range(len(rows[0])): 112 | x = XTructure() 113 | 114 | for row in rows: 115 | x.learn_new_word(row[i]) 116 | 117 | print(x) 118 | 119 | assert str(x) 120 | 121 | 122 | def test_branching_issue_minimal(): 123 | x = XTructure(max_branches=3) 124 | 125 | x.learn_new_word("FOOO") 126 | x.learn_new_word("BAR") 127 | x.learn_new_word("FOOO") 128 | 129 | assert len(x.branches) == 2 130 | 131 | 132 | def test_branching_issue_large_dataset(): 133 | values = { 134 | "CASH": 517, 135 | "INVESTMENT": 1168, 136 | "SERVICE": 929, 137 | } 138 | 139 | dataset_size = sum(values.values()) 140 | 141 | counts: dict[str, int] = dict() 142 | 143 | dataset: list[str] = [] 144 | 145 | for _ in range(dataset_size): 146 | c = 
from __future__ import annotations
from argparse import ArgumentParser, Namespace
from itertools import combinations
import math

import sys
import string

from dataclasses import dataclass
from dataclasses import field

from enum import Enum
from enum import auto
import re
from re import Match
from re import Pattern
from typing import Generator
from typing import Optional


class AsciiClass(Enum):
    """POSIX-style ASCII character classes, arranged in a containment
    hierarchy (see get_parent): ANY is the root, LOWER/UPPER/DIGIT/... the
    leaves.
    """

    ALNUM = auto()   # [0-9A-Za-z]
    ALPHA = auto()   # [A-Za-z]
    BLANK = auto()   # space and tab
    CNTRL = auto()   # octal codes 000 through 037, and 177 (DEL)
    DIGIT = auto()   # [0-9]
    GRAPH = auto()   # alnum + punct
    LOWER = auto()   # [a-z]
    PRINT = auto()   # alnum + punct + space
    PUNCT = auto()   # punctuation
    SPACE = auto()   # whitespace (tab, newline, vertical tab, form feed, CR, space)
    UPPER = auto()   # [A-Z]
    XDIGIT = auto()  # [0-9A-Fa-f]
    ANY = auto()     # any character

    @staticmethod
    def get_parent(cls: AsciiClass) -> Optional[AsciiClass]:
        """Return the immediately containing class, or None for ANY (the root)."""
        parents = {
            AsciiClass.ALNUM: AsciiClass.GRAPH,
            AsciiClass.ALPHA: AsciiClass.ALNUM,
            AsciiClass.BLANK: AsciiClass.SPACE,
            AsciiClass.DIGIT: AsciiClass.ALNUM,
            AsciiClass.GRAPH: AsciiClass.PRINT,
            AsciiClass.LOWER: AsciiClass.ALPHA,
            AsciiClass.PRINT: AsciiClass.ANY,
            AsciiClass.PUNCT: AsciiClass.GRAPH,
            AsciiClass.SPACE: AsciiClass.PRINT,
            AsciiClass.UPPER: AsciiClass.ALPHA,
            AsciiClass.CNTRL: AsciiClass.ANY,
            AsciiClass.XDIGIT: AsciiClass.ALNUM,
            AsciiClass.ANY: None,
        }
        try:
            return parents[cls]
        except KeyError:
            raise ValueError(f"Unknown ASCII class {cls}") from None

    @staticmethod
    def get_ascii_class_pattern(cls: AsciiClass) -> str:
        """Return a Python-re-compatible pattern matching one char of the class.

        BUG FIX: the original returned bare POSIX names such as "[:alnum:]",
        which Python's re module does not support (it would be parsed as a
        character class containing ':', 'a', 'l', 'n', 'u', 'm').  Emit
        plain ASCII ranges instead, as was already done for DIGIT.
        """
        patterns = {
            AsciiClass.ALNUM: r"[0-9A-Za-z]",
            AsciiClass.ALPHA: r"[A-Za-z]",
            AsciiClass.BLANK: r"[ \t]",
            AsciiClass.CNTRL: r"[\x00-\x1f\x7f]",
            AsciiClass.DIGIT: r"[0-9]",
            AsciiClass.GRAPH: r"[!-~]",
            AsciiClass.LOWER: r"[a-z]",
            AsciiClass.PRINT: r"[ -~\t\n\r\x0b\x0c]",
            AsciiClass.PUNCT: r"[!-/:-@\[-`{-~]",
            AsciiClass.SPACE: r"[ \t\n\r\x0b\x0c]",
            AsciiClass.UPPER: r"[A-Z]",
            AsciiClass.XDIGIT: r"[0-9A-Fa-f]",
            AsciiClass.ANY: r".",
        }
        try:
            return patterns[cls]
        except KeyError:
            raise ValueError(f"Unsupported ASCII class {cls}") from None

    @staticmethod
    def get_class_characters(symbol_class: AsciiClass) -> set[str]:
        """Return the complete character set belonging to *symbol_class*.

        BUG FIX: composite classes were built with "&" (set intersection),
        which yields the EMPTY set (e.g. ALPHA & DIGIT); composites are
        unions.  CNTRL and ANY, which previously raised, are now supported.
        """
        get = AsciiClass.get_class_characters

        if symbol_class == AsciiClass.ALNUM:
            return get(AsciiClass.ALPHA) | get(AsciiClass.DIGIT)

        if symbol_class == AsciiClass.ALPHA:
            return get(AsciiClass.UPPER) | get(AsciiClass.LOWER)

        if symbol_class == AsciiClass.BLANK:
            return {" ", "\t"}

        if symbol_class == AsciiClass.CNTRL:
            # Octal codes 000 through 037, plus 177 (DEL).
            return {chr(i) for i in range(0o40)} | {chr(0o177)}

        if symbol_class == AsciiClass.DIGIT:
            return set(string.digits)

        if symbol_class == AsciiClass.GRAPH:
            return get(AsciiClass.ALNUM) | get(AsciiClass.PUNCT)

        if symbol_class == AsciiClass.LOWER:
            return set(string.ascii_lowercase)

        if symbol_class == AsciiClass.PRINT:
            return get(AsciiClass.ALNUM) | get(AsciiClass.PUNCT) | get(AsciiClass.SPACE)

        if symbol_class == AsciiClass.PUNCT:
            return set(string.punctuation)

        if symbol_class == AsciiClass.UPPER:
            return set(string.ascii_uppercase)

        if symbol_class == AsciiClass.XDIGIT:
            return set(string.hexdigits)

        if symbol_class == AsciiClass.SPACE:
            return set(string.whitespace)

        if symbol_class == AsciiClass.ANY:
            return get(AsciiClass.PRINT) | get(AsciiClass.CNTRL)

        raise ValueError(f"Unknown ASCII class {symbol_class}")

    @staticmethod
    def get_ascii_class(s: str) -> AsciiClass:
        """Classify a single character into its most specific class."""
        if len(s) != 1:
            # BUG FIX: the original only rejected len > 1 and would
            # mis-classify the empty string as PUNCT via isprintable().
            raise ValueError("Expected single character")

        if s.isdigit():
            return AsciiClass.DIGIT

        if s.isalpha():
            if s.islower():
                return AsciiClass.LOWER
            if s.isupper():
                return AsciiClass.UPPER
            return AsciiClass.ALPHA  # cased-neutral alphabetic characters

        if s.isspace():
            return AsciiClass.SPACE

        if s.isprintable():
            return AsciiClass.PUNCT

        raise ValueError(f"{s} unknown")

    @staticmethod
    def find_common_ancestor(class1: AsciiClass, class2: AsciiClass) -> AsciiClass:
        """Return the most specific class containing both arguments."""
        ancestors: set[AsciiClass] = set()
        node: Optional[AsciiClass] = class1
        while node is not None:
            ancestors.add(node)
            node = AsciiClass.get_parent(node)

        node = class2
        while node is not None:
            if node in ancestors:
                return node
            node = AsciiClass.get_parent(node)

        # Unreachable: ANY is in every ancestor chain.
        return AsciiClass.ANY


@dataclass
class Symbol:
    """One position in a token: an ASCII class plus the concrete characters
    observed at that position so far."""

    a_class: AsciiClass
    chars: set[str]
    is_class: bool  # True once chars saturates the whole class
    is_optional: bool = False

    def fit_score(self, s: str, alpha: float) -> float:
        """0 for a class match, alpha for a character-only match, 1 otherwise."""
        if AsciiClass.get_ascii_class(s) == self.a_class:
            return 0
        if not self.is_class and s in self.chars:
            return alpha
        return 1

    def __str__(self) -> str:
        suffix = "?" if self.is_optional else ""
        if self.is_class:
            # BUG FIX: the optional marker was dropped for class symbols.
            return AsciiClass.get_ascii_class_pattern(self.a_class) + suffix
        if len(self.chars) == 1:
            return self._sanitize(next(iter(self.chars))) + suffix
        return "[" + "".join(Symbol._sanitize(c) for c in self.chars) + "]" + suffix

    def fit(self, other: Symbol) -> float:
        """Distance between two symbols: 0 when one class contains the other,
        otherwise based on the overlap of observed characters."""
        if self.a_class == other.a_class:
            return 0

        ancestor = AsciiClass.find_common_ancestor(self.a_class, other.a_class)
        if ancestor in (self.a_class, other.a_class):
            return 0

        common_chars = len(self.chars & other.chars)
        if common_chars:
            return 1 - common_chars / len(self.chars)
        return 1

    @staticmethod
    def _sanitize(c: str) -> str:
        """Escape *c* so it is safe both alone and inside a "[...]" class.

        BUG FIX: "]", "}" and "-" were not escaped; an unescaped "-" inside
        "[...]" creates an accidental character range, and "]" closes the
        class early.
        """
        if c in ".^$*+?()[]{}\\|-":
            return f"\\{c}"
        return c

    def merge(self, other: Symbol) -> Symbol:
        """Combine two symbols: generalise the class, union the characters."""
        if other.a_class != self.a_class:
            na_class = AsciiClass.find_common_ancestor(other.a_class, self.a_class)
        else:
            na_class = self.a_class

        chars = self.chars | other.chars

        return Symbol(
            na_class,
            chars=chars,
            is_class=len(chars) == len(AsciiClass.get_class_characters(na_class)),
            # BUG FIX: optionality was silently lost on merge.
            is_optional=self.is_optional or other.is_optional,
        )

    @staticmethod
    def build(symbol: str) -> Symbol:
        """Create a Symbol from a single observed character."""
        symbol_class = AsciiClass.get_ascii_class(symbol)
        return Symbol(
            a_class=symbol_class,
            is_class=False,
            chars=set(symbol),
        )
@dataclass
class Token:
    """A run of symbols between delimiters; one Symbol per character position."""

    symbols: list[Symbol] = field(default_factory=list)
    optional: bool = False

    def fit_score(self, t: str, alpha: float) -> float:
        """Positional symbol scores plus a penalty for any length mismatch."""
        pairwise = zip(self.symbols, Token.get_symbols_in_token(t))
        positional = sum(sym.fit_score(ch, alpha) for sym, ch in pairwise)
        return positional + abs(len(t) - len(self.symbols))

    def merge(self, other: Token) -> Token:
        """Merge two tokens position-wise; the longer tail becomes optional."""
        merged = [
            mine.merge(theirs)
            for mine, theirs in zip(self.symbols, other.symbols)
        ]

        either_optional = self.optional or other.optional

        if len(self.symbols) == len(other.symbols):
            return Token(symbols=merged, optional=either_optional)

        # One token is longer: its surplus symbols are kept, marked optional.
        longer = self.symbols if len(self.symbols) > len(other.symbols) else other.symbols
        tail = [
            Symbol(s.a_class, s.chars, s.is_class, True)
            for s in longer[len(merged):]
        ]

        return Token(symbols=merged + tail, optional=either_optional)

    def fit(self, other: Token) -> float:
        """Distance between two tokens: symbol distances + length penalty."""
        total = 0
        for mine, theirs in zip(self.symbols, other.symbols):
            total += mine.fit(theirs)
        return total + abs(len(self.symbols) - len(other.symbols))

    @staticmethod
    def get_symbols_in_token(t: str) -> Generator[str, None, None]:
        """Yield the characters of *t* one at a time."""
        yield from t

    def __str__(self) -> str:
        body = "".join(map(str, self.symbols))
        suffix = "?" if self.optional else ""
        return f"({body}){suffix}"

    @staticmethod
    def build(word: str) -> Token:
        """Create a Token with one freshly-built Symbol per character."""
        return Token([Symbol.build(ch) for ch in word])


class NullToken(Token):
    """Sentinel token with no symbols; scores purely on length."""

    def d(self, t: str) -> float:
        return 1.0 * len(t)
@dataclass
class Branch:
    """One alternative of the learnt pattern: an ordered list of tokens."""

    tokens: list[Token] = field(default_factory=list)

    def fit_score(self, t: str, alpha: float) -> float:
        """How badly *t* fits this branch (0 == perfect fit)."""
        parts = list(Branch.get_tokens_in_tuple(t))

        # Pad with NullToken so every part of the word is scored.
        # BUG FIX: the original padded to len(t) *characters* and let zip
        # truncate, which dropped the last part of words that split into
        # more parts than characters (e.g. a leading delimiter).
        padded = [
            self.tokens[i] if i < len(self.tokens) else NullToken()
            for i in range(len(parts))
        ]

        return sum(token.fit_score(part, alpha) for token, part in zip(padded, parts))

    def add(self, word: str) -> None:
        """Fold *word* into this branch.

        BUG FIX: the original zipped against the new word's tokens, which
        silently truncated the branch (or ignored part of the word)
        whenever the token counts differed.  Reuse merge(), which pads the
        shorter side with optional tokens instead.
        """
        self.tokens = self.merge(Branch.build(word)).tokens

    def __str__(self) -> str:
        return "".join(str(token) for token in self.tokens)

    def fit(self, other: Branch) -> float:
        """Distance between two branches: token distances + length penalty."""
        return sum(
            token.fit(other_token)
            for token, other_token in zip(self.tokens, other.tokens)
        ) + abs(len(self.tokens) - len(other.tokens))

    def merge(self, other: Branch) -> Branch:
        """Merge two branches token-wise; the longer tail becomes optional."""
        tokens = [
            token.merge(other_token)
            for token, other_token in zip(self.tokens, other.tokens)
        ]

        if len(self.tokens) == len(other.tokens):
            return Branch(tokens)

        longer = self.tokens if len(self.tokens) > len(other.tokens) else other.tokens
        missing = [Token(token.symbols, True) for token in longer[len(tokens):]]

        assert len(tokens) + len(missing) == len(longer)

        return Branch(tokens + missing)

    @staticmethod
    def get_tokens_in_tuple(t: str, delimiters: str = r"[-_/\\#., ]") -> Generator[str, None, None]:
        """Split *t* into alternating field / delimiter parts.

        Each delimiter is yielded as its own token; the (possibly empty)
        fields between delimiters are yielded as-is.
        """
        pattern: Pattern[str] = re.compile(delimiters)

        last_match: Optional[Match[str]] = None

        for m in re.finditer(pattern, t):
            if last_match is None:
                yield t[:m.start()]
            else:
                yield t[last_match.end():m.start()]

            yield t[m.start():m.end()]

            last_match = m

        if last_match is None:
            yield t
        else:
            yield t[last_match.end():]

    @staticmethod
    def build(word: str) -> Branch:
        """Create a Branch with one freshly-built Token per part of *word*."""
        return Branch(
            tokens=[
                Token.build(token) for token in Branch.get_tokens_in_tuple(word)
            ]
        )

    def __repr__(self) -> str:
        # BUG FIX: the closing bracket was missing.
        return f"Branch[{str(self)}]"


@dataclass
class XTructure:
    """Incremental pattern learner holding up to max_branches alternatives."""

    alpha: float = 1 / 5  # weight of a character-only symbol match
    max_branches: int = 8  # merge the two closest branches beyond this count
    branching_threshold: float = 0.85  # scores >= this open a new branch

    branches: list[Branch] = field(default_factory=list)

    def fit_score(self, t: str) -> float:
        """Best (lowest) fit score of *t* across all branches.

        Raises ValueError if no word has been learnt yet (min of empty).
        """
        return min(b.fit_score(t, self.alpha) for b in self.branches)

    def learn_new_word(self, word: str) -> bool:
        """Fold *word* into the model; returns False for the empty word."""
        if not word:
            return False

        if not self.branches:
            self.branches.append(Branch.build(word))
        else:
            best_branch, score = self._best_branch(word)

            if score < self.branching_threshold:
                best_branch.add(word)
            else:
                self.branches.append(Branch.build(word))

        if len(self.branches) > self.max_branches:
            self.branches = self.merge_most_similar()

        return True

    def _best_branch(self, word: str) -> tuple[Branch, float]:
        """Return the branch with the lowest fit score for *word*."""
        assert len(self.branches)

        best_score = math.inf
        best_branch: Optional[Branch] = None

        for branch in self.branches:
            branch_score = branch.fit_score(word, self.alpha)

            if branch_score < best_score:
                best_branch = branch
                best_score = branch_score

        assert best_branch is not None
        assert best_score != math.inf

        return best_branch, best_score

    def merge_most_similar(self) -> list[Branch]:
        """Replace the two closest branches with their merge; return branches."""
        # min() keeps the first pair on ties, matching the original's
        # strict "<" scan over combinations.
        m_bi, m_bj = min(
            combinations(self.branches, 2),
            key=lambda pair: pair[0].fit(pair[1]),
        )

        self.branches.remove(m_bi)
        self.branches.remove(m_bj)

        self.branches.append(m_bi.merge(m_bj))

        return self.branches

    def __str__(self) -> str:
        """Render the learnt model as a regex: branches joined by '|'."""
        return "|".join(str(branch) for branch in self.branches)
help="Maximum number of branches allowed, defaults to 8") 471 | parser.add_argument("--alpha", type=float, default=1 / 5, help="Weight for fitting tuples, defaults to 1/5") 472 | parser.add_argument("--branch-threshold", type=float, default=.85, help="Branching threshold, defaults to 0.85, relative to the fitting score alpha") 473 | 474 | return parser.parse_args() 475 | 476 | 477 | def main() -> int: 478 | cmd = parse_arguments() 479 | 480 | x = XTructure( 481 | cmd.alpha, 482 | cmd.max_branch, 483 | cmd.branch_threshold 484 | ) 485 | 486 | data_source = open(cmd.input) if cmd.input else sys.stdin 487 | 488 | for line in data_source: 489 | x.learn_new_word(line.strip()) 490 | 491 | output = open(cmd.output) if cmd.output else sys.stdout 492 | 493 | print(str(x), file=output) 494 | 495 | return 0 496 | 497 | 498 | if __name__ == "__main__": 499 | raise SystemExit(main()) 500 | --------------------------------------------------------------------------------