├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── user-story.md └── workflows │ ├── codeql.yml │ ├── python-app.yml │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── requirements-dev.txt ├── setup.cfg ├── setup.py ├── tests ├── branch_layer_test.py ├── conftest.py ├── symbol_layer_test.py ├── token_layer_test.py └── xtructure_test.py ├── tox.ini └── xsystem.py /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | **Is your feature request related to a problem? If so, please describe.** 13 | 14 | 15 | 16 | **Describe your proposed solution** 17 | 18 | 19 | 20 | **Describe alternatives you have considered** 21 | 22 | 23 | 24 | **Additional context** 25 | 26 | 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/user-story.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: User story 3 | about: Create a user story for this project. 
4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Overview 11 | 12 | 16 | 17 | ### Acceptance Criteria 18 | 19 | 25 | 26 | ### Questions 27 | 28 | 32 | 33 | ### Assumptions 34 | 35 | 40 | 41 | ### Reference 42 | 43 | 48 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | schedule: 9 | - cron: '45 17 * * 5' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'python' ] 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v3 28 | 29 | - name: Initialize CodeQL 30 | uses: github/codeql-action/init@v2 31 | with: 32 | languages: ${{ matrix.language }} 33 | 34 | - name: Autobuild 35 | uses: github/codeql-action/autobuild@v2 36 | 37 | - name: Perform CodeQL Analysis 38 | uses: github/codeql-action/analyze@v2 39 | with: 40 | category: "/language:${{matrix.language}}" 41 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | pull_request: 10 | branches: [ "main" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - 
uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dev dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements-dev.txt ]; then pip install -r requirements-dev.txt; else pip install flake8 pytest; fi 31 | - name: Install package 32 | run: pip install . 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest 42 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | push: 15 | branches: 16 | - main 17 | pull_request: 18 | branches: 19 | - main 20 | 21 | permissions: 22 | contents: read 23 | 24 | jobs: 25 | deploy: 26 | 27 | runs-on: ubuntu-latest 28 | 29 | steps: 30 | - uses: actions/checkout@v3 31 | - name: Set up Python 32 | uses: actions/setup-python@v3 33 | with: 34 | python-version: '3.x' 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | pip install build 39 | - name: Build package 40 | run: python -m build 41 | - name: Publish package 42 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 43 | with: 44 | user: __token__ 45 | password: ${{ secrets.PYPI_API_TOKEN }} 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # dependencies 2 | 3 | # testing 4 | 5 | # production 6 | 7 | # misc 8 | .DS_Store 9 | .env.local 10 | .env.development.local 11 | .env.test.local 12 | .env.production.local 13 | 14 | npm-debug.log* 15 | yarn-debug.log* 16 | yarn-error.log* 17 | 18 | .venv 19 | .vscode 20 | 21 | .env.prod 22 | .env.dev 23 | __pycache__ 24 | *.pyc 25 | *.egg-info 26 | .tox 27 | .coverage 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Regex-learner 2 | 3 | This project provides a tool/library implementing an automated regular expression building mechanism. 4 | 5 | This project takes inspiration from the paper by Ilyas, et al [1] 6 | 7 | [Ilyas, Andrew, M. F. da Trindade, Joana, Castro Fernandez, Raul and Madden, Samuel. 2018. "Extracting Syntactical Patterns from Databases."](https://hdl.handle.net/1721.1/137774) 8 | 9 | This repository contains code and examples to assist in the execution of regular expression learning from the columns of data. 10 | 11 | This is a basic readme. It will be completed as the prototype grows. 12 | 13 | # Installation 14 | 15 | The project can be installed via pip: 16 | ```bash 17 | pip install regex-learner 18 | ``` 19 | 20 | # Examples of usage 21 | 22 | Example of learning a date pattern from 100 examples of randomly sampled dates in the format DD-MM-YYYY. 
23 | 24 | ```python 25 | from xsystem import XTructure 26 | from faker import Faker 27 | 28 | fake = Faker() 29 | x = XTructure() # Create basic XTructure class 30 | 31 | for _ in range(100): 32 | d = fake.date(pattern=r"%d-%m-%Y") # Create example of data - date in the format DD-MM-YYYY 33 | x.learn_new_word(d) # Add example to XSystem and learn new features 34 | 35 | print(str(x)) # ([0312][0-9])(-)([01][891652073])(-)([21][09][078912][0-9]) 36 | ``` 37 | 38 | Similarly, the tool can be used directly from the command line using the `regex-learner` CLI provided by the installation of the package. 39 | 40 | The tool has several options, as described by the help message: 41 | 42 | ``` 43 | > regex-learner -h 44 | usage: regex-learner [-h] [-i INPUT] [-o OUTPUT] [--max-branch MAX_BRANCH] [--alpha ALPHA] [--branch-threshold BRANCH_THRESHOLD] 45 | 46 | A simple tool to learn human readable a regular expression from examples 47 | 48 | options: 49 | -h, --help show this help message and exit 50 | -i INPUT, --input INPUT 51 | Path to the input source, defaults to stdin 52 | -o OUTPUT, --output OUTPUT 53 | Path to the output file, defaults to stdout 54 | --max-branch MAX_BRANCH 55 | Maximum number of branches allowed, defaults to 8 56 | --alpha ALPHA Weight for fitting tuples, defaults to 1/5 57 | --branch-threshold BRANCH_THRESHOLD 58 | Branching threshold, defaults to 0.85, relative to the fitting score alpha 59 | ``` 60 | 61 | Assuming a data file containing the examples to learn from is called `EXAMPLE_FILE`, and assuming one is interested in a very simple regular expression, the tool can be used as follows: 62 | 63 | ```bash 64 | cat EXAMPLE_FILE | regex-learner --max-branch 2 65 | ``` 66 | 67 | ## Note 68 | Note that this project is not based on the actual implementation of the paper as presented in [2] 69 | 70 | ## References 71 | 1. Ilyas, Andrew, et al. "Extracting syntactical patterns from databases." 
2018 IEEE 34th International Conference on Data Engineering (ICDE). IEEE, 2018. 72 | 2. https://github.com/mitdbg/XSystem 73 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | covdefaults 2 | coverage 3 | pytest 4 | faker -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = regex-learner 3 | version = 0.0.4 4 | description = The project provides a tool/library implementing an automated regular expression building mechanism. 5 | 6 | author = Stefano Braghin, Liubov Nedoshivina 7 | author_email = "Liubov Nedoshivia" 8 | long_description = long_description 9 | long_description_content_type = text/markdown 10 | url = https://github.com/IBM/regex-learner 11 | license = Apache License 2.0 12 | [options] 13 | py_modules = xsystem 14 | python_requires = >=3.8 15 | 16 | [options.entry_points] 17 | console_scripts = 18 | regex-learner = xsystem:main 19 | 20 | [bdist_wheel] 21 | universal = True 22 | 23 | [mypy] 24 | check_untyped_defs = true 25 | disallow_any_generics = true 26 | disallow_incomplete_defs = true 27 | disallow_untyped_defs = true 28 | warn_redundant_casts = true 29 | warn_unused_ignores = true 30 | 31 | [mypy-tests.*] 32 | disallow_untyped_defs = false 33 | 34 | [flake8] 35 | ignore = E265,E501,W504 36 | 37 | [bandit] 38 | ignore = B101 39 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from setuptools import setup # type: ignore 4 | from pathlib import Path 5 | this_directory = Path(__file__).parent 6 | long_description = (this_directory / "README.md").read_text() 7 | 8 | 
setup(long_description=long_description) 9 | -------------------------------------------------------------------------------- /tests/branch_layer_test.py: -------------------------------------------------------------------------------- 1 | from xsystem import Branch 2 | from xsystem import Token 3 | from xsystem import Symbol 4 | from xsystem import AsciiClass 5 | 6 | 7 | def test_add(): 8 | # examples: 1234 9 | 10 | branch = Branch( 11 | tokens=[ 12 | Token( 13 | symbols=[ 14 | Symbol.build("1"), 15 | Symbol.build("2"), 16 | Symbol.build("3"), 17 | Symbol.build("4"), 18 | ] 19 | ) 20 | ] 21 | ) 22 | 23 | branch.add( 24 | "2234" 25 | ) 26 | 27 | assert len(branch.tokens) == 1 28 | assert len(branch.tokens[0].symbols) == 4 29 | 30 | for i, symbol in enumerate(branch.tokens[0].symbols): 31 | assert symbol is not None 32 | assert not symbol.is_class 33 | assert symbol.a_class == AsciiClass.DIGIT 34 | 35 | if i != 0: 36 | assert len(symbol.chars) == 1 37 | 38 | assert len(branch.tokens[0].symbols[0].chars) == 2 39 | 40 | 41 | def test_fit_score_simmetric(): 42 | b1 = Branch.build("ABC") 43 | b2 = Branch.build("CDE") 44 | 45 | assert b1.fit(b2) == b2.fit(b1) 46 | 47 | 48 | def test_fit_score_same(): 49 | b1 = Branch.build("ABC") 50 | b1_same = Branch.build("ABC") 51 | 52 | assert b1.fit(b1_same) == 0 53 | 54 | 55 | def test_fit_score_of_similar_is_not_inf(): 56 | b1 = Branch.build("ABC") 57 | b2 = Branch.build("123") 58 | 59 | assert b1.fit(b2) == 3 60 | 61 | b3 = Branch.build("AB1") 62 | 63 | assert b1.fit(b3) == 1 64 | 65 | 66 | def test_merge_similar_length(): 67 | b1 = Branch.build("ABC") 68 | b2 = Branch.build("123") 69 | 70 | b_merged = b1.merge(b2) 71 | 72 | assert b_merged is not None 73 | assert len(b_merged.tokens) == 1 74 | assert len(b_merged.tokens[0].symbols) == 3 75 | 76 | 77 | def test_merge_different_length(): 78 | b1 = Branch.build("AB") 79 | b2 = Branch.build("ABD") 80 | 81 | b_merged = b1.merge(b2) 82 | 83 | assert b_merged is not None 84 | assert 
len(b_merged.tokens) == 1 85 | assert len(b_merged.tokens[0].symbols) == 3 86 | assert not b_merged.tokens[0].symbols[0].is_optional 87 | assert not b_merged.tokens[0].symbols[1].is_optional 88 | assert b_merged.tokens[0].symbols[2].is_optional 89 | 90 | 91 | def test_merge_different_token_numbers(): 92 | b1 = Branch.build("a-b-c") 93 | b2 = Branch.build("a-b") 94 | 95 | b_merged = b1.merge(b2) 96 | 97 | assert b_merged is not None 98 | assert len(b_merged.tokens) == 5 99 | 100 | assert not b_merged.tokens[0].optional 101 | assert not b_merged.tokens[1].optional 102 | assert not b_merged.tokens[2].optional 103 | assert b_merged.tokens[3].optional 104 | assert b_merged.tokens[4].optional 105 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture(scope="session", autouse=True) 5 | def faker_session_locale(): 6 | return ['it_IT', 'en_US'] 7 | -------------------------------------------------------------------------------- /tests/symbol_layer_test.py: -------------------------------------------------------------------------------- 1 | import math 2 | import string 3 | 4 | from xsystem import AsciiClass 5 | from xsystem import Symbol 6 | 7 | 8 | def test_get_ascii_class(): 9 | for c in string.printable: 10 | ascii_class = AsciiClass.get_ascii_class(c) 11 | 12 | assert ascii_class is not None 13 | assert ascii_class in AsciiClass 14 | 15 | 16 | def test_symbol_creation(): 17 | symbol = Symbol.build("5") 18 | 19 | assert symbol 20 | 21 | assert 0 == symbol.fit_score("5", math.inf) 22 | 23 | 24 | def test_symbols_character_letters(): 25 | for letter in string.ascii_letters: 26 | l_class = AsciiClass.get_ascii_class(letter) 27 | 28 | assert l_class in { 29 | AsciiClass.UPPER, AsciiClass.LOWER, AsciiClass.ALPHA 30 | } 31 | 32 | assert letter in AsciiClass.get_class_characters(l_class) 33 | 34 | 35 | def 
test_symbols_charater_digits(): 36 | for d in string.digits: 37 | d_class = AsciiClass.get_ascii_class(d) 38 | 39 | assert d_class in { 40 | AsciiClass.DIGIT 41 | } 42 | 43 | assert d in AsciiClass.get_class_characters(d_class) 44 | 45 | 46 | def test_symbol_merge_same_class(): 47 | symbol = Symbol( 48 | chars={"a"}, 49 | a_class=AsciiClass.LOWER, 50 | is_class=False 51 | ) 52 | 53 | merged = symbol.merge(Symbol.build("b")) 54 | 55 | assert merged is not None 56 | assert not merged.is_class 57 | assert len(merged.chars) == 2 58 | assert merged.a_class == AsciiClass.LOWER 59 | 60 | 61 | def test_symbol_merge_different_class(): 62 | symbol = Symbol( 63 | chars={"a"}, 64 | a_class=AsciiClass.LOWER, 65 | is_class=False 66 | ) 67 | 68 | merged = symbol.merge(Symbol.build("1")) 69 | 70 | assert merged is not None 71 | assert not merged.is_class 72 | assert len(merged.chars) == 2 73 | assert merged.a_class == AsciiClass.ALNUM 74 | 75 | 76 | def test_symbol_merge_to_class(): 77 | symbol = Symbol( 78 | chars=set([s for s in AsciiClass.get_class_characters(AsciiClass.LOWER) if s != "c"]), 79 | a_class=AsciiClass.LOWER, 80 | is_class=False 81 | ) 82 | 83 | assert len(symbol.chars) == len(AsciiClass.get_class_characters(AsciiClass.LOWER)) - 1 84 | 85 | merged = symbol.merge(Symbol.build("c")) 86 | 87 | assert merged.is_class 88 | assert len(merged.chars) == len(AsciiClass.get_class_characters(AsciiClass.LOWER)) 89 | assert merged.a_class == AsciiClass.LOWER 90 | -------------------------------------------------------------------------------- /tests/token_layer_test.py: -------------------------------------------------------------------------------- 1 | from xsystem import Token 2 | from xsystem import Branch 3 | 4 | 5 | def test_token_fit_score(): 6 | pass 7 | 8 | 9 | def test_tokenization_one_item(): 10 | tokens = list(Branch.get_tokens_in_tuple("abcd")) 11 | 12 | assert tokens is not None 13 | assert len(tokens) == 1 14 | assert tokens[0] == "abcd" 15 | 16 | 17 | def 
test_tokenization_function(): 18 | example = "2023-10-11" 19 | 20 | tokens = list(Branch.get_tokens_in_tuple(example)) 21 | 22 | assert tokens is not None 23 | assert len(tokens) == 5 24 | assert tokens[0] == "2023" 25 | assert tokens[1] == "-" 26 | assert tokens[2] == "10" 27 | assert tokens[3] == "-" 28 | assert tokens[4] == "11" 29 | 30 | 31 | def test_date_tokenization(): 32 | example = "12/10/1998" 33 | 34 | tokens = list(Branch.get_tokens_in_tuple(example)) 35 | 36 | assert len(tokens) == 5 37 | 38 | 39 | def test_token_createion(): 40 | token = Token.build("2023") 41 | 42 | assert token 43 | assert len(token.symbols) == 4 44 | -------------------------------------------------------------------------------- /tests/xtructure_test.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import pytest 3 | from xsystem import XTructure 4 | import pkg_resources # type: ignore 5 | 6 | import re 7 | import random 8 | 9 | 10 | def test_working_example_single_branch(faker): 11 | x = XTructure() 12 | 13 | for _ in range(100): 14 | d = faker.date(pattern=r"%d-%m-%Y") 15 | 16 | x.learn_new_word(d) 17 | 18 | assert x 19 | assert len(x.branches) == 1 20 | assert len(x.branches[0].tokens) == 5 21 | assert len(x.branches[0].tokens[0].symbols) == 2 22 | assert len(x.branches[0].tokens[1].symbols) == 1 23 | assert len(x.branches[0].tokens[2].symbols) == 2 24 | assert len(x.branches[0].tokens[3].symbols) == 1 25 | assert len(x.branches[0].tokens[4].symbols) == 4 26 | 27 | 28 | def test_working_example_multiple_branch(): 29 | x = XTructure() 30 | 31 | x.learn_new_word("2022-12-25") 32 | x.learn_new_word("N/A") 33 | 34 | assert x 35 | assert len(x.branches) == 2 36 | 37 | 38 | def test_learnt_pattern(faker): 39 | dataset = [ 40 | date for date in faker.date(pattern=r"%d-%m-%Y") 41 | ] 42 | 43 | x = XTructure() 44 | 45 | list(map(x.learn_new_word, dataset)) 46 | 47 | assert str(x) 48 | 49 | pattern = re.compile(str(x)) 50 | 51 | 
for date in dataset: 52 | assert pattern.match(date), date 53 | 54 | 55 | def test_ssn(faker): 56 | dataset = [ 57 | faker.ssn() for _ in range(100) 58 | ] 59 | 60 | x = XTructure() 61 | 62 | list(map(x.learn_new_word, dataset)) 63 | 64 | assert str(x) 65 | 66 | pattern = re.compile(str(x)) 67 | 68 | for ssn in dataset: 69 | assert pattern.match(ssn), ssn 70 | 71 | 72 | def test_optional_characters(): 73 | x = XTructure() 74 | 75 | x.learn_new_word("ABCDE") 76 | x.learn_new_word("ABDE") 77 | 78 | assert len(x.branches) == 2 79 | 80 | 81 | @pytest.mark.skip 82 | def test_file_atc(): 83 | x = XTructure() 84 | 85 | with open("common/atc.csv") as input: 86 | for line in input: 87 | line = line.strip() 88 | if len(line): 89 | x.learn_new_word(line) 90 | 91 | s = str(x) 92 | 93 | assert len(s) 94 | 95 | assert x 96 | 97 | 98 | @pytest.mark.skip 99 | def test_realistic_data_account_id(): 100 | with pkg_resources.resource_stream(__name__, "csv files/account.csv") as io_stream: 101 | data = codecs.getreader("utf8")(io_stream).readlines() 102 | 103 | assert len(data) == 2615 104 | 105 | rows = [line.strip().split(",") for line in data] 106 | 107 | lengths = [len(rows) for row in rows] 108 | 109 | assert min(lengths) == max(lengths) 110 | 111 | for i in range(len(rows[0])): 112 | x = XTructure() 113 | 114 | for row in rows: 115 | x.learn_new_word(row[i]) 116 | 117 | print(x) 118 | 119 | assert str(x) 120 | 121 | 122 | def test_branching_issue_minimal(): 123 | x = XTructure(max_branches=3) 124 | 125 | x.learn_new_word("FOOO") 126 | x.learn_new_word("BAR") 127 | x.learn_new_word("FOOO") 128 | 129 | assert len(x.branches) == 2 130 | 131 | 132 | def test_branching_issue_large_dataset(): 133 | values = { 134 | "CASH": 517, 135 | "INVESTMENT": 1168, 136 | "SERVICE": 929, 137 | } 138 | 139 | dataset_size = sum(values.values()) 140 | 141 | counts: dict[str, int] = dict() 142 | 143 | dataset: list[str] = [] 144 | 145 | for _ in range(dataset_size): 146 | c = 
from __future__ import annotations
from argparse import ArgumentParser, Namespace
from itertools import combinations
import math

import sys
import string

from dataclasses import dataclass
from dataclasses import field

from enum import Enum
from enum import auto
import re
from re import Match
from re import Pattern
from typing import Generator
from typing import Optional


class AsciiClass(Enum):
    """POSIX-style ASCII character classes, arranged in a containment
    hierarchy (see get_parent): ANY is the root, LOWER/UPPER/DIGIT/... the
    leaves.
    """

    ALNUM = auto()   # [0-9A-Za-z]
    ALPHA = auto()   # [A-Za-z]
    BLANK = auto()   # space and tab
    CNTRL = auto()   # octal codes 000 through 037, and 177 (DEL)
    DIGIT = auto()   # [0-9]
    GRAPH = auto()   # alnum + punct
    LOWER = auto()   # [a-z]
    PRINT = auto()   # alnum + punct + space
    PUNCT = auto()   # punctuation
    SPACE = auto()   # whitespace (tab, newline, vertical tab, form feed, CR, space)
    UPPER = auto()   # [A-Z]
    XDIGIT = auto()  # [0-9A-Fa-f]
    ANY = auto()     # any character

    @staticmethod
    def get_parent(cls: AsciiClass) -> Optional[AsciiClass]:
        """Return the immediately containing class, or None for ANY (the root)."""
        parents = {
            AsciiClass.ALNUM: AsciiClass.GRAPH,
            AsciiClass.ALPHA: AsciiClass.ALNUM,
            AsciiClass.BLANK: AsciiClass.SPACE,
            AsciiClass.DIGIT: AsciiClass.ALNUM,
            AsciiClass.GRAPH: AsciiClass.PRINT,
            AsciiClass.LOWER: AsciiClass.ALPHA,
            AsciiClass.PRINT: AsciiClass.ANY,
            AsciiClass.PUNCT: AsciiClass.GRAPH,
            AsciiClass.SPACE: AsciiClass.PRINT,
            AsciiClass.UPPER: AsciiClass.ALPHA,
            AsciiClass.CNTRL: AsciiClass.ANY,
            AsciiClass.XDIGIT: AsciiClass.ALNUM,
            AsciiClass.ANY: None,
        }
        try:
            return parents[cls]
        except KeyError:
            raise ValueError(f"Unknown ASCII class {cls}") from None

    @staticmethod
    def get_ascii_class_pattern(cls: AsciiClass) -> str:
        """Return a Python-re-compatible pattern matching one char of the class.

        BUG FIX: the original returned bare POSIX names such as "[:alnum:]",
        which Python's re module does not support (it would be parsed as a
        character class containing ':', 'a', 'l', 'n', 'u', 'm').  Emit
        plain ASCII ranges instead, as was already done for DIGIT.
        """
        patterns = {
            AsciiClass.ALNUM: r"[0-9A-Za-z]",
            AsciiClass.ALPHA: r"[A-Za-z]",
            AsciiClass.BLANK: r"[ \t]",
            AsciiClass.CNTRL: r"[\x00-\x1f\x7f]",
            AsciiClass.DIGIT: r"[0-9]",
            AsciiClass.GRAPH: r"[!-~]",
            AsciiClass.LOWER: r"[a-z]",
            AsciiClass.PRINT: r"[ -~\t\n\r\x0b\x0c]",
            AsciiClass.PUNCT: r"[!-/:-@\[-`{-~]",
            AsciiClass.SPACE: r"[ \t\n\r\x0b\x0c]",
            AsciiClass.UPPER: r"[A-Z]",
            AsciiClass.XDIGIT: r"[0-9A-Fa-f]",
            AsciiClass.ANY: r".",
        }
        try:
            return patterns[cls]
        except KeyError:
            raise ValueError(f"Unsupported ASCII class {cls}") from None

    @staticmethod
    def get_class_characters(symbol_class: AsciiClass) -> set[str]:
        """Return the complete character set belonging to *symbol_class*.

        BUG FIX: composite classes were built with "&" (set intersection),
        which yields the EMPTY set (e.g. ALPHA & DIGIT); composites are
        unions.  CNTRL and ANY, which previously raised, are now supported.
        """
        get = AsciiClass.get_class_characters

        if symbol_class == AsciiClass.ALNUM:
            return get(AsciiClass.ALPHA) | get(AsciiClass.DIGIT)

        if symbol_class == AsciiClass.ALPHA:
            return get(AsciiClass.UPPER) | get(AsciiClass.LOWER)

        if symbol_class == AsciiClass.BLANK:
            return {" ", "\t"}

        if symbol_class == AsciiClass.CNTRL:
            # Octal codes 000 through 037, plus 177 (DEL).
            return {chr(i) for i in range(0o40)} | {chr(0o177)}

        if symbol_class == AsciiClass.DIGIT:
            return set(string.digits)

        if symbol_class == AsciiClass.GRAPH:
            return get(AsciiClass.ALNUM) | get(AsciiClass.PUNCT)

        if symbol_class == AsciiClass.LOWER:
            return set(string.ascii_lowercase)

        if symbol_class == AsciiClass.PRINT:
            return get(AsciiClass.ALNUM) | get(AsciiClass.PUNCT) | get(AsciiClass.SPACE)

        if symbol_class == AsciiClass.PUNCT:
            return set(string.punctuation)

        if symbol_class == AsciiClass.UPPER:
            return set(string.ascii_uppercase)

        if symbol_class == AsciiClass.XDIGIT:
            return set(string.hexdigits)

        if symbol_class == AsciiClass.SPACE:
            return set(string.whitespace)

        if symbol_class == AsciiClass.ANY:
            return get(AsciiClass.PRINT) | get(AsciiClass.CNTRL)

        raise ValueError(f"Unknown ASCII class {symbol_class}")

    @staticmethod
    def get_ascii_class(s: str) -> AsciiClass:
        """Classify a single character into its most specific class."""
        if len(s) != 1:
            # BUG FIX: the original only rejected len > 1 and would
            # mis-classify the empty string as PUNCT via isprintable().
            raise ValueError("Expected single character")

        if s.isdigit():
            return AsciiClass.DIGIT

        if s.isalpha():
            if s.islower():
                return AsciiClass.LOWER
            if s.isupper():
                return AsciiClass.UPPER
            return AsciiClass.ALPHA  # cased-neutral alphabetic characters

        if s.isspace():
            return AsciiClass.SPACE

        if s.isprintable():
            return AsciiClass.PUNCT

        raise ValueError(f"{s} unknown")

    @staticmethod
    def find_common_ancestor(class1: AsciiClass, class2: AsciiClass) -> AsciiClass:
        """Return the most specific class containing both arguments."""
        ancestors: set[AsciiClass] = set()
        node: Optional[AsciiClass] = class1
        while node is not None:
            ancestors.add(node)
            node = AsciiClass.get_parent(node)

        node = class2
        while node is not None:
            if node in ancestors:
                return node
            node = AsciiClass.get_parent(node)

        # Unreachable: ANY is in every ancestor chain.
        return AsciiClass.ANY


@dataclass
class Symbol:
    """One position in a token: an ASCII class plus the concrete characters
    observed at that position so far."""

    a_class: AsciiClass
    chars: set[str]
    is_class: bool  # True once chars saturates the whole class
    is_optional: bool = False

    def fit_score(self, s: str, alpha: float) -> float:
        """0 for a class match, alpha for a character-only match, 1 otherwise."""
        if AsciiClass.get_ascii_class(s) == self.a_class:
            return 0
        if not self.is_class and s in self.chars:
            return alpha
        return 1

    def __str__(self) -> str:
        suffix = "?" if self.is_optional else ""
        if self.is_class:
            # BUG FIX: the optional marker was dropped for class symbols.
            return AsciiClass.get_ascii_class_pattern(self.a_class) + suffix
        if len(self.chars) == 1:
            return self._sanitize(next(iter(self.chars))) + suffix
        return "[" + "".join(Symbol._sanitize(c) for c in self.chars) + "]" + suffix

    def fit(self, other: Symbol) -> float:
        """Distance between two symbols: 0 when one class contains the other,
        otherwise based on the overlap of observed characters."""
        if self.a_class == other.a_class:
            return 0

        ancestor = AsciiClass.find_common_ancestor(self.a_class, other.a_class)
        if ancestor in (self.a_class, other.a_class):
            return 0

        common_chars = len(self.chars & other.chars)
        if common_chars:
            return 1 - common_chars / len(self.chars)
        return 1

    @staticmethod
    def _sanitize(c: str) -> str:
        """Escape *c* so it is safe both alone and inside a "[...]" class.

        BUG FIX: "]", "}" and "-" were not escaped; an unescaped "-" inside
        "[...]" creates an accidental character range, and "]" closes the
        class early.
        """
        if c in ".^$*+?()[]{}\\|-":
            return f"\\{c}"
        return c

    def merge(self, other: Symbol) -> Symbol:
        """Combine two symbols: generalise the class, union the characters."""
        if other.a_class != self.a_class:
            na_class = AsciiClass.find_common_ancestor(other.a_class, self.a_class)
        else:
            na_class = self.a_class

        chars = self.chars | other.chars

        return Symbol(
            na_class,
            chars=chars,
            is_class=len(chars) == len(AsciiClass.get_class_characters(na_class)),
            # BUG FIX: optionality was silently lost on merge.
            is_optional=self.is_optional or other.is_optional,
        )

    @staticmethod
    def build(symbol: str) -> Symbol:
        """Create a Symbol from a single observed character."""
        symbol_class = AsciiClass.get_ascii_class(symbol)
        return Symbol(
            a_class=symbol_class,
            is_class=False,
            chars=set(symbol),
        )
@dataclass
class Token:
    """A run of symbols between delimiters; one Symbol per character position."""

    symbols: list[Symbol] = field(default_factory=list)
    optional: bool = False

    def fit_score(self, t: str, alpha: float) -> float:
        """Positional symbol scores plus a penalty for any length mismatch."""
        pairwise = zip(self.symbols, Token.get_symbols_in_token(t))
        positional = sum(sym.fit_score(ch, alpha) for sym, ch in pairwise)
        return positional + abs(len(t) - len(self.symbols))

    def merge(self, other: Token) -> Token:
        """Merge two tokens position-wise; the longer tail becomes optional."""
        merged = [
            mine.merge(theirs)
            for mine, theirs in zip(self.symbols, other.symbols)
        ]

        either_optional = self.optional or other.optional

        if len(self.symbols) == len(other.symbols):
            return Token(symbols=merged, optional=either_optional)

        # One token is longer: its surplus symbols are kept, marked optional.
        longer = self.symbols if len(self.symbols) > len(other.symbols) else other.symbols
        tail = [
            Symbol(s.a_class, s.chars, s.is_class, True)
            for s in longer[len(merged):]
        ]

        return Token(symbols=merged + tail, optional=either_optional)

    def fit(self, other: Token) -> float:
        """Distance between two tokens: symbol distances + length penalty."""
        total = 0
        for mine, theirs in zip(self.symbols, other.symbols):
            total += mine.fit(theirs)
        return total + abs(len(self.symbols) - len(other.symbols))

    @staticmethod
    def get_symbols_in_token(t: str) -> Generator[str, None, None]:
        """Yield the characters of *t* one at a time."""
        yield from t

    def __str__(self) -> str:
        body = "".join(map(str, self.symbols))
        suffix = "?" if self.optional else ""
        return f"({body}){suffix}"

    @staticmethod
    def build(word: str) -> Token:
        """Create a Token with one freshly-built Symbol per character."""
        return Token([Symbol.build(ch) for ch in word])


class NullToken(Token):
    """Sentinel token with no symbols; scores purely on length."""

    def d(self, t: str) -> float:
        return 1.0 * len(t)
@dataclass
class Branch:
    """One alternative of the learnt pattern: an ordered list of tokens."""

    tokens: list[Token] = field(default_factory=list)

    def fit_score(self, t: str, alpha: float) -> float:
        """How badly *t* fits this branch (0 == perfect fit)."""
        parts = list(Branch.get_tokens_in_tuple(t))

        # Pad with NullToken so every part of the word is scored.
        # BUG FIX: the original padded to len(t) *characters* and let zip
        # truncate, which dropped the last part of words that split into
        # more parts than characters (e.g. a leading delimiter).
        padded = [
            self.tokens[i] if i < len(self.tokens) else NullToken()
            for i in range(len(parts))
        ]

        return sum(token.fit_score(part, alpha) for token, part in zip(padded, parts))

    def add(self, word: str) -> None:
        """Fold *word* into this branch.

        BUG FIX: the original zipped against the new word's tokens, which
        silently truncated the branch (or ignored part of the word)
        whenever the token counts differed.  Reuse merge(), which pads the
        shorter side with optional tokens instead.
        """
        self.tokens = self.merge(Branch.build(word)).tokens

    def __str__(self) -> str:
        return "".join(str(token) for token in self.tokens)

    def fit(self, other: Branch) -> float:
        """Distance between two branches: token distances + length penalty."""
        return sum(
            token.fit(other_token)
            for token, other_token in zip(self.tokens, other.tokens)
        ) + abs(len(self.tokens) - len(other.tokens))

    def merge(self, other: Branch) -> Branch:
        """Merge two branches token-wise; the longer tail becomes optional."""
        tokens = [
            token.merge(other_token)
            for token, other_token in zip(self.tokens, other.tokens)
        ]

        if len(self.tokens) == len(other.tokens):
            return Branch(tokens)

        longer = self.tokens if len(self.tokens) > len(other.tokens) else other.tokens
        missing = [Token(token.symbols, True) for token in longer[len(tokens):]]

        assert len(tokens) + len(missing) == len(longer)

        return Branch(tokens + missing)

    @staticmethod
    def get_tokens_in_tuple(t: str, delimiters: str = r"[-_/\\#., ]") -> Generator[str, None, None]:
        """Split *t* into alternating field / delimiter parts.

        Each delimiter is yielded as its own token; the (possibly empty)
        fields between delimiters are yielded as-is.
        """
        pattern: Pattern[str] = re.compile(delimiters)

        last_match: Optional[Match[str]] = None

        for m in re.finditer(pattern, t):
            if last_match is None:
                yield t[:m.start()]
            else:
                yield t[last_match.end():m.start()]

            yield t[m.start():m.end()]

            last_match = m

        if last_match is None:
            yield t
        else:
            yield t[last_match.end():]

    @staticmethod
    def build(word: str) -> Branch:
        """Create a Branch with one freshly-built Token per part of *word*."""
        return Branch(
            tokens=[
                Token.build(token) for token in Branch.get_tokens_in_tuple(word)
            ]
        )

    def __repr__(self) -> str:
        # BUG FIX: the closing bracket was missing.
        return f"Branch[{str(self)}]"


@dataclass
class XTructure:
    """Incremental pattern learner holding up to max_branches alternatives."""

    alpha: float = 1 / 5  # weight of a character-only symbol match
    max_branches: int = 8  # merge the two closest branches beyond this count
    branching_threshold: float = 0.85  # scores >= this open a new branch

    branches: list[Branch] = field(default_factory=list)

    def fit_score(self, t: str) -> float:
        """Best (lowest) fit score of *t* across all branches.

        Raises ValueError if no word has been learnt yet (min of empty).
        """
        return min(b.fit_score(t, self.alpha) for b in self.branches)

    def learn_new_word(self, word: str) -> bool:
        """Fold *word* into the model; returns False for the empty word."""
        if not word:
            return False

        if not self.branches:
            self.branches.append(Branch.build(word))
        else:
            best_branch, score = self._best_branch(word)

            if score < self.branching_threshold:
                best_branch.add(word)
            else:
                self.branches.append(Branch.build(word))

        if len(self.branches) > self.max_branches:
            self.branches = self.merge_most_similar()

        return True

    def _best_branch(self, word: str) -> tuple[Branch, float]:
        """Return the branch with the lowest fit score for *word*."""
        assert len(self.branches)

        best_score = math.inf
        best_branch: Optional[Branch] = None

        for branch in self.branches:
            branch_score = branch.fit_score(word, self.alpha)

            if branch_score < best_score:
                best_branch = branch
                best_score = branch_score

        assert best_branch is not None
        assert best_score != math.inf

        return best_branch, best_score

    def merge_most_similar(self) -> list[Branch]:
        """Replace the two closest branches with their merge; return branches."""
        # min() keeps the first pair on ties, matching the original's
        # strict "<" scan over combinations.
        m_bi, m_bj = min(
            combinations(self.branches, 2),
            key=lambda pair: pair[0].fit(pair[1]),
        )

        self.branches.remove(m_bi)
        self.branches.remove(m_bj)

        self.branches.append(m_bi.merge(m_bj))

        return self.branches

    def __str__(self) -> str:
        """Render the learnt model as a regex: branches joined by '|'."""
        return "|".join(str(branch) for branch in self.branches)
help="Maximum number of branches allowed, defaults to 8") 471 | parser.add_argument("--alpha", type=float, default=1 / 5, help="Weight for fitting tuples, defaults to 1/5") 472 | parser.add_argument("--branch-threshold", type=float, default=.85, help="Branching threshold, defaults to 0.85, relative to the fitting score alpha") 473 | 474 | return parser.parse_args() 475 | 476 | 477 | def main() -> int: 478 | cmd = parse_arguments() 479 | 480 | x = XTructure( 481 | cmd.alpha, 482 | cmd.max_branch, 483 | cmd.branch_threshold 484 | ) 485 | 486 | data_source = open(cmd.input) if cmd.input else sys.stdin 487 | 488 | for line in data_source: 489 | x.learn_new_word(line.strip()) 490 | 491 | output = open(cmd.output) if cmd.output else sys.stdout 492 | 493 | print(str(x), file=output) 494 | 495 | return 0 496 | 497 | 498 | if __name__ == "__main__": 499 | raise SystemExit(main()) 500 | --------------------------------------------------------------------------------