├── .codecov.yml ├── .editorconfig ├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── .pyup.yml ├── .travis.yml ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── ctparse ├── __init__.py ├── corpus.py ├── count_vectorizer.py ├── ctparse.py ├── loader.py ├── models │ ├── __init__.py │ ├── dummy.py │ └── model.pbz ├── nb_estimator.py ├── nb_scorer.py ├── partial_parse.py ├── pipeline.py ├── py.typed ├── rule.py ├── scorer.py ├── time │ ├── __init__.py │ ├── auto_corpus.py │ ├── corpus.py │ ├── postprocess_latent.py │ └── rules.py ├── timers.py └── types.py ├── datasets ├── README.rst └── timeparse_corpus.json ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── ctparse.rst ├── ctparse.time.rst ├── dataset.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── modules.rst ├── readme.rst └── usage.rst ├── mypy.ini ├── requirements.txt ├── requirements_dev.txt ├── scripts └── train_default_model.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_corpus.py ├── test_count_vectorizer.py ├── test_ctparse.py ├── test_partialparse.py ├── test_regressions.py ├── test_rule.py ├── test_scorer.py ├── test_time_rules.py ├── test_timers.py └── test_types.py └── tox.ini /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: off 2 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * ctparse - Parse natural language time expressions in pytho version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | .idea/ 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # pytest 105 | .pytest_cache/ 106 | 107 | # system 108 | .DS_Store 109 | 110 | # vscode 111 | .vscode 112 | -------------------------------------------------------------------------------- /.pyup.yml: -------------------------------------------------------------------------------- 1 | # autogenerated pyup.io config file 2 | # see https://pyup.io/docs/configuration/ for all available options 3 | 4 | update: insecure 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: required 3 | dist: xenial 4 | python: 5 | - 3.8 6 | - 3.7 7 | - 3.6 8 | install: pip install -U tox-travis codecov 9 | script: tox 10 | after_success: 11 | codecov 12 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Development Lead 6 | ---------------- 7 | 8 | * Sebastian Mika 9 | 10 | Contributors 11 | ------------ 12 | 13 | * Gabriele Lanaro 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Contributing 3 | ============ 4 | 5 | Contributions are welcome, and they are greatly appreciated! Every little bit 6 | helps, and credit will always be given. 7 | 8 | You can contribute in many ways: 9 | 10 | 11 | Add Rules & Increase Coverage 12 | ----------------------------- 13 | 14 | If you find an expressions that ``ctparse`` can not resolve correctly 15 | but you feel it should do, you can adjust the existing rules or add a 16 | new one. 17 | 18 | The following steps are probably a helpful guideline. 19 | 20 | * Add your case to the ``corpus.py`` file and run the corpus tests 21 | using ``py.test tests/test_run_corpus.py``. Now basically two things can happen: 22 | 23 | #. **The tests pass**, which means ``ctparse`` can correctly resolve 24 | the expression. It might not score it highest. To check this, 25 | rebuild the model and try parsing the expression again: 26 | 27 | .. code:: bash 28 | 29 | make train 30 | 31 | To avoid issues with reloading, please restart the python 32 | interpreter after regenerating the model. 33 | 34 | If this fixes the issue please commit the updated ``corpus.py`` 35 | and the updated model as a pull request (PR) on GitHub, see this guide for 36 | more information on what pull requests are and how to create them 37 | https://help.github.com/articles/creating-a-pull-request/. 38 | 39 | The scoring can be influenced by 40 | adding more structurally identical examples to the corpus. Seeing 41 | more samples where a specific sequence of rule applications leads 42 | to the correct ranking will drive the model to favor these. This 43 | comes, however, at the potential price of downranking certain 44 | other production sequences. Although it would generally be 45 | considered more favorable to add varying test cases (e.g. in 46 | different languages, slight variation) to the corpus, the same 47 | string can also just be duplicated to achive this *implict 48 | up-weightning* effect. The examples that are intended to influence the scoring, 49 | as opposed to the ones used to develop new rules, are usually appended 50 | to the file ``auto_corpus.py```. 51 | 52 | #. **The tests fail**: if this is because not all tests in the 53 | corpus pass, i.e. you get an error message like the following:: 54 | 55 | ctparse.py 527 WARNING failure: target "Time[]{2019-X-X X:X (X/X)}" never produced in "2019" 56 | ctparse.py 532 WARNING failure: "Time[]{2019-X-X X:X (X/X)}" not always produced 57 | 58 | * If the tests fail, run ``ctparse`` in debug mode to see what goes wrong: 59 | 60 | .. code:: python 61 | 62 | import logging 63 | from ctparse import ctparse 64 | from ctparse.ctparse import logger 65 | from datetime import datetime 66 | 67 | logger.addHandler(logging.StreamHandler()) 68 | logger.setLevel(logging.DEBUG) 69 | 70 | # Set reference time 71 | ts = datetime(2018, 3, 12, 14, 30) 72 | r = list(ctparse('May 5th', ts=ts, debug=True)) 73 | 74 | 75 | This gives you plenty of debugging output. First you will see 76 | the individual regular expressions that were matched (and the time 77 | this took):: 78 | 79 | ================================================================================ 80 | -> matching regular expressions 81 | regex: RegexMatch[0-3]{114:May} 82 | regex: RegexMatch[4-5]{133:5} 83 | regex: RegexMatch[4-7]{135:5th} 84 | regex: RegexMatch[4-5]{134:5} 85 | regex: RegexMatch[4-5]{148:5} 86 | time in _match_regex: 1ms 87 | ================================================================================ 88 | 89 | Each line has the form ``regex: RegexMatch[0-3]{114:May}`` and describes 90 | the matched span in the text ``[0-3]``, the ID of the matching expression 91 | ``114`` and the surface string that the expression matched ``May``. 92 | 93 | If relevant parts of your expression were not picked up, this is an 94 | indicator that you should either modify an existing regular 95 | expression or need to add a new rule (see below). 96 | 97 | Next you see the unique sub-sequences constructed based on these 98 | regular expressions (plus again the time used to build them):: 99 | 100 | ================================================================================ 101 | -> building initial stack 102 | regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-7]{135:5th}) 103 | regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{148:5}) 104 | regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{134:5}) 105 | regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{133:5}) 106 | time in _regex_stack: 0ms 107 | initial stack length: 4 108 | stack length after relative match length: 1 109 | stack length after max stack depth limit: 1 110 | ================================================================================ 111 | 112 | This is followed by a summary of how many applicable rules there are 113 | per initial stack element:: 114 | 115 | ================================================================================ 116 | -> checking rule applicability 117 | of 75 total rules 20 are applicable in (RegexMatch[0-3]{114:May}, RegexMatch[4-7]{135:5th}) 118 | time in _filter_rules: 0ms 119 | ================================================================================ 120 | ================================================================================ 121 | -> checking rule applicability 122 | of 75 total rules 20 are applicable in (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{148:5}) 123 | time in _filter_rules: 0ms 124 | ================================================================================ 125 | ... 126 | 127 | Again, if you do not see any sequence that captures all relevant 128 | parts of your input, you may need to modify the regular expressions 129 | or add new ones via rules. 130 | 131 | Finally you see a list of productions that are applied to stack 132 | elements, where for each applicable rule the rule name and the new 133 | stack sequence are printed, e.g.:: 134 | 135 | -------------------------------------------------------------------------------- 136 | producing on (RegexMatch[0-3]{114:May}, RegexMatch[4-7]{135:5th}), score=-0.13 137 | ruleMonthMay -> (Time[0-3]{X-05-X X:X (X/X)}, RegexMatch[4-7]{135:5th}), score=1.41 138 | ruleDOM2 -> (RegexMatch[0-3]{114:May}, Time[4-7]{X-X-05 X:X (X/X)}), score=1.38 139 | added 2 new stack elements, depth after trunc: 2 140 | -------------------------------------------------------------------------------- 141 | 142 | If no productions could be applied to a stack element the emitted 143 | results are printed:: 144 | 145 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 146 | no rules applicable: emitting 147 | => Time[0-7]{2018-05-05 X:X (X/X)}, score=15.91, 148 | -------------------------------------------------------------------------------- 149 | 150 | If the desired production does not show up, but the regular 151 | expressions look fine and the initial stack elements as well, try 152 | increasing the ``max_stack_depth`` parameter, i.e. run 153 | ``ctparse(..., max_stack_depth=0)``. Also make sure that the 154 | ``timeout`` parameter is not set. Maybe ``ctparse`` is able to 155 | generate the resolution but it is too deep in the stack. 156 | 157 | 158 | Adding a rule 159 | ~~~~~~~~~~~~~ 160 | 161 | When adding rules try to follow these guidelines: 162 | 163 | 1. Be as general as possible: instead of writing one long regular 164 | expression that matches only a specific case, check whether you can 165 | rather divide your pattern in production parts + some regular 166 | expressions. For example, if you have a very specific way to 167 | speficy the year of a date in mind, it might do no harm to just 168 | allow anything that with ``predicate('hasDate')`` plus your 169 | specific year expression, i.e. 170 | 171 | .. code:: python 172 | 173 | @rule(predicate('hasDate'), r'your funky year') 174 | 175 | 2. Keep your regex as general as possible, but avoid regular 176 | expressions that are likely to generate many "false positives". Often 177 | that can be prevented by using positive or negative lookaheads and 178 | lookbehinds to keep the context sane (see `Lookaround 179 | `_ on the 180 | excellent regular-expression.info site). 181 | 182 | 3. Make sure your production covers corner cases and matches the 183 | ``ctparse`` opinion to resolve to times in the near future but - 184 | unless explicit -- never in the past (relative to the reference 185 | time). Also make sure it favors the close future over the further 186 | future. 187 | 188 | 189 | Other Types of Contributions 190 | ---------------------------- 191 | 192 | Report Bugs 193 | ~~~~~~~~~~~ 194 | 195 | Report bugs at https://github.com/comtravo/ctparse/issues. 196 | 197 | If you are reporting a bug, please include: 198 | 199 | * Your operating system name and version. 200 | * Any details about your local setup that might be helpful in troubleshooting. 201 | * Detailed steps to reproduce the bug. 202 | 203 | Fix Bugs 204 | ~~~~~~~~ 205 | 206 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 207 | wanted" is open to whoever wants to implement it. 208 | 209 | Implement Features 210 | ~~~~~~~~~~~~~~~~~~ 211 | 212 | Look through the GitHub issues for features. Anything tagged with "enhancement" 213 | and "help wanted" is open to whoever wants to implement it. 214 | 215 | Write Documentation 216 | ~~~~~~~~~~~~~~~~~~~ 217 | 218 | ctparse - Parse natural language time expressions in pytho could always use more documentation, whether as part of the 219 | official ctparse - Parse natural language time expressions in pytho docs, in docstrings, or even on the web in blog posts, 220 | articles, and such. 221 | 222 | Submit Feedback 223 | ~~~~~~~~~~~~~~~ 224 | 225 | The best way to send feedback is to file an issue at https://github.com/comtravo/ctparse/issues. 226 | 227 | If you are proposing a feature: 228 | 229 | * Explain in detail how it would work. 230 | * Keep the scope as narrow as possible, to make it easier to implement. 231 | * Remember that this is a volunteer-driven project, and that contributions 232 | are welcome :) 233 | 234 | Get Started! 235 | ------------ 236 | 237 | Ready to contribute? Here's how to set up `ctparse` for local development. 238 | 239 | 1. Fork the `ctparse` repo on GitHub. 240 | 2. Clone your fork locally:: 241 | 242 | $ git clone git@github.com:your_name_here/ctparse.git 243 | 244 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 245 | 246 | $ mkvirtualenv ctparse 247 | $ cd ctparse/ 248 | $ python setup.py develop 249 | 250 | 4. Create a branch for local development:: 251 | 252 | $ git checkout -b name-of-your-bugfix-or-feature 253 | 254 | Now you can make your changes locally. 255 | 256 | 5. When you're done making changes, check that your changes pass flake8 and the 257 | tests, including testing other Python versions with tox:: 258 | 259 | $ flake8 ctparse tests 260 | $ python setup.py test or py.test 261 | $ tox 262 | 263 | To get flake8 and tox, just pip install them into your virtualenv. 264 | 265 | 6. Commit your changes and push your branch to GitHub:: 266 | 267 | $ git add . 268 | $ git commit -m "Your detailed description of your changes." 269 | $ git push origin name-of-your-bugfix-or-feature 270 | 271 | 7. Submit a pull request through the GitHub website. 272 | 273 | Pull Request Guidelines 274 | ----------------------- 275 | 276 | Before you submit a pull request, check that it meets these guidelines: 277 | 278 | 1. The pull request should include tests. 279 | 2. If the pull request adds functionality, the docs should be updated. Put 280 | your new functionality into a function with a docstring, and add the 281 | feature to the list in README.rst. 282 | 3. The pull request should work for Python 3.6., 3.7. and 3.8. Check 283 | https://travis-ci.org/comtravo/ctparse/pull_requests 284 | and make sure that the tests pass for all supported Python versions. 285 | 286 | Tips 287 | ---- 288 | 289 | To run a subset of tests:: 290 | 291 | $ py.test tests.test_ctparse 292 | 293 | 294 | Deploying 295 | --------- 296 | 297 | A reminder for the maintainers on how to deploy. 298 | Make sure all your changes are committed (including an entry in HISTORY.rst). 299 | Then run on the ``master`` branch:: 300 | 301 | $ bumpversion patch # possible: major / minor / patch 302 | $ git push 303 | $ git push --tags 304 | $ make release 305 | 306 | You will need a username and password to upload to pypi (might be 307 | automated on Travis). 308 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 6 | 0.3.0 (2021-02-01) 7 | ------------------ 8 | 9 | * Removed latent rules regarding times (latent rules regarding dates are still present) 10 | * Added latent_time option to customize the new behavior, defauld behavior is backwards-compatible 11 | 12 | 0.2.1 (2020-05-27) 13 | ------------------ 14 | 15 | * Update development dependencies 16 | * Add flake8-bugbear and fixed issues 17 | 18 | 0.2.0 (2020-04-23) 19 | ------------------ 20 | 21 | * Implemented new type `Duration`, to handle lengths of time 22 | * Adapted the dataset to include `Duration` 23 | * Implemented basic rule to merge `Duration`, `Time` and `Interval` in simple cases. 24 | * Created a make target to train the model `make train` 25 | 26 | 0.1.0 (2020-03-20) 27 | ------------------ 28 | 29 | * Major refactor of code underlying predictive model 30 | * Based on a contribution from @bharathi-srini: replace naive bayes from sklearn by own implementation 31 | * Thus remove dependencies on numpy, scipy, scikit-learn 32 | * Predictions are much faster: 97/s in the old vs. 239/s in the new code base 33 | * Performance identical 34 | * Deprecate support for python 3.5, add 3.8 35 | * Add more strict type checking rules (mypy.ini) 36 | * Force black code formatting, make this a linter step, "black" all code 37 | 38 | 0.0.47 (2020-02-28) 39 | ------------------- 40 | 41 | * Allow overlapping matches of regular expression when generating inital stack of "tokens" 42 | 43 | 0.0.46 (2020-02-26) 44 | ------------------- 45 | 46 | * Implemented heuristics to detect (albeit imperfectly) military times 47 | 48 | 0.0.44 (2019-11-05) 49 | ------------------- 50 | 51 | * Released time corpus 52 | * Implemented training model using ctparse corpus 53 | 54 | 0.0.43 (2019-11-01) 55 | ------------------- 56 | 57 | * Added slash as a general separator 58 | * Added ruleTODTOD (to support expression like afternoon/evening) 59 | 60 | 0.0.42 (2019-10-30) 61 | ------------------- 62 | 63 | * Removed nb module 64 | * Fix for two digit years 65 | * Freshly retrained model binary file 66 | 67 | 0.0.41 (2019-10-29) 68 | ------------------- 69 | 70 | * Fix run_corpus refactoring bug 71 | * Implemented retraining utilities 72 | 73 | 0.0.40 (2019-10-25) 74 | ------------------- 75 | 76 | * update develop dependencies 77 | * remove unused Protocol import from typing_extensions 78 | 79 | 0.0.39 (2019-10-24) 80 | ------------------- 81 | 82 | * split ctparse file into several different modules 83 | * added types to public interface 84 | * introduced the Scorer abstraction to implement richer scoring strategies 85 | 86 | 0.0.38 (2018-11-05) 87 | ------------------- 88 | 89 | * Added python 3.7 to supported versions (fix on travis available) 90 | 91 | 0.0.8 (2018-06-07) 92 | ------------------ 93 | 94 | * First release on PyPI. 95 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018, Sebastian Mika, Comtravo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | include requirements.txt 7 | 8 | recursive-include tests * 9 | recursive-exclude * __pycache__ 10 | recursive-exclude * *.py[co] 11 | 12 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | try: 8 | from urllib import pathname2url 9 | except: 10 | from urllib.request import pathname2url 11 | 12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 13 | endef 14 | export BROWSER_PYSCRIPT 15 | 16 | define PRINT_HELP_PYSCRIPT 17 | import re, sys 18 | 19 | for line in sys.stdin: 20 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 21 | if match: 22 | target, help = match.groups() 23 | print("%-20s %s" % (target, help)) 24 | endef 25 | export PRINT_HELP_PYSCRIPT 26 | 27 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 28 | 29 | help: 30 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 31 | 32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 33 | 34 | clean-build: ## remove build artifacts 35 | rm -fr build/ 36 | rm -fr dist/ 37 | rm -fr .eggs/ 38 | find . -name '*.egg-info' -exec rm -fr {} + 39 | find . -name '*.egg' -exec rm -f {} + 40 | 41 | clean-pyc: ## remove Python file artifacts 42 | find . -name '*.pyc' -exec rm -f {} + 43 | find . -name '*.pyo' -exec rm -f {} + 44 | find . -name '*~' -exec rm -f {} + 45 | find . -name '__pycache__' -exec rm -fr {} + 46 | 47 | clean-test: ## remove test and coverage artifacts 48 | rm -fr .tox/ 49 | rm -f .coverage 50 | rm -fr htmlcov/ 51 | rm -fr .pytest_cache 52 | 53 | lint: ## check style with flake8 54 | black --check ctparse tests 55 | flake8 ctparse tests 56 | mypy -p ctparse -p tests 57 | 58 | test: ## run tests quickly with the default Python 59 | py.test 60 | 61 | test-all: ## run tests on every Python version with tox 62 | tox 63 | 64 | train: 65 | python scripts/train_default_model.py --legacy --dataset datasets/timeparse_corpus.json 66 | 67 | coverage: ## check code coverage quickly with the default Python 68 | coverage run --source ctparse -m pytest 69 | coverage report -m 70 | coverage html 71 | $(BROWSER) htmlcov/index.html 72 | 73 | docs: ## generate Sphinx HTML documentation, including API docs 74 | rm -f docs/ctparse.rst 75 | rm -f docs/modules.rst 76 | sphinx-apidoc -o docs/ ctparse 77 | $(MAKE) -C docs clean 78 | $(MAKE) -C docs html 79 | $(BROWSER) docs/_build/html/index.html 80 | 81 | servedocs: docs ## compile the docs watching for changes 82 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 83 | 84 | release: dist ## package and upload a release 85 | twine upload dist/* 86 | 87 | dist: clean ## builds source and wheel package 88 | python setup.py sdist 89 | python setup.py bdist_wheel 90 | ls -l dist 91 | 92 | install: clean ## install the package to the active Python's site-packages 93 | python setup.py install 94 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | =========================================================== 2 | quickadd 3 | =========================================================== 4 | 5 | quickadd is a natural language date & time parser written in python. It builds on top of ctparse_ and is an actively maintained fork. 6 | 7 | Installation 8 | ---------- 9 | 10 | With ``pip install -e git+https://github.com/Acreom/quickadd.git#egg=quickadd`` 11 | 12 | 13 | or run ``python setup.py install`` in the root directory after forking. 14 | 15 | 16 | Main upgrades include: 17 | ---------- 18 | 19 | **Recurring events** 20 | 21 | 22 | .. code:: python 23 | 24 | r = ctparse("beer daily 4pm") 25 | r.resolution 26 | Recurring[5-14]{daily 1 2021-05-09 16:00 (X/X) 2021-05-09 16:00 (X/X)} 27 | 28 | r = ctparse("beer every thursday 4") 29 | r.resolution 30 | Recurring[5-21]{weekly 1 2021-04-15 16:00 (X/X) 2021-04-15 16:00 (X/X)} 31 | 32 | r = ctparse("beer every friday 9-5") 33 | r.resolution 34 | Recurring[5-21]{weekly 1 2021-05-14 09:00 (X/X) 2021-05-14 17:00 (X/X)} 35 | 36 | r = ctparse("beer september 24 / beer every 24.9") 37 | r.resolution 38 | Recurring[5-21]{YEARLY 1 2021-09-24 (X/X) 2021-09-24 (X/X)} 39 | 40 | r = ctparse("beer thursdays 3pm and wednesdays 4pm") 41 | r.resolution 42 | RecurringArray[5-37]{ 43 | Recurring instance: weekly 1 2021-05-13 15:00 (X/X) 2021-05-13 15:00 (X/X) 44 | Recurring instance: weekly 1 2021-05-12 16:00 (X/X) 2021-05-12 16:00 (X/X) 45 | } 46 | 47 | r = ctparse("beer 9pm weekdays") 48 | r.resolution 49 | RecurringArray[5-17]{ 50 | Recurring instance: weekly 1 2021-05-10 21:00 (X/X) 2021-05-10 21:00 (X/X) 51 | Recurring instance: weekly 1 2021-05-11 21:00 (X/X) 2021-05-11 21:00 (X/X) 52 | Recurring instance: weekly 1 2021-05-12 21:00 (X/X) 2021-05-12 21:00 (X/X) 53 | Recurring instance: weekly 1 2021-05-13 21:00 (X/X) 2021-05-13 21:00 (X/X) 54 | Recurring instance: weekly 1 2021-05-14 21:00 (X/X) 2021-05-14 21:00 (X/X)} 55 | 56 | 57 | **More rules** 58 | 59 | ruleNextFrequency 60 | 61 | .. code:: python 62 | 63 | #reference date = Dec 13th 2022 64 | r = ctparse("code next week 4pm") 65 | r.resolution 66 | Time[5-18]{2022-12-20 16:00 67 | 68 | r = ctparse("code next month") 69 | r.resolution 70 | Time[5-15]{2023-01-13 X:X (X/X)} 71 | 72 | 73 | ruleLastDOM 74 | 75 | .. code:: python 76 | 77 | #reference date = Dec 13th 2022 78 | r = ctparse("code last monday of the month") 79 | r.resolution 80 | Time[5-17]{2022-12-26 X:X (X/X)} 81 | 82 | 83 | rrule_ **support** 84 | 85 | .. code:: python 86 | 87 | r.resolution.to_rrule() 88 | Out[4]: 'RRULE:FREQ=DAILY;COUNT=1' 89 | 90 | 91 | **Subject extraction** 92 | 93 | 94 | .. code:: python 95 | 96 | r = ctparse("beers and burgers friday 8pm-9pm") 97 | r.subject 98 | Out[2]: 'beers and burgers' 99 | 100 | 101 | **PM bias** 102 | 103 | 104 | .. code:: python 105 | 106 | r = ctparse("fix the issue tmrw 2") 107 | r.resolution 108 | Time[14-20]{2022-11-23 14:00 (X/X)} 109 | 110 | r = ctparse("fix the issue tmrw 2", pm_bias=False) 111 | r.resolution 112 | Time[14-20]{2022-11-23 02:00 (X/X)} 113 | 114 | 115 | **Rules for ambigious natural language expressions** 116 | 117 | .. code:: python 118 | 119 | r = ctparse("code 9-5") 120 | r.resolution 121 | Interval[0-0]{2022-11-23 09:00 (X/X) - 2022-11-23 17:00 (X/X)} 122 | 123 | 124 | **US/EU date format** 125 | 126 | 127 | .. code:: python 128 | 129 | r = ctparse("fix the issue 5.3") 130 | r.resolution 131 | Time[14-17]{2023-03-05 X:X (X/X)} 132 | 133 | r = ctparse("fix the issue 5.3", date_format="US") 134 | r.resolution 135 | Time[14-17]{2023-05-03 X:X (X/X)} 136 | 137 | 138 | **Rule combinations** 139 | 140 | .. code:: python 141 | 142 | r = ctparse("beer in 3 days 4pm") 143 | r.resolution 144 | Time[5-18]{2021-05-12 16:00 (X/X)} 145 | 146 | 147 | r = ctparse("beer in 3 days 4pm every week") 148 | r.resolution 149 | Recurring[5-29]{weekly 1 2021-05-12 16:00 (X/X) 2021-05-12 16:00 (X/X)} 150 | 151 | 152 | r = ctparse("beer every friday 4-6:30pm") 153 | r.resolution 154 | Recurring[5-26]{WEEKLY 1 2022-11-25 16:00 (X/X) 2022-11-25 18:30 (X/X)} 155 | 156 | 157 | ``+`` **performance improvements** 158 | 159 | 160 | Base Capabilities 161 | ---------- 162 | | **Time** 163 | 164 | .. code:: python 165 | 166 | "beer thursday 4" 167 | Time[5-15]{2021-05-13 16:00 (X/X)} 168 | 169 | 170 | | **Interval** 171 | 172 | .. code:: python 173 | 174 | "beer 4-6" 175 | Interval[0-0]{2021-05-09 16:00 (X/X) - 2021-05-09 18:00 (X/X)} 176 | 177 | 178 | | **Duration** 179 | 180 | .. code:: python 181 | 182 | "beer in 4 hours" 183 | Duration[5-15]{4 hours} 184 | 185 | 186 | Ctparse 187 | ---------- 188 | 189 | The package ``ctparse`` is a pure python package to parse time 190 | expressions from natural language (i.e. strings). In many ways it builds 191 | on similar concepts as Facebook’s ``duckling`` package 192 | (https://github.com/facebook/duckling). However, for the time being it 193 | only targets times and only German and English text. 194 | 195 | In principle ``ctparse`` can be used to **detect** time expressions in a 196 | text, however its main use case is the semantic interpretation of such 197 | expressions. Detecting time expressions in the first place can - to our 198 | experience - be done more efficiently (and precisely) using e.g. CRFs or 199 | other models targeted at this specific task. 200 | 201 | ``ctparse`` is designed with the use case in mind where interpretation 202 | of time expressions is done under the following assumptions: 203 | 204 | - All expressions are relative to some pre-defined reference times 205 | - Unless explicitly specified in the time expression, valid resolutions 206 | are in the future relative to the reference time (i.e. ``12.5.`` will 207 | be the next 12th of May, but ``12.5.2012`` should correctly resolve 208 | to the 12th of May 2012). 209 | - If in doubt, resolutions in the near future are more likely than 210 | resolutions in the far future (not implemented yet, but any 211 | resolution more than i.e. 3 month in the future is extremely 212 | unlikely). 213 | 214 | The specific comtravo use-case is resolving time expressions in booking 215 | requests which almost always refer to some point in time within the next 216 | 4-8 weeks. 217 | 218 | ``ctparse`` currently is language agnostic and supports German and 219 | English expressions. This might get an extension in the future. The main 220 | reason is that in real world communication more often than not people 221 | write in one language (their business language) but use constructs to 222 | express times that are based on their mother tongue and/or what they 223 | believe to be the way to express dates in the target language. This 224 | leads to text in German with English time expressions and vice-versa. 225 | Using a language detection upfront on the complete original text is for 226 | obvious no solution - rather it would make the problem worse. 227 | 228 | Example 229 | ------- 230 | 231 | .. code:: python 232 | 233 | from ctparse import ctparse 234 | from datetime import datetime 235 | 236 | # Set reference time 237 | ts = datetime(2018, 3, 12, 14, 30) 238 | ctparse('May 5th 2:30 in the afternoon', ts=ts) 239 | 240 | This should return a ``Time`` object represented as 241 | ``Time[0-29]{2018-05-05 14:30 (X/X)}``, indicating that characters 242 | ``0-29`` were used in the resolution, that the resolved date time is the 243 | 5th of May 2018 at 14:30 and that this resolution is neither based on a 244 | day of week (first ``X``) nor a part of day (second ``X``). 245 | 246 | 247 | Latent time 248 | ~~~~~~~~~~~ 249 | 250 | Normally, ``ctparse`` will anchor time expressions to the reference time. 251 | For example, when parsing the time expression ``8:00 pm``, ctparse will 252 | resolve the expression to 8 pm after the reference time as follows 253 | 254 | .. code:: python 255 | 256 | parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=True) # default 257 | # parse.resolution -> Time(2020, 1, 1, 20, 00) 258 | 259 | This behavior can be customized using the option ``latent_time=False``, which will 260 | return a time resolution not anchored to a particular date 261 | 262 | .. code:: python 263 | 264 | parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=False) 265 | # parse.resolution -> Time(None, None, None, 20, 00) 266 | 267 | Implementation 268 | -------------- 269 | 270 | ``ctparse`` - as ``duckling`` - is a mixture of a rule and regular 271 | expression based system + some probabilistic modeling. In this sense it 272 | resembles a PCFG. 273 | 274 | Rules 275 | ~~~~~ 276 | 277 | At the core ``ctparse`` is a collection of production rules over 278 | sequences of regular expressions and (intermediate) productions. 279 | 280 | Productions are either of type ``Time``, ``Interval``, ``Duration`` or ``Recurring`` and can 281 | have certain predicates (e.g. whether a ``Time`` is a part of day like 282 | ``'afternoon'``). 283 | 284 | A typical rule than looks like this: 285 | 286 | .. code:: python 287 | 288 | @rule(predicate('isDate'), dimension(Interval)) 289 | 290 | I.e. this rule is applicable when the intermediate production resulted 291 | in something that has a date, followed by something that is in interval 292 | (like e.g. in ``'May 5th 9-10'``). 293 | 294 | The actual production is a python function with the following signature: 295 | 296 | .. code:: python 297 | 298 | @rule(predicate('isDate'), dimension(Interval)) 299 | def ruleDateInterval(ts, d, i): 300 | """ 301 | param ts: datetime - the current refenrence time 302 | d: Time - a time that contains at least a full date 303 | i: Interval - some Interval 304 | """ 305 | if not (i.t_from.isTOD and i.t_to.isTOD): 306 | return None 307 | return Interval( 308 | t_from=Time(year=d.year, month=d.month, day=d.day, 309 | hour=i.t_from.hour, minute=i.t_from.minute), 310 | t_to=Time(year=d.year, month=d.month, day=d.day, 311 | hour=i.t_to.hour, minute=i.t_to.minute)) 312 | 313 | This production will return a new interval at the date of 314 | ``predicate('isDate')`` spanning the time coded in 315 | ``dimension(Interval)``. If the latter does code for something else than 316 | a time of day (TOD), no production is returned, e.g. the rule matched 317 | but failed. 318 | 319 | 320 | Technical Background 321 | ~~~~~~~~~~~~~~~~~~~~ 322 | 323 | Some observations on the problem: 324 | 325 | - Each rule is a combination of regular expressions and productions. 326 | - Consequently, each production must originate in a sequence of regular 327 | expressions that must have matched (parts of) the text. 328 | - Hence, only subsequence of **all** regular expressions in **all** 329 | rules can lead to a successful production. 330 | 331 | To this end the algorithm proceeds as follows: 332 | 333 | 1. Input a string and a reference time 334 | 2. Find all matches of all regular expressions from all rules in the 335 | input strings. Each regular expression is assigned an identifier. 336 | 3. Find all distinct sequences of these matches where two matches do not 337 | overlap nor have a gap inbetween 338 | 4. To each such subsequence apply all rules at all possible positions 339 | until no further rules can be applied - in which case one solution is 340 | produced 341 | 342 | Obviously, not all sequences of matching expressions and not all 343 | sequences of rules applied on top lead to meaningful results. Here the 344 | **P**\ CFG kicks in: 345 | 346 | - Based on example data (``corpus.py``) a model is calibrated to 347 | predict how likely a production is to lead to a/the correct result. 348 | Instead of doing a breadth first search, the most promising 349 | productions are applied first. 350 | - Resolutions are produced until there are no more resolutions or a 351 | timeout is hit. 352 | - Based on the same model from all resolutions the highest scoring is 353 | returned. 354 | 355 | 356 | .. _ctparse: https://github.com/comtravo/ctparse 357 | .. _rrule: https://dateutil.readthedocs.io/en/stable/rrule.html 358 | 359 | Credits 360 | ------- 361 | 362 | This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template. 363 | 364 | .. _Cookiecutter: https://github.com/audreyr/cookiecutter 365 | .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage 366 | -------------------------------------------------------------------------------- /ctparse/__init__.py: -------------------------------------------------------------------------------- 1 | """ctparse - parse time expressions in strings 2 | 3 | .. moduleauthor:: Comtravo 4 | 5 | """ 6 | __author__ = """Sebastian Mika""" 7 | __email__ = "sebastian.mika@comtravo.com" 8 | __version__ = "__version__ = '0.3.0'" 9 | 10 | from .ctparse import ctparse, ctparse_gen # noqa 11 | -------------------------------------------------------------------------------- /ctparse/corpus.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from datetime import datetime 4 | from typing import Callable, Iterable, List, NamedTuple, Sequence, Tuple, TypeVar, Union 5 | 6 | from tqdm import tqdm 7 | 8 | from .ctparse import ctparse_gen 9 | from .scorer import DummyScorer, Scorer 10 | from .types import Artifact, Duration, Interval, Time 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | # A triplet of text, reference timestamp and correct parse. 15 | # It can be used as raw data to build datasets for ctparse. 16 | TimeParseEntry = NamedTuple( 17 | "TimeParseEntry", [("text", str), ("ts", datetime), ("gold", Artifact)], 18 | ) 19 | 20 | T = TypeVar("T") 21 | 22 | 23 | def make_partial_rule_dataset( 24 | entries: Sequence[TimeParseEntry], 25 | scorer: Scorer, 26 | timeout: Union[float, int], 27 | max_stack_depth: int, 28 | relative_match_len: float = 1.0, 29 | progress: bool = False, 30 | ) -> Iterable[Tuple[List[str], bool]]: 31 | """Build a data set from an iterable of TimeParseEntry. 32 | 33 | The text is run through ctparse and all parses (within the specified timeout, 34 | max_stack_depth and scorer) are obtained. Each parse contains a sequence 35 | of rules (see ``CTParse.rules``) used to produce that parse. 36 | 37 | A dataset is generated by taking every possible partial rule and assigning to it 38 | a boolean indicating if that partial sequence did lead to a successful parse. 39 | 40 | If `progress` is ``True``, display a progress bar. 41 | 42 | Example: 43 | 44 | rule sequence: [r1, r2, r3] 45 | parse_is_correct: True 46 | 47 | [r1] -> True 48 | [r1, r2] -> True 49 | [r1, r2, r3] -> True 50 | """ 51 | # If we look at the signature for a scorer, the score is obtained from: 52 | # (text, reference_time, partial_parse) and optionally a production for a 53 | # partial parse. 54 | # Clearly, if we were to make a general scorer for the dataset, we would need 55 | # all of these features. It is possible to achieve that by tracking the list of 56 | # partial parses that led to a correct parse. Unfortunately we don't have the 57 | # full history with the current implementation, however we can obtain a dataset 58 | # of (text, reference_time, rule_ids) quite easily, because the rule is a linear 59 | # list. 60 | 61 | if progress: 62 | entries_it = _progress_bar( 63 | entries, 64 | total=len(entries), 65 | status_text=lambda entry: " {: <70}".format(entry.text), 66 | ) 67 | else: 68 | entries_it = entries 69 | 70 | for entry in entries_it: 71 | for parse in ctparse_gen( 72 | entry.text, 73 | entry.ts, 74 | relative_match_len=relative_match_len, 75 | timeout=timeout, 76 | max_stack_depth=max_stack_depth, 77 | scorer=scorer, 78 | latent_time=False, 79 | ): 80 | # TODO: we should make sure ctparse_gen never returns None. If there is no 81 | # result it should return an empty list 82 | if parse is None: 83 | continue 84 | 85 | y = parse.resolution == entry.gold 86 | # Build data set, one sample for each applied rule in 87 | # the sequence of rules applied in this production 88 | # *after* the matched regular expressions 89 | for i in range(1, len(parse.production) + 1): 90 | X = [str(p) for p in parse.production[:i]] 91 | yield X, y 92 | 93 | 94 | def _progress_bar( 95 | it: Iterable[T], total: int, status_text: Callable[[T], str] 96 | ) -> Iterable[T]: 97 | # Progress bar that can update text 98 | pbar = tqdm(it, total=total) 99 | for val in pbar: 100 | pbar.set_description(status_text(val)) 101 | yield val 102 | 103 | 104 | def load_timeparse_corpus(fname: str) -> Sequence[TimeParseEntry]: 105 | """Load a corpus from disk. 106 | 107 | For more information about the format of the time parse corpus, 108 | refer to the documentation. 109 | """ 110 | with open(fname, "r", encoding="utf-8") as fd: 111 | entries = json.load(fd) 112 | 113 | return [ 114 | TimeParseEntry( 115 | text=e["text"], 116 | ts=datetime.strptime(e["ref_time"], "%Y-%m-%dT%H:%M:%S"), 117 | gold=parse_nb_string(e["gold_parse"]), 118 | ) 119 | for e in entries 120 | ] 121 | 122 | 123 | def parse_nb_string(gold_parse: str) -> Union[Time, Interval, Duration]: 124 | """Parse a Time, Interval or Duration from their no-bound string representation. 125 | 126 | The no-bound string representations are generated from ``Artifact.nb_str``. 127 | """ 128 | if gold_parse.startswith("Time"): 129 | return Time.from_str(gold_parse[7:-1]) 130 | if gold_parse.startswith("Interval"): 131 | return Interval.from_str(gold_parse[11:-1]) 132 | if gold_parse.startswith("Duration"): 133 | return Duration.from_str(gold_parse[11:-1]) 134 | else: 135 | raise ValueError("'{}' has an invalid format".format(gold_parse)) 136 | 137 | 138 | def run_corpus( 139 | corpus: Sequence[Tuple[str, str, Sequence[str]]] 140 | ) -> Tuple[List[List[str]], List[bool]]: 141 | """Load the corpus (currently hard coded), run it through ctparse with 142 | no timeout and no limit on the stack depth. 143 | 144 | The corpus passes if ctparse generates the desired solution for 145 | each test at least once. Otherwise it fails. 146 | 147 | While testing this, a labeled data set (X, y) is generated based 148 | on *all* productions. Given a final production p, based on initial 149 | regular expression matches r_0, ..., r_n, which are then 150 | subsequently transformed using production rules p_0, ..., p_m, 151 | will result in the samples 152 | 153 | [r_0, ..., r_n, p_0, 'step_0'] 154 | [r_0, ..., r_n, p_0, p_1, 'step_1'] 155 | ... 156 | [r_0, ..., r_n, p_0, ..., p_m, 'step_m'] 157 | 158 | All samples from one production are given the same label which indicates if 159 | the production was correct. 160 | 161 | To build a similar datasets without the strict checking, use 162 | `make_partial_rule_dataset` 163 | """ 164 | at_least_one_failed = False 165 | # pos_parses: number of parses that are correct 166 | # neg_parses: number of parses that are wrong 167 | # pos_first_parses: number of first parses generated that are correct 168 | # pos_best_scored: number of correct parses that have the best score 169 | pos_parses = neg_parses = pos_first_parses = pos_best_scored = 0 170 | total_tests = 0 171 | Xs = [] 172 | ys = [] 173 | for target, ts, tests in tqdm(corpus): 174 | ts = datetime.strptime(ts, "%Y-%m-%dT%H:%M") 175 | all_tests_pass = True 176 | for test in tests: 177 | one_prod_passes = False 178 | first_prod = True 179 | y_score = [] 180 | for parse in ctparse_gen( 181 | test, 182 | ts, 183 | relative_match_len=1.0, 184 | timeout=0, 185 | max_stack_depth=0, 186 | scorer=DummyScorer(), 187 | latent_time=False, 188 | ): 189 | assert parse is not None 190 | 191 | y = parse.resolution.nb_str() == target 192 | # Build data set, one sample for each applied rule in 193 | # the sequence of rules applied in this production 194 | # *after* the matched regular expressions 195 | for i in range(1, len(parse.production) + 1): 196 | Xs.append([str(p) for p in parse.production[:i]]) 197 | ys.append(y) 198 | 199 | one_prod_passes |= y 200 | pos_parses += int(y) 201 | neg_parses += int(not y) 202 | pos_first_parses += int(y and first_prod) 203 | first_prod = False 204 | y_score.append((parse.score, y)) 205 | if not one_prod_passes: 206 | logger.warning( 207 | 'failure: target "{}" never produced in "{}"'.format(target, test) 208 | ) 209 | pos_best_scored += int(max(y_score, key=lambda x: x[0])[1]) 210 | total_tests += len(tests) 211 | all_tests_pass &= one_prod_passes 212 | if not all_tests_pass: 213 | logger.warning('failure: "{}" not always produced'.format(target)) 214 | at_least_one_failed = True 215 | logger.info( 216 | "run {} tests on {} targets with a total of " 217 | "{} positive and {} negative parses (={})".format( 218 | total_tests, len(corpus), pos_parses, neg_parses, pos_parses + neg_parses 219 | ) 220 | ) 221 | logger.info( 222 | "share of correct parses in all parses: {:.2%}".format( 223 | pos_parses / (pos_parses + neg_parses) 224 | ) 225 | ) 226 | logger.info( 227 | "share of correct parses being produced first: {:.2%}".format( 228 | pos_first_parses / (pos_parses + neg_parses) 229 | ) 230 | ) 231 | logger.info( 232 | "share of correct parses being scored highest: {:.2%}".format( 233 | pos_best_scored / total_tests 234 | ) 235 | ) 236 | if at_least_one_failed: 237 | raise Exception("ctparse corpus has errors") 238 | return Xs, ys 239 | -------------------------------------------------------------------------------- /ctparse/count_vectorizer.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import Dict, Sequence, Tuple, Optional 3 | 4 | 5 | class CountVectorizer: 6 | def __init__(self, ngram_range: Tuple[int, int]): 7 | """Create new count vectorizer that also counts n-grams. 8 | 9 | A count vectorizer builds an internal vocabulary and embeds each input 10 | by counting for each term in the document how often it appearsin the vocabulary. 11 | Here also n-grams are considered to be part of the vocabulary and the document 12 | terms, respectively 13 | 14 | Parameters 15 | ---------- 16 | ngram_range : Tuple[int, int] 17 | n-gram range to consider 18 | """ 19 | self.ngram_range = ngram_range 20 | self.vocabulary: Optional[Dict[str, int]] = None 21 | 22 | @staticmethod 23 | def _create_ngrams( 24 | ngram_range: Tuple[int, int], documents: Sequence[Sequence[str]] 25 | ) -> Sequence[Sequence[str]]: 26 | """For each document in documents, replace original tokens by a list of 27 | all min_n:max_n = self.ngram_range ngrams in that document. 28 | 29 | Parameters 30 | ---------- 31 | ngram_range : Tuple[int, int] 32 | Min and max number of ngrams to generate 33 | 34 | documents : Sequence[Sequence[str]] 35 | A sequence of already tokenized documents 36 | 37 | Returns 38 | ------- 39 | Sequence[Sequence[str]] 40 | For each document all ngrams of tokens in the desired range 41 | """ 42 | min_n, max_n = ngram_range 43 | space_join = " ".join 44 | 45 | def _create(document: Sequence[str]) -> Sequence[str]: 46 | doc_len = len(document) 47 | doc_max_n = min(max_n, doc_len) + 1 48 | if min_n == 1: 49 | ngrams = list(document) 50 | min_nn = min_n + 1 51 | else: 52 | ngrams = [] 53 | min_nn = min_n 54 | 55 | for n in range(min_nn, doc_max_n): 56 | for i in range(0, doc_len - n + 1): 57 | ngrams.append(space_join(document[i : i + n])) 58 | return ngrams 59 | 60 | return [_create(d) for d in documents] 61 | 62 | @staticmethod 63 | def _get_feature_counts( 64 | ngram_range: Tuple[int, int], documents: Sequence[Sequence[str]] 65 | ) -> Sequence[Dict[str, int]]: 66 | """Count (ngram) features appearing in each document 67 | 68 | Parameters 69 | ---------- 70 | ngram_range : Tuple[int, int] 71 | Min and max number of ngrams to generate 72 | 73 | documents : Sequence[Sequence[str]] 74 | Sequence of documents tokenized as sequence of string 75 | 76 | Returns 77 | ------- 78 | Tuple[Sequence[Dict[str, int]], Set[str]] 79 | For each document a dictionary counting how often which feature appeared and 80 | a set of all features in all documents. Features are according to this 81 | vectorizers n-gram settings. 82 | """ 83 | documents = CountVectorizer._create_ngrams(ngram_range, documents) 84 | count_matrix = [] 85 | 86 | for document in documents: 87 | # This is 5x faster than using a build in Counter 88 | feature_counts: Dict[str, int] = defaultdict(int) 89 | for feature in document: 90 | feature_counts[feature] += 1 91 | count_matrix.append(feature_counts) 92 | return count_matrix 93 | 94 | @staticmethod 95 | def _build_vocabulary(count_matrix: Sequence[Dict[str, int]]) -> Dict[str, int]: 96 | """Build the vocabulary from feature counts 97 | 98 | Parameters 99 | ---------- 100 | count_matrix : Sequence[Dict[str, int]] 101 | Sequence of dicts with counts (values) per feature (keys) 102 | 103 | Returns 104 | ------- 105 | Dict[str, int] 106 | The vocabulary as {feature: index} pairs 107 | """ 108 | all_features = set() 109 | for feature_counts in count_matrix: 110 | for feature in feature_counts.keys(): 111 | all_features.add(feature) 112 | return {word: idx for idx, word in enumerate(sorted(all_features))} 113 | 114 | @staticmethod 115 | def _create_feature_matrix( 116 | vocabulary: Dict[str, int], count_matrix: Sequence[Dict[str, int]] 117 | ) -> Sequence[Dict[int, int]]: 118 | """Map counts of string features to numerical data (sparse maps of 119 | `{feature_index: count}`). Here `feature_index` is relative to the vocabulary of 120 | this vectorizer. 121 | 122 | Parameters 123 | ---------- 124 | vocabulary : Dict[str, int] 125 | Vocabulary with {feature: index} mappings 126 | 127 | count_matrix : Sequence[Dict[str, int]] 128 | Sequence of dictionaries with feature counts 129 | 130 | Returns 131 | ------- 132 | Sequence[Dict[int, int]] 133 | For each document a mapping of `feature_index` to a count how often this 134 | feature appeared in the document. 135 | """ 136 | len_vocab = len(vocabulary) 137 | count_vectors_matrix = [] 138 | # Build document frequency matrix 139 | for count_dict in count_matrix: 140 | doc_vector: Dict[int, int] = defaultdict(int) 141 | for word, cnt in count_dict.items(): 142 | idx = vocabulary.get(word, None) 143 | if idx is not None: 144 | doc_vector[idx] = cnt 145 | count_vectors_matrix.append(doc_vector) 146 | # add vocab length in first element 147 | count_vectors_matrix[0][len_vocab - 1] = count_vectors_matrix[0][len_vocab - 1] 148 | return count_vectors_matrix 149 | 150 | def fit(self, documents: Sequence[Sequence[str]]) -> "CountVectorizer": 151 | """Learn a vocabulary dictionary of all tokens in the raw documents. 152 | 153 | Parameters 154 | ---------- 155 | documents : Sequence[Sequence[str]] 156 | Sequence of documents, each as a sequence of tokens 157 | 158 | Returns 159 | ------- 160 | CountVectorizer 161 | The updated vectorizer, i.e. this updates the internal vocabulary 162 | """ 163 | self.fit_transform(documents) 164 | return self 165 | 166 | def fit_transform( 167 | self, documents: Sequence[Sequence[str]] 168 | ) -> Sequence[Dict[int, int]]: 169 | """Learn the vocabulary dictionary and return a term-document matrix. Updates 170 | the internal vocabulary state of the vectorizer. 171 | 172 | Parameters 173 | ---------- 174 | documents : Sequence[Sequence[str] 175 | Sequence of documents, each as a sequence of tokens 176 | 177 | Returns 178 | ------- 179 | Sequence[Dict[int, int]] 180 | Document-term matrix. 181 | """ 182 | count_matrix = CountVectorizer._get_feature_counts(self.ngram_range, documents) 183 | self.vocabulary = CountVectorizer._build_vocabulary(count_matrix) 184 | return CountVectorizer._create_feature_matrix(self.vocabulary, count_matrix) 185 | 186 | def transform(self, documents: Sequence[Sequence[str]]) -> Sequence[Dict[int, int]]: 187 | """Create term-document matrix based on pre-generated vocabulary. Does *not* 188 | update the internal state of the vocabulary. 189 | 190 | Parameters 191 | ---------- 192 | documents : Sequence[Sequence[str]] 193 | Sequence of documents, each as a sequence of tokens 194 | 195 | Returns 196 | ------- 197 | Sequence[Dict[int, int]] 198 | Document-term matrix. 199 | """ 200 | if not self.vocabulary: 201 | raise ValueError("no vocabulary - vectorizer not fitted?") 202 | count_matrix = CountVectorizer._get_feature_counts(self.ngram_range, documents) 203 | return CountVectorizer._create_feature_matrix(self.vocabulary, count_matrix) 204 | -------------------------------------------------------------------------------- /ctparse/ctparse.py: -------------------------------------------------------------------------------- 1 | from ctparse.time.postprocess_latent import apply_postprocessing_rules 2 | import logging 3 | from datetime import datetime 4 | from typing import ( 5 | cast, 6 | Callable, 7 | Dict, 8 | Iterator, 9 | List, 10 | Optional, 11 | Sequence, 12 | Tuple, 13 | Union, 14 | ) 15 | 16 | import re 17 | 18 | import regex 19 | from itertools import chain 20 | 21 | from .partial_parse import PartialParse 22 | from .rule import _regex as global_regex, eu_regex, us_regex 23 | from .scorer import Scorer 24 | from .timers import CTParseTimeoutError, timeit 25 | 26 | # Avoid collision with variable "timeout" 27 | from .timers import timeout as timeout_ 28 | from .types import Artifact, RegexMatch 29 | from .loader import load_default_scorer 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | _DEFAULT_SCORER = load_default_scorer() 34 | 35 | 36 | class CTParse: 37 | def __init__( 38 | self, 39 | resolution: Artifact, 40 | production: Tuple[Union[int, str], ...], 41 | score: float, 42 | subject: str, 43 | # labels: str, 44 | ) -> None: 45 | """A possible parse returned by ctparse. 46 | 47 | :param resolution: the parsed `Time`, `Interval` or `Duration` 48 | :param production: the sequence of rules (productions) used to arrive 49 | at the parse 50 | :param score: a numerical score used to rank parses. A high score means 51 | a more likely parse 52 | """ 53 | self.resolution = resolution 54 | self.production = production 55 | self.score = score 56 | self.subject = subject 57 | # self.labels = labels 58 | 59 | def __repr__(self) -> str: 60 | return "CTParse({}, {}, {}, {})".format( 61 | self.resolution, self.production, self.score, self.subject) #self.labels 62 | 63 | def __str__(self) -> str: 64 | return "{} s={:.3f} p={} sb={}".format(self.resolution, self.score, self.production, self.subject) 65 | #self.labels) 66 | 67 | 68 | def ctparse( 69 | txt: str, 70 | ts: Optional[datetime] = None, 71 | pm_bias: Optional[bool] = True, 72 | date_format: Optional[str] = None, 73 | fallback: Optional[bool] = False, 74 | timeout: Union[int, float] = 1.0, 75 | debug: bool = False, 76 | relative_match_len: float = 1.0, 77 | max_stack_depth: int = 10, 78 | scorer: Optional[Scorer] = None, 79 | latent_time: bool = True, 80 | ) -> Optional[CTParse]: 81 | """Parse a string *txt* into a time expression 82 | 83 | :param ts: reference time 84 | :type ts: datetime.datetime 85 | :param pm_bias: pm bias on or off / 24h or 12h format 86 | :param date_format: us / eu date format 87 | :param fallback: fallback option if default date format is not parsed 88 | :param timeout: timeout for parsing in seconds; timeout=0 89 | indicates no timeout 90 | :type timeout: float 91 | :param debug: if True do return iterator over all resolution, else 92 | return highest scoring one (default=False) 93 | :param relative_match_len: relative minimum share of 94 | characters an initial regex match sequence must 95 | cover compared to the longest such sequence found 96 | to be considered for productions (default=1.0) 97 | :type relative_match_len: float 98 | :param max_stack_depth: limit the maximal number of highest scored candidate 99 | productions considered for future productions 100 | (default=10); set to 0 to not limit 101 | :type max_stack_depth: int 102 | :param latent_time: if True, resolve expressions that contain only a time 103 | (e.g. 8:00 pm) to be the next matching time after 104 | reference time *ts* 105 | :returns: Optional[CTParse] 106 | """ 107 | parsed = ctparse_gen( 108 | txt, 109 | ts, 110 | pm_bias, 111 | date_format, 112 | fallback, 113 | timeout=timeout, 114 | relative_match_len=relative_match_len, 115 | max_stack_depth=max_stack_depth, 116 | scorer=scorer, 117 | latent_time=latent_time, 118 | ) 119 | 120 | # TODO: keep debug for back-compatibility, but remove it later 121 | if debug: 122 | return parsed # type: ignore 123 | else: 124 | parsed_list = list(parsed) 125 | # TODO: this way of testing a failure to find a match is a bit clunky with types 126 | if len(parsed_list) == 0 or (len(parsed_list) == 1 and parsed_list[0] is None): 127 | # logger.warning('Failed to produce result for "{}"'.format(txt)) 128 | # labels = _get_labels(txt) 129 | txt = re.sub('#[a-zA-Z0-9\-_:/.]+', '', txt).strip() 130 | subject = txt 131 | return CTParse(None, None, None, subject)# labels) 132 | parsed_list.sort(key=lambda p: p.score) # type: ignore 133 | return parsed_list[-1] 134 | 135 | 136 | def ctparse_gen( 137 | txt: str, 138 | ts: Optional[datetime] = None, 139 | pm_bias: Optional[bool] = True, 140 | date_format: Optional[str] = None, 141 | fallback: Optional[bool] = False, 142 | timeout: Union[int, float] = 1.0, 143 | relative_match_len: float = 1.0, 144 | max_stack_depth: int = 10, 145 | scorer: Optional[Scorer] = None, 146 | latent_time: bool = True, 147 | ) -> Iterator[Optional[CTParse]]: 148 | """Generate parses for the string *txt*. 149 | 150 | This function is equivalent to ctparse, with the exception that it returns an 151 | iterator over the matches as soon as they are produced. 152 | """ 153 | if scorer is None: 154 | scorer = _DEFAULT_SCORER 155 | if ts is None: 156 | ts = datetime.now() 157 | 158 | 159 | generated_parse = list(_ctparse( 160 | _preprocess_string(txt), 161 | ts, 162 | pm_bias, 163 | date_format, 164 | timeout=timeout, 165 | relative_match_len=relative_match_len, 166 | max_stack_depth=max_stack_depth, 167 | scorer=scorer, 168 | )) 169 | 170 | if fallback and not generated_parse: 171 | if date_format == "US": 172 | fallback_date_format = "EU" 173 | else: 174 | fallback_date_format = "US" 175 | 176 | for parse in _ctparse( 177 | _preprocess_string(txt), 178 | ts, 179 | pm_bias, 180 | date_format=fallback_date_format, 181 | timeout=timeout, 182 | relative_match_len=relative_match_len, 183 | max_stack_depth=max_stack_depth, 184 | scorer=scorer, 185 | ): 186 | if parse and latent_time: 187 | # NOTE: we post-process after scoring because the model has been trained 188 | # without using the latent time. This means also that the post processing 189 | # step won't be added to the rules 190 | prod = apply_postprocessing_rules(ts, parse.resolution) 191 | parse.resolution = prod 192 | 193 | yield parse 194 | 195 | else: 196 | for parse in generated_parse: 197 | if parse and latent_time: 198 | # NOTE: we post-process after scoring because the model has been trained 199 | # without using the latent time. This means also that the post processing 200 | # step won't be added to the rules 201 | prod = apply_postprocessing_rules(ts, parse.resolution) 202 | parse.resolution = prod 203 | 204 | yield parse 205 | 206 | # for parse in _ctparse( 207 | # _preprocess_string(txt), 208 | # ts, 209 | # pm_bias, 210 | # date_format, 211 | # timeout=timeout, 212 | # relative_match_len=relative_match_len, 213 | # max_stack_depth=max_stack_depth, 214 | # scorer=scorer, 215 | # ): 216 | # if parse and latent_time: 217 | # # NOTE: we post-process after scoring because the model has been trained 218 | # # without using the latent time. This means also that the post processing 219 | # # step won't be added to the rules 220 | # prod = apply_postprocessing_rules(ts, parse.resolution) 221 | # parse.resolution = prod 222 | # 223 | # yield parse 224 | 225 | 226 | def _ctparse( 227 | txt: str, 228 | ts: datetime, 229 | pm_bias: bool, 230 | date_format: str, 231 | timeout: float, 232 | relative_match_len: float, 233 | max_stack_depth: int, 234 | scorer: Scorer, 235 | ) -> Iterator[Optional[CTParse]]: 236 | t_fun = timeout_(timeout) 237 | 238 | try: 239 | # =========== Label extraction =========== 240 | # labels = _get_labels(txt) 241 | # clear raw text of labels so what follows works properly 242 | # txt = re.sub('#[a-zA-Z0-9\-_:/.]+','', txt).strip() 243 | 244 | logger.debug("=" * 80) 245 | logger.debug("-> matching regular expressions") 246 | 247 | scope_regex = {**global_regex, **us_regex} if date_format == 'US' else {**global_regex, **eu_regex} 248 | p, _tp = timeit(_match_regex)(txt, scope_regex) 249 | logger.debug("time in _match_regex: {:.0f}ms".format(1000 * _tp)) 250 | 251 | logger.debug("=" * 80) 252 | logger.debug("-> building initial stack") 253 | regex_stack, _ts = timeit(_regex_stack)(txt, p, t_fun) 254 | logger.debug("time in _regex_stack: {:.0f}ms".format(1000 * _ts)) 255 | 256 | # add empty production path + counter of contained regex 257 | stack = [PartialParse.from_regex_matches(s) for s in regex_stack] 258 | # TODO: the score should be kept separate from the partial parse 259 | # because it depends also on the text and the ts. A good idea is 260 | # to create a namedtuple of kind StackElement(partial_parse, score) 261 | for pp in stack: 262 | pp.score = scorer.score(txt, ts, pp) 263 | 264 | logger.debug("initial stack length: {}".format(len(stack))) 265 | # sort stack by length of covered string and - if that is equal - score 266 | # --> last element is longest coverage and highest scored 267 | stack.sort() 268 | # only keep initial stack elements that cover at least 269 | # relative_match_len characters of what the highest 270 | # scored/covering stack element does cover 271 | stack = [ 272 | s 273 | for s in stack 274 | if s.max_covered_chars >= stack[-1].max_covered_chars * relative_match_len 275 | ] 276 | 277 | logger.debug("stack length after relative match length: {}".format(len(stack))) 278 | # limit depth of stack 279 | stack = stack[-max_stack_depth:] 280 | logger.debug("stack length after max stack depth limit: {}".format(len(stack))) 281 | 282 | # ======================== SUBJECT-EXTRACTION ======================== 283 | # get subject by extracting regex stack from raw text 284 | regex_matches = [match.prod for match in stack] 285 | regex_matches = [product.match.captures() for tuple in regex_matches for product in tuple] 286 | regex_matches = [match.split() for i in regex_matches for match in i] 287 | regex_matches = list(chain.from_iterable(regex_matches)) 288 | 289 | # "acr-11" edge case 290 | s = re.search(r'\b[A-Za-z]+\b-\d+[A-Za-z]*', txt) 291 | if s: 292 | raw = re.split(r'[\s]+', txt) 293 | else: 294 | raw = re.split(r'[\s-]+', txt) 295 | 296 | # subject = list(set(raw) - set(matches)) # doesn't preserve order, but more efficient 297 | subject = [i for i in raw if i not in regex_matches] 298 | subject = ' '.join(subject) 299 | 300 | # remove subject from txt so there's no FP parse 301 | if subject and subject in txt: 302 | txt = subject.replace(txt, '') 303 | 304 | # reset stack if txt is 0 305 | if len(txt) == 0: 306 | stack = [] 307 | # =========================================================== 308 | 309 | # track what has been added to the stack and do not add again 310 | # if the score is not better 311 | stack_prod = {} # type: Dict[Tuple[Artifact, ...], float] 312 | # track what has been emitted and do not emit again 313 | parse_prod = {} # type: Dict[Artifact, float] 314 | while stack: 315 | t_fun() 316 | s = stack.pop() 317 | logger.debug("-" * 80) 318 | logger.debug("producing on {}, score={:.2f}".format(s.prod, s.score)) 319 | new_stack_elements = [] 320 | for r_name, r in s.applicable_rules.items(): 321 | for r_match in _match_rule(s.prod, r[1]): 322 | # apply production part of rule 323 | new_s = s.apply_rule(ts, pm_bias, date_format, r[0], r_name, r_match) 324 | 325 | # TODO: We should store scores separately from the production itself 326 | # because the score may depend on the text and the ts 327 | if new_s is not None: 328 | new_s.score = scorer.score(txt, ts, new_s) 329 | 330 | if ( 331 | new_s 332 | and stack_prod.get(new_s.prod, new_s.score - 1) < new_s.score 333 | ): 334 | # either new_s.prod has never been produced 335 | # before or the score of new_s is higher than 336 | # a previous identical production 337 | new_stack_elements.append(new_s) 338 | logger.debug( 339 | " {} -> {}, score={:.2f}".format( 340 | r_name, new_s.prod, new_s.score 341 | ) 342 | ) 343 | stack_prod[new_s.prod] = new_s.score 344 | if not new_stack_elements: 345 | logger.debug("~" * 80) 346 | logger.debug("no rules applicable: emitting") 347 | # no new productions were generated from this stack element. 348 | # emit all (probably partial) production 349 | for x in s.prod: 350 | if not isinstance(x, RegexMatch): 351 | # TODO: why do we have a different method for scoring 352 | # final productions? This is because you may have non-reducible 353 | # parses of the kind [Time, RegexMatch, Interval] or 354 | # [Time, Time] etc. In this case we want to emit those Time, 355 | # Interval parses separately and score them appropriately 356 | # (the default Scorer.score function only operates on the 357 | # whole PartialParse). 358 | score_x = scorer.score_final(txt, ts, s, x) 359 | # only emit productions not emitted before or 360 | # productions emitted before but scored higher 361 | if parse_prod.get(x, score_x - 1) < score_x: 362 | parse_prod[x] = score_x 363 | logger.debug( 364 | " => {}, score={:.2f}, ".format(x.__repr__(), score_x) 365 | ) 366 | yield CTParse(x, s.rules, score_x, subject)#, labels) 367 | else: 368 | # new productions generated, put on stack and sort 369 | # stack by highst score 370 | stack.extend(new_stack_elements) 371 | stack.sort() 372 | stack = stack[-max_stack_depth:] 373 | logger.debug( 374 | "added {} new stack elements, depth after trunc: {}".format( 375 | len(new_stack_elements), len(stack) 376 | ) 377 | ) 378 | except CTParseTimeoutError: 379 | logger.debug('Timeout on "{}"'.format(txt)) 380 | return 381 | 382 | 383 | # replace all comma, semicolon, whitespace, invisible control, opening and 384 | # closing brackets 385 | # _repl1 = regex.compile(r"[,;\pZ\pC\p{Ps}\p{Pe}]+", regex.VERSION1) # original regex 386 | _repl1 = regex.compile(r"[\pZ\pC]+", regex.VERSION1) # allow brackets 387 | _repl2 = regex.compile(r"(\p{Pd}|[\u2010-\u2015]|\u2043)+", regex.VERSION1) 388 | 389 | 390 | def _get_labels(txt: str) -> str: 391 | labels = re.findall('#[a-zA-Z0-9\-_:/.]+', txt) 392 | labels = [label.replace("#", "") for label in labels] 393 | return labels 394 | 395 | 396 | def _preprocess_string(txt: str) -> str: 397 | return cast( 398 | str, _repl2.sub("-", _repl1.sub(" ", txt, concurrent=True).strip()).strip() 399 | ) 400 | 401 | 402 | def _match_rule( 403 | seq: Sequence[Artifact], rule: Sequence[Callable[[Artifact], bool]] 404 | ) -> Iterator[Tuple[int, int]]: 405 | if not seq: 406 | return 407 | if not rule: 408 | return 409 | i_r = 0 410 | i_s = 0 411 | r_len = len(rule) 412 | s_len = len(seq) 413 | while i_s < s_len: 414 | if rule[0](seq[i_s]): 415 | i_start = i_s + 1 416 | i_r = 1 417 | while i_start < s_len and i_r < r_len and rule[i_r](seq[i_start]): 418 | i_r += 1 419 | i_start += 1 420 | if i_r == r_len: 421 | yield i_s, i_start 422 | i_s += 1 423 | 424 | 425 | def _match_regex(txt: str, regexes: Dict[int, regex.Regex]) -> List[RegexMatch]: 426 | # Match a collection of regexes in *txt* 427 | # 428 | # The returned RegexMatch objects are sorted by the start of the match 429 | # :param txt: the text to match against 430 | # :param regexes: a collection of regexes name->pattern 431 | # :return: a list of RegexMatch objects ordered my RegexMatch.mstart 432 | matches = { 433 | RegexMatch(name, m) 434 | for name, re in regexes.items() 435 | for m in re.finditer(txt, overlapped=True, concurrent=True) 436 | } 437 | for m in matches: 438 | logger.debug("regex: {}".format(m.__repr__())) 439 | return sorted(matches, key=lambda x: (x.mstart, x.mend)) 440 | 441 | 442 | def _regex_stack( 443 | txt: str, 444 | regex_matches: List[RegexMatch], 445 | on_do_iter: Callable[[], None] = lambda: None, 446 | ) -> List[Tuple[RegexMatch, ...]]: 447 | # Group contiguous RegexMatch objects together. 448 | # 449 | # Assumes that regex_matches are sorted by increasing start index. on_do_iter 450 | # is a callback that will be invoked every time the algorithm performs a loop. 451 | # 452 | # Example: 453 | # Say you have the following text, where the regex matches are the 454 | # words between square brackets. 455 | # 456 | # [Tomorrow] I want to go to the movies between [2] [pm] and [5] [pm]. 457 | # 458 | # This function will return the matches that are contiguous (excluding space 459 | # characters) 460 | # [Tomorrow] 461 | # [2], [pm] 462 | # [5], [pm] 463 | # 464 | # This also works with overlapping matches. 465 | # 466 | # Algo: 467 | # * initialize an empty stack 468 | # 469 | # * add all sequences of one expression to the stack, excluding 470 | # expressions which can be reached from "earlier" expression 471 | # (i.e. there is no gap between them): 472 | # 473 | # - say A and B have no gap in between and all sequences starting 474 | # at A have already been produced. These by definition(which?: -) include as 475 | # sub-sequences all sequences starting at B. Any other sequences starting 476 | # at B directly will not add valid variations, as each of them could be 477 | # prefixed with a sequence starting at A 478 | # 479 | # * while the stack is not empty: 480 | # 481 | # * get top sequence s from stack 482 | # 483 | # * generate all possible continuations for this sequence, 484 | # i.e. sequences where expression can be appended to the last 485 | # element s[-1] in s and put these extended sequences on the stack 486 | # 487 | # * if no new continuation could be generated for s, this sequence of 488 | # RegexMatch is appended to the list of results. 489 | 490 | prods = [] 491 | n_rm = len(regex_matches) 492 | # Calculate the upper triangle of an n_rm x n_rm matrix M where 493 | # M[i, j] == 1 (for i avoid use of numpy here; since we need column sums below, 501 | # --> the representation of M is columns major, i.e. M[i] is the i-th 502 | # --> column; M[i, j] then basically becomes M[j][i] 503 | M = [[0 for _ in range(n_rm)] for _ in range(n_rm)] 504 | 505 | _separator_regex = regex.compile(r"\s*", regex.VERSION1) 506 | 507 | def get_m_dist(m1: RegexMatch, m2: RegexMatch) -> int: 508 | # 1 if there is no relevant gap between m1 and m2, 0 otherwise 509 | # assumes that m1 and m2 are sorted be their start index 510 | if m2.mstart < m1.mend: 511 | return 0 # Overlap 512 | gap_match = _separator_regex.fullmatch(txt[m1.mend : m2.mstart]) 513 | if gap_match: 514 | return 1 # No Gap 515 | else: 516 | return 0 # Gap 517 | 518 | for i in range(n_rm): 519 | for j in range(i + 1, n_rm): 520 | M[j][i] = get_m_dist(regex_matches[i], regex_matches[j]) 521 | 522 | # NOTE(glanaro): I believe this means that this is a beginning node. 523 | # why reversed? 524 | stack = [ 525 | (i,) for i in reversed(range(n_rm)) if sum(M[i]) == 0 526 | ] # type: List[Tuple[int, ...]] 527 | while stack: 528 | on_do_iter() 529 | s = stack.pop() 530 | i = s[-1] 531 | new_prod = False 532 | for j in range(i + 1, n_rm): 533 | if M[j][i] == 1: 534 | stack.append(s + (j,)) 535 | new_prod = True 536 | if not new_prod: 537 | prod = tuple(regex_matches[i] for i in s) 538 | logger.debug("regex stack {}".format(prod)) 539 | prods.append(prod) 540 | return prods 541 | -------------------------------------------------------------------------------- /ctparse/loader.py: -------------------------------------------------------------------------------- 1 | """Utility to load default model in ctparse""" 2 | 3 | import bz2 4 | import logging 5 | import os 6 | import pickle 7 | from .scorer import Scorer, DummyScorer 8 | from .nb_scorer import NaiveBayesScorer 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | # Location of the default model, included with ctparse 13 | DEFAULT_MODEL_FILE = os.path.join(os.path.dirname(__file__), "models", "model.pbz") 14 | 15 | 16 | def load_default_scorer() -> Scorer: 17 | resource = 'model.pbz' 18 | 19 | path = os.path.join(os.path.dirname(__file__), resource) 20 | 21 | # logger.warning(path) 22 | # debug 23 | # logger.warning([x.name for x in pkgutil.walk_packages()]) 24 | 25 | # for exec usage 26 | if os.access(path, mode=os.F_OK): 27 | # d = os.path.dirname(sys.modules[package].__file__) 28 | # logger.warning(os.path.join(d, resource)) 29 | with bz2.open(path, 'rb') as f: 30 | # logger.warning(str(f)) 31 | mdl = pickle.load(f) 32 | return NaiveBayesScorer(mdl) 33 | # for non-exec usage 34 | elif os.path.exists(DEFAULT_MODEL_FILE): 35 | logger.info("Loading model from {} for non-exec usage".format(DEFAULT_MODEL_FILE)) 36 | with bz2.open(DEFAULT_MODEL_FILE, "rb") as fd: 37 | mdl = pickle.load(fd) 38 | return NaiveBayesScorer(mdl) 39 | else: 40 | logger.warning("No model found, initializing empty scorer") 41 | return DummyScorer() 42 | -------------------------------------------------------------------------------- /ctparse/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .dummy import model_package_init 2 | -------------------------------------------------------------------------------- /ctparse/models/dummy.py: -------------------------------------------------------------------------------- 1 | def model_package_init(): 2 | return 1 == 1 3 | -------------------------------------------------------------------------------- /ctparse/models/model.pbz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Acreom/quickadd/69543c79ad5db05a712abf223940fadf61740235/ctparse/models/model.pbz -------------------------------------------------------------------------------- /ctparse/nb_estimator.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Dict, Tuple, List 2 | from math import log, exp 3 | 4 | 5 | def _log_sum_exp(x: Sequence[float]) -> float: 6 | max_value = max(x) 7 | sum_of_exp = sum(exp(x_i - max_value) for x_i in x) 8 | return max_value + log(sum_of_exp) 9 | 10 | 11 | class MultinomialNaiveBayes: 12 | """Implements a multinomial naive Bayes classifier. For background information 13 | (and what has inspired this, see e.g. https://scikit-learn.org/stable/... 14 | ...modules/generated/sklearn.naive_bayes.MultinomialNB.html) 15 | """ 16 | 17 | def __init__(self, alpha: float = 1.0): 18 | """Create new un-trained model 19 | 20 | Parameters 21 | ---------- 22 | alpha : Optional[float] 23 | Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing), 24 | defaults to 1.0 25 | """ 26 | self.alpha = alpha 27 | self.class_prior = (0.0, 0.0) 28 | self.log_likelihood: Dict[str, List[float]] = {} 29 | 30 | @staticmethod 31 | def _construct_log_class_prior(y: Sequence[int]) -> Tuple[float, float]: 32 | # Input classes are -1 and 1 33 | neg_class_count = sum(1 if y_i == -1 else 0 for y_i in y) 34 | pos_class_count = len(y) - neg_class_count 35 | 36 | neg_log_prior = log(neg_class_count / (pos_class_count + neg_class_count)) 37 | pos_log_prior = log(pos_class_count / (pos_class_count + neg_class_count)) 38 | return (neg_log_prior, pos_log_prior) 39 | 40 | @staticmethod 41 | def _construct_log_likelihood( 42 | X: Sequence[Dict[int, int]], y: Sequence[int], alpha: float 43 | ) -> Dict[str, List[float]]: 44 | # Token counts 45 | # implicit assumption from vectorizer: first element has count for #vocab 46 | # size set 47 | vocabulary_len = max(X[0].keys()) + 1 48 | token_counts_negative = [alpha] * vocabulary_len 49 | token_counts_positive = [alpha] * vocabulary_len 50 | for x, y_ in zip(X, y): 51 | for idx, cnt in x.items(): 52 | if y_ == 1: 53 | token_counts_positive[idx] += cnt 54 | else: 55 | token_counts_negative[idx] += cnt 56 | 57 | token_pos_class_sum = sum(token_counts_positive) 58 | token_neg_class_sum = sum(token_counts_negative) 59 | 60 | log_likelihood_negative = [] 61 | log_likelihood_positive = [] 62 | for token_ind in range(vocabulary_len): 63 | log_likelihood_positive.append( 64 | log(token_counts_positive[token_ind]) - log(token_pos_class_sum) 65 | ) 66 | 67 | log_likelihood_negative.append( 68 | log(token_counts_negative[token_ind]) - log(token_neg_class_sum) 69 | ) 70 | return { 71 | "negative_class": log_likelihood_negative, 72 | "positive_class": log_likelihood_positive, 73 | } 74 | 75 | def fit( 76 | self, X: Sequence[Dict[int, int]], y: Sequence[int] 77 | ) -> "MultinomialNaiveBayes": 78 | """Fit a naive Bayes model from a count of feature matrix 79 | 80 | Parameters 81 | ---------- 82 | X : Sequence[Dict[int, int]] 83 | Sequence of sparse {feature_index: count} dictionaries 84 | y : Sequence[int] 85 | Labels +1/-1 86 | 87 | Returns 88 | ------- 89 | MultinomialNaiveBayes 90 | The fitted model 91 | """ 92 | self.class_prior = self._construct_log_class_prior(y) 93 | self.log_likelihood = self._construct_log_likelihood(X, y, self.alpha) 94 | return self 95 | 96 | def predict_log_probability( 97 | self, X: Sequence[Dict[int, int]] 98 | ) -> Sequence[Tuple[float, float]]: 99 | """Calculate the posterior log probability of new sample X 100 | 101 | Parameters 102 | ---------- 103 | X : Sequence[Dict[int, int]] 104 | Sequence of data to predict on as sparse {feature_index: count} dictionarie 105 | 106 | Returns 107 | ------- 108 | Sequence[Tuple[float, float]] 109 | Tuple of (negative-class, positive-class) log likelihoods 110 | """ 111 | scores = [] 112 | for x in X: 113 | # Initialise the scores with priors of positive and negative class 114 | neg_score = self.class_prior[0] 115 | pos_score = self.class_prior[1] 116 | for idx, cnt in x.items(): 117 | pos_score += self.log_likelihood["positive_class"][idx] * cnt 118 | neg_score += self.log_likelihood["negative_class"][idx] * cnt 119 | joint_log_likelihood = [neg_score, pos_score] 120 | # Normalize the scores 121 | log_prob_x = _log_sum_exp(joint_log_likelihood) 122 | scores.append((neg_score - log_prob_x, pos_score - log_prob_x)) 123 | return scores 124 | -------------------------------------------------------------------------------- /ctparse/nb_scorer.py: -------------------------------------------------------------------------------- 1 | """This module cointains the implementation of the scorer based on naive bayes.""" 2 | import bz2 3 | import math 4 | import pickle 5 | from datetime import datetime 6 | from typing import Sequence 7 | 8 | from ctparse.nb_estimator import MultinomialNaiveBayes 9 | from ctparse.count_vectorizer import CountVectorizer 10 | from ctparse.pipeline import CTParsePipeline 11 | from .scorer import Scorer 12 | from .partial_parse import PartialParse 13 | from .types import Artifact 14 | 15 | 16 | class NaiveBayesScorer(Scorer): 17 | def __init__(self, nb_model: CTParsePipeline) -> None: 18 | """Scorer based on a naive bayes estimator. 19 | 20 | This scorer models the probability of having a correct parse, conditioned 21 | on the sequence of rules (expressed as a categorical feature) that led to 22 | that parse. 23 | 24 | The score is also modified by a "length" factor that penalizes parses that 25 | cover a smaller part of the text string. 26 | 27 | :param nb_model: 28 | A scikit-learn style Estimator that was trained on a corpus that takes 29 | a Sequence[Sequence[str]] as X (each entry is a sequence of rule 30 | identifiers) and a Sequence[int] in the set {-1, 1} that indicates if 31 | the parse was correct or incorrect. 32 | """ 33 | self._model = nb_model 34 | 35 | @classmethod 36 | def from_model_file(cls, fname: str) -> "NaiveBayesScorer": 37 | with bz2.open(fname, "rb") as fd: 38 | return cls(pickle.load(fd)) 39 | 40 | def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float: 41 | # Penalty for partial matches 42 | max_covered_chars = partial_parse.prod[-1].mend - partial_parse.prod[0].mstart 43 | len_score = math.log(max_covered_chars / len(txt)) 44 | 45 | X = _feature_extractor(txt, ts, partial_parse) 46 | pred = self._model.predict_log_proba([X]) 47 | 48 | # NOTE: the prediction is log-odds, or logit 49 | model_score = pred[0][1] - pred[0][0] 50 | 51 | return model_score + len_score 52 | 53 | def score_final( 54 | self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact 55 | ) -> float: 56 | # The difference between the original score and final score is that in the 57 | # final score, the len_score is calculated based on the length of the final 58 | # production 59 | len_score = math.log(len(prod) / len(txt)) 60 | 61 | X = _feature_extractor(txt, ts, partial_parse) 62 | pred = self._model.predict_log_proba([X]) 63 | 64 | # NOTE: the prediction is log-odds, or logit 65 | model_score = pred[0][1] - pred[0][0] 66 | 67 | # We want the len_score to always take precedence. I believe a logit won't go up 68 | # more than 1000. A better way would be to return an ordering tuple instead, 69 | # but then we would need to change many interfaces. 70 | return model_score + 1000 * len_score 71 | 72 | 73 | def _feature_extractor( 74 | txt: str, ts: datetime, partial_parse: PartialParse 75 | ) -> Sequence[str]: 76 | return [str(r) for r in partial_parse.rules] 77 | 78 | 79 | def train_naive_bayes(X: Sequence[Sequence[str]], y: Sequence[bool]) -> CTParsePipeline: 80 | """Train a naive bayes model for NaiveBayesScorer""" 81 | y_binary = [1 if y_i else -1 for y_i in y] 82 | # Create and train the pipeline 83 | pipeline = CTParsePipeline( 84 | CountVectorizer(ngram_range=(1, 3)), MultinomialNaiveBayes(alpha=1.0) 85 | ) 86 | model = pipeline.fit(X, y_binary) 87 | return model 88 | 89 | 90 | def save_naive_bayes(model: CTParsePipeline, fname: str) -> None: 91 | """Save a naive bayes model for NaiveBayesScorer""" 92 | # TODO: version this model and dump metadata with lots of information 93 | with bz2.open(fname, "wb") as fd: 94 | pickle.dump(model, fd) 95 | -------------------------------------------------------------------------------- /ctparse/partial_parse.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | from typing import ( 4 | Callable, 5 | Optional, 6 | Sequence, 7 | Tuple, 8 | TypeVar, 9 | Union, 10 | Dict, 11 | List, 12 | Generator, 13 | ) 14 | 15 | from .rule import rules as global_rules, ProductionRule, Predicate 16 | from .timers import timeit 17 | from .types import Artifact, RegexMatch 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | T = TypeVar("T") 22 | 23 | 24 | class PartialParse: 25 | def __init__( 26 | self, prod: Tuple[Artifact, ...], rules: Tuple[Union[int, str], ...] 27 | ) -> None: 28 | """A data structure representing a partial parse. 29 | 30 | 31 | * prod: the current partial production 32 | * rules: the sequence of regular expressions and rules used/applied to produce 33 | prod 34 | * score: the score assigned to this production 35 | """ 36 | if len(prod) < 1: 37 | raise ValueError("prod should have at least one element") 38 | 39 | self.prod = prod 40 | self.rules = rules 41 | self.applicable_rules = global_rules 42 | self.max_covered_chars = self.prod[-1].mend - self.prod[0].mstart 43 | self.score = 0.0 44 | 45 | @classmethod 46 | def from_regex_matches( 47 | cls, regex_matches: Tuple[RegexMatch, ...] 48 | ) -> "PartialParse": 49 | """Create partial production from a series of RegexMatch 50 | 51 | This usually is called when no production rules (with the exception of 52 | regex matches) have been applied. 53 | 54 | """ 55 | se = cls(prod=regex_matches, rules=tuple(r.id for r in regex_matches)) 56 | 57 | logger.debug("=" * 80) 58 | logger.debug("-> checking rule applicability") 59 | # Reducing rules to only those applicable has no effect for 60 | # small stacks, but on larger there is a 10-20% speed 61 | # improvement 62 | se.applicable_rules, _ts = timeit(se._filter_rules)(global_rules) 63 | logger.debug( 64 | "of {} total rules {} are applicable in {}".format( 65 | len(global_rules), len(se.applicable_rules), se.prod 66 | ) 67 | ) 68 | logger.debug("time in _filter_rules: {:.0f}ms".format(1000 * _ts)) 69 | logger.debug("=" * 80) 70 | 71 | return se 72 | 73 | def apply_rule( 74 | self, 75 | ts: datetime, 76 | pm_bias: bool, 77 | date_format: str, 78 | rule: ProductionRule, 79 | rule_name: Union[str, int], 80 | match: Tuple[int, int], 81 | ) -> Optional["PartialParse"]: 82 | """Check whether the production in rule can be applied to this stack 83 | element. 84 | 85 | If yes, return a copy where this update is 86 | incorporated in the production, the record of applied rules 87 | and the score. 88 | 89 | :param ts: reference time 90 | :param pm_bias: bias option bool 91 | :param date_format: us / eu date format 92 | :param rule: a tuple where the first element is the production rule to apply 93 | :param rule_name: the name of the rule 94 | :param match: the start and end index of the parameters that the rule needs. 95 | """ 96 | prod = rule(ts, pm_bias, date_format, *self.prod[match[0]: match[1]]) 97 | 98 | if prod is not None: 99 | pp = PartialParse( 100 | prod=self.prod[: match[0]] + (prod,) + self.prod[match[1] :], 101 | rules=self.rules + (rule_name,), 102 | ) 103 | 104 | pp.applicable_rules = self.applicable_rules 105 | return pp 106 | else: 107 | return None 108 | 109 | def __lt__(self, other: "PartialParse") -> bool: 110 | """Sort stack elements by (a) the length of text they can 111 | (potentially) cover and (b) the score assigned to the 112 | production. 113 | 114 | a < b <=> a.max_covered_chars < b.max_covered_chars or 115 | (a.max_covered_chars <= b.max_covered_chars and a.score < b.score) 116 | """ 117 | return (self.max_covered_chars < other.max_covered_chars) or ( 118 | self.max_covered_chars == other.max_covered_chars 119 | and self.score < other.score 120 | ) 121 | 122 | def __repr__(self) -> str: 123 | return "PartialParse(prod={}, rules={}, score={})".format( 124 | repr(self.prod), repr(self.rules), repr(self.score) 125 | ) 126 | 127 | def _filter_rules( 128 | self, rules: Dict[str, Tuple[ProductionRule, List[Predicate]]] 129 | ) -> Dict[str, Tuple[ProductionRule, List[Predicate]]]: 130 | # find all rules that can be applied to the current prod sequence 131 | def _hasNext(it: Generator[List[int], None, None]) -> bool: 132 | try: 133 | next(it) 134 | return True 135 | except StopIteration: 136 | return False 137 | 138 | return { 139 | rule_name: r 140 | for rule_name, r in rules.items() 141 | if _hasNext(_seq_match(self.prod, r[1])) 142 | } 143 | 144 | 145 | def _seq_match( 146 | seq: Sequence[T], pat: Sequence[Callable[[T], bool]], offset: int = 0 147 | ) -> Generator[List[int], None, None]: 148 | # :param seq: a list of intermediate productions, either of type 149 | # RegexMatch or some other Artifact 150 | # 151 | # :param pat: a list of rule patterns to be matched, i.e. either a 152 | # RegexMatch or a callable 153 | # 154 | # Determine whether the pattern pat matches the sequence seq and 155 | # return a list of lists, where each sub-list contains those 156 | # indices where the RegexMatch objects in pat are located in seq. 157 | # 158 | # A pattern pat only matches seq, iff each RegexMatch in pat is in 159 | # seq in the same order and iff between two RegexMatches aligned 160 | # to seq there is at least one additional element in seq. Reason: 161 | # 162 | # * Rule patterns never have two consequitive RegexMatch objects. 163 | # 164 | # * Hence there must be some predicate/dimension between two 165 | # * RegexMatch objects. 166 | # 167 | # * For the whole pat to match there must then be at least one 168 | # element in seq that can product this intermediate bit 169 | # 170 | # If pat does not start with a RegexMatch then there must be at 171 | # least one element in seq before the first RegexMatch in pat that 172 | # is alignes on seq. Likewise, if pat does not end with a 173 | # RegexMatch, then there must be at least one additional element 174 | # in seq to match the last non-RegexMatch element in pat. 175 | # 176 | # STRONG ASSUMPTIONS ON ARGUMENTS: seq and pat do not contain 177 | # consequiteve elements which are both of type RegexMatch! Callers 178 | # obligation to ensure this! 179 | 180 | if not pat: 181 | # if pat is empty yield the empty match 182 | yield [] 183 | elif not seq or not pat: 184 | # if either seq or pat is empty there will be no match 185 | return 186 | elif pat[-1].__name__ != "_regex_match": 187 | # there must be at least one additional element in seq at the 188 | # end 189 | yield from _seq_match(seq[:-1], pat[:-1], offset) 190 | elif len(pat) > len(seq): 191 | # if pat is longer than seq it cannot match 192 | return 193 | else: 194 | p1 = pat[0] 195 | # if p1 is not a RegexMatch, then continue on next pat and 196 | # advance sequence by one 197 | if p1.__name__ != "_regex_match": 198 | yield from _seq_match(seq[1:], pat[1:], offset + 1) 199 | else: 200 | # Get number of RegexMatch in p 201 | n_regex = sum(1 for p in pat if p.__name__ == "_regex_match") 202 | # For each occurance of RegexMatch pat[0] in seq 203 | for iseq, s in enumerate(seq): 204 | # apply _regex_match check 205 | if p1(s): 206 | # for each match of pat[1:] in seq[iseq+1:], yield a result 207 | for subm in _seq_match(seq[iseq + 1 :], pat[1:], offset + iseq + 1): 208 | if len(subm) == n_regex - 1: 209 | # only yield if all subsequent RegexMatch 210 | # have been aligned! 211 | yield [iseq + offset] + subm 212 | -------------------------------------------------------------------------------- /ctparse/pipeline.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence, Tuple 2 | 3 | from .nb_estimator import MultinomialNaiveBayes 4 | from .count_vectorizer import CountVectorizer 5 | 6 | 7 | class CTParsePipeline: 8 | def __init__(self, transformer: CountVectorizer, estimator: MultinomialNaiveBayes): 9 | """Setup a pipeline of feature extraction and naive bayes. Overkill for what it 10 | does but leaves room to use different models/features in the future 11 | 12 | Parameters 13 | ---------- 14 | transformer : CountVectorizer 15 | feature extraction step 16 | estimator : MultinomialNaiveBayes 17 | naive bayes model 18 | """ 19 | self.transformer = transformer 20 | self.estimator = estimator 21 | 22 | def fit(self, X: Sequence[Sequence[str]], y: Sequence[int]) -> "CTParsePipeline": 23 | """Fit the transformer and then fit the Naive Bayes model on the transformed 24 | data 25 | 26 | Returns 27 | ------- 28 | CTParsePipeline 29 | Returns the fitted pipeline 30 | """ 31 | X_transformed = self.transformer.fit_transform(X) 32 | self.estimator = self.estimator.fit(X_transformed, y) 33 | return self 34 | 35 | def predict_log_proba( 36 | self, X: Sequence[Sequence[str]] 37 | ) -> Sequence[Tuple[float, float]]: 38 | """Apply the transforms and get probability predictions from the estimator 39 | 40 | Parameters 41 | ---------- 42 | X : Sequence[Sequence[str]] 43 | Sequence of documents, each as sequence of tokens. In ctparse case there are 44 | just the names of the regex matches and rules applied 45 | 46 | Returns 47 | ------- 48 | Sequence[Tuple[float, float]] 49 | For each document the tuple of negative/positive log probability from the 50 | naive bayes model 51 | """ 52 | X_transformed = self.transformer.transform(X) 53 | return self.estimator.predict_log_probability(X_transformed) 54 | -------------------------------------------------------------------------------- /ctparse/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. -------------------------------------------------------------------------------- /ctparse/rule.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F405 2 | import logging 3 | 4 | from datetime import datetime 5 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Type 6 | 7 | import regex 8 | 9 | from .types import Artifact, RegexMatch 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | # A predicate is a callable that returns True if the predicate 15 | # applies to the artifact 16 | Predicate = Callable[[Artifact], bool] 17 | 18 | # ProductionRule is a function used to generate an artifact given other 19 | # artifacts. 20 | ProductionRule = Callable[..., Optional[Artifact]] 21 | 22 | 23 | rules = {} # type: Dict[str, Tuple[ProductionRule, List[Predicate]]] 24 | 25 | _regex_cnt = 150 # leave this much space for ids of production types 26 | _regex = {} # compiled regex 27 | _regex_str = {} # map regex id to original string 28 | _str_regex = {} # type: Dict[str, int] # map regex raw str to regex id 29 | eu_regex = {} 30 | us_regex = {} 31 | 32 | _regex_hour = r"(?:[01]?\d)|(?:2[0-3])" 33 | _regex_minute = r"[0-5]\d" 34 | _regex_day = r"[012]?[1-9]|10|20|30|31" 35 | _regex_month = r"10|11|12|0?[1-9]" 36 | _regex_year = r"(?:19\d\d)|(?:20[0-2]\d)|(?:\d\d)" 37 | 38 | # used in many places in rules 39 | _regex_to_join = ( 40 | r"(\-|to( the)?|(un)?til|bis( zum)?|zum|auf( den)?|und|" 41 | "no later than|spätestens?|at latest( at)?|and)" 42 | ) 43 | 44 | _defines = ( 45 | r"(?(DEFINE)(?<_hour>{regex_hour})(?P<_minute>{regex_minute})" 46 | "(?P<_day>{regex_day})(?P<_month>{regex_month})" 47 | "(?P<_year>{regex_year}))" 48 | ).format( 49 | regex_hour=_regex_hour, 50 | regex_minute=_regex_minute, 51 | regex_day=_regex_day, 52 | regex_month=_regex_month, 53 | regex_year=_regex_year, 54 | ) 55 | 56 | 57 | def rule(*patterns: Union[str, Predicate], **kwargs) -> Callable[[Any], ProductionRule]: 58 | def _map(p: Union[str, Predicate]) -> Predicate: 59 | if isinstance(p, str): 60 | # its a regex 61 | global _regex_cnt 62 | if p in _str_regex: 63 | # have seen this regex before - recycle 64 | return regex_match(_str_regex[p]) 65 | # test the regex first 66 | re = r"{defines}(?i)(?P{re})".format( 67 | defines=_defines, re=p, re_key=_regex_cnt 68 | ) 69 | new_rr = regex.compile( 70 | # Removed the separator here - leads to more matches, 71 | # as now each rule can also match if it is not followed 72 | # or preceeded by a separator character 73 | # r'(?i)(?:{sep})(?P<{re_key}>{re})(?:{sep})'.format( 74 | re, 75 | regex.VERSION1, 76 | ) 77 | if new_rr.match(""): 78 | raise ValueError("expression {} matches empty strings".format(p)) 79 | 80 | if kwargs and "date_format" in kwargs: 81 | if kwargs['date_format'] == 'US': 82 | us_regex[_regex_cnt] = new_rr 83 | else: 84 | eu_regex[_regex_cnt] = new_rr 85 | else: 86 | _regex[_regex_cnt] = new_rr 87 | _regex_str[_regex_cnt] = p 88 | _str_regex[p] = _regex_cnt 89 | _regex_cnt += 1 90 | return regex_match(_regex_cnt - 1) 91 | else: 92 | return p 93 | 94 | # check that in rules we never have a regex followed by a regex - 95 | # that must be merged into one regex 96 | def _has_consequtive_regex( 97 | ps: Tuple[Union[str, Callable[[Artifact], bool]], ...] 98 | ) -> bool: 99 | for p0, p1 in zip(ps[:-1], ps[1:]): 100 | if isinstance(p0, str) and isinstance(p1, str): 101 | return True 102 | return False 103 | 104 | if _has_consequtive_regex(patterns): 105 | raise ValueError("rule which contains consequtive regular expressions found") 106 | 107 | mapped_patterns = [_map(p) for p in patterns] 108 | 109 | def fwrapper(f: ProductionRule) -> ProductionRule: 110 | def wrapper(ts: datetime, *args: Artifact) -> Optional[Artifact]: 111 | res = f(ts, *args) 112 | if res is not None: 113 | # upon a successful production, update the span 114 | # information by expanding it to that of all args 115 | res.update_span(*args) 116 | return res 117 | 118 | rules[f.__name__] = (wrapper, mapped_patterns) 119 | return wrapper 120 | 121 | return fwrapper 122 | 123 | 124 | def regex_match(r_id: int) -> Predicate: 125 | def _regex_match(r: Artifact) -> bool: 126 | return type(r) == RegexMatch and r.id == r_id # type: ignore 127 | 128 | return _regex_match 129 | 130 | 131 | def dimension(dim: Type[Artifact]) -> Predicate: 132 | def _dimension(d: Artifact) -> bool: 133 | return isinstance(d, dim) 134 | 135 | return _dimension 136 | 137 | 138 | def predicate(pred: str) -> Predicate: 139 | def _predicate(d: Artifact) -> Any: 140 | return getattr(d, pred, False) 141 | 142 | return _predicate 143 | 144 | 145 | from .time.rules import * # noqa 146 | -------------------------------------------------------------------------------- /ctparse/scorer.py: -------------------------------------------------------------------------------- 1 | """This module contains the Scorer abstraction that can be used to 2 | implement scoring strategies for ctparse. 3 | """ 4 | 5 | from abc import ABCMeta, abstractmethod 6 | from datetime import datetime 7 | from random import Random 8 | from typing import Optional 9 | 10 | from .partial_parse import PartialParse 11 | from .types import Artifact 12 | 13 | 14 | class Scorer(metaclass=ABCMeta): 15 | """Interface for scoring parses generated by ctparse""" 16 | 17 | @abstractmethod 18 | def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float: 19 | """Produce a score for a partial production. 20 | 21 | :param txt: the text that is being parsed 22 | :param ts: the reference time 23 | :param partial_parse: the partial parse that needs to be scored 24 | """ 25 | 26 | @abstractmethod 27 | def score_final( 28 | self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact 29 | ) -> float: 30 | """Produce the final score for a production. 31 | 32 | :param txt: the text that is being parsed 33 | :param ts: the reference time 34 | :param partial_parse: the PartialParse object that generated the production 35 | :param prod: the production 36 | """ 37 | 38 | 39 | class DummyScorer(Scorer): 40 | """A scorer that always return a 0.0 score.""" 41 | 42 | def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float: 43 | return 0.0 44 | 45 | def score_final( 46 | self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact 47 | ) -> float: 48 | return 0.0 49 | 50 | 51 | class RandomScorer(Scorer): 52 | def __init__(self, rng: Optional[Random] = None) -> None: 53 | """A score that returns a random number between 0 and 1. 54 | 55 | :param rng: 56 | the random number generator to use 57 | """ 58 | self.rng = rng if rng is not None else Random() 59 | 60 | def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float: 61 | return self.rng.random() 62 | 63 | def score_final( 64 | self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact 65 | ) -> float: 66 | return self.rng.random() 67 | -------------------------------------------------------------------------------- /ctparse/time/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Acreom/quickadd/69543c79ad5db05a712abf223940fadf61740235/ctparse/time/__init__.py -------------------------------------------------------------------------------- /ctparse/time/corpus.py: -------------------------------------------------------------------------------- 1 | corpus = [ 2 | # ruleYear 3 | ("Time[]{2019-X-X X:X (X/X)}", "2018-03-07T12:43", ["2019"]), 4 | # ruleToday 5 | ( 6 | "Time[]{2018-03-07 X:X (X/X)}", 7 | "2018-03-07T12:43", 8 | ["heute", "zu dieser zeit", "today"], 9 | ), 10 | # ruleNow 11 | ( 12 | "Time[]{2018-03-07 12:43 (X/X)}", 13 | "2018-03-07T12:43", 14 | ["jetzt", "genau jetzt", "gerade eben", "rightnow", "just now"], 15 | ), 16 | # ruleTomorrow 17 | ("Time[]{2019-01-01 X:X (X/X)}", "2018-12-31T12:43", ["morgen", "tomorrow", "tom", "tmrw"]), 18 | # ruleAfterTomorrow 19 | ("Time[]{2019-01-02 X:X (X/X)}", "2018-12-31T12:43", ["übermorgen"]), 20 | # ruleTomorrow + time 21 | ( 22 | "Time[]{2019-01-01 19:25 (X/X)}", 23 | "2018-12-31T12:43", 24 | ["morgen 19:25", "tomorrow 7:25 pm"], 25 | ), 26 | # ruleYesterday 27 | # test on a leap-year 28 | ("Time[]{2020-02-29 X:X (X/X)}", "2020-03-01T12:43", ["gestern", "yesterday"]), 29 | # ruleBeforeYesterday 30 | # test on a leap-year 31 | ("Time[]{2020-02-28 X:X (X/X)}", "2020-03-01T12:43", ["vorgestern"]), 32 | # ruleEOM 33 | ( 34 | "Time[]{2018-03-31 X:X (X/X)}", 35 | "2018-03-07T12:43", 36 | ["ende des Monats", "eom", "end of the month"], 37 | ), 38 | # ruleEOY 39 | ( 40 | "Time[]{2018-12-31 X:X (X/X)}", 41 | "2018-03-07T12:43", 42 | ["ende des Jahres", "eoy", "end of the year"], 43 | ), 44 | # ruleNamedDOW 45 | ("Time[]{2018-03-12 X:X (X/X)}", "2018-03-07T12:43", ["Montag", "mon", "monday"]), 46 | ( 47 | "Time[]{2018-03-13 X:X (X/X)}", 48 | "2018-03-07T12:43", 49 | ["Dienstag", "tuesday", "tue"], 50 | ), 51 | # ruleNamedDOW + POD 52 | # ("Time[]{2018-03-12 X:X (X/morning)}", "2018-03-07T12:43", ["Montagmorgen"]), 53 | # ("Time[]{2018-03-14 X:X (X/forenoon)}", "2018-03-07T12:43", ["Mittwochvormittag"]), 54 | # ("Time[]{2018-03-10 X:X (X/morning)}", "2018-03-07T12:43", ["Samstagfrüh"]), 55 | # ( 56 | # "Time[]{2018-03-11 X:X (X/night)}", 57 | # "2018-03-07T12:43", 58 | # ["sunday night", "Sonntagnacht"], 59 | # ), 60 | # ruleNamedMonth 61 | ("Time[]{2023-01-01 X:X (X/X)}", "2022-11-28T12:43", ["1st January", "1st jan."]), 62 | ("Time[]{2023-04-15 X:X (X/X)}", "2022-11-28T12:43", ["15 April", "15 apr."]), 63 | ("Time[]{X-07-X X:X (X/X)}", "2022-11-28T12:43", ["Juli", "July", "Jul"]), 64 | ( 65 | "Time[]{2022-12-24 X:X (X/X)}", 66 | "2022-11-28T12:43", 67 | ["24 Dezember", "December 24", "24 Dec.", "24 Dez."], 68 | ), 69 | # ruleAtDOW 70 | ("Time[]{2018-03-13 X:X (X/X)}", "2018-03-07T12:43", ["am Dienstag", "on Tue"]), 71 | ( 72 | "Time[]{2018-03-14 X:X (X/X)}", 73 | "2018-03-07T12:43", 74 | ["this Wednesday", "diesen Mittwoch"], 75 | ), 76 | # ruleNextDOW 77 | ( 78 | "Time[]{2018-03-16 X:X (X/X)}", 79 | "2018-03-07T12:43", 80 | [ 81 | "am nächsten Freitag", 82 | "next Friday", 83 | "nächste Woche Freitag", 84 | "Friday next week", 85 | "on the following Friday", 86 | ], 87 | ), 88 | # ruleDOYYear, ruleDDMM, ruleDDMMYYYY 89 | ( 90 | "Time[]{2018-05-08 X:X (X/X)}", 91 | "2018-03-07T12:43", 92 | [ 93 | "8.5.2018", 94 | "8. Mai 2018", 95 | "8. Mai 18", 96 | "8 May 2018", 97 | "8 May", 98 | "May 8", 99 | "8/5", 100 | "8.5.", 101 | "am 8. Mai 2018", 102 | "diesen 8. Mai 18", 103 | "den 8.5.", 104 | "8th May", 105 | "8th of May", 106 | "May 8th", 107 | "at 8th May", 108 | "on 8th of May", 109 | "this May 8th", 110 | "may 8", 111 | ], 112 | ), 113 | ( 114 | "Time[]{2022-12-24 X:X (X/X)}", 115 | "2022-11-29T12:43", 116 | [ 117 | "12/24", 118 | "12/24", 119 | "12/24", 120 | "12/24", 121 | "12/24", 122 | "12/24", 123 | "12/24", 124 | "12/24", 125 | "12/24", 126 | "12/24", 127 | "12/24", 128 | "12/24", 129 | "12/24", 130 | "12/24", 131 | "12/24", 132 | "12/24", 133 | "12/24", 134 | "12/24", 135 | "12/24", 136 | "12/24", 137 | "12/24", 138 | "12/24", 139 | "12/24", 140 | "24/12", 141 | "24/12", 142 | "24/12", 143 | ], 144 | ), 145 | # ruleDOWDOM 146 | ( 147 | "Time[]{2022-11-29 X:X (X/X)}", 148 | "2022-11-28T12:43", 149 | ["Tuesday 29th", "Tuesday the 29th", "Dienstag der 29."], 150 | ), 151 | ( 152 | "Time[]{2018-06-02 X:X (X/X)}", 153 | "2018-03-07T12:43", 154 | ["Saturday 2nd", "Jun 2nd", "am 2ten Juni"], 155 | ), 156 | # ruleDOWDate, ruleDateDOW 157 | ( 158 | "Time[]{2018-05-08 X:X (X/X)}", 159 | "2018-03-07T12:43", 160 | ["Tuesday 8.5", "8.5 Tuesday"], 161 | ), 162 | # ( 163 | # "Time[]{2018-05-08 X:X (X/morning)}", 164 | # "2018-03-07T12:43", 165 | # ["Dienstagmorgen 8.5.", "8.5. Dienstagmorgen"], 166 | # ), 167 | # rulePOD, ruleLatentPOD 168 | # ( 169 | # "Time[]{2018-03-08 X:X (X/morning)}", 170 | # "2018-03-07T12:43", 171 | # ["morgens", "früh", "in der früh", "early", "morning"], 172 | # ), 173 | # ( 174 | # "Time[]{2018-03-08 X:X (X/earlymorning)}", 175 | # "2018-03-07T12:43", 176 | # ["früh morgens", "sehr früh", "early morning"], 177 | # ), 178 | # ( 179 | # "Time[]{2018-03-08 X:X (X/forenoon)}", 180 | # "2018-03-07T12:43", 181 | # ["vormittags", "forenoon"], 182 | # ), 183 | # before noon case 184 | # ( 185 | # "Interval[]{None - 2018-03-08 X:X (X/noon)}", 186 | # "2018-03-07T12:43", 187 | # ["vor mittags", "before noon"], 188 | # ), 189 | # ( 190 | # "Time[]{2018-03-08 X:X (X/afternoon)}", 191 | # "2018-03-07T12:43", 192 | # ["nachmittag", "afternoon"], 193 | # ), 194 | # # past noon case 195 | # ( 196 | # "Interval[]{2018-03-08 X:X (X/noon) - None}", 197 | # "2018-03-07T12:43", 198 | # ["nach mittag", "after noon"], 199 | # ), 200 | # ("Time[]{2018-03-08 X:X (X/noon)}", "2018-03-07T12:43", ["mittags", "noon"]), 201 | # ( 202 | # "Time[]{2018-03-07 X:X (X/evening)}", 203 | # "2018-03-07T12:43", 204 | # ["abends", "late", "spät"], 205 | # ), 206 | # ( 207 | # "Time[]{2018-03-07 X:X (X/lateevening)}", 208 | # "2018-03-07T12:43", 209 | # ["später abend", "very late", "late evening"], 210 | # ), 211 | # ( 212 | # "Time[]{2018-03-08 X:X (X/veryearlyafternoon)}", 213 | # "2018-03-07T12:43", 214 | # ["sehr früher nachmittag", "very early afternoon"], 215 | # ), 216 | # ( 217 | # "Time[]{2018-03-07 X:X (X/night)}", 218 | # "2018-03-07T12:43", 219 | # ["heute nacht", "this night", "nachts"], 220 | # ), 221 | # # First/Last 222 | # ( 223 | # "Time[]{2018-03-08 X:X (X/first)}", 224 | # "2018-03-07T12:43", 225 | # [ 226 | # "tomorrow first", 227 | # "morgen erster", 228 | # "morgen so früh wie möglich", 229 | # "tomorrow earliest possible", 230 | # ], 231 | # ), 232 | # ( 233 | # "Time[]{2018-03-08 X:X (X/last)}", 234 | # "2018-03-07T12:43", 235 | # [ 236 | # "tomorrow last", 237 | # "morgen letzter", 238 | # "tomorrow as late as possible", 239 | # "morgen spätest möglicher", 240 | # ], 241 | # ), 242 | # ( 243 | # "Time[]{2018-03-09 X:X (X/first)}", 244 | # "2018-03-07T12:43", 245 | # ["Friday first", "Freitag erster"], 246 | # ), 247 | # ( 248 | # "Time[]{2018-03-13 X:X (X/last)}", 249 | # "2018-03-07T12:43", 250 | # ["Tuesday last", "Dienstag letzter"], 251 | # ), 252 | # # Date + POD 253 | # ( 254 | # "Time[]{2017-01-25 X:X (X/evening)}", 255 | # "2018-03-07T12:43", 256 | # [ 257 | # "25.01.2017 abends", 258 | # "evening of January 25th 2017", 259 | # "25.01.2017 late", 260 | # "25.01.2017 spät", 261 | # "25.01.2017 (spät)", 262 | # ], 263 | # ), 264 | # ( 265 | # "Time[]{2017-01-25 X:X (X/lateafternoon)}", 266 | # "2018-03-07T12:43", 267 | # [ 268 | # "25.01.2017 spät nachmittags", 269 | # "am 25. Januar 2017 am späten Nachmittag", 270 | # "am 25. Januar 2017 am späten Nachmittag", 271 | # "am 25. Januar 2017 am späten Nachmittag", 272 | # "late afternoon of January 25th 2017", 273 | # ], 274 | # ), 275 | # ( 276 | # "Time[]{2020-01-25 X:X (X/evening)}", 277 | # "2018-03-07T12:43", 278 | # [ 279 | # "25.01.2020 abends", 280 | # "25.01.2020 late", 281 | # "25.01.2020 spät", 282 | # "25. Januar 2020 abends", 283 | # "abends 25.01.2020", 284 | # "evening of January 25th 2020", 285 | # ], 286 | # ), 287 | # ( 288 | # "Time[]{2018-03-25 X:X (X/evening)}", 289 | # "2018-03-07T12:43", 290 | # ["evening of the 25th", "am 25. abends", "abends am 25."], 291 | # ), 292 | # # ruleTODPOD 293 | # ( 294 | # "Time[]{X-X-X 16:30 (X/X)}", 295 | # "2018-03-07T12:43", 296 | # ["um 4:30 nachmittags", "at 4:30 in the afternoon"], 297 | # ), 298 | # rulePODTOD 299 | # ( 300 | # "Time[]{X-X-X 20:00 (X/X)}", # next day since moning is already over 301 | # "2018-03-07T12:43", 302 | # ["morgens um 8", "late morning at 8"], 303 | # ), 304 | ( 305 | "Time[]{X-X-X 16:30 (X/X)}", 306 | "2018-03-07T12:43", 307 | ["nachmittags um 16:30", "afternoon at 16:30", "16:30"], 308 | ), 309 | # ruleDateTOD 310 | ( 311 | "Time[]{2018-08-05 20:00 (X/X)}", 312 | "2018-03-07T12:43", 313 | [ 314 | "5. August um 8", 315 | "August 5th at 8", 316 | # "august 5 at 8am", 317 | "5. Aug gegen 8", 318 | "05.08.2018 8Uhr", 319 | "05.08.2018 8pm", 320 | "august 5th 8" 321 | ], 322 | ), 323 | # ruleTODDate 324 | ( 325 | "Time[]{2018-08-05 20:00 (X/X)}", 326 | "2018-03-07T12:43", 327 | ["um 8 5. August", "at 8 on August 5th"], 328 | ), 329 | # ruleDateDate, ruleDOMDate, ruleDateDOM 330 | ( 331 | "Interval[]{2018-08-05 X:X (X/X) - 2018-08-16 X:X (X/X)}", 332 | "2018-03-07T12:43", 333 | [ 334 | "5.8. - 16.8.", 335 | "August 5th - August 16th", 336 | "Aug 5 - 16", 337 | "from Aug 5 to 16", 338 | "5 to 16 Aug", 339 | "from 5 to 16 Aug", 340 | "5. - 16.8.", 341 | "5.8. - 16.8.2018", 342 | "5.8. bis 16.8.2018", 343 | "5.8. - 16.8.", 344 | "5.8. bis 16.8.", 345 | "5. - 16.8.", 346 | "5.8. - 16.8.2018", 347 | "5.8. bis 16.8.2018", 348 | "5.8. - 16.8.", 349 | "5.8. bis 16.8.", 350 | "5. bis zum 16.8.", 351 | "vom 05.08.2018 zum 16.08.2018", 352 | ], 353 | ), 354 | # ruleDOYDate 355 | ( 356 | "Interval[]{2017-08-05 X:X (X/X) - 2017-08-16 X:X (X/X)}", 357 | "2018-03-07T12:43", 358 | ["5.8. - 16.8.2017", "Samstag 5.8. - Mittwoch 16.8.2017"], 359 | ), 360 | # ruleDateTimeDateTime 361 | ( 362 | "Interval[]{2018-08-05 08:00 (X/X) - 2018-08-16 13:00 (X/X)}", 363 | "2018-03-07T12:43", 364 | ["5.8. 8am - 16.8. 13Uhr", "August 5th 8am - August 16th 13h"], 365 | ), 366 | ( 367 | "Interval[]{X-X-X 08:00 (X/X) - X-X-X 13:00 (X/X)}", 368 | "2018-03-07T12:43", 369 | ["8am - 13:00", "8am - 13Uhr", "8am to 13h", "8am-13"], 370 | ), 371 | # increasing coverage for int-int hours 372 | ( 373 | "Interval[]{X-X-X 10:00 (X/X) - X-X-X 12:00 (X/X)}", 374 | "2018-03-07T12:43", 375 | ["10am - 12:00", "10am - 12Uhr", "10am to 12h", "10-12am", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12",], 376 | ), 377 | ( 378 | "Interval[]{X-X-X 15:00 (X/X) - X-X-X 16:00 (X/X)}", 379 | "2018-03-07T12:43", 380 | ["03:00 - 04:00", "3Uhr - 4Uhr", "3h to 4h", "3-4"], 381 | ), 382 | # rulePODPOD 383 | # ( 384 | # "Interval[]{X-X-X X:X (X/evening) - X-X-X X:X (X/night)}", 385 | # "2018-05-08T10:32", 386 | # ["evening/night"], 387 | # ), 388 | # ruleAfterTime 389 | ( 390 | "Interval[]{2017-11-26 20:00 (X/X) - None}", 391 | "2018-03-07T12:43", 392 | ["26.11.2017 ab 08:00 Uhr"], 393 | ), 394 | ( 395 | "Interval[]{2018-11-26 20:00 (X/X) - None}", 396 | "2018-03-07T12:43", 397 | [ 398 | "26.11.2018 ab 08:00 Uhr", 399 | "26.11. ab 08:00 Uhr", 400 | "26.11. frühestens um 08:00 Uhr", 401 | "November 26th earliest 08:00 Uhr", 402 | "November 26th earliest after 08:00 Uhr", 403 | "November 26th from earliest 08:00 Uhr", 404 | "26.11. nicht vor 08:00 Uhr", 405 | ], 406 | ), 407 | # ruleBeforeTime 408 | ( 409 | "Interval[]{None - 2017-11-26 20:00 (X/X)}", 410 | "2018-03-07T12:43", 411 | [ 412 | "26.11.2017 vor 08:00 Uhr", 413 | "26.11.2017 bis spätestens 08:00 Uhr", 414 | "26.11.2017 spätestens bis 08:00 Uhr", 415 | ], 416 | ), 417 | ( 418 | "Interval[]{None - 2018-11-26 20:00 (X/X)}", 419 | "2018-03-07T12:43", 420 | ["26.11.2018 vor 08:00 Uhr", "26.11. vor 08:00 Uhr", "26.11. not after 08:00"], 421 | ), 422 | # ruleHHMM 423 | ( 424 | "Time[]{X-X-X 20:00 (X/X)}", 425 | "2018-03-07T00:00", 426 | ["8h", "8 Uhr", "8:00", "8h00", "8"], 427 | ), 428 | ( 429 | "Time[]{X-X-X 20:00 (X/X)}", 430 | "2018-03-07T00:00", 431 | ["20h", "20 Uhr", "20:00", "20pm", "20"], 432 | ), # <-- ignore am, since this makes no sense 433 | # ruleMonthDOM 434 | ( 435 | "Time[]{2018-04-07 X:X (X/X)}", 436 | "2018-03-07T00:00", 437 | ["april 7", "april 7th", "7. April"], 438 | ), 439 | # ruleAbsorbOnTime 440 | ( 441 | "Time[]{X-X-X 20:00 (X/X)}", 442 | "2018-03-07T00:00", 443 | ["at 8pm", "um 20h", "gegen 20:00", "about 8pm", "at around 8pm"], 444 | ), 445 | # ruleAbsorbOnTime + X 446 | ( 447 | "Time[]{2018-06-21 08:00 (X/X)}", 448 | "2018-03-07T00:00", 449 | [ 450 | "Jun 21 at 8am", 451 | "Jun 21 8am", 452 | "Jun 21 at 8am", 453 | "Jun 21 on 8am", 454 | "21. Juni um 8am", 455 | ], 456 | ), 457 | # ruleDateInterval 458 | ( 459 | "Interval[]{2018-11-13 13:30 (X/X) - 2018-11-13 15:35 (X/X)}", 460 | "2018-03-07T00:00", 461 | [ 462 | "Mon, Nov 13 1:30 PM - 3:35 PM", 463 | "Montag, 13. November von 13:30 bis 15:35", 464 | "Nov 13 13:30 - 15:35", 465 | ], 466 | ), 467 | ( 468 | "Interval[]{2018-11-13 13:30 (X/X) - None}", 469 | "2018-03-07T00:00", 470 | [ 471 | "Mon, Nov 13 after 1:30 PM", 472 | "Montag, 13. November nach 13:30", 473 | "Montag, 13. November 2018 nach 13:30", 474 | "13.11. ab 13:30", 475 | ], 476 | ), 477 | ( 478 | "Interval[]{2016-11-13 13:30 (X/X) - None}", 479 | "2018-03-07T00:00", 480 | [ 481 | "Mon, Nov 13 2016 after 1:30 PM", 482 | "Montag, 13. November 2016 nach 13:30", 483 | "Montag, 13. November 2016 nach 13:30", 484 | "13.11.16 ab 13:30", 485 | ], 486 | ), 487 | ( 488 | "Interval[]{2018-03-11 X:X (X/noon) - None}", 489 | "2018-03-07T00:00", 490 | ["Sunday after noon", "Sonntag ab Mittag", "Sonntag, 11. März 2018 ab Mittag"], 491 | ), 492 | # ( 493 | # "Interval[]{2018-03-11 21:00 (X/X) - None}", 494 | # "2018-03-07T00:00", 495 | # [ 496 | # "Sunday Mar 11 after 9", 497 | # "Sonntag, 11. März 2018 nach 9", 498 | # "Sonntag, der 11. Mrz. nach 9", 499 | # ], 500 | # ), 501 | # ( 502 | # "Interval[]{2016-03-11 21:00 (X/X) - None}", 503 | # "2018-03-07T00:00", 504 | # [ 505 | # "Sunday Mar 11 2016 after 9", 506 | # "Sonntag, 11. März 2016 nach 9", 507 | # "Sonntag, der 11. Mrz 2016 nach 9", 508 | # ], 509 | # ), 510 | # ruleDateInterval - day wrap 511 | ( 512 | "Interval[]{2018-11-13 23:30 (X/X) - 2018-11-14 03:35 (X/X)}", 513 | "2018-03-07T00:00", 514 | ["Mon, Nov 13 11:30 PM - 3:35 AM", "Nov 13 23:30 - 3:35am"], 515 | ), 516 | ( 517 | "Interval[]{2018-02-20 21:00 (X/X) - 2018-02-21 04:00 (X/X)}", 518 | "2018-02-20T09:37", 519 | ["today 9pm - 4am"], 520 | ), 521 | # ruleAbsorbDOWComma -- deleted, comma should be removed by caller 522 | ( 523 | "Time[]{2018-07-27 X:X (X/X)}", 524 | "2018-07-26T00:00", 525 | ["Freitag, dem 27.", "Fri, the 27th", "fri 27"], 526 | ), 527 | # ruleNamedHour 528 | ("Time[]{X-X-X 09:00 (X/X)}", "2018-07-26T00:00", ["neun", "nine"]), 529 | # ruleQuarterBeforeHH 530 | # ( 531 | # "Time[]{2018-07-26 19:45 (X/X)}", 532 | # "2018-07-26T00:00", 533 | # ["viertel vor acht", "viertel vor 8", "quarter to eight"], 534 | # ), 535 | # ruleQuarterBeforeHH midnight wrap 536 | ("Time[]{X-X-X 23:45 (X/X)}", "2018-07-26T00:00", ["viertel vor 0"]), 537 | # ruleQuarterAfterHH 538 | # ( 539 | # "Time[]{2018-07-26 08:15 (X/X)}", 540 | # "2018-07-26T00:00", 541 | # ["viertel nach acht", "viertel nach 8", "quarter past eight"], 542 | # ), 543 | # ruleHalfBeforeHH 544 | # ( 545 | # "Time[]{2018-07-26 07:30 (X/X)}", 546 | # "2018-07-26T00:00", 547 | # ["half eight"], 548 | # ), 549 | # ruleHalfBeforeHH not when minutes are present 550 | ("Time[]{X-X-X 19:35 (X/X)}", "2018-07-26T00:00", ["halb 7:35"]), 551 | # ruleHalfBeforeHH midnight wrap 552 | ("Time[]{X-X-X 23:30 (X/X)}", "2018-07-26T00:00", ["halb mitternacht"]), 553 | # ruleHalfAfterHH 554 | ( 555 | "Time[]{X-X-X 08:30 (X/X)}", 556 | "2018-07-26T00:00", 557 | ["halb nach acht", "halfe past eight"], 558 | ), 559 | # ruleHalfAfterHH not when minutes are present 560 | ("Time[]{X-X-X 20:32 (X/X)}", "2018-07-26T00:00", ["halb nach 8:32"]), 561 | # rulePODInterval 562 | # ( 563 | # "Interval[]{None - 2018-09-17 22:00 (X/X)}", 564 | # "2018-07-26T00:00", 565 | # ["am 17.9. abends vor 10", "at Sep 17th in the evening before 10"], 566 | # ), 567 | # ( 568 | # "Interval[]{X-X-X 22:00 (X/X) - None}", 569 | # "2018-07-26T00:00", 570 | # ["abends nach 10", "in the evening after 10", "in the evening after 22h"], 571 | # ), 572 | # ( 573 | # "Interval[]{X-X-X 20:00 (X/X) - X-X-X 21:00 (X/X)}", 574 | # "2018-07-26T00:00", 575 | # ["in the evening between 8 and 9", "Jul 26th between 20 and 21"], 576 | # ), 577 | # ( 578 | # "Interval[]{X-X-X 08:00 (X/X) - X-X-X 09:00 (X/X)}", 579 | # "2018-07-26T00:00", 580 | # ["in the morning between 8 and 9", "Jul 26th between 8 and 9"], 581 | # ), 582 | # rule 583 | # 584 | # ----------------------------------------------------------------------------- 585 | # OLD CORPUS 586 | # ----------------------------------------------------------------------------- 587 | # 588 | ( 589 | "Interval[]{2017-12-19 21:30 (X/X) - 2017-12-19 22:45 (X/X)}", 590 | "2017-12-18T12:34", 591 | [ 592 | "tomorrow 09:30 - 10:45", 593 | "tomorrow 0930 - 1045", 594 | "19. Dezember von 09:30 bis 10:45", 595 | "19th of December from 09:30 til 10:45", 596 | "19.12. 09:30 - 10:45", 597 | "19.12.17 09:30 - 10:45", 598 | "19.12.2017 09:30 - 19.12.2017 10:45", 599 | "19.12.2017 09:30 - 10:45", 600 | "19 dec 0930-1045", 601 | "Dec 19th 9:30pm to 10:45pm", 602 | ], 603 | ), 604 | ( 605 | "Interval[]{2018-02-16 X:X (X/X) - 2018-02-21 X:X (X/X)}", 606 | "2017-12-18T12:34", 607 | ["16.02.2018 - 21.02.2018", "16. bis 21.02.2018"], 608 | ), 609 | ( 610 | "Interval[]{2018-08-07 X:X (X/X) - 2018-08-10 X:X (X/X)}", 611 | "2017-12-18T12:34", 612 | ["07.-10.08.2018"], 613 | ), 614 | # ('Range[]{2018-12-09 - 2018-12-13}', 615 | # '2017-12-18T12:34', 616 | # [ 617 | # '09.-13.12.2018 von Samstag bis Time[]{2017-05-11 X:X (X/X)}' 618 | # ]), 619 | # ('Range[]{2018-04-27 - 2018-04-30}', 620 | # '2017-12-18T12:34', 621 | # [ 622 | # # 'from the 27th to the 30th of April 2018', 623 | # '27.-30.04.2018 von Freitag bis Montag' 624 | # ]), 625 | ( 626 | "Time[]{2018-01-13 X:X (X/X)}", 627 | "2017-12-18T12:34", 628 | ["am 13.1.", "am 13.01.", "am 13. Januar", "13.01", "13.1", "13th Jan"], 629 | ), 630 | ( 631 | "Time[]{2017-12-19 X:X (X/X)}", 632 | "2017-12-18T12:34", 633 | [ 634 | "am Dienstag", 635 | "am 19.12", 636 | "Dienstag 19.12", 637 | "Tuesday 19th of December", 638 | "Tuesday December 19th", 639 | "Dienstag 19. Dezember", 640 | "Dienstag Dezember 19.", 641 | "Dienstag", 642 | ], 643 | ), 644 | ( 645 | "Time[]{2018-03-01 14:30 (X/X)}", 646 | "2017-12-18T12:34", 647 | [ 648 | # mm/dd does not work yet 649 | # '03/01/2018 at 2:30 pm', 650 | "am 01.03.2018 um 14:30", 651 | "Mar 1st 2:30 pm", 652 | "1. März um 1430 Uhr", 653 | "01.03.2018 14:30", 654 | ], 655 | ), 656 | ( 657 | "Time[]{2018-01-03 14:30 (X/X)}", 658 | "2017-12-18T12:34", 659 | [ 660 | # mm/dd does not work yet 661 | # '01/03/2018 at 2:30 pm', 662 | "am 03.01.2018 um 14:30", 663 | "Jan. 3rd 2:30 pm", 664 | "3. Januar 1430 Uhr", 665 | "03.01.2018 14:30", 666 | "3 Jan 2018 14:30", 667 | ], 668 | ), 669 | ("Time[]{2018-04-23 23:00 (X/X)}", "2017-12-18T12:34", ["23.04.2018 11:00"]), 670 | ("Time[]{2018-11-19 18:00 (X/X)}", "2017-12-18T12:34", ["19.11.2018 18:00"]), 671 | # ( 672 | # "Time[]{2017-12-20 X:X (X/morning)}", 673 | # "2017-12-18T12:34", 674 | # ["Wednesday, 20th December morning", "december 20 morning"], 675 | # ), 676 | # ( 677 | # "Time[]{2018-12-06 X:X (X/morning)}", 678 | # "2018-03-07T12:43", 679 | # [ 680 | # "6. dezember morgens", 681 | # "6. dezember früh", 682 | # "6. dezember in der früh", 683 | # "december 6 early", 684 | # "december 6th morning", 685 | # ], 686 | # ), 687 | # ( 688 | # "Time[]{2018-12-06 X:X (X/earlymorning)}", 689 | # "2018-03-07T12:43", 690 | # ["6. dezember früh morgens", "december 6 early morning"], 691 | # ), 692 | # ( 693 | # "Time[]{2018-12-06 X:X (X/forenoon)}", 694 | # "2018-03-07T12:43", 695 | # ["6. Dezember vormittags", "december 6th forenoon"], 696 | # ), 697 | # ( 698 | # "Time[]{2018-12-06 X:X (X/afternoon)}", 699 | # "2018-03-07T12:43", 700 | # ["6. Dezember nachmittag", "december 6 afternoon"], 701 | # ), 702 | # ( 703 | # "Time[]{2018-12-06 X:X (X/noon)}", 704 | # "2018-03-07T12:43", 705 | # ["6. Dezember mittags", "december 6 noon"], 706 | # ), 707 | # ( 708 | # "Time[]{2018-12-06 X:X (X/evening)}", 709 | # "2018-03-07T12:43", 710 | # ["6. Dezember abends", "december 6 late"], 711 | # ), 712 | # ( 713 | # "Time[]{2018-12-06 X:X (X/lateevening)}", 714 | # "2018-03-07T12:43", 715 | # ["6. Dezember später abend", "december 6 late evening"], 716 | # ), 717 | # ( 718 | # "Time[]{2018-12-06 X:X (X/veryearlyafternoon)}", 719 | # "2018-03-07T12:43", 720 | # ["6. Dezember sehr früher nachmittag", "december 6 very early afternoon"], 721 | # ), 722 | # ('DateTime[]{2017-12-20Tmorning}', 723 | # '2017-12-18T12:34', 724 | # ['Wednesday, morning, 20.12.17']), 725 | # ('DateTime[]{2017-12-20Tafternoon}', 726 | # '2017-12-18T12:34', 727 | # ['Wednesday, afternoon, 20.12.17']), 728 | # ('DateTime[]{2017-12-20 XX:XX (X/evening)}', 729 | # '2017-12-18T12:34', 730 | # ['Wednesday, evening, 20.12.17']), 731 | ("Time[]{2017-12-20 18:45 (X/X)}", "2017-12-18T12:34", ["6:45 Uhr 20.12.2017"]), 732 | ("Time[]{2018-08-04 15:00 (X/X)}", "2017-12-18T12:34", ["04.08.2018 15:00"]), 733 | ("Time[]{2018-09-01 13:00 (X/X)}", "2017-12-18T12:34", ["01.09.2018 01:00"]), 734 | ("Time[]{2018-11-29 22:00 (X/X)}", "2017-12-18T12:34", ["29.11.2018 22:00"]), 735 | ("Time[]{2018-02-27 19:00 (X/X)}", "2017-12-18T12:34", ["27.02.2018 07:00"]), 736 | ("Time[]{2018-05-09 21:30 (X/X)}", "2017-12-18T12:34", ["09.05.2018 09:30"]), 737 | ("Time[]{2018-01-17 14:30 (X/X)}", "2017-12-18T12:34", ["17.01.2018 14:30"]), 738 | ( 739 | "Interval[]{2018-06-21 11:00 (X/X) - 2018-06-21 13:00 (X/X)}", 740 | "2017-12-18T12:34", 741 | ["21.06.2018 11:00-13:00", "Jun 21st between 11am and 1pm"], 742 | ), 743 | ( 744 | "Interval[]{2018-07-09 20:00 (X/X) - 2018-07-13 22:00 (X/X)}", 745 | "2017-12-18T12:34", 746 | ["09.07.2018 08:00 - 13.07.2018 10:00"], 747 | ), 748 | # Military time tests 749 | ("Time[]{2020-02-03 X:X (X/X)}", "2020-02-25T12:34", ["3 Feb 2020"]), 750 | # Duration tests 751 | # ( 752 | # "Duration[]{1 nights}", 753 | # "2020-02-25T12:34", 754 | # ["one night", "ein nacht", "eine übernachtung"], 755 | # ), 756 | ("Duration[]{30 days}", "2020-02-25T12:34", ["in 30 days", "in 30 tage"],), 757 | ("Duration[]{7 weeks}", "2020-02-25T12:34", ["in 7 weeks", "in 7 wochen"],), 758 | ( 759 | "Duration[]{20 minutes}", 760 | "2020-02-25T12:34", 761 | ["in 20 minutes", "in twenty minutes", "in zwanzig Minuten"], 762 | ), 763 | ("Duration[]{1 months}", "2020-02-25T12:34", ["in 1 month", "in one month", "in ein Monat"]), 764 | ( 765 | "Duration[]{30 minutes}", 766 | "2020-02-25T12:34", 767 | ["in half an hour", "in half hour", "in 1/2 hour", "in 1/2h", "in 1/2 h", "in halbe Stunde"], 768 | ), 769 | # ruleTimeDuration 770 | # ( 771 | # "Interval[]{2020-02-27 X:X (X/X) - 2020-02-28 X:X (X/X)}", 772 | # "2020-02-25T12:34", 773 | # ["on the 27th for one day", "on the 27th for one night"], 774 | # ), 775 | # ( 776 | # "Interval[]{2020-02-25 15:00 (X/X) - 2020-02-25 16:00 (X/X)}", 777 | # "2020-02-25T12:34", 778 | # ["today 15:00 for one hour"], 779 | # ), 780 | # # ruleDurationInterval, ruleIntervalDuration 781 | # ( 782 | # "Interval[]{2020-11-15 X:X (X/X) - 2020-11-18 X:X (X/X)}", 783 | # "2020-02-25T12:34", 784 | # ["3 days 15-18 Nov", "15-18 Nov 3 Nächte", "15-18 Nov für 3 Nächte"], 785 | # ), 786 | ] 787 | -------------------------------------------------------------------------------- /ctparse/time/postprocess_latent.py: -------------------------------------------------------------------------------- 1 | """Those rules are applied as postprocessing steps after scoring has been already 2 | done. Needed for backwards compatibility.""" 3 | from ctparse.types import Artifact, Interval, Time 4 | from datetime import datetime 5 | from dateutil.relativedelta import relativedelta 6 | 7 | 8 | def apply_postprocessing_rules(ts: datetime, art: Artifact) -> Artifact: 9 | """Apply postprocessing rules to a resolution *art*. This is 10 | introduced for backwards compatibility reasons. 11 | 12 | Example: 13 | 14 | 8:00 pm, ts=2020.01.01 07:00 15 | 16 | produces a resolution: 17 | 18 | X-X-X 20:00 19 | 20 | after postprocessing this is anchored to the reference time: 21 | 22 | 2020-01-01 20:00 23 | """ 24 | if isinstance(art, Time): 25 | if art.isTOD: 26 | return _latent_tod(ts, art) 27 | if isinstance(art, Interval): 28 | if art.isTimeInterval: 29 | return _latent_time_interval(ts, art) 30 | 31 | return art 32 | 33 | 34 | def _latent_tod(ts: datetime, tod: Time) -> Time: 35 | dm = ts + relativedelta(hour=tod.hour, minute=tod.minute or 0) 36 | if dm <= ts: 37 | dm += relativedelta(days=1) 38 | 39 | res = Time( 40 | year=dm.year, month=dm.month, day=dm.day, hour=dm.hour, minute=dm.minute, period=tod.period, 41 | ) 42 | res.mstart = tod.mstart 43 | res.mend = tod.mend 44 | return res 45 | 46 | 47 | def _latent_time_interval(ts: datetime, ti: Interval) -> Interval: 48 | assert ti.t_from and ti.t_to # guaranteed by the caller 49 | dm_from = ts + relativedelta(hour=ti.t_from.hour, minute=ti.t_from.minute or 0) 50 | dm_to = ts + relativedelta(hour=ti.t_to.hour, minute=ti.t_to.minute or 0) 51 | if dm_from <= ts: 52 | dm_from += relativedelta(days=1) 53 | dm_to += relativedelta(days=1) 54 | 55 | # pm-am interval overlap 56 | if ti.t_from.period == "pm" and ti.t_to.period == "am": 57 | dm_to += relativedelta(days=1) 58 | 59 | res = Interval( 60 | t_from=Time( 61 | year=dm_from.year, 62 | month=dm_from.month, 63 | day=dm_from.day, 64 | hour=dm_from.hour, 65 | minute=dm_from.minute, 66 | period=ti.t_from.period, 67 | ), 68 | t_to=Time( 69 | year=dm_to.year, 70 | month=dm_to.month, 71 | day=dm_to.day, 72 | hour=dm_to.hour, 73 | minute=dm_to.minute, 74 | period=ti.t_to.period, 75 | ), 76 | ) 77 | res.mstart = ti.mstart 78 | res.mend = ti.mend 79 | return res 80 | -------------------------------------------------------------------------------- /ctparse/timers.py: -------------------------------------------------------------------------------- 1 | """Utilities for tracking time spent in functions. 2 | 3 | Although this module is not part of the public API, it is used in various parts of 4 | the ctparse package. 5 | 6 | """ 7 | from time import perf_counter 8 | from typing import Any, Callable, TypeVar, Union, Tuple 9 | from functools import wraps 10 | 11 | T = TypeVar("T") 12 | 13 | 14 | def timeout(timeout: Union[float, int]) -> Callable[[], None]: 15 | """Generate a functions that raises an exceptions if a timeout has passed. 16 | 17 | Example: 18 | 19 | sentinel = timeout(1.0) 20 | time.sleep(0.5) 21 | sentinel() # Do nothing 22 | time.sleep(0.6) 23 | sentinel() # Raises CTParseTimeoutException 24 | 25 | :param timeout: 26 | time in seconds. If it is equal to zero, it means to never raise an exception. 27 | :returns: 28 | A function that raises a `CTParseTimeoutException` if `timeout` seconds have 29 | expired. 30 | """ 31 | start_time = perf_counter() 32 | 33 | def _tt() -> None: 34 | if timeout == 0: 35 | return 36 | if perf_counter() - start_time > timeout: 37 | raise CTParseTimeoutError() 38 | 39 | return _tt 40 | 41 | 42 | def timeit(f: Callable[..., T]) -> Callable[..., Tuple[T, float]]: 43 | """Wrapper to time a function. 44 | 45 | The wrapped function is modified so that it returns a tuple `(f(args), t)` 46 | where `t` the time in seconds the function call took to run. 47 | 48 | Example: 49 | 50 | def fun(x): 51 | return x * x 52 | 53 | result, exec_time = timeit(fun)(3) 54 | 55 | """ 56 | 57 | @wraps(f) 58 | def _wrapper(*args: Any, **kwargs: Any) -> Tuple[T, float]: 59 | start_time = perf_counter() 60 | res = f(*args, **kwargs) 61 | return res, perf_counter() - start_time 62 | 63 | return _wrapper 64 | 65 | 66 | # NOTE: TimeoutError is a built-in exception that means that 67 | # system function timed out at the system level. Hence we opt 68 | # for a custom exception. 69 | class CTParseTimeoutError(Exception): 70 | """Exception raised by the `timeout` function.""" 71 | -------------------------------------------------------------------------------- /ctparse/types.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Any, Dict, Optional, Tuple, Type, TypeVar, List 3 | from dateutil.relativedelta import relativedelta 4 | from dateutil.rrule import rrule, YEARLY, MONTHLY, WEEKLY, DAILY 5 | 6 | import regex 7 | from regex import Regex 8 | import enum 9 | 10 | T = TypeVar("T", bound="Artifact") 11 | 12 | 13 | class Artifact: 14 | def __init__(self) -> None: 15 | self.mstart = 0 16 | self.mend = 0 17 | self._attrs = ["mstart", "mend"] 18 | 19 | def update_span(self: T, *args: "Artifact") -> T: 20 | self.mstart = args[2].mstart 21 | self.mend = args[-1].mend 22 | return self 23 | 24 | def __len__(self) -> int: 25 | return self.mend - self.mstart 26 | 27 | def __bool__(self) -> bool: 28 | return True 29 | 30 | def __str__(self) -> str: 31 | return "" 32 | 33 | def __repr__(self) -> str: 34 | return "{}[{}-{}]{{{}}}".format( 35 | self.__class__.__name__, self.mstart, self.mend, str(self) 36 | ) 37 | 38 | def nb_str(self) -> str: 39 | """Return a string representation without the bounds information.""" 40 | return "{}[]{{{}}}".format(self.__class__.__name__, str(self)) 41 | 42 | def __eq__(self, other: Any) -> bool: 43 | if type(other) != type(self): 44 | return False 45 | else: 46 | return all(getattr(self, a) == getattr(other, a) for a in self._attrs) 47 | 48 | def __hash__(self) -> int: 49 | return hash(tuple(getattr(self, a) for a in self._attrs)) 50 | 51 | def _hasOnly(self, *args: str) -> bool: 52 | """check that all attributes set to True are set (i.e. not None) and 53 | all set to False are not set (i.e. None) 54 | 55 | """ 56 | return all( 57 | getattr(self, a) is not None if a in args else getattr(self, a) is None 58 | for a in self._attrs 59 | ) 60 | 61 | def _hasAtLeast(self, *args: str) -> bool: 62 | """check that all attributes set to True are set (i.e. not None) and 63 | all set to False are not set (i.e. None) 64 | 65 | """ 66 | return all(getattr(self, a) is not None for a in args) 67 | 68 | 69 | class RegexMatch(Artifact): 70 | def __init__(self, id: int, m: Regex) -> None: 71 | super().__init__() 72 | self._attrs = ["mstart", "mend", "id"] 73 | self.key = "R{}".format(id) 74 | self.id = id 75 | self.match = m 76 | self.mstart = m.span(self.key)[0] 77 | self.mend = m.span(self.key)[1] 78 | self._text = m.group(self.key) 79 | 80 | def __str__(self) -> str: 81 | return "{}:{}".format(self.id, self._text) 82 | 83 | 84 | _pod_hours = { 85 | "earlymorning": { 86 | "offset": (4, 7), 87 | "early": { 88 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 89 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 90 | "very": {"offset": (0, 0)}, 91 | "offset": (-1, -1), 92 | }, 93 | "late": { 94 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 95 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 96 | "very": {"offset": (0, 0)}, 97 | "offset": (1, 1), 98 | }, 99 | }, 100 | "morning": { 101 | "offset": (6, 9), 102 | "early": { 103 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 104 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 105 | "very": {"offset": (0, 0)}, 106 | "offset": (-1, -1), 107 | }, 108 | "late": { 109 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 110 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 111 | "very": {"offset": (0, 0)}, 112 | "offset": (1, 1), 113 | }, 114 | }, 115 | "forenoon": { 116 | "offset": (9, 12), 117 | "early": { 118 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 119 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 120 | "very": {"offset": (0, 0)}, 121 | "offset": (-1, -1), 122 | }, 123 | "late": { 124 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 125 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 126 | "very": {"offset": (0, 0)}, 127 | "offset": (1, 1), 128 | }, 129 | }, 130 | "noon": { 131 | "offset": (11, 13), 132 | "early": { 133 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 134 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 135 | "very": {"offset": (0, 0)}, 136 | "offset": (-1, -1), 137 | }, 138 | "late": { 139 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 140 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 141 | "very": {"offset": (0, 0)}, 142 | "offset": (1, 1), 143 | }, 144 | }, 145 | "afternoon": { 146 | "offset": (12, 17), 147 | "early": { 148 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 149 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 150 | "very": {"offset": (0, 0)}, 151 | "offset": (-1, -1), 152 | }, 153 | "late": { 154 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 155 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 156 | "very": {"offset": (0, 0)}, 157 | "offset": (1, 1), 158 | }, 159 | }, 160 | "evening": { 161 | "offset": (17, 20), 162 | "early": { 163 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 164 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 165 | "very": {"offset": (0, 0)}, 166 | "offset": (-1, -1), 167 | }, 168 | "late": { 169 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 170 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 171 | "very": {"offset": (0, 0)}, 172 | "offset": (1, 1), 173 | }, 174 | }, 175 | "lateevening": { 176 | "offset": (18, 21), 177 | "early": { 178 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 179 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 180 | "very": {"offset": (0, 0)}, 181 | "offset": (-1, -1), 182 | }, 183 | "late": { 184 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 185 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 186 | "very": {"offset": (0, 0)}, 187 | "offset": (1, 1), 188 | }, 189 | }, 190 | "night": { 191 | "offset": (19, 22), 192 | "early": { 193 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 194 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 195 | "very": {"offset": (0, 0)}, 196 | "offset": (-1, -1), 197 | }, 198 | "late": { 199 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 200 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 201 | "very": {"offset": (0, 0)}, 202 | "offset": (1, 1), 203 | }, 204 | }, 205 | "first": { 206 | "offset": (0, 0), 207 | "early": { 208 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 209 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 210 | "very": {"offset": (0, 0)}, 211 | "offset": (0, 0), 212 | }, 213 | "late": { 214 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 215 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 216 | "very": {"offset": (0, 0)}, 217 | "offset": (0, 0), 218 | }, 219 | }, 220 | "last": { 221 | "offset": (23, 23), 222 | "early": { 223 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 224 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 225 | "very": {"offset": (0, 0)}, 226 | "offset": (0, 0), 227 | }, 228 | "late": { 229 | "early": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 230 | "late": {"offset": (0, 0), "very": {"offset": (0, 0)}}, 231 | "very": {"offset": (0, 0)}, 232 | "offset": (0, 0), 233 | }, 234 | }, 235 | } 236 | 237 | 238 | def _mk_pod_hours() -> Dict[str, Tuple[int, int]]: 239 | def _add_ts(t1: Tuple[int, int], t2: Tuple[int, int]) -> Tuple[int, int]: 240 | return (t1[0] + t2[0], t1[1] + t2[1]) 241 | 242 | def _mk( 243 | pod: str, pod_data: Dict[str, Any], t: Tuple[int, int] 244 | ) -> Dict[str, Tuple[int, int]]: 245 | r = {pod: _add_ts(t, pod_data["offset"])} 246 | for k, v in pod_data.items(): 247 | if k == "offset": 248 | continue 249 | r.update(_mk(k + pod, v, r[pod])) 250 | return r 251 | 252 | res = {} 253 | for k, v in _pod_hours.items(): 254 | if k == "offset": 255 | continue 256 | res.update(_mk(k, v, (0, 0))) 257 | return res 258 | 259 | 260 | pod_hours = _mk_pod_hours() 261 | 262 | 263 | _TIME_REGEX = regex.compile( 264 | r"(\d{4}|X)-(\d{2}|X)-(\d{2}|X) (\d{2}|X):(\d{2}|X) \((\d|X)\/(\w+)\)" 265 | ) 266 | 267 | 268 | class Time(Artifact): 269 | def __init__( 270 | self, 271 | year: Optional[int] = None, 272 | month: Optional[int] = None, 273 | day: Optional[int] = None, 274 | hour: Optional[int] = None, 275 | minute: Optional[int] = None, 276 | DOW: Optional[int] = None, 277 | POD: Optional[str] = None, 278 | period: Optional[str] = None, 279 | ) -> None: 280 | super().__init__() 281 | self._attrs = ["year", "month", "day", "hour", "minute", "DOW", "POD", "period"] 282 | # Might add some validation here, did not to avoid the overhead 283 | self.year = year 284 | self.month = month 285 | self.day = day 286 | self.hour = hour 287 | self.minute = minute 288 | self.DOW = DOW 289 | self.POD = POD 290 | self.period = period 291 | 292 | # ----------------------------------------------------------------------------- 293 | # Make sure to not accidentially test bool(x) as False when x==0, but you meant 294 | # x==None 295 | # ----------------------------------------------------------------------------- 296 | @property 297 | def isDOY(self) -> bool: 298 | """isDayOfYear <=> a dd.mm but not year 299 | """ 300 | return self._hasOnly("month", "day") 301 | 302 | @property 303 | def isDOM(self) -> bool: 304 | """isDayOfMonth <=> a dd but no month 305 | """ 306 | return self._hasOnly("day") 307 | 308 | @property 309 | def isDOW(self) -> bool: 310 | """isDayOfWeek <=> DOW is the 0=Monday index; fragile test, as the DOW 311 | could be accompanied by e.g. a full date etc.; in practice, 312 | however, the production rules do not do that. 313 | 314 | """ 315 | return self._hasOnly("DOW") 316 | 317 | @property 318 | def isMonth(self) -> bool: 319 | return self._hasOnly("month") 320 | 321 | @property 322 | def isPOD(self) -> bool: 323 | """isPartOfDay <=> morning, etc.; fragile, tests only that there is a 324 | POD and neither a full date nor a full time 325 | """ 326 | return self._hasOnly("POD") 327 | 328 | @property 329 | def isHour(self) -> bool: 330 | """only has an hour""" 331 | return self._hasOnly("hour") 332 | 333 | @property 334 | def isTOD(self) -> bool: 335 | """isTimeOfDay - only a time, not date""" 336 | return self._hasOnly("hour") or self._hasOnly("hour", "minute") or self._hasOnly("hour", "period") or self._hasOnly("hour", "minute", "period") 337 | 338 | @property 339 | def isDate(self) -> bool: 340 | """isDate - only a date, not time""" 341 | return self._hasOnly("year", "month", "day") 342 | 343 | @property 344 | def isDateTime(self) -> bool: 345 | """a date and a time""" 346 | return self._hasOnly("year", "month", "day", "hour") or self._hasOnly( 347 | "year", "month", "day", "hour", "minute" 348 | ) 349 | 350 | @property 351 | def isYear(self) -> bool: 352 | """just a year""" 353 | return self._hasOnly("year") 354 | 355 | @property 356 | def hasDate(self) -> bool: 357 | """at least a date""" 358 | return self._hasAtLeast("year", "month", "day") 359 | 360 | @property 361 | def hasDOY(self) -> bool: 362 | """at least a day of year""" 363 | return self._hasAtLeast("month", "day") 364 | 365 | @property 366 | def hasDOW(self) -> bool: 367 | """at least a day of week""" 368 | return self._hasAtLeast("DOW") 369 | 370 | @property 371 | def hasTime(self) -> bool: 372 | """at least a time to the hour""" 373 | return self._hasAtLeast("hour") or self._hasOnly("hour", "period") 374 | 375 | @property 376 | def hasPeriod(self) -> bool: 377 | """at least a period""" 378 | return self._hasAtLeast("period") 379 | 380 | @property 381 | def hasPOD(self) -> bool: 382 | """at least a part of day""" 383 | return self._hasAtLeast("POD") 384 | 385 | def __str__(self) -> str: 386 | return "{}-{}-{} {}:{} ({}/{})".format( 387 | "{:04d}".format(self.year) if self.year is not None else "X", 388 | "{:02d}".format(self.month) if self.month is not None else "X", 389 | "{:02d}".format(self.day) if self.day is not None else "X", 390 | "{:02d}".format(self.hour) if self.hour is not None else "X", 391 | "{:02d}".format(self.minute) if self.minute is not None else "X", 392 | "{:d}".format(self.DOW) if self.DOW is not None else "X", 393 | "{}".format(self.POD) if self.POD is not None else "X", 394 | ) 395 | 396 | @classmethod 397 | def from_str(cls: Type["Time"], text: str) -> "Time": 398 | match = _TIME_REGEX.match(text) 399 | if not match: 400 | raise ValueError("Invalid format") 401 | else: 402 | 403 | def parse_opt_int(x: str) -> Optional[int]: 404 | return None if x == "X" else int(x) 405 | 406 | pod = match.group(7) 407 | return cls( 408 | year=parse_opt_int(match.group(1)), 409 | month=parse_opt_int(match.group(2)), 410 | day=parse_opt_int(match.group(3)), 411 | hour=parse_opt_int(match.group(4)), 412 | minute=parse_opt_int(match.group(5)), 413 | DOW=parse_opt_int(match.group(6)), 414 | POD=None if pod == "X" else pod, 415 | ) 416 | 417 | @property 418 | def start(self) -> "Time": 419 | if self.hour is None and self.hasPOD: 420 | hour = pod_hours[self.POD][0] # type: ignore 421 | else: 422 | hour = self.hour or 0 423 | return Time( 424 | year=self.year, 425 | month=self.month, 426 | day=self.day, 427 | hour=hour, 428 | minute=self.minute or 0, 429 | period=self.period, 430 | ) 431 | 432 | @property 433 | def end(self) -> "Time": 434 | if self.hour is None and self.hasPOD: 435 | hour = pod_hours[self.POD][1] # type: ignore 436 | else: 437 | hour = self.hour if self.hour is not None else 23 438 | return Time( 439 | year=self.year, 440 | month=self.month, 441 | day=self.day, 442 | hour=hour, 443 | minute=self.minute if self.minute is not None else 59, 444 | period=self.period, 445 | ) 446 | 447 | @property 448 | def dt(self) -> datetime: 449 | # Use the start time, in case we have a POD specification 450 | t = self.start 451 | if t.year is None or t.month is None or t.day is None: 452 | raise ValueError( 453 | "cannot convert underspecified Time into datetime" 454 | ", missing at least one of year, month or day" 455 | ) 456 | return datetime(t.year, t.month, t.day, t.hour or 0, t.minute or 0) 457 | 458 | 459 | class Interval(Artifact): 460 | def __init__( 461 | self, t_from: Optional[Time] = None, t_to: Optional[Time] = None 462 | ) -> None: 463 | super().__init__() 464 | self._attrs = ["t_from", "t_to"] 465 | self.t_from = t_from 466 | self.t_to = t_to 467 | 468 | @property 469 | def isTimeInterval(self) -> bool: 470 | if self.t_from is None or self.t_to is None: 471 | return False 472 | else: 473 | return self.t_from.isTOD and self.t_to.isTOD 474 | 475 | @property 476 | def isDateInterval(self) -> bool: 477 | if self.t_from is None or self.t_to is None: 478 | return False 479 | return self.t_from.isDate and self.t_to.isDate 480 | 481 | def __str__(self) -> str: 482 | return "{} - {}".format(str(self.t_from), str(self.t_to)) 483 | 484 | @classmethod 485 | def from_str(cls: Type["Interval"], text: str) -> "Interval": 486 | bounds = text.split(" - ") 487 | if len(bounds) != 2: 488 | raise ValueError("Invalid format") 489 | 490 | t_from = None if bounds[0] == "None" else Time.from_str(bounds[0]) 491 | t_to = None if bounds[1] == "None" else Time.from_str(bounds[1]) 492 | return cls(t_from=t_from, t_to=t_to) 493 | 494 | @property 495 | def start(self) -> Optional[Time]: 496 | if self.t_from is not None: 497 | return self.t_from.start 498 | else: 499 | return None 500 | 501 | @property 502 | def end(self) -> Optional[Time]: 503 | if self.t_to is not None: 504 | return self.t_to.end 505 | else: 506 | return None 507 | 508 | 509 | @enum.unique 510 | class DurationUnit(enum.Enum): 511 | MINUTES = "minutes" 512 | HOURS = "hours" 513 | DAYS = "days" 514 | NIGHTS = "nights" 515 | WEEKS = "weeks" 516 | MONTHS = "months" 517 | YEARS = "years" 518 | 519 | 520 | class Duration(Artifact): 521 | def __init__(self, value: int, unit: DurationUnit): 522 | """Create a Duration using value and unit. 523 | 524 | Typical values for unit are: 525 | 526 | minute, hour, day, night, week, month, year 527 | """ 528 | super().__init__() 529 | self.value = value 530 | self.unit = unit 531 | 532 | def __str__(self) -> str: 533 | return "{} {}".format(self.value, self.unit.value) 534 | 535 | @classmethod 536 | def from_str(cls: Type["Duration"], text: str) -> "Duration": 537 | value, unit = text.split() 538 | return Duration(int(value), DurationUnit(unit)) 539 | 540 | def time(self, ts: datetime) -> Time: 541 | if self.unit == DurationUnit.MINUTES: 542 | dm = ts + relativedelta(minutes=+self.value) 543 | return Time(year=dm.year, month=dm.month, day=dm.day, hour=dm.hour, minute=dm.minute) 544 | if self.unit == DurationUnit.HOURS: 545 | dm = ts + relativedelta(hours=+self.value) 546 | return Time(year=dm.year, month=dm.month, day=dm.day, hour=dm.hour, minute=dm.minute) 547 | if self.unit == DurationUnit.DAYS: 548 | dm = ts + relativedelta(days=+self.value) 549 | if self.unit == DurationUnit.WEEKS: 550 | dm = ts + relativedelta(days=+self.value*7) 551 | if self.unit == DurationUnit.MONTHS: 552 | dm = ts + relativedelta(months=+self.value) 553 | if self.unit == DurationUnit.YEARS: 554 | dm = ts + relativedelta(years=+self.value) 555 | 556 | return Time(year=dm.year, month=dm.month, day=dm.day) 557 | 558 | 559 | @enum.unique 560 | class RecurringFrequency(enum.Enum): 561 | DAILY = "DAILY" 562 | WEEKLY = "WEEKLY" 563 | MONTHLY = "MONTHLY" 564 | YEARLY = "YEARLY" 565 | 566 | 567 | class Recurring(Artifact): 568 | def __init__( 569 | self, 570 | frequency: Optional[RecurringFrequency] = None, 571 | interval: Optional[int] = None, 572 | start_time: Optional[Time] = None, 573 | end_time: Optional[Time] = None, 574 | byday: Optional[tuple[int, ...]] = None, 575 | ): 576 | super().__init__() 577 | self._attrs = ['start_time', 'end_time', 'frequency', 'interval', 'byday'] 578 | self.start_time = start_time 579 | self.end_time = end_time 580 | self.frequency = frequency 581 | self.interval = interval 582 | self.frequency_map = {RecurringFrequency.DAILY.value: DAILY, 583 | RecurringFrequency.WEEKLY.value: WEEKLY, 584 | RecurringFrequency.MONTHLY.value: MONTHLY, 585 | RecurringFrequency.YEARLY.value: YEARLY} 586 | self.byday = byday 587 | 588 | def __str__(self) -> str: 589 | return "{} {} {} {}".format(self.frequency, self.interval, self.start_time, self.end_time, self.byday) 590 | 591 | @property 592 | def isRecurring(self) -> bool: 593 | if self.frequency and self.interval is None: 594 | return False 595 | else: 596 | return True 597 | 598 | @property 599 | def isRecurringDOW(self) -> bool: 600 | if not self.start_time.DOW: 601 | return False 602 | else: 603 | return True 604 | 605 | @property 606 | def isRecurringTime(self) -> bool: 607 | if not self.start_time.hasTime: 608 | return False 609 | else: 610 | return True 611 | 612 | def to_rrule(self) -> rrule: 613 | r_rule = rrule(freq=self.frequency_map[self.frequency], interval=self.interval, byweekday=self.byday) 614 | r_rule = r_rule.__str__().split('\n')[1] 615 | return r_rule 616 | 617 | 618 | class RecurringArray(Artifact): 619 | def __init__(self, 620 | rec_1: Optional[Recurring] = None, 621 | rec_2: Optional[Recurring] = None, 622 | rec_3: Optional[Recurring] = None, 623 | rec_4: Optional[Recurring] = None, 624 | rec_5: Optional[Recurring] = None, 625 | ): 626 | super().__init__() 627 | self._attrs = ['rec_1', 'rec_2', 'rec_3', 'rec_4', 'rec_5'] 628 | self.rec_1 = rec_1 629 | self.rec_2 = rec_2 630 | self.rec_3 = rec_3 631 | self.rec_4 = rec_4 632 | self.rec_5 = rec_5 633 | 634 | def __str__(self) -> str: 635 | return "\n Recurring instance: {} \n Recurring instance: {} \n Recurring instance: {} \n Recurring instance: {} \n Recurring instance: {}".format(self.rec_1, self.rec_2, self.rec_3, self.rec_4, self.rec_5) 636 | 637 | @property 638 | def to_list(self) -> list: 639 | array = [self.rec_1, self.rec_2, self.rec_3, self.rec_4, self.rec_5] 640 | array = [i for i in array if i != None] 641 | return array 642 | -------------------------------------------------------------------------------- /datasets/README.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Time Parse Dataset 3 | ================== 4 | 5 | The dataset included in ``datasets/timeparse_corpus.json`` contains a set of ~2000 human annotated time expression in english and german. 6 | 7 | The dataset is a list of json records with the following fields: 8 | 9 | - *text*: the text for the time expression 10 | - *ref_time*: a timestamp in ISO 8601 format ``YYYY-MM-DDTHH:MM:SS`` 11 | - *gold_parse*: the human annotation of the time expression. It can be a ``Time`` or ``Interval``. 12 | - *language*: a two-digit code indicating the language. In this dataset it is either "en" or "de". 13 | 14 | 15 | For ``Time``, the format is as follows:: 16 | 17 | Time[]{YYYY-MM-DD HH:MM (dow/tod)} 18 | 19 | Where: 20 | - ``YYYY`` is a four-digit year or ``X``, if year is missing 21 | - ``MM`` is a two-digit month or ``X``, if month is missing 22 | - ``DD`` is a two-digit day or ``X``, if day is missing 23 | - ``HH`` is a two-digit hour (24 hour clock) or ``X``, if hour is missing 24 | - ``MM`` is a two-digit minute or ``X``, if minute is missing 25 | - ``dow`` is an integer between 0 and 6 representing day of week or X, if missing (in the dataset, day of week is always missing) 26 | - ``tod`` is a string representing the time of day (such as earlymorning, morning, forenoon, noon, afternoon, evening, lateevening) or X if not specified. 27 | 28 | Example:: 29 | 30 | Morning of the 11th June 2017 31 | Time[]{2017-06-11 X:X (X/morning)} 32 | 33 | For ``Interval`` the format is as follows:: 34 | 35 | Interval[]{ - } 36 | 37 | Where ```` and ```` are the beginning and end of the interval. ```` or ```` can be None if the interval is open-ended. They can be specified 38 | using the same representation for times, as described above:: 39 | 40 | YYYY-MM-DD HH:MM (dow/tod) 41 | 42 | Example:: 43 | 44 | Wed, Oct 11 2017 8:30 PM - 9:47 PM 45 | Interval[]{2017-10-11 08:30 (X/X) - 2017-10-11 09:47 (X/X)} -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = ctparse 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # ctparse documentation build configuration file, created by 5 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another 17 | # directory, add these directories to sys.path here. If the directory is 18 | # relative to the documentation root, use os.path.abspath to make it 19 | # absolute, like shown here. 20 | # 21 | import os 22 | import sys 23 | sys.path.insert(0, os.path.abspath('..')) 24 | 25 | import ctparse # noqa 26 | 27 | # -- General configuration --------------------------------------------- 28 | 29 | # If your documentation needs a minimal Sphinx version, state it here. 30 | # 31 | # needs_sphinx = '1.0' 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon'] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # The suffix(es) of source filenames. 41 | # You can specify multiple suffix as a list of string: 42 | # 43 | # source_suffix = ['.rst', '.md'] 44 | source_suffix = '.rst' 45 | 46 | # The master toctree document. 47 | master_doc = 'index' 48 | 49 | # General information about the project. 50 | project = u'ctparse - Parse natural language time expressions' 51 | copyright = u"2018, Sebastian Mika, Comtravo GmbH" 52 | author = u"Sebastian Mika - Comtravo" 53 | 54 | # The version info for the project you're documenting, acts as replacement 55 | # for |version| and |release|, also used in various other places throughout 56 | # the built documents. 57 | # 58 | # The short X.Y version. 59 | version = ctparse.__version__ 60 | # The full version, including alpha/beta/rc tags. 61 | release = ctparse.__version__ 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This patterns also effect to html_static_path and html_extra_path 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | # If true, `todo` and `todoList` produce output, else they produce nothing. 79 | todo_include_todos = False 80 | 81 | 82 | # -- Options for HTML output ------------------------------------------- 83 | 84 | # The theme to use for HTML and HTML Help pages. See the documentation for 85 | # a list of builtin themes. 86 | # 87 | html_theme = 'sphinx_rtd_theme' 88 | 89 | # Theme options are theme-specific and customize the look and feel of a 90 | # theme further. For a list of options available for each theme, see the 91 | # documentation. 92 | # 93 | # html_theme_options = {} 94 | 95 | # Add any paths that contain custom static files (such as style sheets) here, 96 | # relative to this directory. They are copied after the builtin static files, 97 | # so a file named "default.css" will overwrite the builtin "default.css". 98 | html_static_path = ['_static'] 99 | 100 | 101 | # -- Options for HTMLHelp output --------------------------------------- 102 | 103 | # Output file base name for HTML help builder. 104 | htmlhelp_basename = 'ctparsedoc' 105 | 106 | 107 | # -- Options for LaTeX output ------------------------------------------ 108 | 109 | latex_elements = { 110 | # The paper size ('letterpaper' or 'a4paper'). 111 | # 112 | # 'papersize': 'letterpaper', 113 | 114 | # The font size ('10pt', '11pt' or '12pt'). 115 | # 116 | # 'pointsize': '10pt', 117 | 118 | # Additional stuff for the LaTeX preamble. 119 | # 120 | # 'preamble': '', 121 | 122 | # Latex figure (float) alignment 123 | # 124 | # 'figure_align': 'htbp', 125 | } 126 | 127 | # Grouping the document tree into LaTeX files. List of tuples 128 | # (source start file, target name, title, author, documentclass 129 | # [howto, manual, or own class]). 130 | latex_documents = [ 131 | (master_doc, 'ctparse.tex', 132 | u'ctparse - Parse natural language time expressions in pytho Documentation', 133 | u'Sebastian Mika', 'manual'), 134 | ] 135 | 136 | 137 | # -- Options for manual page output ------------------------------------ 138 | 139 | # One entry per manual page. List of tuples 140 | # (source start file, name, description, authors, manual section). 141 | man_pages = [ 142 | (master_doc, 'ctparse', 143 | u'ctparse - Parse natural language time expressions Documentation', 144 | [author], 1) 145 | ] 146 | 147 | 148 | # -- Options for Texinfo output ---------------------------------------- 149 | 150 | # Grouping the document tree into Texinfo files. List of tuples 151 | # (source start file, target name, title, author, 152 | # dir menu entry, description, category) 153 | texinfo_documents = [ 154 | (master_doc, 'ctparse', 155 | u'ctparse - Parse natural language time expressions Documentation', 156 | author, 157 | 'ctparse', 158 | 'One line description of project.', 159 | 'Miscellaneous'), 160 | ] 161 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/ctparse.rst: -------------------------------------------------------------------------------- 1 | ctparse package 2 | =============== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | ctparse.time 10 | 11 | Submodules 12 | ---------- 13 | 14 | ctparse.corpus module 15 | --------------------- 16 | 17 | .. automodule:: ctparse.corpus 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | ctparse.count\_vectorizer module 23 | -------------------------------- 24 | 25 | .. automodule:: ctparse.count_vectorizer 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | ctparse.ctparse module 31 | ---------------------- 32 | 33 | .. automodule:: ctparse.ctparse 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | ctparse.loader module 39 | --------------------- 40 | 41 | .. automodule:: ctparse.loader 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | ctparse.nb\_estimator module 47 | ---------------------------- 48 | 49 | .. automodule:: ctparse.nb_estimator 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | ctparse.nb\_scorer module 55 | ------------------------- 56 | 57 | .. automodule:: ctparse.nb_scorer 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | ctparse.partial\_parse module 63 | ----------------------------- 64 | 65 | .. automodule:: ctparse.partial_parse 66 | :members: 67 | :undoc-members: 68 | :show-inheritance: 69 | 70 | ctparse.pipeline module 71 | ----------------------- 72 | 73 | .. automodule:: ctparse.pipeline 74 | :members: 75 | :undoc-members: 76 | :show-inheritance: 77 | 78 | ctparse.rule module 79 | ------------------- 80 | 81 | .. automodule:: ctparse.rule 82 | :members: 83 | :undoc-members: 84 | :show-inheritance: 85 | 86 | ctparse.scorer module 87 | --------------------- 88 | 89 | .. automodule:: ctparse.scorer 90 | :members: 91 | :undoc-members: 92 | :show-inheritance: 93 | 94 | ctparse.timers module 95 | --------------------- 96 | 97 | .. automodule:: ctparse.timers 98 | :members: 99 | :undoc-members: 100 | :show-inheritance: 101 | 102 | ctparse.types module 103 | -------------------- 104 | 105 | .. automodule:: ctparse.types 106 | :members: 107 | :undoc-members: 108 | :show-inheritance: 109 | 110 | 111 | Module contents 112 | --------------- 113 | 114 | .. automodule:: ctparse 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | -------------------------------------------------------------------------------- /docs/ctparse.time.rst: -------------------------------------------------------------------------------- 1 | ctparse.time package 2 | ==================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | ctparse.time.corpus module 8 | -------------------------- 9 | 10 | .. automodule:: ctparse.time.corpus 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | ctparse.time.rules module 16 | ------------------------- 17 | 18 | .. automodule:: ctparse.time.rules 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: ctparse.time 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/dataset.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../datasets/README.rst -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to the ctparse documentation! 2 | ===================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | readme 9 | installation 10 | usage 11 | dataset 12 | contributing 13 | modules 14 | authors 15 | history 16 | 17 | Indices and tables 18 | ================== 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Stable release 9 | -------------- 10 | 11 | To install ctparse - Parse natural language time expressions in pytho, run this command in your terminal: 12 | 13 | .. code-block:: console 14 | 15 | $ pip install ctparse 16 | 17 | This is the preferred method to install ctparse - Parse natural language time expressions in pytho, as it will always install the most recent stable release. 18 | 19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 20 | you through the process. 21 | 22 | .. _pip: https://pip.pypa.io 23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 24 | 25 | 26 | From sources 27 | ------------ 28 | 29 | The sources for ctparse - Parse natural language time expressions in pytho can be downloaded from the `Github repo`_. 30 | 31 | You can either clone the public repository: 32 | 33 | .. code-block:: console 34 | 35 | $ git clone git://github.com/comtravo/ctparse 36 | 37 | Or download the `tarball`_: 38 | 39 | .. code-block:: console 40 | 41 | $ curl -OL https://github.com/comtravo/ctparse/tarball/master 42 | 43 | Once you have a copy of the source, you can install it with: 44 | 45 | .. code-block:: console 46 | 47 | $ python setup.py install 48 | 49 | 50 | .. _Github repo: https://github.com/comtravo/ctparse 51 | .. _tarball: https://github.com/comtravo/ctparse/tarball/master 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=ctparse 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | ctparse 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | ctparse 8 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | To use ctparse simply import the main ``ctparse`` function:: 6 | 7 | 8 | from datetime import datetime 9 | from ctparse import ctparse 10 | 11 | ctparse('today', datetime(2018, 7, 8), timeout=1) 12 | 13 | The output for the above code is `2018-07-08 X:X (X/X) s=2.273 p=(149, 'ruleToday')` 14 | 15 | For more details on the parameters please see the docstrings. 16 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | # Specify the target platform details in config, so your developers are 3 | # free to run mypy on Windows, Linux, or macOS and get consistent 4 | # results. 5 | python_version=3.6 6 | platform=linux 7 | 8 | # flake8-mypy expects the two following for sensible formatting 9 | show_column_numbers=True 10 | 11 | # show error messages from unrelated files 12 | follow_imports=normal 13 | 14 | # suppress errors about unsatisfied imports 15 | ignore_missing_imports=True 16 | 17 | # be strict 18 | disallow_untyped_calls=True 19 | warn_return_any=True 20 | strict_optional=True 21 | warn_no_return=True 22 | warn_redundant_casts=True 23 | warn_unused_ignores=True 24 | disallow_any_generics=True 25 | disallow_untyped_defs=True 26 | check_untyped_defs=True 27 | 28 | # No incremental mode 29 | cache_dir=/dev/null 30 | 31 | [mypy-ctparse.time.rules] 32 | # time rules check existence of fields in predicates 33 | strict_optional=False 34 | 35 | [mypy-tests.*] 36 | disallow_untyped_defs=False 37 | 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . 2 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | pip==20.1.1 2 | bumpversion==0.6.0 3 | watchdog==0.10.2 4 | flake8==3.8.2 5 | flake8-bugbear==20.1.4 6 | tox==3.15.1 7 | coverage==5.1 8 | sphinx==3.0.4 9 | sphinx-rtd-theme==0.4.3 10 | 11 | twine==3.1.1 12 | 13 | pytest==5.4.2 14 | pytest-runner==5.2 15 | pytest-cov==2.9.0 16 | mypy==0.770 17 | black==19.10b0 18 | -------------------------------------------------------------------------------- /scripts/train_default_model.py: -------------------------------------------------------------------------------- 1 | """Train a default multinomial bayes classifier""" 2 | import argparse 3 | import logging 4 | 5 | from ctparse.corpus import load_timeparse_corpus, make_partial_rule_dataset, run_corpus 6 | from ctparse.loader import DEFAULT_MODEL_FILE 7 | from ctparse.nb_scorer import save_naive_bayes, train_naive_bayes 8 | from ctparse.scorer import DummyScorer 9 | from ctparse.time import auto_corpus, corpus 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument( 17 | "--legacy", 18 | help="Use legacy dataset (ctparse.time.corpus and ctparse.time.auto_corpus as training data)", 19 | action="store_true", 20 | ) 21 | parser.add_argument("--dataset", help="Dataset file") 22 | return parser.parse_args() 23 | 24 | 25 | def main(): 26 | args = parse_args() 27 | logging.basicConfig( 28 | level=logging.INFO, format="%(asctime)s %(levelname)s [%(name)s] %(message)s" 29 | ) 30 | 31 | X_combined = [] 32 | y_combined = [] 33 | 34 | if args.legacy: 35 | logger.info("Loading legacy dataset") 36 | X, y = run_corpus(corpus.corpus + auto_corpus.corpus) 37 | X_combined.extend(X) 38 | y_combined.extend(y) 39 | 40 | if args.dataset: 41 | logger.info("Loading dataset {}".format(args.dataset)) 42 | entries = load_timeparse_corpus(args.dataset) 43 | X, y = zip( 44 | *make_partial_rule_dataset( 45 | entries, 46 | scorer=DummyScorer(), 47 | timeout=30, 48 | max_stack_depth=100, 49 | progress=True, 50 | ) 51 | ) 52 | X_combined.extend(X) 53 | y_combined.extend(y) 54 | 55 | if len(X) == 0: 56 | raise ValueError("Need to specify at least a dataset for training") 57 | 58 | mdl = train_naive_bayes(X_combined, y_combined) 59 | save_naive_bayes(mdl, DEFAULT_MODEL_FILE) 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 0.3.01 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:ctparse/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | max-line-length = 80 20 | select = C,E,F,W,B,B950 21 | ignore = E203,E266,E501,W503 22 | mypy_config = mypy.ini 23 | 24 | [aliases] 25 | test = pytest 26 | 27 | ;[tool:pytest] 28 | ;collect_ignore = ['setup.py'] 29 | 30 | [coverage:run] 31 | include = ctparse/* 32 | 33 | [coverage:report] 34 | show_missing = True 35 | fail_under = 95 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """The setup script.""" 5 | 6 | from setuptools import setup, find_packages 7 | 8 | with open('README.rst') as readme_file: 9 | readme = readme_file.read() 10 | 11 | with open('HISTORY.rst') as history_file: 12 | history = history_file.read() 13 | 14 | requirements = [] 15 | 16 | setup_requirements = ['pytest-runner', ] 17 | 18 | test_requirements = ['pytest', ] 19 | 20 | setup( 21 | author="Sebastian Mika/Comtravo", 22 | author_email='sebastian.mika@comtravo.com', 23 | classifiers=[ 24 | 'Development Status :: 3 - Alpha', 25 | 'Intended Audience :: Developers', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Natural Language :: English', 28 | 'Programming Language :: Python :: 3', 29 | 'Programming Language :: Python :: 3.6', 30 | 'Programming Language :: Python :: 3.7', 31 | 'Programming Language :: Python :: 3.8', 32 | 'Topic :: Software Development :: Libraries :: Python Modules', 33 | 'Topic :: Text Processing :: Linguistic', 34 | ], 35 | description="Parse natural language time expressions in python", 36 | install_requires=[ 37 | 'python-dateutil>=2.7.3,<3.0.0', 38 | 'regex>=2018.6.6', 39 | 'tqdm>=4.23.4,<5.0.0' 40 | ], 41 | license="MIT license", 42 | long_description=readme + '\n\n' + history, 43 | include_package_data=True, 44 | keywords='quickadd', 45 | name='quickadd', 46 | packages=find_packages(include=['ctparse*']), 47 | package_dir={'ctparse': 'ctparse'}, 48 | package_data={'ctparse': ['models/model.pbz', 'py.typed']}, 49 | setup_requires=setup_requirements, 50 | test_suite='tests', 51 | tests_require=test_requirements, 52 | url='https://github.com/inferense/quickadd', 53 | version='0.6.5', 54 | zip_safe=False, 55 | ) 56 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Acreom/quickadd/69543c79ad5db05a712abf223940fadf61740235/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_corpus.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pytest 4 | 5 | from ctparse.corpus import ( 6 | TimeParseEntry, 7 | load_timeparse_corpus, 8 | make_partial_rule_dataset, 9 | parse_nb_string, 10 | run_corpus, 11 | ) 12 | from ctparse.scorer import DummyScorer 13 | from ctparse.time.corpus import corpus 14 | from ctparse.types import Interval, Time 15 | 16 | CORPUS_JSON = """ 17 | [ 18 | { 19 | "text": "Donnerstag, den 05.10. ca 6:55", 20 | "ref_time": "2017-09-25T16:06:55", 21 | "gold_parse": "Time[]{2017-10-05 06:55 (X/X)}", 22 | "language": "de" 23 | }, 24 | { 25 | "text": "22.05.2017 früh", 26 | "ref_time": "2017-05-16T05:42:09", 27 | "gold_parse": "Time[]{2017-05-22 X:X (X/earlymorning)}", 28 | "language": "de" 29 | } 30 | ] 31 | """ 32 | 33 | 34 | def test_run_corpus() -> None: 35 | """The corpus passes if ctparse generates the desired 36 | solution for each test at least once. Otherwise it fails. 37 | """ 38 | X, y = run_corpus(corpus) 39 | assert isinstance(y[0], bool) 40 | assert isinstance(X[0][0], str) 41 | 42 | 43 | def test_run_corpus_failure() -> None: 44 | fail_corpus = [("never produced", "2015-12-12T12:30", ("today", "heute"))] 45 | with pytest.raises(Exception): 46 | run_corpus(fail_corpus) 47 | 48 | 49 | def test_make_partial_rule_dataset() -> None: 50 | ts = datetime(year=2019, month=10, day=1) 51 | entries = [ 52 | TimeParseEntry( 53 | "today at 5 pm", ts, Time(year=2019, month=10, day=1, hour=17, minute=0) 54 | ) 55 | ] 56 | 57 | X, y = zip( 58 | *make_partial_rule_dataset( 59 | entries, timeout=0, max_stack_depth=0, scorer=DummyScorer() 60 | ) 61 | ) 62 | assert isinstance(y[0], bool) 63 | assert isinstance(X[0][0], str) 64 | 65 | 66 | def test_parse_nb_string() -> None: 67 | t = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod") 68 | 69 | assert t == parse_nb_string("Time[]{0001-01-01 01:01 (1/pod)}") 70 | assert Interval(Time(), Time()) == parse_nb_string( 71 | "Interval[]{X-X-X X:X (X/X) - X-X-X X:X (X/X)}" 72 | ) 73 | 74 | 75 | def test_load_timeparse_corpus(tmp_path) -> None: 76 | path = tmp_path / "test.json" 77 | path.write_text(CORPUS_JSON, encoding="utf-8") 78 | 79 | result = load_timeparse_corpus(str(path)) 80 | 81 | assert len(result) == 2 82 | -------------------------------------------------------------------------------- /tests/test_count_vectorizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ctparse.count_vectorizer import CountVectorizer 3 | 4 | 5 | @pytest.mark.parametrize( 6 | "ngrams,doc,result", 7 | [ 8 | ((1, 1), ["a", "b", "c"], ["a", "b", "c"]), 9 | ((1, 2), ["a", "b", "c"], ["a", "b", "c", "a b", "b c"]), 10 | ((2, 2), ["a", "b", "c"], ["a b", "b c"]), 11 | ((1, 3), ["a", "b"], ["a", "b", "a b"]), 12 | ((2, 3), ["a", "b"], ["a b"]), 13 | ], 14 | ) 15 | def test_ngrams(ngrams, doc, result): 16 | assert CountVectorizer._create_ngrams(ngrams, [doc]) == [result] 17 | 18 | 19 | def test_count_vectorizer_fit_and_transform(): 20 | cv = CountVectorizer((1, 2)) 21 | cv = cv.fit([["a", "b", "c"], ["c", "d"]]) 22 | assert cv.vocabulary 23 | assert cv.transform([["b"]]) == [{cv.vocabulary["b"]: 1, 6: 0}] 24 | 25 | 26 | def test_count_vectorizer_fit_transform(): 27 | cv = CountVectorizer((1, 2)) 28 | X = cv.fit_transform([["a", "b"], ["b", "c"]]) 29 | assert cv.vocabulary 30 | assert X == [ 31 | { 32 | cv.vocabulary["a"]: 1, 33 | cv.vocabulary["b"]: 1, 34 | cv.vocabulary["a b"]: 1, 35 | len(cv.vocabulary) - 1: 0, 36 | }, 37 | {cv.vocabulary["b"]: 1, cv.vocabulary["c"]: 1, cv.vocabulary["b c"]: 1}, 38 | ] 39 | 40 | 41 | def test_count_vectorizer_transform_no_fit(): 42 | cv = CountVectorizer((1, 2)) 43 | with pytest.raises(ValueError): 44 | cv.transform([["a"]]) 45 | -------------------------------------------------------------------------------- /tests/test_ctparse.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from ctparse.ctparse import ctparse, ctparse_gen, _match_rule 3 | from ctparse.types import Interval, Time, Artifact 4 | 5 | 6 | def test_ctparse(): 7 | txt = "12.12.2020" 8 | res = ctparse(txt) 9 | assert res 10 | assert res.resolution == Time(year=2020, month=12, day=12) 11 | assert str(res) 12 | assert repr(res) 13 | 14 | # non sense gives no result 15 | assert ctparse("gargelbabel") is None 16 | txt = "12.12." 17 | res = ctparse(txt, ts=datetime(2020, 12, 1)) 18 | assert res 19 | assert res.resolution == Time(year=2020, month=12, day=12) 20 | 21 | gres = ctparse_gen(txt, ts=datetime(2020, 12, 1)) 22 | first_res = next(gres) 23 | assert first_res 24 | assert first_res.resolution == Time(year=2020, month=12, day=12) 25 | 26 | 27 | def test_ctparse_timeout(): 28 | # timeout in ctparse: should rather mock the logger and see 29 | # whether the timeout was hit, but cannot get it mocked 30 | txt = "tomorrow 8 yesterday Sep 9 9 12 2023 1923" 31 | ctparse(txt, timeout=0.0001) 32 | 33 | 34 | def test_match_rule(): 35 | def rule(a: Artifact) -> bool: 36 | return True 37 | 38 | assert list(_match_rule([], [rule])) == [] 39 | assert list(_match_rule([Artifact()], [])) == [] 40 | 41 | 42 | def test_latent_time(): 43 | parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=False) 44 | assert parse 45 | assert parse.resolution == Time(None, None, None, 20, 00) 46 | 47 | parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=True) 48 | assert parse 49 | assert parse.resolution == Time(2020, 1, 1, 20, 00) 50 | 51 | 52 | def test_latent_time_interval(): 53 | parse = ctparse( 54 | "8:00 pm - 9:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=False 55 | ) 56 | assert parse 57 | assert parse.resolution == Interval( 58 | Time(None, None, None, 20, 00), Time(None, None, None, 21, 00) 59 | ) 60 | 61 | parse = ctparse( 62 | "8:00 pm - 9:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=True 63 | ) 64 | assert parse 65 | assert parse.resolution == Interval( 66 | Time(2020, 1, 1, 20, 00), Time(2020, 1, 1, 21, 00) 67 | ) 68 | -------------------------------------------------------------------------------- /tests/test_partialparse.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Any, Callable 3 | 4 | import pytest 5 | import regex 6 | 7 | from ctparse.partial_parse import PartialParse, _seq_match 8 | from ctparse.types import RegexMatch, Time 9 | 10 | 11 | def test_partial_parse() -> None: 12 | match_a = regex.match("(?a)", "ab") 13 | match_b = next(regex.finditer("(?b)", "ab")) 14 | 15 | pp = PartialParse.from_regex_matches( 16 | (RegexMatch(1, match_a), RegexMatch(2, match_b)) 17 | ) 18 | 19 | assert len(pp.prod) == 2 20 | assert len(pp.rules) == 2 21 | 22 | assert isinstance(pp.score, float) 23 | 24 | def mock_rule(ts: datetime.datetime, a: Time) -> Time: 25 | return Time() 26 | 27 | pp2 = pp.apply_rule( 28 | datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1) 29 | ) 30 | 31 | assert pp != pp2 32 | 33 | with pytest.raises(ValueError): 34 | PartialParse((), ()) 35 | 36 | 37 | def test_seq_match() -> None: 38 | # NOTE: we are testing a private function because the algorithm 39 | # is quite complex 40 | 41 | def make_rm(i: int) -> Callable[[Any], bool]: 42 | def _regex_match(s: Any) -> bool: 43 | return bool(s == i) 44 | 45 | return _regex_match 46 | 47 | # empty sequence, empty pattern: matches on a single empty sequence 48 | assert list(_seq_match([], [])) == [[]] 49 | # non empty sequence, empty pattern matches on an empty sequence 50 | assert list(_seq_match(["a", "b"], [])) == [[]] 51 | # non empty sequence, non empty pattern that does not apper: no match 52 | assert list(_seq_match(["a", "b"], [make_rm(1)])) == [] 53 | # empty sequence, non empty pattern: no match 54 | assert list(_seq_match([], [make_rm(1)])) == [] 55 | # sequence shorter than pattern: no match 56 | assert list(_seq_match(["a"], [make_rm(1), make_rm(2)])) == [] 57 | # seq = pat 58 | assert list(_seq_match([1], [make_rm(1)])) == [[0]] 59 | assert list(_seq_match([1, 2, 3], [make_rm(1)])) == [[0]] 60 | assert list(_seq_match([1, 2, 3], [make_rm(2)])) == [[1]] 61 | assert list(_seq_match([1, 2, 3], [make_rm(3)])) == [[2]] 62 | assert list(_seq_match([1, 2, "a"], [make_rm(1), make_rm(2)])) == [[0, 1]] 63 | assert list(_seq_match([1, "a", 3], [make_rm(1), _identity, make_rm(3)])) == [ 64 | [0, 2] 65 | ] 66 | assert list(_seq_match(["a", 2, 3], [make_rm(2), make_rm(3)])) == [[1, 2]] 67 | # starts with non regex 68 | assert list(_seq_match([1, 2], [_identity, make_rm(1), make_rm(2)])) == [] 69 | assert list(_seq_match(["a", 1, 2], [_identity, make_rm(1), make_rm(2)])) == [ 70 | [1, 2] 71 | ] 72 | # ends with non regex 73 | assert list(_seq_match([1, 2], [make_rm(1), make_rm(2), _identity])) == [] 74 | assert list(_seq_match([1, 2, "a"], [make_rm(1), make_rm(2), _identity])) == [ 75 | [0, 1] 76 | ] 77 | # repeated pattern 78 | assert list(_seq_match([1, 2, 1, 2, 2], [make_rm(1), make_rm(2)])) == [ 79 | [0, 1], 80 | [0, 3], 81 | [0, 4], 82 | [2, 3], 83 | [2, 4], 84 | ] 85 | assert list(_seq_match([1, 2, 1, 2, 2], [make_rm(1), _identity, make_rm(2)])) == [ 86 | [0, 3], 87 | [0, 4], 88 | [2, 4], 89 | ] 90 | assert list(_seq_match([1, 2, 1, 2, 2], [_identity, make_rm(1), make_rm(2)])) == [ 91 | [2, 3], 92 | [2, 4], 93 | ] 94 | assert list(_seq_match([1, 2, 1, 2, 2], [make_rm(1), make_rm(2), _identity])) == [ 95 | [0, 1], 96 | [0, 3], 97 | [2, 3], 98 | ] 99 | assert ( 100 | list( 101 | _seq_match( 102 | [1, 2, 1, 2, 2], 103 | [_identity, make_rm(1), _identity, make_rm(2), _identity], 104 | ) 105 | ) 106 | == [] 107 | ) 108 | assert list( 109 | _seq_match( 110 | [1, 2, 1, 2, 2, 3], 111 | [_identity, make_rm(1), _identity, make_rm(2), _identity], 112 | ) 113 | ) == [[2, 4]] 114 | 115 | 116 | def _identity(x: Any) -> bool: 117 | return True 118 | -------------------------------------------------------------------------------- /tests/test_regressions.py: -------------------------------------------------------------------------------- 1 | """This file contains regression tests for commonly parsed time expressions""" 2 | import ctparse 3 | from datetime import datetime 4 | 5 | 6 | def test_military_time(): 7 | result = ctparse.ctparse("3 March 2020", ts=datetime(2020, 2, 25)) 8 | assert result 9 | assert str(result.resolution) == "2020-03-03 X:X (X/X)" 10 | 11 | 12 | def test_parse_years_ahead(): 13 | result = ctparse.ctparse("3 March 2023", ts=datetime(2020, 2, 25)) 14 | assert result 15 | assert str(result.resolution) == "2023-03-03 X:X (X/X)" 16 | -------------------------------------------------------------------------------- /tests/test_rule.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import regex 3 | from ctparse.types import RegexMatch, Artifact 4 | from ctparse.rule import dimension, predicate, regex_match, rule 5 | 6 | 7 | class TestClassA(Artifact): 8 | predA = 1 9 | 10 | 11 | class TestClassB(Artifact): 12 | pass 13 | 14 | 15 | class TestRule(TestCase): 16 | def test_empty_regex_match_not_allowed(self): 17 | with self.assertRaises(ValueError): 18 | rule(r"") 19 | with self.assertRaises(ValueError): 20 | rule(r"[a-z]*") 21 | self.assertIsNotNone( 22 | rule( 23 | r"This long string must not match as this expression " 24 | "will be part of the system unless ctparse is reloaded" 25 | ) 26 | ) 27 | 28 | def test_consecutive_regex_not_allowed(self): 29 | with self.assertRaises(ValueError): 30 | rule(r"one", r"two") 31 | 32 | def test_regex_match(self): 33 | m = next(regex.finditer("(?Px)", "x")) 34 | r = RegexMatch(1, m) 35 | self.assertTrue(regex_match(1)(r)) 36 | self.assertFalse(regex_match(1)(TestClassA())) 37 | 38 | def test_dimension(self): 39 | self.assertTrue(dimension(TestClassA)(TestClassA())) 40 | self.assertFalse(dimension(TestClassA)(TestClassB())) 41 | 42 | def test_predicate(self): 43 | self.assertTrue(predicate("predA")(TestClassA())) 44 | self.assertFalse(predicate("predA")(TestClassB())) 45 | -------------------------------------------------------------------------------- /tests/test_scorer.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import random 3 | import bz2 4 | import pickle 5 | 6 | from ctparse.nb_scorer import NaiveBayesScorer, train_naive_bayes, save_naive_bayes 7 | from ctparse.partial_parse import PartialParse 8 | from ctparse.scorer import DummyScorer, RandomScorer 9 | from ctparse.count_vectorizer import CountVectorizer 10 | from ctparse.nb_estimator import MultinomialNaiveBayes 11 | from ctparse.pipeline import CTParsePipeline 12 | from ctparse.types import Interval, Time 13 | 14 | 15 | def test_dummy(): 16 | scorer = DummyScorer() 17 | pp = PartialParse((Time(), Interval()), ("rule1", "rule2")) 18 | 19 | assert scorer.score("a", datetime.datetime(2019, 1, 1), pp) == 0.0 20 | assert scorer.score_final("a", datetime.datetime(2019, 1, 1), pp, pp.prod[0]) == 0.0 21 | 22 | 23 | def test_random(): 24 | rng = random.Random(42) 25 | scorer = RandomScorer(rng) 26 | 27 | pp = PartialParse((Time(), Interval()), ("rule1", "rule2")) 28 | 29 | assert 0.0 <= scorer.score("a", datetime.datetime(2019, 1, 1), pp) <= 1.0 30 | assert ( 31 | 0.0 32 | <= scorer.score_final("a", datetime.datetime(2019, 1, 1), pp, pp.prod[1]) 33 | <= 1.0 34 | ) 35 | 36 | 37 | def test_nbscorer(): 38 | # We only test that it runs just fine 39 | X = [("a", "b"), ("a",), ("b"), ("a", "b", "a", "b")] 40 | y = [False, True, True, False] 41 | 42 | model = train_naive_bayes(X, y) 43 | scorer = NaiveBayesScorer(model) 44 | 45 | pp = PartialParse((Time(), Interval()), ("rule1", "rule2")) 46 | 47 | pp.prod[0].mstart = 0 48 | pp.prod[1].mend = 1 49 | 50 | pp.prod[0].mend = 1 51 | pp.prod[1].mend = 2 52 | 53 | assert 0.0 <= scorer.score("ab", datetime.datetime(2019, 1, 1), pp) <= 1.0 54 | assert ( 55 | 0.0 56 | <= scorer.score_final("ab", datetime.datetime(2019, 1, 1), pp, pp.prod[1]) 57 | <= 1.0 58 | ) 59 | 60 | 61 | def test_naive_bayes_from_file(tmp_path): 62 | nb = NaiveBayesScorer( 63 | CTParsePipeline(CountVectorizer((1, 1)), MultinomialNaiveBayes()) 64 | ) 65 | path = tmp_path / "model.pkl" 66 | with bz2.open(path, "w") as f: 67 | pickle.dump(nb, f) 68 | nb = NaiveBayesScorer.from_model_file(path) 69 | assert nb 70 | 71 | 72 | def test_save_naive_bayes(tmp_path): 73 | path = tmp_path / "model.pkl" 74 | model = CTParsePipeline(CountVectorizer((1, 1)), MultinomialNaiveBayes()) 75 | save_naive_bayes(model, path) 76 | -------------------------------------------------------------------------------- /tests/test_time_rules.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from ctparse.types import Time 4 | from ctparse.time.rules import ( 5 | ruleDateDate, 6 | ruleDOMDate, 7 | ruleDateTimeDateTime, 8 | ruleDOYDate, 9 | ruleQuarterBeforeHH, 10 | ruleQuarterAfterHH, 11 | ) 12 | 13 | 14 | class TestRules(TestCase): 15 | def test_ruleDateDate(self): 16 | t1 = Time(year=2017) 17 | t2 = Time(year=2015) 18 | self.assertIsNone(ruleDateDate(None, t1, None, t2)) 19 | 20 | t1 = Time(year=2017, month=12) 21 | t2 = Time(year=2017, month=11) 22 | self.assertIsNone(ruleDateDate(None, t1, None, t2)) 23 | 24 | t1 = Time(year=2017, month=12, day=31) 25 | t2 = Time(year=2017, month=12, day=30) 26 | self.assertIsNone(ruleDateDate(None, t1, None, t2)) 27 | 28 | t1 = Time(year=2017, month=12, day=31) 29 | t2 = Time(year=2017, month=12, day=31) 30 | self.assertIsNone(ruleDateDate(None, t1, None, t2)) 31 | 32 | t1 = Time(year=2017, month=12, day=30) 33 | t2 = Time(year=2017, month=12, day=31) 34 | self.assertIsNotNone(ruleDateDate(None, t1, None, t2)) 35 | 36 | def test_ruleDOMDate(self): 37 | t1 = Time(day=30) 38 | t2 = Time(year=2015, month=1, day=29) 39 | self.assertIsNone(ruleDOMDate(None, t1, None, t2)) 40 | 41 | t1 = Time(day=30) 42 | t2 = Time(year=2015, month=1, day=30) 43 | self.assertIsNone(ruleDOMDate(None, t1, None, t2)) 44 | 45 | t1 = Time(day=29) 46 | t2 = Time(year=2015, month=1, day=30) 47 | self.assertIsNotNone(ruleDOMDate(None, t1, None, t2)) 48 | 49 | def test_ruleDateTimeDateTime(self): 50 | t1 = Time(year=2017, month=4, day=12, hour=12, minute=30) 51 | t2 = Time(year=2016, month=4, day=12, hour=12, minute=30) 52 | self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2)) 53 | 54 | t1 = Time(year=2017, month=4, day=12, hour=12, minute=30) 55 | t2 = Time(year=2017, month=3, day=12, hour=12, minute=30) 56 | self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2)) 57 | 58 | t1 = Time(year=2017, month=4, day=12, hour=12, minute=30) 59 | t2 = Time(year=2017, month=4, day=11, hour=12, minute=30) 60 | self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2)) 61 | 62 | t1 = Time(year=2017, month=4, day=12, hour=12, minute=30) 63 | t2 = Time(year=2017, month=4, day=12, hour=11, minute=30) 64 | self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2)) 65 | 66 | t1 = Time(year=2017, month=4, day=12, hour=12, minute=30) 67 | t2 = Time(year=2017, month=4, day=12, hour=12, minute=29) 68 | self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2)) 69 | 70 | t1 = Time(year=2017, month=4, day=12, hour=12, minute=30) 71 | t2 = Time(year=2017, month=4, day=12, hour=12, minute=30) 72 | self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2)) 73 | 74 | t1 = Time(year=2017, month=4, day=12, hour=12, minute=30) 75 | t2 = Time(year=2017, month=4, day=12, hour=12, minute=31) 76 | self.assertIsNotNone(ruleDateTimeDateTime(None, t1, None, t2)) 77 | 78 | def test_ruleDOYDate(self): 79 | t1 = Time(month=4, day=12) 80 | t2 = Time(year=2017, month=4, day=12) 81 | self.assertIsNone(ruleDOYDate(None, t1, None, t2)) 82 | 83 | t1 = Time(month=4, day=12) 84 | t2 = Time(year=2017, month=4, day=13) 85 | self.assertIsNotNone(ruleDOYDate(None, t1, None, t2)) 86 | 87 | def test_ruleQuarterBeforeHH(self): 88 | t1 = Time(hour=12, minute=1) 89 | self.assertIsNone(ruleQuarterBeforeHH(None, None, t1)) 90 | 91 | def test_ruleQuarterAferHH(self): 92 | t1 = Time(hour=12, minute=1) 93 | self.assertIsNone(ruleQuarterAfterHH(None, None, t1)) 94 | -------------------------------------------------------------------------------- /tests/test_timers.py: -------------------------------------------------------------------------------- 1 | from ctparse.timers import timeout, CTParseTimeoutError, timeit 2 | from unittest import TestCase 3 | import time 4 | 5 | 6 | class TimersTest(TestCase): 7 | def test_timeout(self): 8 | t_fun = timeout(0.5) 9 | with self.assertRaises(CTParseTimeoutError): 10 | time.sleep(1.0) 11 | t_fun() 12 | t_fun = timeout(0) 13 | t_fun() # all good 14 | 15 | def test_timeit(self): 16 | def fun(x): 17 | return x * x 18 | 19 | result, elapsed = timeit(fun)(3) 20 | self.assertEqual(result, 9) 21 | self.assertIsInstance(elapsed, float) 22 | -------------------------------------------------------------------------------- /tests/test_types.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import regex 3 | from datetime import datetime 4 | from ctparse.types import Artifact, RegexMatch, Time, Interval 5 | 6 | 7 | class TestArtifact(TestCase): 8 | def test_init(self): 9 | a = Artifact() 10 | self.assertEqual(a.mstart, 0) 11 | self.assertEqual(a.mend, 0) 12 | self.assertEqual(len(a), 0) 13 | self.assertTrue(a) 14 | 15 | def test_eq(self): 16 | a = Artifact() 17 | b = Artifact() 18 | self.assertEqual(a, b) 19 | 20 | a = Time(2017, 12, 12, 12, 12, 4, "morning") 21 | b = Time(2017, 12, 12, 12, 12, 4, "morning") 22 | self.assertEqual(a, b) 23 | 24 | a = Time(2017, 12, 12, 12, 12, 4, "morning") 25 | b = Time(2017, 12, 12, 12, 12, 3, "morning") 26 | self.assertNotEqual(a, b) 27 | 28 | a = Time() 29 | b = Interval() 30 | self.assertNotEqual(a, b) 31 | 32 | def test_update_span(self): 33 | a1 = Artifact() 34 | a2 = Artifact() 35 | a3 = Artifact() 36 | a2.mstart = 10 37 | a3.mend = 100 38 | a1.update_span(a2, a3) 39 | self.assertEqual(a1.mstart, 10) 40 | self.assertEqual(a1.mend, 100) 41 | self.assertEqual(len(a1), 90) 42 | 43 | def test_repr(self): 44 | a = Artifact() 45 | self.assertEqual(repr(a), "Artifact[0-0]{}") 46 | 47 | def test_nb_str(self): 48 | a = Artifact() 49 | self.assertEqual(a.nb_str(), "Artifact[]{}") 50 | 51 | 52 | class TestRegexMatch(TestCase): 53 | def test_init(self): 54 | m = next(regex.finditer(r"(?Pmatch me)", "xxx match me xxx")) 55 | r = RegexMatch(1, m) 56 | self.assertEqual(r.mstart, 4) 57 | self.assertEqual(r.mend, 12) 58 | self.assertEqual(len(r), 8) 59 | self.assertEqual(r._text, "match me") 60 | self.assertEqual(repr(r), "RegexMatch[4-12]{1:match me}") 61 | self.assertEqual(r.nb_str(), "RegexMatch[]{1:match me}") 62 | 63 | 64 | class TestTime(TestCase): 65 | def test_init(self): 66 | self.assertIsNotNone(Time()) 67 | 68 | def test_isDOY(self): 69 | self.assertTrue(Time(month=1, day=1).isDOY) 70 | self.assertFalse(Time(year=1).isDOY) 71 | 72 | def test_isDOM(self): 73 | self.assertTrue(Time(day=1).isDOM) 74 | self.assertFalse(Time(month=1).isDOM) 75 | 76 | def test_isHour(self): 77 | self.assertTrue(Time(hour=1).isHour) 78 | self.assertFalse(Time(hour=1, minute=1).isHour) 79 | self.assertFalse(Time(hour=1, month=1).isHour) 80 | 81 | def test_isDOW(self): 82 | self.assertTrue(Time(DOW=1).isDOW) 83 | self.assertFalse(Time().isDOW) 84 | 85 | def test_isMonth(self): 86 | self.assertTrue(Time(month=1).isMonth) 87 | self.assertFalse(Time(day=1).isMonth) 88 | self.assertFalse(Time(year=1).isMonth) 89 | 90 | def test_isPOD(self): 91 | self.assertTrue(Time(POD="morning").isPOD) 92 | self.assertFalse(Time(day=1).isPOD) 93 | self.assertFalse(Time(year=1).isPOD) 94 | 95 | def test_isTOD(self): 96 | self.assertTrue(Time(hour=1, minute=1).isTOD) 97 | self.assertTrue(Time(hour=1).isTOD) 98 | self.assertFalse(Time(minute=1).isTOD) 99 | self.assertFalse(Time().isTOD) 100 | 101 | def test_isDate(self): 102 | self.assertTrue(Time(year=1, month=1, day=1).isDate) 103 | self.assertFalse(Time(year=1, month=1).isDate) 104 | self.assertFalse(Time(year=1, day=1).isDate) 105 | self.assertFalse(Time(day=1, month=1).isDate) 106 | self.assertFalse(Time(year=1, month=1, day=1, hour=1).isDate) 107 | 108 | def test_isDateTime(self): 109 | self.assertTrue(Time(year=1, month=1, day=1, hour=1).isDateTime) 110 | self.assertFalse(Time(year=1, month=1, day=1).isDateTime) 111 | 112 | def test_isYear(self): 113 | self.assertTrue(Time(year=1).isYear) 114 | self.assertFalse(Time(year=1, month=1).isYear) 115 | 116 | def test_hasDate(self): 117 | self.assertTrue(Time(year=1, month=1, day=1).hasDate) 118 | self.assertFalse(Time(year=1, month=1).isDate) 119 | self.assertFalse(Time(year=1, day=1).isDate) 120 | self.assertFalse(Time(day=1, month=1).isDate) 121 | self.assertTrue(Time(year=1, month=1, day=1, hour=1).hasDate) 122 | 123 | def test_hasTime(self): 124 | self.assertTrue(Time(hour=1, minute=1, day=1, month=1, year=1).hasTime) 125 | self.assertTrue(Time(hour=1, day=1, month=1, year=1).hasTime) 126 | self.assertFalse(Time(day=1, month=1, year=1).hasTime) 127 | 128 | def test_hasPOD(self): 129 | self.assertTrue(Time(POD="pod").hasPOD) 130 | self.assertFalse(Time(day=1, month=1, year=1).hasPOD) 131 | 132 | def test_repr(self): 133 | t = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod") 134 | self.assertEqual(repr(t), "Time[0-0]{0001-01-01 01:01 (1/pod)}") 135 | 136 | def test_from_str(self): 137 | # Complete time 138 | t = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod") 139 | t_str = str(t) 140 | t_back = Time.from_str(t_str) 141 | self.assertEqual(t, t_back) 142 | 143 | # Incomplete time 144 | t = Time(year=None, month=1, day=1, hour=None, minute=None, DOW=None, POD="pod") 145 | t_str = str(t) 146 | t_back = Time.from_str(t_str) 147 | self.assertEqual(t, t_back) 148 | 149 | # Zeroed time 150 | t = Time() 151 | t_str = str(t) 152 | t_back = Time.from_str(t_str) 153 | self.assertEqual(t, t_back) 154 | 155 | # Mistake 156 | with self.assertRaises(ValueError): 157 | Time.from_str("0001-01-01 01-01 (1/pod)") 158 | 159 | def test_start(self): 160 | t = Time() 161 | self.assertEqual(t.start, Time(hour=0, minute=0)) 162 | t = Time(year=2012, month=1, day=1) 163 | self.assertEqual(t.start, Time(2012, 1, 1, 0, 0)) 164 | t = Time(year=2012, month=1, day=1, hour=12) 165 | self.assertEqual(t.start, Time(2012, 1, 1, 12, 0)) 166 | t = Time(year=2012, month=1, day=1, hour=12, minute=20) 167 | self.assertEqual(t.start, Time(2012, 1, 1, 12, 20)) 168 | t = Time(year=2012, month=1, day=1, POD="last") 169 | self.assertEqual(t.start, Time(2012, 1, 1, 23, 00)) 170 | 171 | def test_end(self): 172 | t = Time() 173 | self.assertEqual(t.end, Time(hour=23, minute=59)) 174 | t = Time(year=2012, month=1, day=1) 175 | self.assertEqual(t.end, Time(2012, 1, 1, 23, 59)) 176 | t = Time(year=2012, month=1, day=1, hour=12) 177 | self.assertEqual(t.end, Time(2012, 1, 1, 12, 59)) 178 | t = Time(year=2012, month=1, day=1, hour=12, minute=20) 179 | self.assertEqual(t.end, Time(2012, 1, 1, 12, 20)) 180 | t = Time(year=2012, month=1, day=1, POD="last") 181 | self.assertEqual(t.end, Time(2012, 1, 1, 23, 59)) 182 | 183 | def test_dt(self): 184 | t = Time(2015, 12, 12, 12, 12) 185 | self.assertEqual(t.dt, datetime(2015, 12, 12, 12, 12)) 186 | t = Time(2015, 12, 12, 12) 187 | self.assertEqual(t.dt, datetime(2015, 12, 12, 12)) 188 | t = Time(2015, 12, 12) 189 | self.assertEqual(t.dt, datetime(2015, 12, 12)) 190 | 191 | with self.assertRaises(ValueError): 192 | t = Time(year=2012, month=12, hour=12, minute=12) 193 | t.dt 194 | 195 | 196 | class TestInterval(TestCase): 197 | def test_init(self): 198 | self.assertIsNotNone(Interval()) 199 | 200 | def test_isTimeInterval(self): 201 | self.assertTrue(Interval(Time(hour=1), Time(hour=2)).isTimeInterval) 202 | 203 | def test_repr(self): 204 | self.assertEqual( 205 | repr(Interval(Time(), Time())), 206 | "Interval[0-0]{X-X-X X:X (X/X) - X-X-X X:X (X/X)}", 207 | ) 208 | 209 | def test_from_str(self): 210 | # Complete interval 211 | t1 = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod") 212 | t2 = Time(year=2, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod") 213 | interval = Interval(t1, t2) 214 | i_back = Interval.from_str(str(interval)) 215 | self.assertEqual(interval, i_back) 216 | 217 | # Incomplete interval 218 | interval = Interval(None, t2) 219 | i_back = Interval.from_str(str(interval)) 220 | self.assertEqual(interval, i_back) 221 | 222 | # Zeroed interval 223 | interval = Interval() 224 | i_back = Interval.from_str(str(interval)) 225 | self.assertEqual(interval, i_back) 226 | 227 | # Mistake 228 | with self.assertRaises(ValueError): 229 | Interval.from_str("X-X-X X: X(X/X) -X-X-X X: X(X/X)") 230 | 231 | def test_start(self): 232 | i = Interval(Time(2013, 1, 1), Time(2013, 1, 2)) 233 | self.assertEqual(i.start, Time(2013, 1, 1, 0, 0)) 234 | 235 | i = Interval(Time(2013, 1, 1), None) 236 | self.assertEqual(i.start, Time(2013, 1, 1, 0, 0)) 237 | 238 | i = Interval(None, Time(2013, 1, 2)) 239 | self.assertIsNone(i.start) 240 | 241 | def test_end(self): 242 | i = Interval(Time(2013, 1, 1), Time(2013, 1, 2)) 243 | self.assertEqual(i.end, Time(2013, 1, 2, 23, 59)) 244 | 245 | i = Interval(None, Time(2013, 1, 2)) 246 | self.assertEqual(i.end, Time(2013, 1, 2, 23, 59)) 247 | 248 | i = Interval(Time(2013, 1, 1), None) 249 | self.assertIsNone(i.end) 250 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py36, py37, py38 3 | 4 | [travis] 5 | python = 6 | 3.8: py38 7 | 3.7: py37 8 | 3.6: py36 9 | 10 | [testenv] 11 | whitelist_externals= make 12 | setenv = 13 | PYTHONPATH = {toxinidir} 14 | deps = -r{toxinidir}/requirements_dev.txt 15 | commands = 16 | pip install -U pip 17 | make lint 18 | py.test --cov=ctparse --basetemp={envtmpdir} --------------------------------------------------------------------------------