├── .codecov.yml
├── .editorconfig
├── .github
    └── ISSUE_TEMPLATE.md
├── .gitignore
├── .pyup.yml
├── .travis.yml
├── AUTHORS.rst
├── CONTRIBUTING.rst
├── HISTORY.rst
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── ctparse
    ├── __init__.py
    ├── corpus.py
    ├── count_vectorizer.py
    ├── ctparse.py
    ├── loader.py
    ├── models
    │   ├── __init__.py
    │   ├── dummy.py
    │   └── model.pbz
    ├── nb_estimator.py
    ├── nb_scorer.py
    ├── partial_parse.py
    ├── pipeline.py
    ├── py.typed
    ├── rule.py
    ├── scorer.py
    ├── time
    │   ├── __init__.py
    │   ├── auto_corpus.py
    │   ├── corpus.py
    │   ├── postprocess_latent.py
    │   └── rules.py
    ├── timers.py
    └── types.py
├── datasets
    ├── README.rst
    └── timeparse_corpus.json
├── docs
    ├── Makefile
    ├── authors.rst
    ├── conf.py
    ├── contributing.rst
    ├── ctparse.rst
    ├── ctparse.time.rst
    ├── dataset.rst
    ├── history.rst
    ├── index.rst
    ├── installation.rst
    ├── make.bat
    ├── modules.rst
    ├── readme.rst
    └── usage.rst
├── mypy.ini
├── requirements.txt
├── requirements_dev.txt
├── scripts
    └── train_default_model.py
├── setup.cfg
├── setup.py
├── tests
    ├── __init__.py
    ├── test_corpus.py
    ├── test_count_vectorizer.py
    ├── test_ctparse.py
    ├── test_partialparse.py
    ├── test_regressions.py
    ├── test_rule.py
    ├── test_scorer.py
    ├── test_time_rules.py
    ├── test_timers.py
    └── test_types.py
└── tox.ini


/.codecov.yml:
--------------------------------------------------------------------------------
1 | comment: off
2 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | # http://editorconfig.org
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | indent_style = space
 7 | indent_size = 4
 8 | trim_trailing_whitespace = true
 9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 | 
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 | 
17 | [LICENSE]
18 | insert_final_newline = false
19 | 
20 | [Makefile]
21 | indent_style = tab
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | * ctparse - Parse natural language time expressions in pytho version:
 2 | * Python version:
 3 | * Operating System:
 4 | 
 5 | ### Description
 6 | 
 7 | Describe what you were trying to get done.
 8 | Tell us what happened, what went wrong, and what you expected to happen.
 9 | 
10 | ### What I Did
11 | 
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | .idea/
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | 
 58 | # Flask stuff:
 59 | instance/
 60 | .webassets-cache
 61 | 
 62 | # Scrapy stuff:
 63 | .scrapy
 64 | 
 65 | # Sphinx documentation
 66 | docs/_build/
 67 | 
 68 | # PyBuilder
 69 | target/
 70 | 
 71 | # Jupyter Notebook
 72 | .ipynb_checkpoints
 73 | 
 74 | # pyenv
 75 | .python-version
 76 | 
 77 | # celery beat schedule file
 78 | celerybeat-schedule
 79 | 
 80 | # SageMath parsed files
 81 | *.sage.py
 82 | 
 83 | # dotenv
 84 | .env
 85 | 
 86 | # virtualenv
 87 | .venv
 88 | venv/
 89 | ENV/
 90 | 
 91 | # Spyder project settings
 92 | .spyderproject
 93 | .spyproject
 94 | 
 95 | # Rope project settings
 96 | .ropeproject
 97 | 
 98 | # mkdocs documentation
 99 | /site
100 | 
101 | # mypy
102 | .mypy_cache/
103 | 
104 | # pytest
105 | .pytest_cache/
106 | 
107 | # system
108 | .DS_Store
109 | 
110 | # vscode
111 | .vscode
112 | 


--------------------------------------------------------------------------------
/.pyup.yml:
--------------------------------------------------------------------------------
1 | # autogenerated pyup.io config file 
2 | # see https://pyup.io/docs/configuration/ for all available options
3 | 
4 | update: insecure
5 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | sudo: required
 3 | dist: xenial
 4 | python:
 5 |   - 3.8
 6 |   - 3.7
 7 |   - 3.6
 8 | install: pip install -U tox-travis codecov
 9 | script: tox
10 | after_success:
11 |   codecov
12 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | Credits
 3 | =======
 4 | 
 5 | Development Lead
 6 | ----------------
 7 | 
 8 | * Sebastian Mika <sebastian.mika@comtravo.com>
 9 | 
10 | Contributors
11 | ------------
12 | 
13 | * Gabriele Lanaro <gabriele.lanaro@comtravo.com>
14 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
  1 | ============
  2 | Contributing
  3 | ============
  4 | 
  5 | Contributions are welcome, and they are greatly appreciated! Every little bit
  6 | helps, and credit will always be given.
  7 | 
  8 | You can contribute in many ways:
  9 | 
 10 | 
 11 | Add Rules & Increase Coverage
 12 | -----------------------------
 13 | 
 14 | If you find an expressions that ``ctparse`` can not resolve correctly
 15 | but you feel it should do, you can adjust the existing rules or add a
 16 | new one.
 17 | 
 18 | The following steps are probably a helpful guideline.
 19 | 
 20 | * Add your case to the ``corpus.py`` file and run the corpus tests
 21 |   using ``py.test tests/test_run_corpus.py``. Now basically two things can happen:
 22 | 
 23 |   #. **The tests pass**, which means ``ctparse`` can correctly resolve
 24 |      the expression. It might not score it highest. To check this,
 25 |      rebuild the model and try parsing the expression again:
 26 | 
 27 |      .. code:: bash
 28 | 
 29 |             make train
 30 | 
 31 |      To avoid issues with reloading, please restart the python
 32 |      interpreter after regenerating the model.
 33 | 
 34 |      If this fixes the issue please commit the updated ``corpus.py``
 35 |      and the updated model as a pull request (PR) on GitHub, see this guide for
 36 |      more information on what pull requests are and how to create them
 37 |      https://help.github.com/articles/creating-a-pull-request/.
 38 | 
 39 |      The scoring can be influenced by
 40 |      adding more structurally identical examples to the corpus. Seeing
 41 |      more samples where a specific sequence of rule applications leads
 42 |      to the correct ranking will drive the model to favor these. This
 43 |      comes, however, at the potential price of downranking certain
 44 |      other production sequences. Although it would generally be
 45 |      considered more favorable to add varying test cases (e.g. in
 46 |      different languages, slight variation) to the corpus, the same
 47 |      string can also just be duplicated to achive this *implict
 48 |      up-weightning* effect. The examples that are intended to influence the scoring,
 49 |      as opposed to the ones used to develop new rules, are usually appended
 50 |      to the file ``auto_corpus.py```.
 51 | 
 52 |   #. **The tests fail**: if this is because not all tests in the
 53 |      corpus pass, i.e. you get an error message like the following::
 54 | 
 55 |        ctparse.py 527 WARNING  failure: target "Time[]{2019-X-X X:X (X/X)}" never produced in "2019"
 56 |        ctparse.py 532 WARNING  failure: "Time[]{2019-X-X X:X (X/X)}" not always produced
 57 | 
 58 | * If the tests fail, run ``ctparse`` in debug mode to see what goes wrong:
 59 | 
 60 |   .. code:: python
 61 | 
 62 |      import logging
 63 |      from ctparse import ctparse
 64 |      from ctparse.ctparse import logger
 65 |      from datetime import datetime
 66 | 
 67 |      logger.addHandler(logging.StreamHandler())
 68 |      logger.setLevel(logging.DEBUG)
 69 | 
 70 |      # Set reference time
 71 |      ts = datetime(2018, 3, 12, 14, 30)
 72 |      r = list(ctparse('May 5th', ts=ts, debug=True))
 73 | 
 74 | 
 75 |   This gives you plenty of debugging output. First you will see
 76 |   the individual regular expressions that were matched (and the time
 77 |   this took)::
 78 | 
 79 |     ================================================================================
 80 |     -> matching regular expressions
 81 |     regex: RegexMatch[0-3]{114:May}
 82 |     regex: RegexMatch[4-5]{133:5}
 83 |     regex: RegexMatch[4-7]{135:5th}
 84 |     regex: RegexMatch[4-5]{134:5}
 85 |     regex: RegexMatch[4-5]{148:5}
 86 |     time in _match_regex: 1ms
 87 |     ================================================================================
 88 | 
 89 |   Each line has the form ``regex: RegexMatch[0-3]{114:May}`` and describes
 90 |   the matched span in the text ``[0-3]``, the ID of the matching expression
 91 |   ``114`` and the surface string that the expression matched ``May``.
 92 | 
 93 |   If relevant parts of your expression were not picked up, this is an
 94 |   indicator that you should either modify an existing regular
 95 |   expression or need to add a new rule (see below).
 96 | 
 97 |   Next you see the unique sub-sequences constructed based on these
 98 |   regular expressions (plus again the time used to build them)::
 99 | 
100 |     ================================================================================
101 |     -> building initial stack
102 |     regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-7]{135:5th})
103 |     regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{148:5})
104 |     regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{134:5})
105 |     regex stack (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{133:5})
106 |     time in _regex_stack: 0ms
107 |     initial stack length: 4
108 |     stack length after relative match length: 1
109 |     stack length after max stack depth limit: 1
110 |     ================================================================================
111 | 
112 |   This is followed by a summary of how many applicable rules there are
113 |   per initial stack element::
114 | 
115 |     ================================================================================
116 |     -> checking rule applicability
117 |     of 75 total rules 20 are applicable in (RegexMatch[0-3]{114:May}, RegexMatch[4-7]{135:5th})
118 |     time in _filter_rules: 0ms
119 |     ================================================================================
120 |     ================================================================================
121 |     -> checking rule applicability
122 |     of 75 total rules 20 are applicable in (RegexMatch[0-3]{114:May}, RegexMatch[4-5]{148:5})
123 |     time in _filter_rules: 0ms
124 |     ================================================================================
125 |     ...
126 | 
127 |   Again, if you do not see any sequence that captures all relevant
128 |   parts of your input, you may need to modify the regular expressions
129 |   or add new ones via rules.
130 | 
131 |   Finally you see a list of productions that are applied to stack
132 |   elements, where for each applicable rule the rule name and the new
133 |   stack sequence are printed, e.g.::
134 | 
135 |     --------------------------------------------------------------------------------
136 |     producing on (RegexMatch[0-3]{114:May}, RegexMatch[4-7]{135:5th}), score=-0.13
137 |       ruleMonthMay -> (Time[0-3]{X-05-X X:X (X/X)}, RegexMatch[4-7]{135:5th}), score=1.41
138 |       ruleDOM2 -> (RegexMatch[0-3]{114:May}, Time[4-7]{X-X-05 X:X (X/X)}), score=1.38
139 |     added 2 new stack elements, depth after trunc: 2
140 |     --------------------------------------------------------------------------------
141 | 
142 |   If no productions could be applied to a stack element the emitted
143 |   results are printed::
144 | 
145 |     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
146 |     no rules applicable: emitting
147 |     => Time[0-7]{2018-05-05 X:X (X/X)}, score=15.91,
148 |     --------------------------------------------------------------------------------
149 | 
150 |   If the desired production does not show up, but the regular
151 |   expressions look fine and the initial stack elements as well, try
152 |   increasing the ``max_stack_depth`` parameter, i.e. run
153 |   ``ctparse(..., max_stack_depth=0)``. Also make sure that the
154 |   ``timeout`` parameter is not set. Maybe ``ctparse`` is able to
155 |   generate the resolution but it is too deep in the stack.
156 | 
157 | 
158 | Adding a rule
159 | ~~~~~~~~~~~~~
160 | 
161 | When adding rules try to follow these guidelines:
162 | 
163 | 1. Be as general as possible: instead of writing one long regular
164 |    expression that matches only a specific case, check whether you can
165 |    rather divide your pattern in production parts + some regular
166 |    expressions. For example, if you have a very specific way to
167 |    speficy the year of a date in mind, it might do no harm to just
168 |    allow anything that with ``predicate('hasDate')`` plus your
169 |    specific year expression, i.e.
170 | 
171 |    .. code:: python
172 | 
173 |       @rule(predicate('hasDate'), r'your funky year')
174 | 
175 | 2. Keep your regex as general as possible, but avoid regular
176 |    expressions that are likely to generate many "false positives". Often
177 |    that can be prevented by using positive or negative lookaheads and
178 |    lookbehinds to keep the context sane (see `Lookaround
179 |    <https://www.regular-expressions.info/lookaround.html>`_ on the
180 |    excellent regular-expression.info site).
181 | 
182 | 3. Make sure your production covers corner cases and matches the
183 |    ``ctparse`` opinion to resolve to times in the near future but -
184 |    unless explicit -- never in the past (relative to the reference
185 |    time). Also make sure it favors the close future over the further
186 |    future.
187 | 
188 | 
189 | Other Types of Contributions
190 | ----------------------------
191 | 
192 | Report Bugs
193 | ~~~~~~~~~~~
194 | 
195 | Report bugs at https://github.com/comtravo/ctparse/issues.
196 | 
197 | If you are reporting a bug, please include:
198 | 
199 | * Your operating system name and version.
200 | * Any details about your local setup that might be helpful in troubleshooting.
201 | * Detailed steps to reproduce the bug.
202 | 
203 | Fix Bugs
204 | ~~~~~~~~
205 | 
206 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
207 | wanted" is open to whoever wants to implement it.
208 | 
209 | Implement Features
210 | ~~~~~~~~~~~~~~~~~~
211 | 
212 | Look through the GitHub issues for features. Anything tagged with "enhancement"
213 | and "help wanted" is open to whoever wants to implement it.
214 | 
215 | Write Documentation
216 | ~~~~~~~~~~~~~~~~~~~
217 | 
218 | ctparse - Parse natural language time expressions in pytho could always use more documentation, whether as part of the
219 | official ctparse - Parse natural language time expressions in pytho docs, in docstrings, or even on the web in blog posts,
220 | articles, and such.
221 | 
222 | Submit Feedback
223 | ~~~~~~~~~~~~~~~
224 | 
225 | The best way to send feedback is to file an issue at https://github.com/comtravo/ctparse/issues.
226 | 
227 | If you are proposing a feature:
228 | 
229 | * Explain in detail how it would work.
230 | * Keep the scope as narrow as possible, to make it easier to implement.
231 | * Remember that this is a volunteer-driven project, and that contributions
232 |   are welcome :)
233 | 
234 | Get Started!
235 | ------------
236 | 
237 | Ready to contribute? Here's how to set up `ctparse` for local development.
238 | 
239 | 1. Fork the `ctparse` repo on GitHub.
240 | 2. Clone your fork locally::
241 | 
242 |     $ git clone git@github.com:your_name_here/ctparse.git
243 | 
244 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
245 | 
246 |     $ mkvirtualenv ctparse
247 |     $ cd ctparse/
248 |     $ python setup.py develop
249 | 
250 | 4. Create a branch for local development::
251 | 
252 |     $ git checkout -b name-of-your-bugfix-or-feature
253 | 
254 |    Now you can make your changes locally.
255 | 
256 | 5. When you're done making changes, check that your changes pass flake8 and the
257 |    tests, including testing other Python versions with tox::
258 | 
259 |     $ flake8 ctparse tests
260 |     $ python setup.py test or py.test
261 |     $ tox
262 | 
263 |    To get flake8 and tox, just pip install them into your virtualenv.
264 | 
265 | 6. Commit your changes and push your branch to GitHub::
266 | 
267 |     $ git add .
268 |     $ git commit -m "Your detailed description of your changes."
269 |     $ git push origin name-of-your-bugfix-or-feature
270 | 
271 | 7. Submit a pull request through the GitHub website.
272 | 
273 | Pull Request Guidelines
274 | -----------------------
275 | 
276 | Before you submit a pull request, check that it meets these guidelines:
277 | 
278 | 1. The pull request should include tests.
279 | 2. If the pull request adds functionality, the docs should be updated. Put
280 |    your new functionality into a function with a docstring, and add the
281 |    feature to the list in README.rst.
282 | 3. The pull request should work for Python 3.6., 3.7. and 3.8. Check
283 |    https://travis-ci.org/comtravo/ctparse/pull_requests
284 |    and make sure that the tests pass for all supported Python versions.
285 | 
286 | Tips
287 | ----
288 | 
289 | To run a subset of tests::
290 | 
291 | $ py.test tests.test_ctparse
292 | 
293 | 
294 | Deploying
295 | ---------
296 | 
297 | A reminder for the maintainers on how to deploy.
298 | Make sure all your changes are committed (including an entry in HISTORY.rst).
299 | Then run on the ``master`` branch::
300 | 
301 | $ bumpversion patch # possible: major / minor / patch
302 | $ git push
303 | $ git push --tags
304 | $ make release
305 | 
306 | You will need a username and password to upload to pypi (might be
307 | automated on Travis).
308 | 


--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
 1 | =======
 2 | History
 3 | =======
 4 | 
 5 | 
 6 | 0.3.0 (2021-02-01)
 7 | ------------------
 8 | 
 9 | * Removed latent rules regarding times (latent rules regarding dates are still present)
10 | * Added latent_time option to customize the new behavior, defauld behavior is backwards-compatible
11 | 
12 | 0.2.1 (2020-05-27)
13 | ------------------
14 | 
15 | * Update development dependencies
16 | * Add flake8-bugbear and fixed issues
17 | 
18 | 0.2.0 (2020-04-23)
19 | ------------------
20 | 
21 | * Implemented new type `Duration`, to handle lengths of time
22 | * Adapted the dataset to include `Duration`
23 | * Implemented basic rule to merge `Duration`, `Time` and `Interval` in simple cases.
24 | * Created a make target to train the model `make train`
25 | 
26 | 0.1.0 (2020-03-20)
27 | ------------------
28 | 
29 | * Major refactor of code underlying predictive model
30 | * Based on a contribution from @bharathi-srini: replace naive bayes from sklearn by own implementation
31 | * Thus remove dependencies on numpy, scipy, scikit-learn
32 | * Predictions are much faster: 97/s in the old vs. 239/s in the new code base
33 | * Performance identical
34 | * Deprecate support for python 3.5, add 3.8
35 | * Add more strict type checking rules (mypy.ini)
36 | * Force black code formatting, make this a linter step, "black" all code
37 | 
38 | 0.0.47 (2020-02-28)
39 | -------------------
40 | 
41 | * Allow overlapping matches of regular expression when generating inital stack of "tokens"
42 | 
43 | 0.0.46 (2020-02-26)
44 | -------------------
45 | 
46 | * Implemented heuristics to detect (albeit imperfectly) military times
47 | 
48 | 0.0.44 (2019-11-05)
49 | -------------------
50 | 
51 | * Released time corpus
52 | * Implemented training model using ctparse corpus
53 | 
54 | 0.0.43 (2019-11-01)
55 | -------------------
56 | 
57 | * Added slash as a general separator
58 | * Added ruleTODTOD (to support expression like afternoon/evening)
59 | 
60 | 0.0.42 (2019-10-30)
61 | -------------------
62 | 
63 | * Removed nb module
64 | * Fix for two digit years
65 | * Freshly retrained model binary file
66 | 
67 | 0.0.41 (2019-10-29)
68 | -------------------
69 | 
70 | * Fix run_corpus refactoring bug
71 | * Implemented retraining utilities
72 | 
73 | 0.0.40 (2019-10-25)
74 | -------------------
75 | 
76 | * update develop dependencies
77 | * remove unused Protocol import from typing_extensions
78 | 
79 | 0.0.39 (2019-10-24)
80 | -------------------
81 | 
82 | * split ctparse file into several different modules
83 | * added types to public interface
84 | * introduced the Scorer abstraction to implement richer scoring strategies
85 | 
86 | 0.0.38 (2018-11-05)
87 | -------------------
88 | 
89 | * Added python 3.7 to supported versions (fix on travis available)
90 | 
91 | 0.0.8 (2018-06-07)
92 | ------------------
93 | 
94 | * First release on PyPI.
95 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018, Sebastian Mika, Comtravo
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include AUTHORS.rst
 2 | include CONTRIBUTING.rst
 3 | include HISTORY.rst
 4 | include LICENSE
 5 | include README.rst
 6 | include requirements.txt
 7 | 
 8 | recursive-include tests *
 9 | recursive-exclude * __pycache__
10 | recursive-exclude * *.py[co]
11 | 
12 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
13 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: clean clean-test clean-pyc clean-build docs help
 2 | .DEFAULT_GOAL := help
 3 | 
 4 | define BROWSER_PYSCRIPT
 5 | import os, webbrowser, sys
 6 | 
 7 | try:
 8 | 	from urllib import pathname2url
 9 | except:
10 | 	from urllib.request import pathname2url
11 | 
12 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
13 | endef
14 | export BROWSER_PYSCRIPT
15 | 
16 | define PRINT_HELP_PYSCRIPT
17 | import re, sys
18 | 
19 | for line in sys.stdin:
20 | 	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
21 | 	if match:
22 | 		target, help = match.groups()
23 | 		print("%-20s %s" % (target, help))
24 | endef
25 | export PRINT_HELP_PYSCRIPT
26 | 
27 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
28 | 
29 | help:
30 | 	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
31 | 
32 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33 | 
34 | clean-build: ## remove build artifacts
35 | 	rm -fr build/
36 | 	rm -fr dist/
37 | 	rm -fr .eggs/
38 | 	find . -name '*.egg-info' -exec rm -fr {} +
39 | 	find . -name '*.egg' -exec rm -f {} +
40 | 
41 | clean-pyc: ## remove Python file artifacts
42 | 	find . -name '*.pyc' -exec rm -f {} +
43 | 	find . -name '*.pyo' -exec rm -f {} +
44 | 	find . -name '*~' -exec rm -f {} +
45 | 	find . -name '__pycache__' -exec rm -fr {} +
46 | 
47 | clean-test: ## remove test and coverage artifacts
48 | 	rm -fr .tox/
49 | 	rm -f .coverage
50 | 	rm -fr htmlcov/
51 | 	rm -fr .pytest_cache
52 | 
53 | lint: ## check style with flake8
54 | 	black --check ctparse tests
55 | 	flake8 ctparse tests
56 | 	mypy -p ctparse -p tests
57 | 
58 | test: ## run tests quickly with the default Python
59 | 	py.test
60 | 
61 | test-all: ## run tests on every Python version with tox
62 | 	tox
63 | 
64 | train:
65 | 	python scripts/train_default_model.py --legacy --dataset datasets/timeparse_corpus.json
66 | 
67 | coverage: ## check code coverage quickly with the default Python
68 | 	coverage run --source ctparse -m pytest
69 | 	coverage report -m
70 | 	coverage html
71 | 	$(BROWSER) htmlcov/index.html
72 | 
73 | docs: ## generate Sphinx HTML documentation, including API docs
74 | 	rm -f docs/ctparse.rst
75 | 	rm -f docs/modules.rst
76 | 	sphinx-apidoc -o docs/ ctparse
77 | 	$(MAKE) -C docs clean
78 | 	$(MAKE) -C docs html
79 | 	$(BROWSER) docs/_build/html/index.html
80 | 
81 | servedocs: docs ## compile the docs watching for changes
82 | 	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
83 | 
84 | release: dist ## package and upload a release
85 | 	twine upload dist/*
86 | 
87 | dist: clean ## builds source and wheel package
88 | 	python setup.py sdist
89 | 	python setup.py bdist_wheel
90 | 	ls -l dist
91 | 
92 | install: clean ## install the package to the active Python's site-packages
93 | 	python setup.py install
94 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | ===========================================================
  2 | quickadd
  3 | ===========================================================
  4 | 
  5 | quickadd is a natural language date & time parser written in python. It builds on top of ctparse_ and is an actively maintained fork. 
  6 | 
  7 | Installation
  8 | ----------
  9 | 
 10 | With ``pip install -e git+https://github.com/Acreom/quickadd.git#egg=quickadd``
 11 | 
 12 | 
 13 | or run ``python setup.py install`` in the root directory after forking.
 14 | 
 15 | 
 16 | Main upgrades include:
 17 | ----------
 18 | 
 19 | **Recurring events**
 20 | 
 21 | 
 22 | .. code:: python
 23 | 
 24 |     r = ctparse("beer daily 4pm")
 25 |     r.resolution
 26 |     Recurring[5-14]{daily 1 2021-05-09 16:00 (X/X) 2021-05-09 16:00 (X/X)}
 27 |     
 28 |     r = ctparse("beer every thursday 4")
 29 |     r.resolution
 30 |     Recurring[5-21]{weekly 1 2021-04-15 16:00 (X/X) 2021-04-15 16:00 (X/X)}
 31 |     
 32 |     r = ctparse("beer every friday 9-5")
 33 |     r.resolution
 34 |     Recurring[5-21]{weekly 1 2021-05-14 09:00 (X/X) 2021-05-14 17:00 (X/X)}
 35 |     
 36 |     r = ctparse("beer september 24 / beer every 24.9")
 37 |     r.resolution
 38 |     Recurring[5-21]{YEARLY 1 2021-09-24 (X/X) 2021-09-24 (X/X)}
 39 | 
 40 |     r = ctparse("beer thursdays 3pm and wednesdays 4pm")
 41 |     r.resolution
 42 |     RecurringArray[5-37]{
 43 |     Recurring instance: weekly 1 2021-05-13 15:00 (X/X) 2021-05-13 15:00 (X/X) 
 44 |     Recurring instance: weekly 1 2021-05-12 16:00 (X/X) 2021-05-12 16:00 (X/X)
 45 |     }
 46 |     
 47 |     r = ctparse("beer 9pm weekdays")
 48 |     r.resolution
 49 |     RecurringArray[5-17]{
 50 |     Recurring instance: weekly 1 2021-05-10 21:00 (X/X) 2021-05-10 21:00 (X/X) 
 51 |     Recurring instance: weekly 1 2021-05-11 21:00 (X/X) 2021-05-11 21:00 (X/X) 
 52 |     Recurring instance: weekly 1 2021-05-12 21:00 (X/X) 2021-05-12 21:00 (X/X) 
 53 |     Recurring instance: weekly 1 2021-05-13 21:00 (X/X) 2021-05-13 21:00 (X/X) 
 54 |     Recurring instance: weekly 1 2021-05-14 21:00 (X/X) 2021-05-14 21:00 (X/X)}    
 55 | 
 56 | 
 57 | **More rules**
 58 | 
 59 | ruleNextFrequency
 60 | 
 61 | .. code:: python
 62 |     
 63 |     #reference date = Dec 13th 2022
 64 |     r = ctparse("code next week 4pm")
 65 |     r.resolution
 66 |     Time[5-18]{2022-12-20 16:00
 67 |     
 68 |     r = ctparse("code next month")
 69 |     r.resolution
 70 |     Time[5-15]{2023-01-13 X:X (X/X)}
 71 | 
 72 | 
 73 | ruleLastDOM
 74 | 
 75 | .. code:: python
 76 |     
 77 |     #reference date = Dec 13th 2022
 78 |     r = ctparse("code last monday of the month")
 79 |     r.resolution
 80 |     Time[5-17]{2022-12-26 X:X (X/X)}
 81 | 
 82 | 
 83 | rrule_ **support**
 84 | 
 85 | .. code:: python
 86 | 
 87 |     r.resolution.to_rrule()
 88 |     Out[4]: 'RRULE:FREQ=DAILY;COUNT=1'
 89 |     
 90 | 
 91 | **Subject extraction**
 92 | 
 93 | 
 94 | .. code:: python
 95 | 
 96 |     r = ctparse("beers and burgers friday 8pm-9pm")
 97 |     r.subject
 98 |     Out[2]: 'beers and burgers'
 99 |     
100 |     
101 | **PM bias**
102 | 
103 | 
104 | .. code:: python
105 | 
106 |     r = ctparse("fix the issue tmrw 2")
107 |     r.resolution
108 |     Time[14-20]{2022-11-23 14:00 (X/X)}
109 |     
110 |     r = ctparse("fix the issue tmrw 2", pm_bias=False)
111 |     r.resolution
112 |     Time[14-20]{2022-11-23 02:00 (X/X)}
113 | 
114 | 
115 | **Rules for ambigious natural language expressions** 
116 | 
117 | .. code:: python
118 | 
119 |     r = ctparse("code 9-5")
120 |     r.resolution
121 |     Interval[0-0]{2022-11-23 09:00 (X/X) - 2022-11-23 17:00 (X/X)}
122 | 
123 | 
124 | **US/EU date format**
125 | 
126 | 
127 | .. code:: python
128 | 
129 |     r = ctparse("fix the issue 5.3")
130 |     r.resolution
131 |     Time[14-17]{2023-03-05 X:X (X/X)}
132 |     
133 |     r = ctparse("fix the issue 5.3", date_format="US")
134 |     r.resolution
135 |     Time[14-17]{2023-05-03 X:X (X/X)}
136 | 
137 | 
138 | **Rule combinations** 
139 | 
140 | .. code:: python
141 | 
142 |     r = ctparse("beer in 3 days 4pm")
143 |     r.resolution
144 |     Time[5-18]{2021-05-12 16:00 (X/X)}
145 |     
146 |     
147 |     r = ctparse("beer in 3 days 4pm every week")
148 |     r.resolution
149 |     Recurring[5-29]{weekly 1 2021-05-12 16:00 (X/X) 2021-05-12 16:00 (X/X)}
150 | 
151 | 
152 |     r = ctparse("beer every friday 4-6:30pm")
153 |     r.resolution
154 |     Recurring[5-26]{WEEKLY 1 2022-11-25 16:00 (X/X) 2022-11-25 18:30 (X/X)}
155 | 
156 | 
157 | ``+`` **performance improvements**
158 | 
159 | 
160 | Base Capabilities
161 | ----------
162 | | **Time** 
163 | 
164 | .. code:: python
165 | 
166 |     "beer thursday 4"
167 |     Time[5-15]{2021-05-13 16:00 (X/X)}
168 | 
169 | 
170 | | **Interval** 
171 | 
172 | .. code:: python
173 | 
174 |     "beer 4-6"
175 |     Interval[0-0]{2021-05-09 16:00 (X/X) - 2021-05-09 18:00 (X/X)}
176 | 
177 | 
178 | | **Duration** 
179 | 
180 | .. code:: python
181 | 
182 |     "beer in 4 hours"
183 |     Duration[5-15]{4 hours}
184 | 
185 | 
186 | Ctparse
187 | ----------
188 | 
189 | The package ``ctparse`` is a pure python package to parse time
190 | expressions from natural language (i.e. strings). In many ways it builds
191 | on similar concepts as Facebook’s ``duckling`` package
192 | (https://github.com/facebook/duckling). However, for the time being it
193 | only targets times and only German and English text.
194 | 
195 | In principle ``ctparse`` can be used to **detect** time expressions in a
196 | text, however its main use case is the semantic interpretation of such
197 | expressions. Detecting time expressions in the first place can - to our
198 | experience - be done more efficiently (and precisely) using e.g. CRFs or
199 | other models targeted at this specific task.
200 | 
201 | ``ctparse`` is designed with the use case in mind where interpretation
202 | of time expressions is done under the following assumptions:
203 | 
204 | -  All expressions are relative to some pre-defined reference times
205 | -  Unless explicitly specified in the time expression, valid resolutions
206 |    are in the future relative to the reference time (i.e. ``12.5.`` will
207 |    be the next 12th of May, but ``12.5.2012`` should correctly resolve
208 |    to the 12th of May 2012).
209 | -  If in doubt, resolutions in the near future are more likely than
210 |    resolutions in the far future (not implemented yet, but any
211 |    resolution more than i.e. 3 month in the future is extremely
212 |    unlikely).
213 | 
214 | The specific comtravo use-case is resolving time expressions in booking
215 | requests which almost always refer to some point in time within the next
216 | 4-8 weeks.
217 | 
218 | ``ctparse`` currently is language agnostic and supports German and
219 | English expressions. This might get an extension in the future. The main
220 | reason is that in real world communication more often than not people
221 | write in one language (their business language) but use constructs to
222 | express times that are based on their mother tongue and/or what they
223 | believe to be the way to express dates in the target language. This
224 | leads to text in German with English time expressions and vice-versa.
225 | Using a language detection upfront on the complete original text is for
226 | obvious no solution - rather it would make the problem worse.
227 | 
228 | Example
229 | -------
230 | 
231 | .. code:: python
232 | 
233 |    from ctparse import ctparse
234 |    from datetime import datetime
235 | 
236 |    # Set reference time
237 |    ts = datetime(2018, 3, 12, 14, 30)
238 |    ctparse('May 5th 2:30 in the afternoon', ts=ts)
239 | 
240 | This should return a ``Time`` object represented as
241 | ``Time[0-29]{2018-05-05 14:30 (X/X)}``, indicating that characters
242 | ``0-29`` were used in the resolution, that the resolved date time is the
243 | 5th of May 2018 at 14:30 and that this resolution is neither based on a
244 | day of week (first ``X``) nor a part of day (second ``X``).
245 | 
246 | 
247 | Latent time
248 | ~~~~~~~~~~~
249 | 
250 | Normally, ``ctparse`` will anchor time expressions to the reference time. 
251 | For example, when parsing the time expression ``8:00 pm``, ctparse will
252 | resolve the expression to 8 pm after the reference time as follows
253 | 
254 | .. code:: python
255 | 
256 |    parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=True) # default
257 |    # parse.resolution -> Time(2020, 1, 1, 20, 00)
258 | 
259 | This behavior can be customized using the option ``latent_time=False``, which will
260 | return a time resolution not anchored to a particular date
261 | 
262 | .. code:: python
263 | 
264 |    parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=False)
265 |    # parse.resolution -> Time(None, None, None, 20, 00)
266 | 
267 | Implementation
268 | --------------
269 | 
270 | ``ctparse`` - as ``duckling`` - is a mixture of a rule and regular
271 | expression based system + some probabilistic modeling. In this sense it
272 | resembles a PCFG.
273 | 
274 | Rules
275 | ~~~~~
276 | 
277 | At the core ``ctparse`` is a collection of production rules over
278 | sequences of regular expressions and (intermediate) productions.
279 | 
280 | Productions are either of type ``Time``, ``Interval``, ``Duration`` or ``Recurring`` and can
281 | have certain predicates (e.g. whether a ``Time`` is a part of day like
282 | ``'afternoon'``).
283 | 
284 | A typical rule than looks like this:
285 | 
286 | .. code:: python
287 | 
288 |    @rule(predicate('isDate'), dimension(Interval))
289 | 
290 | I.e. this rule is applicable when the intermediate production resulted
291 | in something that has a date, followed by something that is in interval
292 | (like e.g. in ``'May 5th 9-10'``).
293 | 
294 | The actual production is a python function with the following signature:
295 | 
296 | .. code:: python
297 | 
298 |    @rule(predicate('isDate'), dimension(Interval))
299 |    def ruleDateInterval(ts, d, i):
300 |      """
301 |      param ts: datetime - the current refenrence time
302 |      d: Time - a time that contains at least a full date
303 |      i: Interval - some Interval
304 |      """
305 |      if not (i.t_from.isTOD and i.t_to.isTOD):
306 |        return None
307 |      return Interval(
308 |        t_from=Time(year=d.year, month=d.month, day=d.day,
309 |                    hour=i.t_from.hour, minute=i.t_from.minute),
310 |        t_to=Time(year=d.year, month=d.month, day=d.day,
311 |                  hour=i.t_to.hour, minute=i.t_to.minute))
312 | 
313 | This production will return a new interval at the date of
314 | ``predicate('isDate')`` spanning the time coded in
315 | ``dimension(Interval)``. If the latter does code for something else than
316 | a time of day (TOD), no production is returned, e.g. the rule matched
317 | but failed.
318 | 
319 | 
320 | Technical Background
321 | ~~~~~~~~~~~~~~~~~~~~
322 | 
323 | Some observations on the problem:
324 | 
325 | -  Each rule is a combination of regular expressions and productions.
326 | -  Consequently, each production must originate in a sequence of regular
327 |    expressions that must have matched (parts of) the text.
328 | -  Hence, only subsequence of **all** regular expressions in **all**
329 |    rules can lead to a successful production.
330 | 
331 | To this end the algorithm proceeds as follows:
332 | 
333 | 1. Input a string and a reference time
334 | 2. Find all matches of all regular expressions from all rules in the
335 |    input strings. Each regular expression is assigned an identifier.
336 | 3. Find all distinct sequences of these matches where two matches do not
337 |    overlap nor have a gap inbetween
338 | 4. To each such subsequence apply all rules at all possible positions
339 |    until no further rules can be applied - in which case one solution is
340 |    produced
341 | 
342 | Obviously, not all sequences of matching expressions and not all
343 | sequences of rules applied on top lead to meaningful results. Here the
344 | **P**\ CFG kicks in:
345 | 
346 | -  Based on example data (``corpus.py``) a model is calibrated to
347 |    predict how likely a production is to lead to a/the correct result.
348 |    Instead of doing a breadth first search, the most promising
349 |    productions are applied first.
350 | -  Resolutions are produced until there are no more resolutions or a
351 |    timeout is hit.
352 | -  Based on the same model from all resolutions the highest scoring is
353 |    returned.
354 | 
355 | 
356 | .. _ctparse: https://github.com/comtravo/ctparse
357 | .. _rrule: https://dateutil.readthedocs.io/en/stable/rrule.html
358 | 
359 | Credits
360 | -------
361 | 
362 | This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template.
363 | 
364 | .. _Cookiecutter: https://github.com/audreyr/cookiecutter
365 | .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
366 | 


--------------------------------------------------------------------------------
/ctparse/__init__.py:
--------------------------------------------------------------------------------
 1 | """ctparse - parse time expressions in strings
 2 | 
 3 | .. moduleauthor:: Comtravo
 4 | 
 5 | """
 6 | __author__ = """Sebastian Mika"""
 7 | __email__ = "sebastian.mika@comtravo.com"
 8 | __version__ = "__version__ = '0.3.0'"
 9 | 
10 | from .ctparse import ctparse, ctparse_gen  # noqa
11 | 


--------------------------------------------------------------------------------
/ctparse/corpus.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | from datetime import datetime
  4 | from typing import Callable, Iterable, List, NamedTuple, Sequence, Tuple, TypeVar, Union
  5 | 
  6 | from tqdm import tqdm
  7 | 
  8 | from .ctparse import ctparse_gen
  9 | from .scorer import DummyScorer, Scorer
 10 | from .types import Artifact, Duration, Interval, Time
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | # A triplet of text, reference timestamp and correct parse.
 15 | # It can be used as raw data to build datasets for ctparse.
 16 | TimeParseEntry = NamedTuple(
 17 |     "TimeParseEntry", [("text", str), ("ts", datetime), ("gold", Artifact)],
 18 | )
 19 | 
 20 | T = TypeVar("T")
 21 | 
 22 | 
 23 | def make_partial_rule_dataset(
 24 |     entries: Sequence[TimeParseEntry],
 25 |     scorer: Scorer,
 26 |     timeout: Union[float, int],
 27 |     max_stack_depth: int,
 28 |     relative_match_len: float = 1.0,
 29 |     progress: bool = False,
 30 | ) -> Iterable[Tuple[List[str], bool]]:
 31 |     """Build a data set from an iterable of TimeParseEntry.
 32 | 
 33 |     The text is run through ctparse and all parses (within the specified timeout,
 34 |     max_stack_depth and scorer) are obtained. Each parse contains a sequence
 35 |     of rules (see ``CTParse.rules``) used to produce that parse.
 36 | 
 37 |     A dataset is generated by taking every possible partial rule and assigning to it
 38 |     a boolean indicating if that partial sequence did lead to a successful parse.
 39 | 
 40 |     If `progress` is ``True``, display a progress bar.
 41 | 
 42 |     Example:
 43 | 
 44 |     rule sequence: [r1, r2, r3]
 45 |     parse_is_correct: True
 46 | 
 47 |     [r1] -> True
 48 |     [r1, r2] -> True
 49 |     [r1, r2, r3] -> True
 50 |     """
 51 |     # If we look at the signature for a scorer, the score is obtained from:
 52 |     # (text, reference_time, partial_parse) and optionally a production for a
 53 |     # partial parse.
 54 |     # Clearly, if we were to make a general scorer for the dataset, we would need
 55 |     # all of these features. It is possible to achieve that by tracking the list of
 56 |     # partial parses that led to a correct parse. Unfortunately we don't have the
 57 |     # full history with the current implementation, however we can obtain a dataset
 58 |     # of (text, reference_time, rule_ids) quite easily, because the rule is a linear
 59 |     # list.
 60 | 
 61 |     if progress:
 62 |         entries_it = _progress_bar(
 63 |             entries,
 64 |             total=len(entries),
 65 |             status_text=lambda entry: "  {: <70}".format(entry.text),
 66 |         )
 67 |     else:
 68 |         entries_it = entries
 69 | 
 70 |     for entry in entries_it:
 71 |         for parse in ctparse_gen(
 72 |             entry.text,
 73 |             entry.ts,
 74 |             relative_match_len=relative_match_len,
 75 |             timeout=timeout,
 76 |             max_stack_depth=max_stack_depth,
 77 |             scorer=scorer,
 78 |             latent_time=False,
 79 |         ):
 80 |             # TODO: we should make sure ctparse_gen never returns None. If there is no
 81 |             # result it should return an empty list
 82 |             if parse is None:
 83 |                 continue
 84 | 
 85 |             y = parse.resolution == entry.gold
 86 |             # Build data set, one sample for each applied rule in
 87 |             # the sequence of rules applied in this production
 88 |             # *after* the matched regular expressions
 89 |             for i in range(1, len(parse.production) + 1):
 90 |                 X = [str(p) for p in parse.production[:i]]
 91 |                 yield X, y
 92 | 
 93 | 
 94 | def _progress_bar(
 95 |     it: Iterable[T], total: int, status_text: Callable[[T], str]
 96 | ) -> Iterable[T]:
 97 |     # Progress bar that can update text
 98 |     pbar = tqdm(it, total=total)
 99 |     for val in pbar:
100 |         pbar.set_description(status_text(val))
101 |         yield val
102 | 
103 | 
104 | def load_timeparse_corpus(fname: str) -> Sequence[TimeParseEntry]:
105 |     """Load a corpus from disk.
106 | 
107 |     For more information about the format of the time parse corpus,
108 |     refer to the documentation.
109 |     """
110 |     with open(fname, "r", encoding="utf-8") as fd:
111 |         entries = json.load(fd)
112 | 
113 |     return [
114 |         TimeParseEntry(
115 |             text=e["text"],
116 |             ts=datetime.strptime(e["ref_time"], "%Y-%m-%dT%H:%M:%S"),
117 |             gold=parse_nb_string(e["gold_parse"]),
118 |         )
119 |         for e in entries
120 |     ]
121 | 
122 | 
123 | def parse_nb_string(gold_parse: str) -> Union[Time, Interval, Duration]:
124 |     """Parse a Time, Interval or Duration from their no-bound string representation.
125 | 
126 |     The no-bound string representations are generated from ``Artifact.nb_str``.
127 |     """
128 |     if gold_parse.startswith("Time"):
129 |         return Time.from_str(gold_parse[7:-1])
130 |     if gold_parse.startswith("Interval"):
131 |         return Interval.from_str(gold_parse[11:-1])
132 |     if gold_parse.startswith("Duration"):
133 |         return Duration.from_str(gold_parse[11:-1])
134 |     else:
135 |         raise ValueError("'{}' has an invalid format".format(gold_parse))
136 | 
137 | 
138 | def run_corpus(
139 |     corpus: Sequence[Tuple[str, str, Sequence[str]]]
140 | ) -> Tuple[List[List[str]], List[bool]]:
141 |     """Load the corpus (currently hard coded), run it through ctparse with
142 |     no timeout and no limit on the stack depth.
143 | 
144 |     The corpus passes if ctparse generates the desired solution for
145 |     each test at least once. Otherwise it fails.
146 | 
147 |     While testing this, a labeled data set (X, y) is generated based
148 |     on *all* productions. Given a final production p, based on initial
149 |     regular expression matches r_0, ..., r_n, which are then
150 |     subsequently transformed using production rules p_0, ..., p_m,
151 |     will result in the samples
152 | 
153 |     [r_0, ..., r_n, p_0, 'step_0']
154 |     [r_0, ..., r_n, p_0, p_1, 'step_1']
155 |     ...
156 |     [r_0, ..., r_n, p_0, ..., p_m, 'step_m']
157 | 
158 |     All samples from one production are given the same label which indicates if
159 |     the production was correct.
160 | 
161 |     To build a similar datasets without the strict checking, use
162 |     `make_partial_rule_dataset`
163 |     """
164 |     at_least_one_failed = False
165 |     # pos_parses: number of parses that are correct
166 |     # neg_parses: number of parses that are wrong
167 |     # pos_first_parses: number of first parses generated that are correct
168 |     # pos_best_scored: number of correct parses that have the best score
169 |     pos_parses = neg_parses = pos_first_parses = pos_best_scored = 0
170 |     total_tests = 0
171 |     Xs = []
172 |     ys = []
173 |     for target, ts, tests in tqdm(corpus):
174 |         ts = datetime.strptime(ts, "%Y-%m-%dT%H:%M")
175 |         all_tests_pass = True
176 |         for test in tests:
177 |             one_prod_passes = False
178 |             first_prod = True
179 |             y_score = []
180 |             for parse in ctparse_gen(
181 |                 test,
182 |                 ts,
183 |                 relative_match_len=1.0,
184 |                 timeout=0,
185 |                 max_stack_depth=0,
186 |                 scorer=DummyScorer(),
187 |                 latent_time=False,
188 |             ):
189 |                 assert parse is not None
190 | 
191 |                 y = parse.resolution.nb_str() == target
192 |                 # Build data set, one sample for each applied rule in
193 |                 # the sequence of rules applied in this production
194 |                 # *after* the matched regular expressions
195 |                 for i in range(1, len(parse.production) + 1):
196 |                     Xs.append([str(p) for p in parse.production[:i]])
197 |                     ys.append(y)
198 | 
199 |                 one_prod_passes |= y
200 |                 pos_parses += int(y)
201 |                 neg_parses += int(not y)
202 |                 pos_first_parses += int(y and first_prod)
203 |                 first_prod = False
204 |                 y_score.append((parse.score, y))
205 |             if not one_prod_passes:
206 |                 logger.warning(
207 |                     'failure: target "{}" never produced in "{}"'.format(target, test)
208 |                 )
209 |             pos_best_scored += int(max(y_score, key=lambda x: x[0])[1])
210 |             total_tests += len(tests)
211 |             all_tests_pass &= one_prod_passes
212 |         if not all_tests_pass:
213 |             logger.warning('failure: "{}" not always produced'.format(target))
214 |             at_least_one_failed = True
215 |     logger.info(
216 |         "run {} tests on {} targets with a total of "
217 |         "{} positive and {} negative parses (={})".format(
218 |             total_tests, len(corpus), pos_parses, neg_parses, pos_parses + neg_parses
219 |         )
220 |     )
221 |     logger.info(
222 |         "share of correct parses in all parses: {:.2%}".format(
223 |             pos_parses / (pos_parses + neg_parses)
224 |         )
225 |     )
226 |     logger.info(
227 |         "share of correct parses being produced first: {:.2%}".format(
228 |             pos_first_parses / (pos_parses + neg_parses)
229 |         )
230 |     )
231 |     logger.info(
232 |         "share of correct parses being scored highest: {:.2%}".format(
233 |             pos_best_scored / total_tests
234 |         )
235 |     )
236 |     if at_least_one_failed:
237 |         raise Exception("ctparse corpus has errors")
238 |     return Xs, ys
239 | 


--------------------------------------------------------------------------------
/ctparse/count_vectorizer.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | from typing import Dict, Sequence, Tuple, Optional
  3 | 
  4 | 
  5 | class CountVectorizer:
  6 |     def __init__(self, ngram_range: Tuple[int, int]):
  7 |         """Create new count vectorizer that also counts n-grams.
  8 | 
  9 |         A count vectorizer builds an internal vocabulary and embeds each input
 10 |         by counting for each term in the document how often it appearsin the vocabulary.
 11 |         Here also n-grams are considered to be part of the vocabulary and the document
 12 |         terms, respectively
 13 | 
 14 |         Parameters
 15 |         ----------
 16 |         ngram_range : Tuple[int, int]
 17 |             n-gram range to consider
 18 |         """
 19 |         self.ngram_range = ngram_range
 20 |         self.vocabulary: Optional[Dict[str, int]] = None
 21 | 
 22 |     @staticmethod
 23 |     def _create_ngrams(
 24 |         ngram_range: Tuple[int, int], documents: Sequence[Sequence[str]]
 25 |     ) -> Sequence[Sequence[str]]:
 26 |         """For each document in documents, replace original tokens by a list of
 27 |         all min_n:max_n = self.ngram_range ngrams in that document.
 28 | 
 29 |         Parameters
 30 |         ----------
 31 |         ngram_range : Tuple[int, int]
 32 |             Min and max number of ngrams to generate
 33 | 
 34 |         documents : Sequence[Sequence[str]]
 35 |             A sequence of already tokenized documents
 36 | 
 37 |         Returns
 38 |         -------
 39 |         Sequence[Sequence[str]]
 40 |             For each document all ngrams of tokens in the desired range
 41 |         """
 42 |         min_n, max_n = ngram_range
 43 |         space_join = " ".join
 44 | 
 45 |         def _create(document: Sequence[str]) -> Sequence[str]:
 46 |             doc_len = len(document)
 47 |             doc_max_n = min(max_n, doc_len) + 1
 48 |             if min_n == 1:
 49 |                 ngrams = list(document)
 50 |                 min_nn = min_n + 1
 51 |             else:
 52 |                 ngrams = []
 53 |                 min_nn = min_n
 54 | 
 55 |             for n in range(min_nn, doc_max_n):
 56 |                 for i in range(0, doc_len - n + 1):
 57 |                     ngrams.append(space_join(document[i : i + n]))
 58 |             return ngrams
 59 | 
 60 |         return [_create(d) for d in documents]
 61 | 
 62 |     @staticmethod
 63 |     def _get_feature_counts(
 64 |         ngram_range: Tuple[int, int], documents: Sequence[Sequence[str]]
 65 |     ) -> Sequence[Dict[str, int]]:
 66 |         """Count (ngram) features appearing in each document
 67 | 
 68 |         Parameters
 69 |         ----------
 70 |         ngram_range : Tuple[int, int]
 71 |             Min and max number of ngrams to generate
 72 | 
 73 |         documents : Sequence[Sequence[str]]
 74 |             Sequence of documents tokenized as sequence of string
 75 | 
 76 |         Returns
 77 |         -------
 78 |         Tuple[Sequence[Dict[str, int]], Set[str]]
 79 |             For each document a dictionary counting how often which feature appeared and
 80 |             a set of all features in all documents. Features are according to this
 81 |             vectorizers n-gram settings.
 82 |         """
 83 |         documents = CountVectorizer._create_ngrams(ngram_range, documents)
 84 |         count_matrix = []
 85 | 
 86 |         for document in documents:
 87 |             # This is 5x faster than using a build in Counter
 88 |             feature_counts: Dict[str, int] = defaultdict(int)
 89 |             for feature in document:
 90 |                 feature_counts[feature] += 1
 91 |             count_matrix.append(feature_counts)
 92 |         return count_matrix
 93 | 
 94 |     @staticmethod
 95 |     def _build_vocabulary(count_matrix: Sequence[Dict[str, int]]) -> Dict[str, int]:
 96 |         """Build the vocabulary from feature counts
 97 | 
 98 |         Parameters
 99 |         ----------
100 |         count_matrix : Sequence[Dict[str, int]]
101 |             Sequence of dicts with counts (values) per feature (keys)
102 | 
103 |         Returns
104 |         -------
105 |         Dict[str, int]
106 |             The vocabulary as {feature: index} pairs
107 |         """
108 |         all_features = set()
109 |         for feature_counts in count_matrix:
110 |             for feature in feature_counts.keys():
111 |                 all_features.add(feature)
112 |         return {word: idx for idx, word in enumerate(sorted(all_features))}
113 | 
114 |     @staticmethod
115 |     def _create_feature_matrix(
116 |         vocabulary: Dict[str, int], count_matrix: Sequence[Dict[str, int]]
117 |     ) -> Sequence[Dict[int, int]]:
118 |         """Map counts of string features to numerical data (sparse maps of
119 |         `{feature_index: count}`). Here `feature_index` is relative to the vocabulary of
120 |         this vectorizer.
121 | 
122 |         Parameters
123 |         ----------
124 |         vocabulary : Dict[str, int]
125 |             Vocabulary with {feature: index} mappings
126 | 
127 |         count_matrix : Sequence[Dict[str, int]]
128 |             Sequence of dictionaries with feature counts
129 | 
130 |         Returns
131 |         -------
132 |         Sequence[Dict[int, int]]
133 |             For each document a mapping of `feature_index` to a count how often this
134 |             feature appeared in the document.
135 |         """
136 |         len_vocab = len(vocabulary)
137 |         count_vectors_matrix = []
138 |         # Build document frequency matrix
139 |         for count_dict in count_matrix:
140 |             doc_vector: Dict[int, int] = defaultdict(int)
141 |             for word, cnt in count_dict.items():
142 |                 idx = vocabulary.get(word, None)
143 |                 if idx is not None:
144 |                     doc_vector[idx] = cnt
145 |             count_vectors_matrix.append(doc_vector)
146 |         # add vocab length in first element
147 |         count_vectors_matrix[0][len_vocab - 1] = count_vectors_matrix[0][len_vocab - 1]
148 |         return count_vectors_matrix
149 | 
150 |     def fit(self, documents: Sequence[Sequence[str]]) -> "CountVectorizer":
151 |         """Learn a vocabulary dictionary of all tokens in the raw documents.
152 | 
153 |         Parameters
154 |         ----------
155 |         documents : Sequence[Sequence[str]]
156 |             Sequence of documents, each as a sequence of tokens
157 | 
158 |         Returns
159 |         -------
160 |         CountVectorizer
161 |             The updated vectorizer, i.e. this updates the internal vocabulary
162 |         """
163 |         self.fit_transform(documents)
164 |         return self
165 | 
166 |     def fit_transform(
167 |         self, documents: Sequence[Sequence[str]]
168 |     ) -> Sequence[Dict[int, int]]:
169 |         """Learn the vocabulary dictionary and return a term-document matrix. Updates
170 |         the internal vocabulary state of the vectorizer.
171 | 
172 |         Parameters
173 |         ----------
174 |         documents : Sequence[Sequence[str]
175 |             Sequence of documents, each as a sequence of tokens
176 | 
177 |         Returns
178 |         -------
179 |         Sequence[Dict[int, int]]
180 |             Document-term matrix.
181 |         """
182 |         count_matrix = CountVectorizer._get_feature_counts(self.ngram_range, documents)
183 |         self.vocabulary = CountVectorizer._build_vocabulary(count_matrix)
184 |         return CountVectorizer._create_feature_matrix(self.vocabulary, count_matrix)
185 | 
186 |     def transform(self, documents: Sequence[Sequence[str]]) -> Sequence[Dict[int, int]]:
187 |         """Create term-document matrix based on pre-generated vocabulary. Does *not*
188 |         update the internal state of the vocabulary.
189 | 
190 |         Parameters
191 |         ----------
192 |         documents : Sequence[Sequence[str]]
193 |             Sequence of documents, each as a sequence of tokens
194 | 
195 |         Returns
196 |         -------
197 |         Sequence[Dict[int, int]]
198 |             Document-term matrix.
199 |         """
200 |         if not self.vocabulary:
201 |             raise ValueError("no vocabulary - vectorizer not fitted?")
202 |         count_matrix = CountVectorizer._get_feature_counts(self.ngram_range, documents)
203 |         return CountVectorizer._create_feature_matrix(self.vocabulary, count_matrix)
204 | 


--------------------------------------------------------------------------------
/ctparse/ctparse.py:
--------------------------------------------------------------------------------
  1 | from ctparse.time.postprocess_latent import apply_postprocessing_rules
  2 | import logging
  3 | from datetime import datetime
  4 | from typing import (
  5 |     cast,
  6 |     Callable,
  7 |     Dict,
  8 |     Iterator,
  9 |     List,
 10 |     Optional,
 11 |     Sequence,
 12 |     Tuple,
 13 |     Union,
 14 | )
 15 | 
 16 | import re
 17 | 
 18 | import regex
 19 | from itertools import chain
 20 | 
 21 | from .partial_parse import PartialParse
 22 | from .rule import _regex as global_regex, eu_regex, us_regex
 23 | from .scorer import Scorer
 24 | from .timers import CTParseTimeoutError, timeit
 25 | 
 26 | # Avoid collision with variable "timeout"
 27 | from .timers import timeout as timeout_
 28 | from .types import Artifact, RegexMatch
 29 | from .loader import load_default_scorer
 30 | 
 31 | logger = logging.getLogger(__name__)
 32 | 
 33 | _DEFAULT_SCORER = load_default_scorer()
 34 | 
 35 | 
 36 | class CTParse:
 37 |     def __init__(
 38 |         self,
 39 |         resolution: Artifact,
 40 |         production: Tuple[Union[int, str], ...],
 41 |         score: float,
 42 |         subject: str,
 43 |         # labels: str,
 44 |     ) -> None:
 45 |         """A possible parse returned by ctparse.
 46 | 
 47 |         :param resolution: the parsed `Time`, `Interval` or `Duration`
 48 |         :param production: the sequence of rules (productions) used to arrive
 49 |           at the parse
 50 |         :param score: a numerical score used to rank parses. A high score means
 51 |           a more likely parse
 52 |         """
 53 |         self.resolution = resolution
 54 |         self.production = production
 55 |         self.score = score
 56 |         self.subject = subject
 57 |         # self.labels = labels
 58 | 
 59 |     def __repr__(self) -> str:
 60 |         return "CTParse({}, {}, {}, {})".format(
 61 |             self.resolution, self.production, self.score, self.subject) #self.labels
 62 | 
 63 |     def __str__(self) -> str:
 64 |         return "{} s={:.3f} p={} sb={}".format(self.resolution, self.score, self.production, self.subject)
 65 |                                                        #self.labels)
 66 | 
 67 | 
 68 | def ctparse(
 69 |     txt: str,
 70 |     ts: Optional[datetime] = None,
 71 |     pm_bias: Optional[bool] = True,
 72 |     date_format: Optional[str] = None,
 73 |     fallback: Optional[bool] = False,
 74 |     timeout: Union[int, float] = 1.0,
 75 |     debug: bool = False,
 76 |     relative_match_len: float = 1.0,
 77 |     max_stack_depth: int = 10,
 78 |     scorer: Optional[Scorer] = None,
 79 |     latent_time: bool = True,
 80 | ) -> Optional[CTParse]:
 81 |     """Parse a string *txt* into a time expression
 82 | 
 83 |     :param ts: reference time
 84 |     :type ts: datetime.datetime
 85 |     :param pm_bias: pm bias on or off / 24h or 12h format
 86 |     :param date_format: us / eu date format
 87 |     :param fallback: fallback option if default date format is not parsed
 88 |     :param timeout: timeout for parsing in seconds; timeout=0
 89 |                     indicates no timeout
 90 |     :type timeout: float
 91 |     :param debug: if True do return iterator over all resolution, else
 92 |                   return highest scoring one (default=False)
 93 |     :param relative_match_len: relative minimum share of
 94 |                                characters an initial regex match sequence must
 95 |                                cover compared to the longest such sequence found
 96 |                                to be considered for productions (default=1.0)
 97 |     :type relative_match_len: float
 98 |     :param max_stack_depth: limit the maximal number of highest scored candidate
 99 |                             productions considered for future productions
100 |                             (default=10); set to 0 to not limit
101 |     :type max_stack_depth: int
102 |     :param latent_time: if True, resolve expressions that contain only a time
103 |                         (e.g. 8:00 pm) to be the next matching time after
104 |                         reference time *ts*
105 |     :returns: Optional[CTParse]
106 |     """
107 |     parsed = ctparse_gen(
108 |         txt,
109 |         ts,
110 |         pm_bias,
111 |         date_format,
112 |         fallback,
113 |         timeout=timeout,
114 |         relative_match_len=relative_match_len,
115 |         max_stack_depth=max_stack_depth,
116 |         scorer=scorer,
117 |         latent_time=latent_time,
118 |     )
119 | 
120 |     # TODO: keep debug for back-compatibility, but remove it later
121 |     if debug:
122 |         return parsed  # type: ignore
123 |     else:
124 |         parsed_list = list(parsed)
125 |         # TODO: this way of testing a failure to find a match is a bit clunky with types
126 |         if len(parsed_list) == 0 or (len(parsed_list) == 1 and parsed_list[0] is None):
127 |             # logger.warning('Failed to produce result for "{}"'.format(txt))
128 |             # labels = _get_labels(txt)
129 |             txt = re.sub('#[a-zA-Z0-9\-_:/.]+', '', txt).strip()
130 |             subject = txt
131 |             return CTParse(None, None, None, subject)# labels)
132 |         parsed_list.sort(key=lambda p: p.score)  # type: ignore
133 |         return parsed_list[-1]
134 | 
135 | 
136 | def ctparse_gen(
137 |     txt: str,
138 |     ts: Optional[datetime] = None,
139 |     pm_bias: Optional[bool] = True,
140 |     date_format: Optional[str] = None,
141 |     fallback: Optional[bool] = False,
142 |     timeout: Union[int, float] = 1.0,
143 |     relative_match_len: float = 1.0,
144 |     max_stack_depth: int = 10,
145 |     scorer: Optional[Scorer] = None,
146 |     latent_time: bool = True,
147 | ) -> Iterator[Optional[CTParse]]:
148 |     """Generate parses for the string *txt*.
149 | 
150 |     This function is equivalent to ctparse, with the exception that it returns an
151 |     iterator over the matches as soon as they are produced.
152 |     """
153 |     if scorer is None:
154 |         scorer = _DEFAULT_SCORER
155 |     if ts is None:
156 |         ts = datetime.now()
157 | 
158 | 
159 |     generated_parse = list(_ctparse(
160 |         _preprocess_string(txt),
161 |         ts,
162 |         pm_bias,
163 |         date_format,
164 |         timeout=timeout,
165 |         relative_match_len=relative_match_len,
166 |         max_stack_depth=max_stack_depth,
167 |         scorer=scorer,
168 |     ))
169 | 
170 |     if fallback and not generated_parse:
171 |         if date_format == "US":
172 |             fallback_date_format = "EU"
173 |         else:
174 |             fallback_date_format = "US"
175 | 
176 |         for parse in _ctparse(
177 |             _preprocess_string(txt),
178 |             ts,
179 |             pm_bias,
180 |             date_format=fallback_date_format,
181 |             timeout=timeout,
182 |             relative_match_len=relative_match_len,
183 |             max_stack_depth=max_stack_depth,
184 |             scorer=scorer,
185 |         ):
186 |             if parse and latent_time:
187 |                 # NOTE: we post-process after scoring because the model has been trained
188 |                 # without using the latent time. This means also that the post processing
189 |                 # step won't be added to the rules
190 |                 prod = apply_postprocessing_rules(ts, parse.resolution)
191 |                 parse.resolution = prod
192 | 
193 |             yield parse
194 | 
195 |     else:
196 |         for parse in generated_parse:
197 |             if parse and latent_time:
198 |                 # NOTE: we post-process after scoring because the model has been trained
199 |                 # without using the latent time. This means also that the post processing
200 |                 # step won't be added to the rules
201 |                 prod = apply_postprocessing_rules(ts, parse.resolution)
202 |                 parse.resolution = prod
203 | 
204 |             yield parse
205 | 
206 |     # for parse in _ctparse(
207 |     #     _preprocess_string(txt),
208 |     #     ts,
209 |     #     pm_bias,
210 |     #     date_format,
211 |     #     timeout=timeout,
212 |     #     relative_match_len=relative_match_len,
213 |     #     max_stack_depth=max_stack_depth,
214 |     #     scorer=scorer,
215 |     # ):
216 |     #     if parse and latent_time:
217 |     #         # NOTE: we post-process after scoring because the model has been trained
218 |     #         # without using the latent time. This means also that the post processing
219 |     #         # step won't be added to the rules
220 |     #         prod = apply_postprocessing_rules(ts, parse.resolution)
221 |     #         parse.resolution = prod
222 |     #
223 |     #     yield parse
224 | 
225 | 
226 | def _ctparse(
227 |     txt: str,
228 |     ts: datetime,
229 |     pm_bias: bool,
230 |     date_format: str,
231 |     timeout: float,
232 |     relative_match_len: float,
233 |     max_stack_depth: int,
234 |     scorer: Scorer,
235 | ) -> Iterator[Optional[CTParse]]:
236 |     t_fun = timeout_(timeout)
237 | 
238 |     try:
239 |         # =========== Label extraction ===========
240 |         # labels = _get_labels(txt)
241 |         # clear raw text of labels so what follows works properly
242 |         # txt = re.sub('#[a-zA-Z0-9\-_:/.]+','', txt).strip()
243 | 
244 |         logger.debug("=" * 80)
245 |         logger.debug("-> matching regular expressions")
246 | 
247 |         scope_regex = {**global_regex, **us_regex} if date_format == 'US' else {**global_regex, **eu_regex}
248 |         p, _tp = timeit(_match_regex)(txt, scope_regex)
249 |         logger.debug("time in _match_regex: {:.0f}ms".format(1000 * _tp))
250 | 
251 |         logger.debug("=" * 80)
252 |         logger.debug("-> building initial stack")
253 |         regex_stack, _ts = timeit(_regex_stack)(txt, p, t_fun)
254 |         logger.debug("time in _regex_stack: {:.0f}ms".format(1000 * _ts))
255 | 
256 |         # add empty production path + counter of contained regex
257 |         stack = [PartialParse.from_regex_matches(s) for s in regex_stack]
258 |         # TODO: the score should be kept separate from the partial parse
259 |         # because it depends also on the text and the ts. A good idea is
260 |         # to create a namedtuple of kind StackElement(partial_parse, score)
261 |         for pp in stack:
262 |             pp.score = scorer.score(txt, ts, pp)
263 | 
264 |         logger.debug("initial stack length: {}".format(len(stack)))
265 |         # sort stack by length of covered string and - if that is equal - score
266 |         # --> last element is longest coverage and highest scored
267 |         stack.sort()
268 |         # only keep initial stack elements that cover at least
269 |         # relative_match_len characters of what the highest
270 |         # scored/covering stack element does cover
271 |         stack = [
272 |             s
273 |             for s in stack
274 |             if s.max_covered_chars >= stack[-1].max_covered_chars * relative_match_len
275 |         ]
276 | 
277 |         logger.debug("stack length after relative match length: {}".format(len(stack)))
278 |         # limit depth of stack
279 |         stack = stack[-max_stack_depth:]
280 |         logger.debug("stack length after max stack depth limit: {}".format(len(stack)))
281 | 
282 |         # ======================== SUBJECT-EXTRACTION ========================
283 |         # get subject by extracting regex stack from raw text
284 |         regex_matches = [match.prod for match in stack]
285 |         regex_matches = [product.match.captures() for tuple in regex_matches for product in tuple]
286 |         regex_matches = [match.split() for i in regex_matches for match in i]
287 |         regex_matches = list(chain.from_iterable(regex_matches))
288 | 
289 |         # "acr-11" edge case
290 |         s = re.search(r'\b[A-Za-z]+\b-\d+[A-Za-z]*', txt)
291 |         if s:
292 |             raw = re.split(r'[\s]+', txt)
293 |         else:
294 |             raw = re.split(r'[\s-]+', txt)
295 | 
296 |         # subject = list(set(raw) - set(matches)) # doesn't preserve order, but more efficient
297 |         subject = [i for i in raw if i not in regex_matches]
298 |         subject = ' '.join(subject)
299 | 
300 |         # remove subject from txt so there's no FP parse
301 |         if subject and subject in txt:
302 |             txt = subject.replace(txt, '')
303 | 
304 |             # reset stack if txt is 0
305 |             if len(txt) == 0:
306 |                 stack = []
307 |         # ===========================================================
308 | 
309 |         # track what has been added to the stack and do not add again
310 |         # if the score is not better
311 |         stack_prod = {}  # type: Dict[Tuple[Artifact, ...], float]
312 |         # track what has been emitted and do not emit again
313 |         parse_prod = {}  # type: Dict[Artifact, float]
314 |         while stack:
315 |             t_fun()
316 |             s = stack.pop()
317 |             logger.debug("-" * 80)
318 |             logger.debug("producing on {}, score={:.2f}".format(s.prod, s.score))
319 |             new_stack_elements = []
320 |             for r_name, r in s.applicable_rules.items():
321 |                 for r_match in _match_rule(s.prod, r[1]):
322 |                     # apply production part of rule
323 |                     new_s = s.apply_rule(ts, pm_bias, date_format, r[0], r_name, r_match)
324 | 
325 |                     # TODO: We should store scores separately from the production itself
326 |                     # because the score may depend on the text and the ts
327 |                     if new_s is not None:
328 |                         new_s.score = scorer.score(txt, ts, new_s)
329 | 
330 |                     if (
331 |                         new_s
332 |                         and stack_prod.get(new_s.prod, new_s.score - 1) < new_s.score
333 |                     ):
334 |                         # either new_s.prod has never been produced
335 |                         # before or the score of new_s is higher than
336 |                         # a previous identical production
337 |                         new_stack_elements.append(new_s)
338 |                         logger.debug(
339 |                             "  {} -> {}, score={:.2f}".format(
340 |                                 r_name, new_s.prod, new_s.score
341 |                             )
342 |                         )
343 |                         stack_prod[new_s.prod] = new_s.score
344 |             if not new_stack_elements:
345 |                 logger.debug("~" * 80)
346 |                 logger.debug("no rules applicable: emitting")
347 |                 # no new productions were generated from this stack element.
348 |                 # emit all (probably partial) production
349 |                 for x in s.prod:
350 |                     if not isinstance(x, RegexMatch):
351 |                         # TODO: why do we have a different method for scoring
352 |                         # final productions? This is because you may have non-reducible
353 |                         # parses of the kind [Time, RegexMatch, Interval] or
354 |                         # [Time, Time] etc. In this case we want to emit those Time,
355 |                         # Interval parses separately and score them appropriately
356 |                         # (the default Scorer.score function only operates on the
357 |                         # whole PartialParse).
358 |                         score_x = scorer.score_final(txt, ts, s, x)
359 |                         # only emit productions not emitted before or
360 |                         # productions emitted before but scored higher
361 |                         if parse_prod.get(x, score_x - 1) < score_x:
362 |                             parse_prod[x] = score_x
363 |                             logger.debug(
364 |                                 " => {}, score={:.2f}, ".format(x.__repr__(), score_x)
365 |                             )
366 |                             yield CTParse(x, s.rules, score_x, subject)#, labels)
367 |             else:
368 |                 # new productions generated, put on stack and sort
369 |                 # stack by highst score
370 |                 stack.extend(new_stack_elements)
371 |                 stack.sort()
372 |                 stack = stack[-max_stack_depth:]
373 |                 logger.debug(
374 |                     "added {} new stack elements, depth after trunc: {}".format(
375 |                         len(new_stack_elements), len(stack)
376 |                     )
377 |                 )
378 |     except CTParseTimeoutError:
379 |         logger.debug('Timeout on "{}"'.format(txt))
380 |         return
381 | 
382 | 
383 | # replace all comma, semicolon, whitespace, invisible control, opening and
384 | # closing brackets
385 | # _repl1 = regex.compile(r"[,;\pZ\pC\p{Ps}\p{Pe}]+", regex.VERSION1) # original regex
386 | _repl1 = regex.compile(r"[\pZ\pC]+", regex.VERSION1) # allow brackets
387 | _repl2 = regex.compile(r"(\p{Pd}|[\u2010-\u2015]|\u2043)+", regex.VERSION1)
388 | 
389 | 
390 | def _get_labels(txt: str) -> str:
391 |     labels = re.findall('#[a-zA-Z0-9\-_:/.]+', txt)
392 |     labels = [label.replace("#", "") for label in labels]
393 |     return labels
394 | 
395 | 
396 | def _preprocess_string(txt: str) -> str:
397 |     return cast(
398 |         str, _repl2.sub("-", _repl1.sub(" ", txt, concurrent=True).strip()).strip()
399 |     )
400 | 
401 | 
402 | def _match_rule(
403 |     seq: Sequence[Artifact], rule: Sequence[Callable[[Artifact], bool]]
404 | ) -> Iterator[Tuple[int, int]]:
405 |     if not seq:
406 |         return
407 |     if not rule:
408 |         return
409 |     i_r = 0
410 |     i_s = 0
411 |     r_len = len(rule)
412 |     s_len = len(seq)
413 |     while i_s < s_len:
414 |         if rule[0](seq[i_s]):
415 |             i_start = i_s + 1
416 |             i_r = 1
417 |             while i_start < s_len and i_r < r_len and rule[i_r](seq[i_start]):
418 |                 i_r += 1
419 |                 i_start += 1
420 |             if i_r == r_len:
421 |                 yield i_s, i_start
422 |         i_s += 1
423 | 
424 | 
425 | def _match_regex(txt: str, regexes: Dict[int, regex.Regex]) -> List[RegexMatch]:
426 |     # Match a collection of regexes in *txt*
427 |     #
428 |     # The returned RegexMatch objects are sorted by the start of the match
429 |     # :param txt: the text to match against
430 |     # :param regexes: a collection of regexes name->pattern
431 |     # :return: a list of RegexMatch objects ordered my RegexMatch.mstart
432 |     matches = {
433 |         RegexMatch(name, m)
434 |         for name, re in regexes.items()
435 |         for m in re.finditer(txt, overlapped=True, concurrent=True)
436 |     }
437 |     for m in matches:
438 |         logger.debug("regex: {}".format(m.__repr__()))
439 |     return sorted(matches, key=lambda x: (x.mstart, x.mend))
440 | 
441 | 
442 | def _regex_stack(
443 |     txt: str,
444 |     regex_matches: List[RegexMatch],
445 |     on_do_iter: Callable[[], None] = lambda: None,
446 | ) -> List[Tuple[RegexMatch, ...]]:
447 |     # Group contiguous RegexMatch objects together.
448 |     #
449 |     # Assumes that regex_matches are sorted by increasing start index. on_do_iter
450 |     # is a callback that will be invoked every time the algorithm performs a loop.
451 |     #
452 |     # Example:
453 |     # Say you have the following text, where the regex matches are the
454 |     # words between square brackets.
455 |     #
456 |     # [Tomorrow] I want to go to the movies between [2] [pm] and [5] [pm].
457 |     #
458 |     # This function will return the matches that are contiguous (excluding space
459 |     # characters)
460 |     # [Tomorrow]
461 |     # [2], [pm]
462 |     # [5], [pm]
463 |     #
464 |     # This also works with overlapping matches.
465 |     #
466 |     # Algo:
467 |     # * initialize an empty stack
468 |     #
469 |     # * add all sequences of one expression to the stack, excluding
470 |     #   expressions which can be reached from "earlier" expression
471 |     #   (i.e. there is no gap between them):
472 |     #
473 |     #   - say A and B have no gap in between and all sequences starting
474 |     #     at A have already been produced. These by definition(which?: -) include as
475 |     #     sub-sequences all sequences starting at B. Any other sequences starting
476 |     #     at B directly will not add valid variations, as each of them could be
477 |     #     prefixed with a sequence starting at A
478 |     #
479 |     # * while the stack is not empty:
480 |     #
481 |     #   * get top sequence s from stack
482 |     #
483 |     #   * generate all possible continuations for this sequence,
484 |     #     i.e. sequences where expression can be appended to the last
485 |     #     element s[-1] in s and put these extended sequences on the stack
486 |     #
487 |     #   * if no new continuation could be generated for s, this sequence of
488 |     #     RegexMatch is appended to the list of results.
489 | 
490 |     prods = []
491 |     n_rm = len(regex_matches)
492 |     # Calculate the upper triangle of an n_rm x n_rm matrix M where
493 |     # M[i, j] == 1 (for i<j) iff the expressions i and j are
494 |     # consecutive (i.e. there is no gap and they can be put together
495 |     # in one sequence).
496 | 
497 |     # import numpy as np
498 |     # M = np.zeros(shape=(n_rm, n_rm), dtype=int)
499 | 
500 |     # --> avoid use of numpy here; since we need column sums below,
501 |     # --> the representation of M is columns major, i.e. M[i] is the i-th
502 |     # --> column; M[i, j] then basically becomes M[j][i]
503 |     M = [[0 for _ in range(n_rm)] for _ in range(n_rm)]
504 | 
505 |     _separator_regex = regex.compile(r"\s*", regex.VERSION1)
506 | 
507 |     def get_m_dist(m1: RegexMatch, m2: RegexMatch) -> int:
508 |         # 1 if there is no relevant gap between m1 and m2, 0 otherwise
509 |         # assumes that m1 and m2 are sorted be their start index
510 |         if m2.mstart < m1.mend:
511 |             return 0  # Overlap
512 |         gap_match = _separator_regex.fullmatch(txt[m1.mend : m2.mstart])
513 |         if gap_match:
514 |             return 1  # No Gap
515 |         else:
516 |             return 0  # Gap
517 | 
518 |     for i in range(n_rm):
519 |         for j in range(i + 1, n_rm):
520 |             M[j][i] = get_m_dist(regex_matches[i], regex_matches[j])
521 | 
522 |     # NOTE(glanaro): I believe this means that this is a beginning node.
523 |     # why reversed?
524 |     stack = [
525 |         (i,) for i in reversed(range(n_rm)) if sum(M[i]) == 0
526 |     ]  # type: List[Tuple[int, ...]]
527 |     while stack:
528 |         on_do_iter()
529 |         s = stack.pop()
530 |         i = s[-1]
531 |         new_prod = False
532 |         for j in range(i + 1, n_rm):
533 |             if M[j][i] == 1:
534 |                 stack.append(s + (j,))
535 |                 new_prod = True
536 |         if not new_prod:
537 |             prod = tuple(regex_matches[i] for i in s)
538 |             logger.debug("regex stack {}".format(prod))
539 |             prods.append(prod)
540 |     return prods
541 | 


--------------------------------------------------------------------------------
/ctparse/loader.py:
--------------------------------------------------------------------------------
 1 | """Utility to load default model in ctparse"""
 2 | 
 3 | import bz2
 4 | import logging
 5 | import os
 6 | import pickle
 7 | from .scorer import Scorer, DummyScorer
 8 | from .nb_scorer import NaiveBayesScorer
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | # Location of the default model, included with ctparse
13 | DEFAULT_MODEL_FILE = os.path.join(os.path.dirname(__file__), "models", "model.pbz")
14 | 
15 | 
16 | def load_default_scorer() -> Scorer:
17 |     resource = 'model.pbz'
18 | 
19 |     path = os.path.join(os.path.dirname(__file__), resource)
20 | 
21 |     # logger.warning(path)
22 |     # debug
23 |     # logger.warning([x.name for x in pkgutil.walk_packages()])
24 | 
25 |     # for exec usage
26 |     if os.access(path, mode=os.F_OK):
27 |         # d = os.path.dirname(sys.modules[package].__file__)
28 |         # logger.warning(os.path.join(d, resource))
29 |         with bz2.open(path, 'rb') as f:
30 |             # logger.warning(str(f))
31 |             mdl = pickle.load(f)
32 |         return NaiveBayesScorer(mdl)
33 |     # for non-exec usage
34 |     elif os.path.exists(DEFAULT_MODEL_FILE):
35 |         logger.info("Loading model from {} for non-exec usage".format(DEFAULT_MODEL_FILE))
36 |         with bz2.open(DEFAULT_MODEL_FILE, "rb") as fd:
37 |             mdl = pickle.load(fd)
38 |         return NaiveBayesScorer(mdl)
39 |     else:
40 |         logger.warning("No model found, initializing empty scorer")
41 |         return DummyScorer()
42 | 


--------------------------------------------------------------------------------
/ctparse/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .dummy import model_package_init
2 | 


--------------------------------------------------------------------------------
/ctparse/models/dummy.py:
--------------------------------------------------------------------------------
1 | def model_package_init():
2 |     return 1 == 1
3 | 


--------------------------------------------------------------------------------
/ctparse/models/model.pbz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Acreom/quickadd/69543c79ad5db05a712abf223940fadf61740235/ctparse/models/model.pbz


--------------------------------------------------------------------------------
/ctparse/nb_estimator.py:
--------------------------------------------------------------------------------
  1 | from typing import Sequence, Dict, Tuple, List
  2 | from math import log, exp
  3 | 
  4 | 
  5 | def _log_sum_exp(x: Sequence[float]) -> float:
  6 |     max_value = max(x)
  7 |     sum_of_exp = sum(exp(x_i - max_value) for x_i in x)
  8 |     return max_value + log(sum_of_exp)
  9 | 
 10 | 
 11 | class MultinomialNaiveBayes:
 12 |     """Implements a multinomial naive Bayes classifier. For background information
 13 |     (and what has inspired this, see e.g. https://scikit-learn.org/stable/...
 14 |         ...modules/generated/sklearn.naive_bayes.MultinomialNB.html)
 15 |     """
 16 | 
 17 |     def __init__(self, alpha: float = 1.0):
 18 |         """Create new un-trained model
 19 | 
 20 |         Parameters
 21 |         ----------
 22 |         alpha : Optional[float]
 23 |             Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing),
 24 |             defaults to 1.0
 25 |         """
 26 |         self.alpha = alpha
 27 |         self.class_prior = (0.0, 0.0)
 28 |         self.log_likelihood: Dict[str, List[float]] = {}
 29 | 
 30 |     @staticmethod
 31 |     def _construct_log_class_prior(y: Sequence[int]) -> Tuple[float, float]:
 32 |         # Input classes are -1 and 1
 33 |         neg_class_count = sum(1 if y_i == -1 else 0 for y_i in y)
 34 |         pos_class_count = len(y) - neg_class_count
 35 | 
 36 |         neg_log_prior = log(neg_class_count / (pos_class_count + neg_class_count))
 37 |         pos_log_prior = log(pos_class_count / (pos_class_count + neg_class_count))
 38 |         return (neg_log_prior, pos_log_prior)
 39 | 
 40 |     @staticmethod
 41 |     def _construct_log_likelihood(
 42 |         X: Sequence[Dict[int, int]], y: Sequence[int], alpha: float
 43 |     ) -> Dict[str, List[float]]:
 44 |         # Token counts
 45 |         # implicit assumption from vectorizer: first element has count for #vocab
 46 |         # size set
 47 |         vocabulary_len = max(X[0].keys()) + 1
 48 |         token_counts_negative = [alpha] * vocabulary_len
 49 |         token_counts_positive = [alpha] * vocabulary_len
 50 |         for x, y_ in zip(X, y):
 51 |             for idx, cnt in x.items():
 52 |                 if y_ == 1:
 53 |                     token_counts_positive[idx] += cnt
 54 |                 else:
 55 |                     token_counts_negative[idx] += cnt
 56 | 
 57 |         token_pos_class_sum = sum(token_counts_positive)
 58 |         token_neg_class_sum = sum(token_counts_negative)
 59 | 
 60 |         log_likelihood_negative = []
 61 |         log_likelihood_positive = []
 62 |         for token_ind in range(vocabulary_len):
 63 |             log_likelihood_positive.append(
 64 |                 log(token_counts_positive[token_ind]) - log(token_pos_class_sum)
 65 |             )
 66 | 
 67 |             log_likelihood_negative.append(
 68 |                 log(token_counts_negative[token_ind]) - log(token_neg_class_sum)
 69 |             )
 70 |         return {
 71 |             "negative_class": log_likelihood_negative,
 72 |             "positive_class": log_likelihood_positive,
 73 |         }
 74 | 
 75 |     def fit(
 76 |         self, X: Sequence[Dict[int, int]], y: Sequence[int]
 77 |     ) -> "MultinomialNaiveBayes":
 78 |         """Fit a naive Bayes model from a count of feature matrix
 79 | 
 80 |         Parameters
 81 |         ----------
 82 |         X : Sequence[Dict[int, int]]
 83 |             Sequence of sparse {feature_index: count} dictionaries
 84 |         y : Sequence[int]
 85 |             Labels +1/-1
 86 | 
 87 |         Returns
 88 |         -------
 89 |         MultinomialNaiveBayes
 90 |             The fitted model
 91 |         """
 92 |         self.class_prior = self._construct_log_class_prior(y)
 93 |         self.log_likelihood = self._construct_log_likelihood(X, y, self.alpha)
 94 |         return self
 95 | 
 96 |     def predict_log_probability(
 97 |         self, X: Sequence[Dict[int, int]]
 98 |     ) -> Sequence[Tuple[float, float]]:
 99 |         """Calculate the posterior log probability of new sample X
100 | 
101 |         Parameters
102 |         ----------
103 |         X : Sequence[Dict[int, int]]
104 |             Sequence of data to predict on as sparse {feature_index: count} dictionarie
105 | 
106 |         Returns
107 |         -------
108 |         Sequence[Tuple[float, float]]
109 |             Tuple of (negative-class, positive-class) log likelihoods
110 |         """
111 |         scores = []
112 |         for x in X:
113 |             # Initialise the scores with priors of positive and negative class
114 |             neg_score = self.class_prior[0]
115 |             pos_score = self.class_prior[1]
116 |             for idx, cnt in x.items():
117 |                 pos_score += self.log_likelihood["positive_class"][idx] * cnt
118 |                 neg_score += self.log_likelihood["negative_class"][idx] * cnt
119 |             joint_log_likelihood = [neg_score, pos_score]
120 |             # Normalize the scores
121 |             log_prob_x = _log_sum_exp(joint_log_likelihood)
122 |             scores.append((neg_score - log_prob_x, pos_score - log_prob_x))
123 |         return scores
124 | 


--------------------------------------------------------------------------------
/ctparse/nb_scorer.py:
--------------------------------------------------------------------------------
 1 | """This module cointains the implementation of the scorer based on naive bayes."""
 2 | import bz2
 3 | import math
 4 | import pickle
 5 | from datetime import datetime
 6 | from typing import Sequence
 7 | 
 8 | from ctparse.nb_estimator import MultinomialNaiveBayes
 9 | from ctparse.count_vectorizer import CountVectorizer
10 | from ctparse.pipeline import CTParsePipeline
11 | from .scorer import Scorer
12 | from .partial_parse import PartialParse
13 | from .types import Artifact
14 | 
15 | 
16 | class NaiveBayesScorer(Scorer):
17 |     def __init__(self, nb_model: CTParsePipeline) -> None:
18 |         """Scorer based on a naive bayes estimator.
19 | 
20 |         This scorer models the probability of having a correct parse, conditioned
21 |         on the sequence of rules (expressed as a categorical feature) that led to
22 |         that parse.
23 | 
24 |         The score is also modified by a "length" factor that penalizes parses that
25 |         cover a smaller part of the text string.
26 | 
27 |         :param nb_model:
28 |             A scikit-learn style Estimator that was trained on a corpus that takes
29 |             a Sequence[Sequence[str]] as X (each entry is a sequence of rule
30 |             identifiers) and a Sequence[int] in the set {-1, 1} that indicates if
31 |             the parse was correct or incorrect.
32 |         """
33 |         self._model = nb_model
34 | 
35 |     @classmethod
36 |     def from_model_file(cls, fname: str) -> "NaiveBayesScorer":
37 |         with bz2.open(fname, "rb") as fd:
38 |             return cls(pickle.load(fd))
39 | 
40 |     def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float:
41 |         # Penalty for partial matches
42 |         max_covered_chars = partial_parse.prod[-1].mend - partial_parse.prod[0].mstart
43 |         len_score = math.log(max_covered_chars / len(txt))
44 | 
45 |         X = _feature_extractor(txt, ts, partial_parse)
46 |         pred = self._model.predict_log_proba([X])
47 | 
48 |         # NOTE: the prediction is log-odds, or logit
49 |         model_score = pred[0][1] - pred[0][0]
50 | 
51 |         return model_score + len_score
52 | 
53 |     def score_final(
54 |         self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact
55 |     ) -> float:
56 |         # The difference between the original score and final score is that in the
57 |         # final score, the len_score is calculated based on the length of the final
58 |         # production
59 |         len_score = math.log(len(prod) / len(txt))
60 | 
61 |         X = _feature_extractor(txt, ts, partial_parse)
62 |         pred = self._model.predict_log_proba([X])
63 | 
64 |         # NOTE: the prediction is log-odds, or logit
65 |         model_score = pred[0][1] - pred[0][0]
66 | 
67 |         # We want the len_score to always take precedence. I believe a logit won't go up
68 |         # more than 1000. A better way would be to return an ordering tuple instead,
69 |         # but then we would need to change many interfaces.
70 |         return model_score + 1000 * len_score
71 | 
72 | 
73 | def _feature_extractor(
74 |     txt: str, ts: datetime, partial_parse: PartialParse
75 | ) -> Sequence[str]:
76 |     return [str(r) for r in partial_parse.rules]
77 | 
78 | 
79 | def train_naive_bayes(X: Sequence[Sequence[str]], y: Sequence[bool]) -> CTParsePipeline:
80 |     """Train a naive bayes model for NaiveBayesScorer"""
81 |     y_binary = [1 if y_i else -1 for y_i in y]
82 |     # Create and train the pipeline
83 |     pipeline = CTParsePipeline(
84 |         CountVectorizer(ngram_range=(1, 3)), MultinomialNaiveBayes(alpha=1.0)
85 |     )
86 |     model = pipeline.fit(X, y_binary)
87 |     return model
88 | 
89 | 
90 | def save_naive_bayes(model: CTParsePipeline, fname: str) -> None:
91 |     """Save a naive bayes model for NaiveBayesScorer"""
92 |     # TODO: version this model and dump metadata with lots of information
93 |     with bz2.open(fname, "wb") as fd:
94 |         pickle.dump(model, fd)
95 | 


--------------------------------------------------------------------------------
/ctparse/partial_parse.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from datetime import datetime
  3 | from typing import (
  4 |     Callable,
  5 |     Optional,
  6 |     Sequence,
  7 |     Tuple,
  8 |     TypeVar,
  9 |     Union,
 10 |     Dict,
 11 |     List,
 12 |     Generator,
 13 | )
 14 | 
 15 | from .rule import rules as global_rules, ProductionRule, Predicate
 16 | from .timers import timeit
 17 | from .types import Artifact, RegexMatch
 18 | 
 19 | logger = logging.getLogger(__name__)
 20 | 
 21 | T = TypeVar("T")
 22 | 
 23 | 
 24 | class PartialParse:
 25 |     def __init__(
 26 |         self, prod: Tuple[Artifact, ...], rules: Tuple[Union[int, str], ...]
 27 |     ) -> None:
 28 |         """A data structure representing a partial parse.
 29 | 
 30 | 
 31 |         * prod: the current partial production
 32 |         * rules: the sequence of regular expressions and rules used/applied to produce
 33 |                  prod
 34 |         * score: the score assigned to this production
 35 |         """
 36 |         if len(prod) < 1:
 37 |             raise ValueError("prod should have at least one element")
 38 | 
 39 |         self.prod = prod
 40 |         self.rules = rules
 41 |         self.applicable_rules = global_rules
 42 |         self.max_covered_chars = self.prod[-1].mend - self.prod[0].mstart
 43 |         self.score = 0.0
 44 | 
 45 |     @classmethod
 46 |     def from_regex_matches(
 47 |         cls, regex_matches: Tuple[RegexMatch, ...]
 48 |     ) -> "PartialParse":
 49 |         """Create partial production from a series of RegexMatch
 50 | 
 51 |         This usually is called when no production rules (with the exception of
 52 |         regex matches) have been applied.
 53 | 
 54 |         """
 55 |         se = cls(prod=regex_matches, rules=tuple(r.id for r in regex_matches))
 56 | 
 57 |         logger.debug("=" * 80)
 58 |         logger.debug("-> checking rule applicability")
 59 |         # Reducing rules to only those applicable has no effect for
 60 |         # small stacks, but on larger there is a 10-20% speed
 61 |         # improvement
 62 |         se.applicable_rules, _ts = timeit(se._filter_rules)(global_rules)
 63 |         logger.debug(
 64 |             "of {} total rules {} are applicable in {}".format(
 65 |                 len(global_rules), len(se.applicable_rules), se.prod
 66 |             )
 67 |         )
 68 |         logger.debug("time in _filter_rules: {:.0f}ms".format(1000 * _ts))
 69 |         logger.debug("=" * 80)
 70 | 
 71 |         return se
 72 | 
 73 |     def apply_rule(
 74 |         self,
 75 |         ts: datetime,
 76 |         pm_bias: bool,
 77 |         date_format: str,
 78 |         rule: ProductionRule,
 79 |         rule_name: Union[str, int],
 80 |         match: Tuple[int, int],
 81 |     ) -> Optional["PartialParse"]:
 82 |         """Check whether the production in rule can be applied to this stack
 83 |         element.
 84 | 
 85 |         If yes, return a copy where this update is
 86 |         incorporated in the production, the record of applied rules
 87 |         and the score.
 88 | 
 89 |         :param ts: reference time
 90 |         :param pm_bias: bias option bool
 91 |         :param date_format: us / eu date format
 92 |         :param rule: a tuple where the first element is the production rule to apply
 93 |         :param rule_name: the name of the rule
 94 |         :param match: the start and end index of the parameters that the rule needs.
 95 |         """
 96 |         prod = rule(ts, pm_bias, date_format, *self.prod[match[0]: match[1]])
 97 | 
 98 |         if prod is not None:
 99 |             pp = PartialParse(
100 |                 prod=self.prod[: match[0]] + (prod,) + self.prod[match[1] :],
101 |                 rules=self.rules + (rule_name,),
102 |             )
103 | 
104 |             pp.applicable_rules = self.applicable_rules
105 |             return pp
106 |         else:
107 |             return None
108 | 
109 |     def __lt__(self, other: "PartialParse") -> bool:
110 |         """Sort stack elements by (a) the length of text they can
111 |         (potentially) cover and (b) the score assigned to the
112 |         production.
113 | 
114 |         a < b <=> a.max_covered_chars < b.max_covered_chars or
115 |                   (a.max_covered_chars <= b.max_covered_chars and a.score < b.score)
116 |         """
117 |         return (self.max_covered_chars < other.max_covered_chars) or (
118 |             self.max_covered_chars == other.max_covered_chars
119 |             and self.score < other.score
120 |         )
121 | 
122 |     def __repr__(self) -> str:
123 |         return "PartialParse(prod={}, rules={}, score={})".format(
124 |             repr(self.prod), repr(self.rules), repr(self.score)
125 |         )
126 | 
127 |     def _filter_rules(
128 |         self, rules: Dict[str, Tuple[ProductionRule, List[Predicate]]]
129 |     ) -> Dict[str, Tuple[ProductionRule, List[Predicate]]]:
130 |         # find all rules that can be applied to the current prod sequence
131 |         def _hasNext(it: Generator[List[int], None, None]) -> bool:
132 |             try:
133 |                 next(it)
134 |                 return True
135 |             except StopIteration:
136 |                 return False
137 | 
138 |         return {
139 |             rule_name: r
140 |             for rule_name, r in rules.items()
141 |             if _hasNext(_seq_match(self.prod, r[1]))
142 |         }
143 | 
144 | 
145 | def _seq_match(
146 |     seq: Sequence[T], pat: Sequence[Callable[[T], bool]], offset: int = 0
147 | ) -> Generator[List[int], None, None]:
148 |     # :param seq: a list of intermediate productions, either of type
149 |     # RegexMatch or some other Artifact
150 |     #
151 |     # :param pat: a list of rule patterns to be matched, i.e. either a
152 |     # RegexMatch or a callable
153 |     #
154 |     # Determine whether the pattern pat matches the sequence seq and
155 |     # return a list of lists, where each sub-list contains those
156 |     # indices where the RegexMatch objects in pat are located in seq.
157 |     #
158 |     # A pattern pat only matches seq, iff each RegexMatch in pat is in
159 |     # seq in the same order and iff between two RegexMatches aligned
160 |     # to seq there is at least one additional element in seq. Reason:
161 |     #
162 |     # * Rule patterns never have two consequitive RegexMatch objects.
163 |     #
164 |     # * Hence there must be some predicate/dimension between two
165 |     # * RegexMatch objects.
166 |     #
167 |     # * For the whole pat to match there must then be at least one
168 |     #  element in seq that can product this intermediate bit
169 |     #
170 |     # If pat does not start with a RegexMatch then there must be at
171 |     # least one element in seq before the first RegexMatch in pat that
172 |     # is alignes on seq. Likewise, if pat does not end with a
173 |     # RegexMatch, then there must be at least one additional element
174 |     # in seq to match the last non-RegexMatch element in pat.
175 |     #
176 |     # STRONG ASSUMPTIONS ON ARGUMENTS: seq and pat do not contain
177 |     # consequiteve elements which are both of type RegexMatch! Callers
178 |     # obligation to ensure this!
179 | 
180 |     if not pat:
181 |         # if pat is empty yield the empty match
182 |         yield []
183 |     elif not seq or not pat:
184 |         # if either seq or pat is empty there will be no match
185 |         return
186 |     elif pat[-1].__name__ != "_regex_match":
187 |         # there must be at least one additional element in seq at the
188 |         # end
189 |         yield from _seq_match(seq[:-1], pat[:-1], offset)
190 |     elif len(pat) > len(seq):
191 |         # if pat is longer than seq it cannot match
192 |         return
193 |     else:
194 |         p1 = pat[0]
195 |         # if p1 is not a RegexMatch, then continue on next pat and
196 |         # advance sequence by one
197 |         if p1.__name__ != "_regex_match":
198 |             yield from _seq_match(seq[1:], pat[1:], offset + 1)
199 |         else:
200 |             # Get number of RegexMatch in p
201 |             n_regex = sum(1 for p in pat if p.__name__ == "_regex_match")
202 |             # For each occurance of RegexMatch pat[0] in seq
203 |             for iseq, s in enumerate(seq):
204 |                 # apply _regex_match check
205 |                 if p1(s):
206 |                     # for each match of pat[1:] in seq[iseq+1:], yield a result
207 |                     for subm in _seq_match(seq[iseq + 1 :], pat[1:], offset + iseq + 1):
208 |                         if len(subm) == n_regex - 1:
209 |                             # only yield if all subsequent RegexMatch
210 |                             # have been aligned!
211 |                             yield [iseq + offset] + subm
212 | 


--------------------------------------------------------------------------------
/ctparse/pipeline.py:
--------------------------------------------------------------------------------
 1 | from typing import Sequence, Tuple
 2 | 
 3 | from .nb_estimator import MultinomialNaiveBayes
 4 | from .count_vectorizer import CountVectorizer
 5 | 
 6 | 
 7 | class CTParsePipeline:
 8 |     def __init__(self, transformer: CountVectorizer, estimator: MultinomialNaiveBayes):
 9 |         """Setup a pipeline of feature extraction and naive bayes. Overkill for what it
10 |         does but leaves room to use different models/features in the future
11 | 
12 |         Parameters
13 |         ----------
14 |         transformer : CountVectorizer
15 |             feature extraction step
16 |         estimator : MultinomialNaiveBayes
17 |             naive bayes model
18 |         """
19 |         self.transformer = transformer
20 |         self.estimator = estimator
21 | 
22 |     def fit(self, X: Sequence[Sequence[str]], y: Sequence[int]) -> "CTParsePipeline":
23 |         """Fit the transformer and then fit the Naive Bayes model on the transformed
24 |         data
25 | 
26 |         Returns
27 |         -------
28 |         CTParsePipeline
29 |             Returns the fitted pipeline
30 |         """
31 |         X_transformed = self.transformer.fit_transform(X)
32 |         self.estimator = self.estimator.fit(X_transformed, y)
33 |         return self
34 | 
35 |     def predict_log_proba(
36 |         self, X: Sequence[Sequence[str]]
37 |     ) -> Sequence[Tuple[float, float]]:
38 |         """Apply the transforms and get probability predictions from the estimator
39 | 
40 |         Parameters
41 |         ----------
42 |         X : Sequence[Sequence[str]]
43 |             Sequence of documents, each as sequence of tokens. In ctparse case there are
44 |             just the names of the regex matches and rules applied
45 | 
46 |         Returns
47 |         -------
48 |         Sequence[Tuple[float, float]]
49 |             For each document the tuple of negative/positive log probability from the
50 |             naive bayes model
51 |         """
52 |         X_transformed = self.transformer.transform(X)
53 |         return self.estimator.predict_log_probability(X_transformed)
54 | 


--------------------------------------------------------------------------------
/ctparse/py.typed:
--------------------------------------------------------------------------------
1 | # Marker file for PEP 561.


--------------------------------------------------------------------------------
/ctparse/rule.py:
--------------------------------------------------------------------------------
  1 | # flake8: noqa F405
  2 | import logging
  3 | 
  4 | from datetime import datetime
  5 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union, Type
  6 | 
  7 | import regex
  8 | 
  9 | from .types import Artifact, RegexMatch
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | # A predicate is a callable that returns True if the predicate
 15 | # applies to the artifact
 16 | Predicate = Callable[[Artifact], bool]
 17 | 
 18 | # ProductionRule is a function used to generate an artifact given other
 19 | # artifacts.
 20 | ProductionRule = Callable[..., Optional[Artifact]]
 21 | 
 22 | 
 23 | rules = {}  # type: Dict[str, Tuple[ProductionRule, List[Predicate]]]
 24 | 
 25 | _regex_cnt = 150  # leave this much space for ids of production types
 26 | _regex = {}  # compiled regex
 27 | _regex_str = {}  # map regex id to original string
 28 | _str_regex = {}  # type: Dict[str, int] # map regex raw str to regex id
 29 | eu_regex = {}
 30 | us_regex = {}
 31 | 
 32 | _regex_hour = r"(?:[01]?\d)|(?:2[0-3])"
 33 | _regex_minute = r"[0-5]\d"
 34 | _regex_day = r"[012]?[1-9]|10|20|30|31"
 35 | _regex_month = r"10|11|12|0?[1-9]"
 36 | _regex_year = r"(?:19\d\d)|(?:20[0-2]\d)|(?:\d\d)"
 37 | 
 38 | # used in many places in rules
 39 | _regex_to_join = (
 40 |     r"(\-|to( the)?|(un)?til|bis( zum)?|zum|auf( den)?|und|"
 41 |     "no later than|spätestens?|at latest( at)?|and)"
 42 | )
 43 | 
 44 | _defines = (
 45 |     r"(?(DEFINE)(?<_hour>{regex_hour})(?P<_minute>{regex_minute})"
 46 |     "(?P<_day>{regex_day})(?P<_month>{regex_month})"
 47 |     "(?P<_year>{regex_year}))"
 48 | ).format(
 49 |     regex_hour=_regex_hour,
 50 |     regex_minute=_regex_minute,
 51 |     regex_day=_regex_day,
 52 |     regex_month=_regex_month,
 53 |     regex_year=_regex_year,
 54 | )
 55 | 
 56 | 
 57 | def rule(*patterns: Union[str, Predicate], **kwargs) -> Callable[[Any], ProductionRule]:
 58 |     def _map(p: Union[str, Predicate]) -> Predicate:
 59 |         if isinstance(p, str):
 60 |             # its a regex
 61 |             global _regex_cnt
 62 |             if p in _str_regex:
 63 |                 # have seen this regex before - recycle
 64 |                 return regex_match(_str_regex[p])
 65 |             # test the regex first
 66 |             re = r"{defines}(?i)(?P<R{re_key}>{re})".format(
 67 |                 defines=_defines, re=p, re_key=_regex_cnt
 68 |             )
 69 |             new_rr = regex.compile(
 70 |                 # Removed the separator here - leads to more matches,
 71 |                 # as now each rule can also match if it is not followed
 72 |                 # or preceeded by a separator character
 73 |                 # r'(?i)(?:{sep})(?P<{re_key}>{re})(?:{sep})'.format(
 74 |                 re,
 75 |                 regex.VERSION1,
 76 |             )
 77 |             if new_rr.match(""):
 78 |                 raise ValueError("expression {} matches empty strings".format(p))
 79 | 
 80 |             if kwargs and "date_format" in kwargs:
 81 |                 if kwargs['date_format'] == 'US':
 82 |                     us_regex[_regex_cnt] = new_rr
 83 |                 else:
 84 |                     eu_regex[_regex_cnt] = new_rr
 85 |             else:
 86 |                 _regex[_regex_cnt] = new_rr
 87 |             _regex_str[_regex_cnt] = p
 88 |             _str_regex[p] = _regex_cnt
 89 |             _regex_cnt += 1
 90 |             return regex_match(_regex_cnt - 1)
 91 |         else:
 92 |             return p
 93 | 
 94 |     # check that in rules we never have a regex followed by a regex -
 95 |     # that must be merged into one regex
 96 |     def _has_consequtive_regex(
 97 |         ps: Tuple[Union[str, Callable[[Artifact], bool]], ...]
 98 |     ) -> bool:
 99 |         for p0, p1 in zip(ps[:-1], ps[1:]):
100 |             if isinstance(p0, str) and isinstance(p1, str):
101 |                 return True
102 |         return False
103 | 
104 |     if _has_consequtive_regex(patterns):
105 |         raise ValueError("rule which contains consequtive regular expressions found")
106 | 
107 |     mapped_patterns = [_map(p) for p in patterns]
108 | 
109 |     def fwrapper(f: ProductionRule) -> ProductionRule:
110 |         def wrapper(ts: datetime, *args: Artifact) -> Optional[Artifact]:
111 |             res = f(ts, *args)
112 |             if res is not None:
113 |                 # upon a successful production, update the span
114 |                 # information by expanding it to that of all args
115 |                 res.update_span(*args)
116 |             return res
117 | 
118 |         rules[f.__name__] = (wrapper, mapped_patterns)
119 |         return wrapper
120 | 
121 |     return fwrapper
122 | 
123 | 
124 | def regex_match(r_id: int) -> Predicate:
125 |     def _regex_match(r: Artifact) -> bool:
126 |         return type(r) == RegexMatch and r.id == r_id  # type: ignore
127 | 
128 |     return _regex_match
129 | 
130 | 
131 | def dimension(dim: Type[Artifact]) -> Predicate:
132 |     def _dimension(d: Artifact) -> bool:
133 |         return isinstance(d, dim)
134 | 
135 |     return _dimension
136 | 
137 | 
138 | def predicate(pred: str) -> Predicate:
139 |     def _predicate(d: Artifact) -> Any:
140 |         return getattr(d, pred, False)
141 | 
142 |     return _predicate
143 | 
144 | 
145 | from .time.rules import *  # noqa
146 | 


--------------------------------------------------------------------------------
/ctparse/scorer.py:
--------------------------------------------------------------------------------
 1 | """This module contains the Scorer abstraction that can be used to
 2 | implement scoring strategies for ctparse.
 3 | """
 4 | 
 5 | from abc import ABCMeta, abstractmethod
 6 | from datetime import datetime
 7 | from random import Random
 8 | from typing import Optional
 9 | 
10 | from .partial_parse import PartialParse
11 | from .types import Artifact
12 | 
13 | 
14 | class Scorer(metaclass=ABCMeta):
15 |     """Interface for scoring parses generated by ctparse"""
16 | 
17 |     @abstractmethod
18 |     def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float:
19 |         """Produce a score for a partial production.
20 | 
21 |         :param txt:  the text that is being parsed
22 |         :param ts: the reference time
23 |         :param partial_parse: the partial parse that needs to be scored
24 |         """
25 | 
26 |     @abstractmethod
27 |     def score_final(
28 |         self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact
29 |     ) -> float:
30 |         """Produce the final score for a production.
31 | 
32 |         :param txt: the text that is being parsed
33 |         :param ts: the reference time
34 |         :param partial_parse: the PartialParse object that generated the production
35 |         :param prod: the production
36 |         """
37 | 
38 | 
39 | class DummyScorer(Scorer):
40 |     """A scorer that always return a 0.0 score."""
41 | 
42 |     def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float:
43 |         return 0.0
44 | 
45 |     def score_final(
46 |         self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact
47 |     ) -> float:
48 |         return 0.0
49 | 
50 | 
51 | class RandomScorer(Scorer):
52 |     def __init__(self, rng: Optional[Random] = None) -> None:
53 |         """A score that returns a random number between 0 and 1.
54 | 
55 |         :param rng:
56 |             the random number generator to use
57 |         """
58 |         self.rng = rng if rng is not None else Random()
59 | 
60 |     def score(self, txt: str, ts: datetime, partial_parse: PartialParse) -> float:
61 |         return self.rng.random()
62 | 
63 |     def score_final(
64 |         self, txt: str, ts: datetime, partial_parse: PartialParse, prod: Artifact
65 |     ) -> float:
66 |         return self.rng.random()
67 | 


--------------------------------------------------------------------------------
/ctparse/time/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Acreom/quickadd/69543c79ad5db05a712abf223940fadf61740235/ctparse/time/__init__.py


--------------------------------------------------------------------------------
/ctparse/time/corpus.py:
--------------------------------------------------------------------------------
  1 | corpus = [
  2 |     # ruleYear
  3 |     ("Time[]{2019-X-X X:X (X/X)}", "2018-03-07T12:43", ["2019"]),
  4 |     # ruleToday
  5 |     (
  6 |         "Time[]{2018-03-07 X:X (X/X)}",
  7 |         "2018-03-07T12:43",
  8 |         ["heute", "zu dieser zeit", "today"],
  9 |     ),
 10 |     # ruleNow
 11 |     (
 12 |         "Time[]{2018-03-07 12:43 (X/X)}",
 13 |         "2018-03-07T12:43",
 14 |         ["jetzt", "genau jetzt", "gerade eben", "rightnow", "just now"],
 15 |     ),
 16 |     # ruleTomorrow
 17 |     ("Time[]{2019-01-01 X:X (X/X)}", "2018-12-31T12:43", ["morgen", "tomorrow", "tom", "tmrw"]),
 18 |     # ruleAfterTomorrow
 19 |     ("Time[]{2019-01-02 X:X (X/X)}", "2018-12-31T12:43", ["übermorgen"]),
 20 |     # ruleTomorrow + time
 21 |     (
 22 |         "Time[]{2019-01-01 19:25 (X/X)}",
 23 |         "2018-12-31T12:43",
 24 |         ["morgen 19:25", "tomorrow 7:25 pm"],
 25 |     ),
 26 |     # ruleYesterday
 27 |     # test on a leap-year
 28 |     ("Time[]{2020-02-29 X:X (X/X)}", "2020-03-01T12:43", ["gestern", "yesterday"]),
 29 |     # ruleBeforeYesterday
 30 |     # test on a leap-year
 31 |     ("Time[]{2020-02-28 X:X (X/X)}", "2020-03-01T12:43", ["vorgestern"]),
 32 |     # ruleEOM
 33 |     (
 34 |         "Time[]{2018-03-31 X:X (X/X)}",
 35 |         "2018-03-07T12:43",
 36 |         ["ende des Monats", "eom", "end of the month"],
 37 |     ),
 38 |     # ruleEOY
 39 |     (
 40 |         "Time[]{2018-12-31 X:X (X/X)}",
 41 |         "2018-03-07T12:43",
 42 |         ["ende des Jahres", "eoy", "end of the year"],
 43 |     ),
 44 |     # ruleNamedDOW
 45 |     ("Time[]{2018-03-12 X:X (X/X)}", "2018-03-07T12:43", ["Montag", "mon", "monday"]),
 46 |     (
 47 |         "Time[]{2018-03-13 X:X (X/X)}",
 48 |         "2018-03-07T12:43",
 49 |         ["Dienstag", "tuesday", "tue"],
 50 |     ),
 51 |     # ruleNamedDOW + POD
 52 |     # ("Time[]{2018-03-12 X:X (X/morning)}", "2018-03-07T12:43", ["Montagmorgen"]),
 53 |     # ("Time[]{2018-03-14 X:X (X/forenoon)}", "2018-03-07T12:43", ["Mittwochvormittag"]),
 54 |     # ("Time[]{2018-03-10 X:X (X/morning)}", "2018-03-07T12:43", ["Samstagfrüh"]),
 55 |     # (
 56 |     #     "Time[]{2018-03-11 X:X (X/night)}",
 57 |     #     "2018-03-07T12:43",
 58 |     #     ["sunday night", "Sonntagnacht"],
 59 |     # ),
 60 |     # ruleNamedMonth
 61 |     ("Time[]{2023-01-01 X:X (X/X)}", "2022-11-28T12:43", ["1st January", "1st jan."]),
 62 |     ("Time[]{2023-04-15 X:X (X/X)}", "2022-11-28T12:43", ["15 April", "15 apr."]),
 63 |     ("Time[]{X-07-X X:X (X/X)}", "2022-11-28T12:43", ["Juli", "July", "Jul"]),
 64 |     (
 65 |         "Time[]{2022-12-24 X:X (X/X)}",
 66 |         "2022-11-28T12:43",
 67 |         ["24 Dezember", "December 24", "24 Dec.", "24 Dez."],
 68 |     ),
 69 |     # ruleAtDOW
 70 |     ("Time[]{2018-03-13 X:X (X/X)}", "2018-03-07T12:43", ["am Dienstag", "on Tue"]),
 71 |     (
 72 |         "Time[]{2018-03-14 X:X (X/X)}",
 73 |         "2018-03-07T12:43",
 74 |         ["this Wednesday", "diesen Mittwoch"],
 75 |     ),
 76 |     # ruleNextDOW
 77 |     (
 78 |         "Time[]{2018-03-16 X:X (X/X)}",
 79 |         "2018-03-07T12:43",
 80 |         [
 81 |             "am nächsten Freitag",
 82 |             "next Friday",
 83 |             "nächste Woche Freitag",
 84 |             "Friday next week",
 85 |             "on the following Friday",
 86 |         ],
 87 |     ),
 88 |     # ruleDOYYear, ruleDDMM, ruleDDMMYYYY
 89 |     (
 90 |         "Time[]{2018-05-08 X:X (X/X)}",
 91 |         "2018-03-07T12:43",
 92 |         [
 93 |             "8.5.2018",
 94 |             "8. Mai 2018",
 95 |             "8. Mai 18",
 96 |             "8 May 2018",
 97 |             "8 May",
 98 |             "May 8",
 99 |             "8/5",
100 |             "8.5.",
101 |             "am 8. Mai 2018",
102 |             "diesen 8. Mai 18",
103 |             "den 8.5.",
104 |             "8th May",
105 |             "8th of May",
106 |             "May 8th",
107 |             "at 8th May",
108 |             "on 8th of May",
109 |             "this May 8th",
110 |             "may 8",
111 |         ],
112 |     ),
113 |     (
114 |         "Time[]{2022-12-24 X:X (X/X)}",
115 |         "2022-11-29T12:43",
116 |         [
117 |             "12/24",
118 |             "12/24",
119 |             "12/24",
120 |             "12/24",
121 |             "12/24",
122 |             "12/24",
123 |             "12/24",
124 |             "12/24",
125 |             "12/24",
126 |             "12/24",
127 |             "12/24",
128 |             "12/24",
129 |             "12/24",
130 |             "12/24",
131 |             "12/24",
132 |             "12/24",
133 |             "12/24",
134 |             "12/24",
135 |             "12/24",
136 |             "12/24",
137 |             "12/24",
138 |             "12/24",
139 |             "12/24",
140 |             "24/12",
141 |             "24/12",
142 |             "24/12",
143 |         ],
144 |     ),
145 |     # ruleDOWDOM
146 |     (
147 |         "Time[]{2022-11-29 X:X (X/X)}",
148 |         "2022-11-28T12:43",
149 |         ["Tuesday 29th", "Tuesday the 29th", "Dienstag der 29."],
150 |     ),
151 |     (
152 |         "Time[]{2018-06-02 X:X (X/X)}",
153 |         "2018-03-07T12:43",
154 |         ["Saturday 2nd", "Jun 2nd", "am 2ten Juni"],
155 |     ),
156 |     # ruleDOWDate, ruleDateDOW
157 |     (
158 |         "Time[]{2018-05-08 X:X (X/X)}",
159 |         "2018-03-07T12:43",
160 |         ["Tuesday 8.5", "8.5 Tuesday"],
161 |     ),
162 |     # (
163 |     #     "Time[]{2018-05-08 X:X (X/morning)}",
164 |     #     "2018-03-07T12:43",
165 |     #     ["Dienstagmorgen 8.5.", "8.5. Dienstagmorgen"],
166 |     # ),
167 |     # rulePOD, ruleLatentPOD
168 |     # (
169 |     #     "Time[]{2018-03-08 X:X (X/morning)}",
170 |     #     "2018-03-07T12:43",
171 |     #     ["morgens", "früh", "in der früh", "early", "morning"],
172 |     # ),
173 |     # (
174 |     #     "Time[]{2018-03-08 X:X (X/earlymorning)}",
175 |     #     "2018-03-07T12:43",
176 |     #     ["früh morgens", "sehr früh", "early morning"],
177 |     # ),
178 |     # (
179 |     #     "Time[]{2018-03-08 X:X (X/forenoon)}",
180 |     #     "2018-03-07T12:43",
181 |     #     ["vormittags", "forenoon"],
182 |     # ),
183 |     # before noon case
184 |     # (
185 |     #     "Interval[]{None - 2018-03-08 X:X (X/noon)}",
186 |     #     "2018-03-07T12:43",
187 |     #     ["vor mittags", "before noon"],
188 |     # ),
189 |     # (
190 |     #     "Time[]{2018-03-08 X:X (X/afternoon)}",
191 |     #     "2018-03-07T12:43",
192 |     #     ["nachmittag", "afternoon"],
193 |     # ),
194 |     # # past noon case
195 |     # (
196 |     #     "Interval[]{2018-03-08 X:X (X/noon) - None}",
197 |     #     "2018-03-07T12:43",
198 |     #     ["nach mittag", "after noon"],
199 |     # ),
200 |     # ("Time[]{2018-03-08 X:X (X/noon)}", "2018-03-07T12:43", ["mittags", "noon"]),
201 |     # (
202 |     #     "Time[]{2018-03-07 X:X (X/evening)}",
203 |     #     "2018-03-07T12:43",
204 |     #     ["abends", "late", "spät"],
205 |     # ),
206 |     # (
207 |     #     "Time[]{2018-03-07 X:X (X/lateevening)}",
208 |     #     "2018-03-07T12:43",
209 |     #     ["später abend", "very late", "late evening"],
210 |     # ),
211 |     # (
212 |     #     "Time[]{2018-03-08 X:X (X/veryearlyafternoon)}",
213 |     #     "2018-03-07T12:43",
214 |     #     ["sehr früher nachmittag", "very early afternoon"],
215 |     # ),
216 |     # (
217 |     #     "Time[]{2018-03-07 X:X (X/night)}",
218 |     #     "2018-03-07T12:43",
219 |     #     ["heute nacht", "this night", "nachts"],
220 |     # ),
221 |     # # First/Last
222 |     # (
223 |     #     "Time[]{2018-03-08 X:X (X/first)}",
224 |     #     "2018-03-07T12:43",
225 |     #     [
226 |     #         "tomorrow first",
227 |     #         "morgen erster",
228 |     #         "morgen so früh wie möglich",
229 |     #         "tomorrow earliest possible",
230 |     #     ],
231 |     # ),
232 |     # (
233 |     #     "Time[]{2018-03-08 X:X (X/last)}",
234 |     #     "2018-03-07T12:43",
235 |     #     [
236 |     #         "tomorrow last",
237 |     #         "morgen letzter",
238 |     #         "tomorrow as late as possible",
239 |     #         "morgen spätest möglicher",
240 |     #     ],
241 |     # ),
242 |     # (
243 |     #     "Time[]{2018-03-09 X:X (X/first)}",
244 |     #     "2018-03-07T12:43",
245 |     #     ["Friday first", "Freitag erster"],
246 |     # ),
247 |     # (
248 |     #     "Time[]{2018-03-13 X:X (X/last)}",
249 |     #     "2018-03-07T12:43",
250 |     #     ["Tuesday last", "Dienstag letzter"],
251 |     # ),
252 |     # # Date + POD
253 |     # (
254 |     #     "Time[]{2017-01-25 X:X (X/evening)}",
255 |     #     "2018-03-07T12:43",
256 |     #     [
257 |     #         "25.01.2017 abends",
258 |     #         "evening of January 25th 2017",
259 |     #         "25.01.2017 late",
260 |     #         "25.01.2017 spät",
261 |     #         "25.01.2017 (spät)",
262 |     #     ],
263 |     # ),
264 |     # (
265 |     #     "Time[]{2017-01-25 X:X (X/lateafternoon)}",
266 |     #     "2018-03-07T12:43",
267 |     #     [
268 |     #         "25.01.2017 spät nachmittags",
269 |     #         "am 25. Januar 2017 am späten Nachmittag",
270 |     #         "am 25. Januar 2017 am späten Nachmittag",
271 |     #         "am 25. Januar 2017 am späten Nachmittag",
272 |     #         "late afternoon of January 25th 2017",
273 |     #     ],
274 |     # ),
275 |     # (
276 |     #     "Time[]{2020-01-25 X:X (X/evening)}",
277 |     #     "2018-03-07T12:43",
278 |     #     [
279 |     #         "25.01.2020 abends",
280 |     #         "25.01.2020 late",
281 |     #         "25.01.2020 spät",
282 |     #         "25. Januar 2020 abends",
283 |     #         "abends 25.01.2020",
284 |     #         "evening of January 25th 2020",
285 |     #     ],
286 |     # ),
287 |     # (
288 |     #     "Time[]{2018-03-25 X:X (X/evening)}",
289 |     #     "2018-03-07T12:43",
290 |     #     ["evening of the 25th", "am 25. abends", "abends am 25."],
291 |     # ),
292 |     # # ruleTODPOD
293 |     # (
294 |     #     "Time[]{X-X-X 16:30 (X/X)}",
295 |     #     "2018-03-07T12:43",
296 |     #     ["um 4:30 nachmittags", "at 4:30 in the afternoon"],
297 |     # ),
298 |     # rulePODTOD
299 |     # (
300 |     #     "Time[]{X-X-X 20:00 (X/X)}",  # next day since moning is already over
301 |     #     "2018-03-07T12:43",
302 |     #     ["morgens um 8", "late morning at 8"],
303 |     # ),
304 |     (
305 |         "Time[]{X-X-X 16:30 (X/X)}",
306 |         "2018-03-07T12:43",
307 |         ["nachmittags um 16:30", "afternoon at 16:30", "16:30"],
308 |     ),
309 |     # ruleDateTOD
310 |     (
311 |         "Time[]{2018-08-05 20:00 (X/X)}",
312 |         "2018-03-07T12:43",
313 |         [
314 |             "5. August um 8",
315 |             "August 5th at 8",
316 |             # "august 5 at 8am",
317 |             "5. Aug gegen 8",
318 |             "05.08.2018 8Uhr",
319 |             "05.08.2018 8pm",
320 |             "august 5th 8"
321 |         ],
322 |     ),
323 |     # ruleTODDate
324 |     (
325 |         "Time[]{2018-08-05 20:00 (X/X)}",
326 |         "2018-03-07T12:43",
327 |         ["um 8 5. August", "at 8 on August 5th"],
328 |     ),
329 |     # ruleDateDate, ruleDOMDate, ruleDateDOM
330 |     (
331 |         "Interval[]{2018-08-05 X:X (X/X) - 2018-08-16 X:X (X/X)}",
332 |         "2018-03-07T12:43",
333 |         [
334 |             "5.8. - 16.8.",
335 |             "August 5th - August 16th",
336 |             "Aug 5 - 16",
337 |             "from Aug 5 to 16",
338 |             "5 to 16 Aug",
339 |             "from 5 to 16 Aug",
340 |             "5. - 16.8.",
341 |             "5.8. - 16.8.2018",
342 |             "5.8. bis 16.8.2018",
343 |             "5.8. - 16.8.",
344 |             "5.8. bis 16.8.",
345 |             "5. - 16.8.",
346 |             "5.8. - 16.8.2018",
347 |             "5.8. bis 16.8.2018",
348 |             "5.8. - 16.8.",
349 |             "5.8. bis 16.8.",
350 |             "5. bis zum 16.8.",
351 |             "vom 05.08.2018 zum 16.08.2018",
352 |         ],
353 |     ),
354 |     # ruleDOYDate
355 |     (
356 |         "Interval[]{2017-08-05 X:X (X/X) - 2017-08-16 X:X (X/X)}",
357 |         "2018-03-07T12:43",
358 |         ["5.8. - 16.8.2017", "Samstag 5.8. - Mittwoch 16.8.2017"],
359 |     ),
360 |     # ruleDateTimeDateTime
361 |     (
362 |         "Interval[]{2018-08-05 08:00 (X/X) - 2018-08-16 13:00 (X/X)}",
363 |         "2018-03-07T12:43",
364 |         ["5.8. 8am - 16.8. 13Uhr", "August 5th 8am - August 16th 13h"],
365 |     ),
366 |     (
367 |         "Interval[]{X-X-X 08:00 (X/X) - X-X-X 13:00 (X/X)}",
368 |         "2018-03-07T12:43",
369 |         ["8am - 13:00", "8am - 13Uhr", "8am to 13h", "8am-13"],
370 |     ),
371 |     # increasing coverage for int-int hours
372 |     (
373 |         "Interval[]{X-X-X 10:00 (X/X) - X-X-X 12:00 (X/X)}",
374 |         "2018-03-07T12:43",
375 |         ["10am - 12:00", "10am - 12Uhr", "10am to 12h", "10-12am", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12", "10-12",],
376 |     ),
377 |     (
378 |         "Interval[]{X-X-X 15:00 (X/X) - X-X-X 16:00 (X/X)}",
379 |         "2018-03-07T12:43",
380 |         ["03:00 - 04:00", "3Uhr - 4Uhr", "3h to 4h", "3-4"],
381 |     ),
382 |     # rulePODPOD
383 |     # (
384 |     #     "Interval[]{X-X-X X:X (X/evening) - X-X-X X:X (X/night)}",
385 |     #     "2018-05-08T10:32",
386 |     #     ["evening/night"],
387 |     # ),
388 |     # ruleAfterTime
389 |     (
390 |         "Interval[]{2017-11-26 20:00 (X/X) - None}",
391 |         "2018-03-07T12:43",
392 |         ["26.11.2017 ab 08:00 Uhr"],
393 |     ),
394 |     (
395 |         "Interval[]{2018-11-26 20:00 (X/X) - None}",
396 |         "2018-03-07T12:43",
397 |         [
398 |             "26.11.2018 ab 08:00 Uhr",
399 |             "26.11. ab 08:00 Uhr",
400 |             "26.11. frühestens um 08:00 Uhr",
401 |             "November 26th earliest 08:00 Uhr",
402 |             "November 26th earliest after 08:00 Uhr",
403 |             "November 26th from earliest 08:00 Uhr",
404 |             "26.11. nicht vor 08:00 Uhr",
405 |         ],
406 |     ),
407 |     # ruleBeforeTime
408 |     (
409 |         "Interval[]{None - 2017-11-26 20:00 (X/X)}",
410 |         "2018-03-07T12:43",
411 |         [
412 |             "26.11.2017 vor 08:00 Uhr",
413 |             "26.11.2017 bis spätestens 08:00 Uhr",
414 |             "26.11.2017 spätestens bis 08:00 Uhr",
415 |         ],
416 |     ),
417 |     (
418 |         "Interval[]{None - 2018-11-26 20:00 (X/X)}",
419 |         "2018-03-07T12:43",
420 |         ["26.11.2018 vor 08:00 Uhr", "26.11. vor 08:00 Uhr", "26.11. not after 08:00"],
421 |     ),
422 |     # ruleHHMM
423 |     (
424 |         "Time[]{X-X-X 20:00 (X/X)}",
425 |         "2018-03-07T00:00",
426 |         ["8h", "8 Uhr", "8:00", "8h00", "8"],
427 |     ),
428 |     (
429 |         "Time[]{X-X-X 20:00 (X/X)}",
430 |         "2018-03-07T00:00",
431 |         ["20h", "20 Uhr", "20:00", "20pm", "20"],
432 |     ),  # <-- ignore am, since this makes no sense
433 |     # ruleMonthDOM
434 |     (
435 |         "Time[]{2018-04-07 X:X (X/X)}",
436 |         "2018-03-07T00:00",
437 |         ["april 7", "april 7th", "7. April"],
438 |     ),
439 |     # ruleAbsorbOnTime
440 |     (
441 |         "Time[]{X-X-X 20:00 (X/X)}",
442 |         "2018-03-07T00:00",
443 |         ["at 8pm", "um 20h", "gegen 20:00", "about 8pm", "at around 8pm"],
444 |     ),
445 |     # ruleAbsorbOnTime + X
446 |     (
447 |         "Time[]{2018-06-21 08:00 (X/X)}",
448 |         "2018-03-07T00:00",
449 |         [
450 |             "Jun 21 at 8am",
451 |             "Jun 21 8am",
452 |             "Jun 21 at 8am",
453 |             "Jun 21 on 8am",
454 |             "21. Juni um 8am",
455 |         ],
456 |     ),
457 |     # ruleDateInterval
458 |     (
459 |         "Interval[]{2018-11-13 13:30 (X/X) - 2018-11-13 15:35 (X/X)}",
460 |         "2018-03-07T00:00",
461 |         [
462 |             "Mon, Nov 13 1:30 PM - 3:35 PM",
463 |             "Montag, 13. November von 13:30 bis 15:35",
464 |             "Nov 13 13:30 - 15:35",
465 |         ],
466 |     ),
467 |     (
468 |         "Interval[]{2018-11-13 13:30 (X/X) - None}",
469 |         "2018-03-07T00:00",
470 |         [
471 |             "Mon, Nov 13 after 1:30 PM",
472 |             "Montag, 13. November nach 13:30",
473 |             "Montag, 13. November 2018 nach 13:30",
474 |             "13.11. ab 13:30",
475 |         ],
476 |     ),
477 |     (
478 |         "Interval[]{2016-11-13 13:30 (X/X) - None}",
479 |         "2018-03-07T00:00",
480 |         [
481 |             "Mon, Nov 13 2016 after 1:30 PM",
482 |             "Montag, 13. November 2016 nach 13:30",
483 |             "Montag, 13. November 2016 nach 13:30",
484 |             "13.11.16 ab 13:30",
485 |         ],
486 |     ),
487 |     (
488 |         "Interval[]{2018-03-11 X:X (X/noon) - None}",
489 |         "2018-03-07T00:00",
490 |         ["Sunday after noon", "Sonntag ab Mittag", "Sonntag, 11. März 2018 ab Mittag"],
491 |     ),
492 |     # (
493 |     #     "Interval[]{2018-03-11 21:00 (X/X) - None}",
494 |     #     "2018-03-07T00:00",
495 |     #     [
496 |     #         "Sunday Mar 11 after 9",
497 |     #         "Sonntag, 11. März 2018 nach 9",
498 |     #         "Sonntag, der 11. Mrz. nach 9",
499 |     #     ],
500 |     # ),
501 |     # (
502 |     #     "Interval[]{2016-03-11 21:00 (X/X) - None}",
503 |     #     "2018-03-07T00:00",
504 |     #     [
505 |     #         "Sunday Mar 11 2016 after 9",
506 |     #         "Sonntag, 11. März 2016 nach 9",
507 |     #         "Sonntag, der 11. Mrz 2016 nach 9",
508 |     #     ],
509 |     # ),
510 |     # ruleDateInterval - day wrap
511 |     (
512 |         "Interval[]{2018-11-13 23:30 (X/X) - 2018-11-14 03:35 (X/X)}",
513 |         "2018-03-07T00:00",
514 |         ["Mon, Nov 13 11:30 PM - 3:35 AM", "Nov 13 23:30 - 3:35am"],
515 |     ),
516 |     (
517 |         "Interval[]{2018-02-20 21:00 (X/X) - 2018-02-21 04:00 (X/X)}",
518 |         "2018-02-20T09:37",
519 |         ["today 9pm - 4am"],
520 |     ),
521 |     # ruleAbsorbDOWComma -- deleted, comma should be removed by caller
522 |     (
523 |         "Time[]{2018-07-27 X:X (X/X)}",
524 |         "2018-07-26T00:00",
525 |         ["Freitag, dem 27.", "Fri, the 27th", "fri 27"],
526 |     ),
527 |     # ruleNamedHour
528 |     ("Time[]{X-X-X 09:00 (X/X)}", "2018-07-26T00:00", ["neun", "nine"]),
529 |     # ruleQuarterBeforeHH
530 |     # (
531 |     #     "Time[]{2018-07-26 19:45 (X/X)}",
532 |     #     "2018-07-26T00:00",
533 |     #     ["viertel vor acht", "viertel vor 8", "quarter to eight"],
534 |     # ),
535 |     # ruleQuarterBeforeHH midnight wrap
536 |     ("Time[]{X-X-X 23:45 (X/X)}", "2018-07-26T00:00", ["viertel vor 0"]),
537 |     # ruleQuarterAfterHH
538 |     # (
539 |     #     "Time[]{2018-07-26 08:15 (X/X)}",
540 |     #     "2018-07-26T00:00",
541 |     #     ["viertel nach acht", "viertel nach 8", "quarter past eight"],
542 |     # ),
543 |     # ruleHalfBeforeHH
544 |     # (
545 |     #     "Time[]{2018-07-26 07:30 (X/X)}",
546 |     #     "2018-07-26T00:00",
547 |     #     ["half eight"],
548 |     # ),
549 |     # ruleHalfBeforeHH not when minutes are present
550 |     ("Time[]{X-X-X 19:35 (X/X)}", "2018-07-26T00:00", ["halb 7:35"]),
551 |     # ruleHalfBeforeHH midnight wrap
552 |     ("Time[]{X-X-X 23:30 (X/X)}", "2018-07-26T00:00", ["halb mitternacht"]),
553 |     # ruleHalfAfterHH
554 |     (
555 |         "Time[]{X-X-X 08:30 (X/X)}",
556 |         "2018-07-26T00:00",
557 |         ["halb nach acht", "halfe past eight"],
558 |     ),
559 |     # ruleHalfAfterHH not when minutes are present
560 |     ("Time[]{X-X-X 20:32 (X/X)}", "2018-07-26T00:00", ["halb nach 8:32"]),
561 |     # rulePODInterval
562 |     # (
563 |     #     "Interval[]{None - 2018-09-17 22:00 (X/X)}",
564 |     #     "2018-07-26T00:00",
565 |     #     ["am 17.9. abends vor 10", "at Sep 17th in the evening before 10"],
566 |     # ),
567 |     # (
568 |     #     "Interval[]{X-X-X 22:00 (X/X) - None}",
569 |     #     "2018-07-26T00:00",
570 |     #     ["abends nach 10", "in the evening after 10", "in the evening after 22h"],
571 |     # ),
572 |     # (
573 |     #     "Interval[]{X-X-X 20:00 (X/X) - X-X-X 21:00 (X/X)}",
574 |     #     "2018-07-26T00:00",
575 |     #     ["in the evening between 8 and 9", "Jul 26th between 20 and 21"],
576 |     # ),
577 |     # (
578 |     #     "Interval[]{X-X-X 08:00 (X/X) - X-X-X 09:00 (X/X)}",
579 |     #     "2018-07-26T00:00",
580 |     #     ["in the morning between 8 and 9", "Jul 26th between 8 and 9"],
581 |     # ),
582 |     # rule
583 |     #
584 |     # -----------------------------------------------------------------------------
585 |     # OLD CORPUS
586 |     # -----------------------------------------------------------------------------
587 |     #
588 |     (
589 |         "Interval[]{2017-12-19 21:30 (X/X) - 2017-12-19 22:45 (X/X)}",
590 |         "2017-12-18T12:34",
591 |         [
592 |             "tomorrow 09:30 - 10:45",
593 |             "tomorrow 0930 - 1045",
594 |             "19. Dezember von 09:30 bis 10:45",
595 |             "19th of December from 09:30 til 10:45",
596 |             "19.12. 09:30 - 10:45",
597 |             "19.12.17 09:30 - 10:45",
598 |             "19.12.2017 09:30 - 19.12.2017 10:45",
599 |             "19.12.2017 09:30 - 10:45",
600 |             "19 dec 0930-1045",
601 |             "Dec 19th 9:30pm to 10:45pm",
602 |         ],
603 |     ),
604 |     (
605 |         "Interval[]{2018-02-16 X:X (X/X) - 2018-02-21 X:X (X/X)}",
606 |         "2017-12-18T12:34",
607 |         ["16.02.2018 - 21.02.2018", "16. bis 21.02.2018"],
608 |     ),
609 |     (
610 |         "Interval[]{2018-08-07 X:X (X/X) - 2018-08-10 X:X (X/X)}",
611 |         "2017-12-18T12:34",
612 |         ["07.-10.08.2018"],
613 |     ),
614 |     # ('Range[]{2018-12-09 - 2018-12-13}',
615 |     #  '2017-12-18T12:34',
616 |     #  [
617 |     #      '09.-13.12.2018 von Samstag bis Time[]{2017-05-11 X:X (X/X)}'
618 |     #  ]),
619 |     # ('Range[]{2018-04-27 - 2018-04-30}',
620 |     #  '2017-12-18T12:34',
621 |     #  [
622 |     #      # 'from the 27th to the 30th of April 2018',
623 |     #      '27.-30.04.2018 von Freitag bis Montag'
624 |     #  ]),
625 |     (
626 |         "Time[]{2018-01-13 X:X (X/X)}",
627 |         "2017-12-18T12:34",
628 |         ["am 13.1.", "am 13.01.", "am 13. Januar", "13.01", "13.1", "13th Jan"],
629 |     ),
630 |     (
631 |         "Time[]{2017-12-19 X:X (X/X)}",
632 |         "2017-12-18T12:34",
633 |         [
634 |             "am Dienstag",
635 |             "am 19.12",
636 |             "Dienstag 19.12",
637 |             "Tuesday 19th of December",
638 |             "Tuesday December 19th",
639 |             "Dienstag 19. Dezember",
640 |             "Dienstag Dezember 19.",
641 |             "Dienstag",
642 |         ],
643 |     ),
644 |     (
645 |         "Time[]{2018-03-01 14:30 (X/X)}",
646 |         "2017-12-18T12:34",
647 |         [
648 |             # mm/dd does not work yet
649 |             # '03/01/2018 at 2:30 pm',
650 |             "am 01.03.2018 um 14:30",
651 |             "Mar 1st 2:30 pm",
652 |             "1. März um 1430 Uhr",
653 |             "01.03.2018 14:30",
654 |         ],
655 |     ),
656 |     (
657 |         "Time[]{2018-01-03 14:30 (X/X)}",
658 |         "2017-12-18T12:34",
659 |         [
660 |             # mm/dd does not work yet
661 |             # '01/03/2018 at 2:30 pm',
662 |             "am 03.01.2018 um 14:30",
663 |             "Jan. 3rd 2:30 pm",
664 |             "3. Januar 1430 Uhr",
665 |             "03.01.2018 14:30",
666 |             "3 Jan 2018 14:30",
667 |         ],
668 |     ),
669 |     ("Time[]{2018-04-23 23:00 (X/X)}", "2017-12-18T12:34", ["23.04.2018 11:00"]),
670 |     ("Time[]{2018-11-19 18:00 (X/X)}", "2017-12-18T12:34", ["19.11.2018 18:00"]),
671 |     # (
672 |     #     "Time[]{2017-12-20 X:X (X/morning)}",
673 |     #     "2017-12-18T12:34",
674 |     #     ["Wednesday, 20th December morning", "december 20 morning"],
675 |     # ),
676 |     # (
677 |     #     "Time[]{2018-12-06 X:X (X/morning)}",
678 |     #     "2018-03-07T12:43",
679 |     #     [
680 |     #         "6. dezember morgens",
681 |     #         "6. dezember früh",
682 |     #         "6. dezember in der früh",
683 |     #         "december 6 early",
684 |     #         "december 6th morning",
685 |     #     ],
686 |     # ),
687 |     # (
688 |     #     "Time[]{2018-12-06 X:X (X/earlymorning)}",
689 |     #     "2018-03-07T12:43",
690 |     #     ["6. dezember früh morgens", "december 6 early morning"],
691 |     # ),
692 |     # (
693 |     #     "Time[]{2018-12-06 X:X (X/forenoon)}",
694 |     #     "2018-03-07T12:43",
695 |     #     ["6. Dezember vormittags", "december 6th forenoon"],
696 |     # ),
697 |     # (
698 |     #     "Time[]{2018-12-06 X:X (X/afternoon)}",
699 |     #     "2018-03-07T12:43",
700 |     #     ["6. Dezember nachmittag", "december 6 afternoon"],
701 |     # ),
702 |     # (
703 |     #     "Time[]{2018-12-06 X:X (X/noon)}",
704 |     #     "2018-03-07T12:43",
705 |     #     ["6. Dezember mittags", "december 6 noon"],
706 |     # ),
707 |     # (
708 |     #     "Time[]{2018-12-06 X:X (X/evening)}",
709 |     #     "2018-03-07T12:43",
710 |     #     ["6. Dezember abends", "december 6 late"],
711 |     # ),
712 |     # (
713 |     #     "Time[]{2018-12-06 X:X (X/lateevening)}",
714 |     #     "2018-03-07T12:43",
715 |     #     ["6. Dezember später abend", "december 6 late evening"],
716 |     # ),
717 |     # (
718 |     #     "Time[]{2018-12-06 X:X (X/veryearlyafternoon)}",
719 |     #     "2018-03-07T12:43",
720 |     #     ["6. Dezember sehr früher nachmittag", "december 6 very early afternoon"],
721 |     # ),
722 |     # ('DateTime[]{2017-12-20Tmorning}',
723 |     #  '2017-12-18T12:34',
724 |     #  ['Wednesday, morning, 20.12.17']),
725 |     # ('DateTime[]{2017-12-20Tafternoon}',
726 |     #  '2017-12-18T12:34',
727 |     #  ['Wednesday, afternoon, 20.12.17']),
728 |     # ('DateTime[]{2017-12-20 XX:XX (X/evening)}',
729 |     #  '2017-12-18T12:34',
730 |     #  ['Wednesday, evening, 20.12.17']),
731 |     ("Time[]{2017-12-20 18:45 (X/X)}", "2017-12-18T12:34", ["6:45 Uhr 20.12.2017"]),
732 |     ("Time[]{2018-08-04 15:00 (X/X)}", "2017-12-18T12:34", ["04.08.2018 15:00"]),
733 |     ("Time[]{2018-09-01 13:00 (X/X)}", "2017-12-18T12:34", ["01.09.2018 01:00"]),
734 |     ("Time[]{2018-11-29 22:00 (X/X)}", "2017-12-18T12:34", ["29.11.2018 22:00"]),
735 |     ("Time[]{2018-02-27 19:00 (X/X)}", "2017-12-18T12:34", ["27.02.2018 07:00"]),
736 |     ("Time[]{2018-05-09 21:30 (X/X)}", "2017-12-18T12:34", ["09.05.2018 09:30"]),
737 |     ("Time[]{2018-01-17 14:30 (X/X)}", "2017-12-18T12:34", ["17.01.2018 14:30"]),
738 |     (
739 |         "Interval[]{2018-06-21 11:00 (X/X) - 2018-06-21 13:00 (X/X)}",
740 |         "2017-12-18T12:34",
741 |         ["21.06.2018 11:00-13:00", "Jun 21st between 11am and 1pm"],
742 |     ),
743 |     (
744 |         "Interval[]{2018-07-09 20:00 (X/X) - 2018-07-13 22:00 (X/X)}",
745 |         "2017-12-18T12:34",
746 |         ["09.07.2018 08:00 - 13.07.2018 10:00"],
747 |     ),
748 |     # Military time tests
749 |     ("Time[]{2020-02-03 X:X (X/X)}", "2020-02-25T12:34", ["3 Feb 2020"]),
750 |     # Duration tests
751 |     # (
752 |     #     "Duration[]{1 nights}",
753 |     #     "2020-02-25T12:34",
754 |     #     ["one night", "ein nacht", "eine übernachtung"],
755 |     # ),
756 |     ("Duration[]{30 days}", "2020-02-25T12:34", ["in 30 days", "in 30 tage"],),
757 |     ("Duration[]{7 weeks}", "2020-02-25T12:34", ["in 7 weeks", "in 7 wochen"],),
758 |     (
759 |         "Duration[]{20 minutes}",
760 |         "2020-02-25T12:34",
761 |         ["in 20 minutes", "in twenty minutes", "in zwanzig Minuten"],
762 |     ),
763 |     ("Duration[]{1 months}", "2020-02-25T12:34", ["in 1 month", "in one month", "in ein Monat"]),
764 |     (
765 |         "Duration[]{30 minutes}",
766 |         "2020-02-25T12:34",
767 |         ["in half an hour", "in half hour", "in 1/2 hour", "in 1/2h", "in 1/2 h", "in halbe Stunde"],
768 |     ),
769 |     # ruleTimeDuration
770 |     # (
771 |     #     "Interval[]{2020-02-27 X:X (X/X) - 2020-02-28 X:X (X/X)}",
772 |     #     "2020-02-25T12:34",
773 |     #     ["on the 27th for one day", "on the 27th for one night"],
774 |     # ),
775 |     # (
776 |     #     "Interval[]{2020-02-25 15:00 (X/X) - 2020-02-25 16:00 (X/X)}",
777 |     #     "2020-02-25T12:34",
778 |     #     ["today 15:00 for one hour"],
779 |     # ),
780 |     # # ruleDurationInterval, ruleIntervalDuration
781 |     # (
782 |     #     "Interval[]{2020-11-15 X:X (X/X) - 2020-11-18 X:X (X/X)}",
783 |     #     "2020-02-25T12:34",
784 |     #     ["3 days 15-18 Nov", "15-18 Nov 3 Nächte", "15-18 Nov für 3 Nächte"],
785 |     # ),
786 | ]
787 | 


--------------------------------------------------------------------------------
/ctparse/time/postprocess_latent.py:
--------------------------------------------------------------------------------
 1 | """Those rules are applied as postprocessing steps after scoring has been already
 2 | done. Needed for backwards compatibility."""
 3 | from ctparse.types import Artifact, Interval, Time
 4 | from datetime import datetime
 5 | from dateutil.relativedelta import relativedelta
 6 | 
 7 | 
 8 | def apply_postprocessing_rules(ts: datetime, art: Artifact) -> Artifact:
 9 |     """Apply postprocessing rules to a resolution *art*. This is
10 |     introduced for backwards compatibility reasons.
11 | 
12 |     Example:
13 | 
14 |     8:00 pm, ts=2020.01.01 07:00
15 | 
16 |     produces a resolution:
17 | 
18 |     X-X-X 20:00
19 | 
20 |     after postprocessing this is anchored to the reference time:
21 | 
22 |     2020-01-01 20:00
23 |     """
24 |     if isinstance(art, Time):
25 |         if art.isTOD:
26 |             return _latent_tod(ts, art)
27 |     if isinstance(art, Interval):
28 |         if art.isTimeInterval:
29 |             return _latent_time_interval(ts, art)
30 | 
31 |     return art
32 | 
33 | 
34 | def _latent_tod(ts: datetime, tod: Time) -> Time:
35 |     dm = ts + relativedelta(hour=tod.hour, minute=tod.minute or 0)
36 |     if dm <= ts:
37 |         dm += relativedelta(days=1)
38 | 
39 |     res = Time(
40 |         year=dm.year, month=dm.month, day=dm.day, hour=dm.hour, minute=dm.minute, period=tod.period,
41 |     )
42 |     res.mstart = tod.mstart
43 |     res.mend = tod.mend
44 |     return res
45 | 
46 | 
47 | def _latent_time_interval(ts: datetime, ti: Interval) -> Interval:
48 |     assert ti.t_from and ti.t_to  # guaranteed by the caller
49 |     dm_from = ts + relativedelta(hour=ti.t_from.hour, minute=ti.t_from.minute or 0)
50 |     dm_to = ts + relativedelta(hour=ti.t_to.hour, minute=ti.t_to.minute or 0)
51 |     if dm_from <= ts:
52 |         dm_from += relativedelta(days=1)
53 |         dm_to += relativedelta(days=1)
54 | 
55 |     # pm-am interval overlap
56 |     if ti.t_from.period == "pm" and ti.t_to.period == "am":
57 |         dm_to += relativedelta(days=1)
58 | 
59 |     res = Interval(
60 |         t_from=Time(
61 |             year=dm_from.year,
62 |             month=dm_from.month,
63 |             day=dm_from.day,
64 |             hour=dm_from.hour,
65 |             minute=dm_from.minute,
66 |             period=ti.t_from.period,
67 |         ),
68 |         t_to=Time(
69 |             year=dm_to.year,
70 |             month=dm_to.month,
71 |             day=dm_to.day,
72 |             hour=dm_to.hour,
73 |             minute=dm_to.minute,
74 |             period=ti.t_to.period,
75 |         ),
76 |     )
77 |     res.mstart = ti.mstart
78 |     res.mend = ti.mend
79 |     return res
80 | 


--------------------------------------------------------------------------------
/ctparse/timers.py:
--------------------------------------------------------------------------------
 1 | """Utilities for tracking time spent in functions.
 2 | 
 3 | Although this module is not part of the public API, it is used in various parts of
 4 | the ctparse package.
 5 | 
 6 | """
 7 | from time import perf_counter
 8 | from typing import Any, Callable, TypeVar, Union, Tuple
 9 | from functools import wraps
10 | 
11 | T = TypeVar("T")
12 | 
13 | 
14 | def timeout(timeout: Union[float, int]) -> Callable[[], None]:
15 |     """Generate a functions that raises an exceptions if a timeout has passed.
16 | 
17 |     Example:
18 | 
19 |         sentinel = timeout(1.0)
20 |         time.sleep(0.5)
21 |         sentinel() # Do nothing
22 |         time.sleep(0.6)
23 |         sentinel() # Raises CTParseTimeoutException
24 | 
25 |     :param timeout:
26 |        time in seconds. If it is equal to zero, it means to never raise an exception.
27 |     :returns:
28 |         A function that raises a `CTParseTimeoutException` if `timeout` seconds have
29 |         expired.
30 |     """
31 |     start_time = perf_counter()
32 | 
33 |     def _tt() -> None:
34 |         if timeout == 0:
35 |             return
36 |         if perf_counter() - start_time > timeout:
37 |             raise CTParseTimeoutError()
38 | 
39 |     return _tt
40 | 
41 | 
42 | def timeit(f: Callable[..., T]) -> Callable[..., Tuple[T, float]]:
43 |     """Wrapper to time a function.
44 | 
45 |     The wrapped function is modified so that it returns a tuple `(f(args), t)`
46 |     where `t` the time in seconds the function call took to run.
47 | 
48 |     Example:
49 | 
50 |         def fun(x):
51 |             return x * x
52 | 
53 |         result, exec_time = timeit(fun)(3)
54 | 
55 |     """
56 | 
57 |     @wraps(f)
58 |     def _wrapper(*args: Any, **kwargs: Any) -> Tuple[T, float]:
59 |         start_time = perf_counter()
60 |         res = f(*args, **kwargs)
61 |         return res, perf_counter() - start_time
62 | 
63 |     return _wrapper
64 | 
65 | 
66 | # NOTE: TimeoutError is a built-in exception that means that
67 | # system function timed out at the system level. Hence we opt
68 | # for a custom exception.
69 | class CTParseTimeoutError(Exception):
70 |     """Exception raised by the `timeout` function."""
71 | 


--------------------------------------------------------------------------------
/ctparse/types.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime
  2 | from typing import Any, Dict, Optional, Tuple, Type, TypeVar, List
  3 | from dateutil.relativedelta import relativedelta
  4 | from dateutil.rrule import rrule, YEARLY, MONTHLY, WEEKLY, DAILY
  5 | 
  6 | import regex
  7 | from regex import Regex
  8 | import enum
  9 | 
 10 | T = TypeVar("T", bound="Artifact")
 11 | 
 12 | 
 13 | class Artifact:
 14 |     def __init__(self) -> None:
 15 |         self.mstart = 0
 16 |         self.mend = 0
 17 |         self._attrs = ["mstart", "mend"]
 18 | 
 19 |     def update_span(self: T, *args: "Artifact") -> T:
 20 |         self.mstart = args[2].mstart
 21 |         self.mend = args[-1].mend
 22 |         return self
 23 | 
 24 |     def __len__(self) -> int:
 25 |         return self.mend - self.mstart
 26 | 
 27 |     def __bool__(self) -> bool:
 28 |         return True
 29 | 
 30 |     def __str__(self) -> str:
 31 |         return ""
 32 | 
 33 |     def __repr__(self) -> str:
 34 |         return "{}[{}-{}]{{{}}}".format(
 35 |             self.__class__.__name__, self.mstart, self.mend, str(self)
 36 |         )
 37 | 
 38 |     def nb_str(self) -> str:
 39 |         """Return a string representation without the bounds information."""
 40 |         return "{}[]{{{}}}".format(self.__class__.__name__, str(self))
 41 | 
 42 |     def __eq__(self, other: Any) -> bool:
 43 |         if type(other) != type(self):
 44 |             return False
 45 |         else:
 46 |             return all(getattr(self, a) == getattr(other, a) for a in self._attrs)
 47 | 
 48 |     def __hash__(self) -> int:
 49 |         return hash(tuple(getattr(self, a) for a in self._attrs))
 50 | 
 51 |     def _hasOnly(self, *args: str) -> bool:
 52 |         """check that all attributes set to True are set (i.e. not None) and
 53 |         all set to False are not set (i.e. None)
 54 | 
 55 |         """
 56 |         return all(
 57 |             getattr(self, a) is not None if a in args else getattr(self, a) is None
 58 |             for a in self._attrs
 59 |         )
 60 | 
 61 |     def _hasAtLeast(self, *args: str) -> bool:
 62 |         """check that all attributes set to True are set (i.e. not None) and
 63 |         all set to False are not set (i.e. None)
 64 | 
 65 |         """
 66 |         return all(getattr(self, a) is not None for a in args)
 67 | 
 68 | 
 69 | class RegexMatch(Artifact):
 70 |     def __init__(self, id: int, m: Regex) -> None:
 71 |         super().__init__()
 72 |         self._attrs = ["mstart", "mend", "id"]
 73 |         self.key = "R{}".format(id)
 74 |         self.id = id
 75 |         self.match = m
 76 |         self.mstart = m.span(self.key)[0]
 77 |         self.mend = m.span(self.key)[1]
 78 |         self._text = m.group(self.key)
 79 | 
 80 |     def __str__(self) -> str:
 81 |         return "{}:{}".format(self.id, self._text)
 82 | 
 83 | 
 84 | _pod_hours = {
 85 |     "earlymorning": {
 86 |         "offset": (4, 7),
 87 |         "early": {
 88 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
 89 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
 90 |             "very": {"offset": (0, 0)},
 91 |             "offset": (-1, -1),
 92 |         },
 93 |         "late": {
 94 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
 95 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
 96 |             "very": {"offset": (0, 0)},
 97 |             "offset": (1, 1),
 98 |         },
 99 |     },
100 |     "morning": {
101 |         "offset": (6, 9),
102 |         "early": {
103 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
104 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
105 |             "very": {"offset": (0, 0)},
106 |             "offset": (-1, -1),
107 |         },
108 |         "late": {
109 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
110 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
111 |             "very": {"offset": (0, 0)},
112 |             "offset": (1, 1),
113 |         },
114 |     },
115 |     "forenoon": {
116 |         "offset": (9, 12),
117 |         "early": {
118 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
119 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
120 |             "very": {"offset": (0, 0)},
121 |             "offset": (-1, -1),
122 |         },
123 |         "late": {
124 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
125 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
126 |             "very": {"offset": (0, 0)},
127 |             "offset": (1, 1),
128 |         },
129 |     },
130 |     "noon": {
131 |         "offset": (11, 13),
132 |         "early": {
133 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
134 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
135 |             "very": {"offset": (0, 0)},
136 |             "offset": (-1, -1),
137 |         },
138 |         "late": {
139 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
140 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
141 |             "very": {"offset": (0, 0)},
142 |             "offset": (1, 1),
143 |         },
144 |     },
145 |     "afternoon": {
146 |         "offset": (12, 17),
147 |         "early": {
148 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
149 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
150 |             "very": {"offset": (0, 0)},
151 |             "offset": (-1, -1),
152 |         },
153 |         "late": {
154 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
155 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
156 |             "very": {"offset": (0, 0)},
157 |             "offset": (1, 1),
158 |         },
159 |     },
160 |     "evening": {
161 |         "offset": (17, 20),
162 |         "early": {
163 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
164 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
165 |             "very": {"offset": (0, 0)},
166 |             "offset": (-1, -1),
167 |         },
168 |         "late": {
169 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
170 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
171 |             "very": {"offset": (0, 0)},
172 |             "offset": (1, 1),
173 |         },
174 |     },
175 |     "lateevening": {
176 |         "offset": (18, 21),
177 |         "early": {
178 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
179 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
180 |             "very": {"offset": (0, 0)},
181 |             "offset": (-1, -1),
182 |         },
183 |         "late": {
184 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
185 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
186 |             "very": {"offset": (0, 0)},
187 |             "offset": (1, 1),
188 |         },
189 |     },
190 |     "night": {
191 |         "offset": (19, 22),
192 |         "early": {
193 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
194 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
195 |             "very": {"offset": (0, 0)},
196 |             "offset": (-1, -1),
197 |         },
198 |         "late": {
199 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
200 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
201 |             "very": {"offset": (0, 0)},
202 |             "offset": (1, 1),
203 |         },
204 |     },
205 |     "first": {
206 |         "offset": (0, 0),
207 |         "early": {
208 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
209 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
210 |             "very": {"offset": (0, 0)},
211 |             "offset": (0, 0),
212 |         },
213 |         "late": {
214 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
215 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
216 |             "very": {"offset": (0, 0)},
217 |             "offset": (0, 0),
218 |         },
219 |     },
220 |     "last": {
221 |         "offset": (23, 23),
222 |         "early": {
223 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
224 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
225 |             "very": {"offset": (0, 0)},
226 |             "offset": (0, 0),
227 |         },
228 |         "late": {
229 |             "early": {"offset": (0, 0), "very": {"offset": (0, 0)}},
230 |             "late": {"offset": (0, 0), "very": {"offset": (0, 0)}},
231 |             "very": {"offset": (0, 0)},
232 |             "offset": (0, 0),
233 |         },
234 |     },
235 | }
236 | 
237 | 
238 | def _mk_pod_hours() -> Dict[str, Tuple[int, int]]:
239 |     def _add_ts(t1: Tuple[int, int], t2: Tuple[int, int]) -> Tuple[int, int]:
240 |         return (t1[0] + t2[0], t1[1] + t2[1])
241 | 
242 |     def _mk(
243 |         pod: str, pod_data: Dict[str, Any], t: Tuple[int, int]
244 |     ) -> Dict[str, Tuple[int, int]]:
245 |         r = {pod: _add_ts(t, pod_data["offset"])}
246 |         for k, v in pod_data.items():
247 |             if k == "offset":
248 |                 continue
249 |             r.update(_mk(k + pod, v, r[pod]))
250 |         return r
251 | 
252 |     res = {}
253 |     for k, v in _pod_hours.items():
254 |         if k == "offset":
255 |             continue
256 |         res.update(_mk(k, v, (0, 0)))
257 |     return res
258 | 
259 | 
260 | pod_hours = _mk_pod_hours()
261 | 
262 | 
263 | _TIME_REGEX = regex.compile(
264 |     r"(\d{4}|X)-(\d{2}|X)-(\d{2}|X) (\d{2}|X):(\d{2}|X) \((\d|X)\/(\w+)\)"
265 | )
266 | 
267 | 
268 | class Time(Artifact):
269 |     def __init__(
270 |         self,
271 |         year: Optional[int] = None,
272 |         month: Optional[int] = None,
273 |         day: Optional[int] = None,
274 |         hour: Optional[int] = None,
275 |         minute: Optional[int] = None,
276 |         DOW: Optional[int] = None,
277 |         POD: Optional[str] = None,
278 |         period: Optional[str] = None,
279 |     ) -> None:
280 |         super().__init__()
281 |         self._attrs = ["year", "month", "day", "hour", "minute", "DOW", "POD", "period"]
282 |         # Might add some validation here, did not to avoid the overhead
283 |         self.year = year
284 |         self.month = month
285 |         self.day = day
286 |         self.hour = hour
287 |         self.minute = minute
288 |         self.DOW = DOW
289 |         self.POD = POD
290 |         self.period = period
291 | 
292 |     # -----------------------------------------------------------------------------
293 |     # Make sure to not accidentially test bool(x) as False when x==0, but you meant
294 |     # x==None
295 |     # -----------------------------------------------------------------------------
296 |     @property
297 |     def isDOY(self) -> bool:
298 |         """isDayOfYear <=> a dd.mm but not year
299 |         """
300 |         return self._hasOnly("month", "day")
301 | 
302 |     @property
303 |     def isDOM(self) -> bool:
304 |         """isDayOfMonth <=> a dd but no month
305 |         """
306 |         return self._hasOnly("day")
307 | 
308 |     @property
309 |     def isDOW(self) -> bool:
310 |         """isDayOfWeek <=> DOW is the 0=Monday index; fragile test, as the DOW
311 |         could be accompanied by e.g. a full date etc.; in practice,
312 |         however, the production rules do not do that.
313 | 
314 |         """
315 |         return self._hasOnly("DOW")
316 | 
317 |     @property
318 |     def isMonth(self) -> bool:
319 |         return self._hasOnly("month")
320 | 
321 |     @property
322 |     def isPOD(self) -> bool:
323 |         """isPartOfDay <=> morning, etc.; fragile, tests only that there is a
324 |         POD and neither a full date nor a full time
325 |         """
326 |         return self._hasOnly("POD")
327 | 
328 |     @property
329 |     def isHour(self) -> bool:
330 |         """only has an hour"""
331 |         return self._hasOnly("hour")
332 | 
333 |     @property
334 |     def isTOD(self) -> bool:
335 |         """isTimeOfDay - only a time, not date"""
336 |         return self._hasOnly("hour") or self._hasOnly("hour", "minute") or self._hasOnly("hour", "period") or self._hasOnly("hour", "minute", "period")
337 | 
338 |     @property
339 |     def isDate(self) -> bool:
340 |         """isDate - only a date, not time"""
341 |         return self._hasOnly("year", "month", "day")
342 | 
343 |     @property
344 |     def isDateTime(self) -> bool:
345 |         """a date and a time"""
346 |         return self._hasOnly("year", "month", "day", "hour") or self._hasOnly(
347 |             "year", "month", "day", "hour", "minute"
348 |         )
349 | 
350 |     @property
351 |     def isYear(self) -> bool:
352 |         """just a year"""
353 |         return self._hasOnly("year")
354 | 
355 |     @property
356 |     def hasDate(self) -> bool:
357 |         """at least a date"""
358 |         return self._hasAtLeast("year", "month", "day")
359 | 
360 |     @property
361 |     def hasDOY(self) -> bool:
362 |         """at least a day of year"""
363 |         return self._hasAtLeast("month", "day")
364 | 
365 |     @property
366 |     def hasDOW(self) -> bool:
367 |         """at least a day of week"""
368 |         return self._hasAtLeast("DOW")
369 | 
370 |     @property
371 |     def hasTime(self) -> bool:
372 |         """at least a time to the hour"""
373 |         return self._hasAtLeast("hour") or self._hasOnly("hour", "period")
374 | 
375 |     @property
376 |     def hasPeriod(self) -> bool:
377 |         """at least a period"""
378 |         return self._hasAtLeast("period")
379 | 
380 |     @property
381 |     def hasPOD(self) -> bool:
382 |         """at least a part of day"""
383 |         return self._hasAtLeast("POD")
384 | 
385 |     def __str__(self) -> str:
386 |         return "{}-{}-{} {}:{} ({}/{})".format(
387 |             "{:04d}".format(self.year) if self.year is not None else "X",
388 |             "{:02d}".format(self.month) if self.month is not None else "X",
389 |             "{:02d}".format(self.day) if self.day is not None else "X",
390 |             "{:02d}".format(self.hour) if self.hour is not None else "X",
391 |             "{:02d}".format(self.minute) if self.minute is not None else "X",
392 |             "{:d}".format(self.DOW) if self.DOW is not None else "X",
393 |             "{}".format(self.POD) if self.POD is not None else "X",
394 |         )
395 | 
396 |     @classmethod
397 |     def from_str(cls: Type["Time"], text: str) -> "Time":
398 |         match = _TIME_REGEX.match(text)
399 |         if not match:
400 |             raise ValueError("Invalid format")
401 |         else:
402 | 
403 |             def parse_opt_int(x: str) -> Optional[int]:
404 |                 return None if x == "X" else int(x)
405 | 
406 |             pod = match.group(7)
407 |             return cls(
408 |                 year=parse_opt_int(match.group(1)),
409 |                 month=parse_opt_int(match.group(2)),
410 |                 day=parse_opt_int(match.group(3)),
411 |                 hour=parse_opt_int(match.group(4)),
412 |                 minute=parse_opt_int(match.group(5)),
413 |                 DOW=parse_opt_int(match.group(6)),
414 |                 POD=None if pod == "X" else pod,
415 |             )
416 | 
417 |     @property
418 |     def start(self) -> "Time":
419 |         if self.hour is None and self.hasPOD:
420 |             hour = pod_hours[self.POD][0]  # type: ignore
421 |         else:
422 |             hour = self.hour or 0
423 |         return Time(
424 |             year=self.year,
425 |             month=self.month,
426 |             day=self.day,
427 |             hour=hour,
428 |             minute=self.minute or 0,
429 |             period=self.period,
430 |         )
431 | 
432 |     @property
433 |     def end(self) -> "Time":
434 |         if self.hour is None and self.hasPOD:
435 |             hour = pod_hours[self.POD][1]  # type: ignore
436 |         else:
437 |             hour = self.hour if self.hour is not None else 23
438 |         return Time(
439 |             year=self.year,
440 |             month=self.month,
441 |             day=self.day,
442 |             hour=hour,
443 |             minute=self.minute if self.minute is not None else 59,
444 |             period=self.period,
445 |         )
446 | 
447 |     @property
448 |     def dt(self) -> datetime:
449 |         # Use the start time, in case we have a POD specification
450 |         t = self.start
451 |         if t.year is None or t.month is None or t.day is None:
452 |             raise ValueError(
453 |                 "cannot convert underspecified Time into datetime"
454 |                 ", missing at least one of year, month or day"
455 |             )
456 |         return datetime(t.year, t.month, t.day, t.hour or 0, t.minute or 0)
457 | 
458 | 
459 | class Interval(Artifact):
460 |     def __init__(
461 |         self, t_from: Optional[Time] = None, t_to: Optional[Time] = None
462 |     ) -> None:
463 |         super().__init__()
464 |         self._attrs = ["t_from", "t_to"]
465 |         self.t_from = t_from
466 |         self.t_to = t_to
467 | 
468 |     @property
469 |     def isTimeInterval(self) -> bool:
470 |         if self.t_from is None or self.t_to is None:
471 |             return False
472 |         else:
473 |             return self.t_from.isTOD and self.t_to.isTOD
474 | 
475 |     @property
476 |     def isDateInterval(self) -> bool:
477 |         if self.t_from is None or self.t_to is None:
478 |             return False
479 |         return self.t_from.isDate and self.t_to.isDate
480 | 
481 |     def __str__(self) -> str:
482 |         return "{} - {}".format(str(self.t_from), str(self.t_to))
483 | 
484 |     @classmethod
485 |     def from_str(cls: Type["Interval"], text: str) -> "Interval":
486 |         bounds = text.split(" - ")
487 |         if len(bounds) != 2:
488 |             raise ValueError("Invalid format")
489 | 
490 |         t_from = None if bounds[0] == "None" else Time.from_str(bounds[0])
491 |         t_to = None if bounds[1] == "None" else Time.from_str(bounds[1])
492 |         return cls(t_from=t_from, t_to=t_to)
493 | 
494 |     @property
495 |     def start(self) -> Optional[Time]:
496 |         if self.t_from is not None:
497 |             return self.t_from.start
498 |         else:
499 |             return None
500 | 
501 |     @property
502 |     def end(self) -> Optional[Time]:
503 |         if self.t_to is not None:
504 |             return self.t_to.end
505 |         else:
506 |             return None
507 | 
508 | 
509 | @enum.unique
510 | class DurationUnit(enum.Enum):
511 |     MINUTES = "minutes"
512 |     HOURS = "hours"
513 |     DAYS = "days"
514 |     NIGHTS = "nights"
515 |     WEEKS = "weeks"
516 |     MONTHS = "months"
517 |     YEARS = "years"
518 | 
519 | 
520 | class Duration(Artifact):
521 |     def __init__(self, value: int, unit: DurationUnit):
522 |         """Create a Duration using value and unit.
523 | 
524 |         Typical values for unit are:
525 | 
526 |         minute, hour, day, night, week, month, year
527 |         """
528 |         super().__init__()
529 |         self.value = value
530 |         self.unit = unit
531 | 
532 |     def __str__(self) -> str:
533 |         return "{} {}".format(self.value, self.unit.value)
534 | 
535 |     @classmethod
536 |     def from_str(cls: Type["Duration"], text: str) -> "Duration":
537 |         value, unit = text.split()
538 |         return Duration(int(value), DurationUnit(unit))
539 | 
540 |     def time(self, ts: datetime) -> Time:
541 |         if self.unit == DurationUnit.MINUTES:
542 |             dm = ts + relativedelta(minutes=+self.value)
543 |             return Time(year=dm.year, month=dm.month, day=dm.day, hour=dm.hour, minute=dm.minute)
544 |         if self.unit == DurationUnit.HOURS:
545 |             dm = ts + relativedelta(hours=+self.value)
546 |             return Time(year=dm.year, month=dm.month, day=dm.day, hour=dm.hour, minute=dm.minute)
547 |         if self.unit == DurationUnit.DAYS:
548 |             dm = ts + relativedelta(days=+self.value)
549 |         if self.unit == DurationUnit.WEEKS:
550 |             dm = ts + relativedelta(days=+self.value*7)
551 |         if self.unit == DurationUnit.MONTHS:
552 |             dm = ts + relativedelta(months=+self.value)
553 |         if self.unit == DurationUnit.YEARS:
554 |             dm = ts + relativedelta(years=+self.value)
555 | 
556 |         return Time(year=dm.year, month=dm.month, day=dm.day)
557 | 
558 | 
559 | @enum.unique
560 | class RecurringFrequency(enum.Enum):
561 |     DAILY = "DAILY"
562 |     WEEKLY = "WEEKLY"
563 |     MONTHLY = "MONTHLY"
564 |     YEARLY = "YEARLY"
565 | 
566 | 
567 | class Recurring(Artifact):
568 |     def __init__(
569 |         self,
570 |         frequency: Optional[RecurringFrequency] = None,
571 |         interval: Optional[int] = None,
572 |         start_time: Optional[Time] = None,
573 |         end_time: Optional[Time] = None,
574 |         byday: Optional[tuple[int, ...]] = None,
575 |     ):
576 |         super().__init__()
577 |         self._attrs = ['start_time', 'end_time', 'frequency', 'interval', 'byday']
578 |         self.start_time = start_time
579 |         self.end_time = end_time
580 |         self.frequency = frequency
581 |         self.interval = interval
582 |         self.frequency_map = {RecurringFrequency.DAILY.value: DAILY,
583 |                               RecurringFrequency.WEEKLY.value: WEEKLY,
584 |                               RecurringFrequency.MONTHLY.value: MONTHLY,
585 |                               RecurringFrequency.YEARLY.value: YEARLY}
586 |         self.byday = byday
587 | 
588 |     def __str__(self) -> str:
589 |         return "{} {} {} {}".format(self.frequency, self.interval, self.start_time, self.end_time, self.byday)
590 | 
591 |     @property
592 |     def isRecurring(self) -> bool:
593 |         if self.frequency and self.interval is None:
594 |             return False
595 |         else:
596 |             return True
597 | 
598 |     @property
599 |     def isRecurringDOW(self) -> bool:
600 |         if not self.start_time.DOW:
601 |             return False
602 |         else:
603 |             return True
604 | 
605 |     @property
606 |     def isRecurringTime(self) -> bool:
607 |         if not self.start_time.hasTime:
608 |             return False
609 |         else:
610 |             return True
611 | 
612 |     def to_rrule(self) -> rrule:
613 |         r_rule = rrule(freq=self.frequency_map[self.frequency], interval=self.interval, byweekday=self.byday)
614 |         r_rule = r_rule.__str__().split('\n')[1]
615 |         return r_rule
616 | 
617 | 
618 | class RecurringArray(Artifact):
619 |     def __init__(self,
620 |                  rec_1: Optional[Recurring] = None,
621 |                  rec_2: Optional[Recurring] = None,
622 |                  rec_3: Optional[Recurring] = None,
623 |                  rec_4: Optional[Recurring] = None,
624 |                  rec_5: Optional[Recurring] = None,
625 |                  ):
626 |         super().__init__()
627 |         self._attrs = ['rec_1', 'rec_2', 'rec_3', 'rec_4', 'rec_5']
628 |         self.rec_1 = rec_1
629 |         self.rec_2 = rec_2
630 |         self.rec_3 = rec_3
631 |         self.rec_4 = rec_4
632 |         self.rec_5 = rec_5
633 | 
634 |     def __str__(self) -> str:
635 |         return "\n Recurring instance: {} \n Recurring instance: {} \n Recurring instance: {} \n Recurring instance: {} \n Recurring instance: {}".format(self.rec_1, self.rec_2, self.rec_3, self.rec_4, self.rec_5)
636 | 
637 |     @property
638 |     def to_list(self) -> list:
639 |         array = [self.rec_1, self.rec_2, self.rec_3, self.rec_4, self.rec_5]
640 |         array = [i for i in array if i != None]
641 |         return array
642 | 


--------------------------------------------------------------------------------
/datasets/README.rst:
--------------------------------------------------------------------------------
 1 | ==================
 2 | Time Parse Dataset
 3 | ==================
 4 | 
 5 | The dataset included in ``datasets/timeparse_corpus.json`` contains a set of ~2000 human annotated time expression in english and german.
 6 | 
 7 | The dataset is a list of json records with the following fields:
 8 | 
 9 | - *text*: the text for the time expression
10 | - *ref_time*: a timestamp in ISO 8601 format ``YYYY-MM-DDTHH:MM:SS``
11 | - *gold_parse*: the human annotation of the time expression. It can be a ``Time`` or ``Interval``. 
12 | - *language*: a two-digit code indicating the language. In this dataset it is either "en" or "de".
13 | 
14 | 
15 | For ``Time``, the format is as follows::
16 | 
17 |     Time[]{YYYY-MM-DD HH:MM (dow/tod)} 
18 | 
19 | Where:
20 | - ``YYYY`` is a four-digit year or ``X``, if year is missing
21 | - ``MM`` is a two-digit month or ``X``, if month is missing
22 | - ``DD`` is a two-digit day or ``X``, if day is missing
23 | - ``HH`` is a two-digit hour (24 hour clock) or ``X``, if hour is missing
24 | - ``MM`` is a two-digit minute or ``X``, if minute is missing
25 | - ``dow`` is an integer between 0 and 6 representing day of week or X, if missing (in the dataset, day of week is always missing)
26 | - ``tod`` is a string representing the time of day (such as earlymorning, morning, forenoon, noon, afternoon, evening, lateevening) or X if not specified.
27 | 
28 | Example::
29 | 
30 |     Morning of the 11th June 2017
31 |     Time[]{2017-06-11 X:X (X/morning)}
32 | 
33 | For ``Interval`` the format is as follows::
34 | 
35 |     Interval[]{<START_T> - <END_T>}
36 | 
37 | Where ``<START_T>`` and ``<END_T>`` are the beginning and end of the interval. ``<START_T>`` or ``<END_T>`` can be None if the interval is open-ended. They can be specified
38 | using the same representation for times, as described above::
39 | 
40 |     YYYY-MM-DD HH:MM (dow/tod)
41 | 
42 | Example::
43 | 
44 |     Wed, Oct 11 2017 8:30 PM - 9:47 PM
45 |     Interval[]{2017-10-11 08:30 (X/X) - 2017-10-11 09:47 (X/X)}


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = ctparse
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # ctparse documentation build configuration file, created by
  5 | # sphinx-quickstart on Fri Jun  9 13:47:02 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another
 17 | # directory, add these directories to sys.path here. If the directory is
 18 | # relative to the documentation root, use os.path.abspath to make it
 19 | # absolute, like shown here.
 20 | #
 21 | import os
 22 | import sys
 23 | sys.path.insert(0, os.path.abspath('..'))
 24 | 
 25 | import ctparse  # noqa
 26 | 
 27 | # -- General configuration ---------------------------------------------
 28 | 
 29 | # If your documentation needs a minimal Sphinx version, state it here.
 30 | #
 31 | # needs_sphinx = '1.0'
 32 | 
 33 | # Add any Sphinx extension module names here, as strings. They can be
 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 35 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.napoleon']
 36 | 
 37 | # Add any paths that contain templates here, relative to this directory.
 38 | templates_path = ['_templates']
 39 | 
 40 | # The suffix(es) of source filenames.
 41 | # You can specify multiple suffix as a list of string:
 42 | #
 43 | # source_suffix = ['.rst', '.md']
 44 | source_suffix = '.rst'
 45 | 
 46 | # The master toctree document.
 47 | master_doc = 'index'
 48 | 
 49 | # General information about the project.
 50 | project = u'ctparse - Parse natural language time expressions'
 51 | copyright = u"2018, Sebastian Mika, Comtravo GmbH"
 52 | author = u"Sebastian Mika - Comtravo"
 53 | 
 54 | # The version info for the project you're documenting, acts as replacement
 55 | # for |version| and |release|, also used in various other places throughout
 56 | # the built documents.
 57 | #
 58 | # The short X.Y version.
 59 | version = ctparse.__version__
 60 | # The full version, including alpha/beta/rc tags.
 61 | release = ctparse.__version__
 62 | 
 63 | # The language for content autogenerated by Sphinx. Refer to documentation
 64 | # for a list of supported languages.
 65 | #
 66 | # This is also used if you do content translation via gettext catalogs.
 67 | # Usually you set "language" from the command line for these cases.
 68 | language = None
 69 | 
 70 | # List of patterns, relative to source directory, that match files and
 71 | # directories to ignore when looking for source files.
 72 | # This patterns also effect to html_static_path and html_extra_path
 73 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
 74 | 
 75 | # The name of the Pygments (syntax highlighting) style to use.
 76 | pygments_style = 'sphinx'
 77 | 
 78 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 79 | todo_include_todos = False
 80 | 
 81 | 
 82 | # -- Options for HTML output -------------------------------------------
 83 | 
 84 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 85 | # a list of builtin themes.
 86 | #
 87 | html_theme = 'sphinx_rtd_theme'
 88 | 
 89 | # Theme options are theme-specific and customize the look and feel of a
 90 | # theme further.  For a list of options available for each theme, see the
 91 | # documentation.
 92 | #
 93 | # html_theme_options = {}
 94 | 
 95 | # Add any paths that contain custom static files (such as style sheets) here,
 96 | # relative to this directory. They are copied after the builtin static files,
 97 | # so a file named "default.css" will overwrite the builtin "default.css".
 98 | html_static_path = ['_static']
 99 | 
100 | 
101 | # -- Options for HTMLHelp output ---------------------------------------
102 | 
103 | # Output file base name for HTML help builder.
104 | htmlhelp_basename = 'ctparsedoc'
105 | 
106 | 
107 | # -- Options for LaTeX output ------------------------------------------
108 | 
109 | latex_elements = {
110 |     # The paper size ('letterpaper' or 'a4paper').
111 |     #
112 |     # 'papersize': 'letterpaper',
113 | 
114 |     # The font size ('10pt', '11pt' or '12pt').
115 |     #
116 |     # 'pointsize': '10pt',
117 | 
118 |     # Additional stuff for the LaTeX preamble.
119 |     #
120 |     # 'preamble': '',
121 | 
122 |     # Latex figure (float) alignment
123 |     #
124 |     # 'figure_align': 'htbp',
125 | }
126 | 
127 | # Grouping the document tree into LaTeX files. List of tuples
128 | # (source start file, target name, title, author, documentclass
129 | # [howto, manual, or own class]).
130 | latex_documents = [
131 |     (master_doc, 'ctparse.tex',
132 |      u'ctparse - Parse natural language time expressions in pytho Documentation',
133 |      u'Sebastian Mika', 'manual'),
134 | ]
135 | 
136 | 
137 | # -- Options for manual page output ------------------------------------
138 | 
139 | # One entry per manual page. List of tuples
140 | # (source start file, name, description, authors, manual section).
141 | man_pages = [
142 |     (master_doc, 'ctparse',
143 |      u'ctparse - Parse natural language time expressions Documentation',
144 |      [author], 1)
145 | ]
146 | 
147 | 
148 | # -- Options for Texinfo output ----------------------------------------
149 | 
150 | # Grouping the document tree into Texinfo files. List of tuples
151 | # (source start file, target name, title, author,
152 | #  dir menu entry, description, category)
153 | texinfo_documents = [
154 |     (master_doc, 'ctparse',
155 |      u'ctparse - Parse natural language time expressions Documentation',
156 |      author,
157 |      'ctparse',
158 |      'One line description of project.',
159 |      'Miscellaneous'),
160 | ]
161 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 


--------------------------------------------------------------------------------
/docs/ctparse.rst:
--------------------------------------------------------------------------------
  1 | ctparse package
  2 | ===============
  3 | 
  4 | Subpackages
  5 | -----------
  6 | 
  7 | .. toctree::
  8 | 
  9 |    ctparse.time
 10 | 
 11 | Submodules
 12 | ----------
 13 | 
 14 | ctparse.corpus module
 15 | ---------------------
 16 | 
 17 | .. automodule:: ctparse.corpus
 18 |    :members:
 19 |    :undoc-members:
 20 |    :show-inheritance:
 21 | 
 22 | ctparse.count\_vectorizer module
 23 | --------------------------------
 24 | 
 25 | .. automodule:: ctparse.count_vectorizer
 26 |    :members:
 27 |    :undoc-members:
 28 |    :show-inheritance:
 29 | 
 30 | ctparse.ctparse module
 31 | ----------------------
 32 | 
 33 | .. automodule:: ctparse.ctparse
 34 |    :members:
 35 |    :undoc-members:
 36 |    :show-inheritance:
 37 | 
 38 | ctparse.loader module
 39 | ---------------------
 40 | 
 41 | .. automodule:: ctparse.loader
 42 |    :members:
 43 |    :undoc-members:
 44 |    :show-inheritance:
 45 | 
 46 | ctparse.nb\_estimator module
 47 | ----------------------------
 48 | 
 49 | .. automodule:: ctparse.nb_estimator
 50 |    :members:
 51 |    :undoc-members:
 52 |    :show-inheritance:
 53 | 
 54 | ctparse.nb\_scorer module
 55 | -------------------------
 56 | 
 57 | .. automodule:: ctparse.nb_scorer
 58 |    :members:
 59 |    :undoc-members:
 60 |    :show-inheritance:
 61 | 
 62 | ctparse.partial\_parse module
 63 | -----------------------------
 64 | 
 65 | .. automodule:: ctparse.partial_parse
 66 |    :members:
 67 |    :undoc-members:
 68 |    :show-inheritance:
 69 | 
 70 | ctparse.pipeline module
 71 | -----------------------
 72 | 
 73 | .. automodule:: ctparse.pipeline
 74 |    :members:
 75 |    :undoc-members:
 76 |    :show-inheritance:
 77 | 
 78 | ctparse.rule module
 79 | -------------------
 80 | 
 81 | .. automodule:: ctparse.rule
 82 |    :members:
 83 |    :undoc-members:
 84 |    :show-inheritance:
 85 | 
 86 | ctparse.scorer module
 87 | ---------------------
 88 | 
 89 | .. automodule:: ctparse.scorer
 90 |    :members:
 91 |    :undoc-members:
 92 |    :show-inheritance:
 93 | 
 94 | ctparse.timers module
 95 | ---------------------
 96 | 
 97 | .. automodule:: ctparse.timers
 98 |    :members:
 99 |    :undoc-members:
100 |    :show-inheritance:
101 | 
102 | ctparse.types module
103 | --------------------
104 | 
105 | .. automodule:: ctparse.types
106 |    :members:
107 |    :undoc-members:
108 |    :show-inheritance:
109 | 
110 | 
111 | Module contents
112 | ---------------
113 | 
114 | .. automodule:: ctparse
115 |    :members:
116 |    :undoc-members:
117 |    :show-inheritance:
118 | 


--------------------------------------------------------------------------------
/docs/ctparse.time.rst:
--------------------------------------------------------------------------------
 1 | ctparse.time package
 2 | ====================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | ctparse.time.corpus module
 8 | --------------------------
 9 | 
10 | .. automodule:: ctparse.time.corpus
11 |     :members:
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 
15 | ctparse.time.rules module
16 | -------------------------
17 | 
18 | .. automodule:: ctparse.time.rules
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 | 
23 | 
24 | Module contents
25 | ---------------
26 | 
27 | .. automodule:: ctparse.time
28 |     :members:
29 |     :undoc-members:
30 |     :show-inheritance:
31 | 


--------------------------------------------------------------------------------
/docs/dataset.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../datasets/README.rst


--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to the ctparse documentation!
 2 | =====================================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :caption: Contents:
 7 | 
 8 |    readme
 9 |    installation
10 |    usage
11 |    dataset
12 |    contributing
13 |    modules
14 |    authors
15 |    history
16 | 
17 | Indices and tables
18 | ==================
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 | 


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | .. highlight:: shell
 2 | 
 3 | ============
 4 | Installation
 5 | ============
 6 | 
 7 | 
 8 | Stable release
 9 | --------------
10 | 
11 | To install ctparse - Parse natural language time expressions in pytho, run this command in your terminal:
12 | 
13 | .. code-block:: console
14 | 
15 |     $ pip install ctparse
16 | 
17 | This is the preferred method to install ctparse - Parse natural language time expressions in pytho, as it will always install the most recent stable release.
18 | 
19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20 | you through the process.
21 | 
22 | .. _pip: https://pip.pypa.io
23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24 | 
25 | 
26 | From sources
27 | ------------
28 | 
29 | The sources for ctparse - Parse natural language time expressions in pytho can be downloaded from the `Github repo`_.
30 | 
31 | You can either clone the public repository:
32 | 
33 | .. code-block:: console
34 | 
35 |     $ git clone git://github.com/comtravo/ctparse
36 | 
37 | Or download the `tarball`_:
38 | 
39 | .. code-block:: console
40 | 
41 |     $ curl  -OL https://github.com/comtravo/ctparse/tarball/master
42 | 
43 | Once you have a copy of the source, you can install it with:
44 | 
45 | .. code-block:: console
46 | 
47 |     $ python setup.py install
48 | 
49 | 
50 | .. _Github repo: https://github.com/comtravo/ctparse
51 | .. _tarball: https://github.com/comtravo/ctparse/tarball/master
52 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=python -msphinx
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=ctparse
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | 	echo.then set the SPHINXBUILD environment variable to point to the full
21 | 	echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | 	echo.Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | ctparse
2 | =======
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    ctparse
8 | 


--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 | 


--------------------------------------------------------------------------------
/docs/usage.rst:
--------------------------------------------------------------------------------
 1 | =====
 2 | Usage
 3 | =====
 4 | 
 5 | To use ctparse simply import the main ``ctparse`` function::
 6 | 
 7 |     
 8 |     from datetime import datetime
 9 |     from ctparse import ctparse
10 | 
11 |     ctparse('today', datetime(2018, 7, 8), timeout=1)
12 |     
13 | The output for the above code is `2018-07-08 X:X (X/X) s=2.273 p=(149, 'ruleToday')`
14 | 
15 | For more details on the parameters please see the docstrings.
16 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | # Specify the target platform details in config, so your developers are
 3 | # free to run mypy on Windows, Linux, or macOS and get consistent
 4 | # results.
 5 | python_version=3.6
 6 | platform=linux
 7 | 
 8 | # flake8-mypy expects the two following for sensible formatting
 9 | show_column_numbers=True
10 | 
11 | # show error messages from unrelated files
12 | follow_imports=normal
13 | 
14 | # suppress errors about unsatisfied imports
15 | ignore_missing_imports=True
16 | 
17 | # be strict
18 | disallow_untyped_calls=True
19 | warn_return_any=True
20 | strict_optional=True
21 | warn_no_return=True
22 | warn_redundant_casts=True
23 | warn_unused_ignores=True
24 | disallow_any_generics=True
25 | disallow_untyped_defs=True
26 | check_untyped_defs=True
27 | 
28 | # No incremental mode
29 | cache_dir=/dev/null
30 | 
31 | [mypy-ctparse.time.rules]
32 | # time rules check existence of fields in predicates
33 | strict_optional=False
34 | 
35 | [mypy-tests.*]
36 | disallow_untyped_defs=False
37 | 
38 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | .
2 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
 1 | pip==20.1.1
 2 | bumpversion==0.6.0
 3 | watchdog==0.10.2
 4 | flake8==3.8.2
 5 | flake8-bugbear==20.1.4
 6 | tox==3.15.1
 7 | coverage==5.1
 8 | sphinx==3.0.4
 9 | sphinx-rtd-theme==0.4.3
10 | 
11 | twine==3.1.1
12 | 
13 | pytest==5.4.2
14 | pytest-runner==5.2
15 | pytest-cov==2.9.0
16 | mypy==0.770
17 | black==19.10b0
18 | 


--------------------------------------------------------------------------------
/scripts/train_default_model.py:
--------------------------------------------------------------------------------
 1 | """Train a default multinomial bayes classifier"""
 2 | import argparse
 3 | import logging
 4 | 
 5 | from ctparse.corpus import load_timeparse_corpus, make_partial_rule_dataset, run_corpus
 6 | from ctparse.loader import DEFAULT_MODEL_FILE
 7 | from ctparse.nb_scorer import save_naive_bayes, train_naive_bayes
 8 | from ctparse.scorer import DummyScorer
 9 | from ctparse.time import auto_corpus, corpus
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | def parse_args():
15 |     parser = argparse.ArgumentParser()
16 |     parser.add_argument(
17 |         "--legacy",
18 |         help="Use legacy dataset (ctparse.time.corpus and ctparse.time.auto_corpus as training data)",
19 |         action="store_true",
20 |     )
21 |     parser.add_argument("--dataset", help="Dataset file")
22 |     return parser.parse_args()
23 | 
24 | 
25 | def main():
26 |     args = parse_args()
27 |     logging.basicConfig(
28 |         level=logging.INFO, format="%(asctime)s %(levelname)s [%(name)s] %(message)s"
29 |     )
30 | 
31 |     X_combined = []
32 |     y_combined = []
33 | 
34 |     if args.legacy:
35 |         logger.info("Loading legacy dataset")
36 |         X, y = run_corpus(corpus.corpus + auto_corpus.corpus)
37 |         X_combined.extend(X)
38 |         y_combined.extend(y)
39 | 
40 |     if args.dataset:
41 |         logger.info("Loading dataset {}".format(args.dataset))
42 |         entries = load_timeparse_corpus(args.dataset)
43 |         X, y = zip(
44 |             *make_partial_rule_dataset(
45 |                 entries,
46 |                 scorer=DummyScorer(),
47 |                 timeout=30,
48 |                 max_stack_depth=100,
49 |                 progress=True,
50 |             )
51 |         )
52 |         X_combined.extend(X)
53 |         y_combined.extend(y)
54 | 
55 |     if len(X) == 0:
56 |         raise ValueError("Need to specify at least a dataset for training")
57 | 
58 |     mdl = train_naive_bayes(X_combined, y_combined)
59 |     save_naive_bayes(mdl, DEFAULT_MODEL_FILE)
60 | 
61 | 
62 | if __name__ == "__main__":
63 |     main()
64 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [bumpversion]
 2 | current_version = 0.3.01
 3 | commit = True
 4 | tag = True
 5 | 
 6 | [bumpversion:file:setup.py]
 7 | search = version='{current_version}'
 8 | replace = version='{new_version}'
 9 | 
10 | [bumpversion:file:ctparse/__init__.py]
11 | search = __version__ = '{current_version}'
12 | replace = __version__ = '{new_version}'
13 | 
14 | [bdist_wheel]
15 | universal = 1
16 | 
17 | [flake8]
18 | exclude = docs
19 | max-line-length = 80
20 | select = C,E,F,W,B,B950
21 | ignore = E203,E266,E501,W503
22 | mypy_config = mypy.ini
23 | 
24 | [aliases]
25 | test = pytest
26 | 
27 | ;[tool:pytest]
28 | ;collect_ignore = ['setup.py']
29 | 
30 | [coverage:run]
31 | include = ctparse/*
32 | 
33 | [coverage:report]
34 | show_missing = True
35 | fail_under = 95
36 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """The setup script."""
 5 | 
 6 | from setuptools import setup, find_packages
 7 | 
 8 | with open('README.rst') as readme_file:
 9 |     readme = readme_file.read()
10 | 
11 | with open('HISTORY.rst') as history_file:
12 |     history = history_file.read()
13 | 
14 | requirements = []
15 | 
16 | setup_requirements = ['pytest-runner', ]
17 | 
18 | test_requirements = ['pytest', ]
19 | 
20 | setup(
21 |     author="Sebastian Mika/Comtravo",
22 |     author_email='sebastian.mika@comtravo.com',
23 |     classifiers=[
24 |         'Development Status :: 3 - Alpha',
25 |         'Intended Audience :: Developers',
26 |         'License :: OSI Approved :: MIT License',
27 |         'Natural Language :: English',
28 |         'Programming Language :: Python :: 3',
29 |         'Programming Language :: Python :: 3.6',
30 |         'Programming Language :: Python :: 3.7',
31 |         'Programming Language :: Python :: 3.8',
32 |         'Topic :: Software Development :: Libraries :: Python Modules',
33 |         'Topic :: Text Processing :: Linguistic',
34 |     ],
35 |     description="Parse natural language time expressions in python",
36 |     install_requires=[
37 |         'python-dateutil>=2.7.3,<3.0.0',
38 |         'regex>=2018.6.6',
39 |         'tqdm>=4.23.4,<5.0.0'
40 |     ],
41 |     license="MIT license",
42 |     long_description=readme + '\n\n' + history,
43 |     include_package_data=True,
44 |     keywords='quickadd',
45 |     name='quickadd',
46 |     packages=find_packages(include=['ctparse*']),
47 |     package_dir={'ctparse': 'ctparse'},
48 |     package_data={'ctparse': ['models/model.pbz', 'py.typed']},
49 |     setup_requires=setup_requirements,
50 |     test_suite='tests',
51 |     tests_require=test_requirements,
52 |     url='https://github.com/inferense/quickadd',
53 |     version='0.6.5',
54 |     zip_safe=False,
55 | )
56 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Acreom/quickadd/69543c79ad5db05a712abf223940fadf61740235/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_corpus.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | 
 3 | import pytest
 4 | 
 5 | from ctparse.corpus import (
 6 |     TimeParseEntry,
 7 |     load_timeparse_corpus,
 8 |     make_partial_rule_dataset,
 9 |     parse_nb_string,
10 |     run_corpus,
11 | )
12 | from ctparse.scorer import DummyScorer
13 | from ctparse.time.corpus import corpus
14 | from ctparse.types import Interval, Time
15 | 
16 | CORPUS_JSON = """
17 | [
18 |  {
19 |   "text": "Donnerstag, den 05.10. ca 6:55",
20 |   "ref_time": "2017-09-25T16:06:55",
21 |   "gold_parse": "Time[]{2017-10-05 06:55 (X/X)}",
22 |   "language": "de"
23 |  },
24 |  {
25 |   "text": "22.05.2017 früh",
26 |   "ref_time": "2017-05-16T05:42:09",
27 |   "gold_parse": "Time[]{2017-05-22 X:X (X/earlymorning)}",
28 |   "language": "de"
29 |  }
30 | ]
31 | """
32 | 
33 | 
34 | def test_run_corpus() -> None:
35 |     """The corpus passes if ctparse generates the desired
36 |     solution for each test at least once. Otherwise it fails.
37 |     """
38 |     X, y = run_corpus(corpus)
39 |     assert isinstance(y[0], bool)
40 |     assert isinstance(X[0][0], str)
41 | 
42 | 
43 | def test_run_corpus_failure() -> None:
44 |     fail_corpus = [("never produced", "2015-12-12T12:30", ("today", "heute"))]
45 |     with pytest.raises(Exception):
46 |         run_corpus(fail_corpus)
47 | 
48 | 
49 | def test_make_partial_rule_dataset() -> None:
50 |     ts = datetime(year=2019, month=10, day=1)
51 |     entries = [
52 |         TimeParseEntry(
53 |             "today at 5 pm", ts, Time(year=2019, month=10, day=1, hour=17, minute=0)
54 |         )
55 |     ]
56 | 
57 |     X, y = zip(
58 |         *make_partial_rule_dataset(
59 |             entries, timeout=0, max_stack_depth=0, scorer=DummyScorer()
60 |         )
61 |     )
62 |     assert isinstance(y[0], bool)
63 |     assert isinstance(X[0][0], str)
64 | 
65 | 
66 | def test_parse_nb_string() -> None:
67 |     t = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod")
68 | 
69 |     assert t == parse_nb_string("Time[]{0001-01-01 01:01 (1/pod)}")
70 |     assert Interval(Time(), Time()) == parse_nb_string(
71 |         "Interval[]{X-X-X X:X (X/X) - X-X-X X:X (X/X)}"
72 |     )
73 | 
74 | 
75 | def test_load_timeparse_corpus(tmp_path) -> None:
76 |     path = tmp_path / "test.json"
77 |     path.write_text(CORPUS_JSON, encoding="utf-8")
78 | 
79 |     result = load_timeparse_corpus(str(path))
80 | 
81 |     assert len(result) == 2
82 | 


--------------------------------------------------------------------------------
/tests/test_count_vectorizer.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from ctparse.count_vectorizer import CountVectorizer
 3 | 
 4 | 
 5 | @pytest.mark.parametrize(
 6 |     "ngrams,doc,result",
 7 |     [
 8 |         ((1, 1), ["a", "b", "c"], ["a", "b", "c"]),
 9 |         ((1, 2), ["a", "b", "c"], ["a", "b", "c", "a b", "b c"]),
10 |         ((2, 2), ["a", "b", "c"], ["a b", "b c"]),
11 |         ((1, 3), ["a", "b"], ["a", "b", "a b"]),
12 |         ((2, 3), ["a", "b"], ["a b"]),
13 |     ],
14 | )
15 | def test_ngrams(ngrams, doc, result):
16 |     assert CountVectorizer._create_ngrams(ngrams, [doc]) == [result]
17 | 
18 | 
19 | def test_count_vectorizer_fit_and_transform():
20 |     cv = CountVectorizer((1, 2))
21 |     cv = cv.fit([["a", "b", "c"], ["c", "d"]])
22 |     assert cv.vocabulary
23 |     assert cv.transform([["b"]]) == [{cv.vocabulary["b"]: 1, 6: 0}]
24 | 
25 | 
26 | def test_count_vectorizer_fit_transform():
27 |     cv = CountVectorizer((1, 2))
28 |     X = cv.fit_transform([["a", "b"], ["b", "c"]])
29 |     assert cv.vocabulary
30 |     assert X == [
31 |         {
32 |             cv.vocabulary["a"]: 1,
33 |             cv.vocabulary["b"]: 1,
34 |             cv.vocabulary["a b"]: 1,
35 |             len(cv.vocabulary) - 1: 0,
36 |         },
37 |         {cv.vocabulary["b"]: 1, cv.vocabulary["c"]: 1, cv.vocabulary["b c"]: 1},
38 |     ]
39 | 
40 | 
41 | def test_count_vectorizer_transform_no_fit():
42 |     cv = CountVectorizer((1, 2))
43 |     with pytest.raises(ValueError):
44 |         cv.transform([["a"]])
45 | 


--------------------------------------------------------------------------------
/tests/test_ctparse.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from ctparse.ctparse import ctparse, ctparse_gen, _match_rule
 3 | from ctparse.types import Interval, Time, Artifact
 4 | 
 5 | 
 6 | def test_ctparse():
 7 |     txt = "12.12.2020"
 8 |     res = ctparse(txt)
 9 |     assert res
10 |     assert res.resolution == Time(year=2020, month=12, day=12)
11 |     assert str(res)
12 |     assert repr(res)
13 | 
14 |     # non sense gives no result
15 |     assert ctparse("gargelbabel") is None
16 |     txt = "12.12."
17 |     res = ctparse(txt, ts=datetime(2020, 12, 1))
18 |     assert res
19 |     assert res.resolution == Time(year=2020, month=12, day=12)
20 | 
21 |     gres = ctparse_gen(txt, ts=datetime(2020, 12, 1))
22 |     first_res = next(gres)
23 |     assert first_res
24 |     assert first_res.resolution == Time(year=2020, month=12, day=12)
25 | 
26 | 
27 | def test_ctparse_timeout():
28 |     # timeout in ctparse: should rather mock the logger and see
29 |     # whether the timeout was hit, but cannot get it mocked
30 |     txt = "tomorrow 8 yesterday Sep 9 9 12 2023 1923"
31 |     ctparse(txt, timeout=0.0001)
32 | 
33 | 
34 | def test_match_rule():
35 |     def rule(a: Artifact) -> bool:
36 |         return True
37 | 
38 |     assert list(_match_rule([], [rule])) == []
39 |     assert list(_match_rule([Artifact()], [])) == []
40 | 
41 | 
42 | def test_latent_time():
43 |     parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=False)
44 |     assert parse
45 |     assert parse.resolution == Time(None, None, None, 20, 00)
46 | 
47 |     parse = ctparse("8:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=True)
48 |     assert parse
49 |     assert parse.resolution == Time(2020, 1, 1, 20, 00)
50 | 
51 | 
52 | def test_latent_time_interval():
53 |     parse = ctparse(
54 |         "8:00 pm - 9:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=False
55 |     )
56 |     assert parse
57 |     assert parse.resolution == Interval(
58 |         Time(None, None, None, 20, 00), Time(None, None, None, 21, 00)
59 |     )
60 | 
61 |     parse = ctparse(
62 |         "8:00 pm - 9:00 pm", ts=datetime(2020, 1, 1, 7, 0), latent_time=True
63 |     )
64 |     assert parse
65 |     assert parse.resolution == Interval(
66 |         Time(2020, 1, 1, 20, 00), Time(2020, 1, 1, 21, 00)
67 |     )
68 | 


--------------------------------------------------------------------------------
/tests/test_partialparse.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | from typing import Any, Callable
  3 | 
  4 | import pytest
  5 | import regex
  6 | 
  7 | from ctparse.partial_parse import PartialParse, _seq_match
  8 | from ctparse.types import RegexMatch, Time
  9 | 
 10 | 
 11 | def test_partial_parse() -> None:
 12 |     match_a = regex.match("(?<R1>a)", "ab")
 13 |     match_b = next(regex.finditer("(?<R2>b)", "ab"))
 14 | 
 15 |     pp = PartialParse.from_regex_matches(
 16 |         (RegexMatch(1, match_a), RegexMatch(2, match_b))
 17 |     )
 18 | 
 19 |     assert len(pp.prod) == 2
 20 |     assert len(pp.rules) == 2
 21 | 
 22 |     assert isinstance(pp.score, float)
 23 | 
 24 |     def mock_rule(ts: datetime.datetime, a: Time) -> Time:
 25 |         return Time()
 26 | 
 27 |     pp2 = pp.apply_rule(
 28 |         datetime.datetime(day=1, month=1, year=2015), mock_rule, "mock_rule", (0, 1)
 29 |     )
 30 | 
 31 |     assert pp != pp2
 32 | 
 33 |     with pytest.raises(ValueError):
 34 |         PartialParse((), ())
 35 | 
 36 | 
 37 | def test_seq_match() -> None:
 38 |     # NOTE: we are testing a private function because the algorithm
 39 |     # is quite complex
 40 | 
 41 |     def make_rm(i: int) -> Callable[[Any], bool]:
 42 |         def _regex_match(s: Any) -> bool:
 43 |             return bool(s == i)
 44 | 
 45 |         return _regex_match
 46 | 
 47 |     # empty sequence, empty pattern: matches on a single empty sequence
 48 |     assert list(_seq_match([], [])) == [[]]
 49 |     # non empty sequence, empty pattern matches on an empty sequence
 50 |     assert list(_seq_match(["a", "b"], [])) == [[]]
 51 |     # non empty sequence, non empty pattern that does not apper: no match
 52 |     assert list(_seq_match(["a", "b"], [make_rm(1)])) == []
 53 |     # empty sequence, non empty pattern: no match
 54 |     assert list(_seq_match([], [make_rm(1)])) == []
 55 |     # sequence shorter than pattern: no match
 56 |     assert list(_seq_match(["a"], [make_rm(1), make_rm(2)])) == []
 57 |     # seq = pat
 58 |     assert list(_seq_match([1], [make_rm(1)])) == [[0]]
 59 |     assert list(_seq_match([1, 2, 3], [make_rm(1)])) == [[0]]
 60 |     assert list(_seq_match([1, 2, 3], [make_rm(2)])) == [[1]]
 61 |     assert list(_seq_match([1, 2, 3], [make_rm(3)])) == [[2]]
 62 |     assert list(_seq_match([1, 2, "a"], [make_rm(1), make_rm(2)])) == [[0, 1]]
 63 |     assert list(_seq_match([1, "a", 3], [make_rm(1), _identity, make_rm(3)])) == [
 64 |         [0, 2]
 65 |     ]
 66 |     assert list(_seq_match(["a", 2, 3], [make_rm(2), make_rm(3)])) == [[1, 2]]
 67 |     # starts with non regex
 68 |     assert list(_seq_match([1, 2], [_identity, make_rm(1), make_rm(2)])) == []
 69 |     assert list(_seq_match(["a", 1, 2], [_identity, make_rm(1), make_rm(2)])) == [
 70 |         [1, 2]
 71 |     ]
 72 |     # ends with non regex
 73 |     assert list(_seq_match([1, 2], [make_rm(1), make_rm(2), _identity])) == []
 74 |     assert list(_seq_match([1, 2, "a"], [make_rm(1), make_rm(2), _identity])) == [
 75 |         [0, 1]
 76 |     ]
 77 |     # repeated pattern
 78 |     assert list(_seq_match([1, 2, 1, 2, 2], [make_rm(1), make_rm(2)])) == [
 79 |         [0, 1],
 80 |         [0, 3],
 81 |         [0, 4],
 82 |         [2, 3],
 83 |         [2, 4],
 84 |     ]
 85 |     assert list(_seq_match([1, 2, 1, 2, 2], [make_rm(1), _identity, make_rm(2)])) == [
 86 |         [0, 3],
 87 |         [0, 4],
 88 |         [2, 4],
 89 |     ]
 90 |     assert list(_seq_match([1, 2, 1, 2, 2], [_identity, make_rm(1), make_rm(2)])) == [
 91 |         [2, 3],
 92 |         [2, 4],
 93 |     ]
 94 |     assert list(_seq_match([1, 2, 1, 2, 2], [make_rm(1), make_rm(2), _identity])) == [
 95 |         [0, 1],
 96 |         [0, 3],
 97 |         [2, 3],
 98 |     ]
 99 |     assert (
100 |         list(
101 |             _seq_match(
102 |                 [1, 2, 1, 2, 2],
103 |                 [_identity, make_rm(1), _identity, make_rm(2), _identity],
104 |             )
105 |         )
106 |         == []
107 |     )
108 |     assert list(
109 |         _seq_match(
110 |             [1, 2, 1, 2, 2, 3],
111 |             [_identity, make_rm(1), _identity, make_rm(2), _identity],
112 |         )
113 |     ) == [[2, 4]]
114 | 
115 | 
116 | def _identity(x: Any) -> bool:
117 |     return True
118 | 


--------------------------------------------------------------------------------
/tests/test_regressions.py:
--------------------------------------------------------------------------------
 1 | """This file contains regression tests for commonly parsed time expressions"""
 2 | import ctparse
 3 | from datetime import datetime
 4 | 
 5 | 
 6 | def test_military_time():
 7 |     result = ctparse.ctparse("3 March 2020", ts=datetime(2020, 2, 25))
 8 |     assert result
 9 |     assert str(result.resolution) == "2020-03-03 X:X (X/X)"
10 | 
11 | 
12 | def test_parse_years_ahead():
13 |     result = ctparse.ctparse("3 March 2023", ts=datetime(2020, 2, 25))
14 |     assert result
15 |     assert str(result.resolution) == "2023-03-03 X:X (X/X)"
16 | 


--------------------------------------------------------------------------------
/tests/test_rule.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | import regex
 3 | from ctparse.types import RegexMatch, Artifact
 4 | from ctparse.rule import dimension, predicate, regex_match, rule
 5 | 
 6 | 
 7 | class TestClassA(Artifact):
 8 |     predA = 1
 9 | 
10 | 
11 | class TestClassB(Artifact):
12 |     pass
13 | 
14 | 
15 | class TestRule(TestCase):
16 |     def test_empty_regex_match_not_allowed(self):
17 |         with self.assertRaises(ValueError):
18 |             rule(r"")
19 |         with self.assertRaises(ValueError):
20 |             rule(r"[a-z]*")
21 |         self.assertIsNotNone(
22 |             rule(
23 |                 r"This long string must not match as this expression "
24 |                 "will be part of the system unless ctparse is reloaded"
25 |             )
26 |         )
27 | 
28 |     def test_consecutive_regex_not_allowed(self):
29 |         with self.assertRaises(ValueError):
30 |             rule(r"one", r"two")
31 | 
32 |     def test_regex_match(self):
33 |         m = next(regex.finditer("(?P<R1>x)", "x"))
34 |         r = RegexMatch(1, m)
35 |         self.assertTrue(regex_match(1)(r))
36 |         self.assertFalse(regex_match(1)(TestClassA()))
37 | 
38 |     def test_dimension(self):
39 |         self.assertTrue(dimension(TestClassA)(TestClassA()))
40 |         self.assertFalse(dimension(TestClassA)(TestClassB()))
41 | 
42 |     def test_predicate(self):
43 |         self.assertTrue(predicate("predA")(TestClassA()))
44 |         self.assertFalse(predicate("predA")(TestClassB()))
45 | 


--------------------------------------------------------------------------------
/tests/test_scorer.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import random
 3 | import bz2
 4 | import pickle
 5 | 
 6 | from ctparse.nb_scorer import NaiveBayesScorer, train_naive_bayes, save_naive_bayes
 7 | from ctparse.partial_parse import PartialParse
 8 | from ctparse.scorer import DummyScorer, RandomScorer
 9 | from ctparse.count_vectorizer import CountVectorizer
10 | from ctparse.nb_estimator import MultinomialNaiveBayes
11 | from ctparse.pipeline import CTParsePipeline
12 | from ctparse.types import Interval, Time
13 | 
14 | 
15 | def test_dummy():
16 |     scorer = DummyScorer()
17 |     pp = PartialParse((Time(), Interval()), ("rule1", "rule2"))
18 | 
19 |     assert scorer.score("a", datetime.datetime(2019, 1, 1), pp) == 0.0
20 |     assert scorer.score_final("a", datetime.datetime(2019, 1, 1), pp, pp.prod[0]) == 0.0
21 | 
22 | 
23 | def test_random():
24 |     rng = random.Random(42)
25 |     scorer = RandomScorer(rng)
26 | 
27 |     pp = PartialParse((Time(), Interval()), ("rule1", "rule2"))
28 | 
29 |     assert 0.0 <= scorer.score("a", datetime.datetime(2019, 1, 1), pp) <= 1.0
30 |     assert (
31 |         0.0
32 |         <= scorer.score_final("a", datetime.datetime(2019, 1, 1), pp, pp.prod[1])
33 |         <= 1.0
34 |     )
35 | 
36 | 
37 | def test_nbscorer():
38 |     # We only test that it runs just fine
39 |     X = [("a", "b"), ("a",), ("b"), ("a", "b", "a", "b")]
40 |     y = [False, True, True, False]
41 | 
42 |     model = train_naive_bayes(X, y)
43 |     scorer = NaiveBayesScorer(model)
44 | 
45 |     pp = PartialParse((Time(), Interval()), ("rule1", "rule2"))
46 | 
47 |     pp.prod[0].mstart = 0
48 |     pp.prod[1].mend = 1
49 | 
50 |     pp.prod[0].mend = 1
51 |     pp.prod[1].mend = 2
52 | 
53 |     assert 0.0 <= scorer.score("ab", datetime.datetime(2019, 1, 1), pp) <= 1.0
54 |     assert (
55 |         0.0
56 |         <= scorer.score_final("ab", datetime.datetime(2019, 1, 1), pp, pp.prod[1])
57 |         <= 1.0
58 |     )
59 | 
60 | 
61 | def test_naive_bayes_from_file(tmp_path):
62 |     nb = NaiveBayesScorer(
63 |         CTParsePipeline(CountVectorizer((1, 1)), MultinomialNaiveBayes())
64 |     )
65 |     path = tmp_path / "model.pkl"
66 |     with bz2.open(path, "w") as f:
67 |         pickle.dump(nb, f)
68 |     nb = NaiveBayesScorer.from_model_file(path)
69 |     assert nb
70 | 
71 | 
72 | def test_save_naive_bayes(tmp_path):
73 |     path = tmp_path / "model.pkl"
74 |     model = CTParsePipeline(CountVectorizer((1, 1)), MultinomialNaiveBayes())
75 |     save_naive_bayes(model, path)
76 | 


--------------------------------------------------------------------------------
/tests/test_time_rules.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | 
 3 | from ctparse.types import Time
 4 | from ctparse.time.rules import (
 5 |     ruleDateDate,
 6 |     ruleDOMDate,
 7 |     ruleDateTimeDateTime,
 8 |     ruleDOYDate,
 9 |     ruleQuarterBeforeHH,
10 |     ruleQuarterAfterHH,
11 | )
12 | 
13 | 
14 | class TestRules(TestCase):
15 |     def test_ruleDateDate(self):
16 |         t1 = Time(year=2017)
17 |         t2 = Time(year=2015)
18 |         self.assertIsNone(ruleDateDate(None, t1, None, t2))
19 | 
20 |         t1 = Time(year=2017, month=12)
21 |         t2 = Time(year=2017, month=11)
22 |         self.assertIsNone(ruleDateDate(None, t1, None, t2))
23 | 
24 |         t1 = Time(year=2017, month=12, day=31)
25 |         t2 = Time(year=2017, month=12, day=30)
26 |         self.assertIsNone(ruleDateDate(None, t1, None, t2))
27 | 
28 |         t1 = Time(year=2017, month=12, day=31)
29 |         t2 = Time(year=2017, month=12, day=31)
30 |         self.assertIsNone(ruleDateDate(None, t1, None, t2))
31 | 
32 |         t1 = Time(year=2017, month=12, day=30)
33 |         t2 = Time(year=2017, month=12, day=31)
34 |         self.assertIsNotNone(ruleDateDate(None, t1, None, t2))
35 | 
36 |     def test_ruleDOMDate(self):
37 |         t1 = Time(day=30)
38 |         t2 = Time(year=2015, month=1, day=29)
39 |         self.assertIsNone(ruleDOMDate(None, t1, None, t2))
40 | 
41 |         t1 = Time(day=30)
42 |         t2 = Time(year=2015, month=1, day=30)
43 |         self.assertIsNone(ruleDOMDate(None, t1, None, t2))
44 | 
45 |         t1 = Time(day=29)
46 |         t2 = Time(year=2015, month=1, day=30)
47 |         self.assertIsNotNone(ruleDOMDate(None, t1, None, t2))
48 | 
49 |     def test_ruleDateTimeDateTime(self):
50 |         t1 = Time(year=2017, month=4, day=12, hour=12, minute=30)
51 |         t2 = Time(year=2016, month=4, day=12, hour=12, minute=30)
52 |         self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2))
53 | 
54 |         t1 = Time(year=2017, month=4, day=12, hour=12, minute=30)
55 |         t2 = Time(year=2017, month=3, day=12, hour=12, minute=30)
56 |         self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2))
57 | 
58 |         t1 = Time(year=2017, month=4, day=12, hour=12, minute=30)
59 |         t2 = Time(year=2017, month=4, day=11, hour=12, minute=30)
60 |         self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2))
61 | 
62 |         t1 = Time(year=2017, month=4, day=12, hour=12, minute=30)
63 |         t2 = Time(year=2017, month=4, day=12, hour=11, minute=30)
64 |         self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2))
65 | 
66 |         t1 = Time(year=2017, month=4, day=12, hour=12, minute=30)
67 |         t2 = Time(year=2017, month=4, day=12, hour=12, minute=29)
68 |         self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2))
69 | 
70 |         t1 = Time(year=2017, month=4, day=12, hour=12, minute=30)
71 |         t2 = Time(year=2017, month=4, day=12, hour=12, minute=30)
72 |         self.assertIsNone(ruleDateTimeDateTime(None, t1, None, t2))
73 | 
74 |         t1 = Time(year=2017, month=4, day=12, hour=12, minute=30)
75 |         t2 = Time(year=2017, month=4, day=12, hour=12, minute=31)
76 |         self.assertIsNotNone(ruleDateTimeDateTime(None, t1, None, t2))
77 | 
78 |     def test_ruleDOYDate(self):
79 |         t1 = Time(month=4, day=12)
80 |         t2 = Time(year=2017, month=4, day=12)
81 |         self.assertIsNone(ruleDOYDate(None, t1, None, t2))
82 | 
83 |         t1 = Time(month=4, day=12)
84 |         t2 = Time(year=2017, month=4, day=13)
85 |         self.assertIsNotNone(ruleDOYDate(None, t1, None, t2))
86 | 
87 |     def test_ruleQuarterBeforeHH(self):
88 |         t1 = Time(hour=12, minute=1)
89 |         self.assertIsNone(ruleQuarterBeforeHH(None, None, t1))
90 | 
91 |     def test_ruleQuarterAferHH(self):
92 |         t1 = Time(hour=12, minute=1)
93 |         self.assertIsNone(ruleQuarterAfterHH(None, None, t1))
94 | 


--------------------------------------------------------------------------------
/tests/test_timers.py:
--------------------------------------------------------------------------------
 1 | from ctparse.timers import timeout, CTParseTimeoutError, timeit
 2 | from unittest import TestCase
 3 | import time
 4 | 
 5 | 
 6 | class TimersTest(TestCase):
 7 |     def test_timeout(self):
 8 |         t_fun = timeout(0.5)
 9 |         with self.assertRaises(CTParseTimeoutError):
10 |             time.sleep(1.0)
11 |             t_fun()
12 |         t_fun = timeout(0)
13 |         t_fun()  # all good
14 | 
15 |     def test_timeit(self):
16 |         def fun(x):
17 |             return x * x
18 | 
19 |         result, elapsed = timeit(fun)(3)
20 |         self.assertEqual(result, 9)
21 |         self.assertIsInstance(elapsed, float)
22 | 


--------------------------------------------------------------------------------
/tests/test_types.py:
--------------------------------------------------------------------------------
  1 | from unittest import TestCase
  2 | import regex
  3 | from datetime import datetime
  4 | from ctparse.types import Artifact, RegexMatch, Time, Interval
  5 | 
  6 | 
  7 | class TestArtifact(TestCase):
  8 |     def test_init(self):
  9 |         a = Artifact()
 10 |         self.assertEqual(a.mstart, 0)
 11 |         self.assertEqual(a.mend, 0)
 12 |         self.assertEqual(len(a), 0)
 13 |         self.assertTrue(a)
 14 | 
 15 |     def test_eq(self):
 16 |         a = Artifact()
 17 |         b = Artifact()
 18 |         self.assertEqual(a, b)
 19 | 
 20 |         a = Time(2017, 12, 12, 12, 12, 4, "morning")
 21 |         b = Time(2017, 12, 12, 12, 12, 4, "morning")
 22 |         self.assertEqual(a, b)
 23 | 
 24 |         a = Time(2017, 12, 12, 12, 12, 4, "morning")
 25 |         b = Time(2017, 12, 12, 12, 12, 3, "morning")
 26 |         self.assertNotEqual(a, b)
 27 | 
 28 |         a = Time()
 29 |         b = Interval()
 30 |         self.assertNotEqual(a, b)
 31 | 
 32 |     def test_update_span(self):
 33 |         a1 = Artifact()
 34 |         a2 = Artifact()
 35 |         a3 = Artifact()
 36 |         a2.mstart = 10
 37 |         a3.mend = 100
 38 |         a1.update_span(a2, a3)
 39 |         self.assertEqual(a1.mstart, 10)
 40 |         self.assertEqual(a1.mend, 100)
 41 |         self.assertEqual(len(a1), 90)
 42 | 
 43 |     def test_repr(self):
 44 |         a = Artifact()
 45 |         self.assertEqual(repr(a), "Artifact[0-0]{}")
 46 | 
 47 |     def test_nb_str(self):
 48 |         a = Artifact()
 49 |         self.assertEqual(a.nb_str(), "Artifact[]{}")
 50 | 
 51 | 
 52 | class TestRegexMatch(TestCase):
 53 |     def test_init(self):
 54 |         m = next(regex.finditer(r"(?P<R1>match me)", "xxx match me xxx"))
 55 |         r = RegexMatch(1, m)
 56 |         self.assertEqual(r.mstart, 4)
 57 |         self.assertEqual(r.mend, 12)
 58 |         self.assertEqual(len(r), 8)
 59 |         self.assertEqual(r._text, "match me")
 60 |         self.assertEqual(repr(r), "RegexMatch[4-12]{1:match me}")
 61 |         self.assertEqual(r.nb_str(), "RegexMatch[]{1:match me}")
 62 | 
 63 | 
 64 | class TestTime(TestCase):
 65 |     def test_init(self):
 66 |         self.assertIsNotNone(Time())
 67 | 
 68 |     def test_isDOY(self):
 69 |         self.assertTrue(Time(month=1, day=1).isDOY)
 70 |         self.assertFalse(Time(year=1).isDOY)
 71 | 
 72 |     def test_isDOM(self):
 73 |         self.assertTrue(Time(day=1).isDOM)
 74 |         self.assertFalse(Time(month=1).isDOM)
 75 | 
 76 |     def test_isHour(self):
 77 |         self.assertTrue(Time(hour=1).isHour)
 78 |         self.assertFalse(Time(hour=1, minute=1).isHour)
 79 |         self.assertFalse(Time(hour=1, month=1).isHour)
 80 | 
 81 |     def test_isDOW(self):
 82 |         self.assertTrue(Time(DOW=1).isDOW)
 83 |         self.assertFalse(Time().isDOW)
 84 | 
 85 |     def test_isMonth(self):
 86 |         self.assertTrue(Time(month=1).isMonth)
 87 |         self.assertFalse(Time(day=1).isMonth)
 88 |         self.assertFalse(Time(year=1).isMonth)
 89 | 
 90 |     def test_isPOD(self):
 91 |         self.assertTrue(Time(POD="morning").isPOD)
 92 |         self.assertFalse(Time(day=1).isPOD)
 93 |         self.assertFalse(Time(year=1).isPOD)
 94 | 
 95 |     def test_isTOD(self):
 96 |         self.assertTrue(Time(hour=1, minute=1).isTOD)
 97 |         self.assertTrue(Time(hour=1).isTOD)
 98 |         self.assertFalse(Time(minute=1).isTOD)
 99 |         self.assertFalse(Time().isTOD)
100 | 
101 |     def test_isDate(self):
102 |         self.assertTrue(Time(year=1, month=1, day=1).isDate)
103 |         self.assertFalse(Time(year=1, month=1).isDate)
104 |         self.assertFalse(Time(year=1, day=1).isDate)
105 |         self.assertFalse(Time(day=1, month=1).isDate)
106 |         self.assertFalse(Time(year=1, month=1, day=1, hour=1).isDate)
107 | 
108 |     def test_isDateTime(self):
109 |         self.assertTrue(Time(year=1, month=1, day=1, hour=1).isDateTime)
110 |         self.assertFalse(Time(year=1, month=1, day=1).isDateTime)
111 | 
112 |     def test_isYear(self):
113 |         self.assertTrue(Time(year=1).isYear)
114 |         self.assertFalse(Time(year=1, month=1).isYear)
115 | 
116 |     def test_hasDate(self):
117 |         self.assertTrue(Time(year=1, month=1, day=1).hasDate)
118 |         self.assertFalse(Time(year=1, month=1).isDate)
119 |         self.assertFalse(Time(year=1, day=1).isDate)
120 |         self.assertFalse(Time(day=1, month=1).isDate)
121 |         self.assertTrue(Time(year=1, month=1, day=1, hour=1).hasDate)
122 | 
123 |     def test_hasTime(self):
124 |         self.assertTrue(Time(hour=1, minute=1, day=1, month=1, year=1).hasTime)
125 |         self.assertTrue(Time(hour=1, day=1, month=1, year=1).hasTime)
126 |         self.assertFalse(Time(day=1, month=1, year=1).hasTime)
127 | 
128 |     def test_hasPOD(self):
129 |         self.assertTrue(Time(POD="pod").hasPOD)
130 |         self.assertFalse(Time(day=1, month=1, year=1).hasPOD)
131 | 
132 |     def test_repr(self):
133 |         t = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod")
134 |         self.assertEqual(repr(t), "Time[0-0]{0001-01-01 01:01 (1/pod)}")
135 | 
136 |     def test_from_str(self):
137 |         # Complete time
138 |         t = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod")
139 |         t_str = str(t)
140 |         t_back = Time.from_str(t_str)
141 |         self.assertEqual(t, t_back)
142 | 
143 |         # Incomplete time
144 |         t = Time(year=None, month=1, day=1, hour=None, minute=None, DOW=None, POD="pod")
145 |         t_str = str(t)
146 |         t_back = Time.from_str(t_str)
147 |         self.assertEqual(t, t_back)
148 | 
149 |         # Zeroed time
150 |         t = Time()
151 |         t_str = str(t)
152 |         t_back = Time.from_str(t_str)
153 |         self.assertEqual(t, t_back)
154 | 
155 |         # Mistake
156 |         with self.assertRaises(ValueError):
157 |             Time.from_str("0001-01-01 01-01 (1/pod)")
158 | 
159 |     def test_start(self):
160 |         t = Time()
161 |         self.assertEqual(t.start, Time(hour=0, minute=0))
162 |         t = Time(year=2012, month=1, day=1)
163 |         self.assertEqual(t.start, Time(2012, 1, 1, 0, 0))
164 |         t = Time(year=2012, month=1, day=1, hour=12)
165 |         self.assertEqual(t.start, Time(2012, 1, 1, 12, 0))
166 |         t = Time(year=2012, month=1, day=1, hour=12, minute=20)
167 |         self.assertEqual(t.start, Time(2012, 1, 1, 12, 20))
168 |         t = Time(year=2012, month=1, day=1, POD="last")
169 |         self.assertEqual(t.start, Time(2012, 1, 1, 23, 00))
170 | 
171 |     def test_end(self):
172 |         t = Time()
173 |         self.assertEqual(t.end, Time(hour=23, minute=59))
174 |         t = Time(year=2012, month=1, day=1)
175 |         self.assertEqual(t.end, Time(2012, 1, 1, 23, 59))
176 |         t = Time(year=2012, month=1, day=1, hour=12)
177 |         self.assertEqual(t.end, Time(2012, 1, 1, 12, 59))
178 |         t = Time(year=2012, month=1, day=1, hour=12, minute=20)
179 |         self.assertEqual(t.end, Time(2012, 1, 1, 12, 20))
180 |         t = Time(year=2012, month=1, day=1, POD="last")
181 |         self.assertEqual(t.end, Time(2012, 1, 1, 23, 59))
182 | 
183 |     def test_dt(self):
184 |         t = Time(2015, 12, 12, 12, 12)
185 |         self.assertEqual(t.dt, datetime(2015, 12, 12, 12, 12))
186 |         t = Time(2015, 12, 12, 12)
187 |         self.assertEqual(t.dt, datetime(2015, 12, 12, 12))
188 |         t = Time(2015, 12, 12)
189 |         self.assertEqual(t.dt, datetime(2015, 12, 12))
190 | 
191 |         with self.assertRaises(ValueError):
192 |             t = Time(year=2012, month=12, hour=12, minute=12)
193 |             t.dt
194 | 
195 | 
196 | class TestInterval(TestCase):
197 |     def test_init(self):
198 |         self.assertIsNotNone(Interval())
199 | 
200 |     def test_isTimeInterval(self):
201 |         self.assertTrue(Interval(Time(hour=1), Time(hour=2)).isTimeInterval)
202 | 
203 |     def test_repr(self):
204 |         self.assertEqual(
205 |             repr(Interval(Time(), Time())),
206 |             "Interval[0-0]{X-X-X X:X (X/X) - X-X-X X:X (X/X)}",
207 |         )
208 | 
209 |     def test_from_str(self):
210 |         # Complete interval
211 |         t1 = Time(year=1, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod")
212 |         t2 = Time(year=2, month=1, day=1, hour=1, minute=1, DOW=1, POD="pod")
213 |         interval = Interval(t1, t2)
214 |         i_back = Interval.from_str(str(interval))
215 |         self.assertEqual(interval, i_back)
216 | 
217 |         # Incomplete interval
218 |         interval = Interval(None, t2)
219 |         i_back = Interval.from_str(str(interval))
220 |         self.assertEqual(interval, i_back)
221 | 
222 |         # Zeroed interval
223 |         interval = Interval()
224 |         i_back = Interval.from_str(str(interval))
225 |         self.assertEqual(interval, i_back)
226 | 
227 |         # Mistake
228 |         with self.assertRaises(ValueError):
229 |             Interval.from_str("X-X-X X: X(X/X) -X-X-X X: X(X/X)")
230 | 
231 |     def test_start(self):
232 |         i = Interval(Time(2013, 1, 1), Time(2013, 1, 2))
233 |         self.assertEqual(i.start, Time(2013, 1, 1, 0, 0))
234 | 
235 |         i = Interval(Time(2013, 1, 1), None)
236 |         self.assertEqual(i.start, Time(2013, 1, 1, 0, 0))
237 | 
238 |         i = Interval(None, Time(2013, 1, 2))
239 |         self.assertIsNone(i.start)
240 | 
241 |     def test_end(self):
242 |         i = Interval(Time(2013, 1, 1), Time(2013, 1, 2))
243 |         self.assertEqual(i.end, Time(2013, 1, 2, 23, 59))
244 | 
245 |         i = Interval(None, Time(2013, 1, 2))
246 |         self.assertEqual(i.end, Time(2013, 1, 2, 23, 59))
247 | 
248 |         i = Interval(Time(2013, 1, 1), None)
249 |         self.assertIsNone(i.end)
250 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py36, py37, py38
 3 | 
 4 | [travis]
 5 | python =
 6 |     3.8: py38
 7 |     3.7: py37
 8 |     3.6: py36
 9 | 
10 | [testenv]
11 | whitelist_externals= make
12 | setenv =
13 |     PYTHONPATH = {toxinidir}
14 | deps = -r{toxinidir}/requirements_dev.txt
15 | commands =
16 |     pip install -U pip
17 |     make lint
18 |     py.test --cov=ctparse --basetemp={envtmpdir}


--------------------------------------------------------------------------------