├── .coveragerc ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ └── python-package.yml ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── artifacts ├── Poster of PySBD_ Pragmatic Sentence Boundary Disambiguation.pdf ├── PySBD_ Pragmatic Sentence Boundary Disambiguation.pdf ├── code_pysbd_spacy_component.png ├── pysbd_code.png ├── pysbd_code2.png ├── pysbd_code3.png ├── pysbd_code_example.png ├── pysbd_logo.png ├── pysbd_poster.png └── pysbd_talk.png ├── benchmarks ├── __init__.py ├── benchmark_sbd_tools.py ├── bigtext_speed_benchmark.py ├── english_golden_rules.py └── genia_benchmark.py ├── examples ├── pysbd_as_spacy_component.py └── test_timing_script.py ├── pysbd ├── __init__.py ├── abbreviation_replacer.py ├── about.py ├── between_punctuation.py ├── clean │ ├── __init__.py │ └── rules.py ├── cleaner.py ├── exclamation_words.py ├── lang │ ├── __init__.py │ ├── amharic.py │ ├── arabic.py │ ├── armenian.py │ ├── bulgarian.py │ ├── burmese.py │ ├── chinese.py │ ├── common │ │ ├── __init__.py │ │ ├── common.py │ │ └── standard.py │ ├── danish.py │ ├── deutsch.py │ ├── dutch.py │ ├── english.py │ ├── french.py │ ├── greek.py │ ├── hindi.py │ ├── italian.py │ ├── japanese.py │ ├── kazakh.py │ ├── marathi.py │ ├── persian.py │ ├── polish.py │ ├── russian.py │ ├── slovak.py │ ├── spanish.py │ └── urdu.py ├── languages.py ├── lists_item_replacer.py ├── processor.py ├── punctuation_replacer.py ├── segmenter.py └── utils.py ├── pytest.ini ├── requirements-benchmark.txt ├── requirements-dev.txt ├── setup.py └── tests ├── __init__.py ├── conftest.py ├── lang ├── __init__.py ├── test_amharic.py ├── test_arabic.py ├── test_armenian.py ├── test_bulgarian.py ├── test_burmese.py ├── test_chinese.py ├── test_danish.py ├── test_deutsch.py ├── test_dutch.py ├── test_english.py ├── test_english_clean.py ├── test_french.py ├── test_greek.py ├── test_hindi.py ├── test_italian.py ├── test_japanese.py ├── test_kazakh.py 
├── test_marathi.py ├── test_persian.py ├── test_polish.py ├── test_russian.py ├── test_slovak.py ├── test_spanish.py └── test_urdu.py ├── regression ├── __init__.py └── test_issues.py ├── test_cleaner.py ├── test_languages.py └── test_segmenter.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | pragma: no cover 4 | def __repr__ 5 | if self.debug: 6 | if settings.DEBUG 7 | raise AssertionError 8 | raise NotImplementedError 9 | if 0: 10 | if __name__ == .__main__.: 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report & testcase to help us improve 4 | title: 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | Input text - 16 | 17 | Example: 18 | Input text - "My name is Jonas E. Smith. Please turn to p. 55." 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | Expected output - list of expected sentences 23 | 24 | Example: 25 | `['My name is Jonas E. Smith.', 'Please turn to p. 55.']` 26 | 27 | **Additional context** 28 | Add any other context about the problem here. 29 | 30 |
<details>
31 | Paste Error Traceback here, if any
32 | </details>
33 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: [3.5, 3.6, 3.7, 3.8] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 30 | - name: Lint with flake8 31 | run: | 32 | pip install flake8 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pip install pytest 40 | pip install pytest-cov 41 | pytest --cov=pysbd tests/ --color yes --cov-report=xml --cov-report=html 42 | - name: Upload coverage to Codecov 43 | uses: codecov/codecov-action@v1 44 | with: 45 | token: ${{ secrets.CODECOV_TOKEN }} 46 | file: ./coverage.xml 47 | flags: unittests 48 | env_vars: OS,PYTHON 49 | name: codecov-umbrella 50 | fail_ci_if_error: true 51 | 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # vscode 105 | .vscode/ 106 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # v0.3.4 2 | - 🐛 Fix trailing period/ellipses with spaces - #83 3 | - 🐛 Regex escape for parenthesis - #87 4 | 5 | # v0.3.3 6 | - 🐛 Better handling consecutive periods and reserved special symbols - allenai/scholarphi#114 7 | - Add CONTRIBUTING.md 8 | 9 | # v0.3.2 10 | - 🐛 ✅ Enforce clean=True when doc_type="pdf" - \#75 11 | 12 | # v0.3.1 13 | - 🚑 ✅ Handle Newline character & update tests 14 | 15 | # v0.3.0 16 | - ✨ 💫 Support Multiple languages - \#2 17 | - 🏎⚡️💯 Benchmark across Segmentation Tools, Libraries and Algorithms 18 | - 🎨 ♻️ Update sentence char_span logic 19 | - ⚡️ Performance improvements - \#41 20 | - ♻️🐛 Refactor AbbreviationReplacer 21 | 22 | # v0.3.0rc 23 | - 
✨ 💫 Sentence `char_span` through spaCy & regex approach - \#63
24 | - ♻️ Refactoring to support multiple languages
25 | - ✨ 💫 Initial language support for - Hindi, Marathi, Chinese, Spanish
26 | - ✅ Updated tests - more coverage & regression tests for issues
27 | - 👷👷🏻‍♀️ GitHub actions for CI-CD
28 | - 💚☂️ Add code coverage - coverage.py, Add Codecov
29 | - 🐛 Fix incorrect text span & vanilla pysbd vs spacy output discrepancy - \#49, \#53, \#55, \#59
30 | - 🐛 Fix `NUMBERED_REFERENCE_REGEX` for zero or one time - \#58
31 | - 🔐 Fix security vulnerability bleach - \#62
32 |
33 |
34 | # v0.2.3
35 | - 🐛 Performance improvement in `abbreviation_replacer` - \#50
36 |
37 | # v0.2.2
38 | - 🐛 Fix unbalanced parenthesis - \#47
39 |
40 | # v0.2.1
41 | - ✨ pySBD as a spaCy component through entrypoints
42 |
43 | # v0.2.0
44 | - ✨ Add `char_span` parameter (optional) to get sentence & its (start, end) char offsets from original text
45 | - ✨ pySBD as a spaCy component example
46 | - 🐛 Fix double question mark swallow bug - \#39
47 |
48 | # v0.1.5
49 | - 🐛 Handle text with only punctuations - \#36
50 | - 🐛 Handle exclamation marks at EOL - \#37
51 |
52 | # v0.1.4
53 | - ✨ ✅ Handle intermittent punctuations - \#34
54 |
55 | # v0.1.3
56 | - 🐛 Fix `lists_item_replacer` - \#29
57 | - 🐛 Fix & ♻️ refactor `replace_multi_period_abbreviations` - \#30
58 | - 🐛 Fix `abbreviation_replacer` - \#31
59 | - ✅ Add regression tests for issues
60 |
61 | # v0.1.2
62 | - 🐛 BugFix - IndexError of `scanlists` function
63 |
64 | # v0.1.1
65 | - English language support only
66 | - Support for other languages - WIP
67 |
68 | # v0.1.0
69 | - Initial Release
70 |
-------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to pySBD
2 | Thanks for your interest in contributing to pySBD 🎉.
The project is maintained by [@nipunsadvilkar](https://github.com/nipunsadvilkar), and I'll do my best to help you get started. This page will give you a quick overview of how things are organised and most importantly, how to get involved. 3 | 4 | ## Table of contents 5 | 6 | 1. [Issues and bug reports](#issues-and-bug-reports)
7 | a. [Submitting issues](#submitting-issues)
8 | b. [Issue labels](#issue-labels) 9 | 2. [Contributing to the code base](#contributing-to-the-code-base)
10 | a. [Getting started](#getting-started)
11 | b. [Add a new rule to existing *Golden Rules Set* (GRS)](#add-a-new-rule-to-existing-golden-rules-set-grs)
12 | c. [Add new language support](#add-new-language-support)
13 | d. [Add tests](#add-tests)
14 | e. [Fix bugs](#fix-bugs)
15 |
16 | ## Issues and bug reports
17 | First, do a [quick search](https://github.com/nipunsadvilkar/pySBD/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aclosed+is%3Aopen) to see if the issue has already been reported or is still open. If so, it's often better to leave a comment on the existing issue rather than create a new one. Old issues also often include helpful tips and solutions to common problems.
18 |
19 | Please understand that the author won't be able to provide individual support via
20 | email. The author also believes that help is much more valuable if it's **shared publicly**,
21 | so that more people can benefit from it.
22 |
23 | ### Submitting issues
24 |
25 | When opening an issue, use an **appropriate and descriptive title** and include your
26 | **environment** (operating system, Python version, pySBD version). Choose the report type [from here](https://github.com/nipunsadvilkar/pySBD/issues/new/choose); if a suitable type is not available, open a [blank issue](https://github.com/nipunsadvilkar/pySBD/issues/new). The
27 | [issue template](https://github.com/nipunsadvilkar/pySBD/issues/new?assignees=&labels=&template=bug_report.md&title=%3CAppropriate+title%3E) helps you
28 | remember the most important details to include. If you've discovered a bug, you
29 | can also submit a [regression test](#fix-bugs) straight away. When you're
30 | opening an issue to report the bug, simply refer to your pull request in the
31 | issue body. A few more tips:
32 |
33 | - **Describing your issue:** Try to provide as many details as possible. What
34 | exactly goes wrong? _How_ is it failing? Is there an error?
35 | "XY doesn't work" usually isn't that helpful for tracking down problems. Always
36 | remember to include the code you ran and, if possible, extract only the relevant
37 | parts rather than dumping your entire script. Also, provide the expected output for the given input.
This will make it easier for contributors to
38 | reproduce the error.
39 |
40 | - **Getting info about your pySBD installation and environment:** You can use the command line to print these details and copy-paste the pysbd version along with your Python version into the GitHub issue:
41 | `pip freeze | grep pysbd`.
42 |
43 | - **Sharing long blocks of code/logs/tracebacks:** If you need to include long code,
44 | logs or tracebacks, you can wrap them in `
<details>` and `</details>`. This
45 | [collapses the content](https://developer.mozilla.org/en/docs/Web/HTML/Element/details)
46 | so it only becomes visible on click, making the issue easier to read and follow.
47 |
48 | ### Issue labels
49 |
50 | [See this page](https://github.com/nipunsadvilkar/pySBD/labels) for an overview of
51 | the system the author uses to tag issues and pull requests.
52 |
53 | ## Contributing to the code base
54 |
55 | Happy to see you contribute to the pySBD codebase. To help you get started and understand the internals of pySBD, a good place to start is the implementation section of the [pySBD research paper](https://arxiv.org/abs/2010.09657). Another great reference is the list of [merged pull requests](https://github.com/nipunsadvilkar/pySBD/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Amerged). Depending on the type of your contribution, refer to the assigned labels.
56 |
57 | ### Getting started
58 | To make changes to pySBD's code base, fork and then clone the GitHub repository to your local machine. You'll need a development environment with a Python 3 distribution (including pip) and git installed.
59 |
60 | ```bash
61 | python -m pip install -U pip
62 | git clone https://github.com/nipunsadvilkar/pySBD
63 | cd pySBD
64 | pip install -r requirements-dev.txt
65 | ```
66 | Since pySBD is lightweight, it requires only Python built-in modules (specifically the `re` module) to function. Development package requirements are provided in `requirements-dev.txt`. If you want to use pySBD as a spaCy component, install spaCy in your environment.
67 |
68 | ### Add a new rule to existing *Golden Rules Set* (GRS)
69 | The language-specific *Golden Rules Sets* are hand-constructed rules, designed to cover sentence boundaries across a variety of domains. The set is by no means complete and will evolve and expand over time.
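For a concrete sense of what a golden rule looks like, here are two entries taken from the English GRS in `benchmarks/english_golden_rules.py`, each pairing an input text with the exact list of sentences a correct segmenter should return:

```python
# Two entries from the English Golden Rules Set: (input, expected sentences)
GOLDEN_EN_RULES_SAMPLE = [
    # 1) Simple period to end sentence
    ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
    # 4) One letter upper case abbreviations
    ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]),
]
```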
If you would like to report an issue with an existing rule or propose a new rule, please [open an issue](#submitting-issues). If you want to contribute the change yourself, please go ahead and send a pull request, referring to the [add tests](#add-tests) section.
70 |
71 | ### Add new language support
72 | Great to see you adding new language support to pySBD ✨.
73 | Adding new language support involves the following steps:
74 |
75 | ^^ Please use the commits of already supported languages - [Marathi](https://github.com/nipunsadvilkar/pySBD/commit/ab39442ece525285e5e83a80e2d2672bba467db7), [Spanish](https://github.com/nipunsadvilkar/pySBD/commit/ed6fb8672e30521e6e5d55bc86b779b2b4cf47dd), [Chinese](https://github.com/nipunsadvilkar/pySBD/commit/092764f896911bb97259720998b636f18980bb62) - as a frame of reference as you go through each step below.
76 |
77 | 1. **New Language Specific *Golden Rules Set***
78 | Create a *Golden Rules Set* representing basic to complex sentence boundary variations as a test set. Assuming you know the language, its sentence syntax, and other intricacies, create a new file at `tests/lang/test_<language>.py` and list each input text with its expected output, the same way support was added for the existing^^ languages. Refer to the [add tests](#add-tests) section for details on adding tests, running them, and adding a language fixture. Next, run the tests using `pytest` and let them fail deliberately.
79 |
80 | 2. **Add your language module**
81 | Create a new file at `pysbd/lang/<language>.py` and define a new class `LanguageName` inheriting from the two base classes `Common` and `Standard`, which contain the basic rules shared across the majority of languages. Run the tests to see whether your GRS passes. If it fails, override the `SENTENCE_BOUNDARY_REGEX` and `Punctuations` class variables and the `AbbreviationReplacer` class to support your language-specific punctuation and sentence boundaries.
82 |
83 | > Illustration: As you can see in the [`Marathi`](https://github.com/nipunsadvilkar/pySBD/blob/master/pysbd/lang/marathi.py) language module, `AbbreviationReplacer` & its `SENTENCE_STARTERS` are kept blank to override `Standard`'s [`SENTENCE_STARTERS`](https://github.com/nipunsadvilkar/pySBD/blob/master/pysbd/lang/common/standard.py#L111). Next, `Punctuations` is limited to `['.', '!', '?']`, and `SENTENCE_BOUNDARY_REGEX` is constructed accordingly to make sure it passes the [Marathi GRS](https://github.com/nipunsadvilkar/pySBD/blob/master/tests/lang/test_marathi.py). As with the class variables, if you find any rule that does not pertain to your language, you can override it in your language class.
84 |
85 | 3. **Add language code**
86 | Your language module & language GRS should be in place by now. The next step is to make your language available through pySBD's [`languages`](https://github.com/nipunsadvilkar/pySBD/blob/master/pysbd/languages.py) module: import your language module there and add a new entry to the `LANGUAGE_CODES` dictionary, with your language's [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) code as the key and the imported language class as the value.
87 |
88 | ### Add tests
89 | The author emphasizes a Test-Driven Development [(TDD)](https://testdriven.io/test-driven-development/) approach to ensure the robustness of the pySBD module. You will follow a "Red-Green-Refactor" cycle.
90 |
91 | 1. Make sure you have a proper development environment [setup](#getting-started)
92 | 2. Depending on the type of your contribution, your test script will be either [feature-specific](#add-new-language-support) or [bugfix-specific](#fix-bugs).
93 | 3. (Red) Once you have added those tests, run `pytest` and make sure they fail deliberately.
94 | 4. (Green) Write just enough code in the respective Python script to pass the specific test that failed earlier.
95 | 5. Once it passes, run all the tests to verify that your added code doesn't break existing code.
96 | 6. (Refactor) Do the necessary refactoring & cleaning to keep the tests green.
97 | 7. Repeat 🔁
98 |
99 | ### Fix bug(s)
100 |
101 | When fixing a bug, first create an
102 | [issue](https://github.com/nipunsadvilkar/pySBD/issues) if one does not already exist.
103 | The description text can be very short - it doesn't need to be verbose.
104 |
105 | Next, depending on the type of issue, add your test to `TEST_ISSUE_DATA` / `TEST_ISSUE_DATA_CHAR_SPANS` as a tuple `("#ISSUE_NUMBER", "<input_text>", <expected_output>)` in the
106 | [`tests/regression`](tests/regression) folder. Test for the bug
107 | you're fixing, and make sure the test fails.
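As a sketch, an entry for a hypothetical issue could look like the following (the issue number, input text, and expected output here are illustrative placeholders, not a real issue):

```python
# Hypothetical TEST_ISSUE_DATA entry:
# (issue number, problematic input text, expected segmentation)
TEST_ISSUE_DATA = [
    ("#999", "Hello world. This input was not split correctly.",
     ["Hello world.", "This input was not split correctly."]),
]
```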
Next, add and commit your test file 108 | referencing the issue number in the commit message. Finally, fix the bug, make 109 | sure your test passes and reference the issue in your commit message. 110 | 111 | Thank you for contributing! ✨ 🍰 ✨ 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Nipun Sadvilkar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![PySBD logo](artifacts/pysbd_logo.png?raw=true "pysbd logo") 2 | # pySBD: Python Sentence Boundary Disambiguation (SBD) 3 | 4 | ![Python package](https://github.com/nipunsadvilkar/pySBD/workflows/Python%20package/badge.svg) [![codecov](https://codecov.io/gh/nipunsadvilkar/pySBD/branch/master/graph/badge.svg)](https://codecov.io/gh/nipunsadvilkar/pySBD) [![License](https://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](https://github.com/nipunsadvilkar/pySBD/blob/master/LICENSE) [![PyPi](https://img.shields.io/pypi/v/pysbd?color=blue&logo=pypi&logoColor=white)](https://pypi.python.org/pypi/pysbd) [![GitHub](https://img.shields.io/github/v/release/nipunsadvilkar/pySBD.svg?include_prereleases&logo=github&style=flat)](https://github.com/nipunsadvilkar/pySBD) 5 | 6 | pySBD - python Sentence Boundary Disambiguation (SBD) - is a rule-based sentence boundary detection module that works out-of-the-box. 7 | 8 | This project is a direct port of ruby gem - [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter) which provides rule-based sentence boundary detection. 9 | 10 | ![pysbd_code](artifacts/pysbd_code.png?raw=true "pysbd_code") 11 | 12 | ## Highlights 13 | **'PySBD: Pragmatic Sentence Boundary Disambiguation'** a short research paper got accepted into 2nd Workshop for Natural Language Processing Open Source Software (NLP-OSS) at EMNLP 2020.
14 | 15 | **Research Paper:**
16 | 17 | https://arxiv.org/abs/2010.09657
18 | 19 | **[Recorded Talk:](https://slideslive.com/38939754)**
20 | 21 | [![pysbd_talk](artifacts/pysbd_talk.png)](https://slideslive.com/38939754)
22 | 23 | **Poster:**
24 | 25 | [![name](artifacts/pysbd_poster.png)](artifacts/pysbd_poster.png) 26 | 27 | ## Install 28 | 29 | **Python** 30 | 31 | pip install pysbd 32 | 33 | ## Usage 34 | 35 | - Currently pySBD supports 22 languages. 36 | 37 | ```python 38 | import pysbd 39 | text = "My name is Jonas E. Smith. Please turn to p. 55." 40 | seg = pysbd.Segmenter(language="en", clean=False) 41 | print(seg.segment(text)) 42 | # ['My name is Jonas E. Smith.', 'Please turn to p. 55.'] 43 | ``` 44 | 45 | - Use `pysbd` as a [spaCy](https://spacy.io/usage/processing-pipelines) pipeline component. (recommended)
Please refer to the example [pysbd\_as\_spacy\_component.py](https://github.com/nipunsadvilkar/pySBD/blob/master/examples/pysbd_as_spacy_component.py)
46 | - Use pysbd through [entrypoints](https://spacy.io/usage/saving-loading#entry-points-components)
47 |
48 | ```python
49 | import spacy
50 | from pysbd.utils import PySBDFactory
51 |
52 | nlp = spacy.blank('en')
53 |
54 | # explicitly adding component to pipeline
55 | # (recommended - makes it more readable to tell what's going on)
56 | nlp.add_pipe(PySBDFactory(nlp))
57 |
58 | # or you can use it implicitly with keyword
59 | # pysbd = nlp.create_pipe('pysbd')
60 | # nlp.add_pipe(pysbd)
61 |
62 | doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')
63 | print(list(doc.sents))
64 | # [My name is Jonas E. Smith., Please turn to p. 55.]
65 |
66 | ```
67 |
68 | ## Contributing
69 |
70 | If you want to contribute a new feature or language support, or you have found text that pySBD segments incorrectly, please head to [CONTRIBUTING.md](https://github.com/nipunsadvilkar/pySBD/blob/master/CONTRIBUTING.md) to learn more, then follow these steps.
71 |
72 | 1. Fork it ( https://github.com/nipunsadvilkar/pySBD/fork )
73 | 2. Create your feature branch (`git checkout -b my-new-feature`)
74 | 3. Commit your changes (`git commit -am 'Add some feature'`)
75 | 4. Push to the branch (`git push origin my-new-feature`)
76 | 5. Create a new Pull Request
77 |
78 | ## Citation
79 | If you use the `pysbd` package in your projects or research, please cite [PySBD: Pragmatic Sentence Boundary Disambiguation](https://www.aclweb.org/anthology/2020.nlposs-1.15).
80 | ``` 81 | @inproceedings{sadvilkar-neumann-2020-pysbd, 82 | title = "{P}y{SBD}: Pragmatic Sentence Boundary Disambiguation", 83 | author = "Sadvilkar, Nipun and 84 | Neumann, Mark", 85 | booktitle = "Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)", 86 | month = nov, 87 | year = "2020", 88 | address = "Online", 89 | publisher = "Association for Computational Linguistics", 90 | url = "https://www.aclweb.org/anthology/2020.nlposs-1.15", 91 | pages = "110--114", 92 | abstract = "We present a rule-based sentence boundary disambiguation Python package that works out-of-the-box for 22 languages. We aim to provide a realistic segmenter which can provide logical sentences even when the format and domain of the input text is unknown. In our work, we adapt the Golden Rules Set (a language specific set of sentence boundary exemplars) originally implemented as a ruby gem pragmatic segmenter which we ported to Python with additional improvements and functionality. PySBD passes 97.92{\%} of the Golden Rule Set examplars for English, an improvement of 25{\%} over the next best open source Python tool.", 93 | } 94 | ``` 95 | 96 | ## Credit 97 | 98 | This project wouldn't be possible without the great work done by [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter) team. 
99 | -------------------------------------------------------------------------------- /artifacts/Poster of PySBD_ Pragmatic Sentence Boundary Disambiguation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/Poster of PySBD_ Pragmatic Sentence Boundary Disambiguation.pdf -------------------------------------------------------------------------------- /artifacts/PySBD_ Pragmatic Sentence Boundary Disambiguation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/PySBD_ Pragmatic Sentence Boundary Disambiguation.pdf -------------------------------------------------------------------------------- /artifacts/code_pysbd_spacy_component.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/code_pysbd_spacy_component.png -------------------------------------------------------------------------------- /artifacts/pysbd_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/pysbd_code.png -------------------------------------------------------------------------------- /artifacts/pysbd_code2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/pysbd_code2.png -------------------------------------------------------------------------------- /artifacts/pysbd_code3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/pysbd_code3.png -------------------------------------------------------------------------------- /artifacts/pysbd_code_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/pysbd_code_example.png -------------------------------------------------------------------------------- /artifacts/pysbd_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/pysbd_logo.png -------------------------------------------------------------------------------- /artifacts/pysbd_poster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/pysbd_poster.png -------------------------------------------------------------------------------- /artifacts/pysbd_talk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/artifacts/pysbd_talk.png -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/benchmark_sbd_tools.py: -------------------------------------------------------------------------------- 1 | import blingfire 2 | import nltk 3 | import pysbd 4 | import spacy 5 | import stanza 6 | 7 | from 
syntok.tokenizer import Tokenizer 8 | import syntok.segmenter as syntok_segmenter 9 | 10 | from english_golden_rules import GOLDEN_EN_RULES 11 | 12 | pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False) 13 | 14 | nlp = spacy.blank('en') 15 | nlp.add_pipe(nlp.create_pipe("sentencizer")) 16 | nlp_dep = spacy.load('en_core_web_sm', disable=["ner"]) 17 | #stanza.download('en') 18 | stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize') 19 | 20 | syntok_tokenizer = Tokenizer() 21 | 22 | def blingfire_tokenize(text): 23 | return blingfire.text_to_sentences(text).split('\n') 24 | 25 | def nltk_tokenize(text): 26 | return nltk.sent_tokenize(text) 27 | 28 | def pysbd_tokenize(text): 29 | segments = pysbd_segmenter.segment(text) 30 | return [s.strip() for s in segments] 31 | 32 | def spacy_tokenize(text): 33 | return [sent.text for sent in nlp(text).sents] 34 | 35 | def spacy_dep_tokenize(text): 36 | return [sent.text for sent in nlp_dep(text).sents] 37 | 38 | def stanza_tokenize(text): 39 | return [e.text for e in stanza_nlp(text).sentences] 40 | 41 | def make_sentences(segmented_tokens): 42 | for sentence in segmented_tokens: 43 | yield "".join(str(token) for token in sentence).strip() 44 | 45 | def syntok_tokenize(text): 46 | tokens = syntok_tokenizer.split(text) 47 | result = syntok_segmenter.split(iter(tokens)) 48 | segments = [sent for sent in make_sentences(result)] 49 | return segments 50 | 51 | 52 | total_rules = len(GOLDEN_EN_RULES) 53 | 54 | def benchmark(golden_rules, tokenize_func): 55 | score = 0 56 | for rule in golden_rules: 57 | text, expected = rule 58 | segments = tokenize_func(text) 59 | if segments == expected: 60 | score += 1 61 | percent_score = (score / total_rules) * 100.0 62 | 63 | return percent_score 64 | 65 | if __name__ == "__main__": 66 | import time 67 | libraries = ( 68 | blingfire_tokenize, 69 | nltk_tokenize, 70 | pysbd_tokenize, 71 | spacy_tokenize, 72 | spacy_dep_tokenize, 73 | stanza_tokenize, 74 | 
syntok_tokenize) 75 | for tokenize_func in libraries: 76 | t = time.time() 77 | for i in range(100): 78 | percent_score = benchmark(GOLDEN_EN_RULES, tokenize_func) 79 | 80 | time_taken = time.time() - t 81 | print() 82 | print(tokenize_func.__name__) 83 | print('GRS score: {:0.2f}%'.format(percent_score)) 84 | print('Speed(Avg over 100 runs): {:>10.2f} ms'.format(time_taken*1000/100)) 85 | -------------------------------------------------------------------------------- /benchmarks/bigtext_speed_benchmark.py: -------------------------------------------------------------------------------- 1 | import blingfire 2 | import nltk 3 | import pysbd 4 | import spacy 5 | import stanza 6 | 7 | from syntok.tokenizer import Tokenizer 8 | import syntok.segmenter as syntok_segmenter 9 | 10 | pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False) 11 | 12 | nlp = spacy.blank('en') 13 | nlp.add_pipe(nlp.create_pipe("sentencizer")) 14 | nlp_dep = spacy.load('en_core_web_sm', disable=["ner"]) 15 | #stanza.download('en') 16 | stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize') 17 | 18 | syntok_tokenizer = Tokenizer() 19 | 20 | def blingfire_tokenize(text): 21 | return blingfire.text_to_sentences(text).split('\n') 22 | 23 | def nltk_tokenize(text): 24 | return nltk.sent_tokenize(text) 25 | 26 | def pysbd_tokenize(text): 27 | segments = pysbd_segmenter.segment(text) 28 | segments = [s.strip() for s in segments] 29 | return segments 30 | 31 | def spacy_tokenize(text): 32 | return [sent.text.strip("\n") for sent in nlp(text).sents] 33 | 34 | def spacy_dep_tokenize(text): 35 | return [sent.text.strip("\n") for sent in nlp_dep(text).sents] 36 | 37 | def stanza_tokenize(text): 38 | return [e.text for e in stanza_nlp(text).sentences] 39 | 40 | def make_sentences(segmented_tokens): 41 | for sentence in segmented_tokens: 42 | yield "".join(str(token) for token in sentence).strip() 43 | 44 | def syntok_tokenize(text): 45 | tokens = syntok_tokenizer.split(text) 
46 | result = syntok_segmenter.split(iter(tokens)) 47 | segments = [sent for sent in make_sentences(result)] 48 | return segments 49 | 50 | def speed_benchmark(big_text, tokenize_func): 51 | segments = tokenize_func(big_text) 52 | return segments 53 | 54 | if __name__ == "__main__": 55 | import time 56 | libraries = ( 57 | blingfire_tokenize, 58 | nltk_tokenize, 59 | pysbd_tokenize, 60 | spacy_tokenize, 61 | spacy_dep_tokenize, 62 | stanza_tokenize, 63 | syntok_tokenize) 64 | 65 | for tokenize_func in libraries: 66 | t = time.time() 67 | # wget http://www.gutenberg.org/files/1661/1661-0.txt -P benchmarks/ 68 | with open('benchmarks/1661-0.txt') as bigfile: 69 | big_text = bigfile.read() 70 | sentences = speed_benchmark(big_text, tokenize_func) 71 | 72 | time_taken = time.time() - t 73 | print() 74 | print(tokenize_func.__name__) 75 | print('Speed : {:>20.2f} ms'.format(time_taken * 1000)) 76 | -------------------------------------------------------------------------------- /benchmarks/english_golden_rules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | GOLDEN_EN_RULES = [ 4 | # 1) Simple period to end sentence 5 | ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]), 6 | # 2) Question mark to end sentence 7 | ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]), 8 | # 3) Exclamation point to end sentence 9 | ("There it is! I found it.", ["There it is!", "I found it."]), 10 | # 4) One letter upper case abbreviations 11 | ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]), 12 | # 5) One letter lower case abbreviations 13 | ("Please turn to p. 55.", ["Please turn to p. 55."]), 14 | # 6) Two letter lower case abbreviations in the middle of a sentence 15 | ("Were Jane and co. at the party?", ["Were Jane and co. 
at the party?"]), 16 | # 7) Two letter upper case abbreviations in the middle of a sentence 17 | ("They closed the deal with Pitt, Briggs & Co. at noon.", 18 | ["They closed the deal with Pitt, Briggs & Co. at noon."]), 19 | # 8) Two letter lower case abbreviations at the end of a sentence 20 | ( 21 | "Let's ask Jane and co. They should know.", 22 | ["Let's ask Jane and co.", "They should know."]), 23 | # 9) Two letter upper case abbreviations at the end of a sentence 24 | ( 25 | "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", [ 26 | "They closed the deal with Pitt, Briggs & Co.", 27 | "It closed yesterday." 28 | ], 29 | ), 30 | # 10) Two letter (prepositive) abbreviations 31 | ("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]), 32 | # 11) Two letter (prepositive & postpositive) abbreviations 33 | ( 34 | "St. Michael's Church is on 5th st. near the light.", 35 | ["St. Michael's Church is on 5th st. near the light."], 36 | ), 37 | # 12) Possessive two letter abbreviations 38 | ("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]), 39 | # 13) Multi-period abbreviations in the middle of a sentence 40 | ("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]), 41 | # 14) Multi-period abbreviations at the end of a sentence 42 | ( 43 | "I live in the E.U. How about you?", 44 | ["I live in the E.U.", "How about you?"], 45 | ), 46 | # 15) U.S. as sentence boundary 47 | ( 48 | "I live in the U.S. How about you?", 49 | ["I live in the U.S.", "How about you?"], 50 | ), 51 | # 16) U.S. as non sentence boundary with next word capitalized 52 | ("I work for the U.S. Government in Virginia.", 53 | ["I work for the U.S. Government in Virginia."]), 54 | # 17) U.S. as non sentence boundary 55 | ("I have lived in the U.S. for 20 years.", 56 | ["I have lived in the U.S. for 20 years."]), 57 | # Most difficult sentence to crack 58 | # 18) A.M. / P.M. as non sentence boundary and sentence boundary 59 | ( 60 | "At 5 a.m. Mr. 
Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", 61 | [ 62 | "At 5 a.m. Mr. Smith went to the bank.", 63 | "He left the bank at 6 P.M.", "Mr. Smith then went to the store." 64 | ] 65 | ), 66 | # 19) Number as non sentence boundary 67 | ("She has $100.00 in her bag.", ["She has $100.00 in her bag."]), 68 | # 20) Number as sentence boundary 69 | ("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]), 70 | # 21) Parenthetical inside sentence 71 | ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.", 72 | ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]), 73 | # 22) Email addresses 74 | ("Her email is Jane.Doe@example.com. I sent her an email.", 75 | ["Her email is Jane.Doe@example.com.", "I sent her an email."]), 76 | # 23) Web addresses 77 | ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", 78 | ["The site is: https://www.example.50.com/new-site/awesome_content.html.", 79 | "Please check it out."]), 80 | # 24) Single quotations inside sentence 81 | ( 82 | "She turned to him, 'This is great.' she said.", 83 | ["She turned to him, 'This is great.' she said."], 84 | ), 85 | # 25) Double quotations inside sentence 86 | ( 87 | 'She turned to him, "This is great." she said.', 88 | ['She turned to him, "This is great." she said.'], 89 | ), 90 | # 26) Double quotations at the end of a sentence 91 | ( 92 | 'She turned to him, "This is great." She held the book out to show him.', 93 | [ 94 | 'She turned to him, "This is great."', 95 | "She held the book out to show him." 96 | ], 97 | ), 98 | # 27) Double punctuation (exclamation point) 99 | ("Hello!! Long time no see.", ["Hello!!", "Long time no see."]), 100 | # 28) Double punctuation (question mark) 101 | ("Hello?? 
Who is there?", ["Hello??", "Who is there?"]), 102 | # 29) Double punctuation (exclamation point / question mark) 103 | ("Hello!? Is that you?", ["Hello!?", "Is that you?"]), 104 | # 30) Double punctuation (question mark / exclamation point) 105 | ("Hello?! Is that you?", ["Hello?!", "Is that you?"]), 106 | # 31) List (period followed by parens and no period to end item) 107 | ( 108 | "1.) The first item 2.) The second item", 109 | ["1.) The first item", "2.) The second item"], 110 | ), 111 | # 32) List (period followed by parens and period to end item) 112 | ( 113 | "1.) The first item. 2.) The second item.", 114 | ["1.) The first item.", "2.) The second item."], 115 | ), 116 | # 33) List (parens and no period to end item) 117 | ( 118 | "1) The first item 2) The second item", 119 | ["1) The first item", "2) The second item"], 120 | ), 121 | # 34) List (parens and period to end item) 122 | ("1) The first item. 2) The second item.", 123 | ["1) The first item.", "2) The second item."]), 124 | # 35) List (period to mark list and no period to end item) 125 | ( 126 | "1. The first item 2. The second item", 127 | ["1. The first item", "2. The second item"], 128 | ), 129 | # 36) List (period to mark list and period to end item) 130 | ( 131 | "1. The first item. 2. The second item.", 132 | ["1. The first item.", "2. The second item."], 133 | ), 134 | # 37) List with bullet 135 | ( 136 | "• 9. The first item • 10. The second item", 137 | ["• 9. The first item", "• 10. The second item"], 138 | ), 139 | # 38) List with hyphen 140 | ( 141 | "⁃9. The first item ⁃10. The second item", 142 | ["⁃9. The first item", "⁃10. The second item"], 143 | ), 144 | # 39) Alphabetical list 145 | ( 146 | "a. The first item b. The second item c. The third list item", 147 | ["a. The first item", "b. The second item", "c. The third list item"], 148 | ), 149 | # 40) Geo Coordinates 150 | ( 151 | "You can find it at N°. 1026.253.553. 
That is where the treasure is.", 152 | [ 153 | "You can find it at N°. 1026.253.553.", 154 | "That is where the treasure is." 155 | ], 156 | ), 157 | # 41) Named entities with an exclamation point 158 | ( 159 | "She works at Yahoo! in the accounting department.", 160 | ["She works at Yahoo! in the accounting department."], 161 | ), 162 | # 42) I as a sentence boundary and I as an abbreviation 163 | ( 164 | "We make a good team, you and I. Did you see Albert I. Jones yesterday?", 165 | [ 166 | "We make a good team, you and I.", 167 | "Did you see Albert I. Jones yesterday?" 168 | ], 169 | ), 170 | # 43) Ellipsis at end of quotation 171 | ( 172 | "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", 173 | [ 174 | "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”" 175 | ], 176 | ), 177 | # 44) Ellipsis with square brackets 178 | ( 179 | """"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""", 180 | [ 181 | '"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).' 182 | ], 183 | ), 184 | # 45) Ellipsis as sentence boundary (standard ellipsis rules) 185 | ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", 186 | [ 187 | "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", 188 | "Next sentence." 189 | ]), 190 | # 46) Ellipsis as sentence boundary (non-standard ellipsis rules) 191 | ( 192 | "I never meant that.... 
She left the store.", 193 | ["I never meant that....", "She left the store."], 194 | ), 195 | # 47) Ellipsis as non sentence boundary 196 | ( 197 | "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", 198 | [ 199 | "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it." 200 | ], 201 | ), 202 | # 48) 4-dot ellipsis 203 | ( 204 | "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", 205 | [ 206 | "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", 207 | ". . . The practice was not abandoned. . . ." 208 | ], 209 | ) 210 | ] 211 | -------------------------------------------------------------------------------- /benchmarks/genia_benchmark.py: -------------------------------------------------------------------------------- 1 | import blingfire 2 | import nltk 3 | import pysbd 4 | import spacy 5 | import stanza 6 | 7 | from syntok.tokenizer import Tokenizer 8 | import syntok.segmenter as syntok_segmenter 9 | 10 | from pathlib import Path 11 | 12 | pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False) 13 | 14 | nlp = spacy.blank('en') 15 | nlp.add_pipe(nlp.create_pipe("sentencizer")) 16 | nlp_dep = spacy.load('en_core_web_sm', disable=["ner"]) 17 | #stanza.download('en') 18 | stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize') 19 | 20 | syntok_tokenizer = Tokenizer() 21 | 22 | def blingfire_tokenize(text): 23 | return blingfire.text_to_sentences(text).split('\n') 24 | 25 | def nltk_tokenize(text): 26 | return nltk.sent_tokenize(text) 27 | 28 | def pysbd_tokenize(text): 29 | segments = pysbd_segmenter.segment(text) 30 | return [s.strip() for s in segments] 31 | 32 | def spacy_tokenize(text): 33 | return [sent.text.strip("\n") for sent in nlp(text).sents] 34 | 35 | def 
spacy_dep_tokenize(text): 36 | return [sent.text.strip("\n") for sent in nlp_dep(text).sents] 37 | 38 | def stanza_tokenize(text): 39 | return [e.text for e in stanza_nlp(text).sentences] 40 | 41 | def make_sentences(segmented_tokens): 42 | for sentence in segmented_tokens: 43 | yield "".join(str(token) for token in sentence).strip() 44 | 45 | def syntok_tokenize(text): 46 | tokens = syntok_tokenizer.split(text) 47 | result = syntok_segmenter.split(iter(tokens)) 48 | segments = [sent for sent in make_sentences(result)] 49 | return segments 50 | 51 | def load_genia_corpus(genia_raw_dir): 52 | txtfiles = Path(genia_raw_dir).glob("**/*.txt") 53 | txtfiles = list(txtfiles) 54 | all_docs = [] 55 | for ind, txtfile in enumerate(txtfiles, start=1): 56 | with open(txtfile) as f: 57 | geniatext = f.read().strip() 58 | expected = geniatext.split('\n') 59 | all_docs.append((geniatext, expected)) 60 | 61 | return all_docs 62 | 63 | def benchmark(docs, tokenize_func): 64 | 65 | correct = 0 66 | for (text, expected) in docs: 67 | segments = tokenize_func(text) 68 | if segments == expected: 69 | correct +=1 70 | return correct 71 | 72 | 73 | if __name__ == "__main__": 74 | import argparse 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument( 77 | '--genia', 78 | help="Path to the directory containing genia data." 
79 | ) 80 | 81 | args = parser.parse_args() 82 | 83 | libraries = ( 84 | blingfire_tokenize, 85 | nltk_tokenize, 86 | pysbd_tokenize, 87 | spacy_tokenize, 88 | spacy_dep_tokenize, 89 | stanza_tokenize, 90 | syntok_tokenize 91 | ) 92 | 93 | docs = load_genia_corpus(args.genia) 94 | total = len(docs) 95 | for tokenize_func in libraries: 96 | correct = benchmark(docs, tokenize_func) 97 | percent_score = correct/total * 100 98 | print() 99 | print(tokenize_func.__name__) 100 | print('GENIA abstract acc: {:0.2f}%'.format(percent_score)) 101 | -------------------------------------------------------------------------------- /examples/pysbd_as_spacy_component.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example of pySBD as a sentencizer component for spaCy 3 | 4 | Installation: 5 | pip install spacy 6 | """ 7 | import pysbd 8 | import spacy 9 | 10 | def pysbd_sentence_boundaries(doc): 11 | seg = pysbd.Segmenter(language="en", clean=False, char_span=True) 12 | sents_char_spans = seg.segment(doc.text) 13 | char_spans = [doc.char_span(sent_span.start, sent_span.end) for sent_span in sents_char_spans] 14 | start_token_ids = [span[0].idx for span in char_spans if span is not None] 15 | for token in doc: 16 | token.is_sent_start = True if token.idx in start_token_ids else False 17 | return doc 18 | 19 | if __name__ == "__main__": 20 | text = "My name is Jonas E. Smith. Please turn to p. 55." 
21 | nlp = spacy.blank('en') 22 | 23 | # add as a spacy pipeline component 24 | nlp.add_pipe(pysbd_sentence_boundaries) 25 | 26 | doc = nlp(text) 27 | print('sent_id', 'sentence', sep='\t|\t') 28 | for sent_id, sent in enumerate(doc.sents, start=1): 29 | print(sent_id, sent.text, sep='\t|\t') 30 | -------------------------------------------------------------------------------- /pysbd/__init__.py: -------------------------------------------------------------------------------- 1 | from .segmenter import Segmenter 2 | from .about import __version__ 3 | -------------------------------------------------------------------------------- /pysbd/abbreviation_replacer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.utils import Text 4 | 5 | 6 | def replace_pre_number_abbr(txt, abbr): 7 | # prepend a space to avoid needing another regex for start of string 8 | txt = " " + txt 9 | txt = re.sub(r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=abbr.strip()), "∯", txt) 10 | # remove the prepended space 11 | txt = txt[1:] 12 | return txt 13 | 14 | 15 | def replace_prepositive_abbr(txt, abbr): 16 | # prepend a space to avoid needing another regex for start of string 17 | txt = " " + txt 18 | txt = re.sub(r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=abbr.strip()), "∯", txt) 19 | # remove the prepended space 20 | txt = txt[1:] 21 | return txt 22 | 23 | 24 | class AbbreviationReplacer(object): 25 | def __init__(self, text, lang): 26 | self.text = text 27 | self.lang = lang 28 | 29 | def replace(self): 30 | self.text = Text(self.text).apply( 31 | self.lang.PossessiveAbbreviationRule, 32 | self.lang.KommanditgesellschaftRule, 33 | *self.lang.SingleLetterAbbreviationRules.All 34 | ) 35 | abbr_handled_text = "" 36 | for line in self.text.splitlines(True): 37 | abbr_handled_text += self.search_for_abbreviations_in_string(line) 38 | self.text = abbr_handled_text 39 | 
self.replace_multi_period_abbreviations() 40 | self.text = Text(self.text).apply(*self.lang.AmPmRules.All) 41 | self.text = self.replace_abbreviation_as_sentence_boundary() 42 | return self.text 43 | 44 | def replace_abbreviation_as_sentence_boundary(self): 45 | sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS)) 46 | regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters) 47 | self.text = re.sub(regex, '\\1.', self.text) 48 | return self.text 49 | 50 | def replace_multi_period_abbreviations(self): 51 | def mpa_replace(match): 52 | match = match.group() 53 | match = re.sub(re.escape(r"."), "∯", match) 54 | return match 55 | 56 | self.text = re.sub( 57 | self.lang.MULTI_PERIOD_ABBREVIATION_REGEX, 58 | mpa_replace, 59 | self.text, 60 | flags=re.IGNORECASE 61 | ) 62 | 63 | def replace_period_of_abbr(self, txt, abbr): 64 | # prepend a space to avoid needing another regex for start of string 65 | txt = " " + txt 66 | txt = re.sub( 67 | r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format( 68 | abbr=re.escape(abbr.strip()) 69 | ), 70 | "∯", 71 | txt, 72 | ) 73 | # remove the prepended space 74 | txt = txt[1:] 75 | return txt 76 | 77 | 78 | def search_for_abbreviations_in_string(self, text): 79 | lowered = text.lower() 80 | for abbr in self.lang.Abbreviation.ABBREVIATIONS: 81 | stripped = abbr.strip() 82 | if stripped not in lowered: 83 | continue 84 | abbrev_match = re.findall( 85 | r"(?:^|\s|\r|\n){}".format(stripped), text, flags=re.IGNORECASE 86 | ) 87 | if not abbrev_match: 88 | continue 89 | next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}" 90 | char_array = re.findall(next_word_start, text) 91 | for ind, match in enumerate(abbrev_match): 92 | text = self.scan_for_replacements( 93 | text, match, ind, char_array 94 | ) 95 | return text 96 | 97 | def scan_for_replacements(self, txt, am, ind, char_array): 98 | try: 99 | char = char_array[ind] 100 | except 
IndexError: 101 | char = "" 102 | prepositive = self.lang.Abbreviation.PREPOSITIVE_ABBREVIATIONS 103 | number_abbr = self.lang.Abbreviation.NUMBER_ABBREVIATIONS 104 | upper = str(char).isupper() 105 | if not upper or am.strip().lower() in prepositive: 106 | if am.strip().lower() in prepositive: 107 | txt = replace_prepositive_abbr(txt, am) 108 | elif am.strip().lower() in number_abbr: 109 | txt = replace_pre_number_abbr(txt, am) 110 | else: 111 | txt = self.replace_period_of_abbr(txt, am) 112 | return txt 113 | -------------------------------------------------------------------------------- /pysbd/about.py: -------------------------------------------------------------------------------- 1 | # inspired from: 2 | # https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ 3 | 4 | __title__ = "pysbd" 5 | __version__ = "0.3.4" 6 | __summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection module that works out-of-the-box across many languages." 
7 | __uri__ = "http://nipunsadvilkar.github.io/" 8 | __author__ = "Nipun Sadvilkar" 9 | __email__ = "nipunsadvilkar@gmail.com" 10 | __license__ = "MIT" 11 | -------------------------------------------------------------------------------- /pysbd/between_punctuation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from functools import partial 4 | from pysbd.punctuation_replacer import replace_punctuation 5 | 6 | 7 | class BetweenPunctuation(object): 8 | # Rubular: http://rubular.com/r/2YFrKWQUYi 9 | BETWEEN_SINGLE_QUOTES_REGEX = r"(?<=\s)'(?:[^']|'[a-zA-Z])*'" 10 | 11 | BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = r"(?<=\s)‘(?:[^’]|’[a-zA-Z])*’" 12 | 13 | # Rubular: http://rubular.com/r/3Pw1QlXOjd 14 | BETWEEN_DOUBLE_QUOTES_REGEX = r'"(?>[^"\\]+|\\{2}|\\.)*"' 15 | 16 | # https://regex101.com/r/r6I1bW/1 17 | # https://stackoverflow.com/questions/13577372/do-python-regular-expressions-have-an-equivalent-to-rubys-atomic-grouping?noredirect=1&lq=1 18 | BETWEEN_DOUBLE_QUOTES_REGEX_2 = r'"(?=(?P<tmp>[^\"\\]+|\\{2}|\\.)*)(?P=tmp)"' 19 | 20 | # Rubular: http://rubular.com/r/x6s4PZK8jc 21 | BETWEEN_QUOTE_ARROW_REGEX = r'«(?>[^»\\]+|\\{2}|\\.)*»' 22 | 23 | BETWEEN_QUOTE_ARROW_REGEX_2 = r"\«(?=(?P<tmp>[^»\\]+|\\{2}|\\.)*)(?P=tmp)\»" 24 | 25 | # Rubular: http://rubular.com/r/JbAIpKdlSq 26 | BETWEEN_QUOTE_SLANTED_REGEX = r"“(?>[^”\\]+|\\{2}|\\.)*”" 27 | BETWEEN_QUOTE_SLANTED_REGEX_2 = r"\“(?=(?P<tmp>[^”\\]+|\\{2}|\\.)*)(?P=tmp)\”" 28 | 29 | # Rubular: http://rubular.com/r/WX4AvnZvlX 30 | BETWEEN_SQUARE_BRACKETS_REGEX = r"\[(?>[^\]\\]+|\\{2}|\\.)*\]" 31 | 32 | BETWEEN_SQUARE_BRACKETS_REGEX_2 = r'\[(?=(?P<tmp>[^\]\\]+|\\{2}|\\.)*)(?P=tmp)\]' 33 | 34 | # Rubular: http://rubular.com/r/6tTityPflI 35 | BETWEEN_PARENS_REGEX = r"\((?>[^\(\)\\]+|\\{2}|\\.)*\)" 36 | 37 | BETWEEN_PARENS_REGEX_2 = r"\((?=(?P<tmp>[^\(\)\\]+|\\{2}|\\.)*)(?P=tmp)\)" 38 | 39 | # Rubular: http://rubular.com/r/mXf8cW025o 40 | WORD_WITH_LEADING_APOSTROPHE = 
r"(?<=\s)'(?:[^']|'[a-zA-Z])*'\S" 41 | 42 | # Rubular: http://rubular.com/r/jTtDKfjxzr 43 | BETWEEN_EM_DASHES_REGEX = r"\-\-(?>[^\-\-])*\-\-" 44 | 45 | BETWEEN_EM_DASHES_REGEX_2 = r"--(?=(?P<tmp>[^--]*))(?P=tmp)--" 46 | 47 | def __init__(self, text): 48 | self.text = text 49 | 50 | def replace(self): 51 | return self.sub_punctuation_between_quotes_and_parens(self.text) 52 | 53 | def sub_punctuation_between_quotes_and_parens(self, txt): 54 | txt = self.sub_punctuation_between_single_quotes(txt) 55 | txt = self.sub_punctuation_between_single_quote_slanted(txt) 56 | txt = self.sub_punctuation_between_double_quotes(txt) 57 | txt = self.sub_punctuation_between_square_brackets(txt) 58 | txt = self.sub_punctuation_between_parens(txt) 59 | txt = self.sub_punctuation_between_quotes_arrow(txt) 60 | txt = self.sub_punctuation_between_em_dashes(txt) 61 | txt = self.sub_punctuation_between_quotes_slanted(txt) 62 | return txt 63 | 64 | def sub_punctuation_between_parens(self, txt): 65 | return re.sub(self.BETWEEN_PARENS_REGEX_2, replace_punctuation, txt) 66 | 67 | def sub_punctuation_between_square_brackets(self, txt): 68 | return re.sub(self.BETWEEN_SQUARE_BRACKETS_REGEX_2, replace_punctuation, 69 | txt) 70 | 71 | def sub_punctuation_between_single_quotes(self, txt): 72 | if re.search(self.WORD_WITH_LEADING_APOSTROPHE, txt) and \ 73 | (not re.search(r"'\s", txt)): 74 | return txt 75 | return re.sub(self.BETWEEN_SINGLE_QUOTES_REGEX, 76 | partial(replace_punctuation, match_type='single'), txt) 77 | 78 | def sub_punctuation_between_single_quote_slanted(self, txt): 79 | return re.sub(self.BETWEEN_SINGLE_QUOTE_SLANTED_REGEX, 80 | replace_punctuation, txt) 81 | 82 | def sub_punctuation_between_double_quotes(self, txt): 83 | return re.sub(self.BETWEEN_DOUBLE_QUOTES_REGEX_2, replace_punctuation, 84 | txt) 85 | 86 | def sub_punctuation_between_quotes_arrow(self, txt): 87 | return re.sub(self.BETWEEN_QUOTE_ARROW_REGEX_2, replace_punctuation, txt) 88 | 89 | def 
sub_punctuation_between_em_dashes(self, txt): 90 | return re.sub(self.BETWEEN_EM_DASHES_REGEX_2, replace_punctuation, txt) 91 | 92 | def sub_punctuation_between_quotes_slanted(self, txt): 93 | return re.sub(self.BETWEEN_QUOTE_SLANTED_REGEX_2, replace_punctuation, 94 | txt) 95 | -------------------------------------------------------------------------------- /pysbd/clean/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/pysbd/clean/__init__.py -------------------------------------------------------------------------------- /pysbd/clean/rules.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.utils import Rule 3 | 4 | 5 | class CleanRules(object): 6 | 7 | # NOTE: Caution: Might require \\ for special characters 8 | # if regex is defined with r'' then dont 9 | # add extra \\ for special characters 10 | # Rubular: http://rubular.com/r/V57WnM9Zut 11 | NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '') 12 | 13 | # Rubular: http://rubular.com/r/dMxp5MixFS 14 | DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r") 15 | 16 | # Rubular: http://rubular.com/r/H6HOJeA8bq 17 | DoubleNewLineRule = Rule(r'\n\n', "\r") 18 | 19 | # Rubular: http://rubular.com/r/FseyMiiYFT 20 | NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '') 21 | 22 | ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r") 23 | 24 | EscapedNewLineRule = Rule(r'\\n', "\n") 25 | 26 | EscapedCarriageReturnRule = Rule(r'\\r', "\r") 27 | 28 | TypoEscapedNewLineRule = Rule(r'\\\ n', "\n") 29 | 30 | TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r") 31 | 32 | # Rubular: http://rubular.com/r/bAJrhyLNeZ 33 | InlineFormattingRule = Rule(r'{b\^>\d*<b\^}|{b\^>\d*\s]+))?)+\s*|\s*)\/?>", '') 68 | 69 | # Rubular: http://rubular.com/r/XZVqMPJhea 70 | EscapedHTMLTagRule = Rule(r'<\/?[^gt;]*gt;', '') 
71 | 72 | All = [HTMLTagRule, EscapedHTMLTagRule] 73 | 74 | 75 | class PDF(object): 76 | # Rubular: http://rubular.com/r/UZAVcwqck8 77 | NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '') 78 | 79 | # Rubular: http://rubular.com/r/eaNwGavmdo 80 | NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ') 81 | -------------------------------------------------------------------------------- /pysbd/cleaner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.utils import Text 4 | from pysbd.clean.rules import PDF, HTML, CleanRules as cr 5 | 6 | 7 | class Cleaner(object): 8 | 9 | def __init__(self, text, lang, doc_type=None): 10 | self.text = text 11 | self.lang = lang 12 | self.doc_type = doc_type 13 | 14 | def clean(self): 15 | if not self.text: 16 | return self.text 17 | self.remove_all_newlines() 18 | self.replace_double_newlines() 19 | self.replace_newlines() 20 | self.replace_escaped_newlines() 21 | self.text = Text(self.text).apply(*HTML.All) 22 | self.replace_punctuation_in_brackets() 23 | self.text = Text(self.text).apply(cr.InlineFormattingRule) 24 | self.clean_quotations() 25 | self.clean_table_of_contents() 26 | self.check_for_no_space_in_between_sentences() 27 | self.clean_consecutive_characters() 28 | return self.text 29 | 30 | def remove_all_newlines(self): 31 | self.remove_newline_in_middle_of_sentence() 32 | self.remove_newline_in_middle_of_word() 33 | 34 | def remove_newline_in_middle_of_sentence(self): 35 | def replace_w_blank(match): 36 | match = match.group() 37 | sub = re.sub(cr.NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX, '', match) 38 | return sub 39 | self.text = re.sub(r'(?:[^\.])*', replace_w_blank, self.text) 40 | 41 | def remove_newline_in_middle_of_word(self): 42 | self.text = Text(self.text).apply(cr.NewLineInMiddleOfWordRule) 43 | 44 | def replace_double_newlines(self): 45 | self.text = 
Text(self.text).apply(cr.DoubleNewLineWithSpaceRule, 46 | cr.DoubleNewLineRule) 47 | 48 | def remove_pdf_line_breaks(self): 49 | self.text = Text( 50 | self.text).apply(cr.NewLineFollowedByBulletRule, 51 | PDF.NewLineInMiddleOfSentenceRule, 52 | PDF.NewLineInMiddleOfSentenceNoSpacesRule) 53 | 54 | def replace_newlines(self): 55 | if self.doc_type == 'pdf': 56 | self.remove_pdf_line_breaks() 57 | else: 58 | self.text = Text( 59 | self.text).apply(cr.NewLineFollowedByPeriodRule, 60 | cr.ReplaceNewlineWithCarriageReturnRule) 61 | 62 | def replace_escaped_newlines(self): 63 | self.text = Text( 64 | self.text).apply(cr.EscapedNewLineRule, 65 | cr.EscapedCarriageReturnRule, 66 | cr.TypoEscapedNewLineRule, 67 | cr.TypoEscapedCarriageReturnRule) 68 | 69 | def replace_punctuation_in_brackets(self): 70 | def replace_punct(match): 71 | match = match.group() 72 | if '?' in match: 73 | sub = re.sub(re.escape('?'), '&ᓷ&', match) 74 | return sub 75 | return match 76 | self.text = re.sub(r'\[(?:[^\]])*\]', replace_punct, self.text) 77 | 78 | def clean_quotations(self): 79 | # method added explicitly 80 | # pragmatic-segmenter applies this method 81 | # at a different location 82 | self.text = re.sub('`', "'", self.text) 83 | self.text = Text(self.text).apply( 84 | cr.QuotationsFirstRule, 85 | cr.QuotationsSecondRule) 86 | 87 | def clean_table_of_contents(self): 88 | self.text = Text(self.text).apply( 89 | cr.TableOfContentsRule, 90 | cr.ConsecutivePeriodsRule, 91 | cr.ConsecutiveForwardSlashRule) 92 | 93 | def search_for_connected_sentences(self, word, txt, regex, rule): 94 | if not re.search(regex, word): 95 | return txt 96 | if any(k in word for k in cr.URL_EMAIL_KEYWORDS): 97 | return txt 98 | new_word = Text(word).apply(rule) 99 | txt = re.sub(re.escape(word), new_word, txt) 100 | return txt 101 | 102 | def check_for_no_space_in_between_sentences(self): 103 | words = self.text.split(' ') 104 | for word in words: 105 | self.text = self.search_for_connected_sentences(word, 
self.text, cr.NO_SPACE_BETWEEN_SENTENCES_REGEX, cr.NoSpaceBetweenSentencesRule) 106 | self.text = self.search_for_connected_sentences(word, self.text, cr.NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, cr.NoSpaceBetweenSentencesDigitRule) 107 | 108 | def clean_consecutive_characters(self): 109 | self.text = Text(self.text).apply( 110 | cr.ConsecutivePeriodsRule, 111 | cr.ConsecutiveForwardSlashRule) 112 | -------------------------------------------------------------------------------- /pysbd/exclamation_words.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.punctuation_replacer import replace_punctuation 4 | 5 | 6 | class ExclamationWords(object): 7 | """ 8 | Searches for exclamation points that are part of words 9 | and not ending punctuation and replaces them. 10 | """ 11 | EXCLAMATION_WORDS = "!Xũ !Kung ǃʼOǃKung !Xuun !Kung-Ekoka ǃHu ǃKhung ǃKu ǃung ǃXo ǃXû ǃXung ǃXũ !Xun Yahoo! Y!J Yum!".split() 12 | EXCLAMATION_REGEX = r"|".join(re.escape(w) for w in EXCLAMATION_WORDS) 13 | 14 | @classmethod 15 | def apply_rules(cls, text): 16 | return re.sub(ExclamationWords.EXCLAMATION_REGEX, replace_punctuation, 17 | text) 18 | -------------------------------------------------------------------------------- /pysbd/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/pysbd/lang/__init__.py -------------------------------------------------------------------------------- /pysbd/lang/amharic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Amharic(Common, Standard): 6 | 7 | iso_code = 'am' 8 | 9 | SENTENCE_BOUNDARY_REGEX = r'.*?[፧።!\?]|.*?$' 10 | Punctuations = 
['።', '፧', '?', '!'] 11 | 12 | class AbbreviationReplacer(AbbreviationReplacer): 13 | SENTENCE_STARTERS = [] 14 | -------------------------------------------------------------------------------- /pysbd/lang/arabic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from pysbd.abbreviation_replacer import AbbreviationReplacer 5 | from pysbd.lang.common import Common, Standard 6 | from pysbd.utils import Rule 7 | 8 | class Arabic(Common, Standard): 9 | 10 | iso_code = 'ar' 11 | 12 | Punctuations = ['?', '!', ':', '.', '؟', '،'] 13 | SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟،]|.*?\Z|.*?$' 14 | 15 | # Rubular: http://rubular.com/r/RX5HpdDIyv 16 | ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭') 17 | 18 | # Rubular: http://rubular.com/r/kPRgApNHUg 19 | ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬') 20 | 21 | class AbbreviationReplacer(AbbreviationReplacer): 22 | 23 | SENTENCE_STARTERS = [] 24 | 25 | def __init__(self, text, lang): 26 | super().__init__(text, lang) 27 | 28 | def scan_for_replacements(self, txt, am, index, character_array): 29 | txt = re.sub('(?<={0})\.'.format(am), '∯', txt) 30 | return txt 31 | 32 | class Abbreviation(Standard.Abbreviation): 33 | ABBREVIATIONS = ['ا', 'ا. 
د', 'ا.د', 'ا.ش.ا', 'ا.ش.ا', 'إلخ', 'ت.ب', 'ت.ب', 'ج.ب', 'جم', 'ج.ب', 'ج.م.ع', 'ج.م.ع', 'س.ت', 'س.ت', 'سم', 'ص.ب.', 'ص.ب', 'كج.', 'كلم.', 'م', 'م.ب', 'م.ب', 'ه',] 34 | PREPOSITIVE_ABBREVIATIONS = [] 35 | NUMBER_ABBREVIATIONS = [] 36 | -------------------------------------------------------------------------------- /pysbd/lang/armenian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Armenian(Common, Standard): 6 | 7 | iso_code = 'hy' 8 | 9 | SENTENCE_BOUNDARY_REGEX = r'.*?[։՜:]|.*?$' 10 | Punctuations = ['։', '՜', ':'] 11 | 12 | class AbbreviationReplacer(AbbreviationReplacer): 13 | SENTENCE_STARTERS = [] 14 | -------------------------------------------------------------------------------- /pysbd/lang/bulgarian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from pysbd.abbreviation_replacer import AbbreviationReplacer 5 | from pysbd.lang.common import Common, Standard 6 | 7 | class Bulgarian(Common, Standard): 8 | 9 | iso_code = 'bg' 10 | 11 | class Abbreviation(Standard.Abbreviation): 12 | ABBREVIATIONS = ["p.s", "акад", "ал", "б.р", "б.ред", "бел.а", "бел.пр", "бр", "бул", "в", "вж", "вкл", "вм", "вр", "г", "ген", "гр", "дж", "дм", "доц", "др", "ем", "заб", "зам", "инж", "к.с", "кв", "кв.м", "кг", "км", "кор", "куб", "куб.м", "л", "лв", "м", "м.г", "мин", "млн", "млрд", "мм", "н.с", "напр", "пл", "полк", "проф", "р", "рис", "с", "св", "сек", "см", "сп", "срв", "ст", "стр", "т", "т.г", "т.е", "т.н", "т.нар", "табл", "тел", "у", "ул", "фиг", "ха", "хил", "ч", "чл", "щ.д"] 13 | NUMBER_ABBREVIATIONS = [] 14 | PREPOSITIVE_ABBREVIATIONS = [] 15 | 16 | class AbbreviationReplacer(AbbreviationReplacer): 17 | SENTENCE_STARTERS = [] 18 | 19 | def __init__(self, text, lang): 20 | 
super().__init__(text, lang) 21 | 22 | def replace_period_of_abbr(self, txt, abbr): 23 | txt = re.sub(r'(?<=\s{abbr})\.|(?<=^{abbr})\.'.format(abbr=abbr.strip()), '∯', txt) 24 | return txt 25 | -------------------------------------------------------------------------------- /pysbd/lang/burmese.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Burmese(Common, Standard): 6 | 7 | iso_code = 'my' 8 | 9 | SENTENCE_BOUNDARY_REGEX = r'.*?[။၏!\?]|.*?$' 10 | Punctuations = ['။', '၏', '?', '!'] 11 | 12 | class AbbreviationReplacer(AbbreviationReplacer): 13 | SENTENCE_STARTERS = [] 14 | -------------------------------------------------------------------------------- /pysbd/lang/chinese.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.abbreviation_replacer import AbbreviationReplacer 4 | from pysbd.between_punctuation import BetweenPunctuation 5 | from pysbd.lang.common import Common, Standard 6 | from pysbd.punctuation_replacer import replace_punctuation 7 | 8 | class Chinese(Common, Standard): 9 | 10 | iso_code = 'zh' 11 | 12 | class AbbreviationReplacer(AbbreviationReplacer): 13 | SENTENCE_STARTERS = [] 14 | 15 | class BetweenPunctuation(BetweenPunctuation): 16 | 17 | def __init__(self, text): 18 | super().__init__(text) 19 | 20 | def replace(self): 21 | self.sub_punctuation_between_quotes_and_parens() 22 | return self.text 23 | 24 | def sub_punctuation_between_double_angled_quotation_marks(self): 25 | BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = r"《(?=(?P<tmp>[^》\\]+|\\{2}|\\.)*)(?P=tmp)》" 26 | self.text = re.sub(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX, replace_punctuation, 27 | self.text) 28 | 29 | def sub_punctuation_between_l_bracket(self): 30 | BETWEEN_L_BRACKET_REGEX = 
r"「(?=(?P[^」\\]+|\\{2}|\\.)*)(?P=tmp)」" 31 | self.text = re.sub(BETWEEN_L_BRACKET_REGEX, replace_punctuation, 32 | self.text) 33 | 34 | def sub_punctuation_between_quotes_and_parens(self): 35 | self.sub_punctuation_between_double_angled_quotation_marks() 36 | self.sub_punctuation_between_l_bracket() 37 | -------------------------------------------------------------------------------- /pysbd/lang/common/__init__.py: -------------------------------------------------------------------------------- 1 | from .common import Common # noqa: F401 2 | from .standard import Standard # noqa: F401 3 | -------------------------------------------------------------------------------- /pysbd/lang/common/common.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.utils import Rule 4 | 5 | class Common(object): 6 | 7 | # added special case: r"[。..!!? ]{2,}" to handle intermittent dots, exclamation, etc. 8 | # r"[。..!!?] at end to handle single instances of these symbol inputs 9 | SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!?? 
]{2,}|\S.*?[。..!!??ȸȹ☉☈☇☄]|[。..!!??]" 10 | 11 | # # Rubular: http://rubular.com/r/NqCqv372Ix 12 | QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]' 13 | 14 | # # Rubular: http://rubular.com/r/6flGnUMEVl 15 | PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]' 16 | 17 | # # Rubular: http://rubular.com/r/TYzr4qOW1Q 18 | # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/ 19 | 20 | # # Rubular: http://rubular.com/r/JMjlZHAT4g 21 | SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])' 22 | 23 | # # Rubular: http://rubular.com/r/mQ8Es9bxtk 24 | CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))' 25 | 26 | # https://rubular.com/r/UkumQaILKbkeyc 27 | # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703 28 | NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])' 29 | 30 | # # Rubular: http://rubular.com/r/yqa4Rit8EY 31 | PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯') 32 | 33 | # # Rubular: http://rubular.com/r/NEv265G2X2 34 | KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯') 35 | 36 | # # Rubular: http://rubular.com/r/xDkpFZ0EgH 37 | MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]" 38 | 39 | class SingleLetterAbbreviationRules(object): 40 | """Searches for periods within an abbreviation and 41 | replaces the periods. 
42 | """ 43 | # Rubular: http://rubular.com/r/e3H6kwnr6H 44 | SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯') 45 | 46 | # Rubular: http://rubular.com/r/gitvf0YWH4 47 | SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯') 48 | 49 | All = [ 50 | SingleUpperCaseLetterAtStartOfLineRule, SingleUpperCaseLetterRule 51 | ] 52 | 53 | class AmPmRules(object): 54 | 55 | # Rubular: http://rubular.com/r/Vnx3m4Spc8 56 | UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.') 57 | 58 | # Rubular: http://rubular.com/r/AJMCotJVbW 59 | UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.') 60 | 61 | # Rubular: http://rubular.com/r/13q7SnOhgA 62 | LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.') 63 | 64 | # Rubular: http://rubular.com/r/DgUDq4mLz5 65 | LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.') 66 | 67 | All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule] 68 | 69 | class Numbers(object): 70 | # Rubular: http://rubular.com/r/oNyxBOqbyy 71 | PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯') 72 | 73 | # Rubular: http://rubular.com/r/EMk5MpiUzt 74 | NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯') 75 | 76 | # Rubular: http://rubular.com/r/rf4l1HjtjG 77 | NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯') 78 | 79 | # Rubular: http://rubular.com/r/HPa4sdc6b9 80 | StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯') 81 | 82 | # Rubular: http://rubular.com/r/NuvWnKleFl 83 | StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯') 84 | 85 | All = [ 86 | PeriodBeforeNumberRule, 87 | NumberAfterPeriodBeforeLetterRule, 88 | NewLineNumberPeriodSpaceLetterRule, 89 | StartLineNumberPeriodRule, 90 | StartLineTwoDigitNumberPeriodRule 91 | ] 92 | -------------------------------------------------------------------------------- /pysbd/lang/common/standard.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | from pysbd.utils import Rule 3 | from pysbd.abbreviation_replacer import AbbreviationReplacer 4 | 5 | class Standard: 6 | 7 | # This class holds the punctuation marks. 8 | Punctuations = ['。', '.', '.', '!', '!', '?', '?'] 9 | 10 | # Rubular: http://rubular.com/r/G2opjedIm9 11 | GeoLocationRule = Rule(r'(?<=[a-zA-z]°)\.(?=\s*\d+)', '∯') 12 | 13 | FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯') 14 | 15 | SingleNewLineRule = Rule(r'\n', 'ȹ') 16 | 17 | # Rubular: http://rubular.com/r/aXPUGm6fQh 18 | QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&') 19 | 20 | ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ') 21 | 22 | SubSingleQuoteRule = Rule(r'&⎋&', "'") 23 | 24 | class Abbreviation(object): 25 | """Defines the abbreviations for each language (if available)""" 26 | ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 
'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig'] 27 | PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig'] 28 | NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp'] 29 | 30 | # Rubular: http://rubular.com/r/EUbZCNfgei 31 | # WithMultiplePeriodsAndEmailRule = Rule(r'(\w)(\.)(\w)', '\\1∮\\3') 32 | # \w in python matches unicode abbreviations also so limit to english alphanumerics 33 | WithMultiplePeriodsAndEmailRule = Rule(r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])', '\\1∮\\3') 34 | 35 | class DoublePunctuationRules(object): 36 | FirstRule = Rule(r'\?!', '☉') 37 | SecondRule = Rule(r'!\?', '☈') 38 | ThirdRule = Rule(r'\?\?', '☇') 39 | ForthRule = Rule(r'!!', '☄') 40 | DoublePunctuation = r'\?!|!\?|\?\?|!!' 
41 | All = [FirstRule, SecondRule, ThirdRule, ForthRule] 42 | 43 | class ExclamationPointRules(object): 44 | # Rubular: http://rubular.com/r/XS1XXFRfM2 45 | InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&') 46 | 47 | # Rubular: http://rubular.com/r/sl57YI8LkA 48 | BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&') 49 | 50 | # Rubular: http://rubular.com/r/f9zTjmkIPb 51 | MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&') 52 | 53 | All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule] 54 | 55 | class SubSymbolsRules(object): 56 | Period = Rule(r'∯', '.') 57 | ArabicComma = Rule(r'♬', '،') 58 | SemiColon = Rule(r'♭', ':') 59 | FullWidthPeriod = Rule(r'&ᓰ&', '。') 60 | SpecialPeriod = Rule(r'&ᓱ&', '.') 61 | FullWidthExclamation = Rule(r'&ᓳ&', '!') 62 | ExclamationPoint = Rule(r'&ᓴ&', '!') 63 | QuestionMark = Rule(r'&ᓷ&', '?') 64 | FullWidthQuestionMark = Rule(r'&ᓸ&', '?') 65 | MixedDoubleQE = Rule(r'☉', '?!') 66 | MixedDoubleQQ = Rule(r'☇', '??') 67 | MixedDoubleEQ = Rule(r'☈', '!?') 68 | MixedDoubleEE = Rule(r'☄', '!!') 69 | LeftParens = Rule(r'&✂&', '(') 70 | RightParens = Rule(r'&⌬&', ')') 71 | TemporaryEndingPunctutation = Rule(r'ȸ', '') 72 | Newline = Rule(r'ȹ', "\n") 73 | All = [Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod, 74 | FullWidthExclamation, ExclamationPoint, QuestionMark, 75 | FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ, 76 | MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation, 77 | Newline] 78 | 79 | class EllipsisRules(object): 80 | 81 | # below rules aren't similar to original rules of pragmatic segmenter 82 | # modification: spaces replaced with same number of symbols 83 | # Rubular: http://rubular.com/r/i60hCK81fz 84 | ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.') 85 | 86 | # Rubular: http://rubular.com/r/Hdqpd90owl 87 | FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ') 88 | 89 | # Rubular: http://rubular.com/r/YBG1dIHTRu 90 | ThreeSpaceRule = 
Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟') 91 | 92 | # Rubular: http://rubular.com/r/2VvZ8wRbd8 93 | FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝') 94 | 95 | OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ') 96 | 97 | All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule, 98 | ThreeConsecutiveRule, OtherThreePeriodRule] 99 | 100 | class ReinsertEllipsisRules(object): 101 | # below rules aren't similar to original rules of pragmatic segmenter 102 | # modification: symbols replaced with same number of ellipses 103 | SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...') 104 | SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ') 105 | SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .') 106 | SubTwoConsecutivePeriod = Rule(r'☏☏', '..') 107 | SubOnePeriod = Rule(r'∮', '.') 108 | All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod, 109 | SubTwoConsecutivePeriod, SubOnePeriod] 110 | 111 | class AbbreviationReplacer(AbbreviationReplacer): 112 | SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\ 113 | "More She That The There They We What When Where Who Why".split(" ") 114 | -------------------------------------------------------------------------------- /pysbd/lang/danish.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from re import escape 4 | 5 | from pysbd.abbreviation_replacer import AbbreviationReplacer 6 | from pysbd.lang.common import Common, Standard 7 | from pysbd.utils import Rule 8 | 9 | class Danish(Common, Standard): 10 | 11 | iso_code = 'da' 12 | 13 | MONTHS = ['Januar', 'Februar', 'Marts', 'April', 'Maj', 'Juni', 'Juli', 'August', 'September', 'Oktober', 'November', 'December'] 14 | 15 | class Numbers(Common.Numbers): 16 | 17 | NumberPeriodSpaceRule = Rule(r'(?<=\s[1-9][0-9])\.(?=\s)|(?<=\s[0-9])\.(?=\s)', '∯') 18 | 19 | NegativeNumberPeriodSpaceRule = Rule(r'(?<=\s-[1-9][0-9])\.(?=\s)|(?<=\s-[0-9])\.(?=\s)', '∯') 20 | 21 | All = 
Common.Numbers.All + [NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule] 22 | 23 | class AbbreviationReplacer(AbbreviationReplacer): 24 | 25 | SENTENCE_STARTERS = ("At De Dem Den Der Det Du En Et For Få Gjorde Han Hun Hvad Hvem" 26 | " Hvilke Hvor Hvordan Hvorfor Hvorledes Hvornår I Jeg Mange Vi Være").split(' ') 27 | 28 | def __init__(self, text, lang): 29 | super().__init__(text, lang) 30 | 31 | def replace_abbreviation_as_sentence_boundary(self): 32 | sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in self.SENTENCE_STARTERS)) 33 | regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|s.u|s.U)∯({})".format(sent_starters) 34 | self.text = re.sub(regex, '\\1.', self.text) 35 | return self.text 36 | 37 | class Abbreviation(Standard.Abbreviation): 38 | ABBREVIATIONS = ['adm', 'adr', 'afd', 'afs', 'al', 'alm', 'alm', 'ang', 'ank', 'anm', 'ann', 'ansvh', 'apr', 'arr', 'ass', 'att', 'aud', 'aug', 'aut', 'bd', 'bdt', 'bet', 'bhk', 'bio', 'biol', 'bk', 'bl.a', 'bot', 'br', 'bto', 'ca', 'cal', 'cirk', 'cit', 'co', 'cpr-nr', 'cvr-nr', 'd.d', 'd.e', 'd.m', 'd.s', 'd.s.s', 'd.y', 'd.å', 'd.æ', 'da', 'dav', 'dec', 'def', 'del', 'dep', 'diam', 'din', 'dir', 'disp', 'distr', 'do', 'dobb', 'dr', 'ds', 'dvs', 'e.b', 'e.kr', 'e.l', 'e.o', 'e.v.t', 'eftf', 'eftm', 'egl', 'eks', 'eksam', 'ekskl', 'eksp', 'ekspl', 'el', 'emer', 'endv', 'eng', 'enk', 'etc', 'eur', 'evt', 'exam', 'f', 'f', 'f.eks', 'f.kr', 'f.m', 'f.n', 'f.o', 'f.o.m', 'f.s.v', 'f.t', 'f.v.t', 'f.å', 'fa', 'fakt', 'feb', 'fec', 'ff', 'fg', 'fg', 'fhv', 'fig', 'fl', 'flg', 'fm', 'fm', 'fmd', 'forb', 'foreg', 'foren', 'forf', 'forh', 'fork', 'form', 'forr', 'fors', 'forsk', 'forts', 'fp', 'fr', 'frk', 'fuldm', 'fuldm', 'fung', 'fung', 'fys', 'fær', 'g', 'g.d', 'g.m', 'gd', 'gdr', 'gg', 'gh', 'gl', 'gn', 'gns', 'gr', 'grdl', 'gross', 'h.a', 'h.c', 'hdl', 'henh', 'henv', 'hf', 'hft', 'hhv', 'hort', 'hosp', 'hpl', 'hr', 'hrs', 'hum', 'i', 'i.e', 'ib', 'ibid', 'if', 'ifm', 'ill', 'indb', 'indreg', 'ing', 
'inkl', 'insp', 'instr', 'isl', 'istf', 'jan', 'jf', 'jfr', 'jnr', 'jr', 'jul', 'jun', 'jur', 'jvf', 'kal', 'kap', 'kat', 'kbh', 'kem', 'kgl', 'kin', 'kl', 'kld', 'km/t', 'knsp', 'komm', 'kons', 'korr', 'kp', 'kr', 'kr', 'kst', 'kt', 'ktr', 'kv', 'kvt', 'l', 'l.c', 'lab', 'lat', 'lb', 'lb.', 'lb.nr', 'lejl', 'lgd', 'lic', 'lign', 'lin', 'ling.merc', 'litt', 'lok', 'lrs', 'ltr', 'lø', 'm', 'm.a.o', 'm.fl.st', 'm.m', 'm/', 'ma', 'mag', 'maks', 'mar', 'mat', 'matr.nr', 'md', 'mdl', 'mdr', 'mdtl', 'med', 'medd', 'medflg', 'medl', 'merc', 'mezz', 'mf', 'mfl', 'mgl', 'mhp', 'mht', 'mi', 'mia', 'mio', 'ml', 'mods', 'modsv', 'modt', 'mr', 'mrk', 'mrs', 'ms', 'mul', 'mv', 'mvh', 'n', 'n.br', 'n.f', 'nat', 'ned', 'nedenn', 'nedenst', 'nederl', 'nkr', 'nl', 'no', 'nord', 'nov', 'nr', 'nr', 'nto', 'nuv', 'o', 'o.a', 'o.fl.st', 'o.g', 'o.h', 'o.m.a', 'obj', 'obl', 'obs', 'odont', 'oecon', 'off', 'ofl', 'okt', 'omg', 'omr', 'omtr', 'on', 'op.cit', 'opg', 'opl', 'opr', 'org', 'orig', 'osfr', 'osv', 'ovenn', 'ovenst', 'overs', 'ovf', 'oz', 'p', 'p.a', 'p.b.v', 'p.c', 'p.m.v', 'p.p', 'p.s', 'p.t', 'p.v.a', 'p.v.c', 'par', 'partc', 'pass', 'pct', 'pd', 'pens', 'perf', 'pers', 'pg', 'pga', 'pgl', 'ph', 'ph.d', 'pharm', 'phil', 'pinx', 'pk', 'pkt', 'pl', 'pluskv', 'polit', 'polyt', 'port', 'pos', 'pp', 'pr', 'prc', 'priv', 'prod', 'prof', 'pron', 'præd', 'præf', 'præp', 'præs', 'præt', 'psych', 'pt', 'pæd', 'q.e.d', 'rad', 'red', 'ref', 'reg', 'regn', 'rel', 'rep', 'repr', 'rest', 'rk', 'russ', 's', 's.br', 's.d', 's.e', 's.f', 's.m.b.a', 's.u', 's.å', 's/', 'sa', 'sb', 'sc', 'scient', 'sek', 'sek', 'sekr', 'sem', 'sen', 'sep', 'sept', 'sg', 'sign', 'sj', 'skr', 'skt', 'slutn', 'sml', 'smp', 'sms', 'smst', 'soc', 'soc', 'sort', 'sp', 'spec', 'spm', 'spr', 'spsk', 'st', 'stk', 'str', 'stud', 'subj', 'subst', 'suff', 'sup', 'suppl', 'sv', 'såk', 'sædv', 'sø', 't', 't.h', 't.o.m', 't.v', 'tab', 'td', 'tdl', 'tdr', 'techn', 'tekn', 'temp', 'th', 'ti', 'tidl', 'tilf', 'tilh', 'till', 
'tilsv', 'tjg', 'tlf', 'tlgr', 'to', 'tr', 'trp', 'tv', 'ty', 'u', 'u.p', 'u.st', 'u.å', 'uafh', 'ubf', 'ubøj', 'udb', 'udbet', 'udd', 'udg', 'uds', 'ugtl', 'ulin', 'ult', 'undt', 'univ', 'v.f', 'var', 'vb', 'vbsb', 'vedk', 'vedl', 'vedr', 'vejl', 'vh', 'vol', 'vs', 'vsa', 'vær', 'zool', 'årg', 'årh', 'årl', 'ø.f', 'øv', 'øvr'] 39 | NUMBER_ABBREVIATIONS = ['nr', 's'] 40 | PREPOSITIVE_ABBREVIATIONS = ['adm', 'skt', 'dr', 'hr', 'fru', 'st'] 41 | -------------------------------------------------------------------------------- /pysbd/lang/deutsch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.abbreviation_replacer import AbbreviationReplacer 4 | from pysbd.between_punctuation import BetweenPunctuation 5 | from pysbd.lang.common import Common, Standard 6 | from pysbd.punctuation_replacer import replace_punctuation 7 | from pysbd.processor import Processor 8 | from pysbd.utils import Text, Rule 9 | 10 | 11 | class Deutsch(Common, Standard): 12 | 13 | iso_code = 'de' 14 | 15 | class Numbers(Common.Numbers): 16 | # Rubular: http://rubular.com/r/hZxoyQwKT1 17 | NumberPeriodSpaceRule = Rule(r'(?<=\s\d)\.(?=\s)|(?<=\s\d\d)\.(?=\s)', '∯') 18 | 19 | # Rubular: http://rubular.com/r/ityNMwdghj 20 | NegativeNumberPeriodSpaceRule = Rule(r'(?<=-\d)\.(?=\s)|(?<=-\d\d)\.(?=\s)', '∯') 21 | 22 | All = Common.Numbers.All + [NumberPeriodSpaceRule, NegativeNumberPeriodSpaceRule] 23 | 24 | class Processor(Processor): 25 | 26 | def __init__(self, text, lang, char_span=False): 27 | super().__init__(text, lang, char_span) 28 | 29 | def replace_numbers(self): 30 | self.text = Text(self.text).apply(*self.lang.Numbers.All) 31 | self.replace_period_in_deutsch_dates() 32 | return self.text 33 | 34 | def replace_period_in_deutsch_dates(self): 35 | MONTHS = ['Januar', 'Februar', 'März', 'April', 'Mai', 'Juni', 'Juli', 'August', 36 | 'September', 'Oktober', 'November', 'Dezember'] 37 | for month in MONTHS: 
38 | # Rubular: http://rubular.com/r/zlqgj7G5dA 39 | self.text = re.sub(r'(?<=\d)\.(?=\s*{month})'.format(month=month), '∯', self.text) 40 | 41 | class Abbreviation(Standard.Abbreviation): 42 | ABBREVIATIONS = ['Ä', 'ä', 'adj', 'adm', 'adv', 'art', 'asst', 'b.a', 'b.s', 'bart', 'bldg', 'brig', 'bros', 'bse', 'buchst', 'bzgl', 'bzw', 'c.-à-d', 'ca', 'capt', 'chr', 'cmdr', 'co', 'col', 'comdr', 'con', 'corp', 'cpl', 'd.h', 'd.j', 'dergl', 'dgl', 'dkr', 'dr ', 'ens', 'etc', 'ev ', 'evtl', 'ff', 'g.g.a', 'g.u', 'gen', 'ggf', 'gov', 'hon', 'hosp', 'i.f', 'i.h.v', 'ii', 'iii', 'insp', 'iv', 'ix', 'jun', 'k.o', 'kath ', 'lfd', 'lt', 'ltd', 'm.e', 'maj', 'med', 'messrs', 'mio', 'mlle', 'mm', 'mme', 'mr', 'mrd', 'mrs', 'ms', 'msgr', 'mwst', 'no', 'nos', 'nr', 'o.ä', 'op', 'ord', 'pfc', 'ph', 'pp', 'prof', 'pvt', 'rep', 'reps', 'res', 'rev', 'rt', 's.p.a', 'sa', 'sen', 'sens', 'sfc', 'sgt', 'sog', 'sogen', 'spp', 'sr', 'st', 'std', 'str ', 'supt', 'surg', 'u.a ', 'u.e', 'u.s.w', 'u.u', 'u.ä', 'usf', 'usw', 'v', 'vgl', 'vi', 'vii', 'viii', 'vs', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xix', 'xv', 'xvi', 'xvii', 'xviii', 'xx', 'z.b', 'z.t', 'z.z', 'z.zt', 'zt', 'zzt', 'univ.-prof', 'o.univ.-prof', 'ao.univ.prof', 'ass.prof', 'hon.prof', 'univ.-doz', 'univ.ass', 'stud.ass', 'projektass', 'ass', 'di', 'dipl.-ing', 'mag'] 43 | PREPOSITIVE_ABBREVIATIONS = [] 44 | NUMBER_ABBREVIATIONS = ['art', 'ca', 'no', 'nos', 'nr', 'pp'] 45 | 46 | class AbbreviationReplacer(AbbreviationReplacer): 47 | 48 | SENTENCE_STARTERS = ("Am Auch Auf Bei Da Das Der Die Ein Eine Es Für Heute Ich Im In " 49 | "Ist Jetzt Mein Mit Nach So Und Warum Was Wenn Wer Wie Wir").split(' ') 50 | 51 | def __init__(self, text, lang): 52 | super().__init__(text, lang) 53 | 54 | def replace(self): 55 | # Rubular: http://rubular.com/r/B4X33QKIL8 56 | SingleLowerCaseLetterRule = Rule(r'(?<=\s[a-z])\.(?=\s)', '∯') 57 | 58 | # Rubular: http://rubular.com/r/iUNSkCuso0 59 | SingleLowerCaseLetterAtStartOfLineRule = 
Rule(r'(?<=^[a-z])\.(?=\s)', '∯') 60 | self.text = Text(self.text).apply( 61 | self.lang.PossessiveAbbreviationRule, 62 | *self.lang.SingleLetterAbbreviationRules.All, 63 | SingleLowerCaseLetterRule, 64 | SingleLowerCaseLetterAtStartOfLineRule) 65 | 66 | self.text = self.search_for_abbreviations_in_string(self.text) 67 | self.replace_multi_period_abbreviations() 68 | self.text = Text(self.text).apply(*self.lang.AmPmRules.All) 69 | self.text = self.replace_abbreviation_as_sentence_boundary() 70 | return self.text 71 | 72 | def scan_for_replacements(self, txt, am, index, character_array): 73 | txt = re.sub(r'(?<={am})\.(?=\s)'.format(am=am), '∯', txt) 74 | return txt 75 | 76 | class BetweenPunctuation(BetweenPunctuation): 77 | 78 | def __init__(self, text): 79 | super().__init__(text) 80 | 81 | def sub_punctuation_between_double_quotes(self, txt): 82 | # Rubular: http://rubular.com/r/OdcXBsub0w 83 | BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX = r',,(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“' 84 | 85 | # Rubular: http://rubular.com/r/2UskIupGgP 86 | # SPLIT_DOUBLE_QUOTES_DE_REGEX = r'\A„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“' 87 | 88 | # Rubular: http://rubular.com/r/TkZomF9tTM 89 | BETWEEN_DOUBLE_QUOTES_DE_REGEX = r'„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)“' 90 | 91 | if '„' in txt: 92 | return re.sub(BETWEEN_DOUBLE_QUOTES_DE_REGEX, replace_punctuation, txt) 93 | elif ',,' in txt: 94 | return re.sub(BETWEEN_UNCONVENTIONAL_DOUBLE_QUOTE_DE_REGEX, 95 | replace_punctuation, txt) 96 | else: 97 | return txt 98 | -------------------------------------------------------------------------------- /pysbd/lang/english.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class English(Common, Standard): 6 | 7 | iso_code = 'en' 8 | 9 | class AbbreviationReplacer(AbbreviationReplacer): 10 | SENTENCE_STARTERS = "A 
Being Did For He How However I In It Millions "\ 11 | "More She That The There They We What When Where Who Why".split(" ") 12 | -------------------------------------------------------------------------------- /pysbd/lang/french.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class French(Common, Standard): 6 | 7 | iso_code = 'fr' 8 | 9 | class AbbreviationReplacer(AbbreviationReplacer): 10 | SENTENCE_STARTERS = [] 11 | 12 | class Abbreviation(Standard.Abbreviation): 13 | ABBREVIATIONS = ['a.c.n', 'a.m', 'al', 'ann', 'apr', 'art', 'auj', 'av', 'b.p', 'boul', 'c.-à-d', 'c.n', 'c.n.s', 'c.p.i', 'c.q.f.d', 'c.s', 'ca', 'cf', 'ch.-l', 'chap', 'co', 'co', 'contr', 'dir', 'e.g', 'e.v', 'env', 'etc', 'ex', 'fasc', 'fig', 'fr', 'fém', 'hab', 'i.e', 'ibid', 'id', 'inf', 'l.d', 'lib', 'll.aa', 'll.aa.ii', 'll.aa.rr', 'll.aa.ss', 'll.ee', 'll.mm', 'll.mm.ii.rr', 'loc.cit', 'ltd', 'ltd', 'masc', 'mm', 'ms', 'n.b', 'n.d', 'n.d.a', 'n.d.l.r', 'n.d.t', 'n.p.a.i', 'n.s', 'n/réf', 'nn.ss', 'p.c.c', 'p.ex', 'p.j', 'p.s', 'pl', 'pp', 'r.-v', 'r.a.s', 'r.i.p', 'r.p', 's.a', 's.a.i', 's.a.r', 's.a.s', 's.e', 's.m', 's.m.i.r', 's.s', 'sec', 'sect', 'sing', 'sq', 'sqq', 'ss', 'suiv', 'sup', 'suppl', 't.s.v.p', 'tél', 'vb', 'vol', 'vs', 'x.o', 'z.i', 'éd'] 14 | PREPOSITIVE_ABBREVIATIONS = [] 15 | NUMBER_ABBREVIATIONS = [] 16 | -------------------------------------------------------------------------------- /pysbd/lang/greek.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Greek(Common, Standard): 6 | 7 | iso_code = 'el' 8 | 9 | SENTENCE_BOUNDARY_REGEX = r'.*?[\.;!\?]|.*?$' 10 | Punctuations = ['.', '!', ';', '?'] 
11 | 12 | class AbbreviationReplacer(AbbreviationReplacer): 13 | SENTENCE_STARTERS = [] 14 | -------------------------------------------------------------------------------- /pysbd/lang/hindi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Hindi(Common, Standard): 6 | 7 | iso_code = 'hi' 8 | 9 | SENTENCE_BOUNDARY_REGEX = r'.*?[।\|!\?]|.*?$' 10 | Punctuations = ['।', '|', '.', '!', '?'] 11 | 12 | class AbbreviationReplacer(AbbreviationReplacer): 13 | SENTENCE_STARTERS = [] 14 | -------------------------------------------------------------------------------- /pysbd/lang/japanese.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.abbreviation_replacer import AbbreviationReplacer 4 | from pysbd.between_punctuation import BetweenPunctuation 5 | from pysbd.lang.common import Common, Standard 6 | from pysbd.punctuation_replacer import replace_punctuation 7 | from pysbd.cleaner import Cleaner 8 | from pysbd.utils import Text, Rule 9 | 10 | class Japanese(Common, Standard): 11 | 12 | iso_code = 'ja' 13 | 14 | class Cleaner(Cleaner): 15 | 16 | def __init__(self, text, lang, doc_type=None): 17 | super().__init__(text, lang) 18 | 19 | def clean(self): 20 | self.remove_newline_in_middle_of_word() 21 | return self.text 22 | 23 | def remove_newline_in_middle_of_word(self): 24 | NewLineInMiddleOfWordRule = Rule(r'(?<=の)\n(?=\S)', '') 25 | self.text = Text(self.text).apply(NewLineInMiddleOfWordRule) 26 | 27 | class AbbreviationReplacer(AbbreviationReplacer): 28 | SENTENCE_STARTERS = [] 29 | 30 | class BetweenPunctuation(BetweenPunctuation): 31 | 32 | def __init__(self, text): 33 | super().__init__(text) 34 | 35 | def replace(self): 36 | self.sub_punctuation_between_quotes_and_parens() 37 | return 
self.text 38 | 39 | def sub_punctuation_between_parens_ja(self): 40 | BETWEEN_PARENS_JA_REGEX = r'（(?=(?P<tmp>[^（）]+|\\{2}|\\.)*)(?P=tmp)）' 41 | self.text = re.sub(BETWEEN_PARENS_JA_REGEX, replace_punctuation, 42 | self.text) 43 | 44 | def sub_punctuation_between_quotes_ja(self): 45 | BETWEEN_QUOTE_JA_REGEX = r'「(?=(?P<tmp>[^「」]+|\\{2}|\\.)*)(?P=tmp)」' 46 | self.text = re.sub(BETWEEN_QUOTE_JA_REGEX, replace_punctuation, 47 | self.text) 48 | 49 | def sub_punctuation_between_quotes_and_parens(self): 50 | self.sub_punctuation_between_parens_ja() 51 | self.sub_punctuation_between_quotes_ja() 52 | -------------------------------------------------------------------------------- /pysbd/lang/kazakh.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | from pysbd.processor import Processor 5 | from pysbd.utils import Text, Rule 6 | 7 | 8 | class Kazakh(Common, Standard): 9 | 10 | iso_code = 'kk' 11 | 12 | # Handling Cyrillic characters in re module 13 | # https://stackoverflow.com/a/10982308/5462100 14 | MULTI_PERIOD_ABBREVIATION_REGEX = r'\b[\u0400-\u0500]+(?:\.\s?[\u0400-\u0500])+[.]|\b[a-z](?:\.[a-z])+[.]' 15 | 16 | class Processor(Processor): 17 | 18 | def __init__(self, text, lang, char_span=False): 19 | super().__init__(text, lang, char_span) 20 | 21 | def between_punctuation(self, txt): 22 | txt = self.between_punctuation_processor(txt).replace() 23 | # Rubular: http://rubular.com/r/WRWy56Z5zp 24 | QuestionMarkFollowedByDashLowercaseRule = Rule(r'(?<=)\?(?=\s*[-—]\s*)', '&ᓷ&') 25 | # Rubular: http://rubular.com/r/lixxP7puSa 26 | ExclamationMarkFollowedByDashLowercaseRule = Rule(r'(?<=)!(?=\s*[-—]\s*)', '&ᓴ&') 27 | 28 | txt = Text(txt).apply(QuestionMarkFollowedByDashLowercaseRule, 29 | ExclamationMarkFollowedByDashLowercaseRule) 30 | return txt 31 | 32 | class Abbreviation(Standard.Abbreviation): 33 | 
ABBREVIATIONS = ['afp', 'anp', 'atp', 'bae', 'bg', 'bp', 'cam', 'cctv', 'cd', 'cez', 'cgi', 'cnpc', 'farc', 'fbi', 'eiti', 'epo', 'er', 'gp', 'gps', 'has', 'hiv', 'hrh', 'http', 'icu', 'idf', 'imd', 'ime', 'icu', 'idf', 'ip', 'iso', 'kaz', 'kpo', 'kpa', 'kz', 'kz', 'mri', 'nasa', 'nba', 'nbc', 'nds', 'ohl', 'omlt', 'ppm', 'pda', 'pkk', 'psm', 'psp', 'raf', 'rss', 'rtl', 'sas', 'sme', 'sms', 'tnt', 'udf', 'uefa', 'usb', 'utc', 'x', 'zdf', 'әқбк', 'әқбк', 'аақ', 'авг.', 'aбб', 'аек', 'ак', 'ақ', 'акцион.', 'акср', 'ақш', 'англ', 'аөсшк', 'апр', 'м.', 'а.', 'р.', 'ғ.', 'апр.', 'аум.', 'ацат', 'әч', 'т. б.', 'б. з. б.', 'б. з. б.', 'б. з. д.', 'б. з. д.', 'биікт.', 'б. т.', 'биол.', 'биохим', 'бө', 'б. э. д.', 'бта', 'бұұ', 'вич', 'всоонл', 'геогр.', 'геол.', 'гленкор', 'гэс', 'қк', 'км', 'г', 'млн', 'млрд', 'т', 'ғ. с.', 'ғ.', 'қ.', 'ғ.', 'дек.', 'днқ', 'дсұ', 'еақк', 'еқыұ', 'ембімұнайгаз', 'ео', 'еуразэқ', 'еуроодақ', 'еұу', 'ж.', 'ж.', 'жж.', 'жоо', 'жіө', 'жсдп', 'жшс', 'іім', 'инта', 'исаф', 'камаз', 'кгб', 'кеу', 'кг', 'км²', 'км²', 'км³', 'км³', 'кимеп', 'кср', 'ксро', 'кокп', 'кхдр', 'қазатомпром', 'қазкср', 'қазұу', 'қазмұнайгаз', 'қазпошта', 'қазтаг', 'қазұу', 'қкп', 'қмдб', 'қр', 'қхр', 'лат.', 'м²', 'м²', 'м³', 'м³', 'магатэ', 'май.', 'максам', 'мб', 'мвт', 'мемл', 'м', 'мсоп', 'мтк', 'мыс.', 'наса', 'нато', 'нквд', 'нояб.', 'обл.', 'огпу', 'окт.', 'оңт.', 'опек', 'оеб', 'өзенмұнайгаз', 'өф', 'пәк', 'пед.', 'ркфср', 'рнқ', 'рсфср', 'рф', 'свс', 'сву', 'сду', 'сес', 'сент.', 'см', 'снпс', 'солт.', 'солт.', 'сооно', 'ссро', 'сср', 'ссср', 'ссс', 'сэс', 'дк', 'т. б.', 'т', 'тв', 'тереңд.', 'тех.', 'тжқ', 'тмд', 'төм.', 'трлн', 'тр', 'т.', 'и.', 'м.', 'с.', 'ш.', 'т.', 'т. с. 
с.', 'тэц', 'уаз', 'уефа', 'еқыұ', 'ұқк', 'ұқшұ', 'февр.', 'фққ', 'фсб', 'хим.', 'хқко', 'шұар', 'шыұ', 'экон.', 'экспо', 'цтп', 'цас', 'янв.', 'dvd', 'жкт', 'ққс', 'км', 'ацат', 'юнеско', 'ббс', 'mgm', 'жск', 'зоо', 'бсн', 'өұқ', 'оар', 'боак', 'эөкк', 'хтқо', 'әөк', 'жэк', 'хдо', 'спбму', 'аф', 'сбд', 'амт', 'гсдп', 'гсбп', 'эыдұ', 'нұсжп', 'шыұ', 'жтсх', 'хдп', 'эқк', 'фкққ', 'пиқ', 'өгк', 'мбф', 'маж', 'кота', 'тж', 'ук', 'обб', 'сбл', 'жхл', 'кмс', 'бмтрк', 'жққ', 'бхооо', 'мқо', 'ржмб', 'гулаг', 'жко', 'еэы', 'еаэы', 'кхдр', 'рфкп', 'рлдп', 'хвқ', 'мр', 'мт', 'кту', 'ртж', 'тим', 'мемдум', 'ксро', 'т.с.с', 'с.ш.', 'ш.б.', 'б.б.', 'руб', 'мин', 'акад.', 'ғ.', 'мм', 'мм.'] 34 | PREPOSITIVE_ABBREVIATIONS = [] 35 | NUMBER_ABBREVIATIONS = [] 36 | 37 | class AbbreviationReplacer(AbbreviationReplacer): 38 | 39 | SENTENCE_STARTERS = [] 40 | 41 | def __init__(self, text, lang): 42 | super().__init__(text, lang) 43 | 44 | def replace(self): 45 | SingleUpperCaseCyrillicLetterAtStartOfLineRule = Rule(r'(?<=^[А-ЯЁ])\.(?=\s)', '∯') 46 | SingleUpperCaseCyrillicLetterRule = Rule(r'(?<=\s[А-ЯЁ])\.(?=\s)', '∯') 47 | self.text = Text(self.text).apply(SingleUpperCaseCyrillicLetterAtStartOfLineRule, 48 | SingleUpperCaseCyrillicLetterRule) 49 | self.replace_multi_period_abbreviations() 50 | return self.text 51 | -------------------------------------------------------------------------------- /pysbd/lang/marathi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Grammer rules from https://gopract.com/Pages/Marathi-Grammar-Viramchinah.aspx 3 | from pysbd.abbreviation_replacer import AbbreviationReplacer 4 | from pysbd.lang.common import Common, Standard 5 | 6 | class Marathi(Common, Standard): 7 | 8 | iso_code = 'mr' 9 | 10 | SENTENCE_BOUNDARY_REGEX = r'.*?[.!?]|.*?$' 11 | Punctuations = ['.', '!', '?'] 12 | 13 | class AbbreviationReplacer(AbbreviationReplacer): 14 | SENTENCE_STARTERS = [] 15 | 
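The Marathi class above drives segmentation with a single lazy `SENTENCE_BOUNDARY_REGEX`. As a rough sketch of how that pattern behaves on its own (simplified, with made-up sample text; the real `Segmenter` additionally runs the pre/post-processing pipeline from `processor.py`):

```python
import re

# Same shape as Marathi's SENTENCE_BOUNDARY_REGEX: lazily consume text up to
# the next terminal punctuation mark, or up to end-of-string if none remains.
SENTENCE_BOUNDARY_REGEX = r'.*?[.!?]|.*?$'

def naive_segment(text):
    # re.findall can emit a trailing empty match via the `.*?$` branch;
    # drop empties and trim the inter-sentence whitespace.
    return [s.strip() for s in re.findall(SENTENCE_BOUNDARY_REGEX, text) if s.strip()]

print(naive_segment("Ha. Ha! Ha?"))  # ['Ha.', 'Ha!', 'Ha?']
```

In normal use this logic is reached through the public API, e.g. `pysbd.Segmenter(language="mr")`, which resolves `'mr'` to this class via `LANGUAGE_CODES` in `languages.py`.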
-------------------------------------------------------------------------------- /pysbd/lang/persian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from pysbd.abbreviation_replacer import AbbreviationReplacer 5 | from pysbd.lang.common import Common, Standard 6 | from pysbd.utils import Rule 7 | 8 | class Persian(Common, Standard): 9 | 10 | iso_code = 'fa' 11 | 12 | Punctuations = ['?', '!', ':', '.', '؟'] 13 | SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟]|.*?\Z|.*?$' 14 | 15 | # Rubular: http://rubular.com/r/RX5HpdDIyv 16 | ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭') 17 | 18 | # Rubular: http://rubular.com/r/kPRgApNHUg 19 | ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬') 20 | 21 | class AbbreviationReplacer(AbbreviationReplacer): 22 | 23 | SENTENCE_STARTERS = [] 24 | 25 | def __init__(self, text, lang): 26 | super().__init__(text, lang) 27 | 28 | def scan_for_replacements(self, txt, am, index, character_array): 29 | txt = re.sub(r'(?<={0})\.'.format(am), '∯', txt) 30 | return txt 31 | -------------------------------------------------------------------------------- /pysbd/lang/polish.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Polish(Common, Standard): 6 | 7 | iso_code = 'pl' 8 | 9 | class AbbreviationReplacer(AbbreviationReplacer): 10 | SENTENCE_STARTERS = [] 11 | 12 | class Abbreviation(Standard.Abbreviation): 13 | ABBREVIATIONS = ['ags', 'alb', 'ang', 'aor', 'awest', 'bałt', 'bojkow', 'bret', 'brus', 'bsł', 'bułg', 'c.b.d.o', 'c.b.d.u', 'celt', 'chorw', 'cs', 'czakaw', 'czerw', 'czes', 'dłuż', 'dniem', 'dor', 'dubrow', 'duń', 'ekaw', 'fiń', 'franc', 'gal', 'germ', 'głuż', 'gniem', 'goc', 'gr', 'grudz', 'hebr', 'het', 'hol', 'I cont', 'ie', 'ikaw', 
'irań', 'irl', 'islandz', 'itd', 'itd.', 'itp', 'jekaw', 'kajkaw', 'kasz', 'kirg', 'kwiec', 'łac', 'lip', 'listop', 'lit', 'łot', 'lp', 'maced', 'mar', 'młpol', 'moraw', 'n.e', 'nb.', 'ngr', 'niem', 'nord', 'norw', 'np', 'np.', 'ok.', 'orm', 'oset', 'osk', 'p.n', 'p.n.e', 'p.o', 'pazdz', 'pers', 'pie', 'pod red.', 'podhal', 'pol', 'połab', 'port', 'prekm', 'pskow', 'psł', 'R cont', 'rez', 'rom', 'rozdz.', 'rum', 'rus', 'rys.', 'sas', 'sch', 'scs', 'serb', 'sierp', 'śl', 'sła', 'słe', 'słi', 'słow', 'sp. z o.o', 'śrdniem', 'śrgniem', 'śrirl', 'stbułg', 'stind', 'stpol', 'stpr', 'str.', 'strus', 'stwniem', 'stycz', 'sztokaw', 'szwedz', 't.', 'tj.', 'tłum.', 'toch', 'tur', 'tzn', 'ukr', 'ul', 'umbr', 'wed', 'węg', 'wlkpol', 'włos', 'wrzes', 'wyd.', 'zakarp'] 14 | PREPOSITIVE_ABBREVIATIONS = [] 15 | NUMBER_ABBREVIATIONS = [] 16 | -------------------------------------------------------------------------------- /pysbd/lang/russian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from pysbd.abbreviation_replacer import AbbreviationReplacer 5 | from pysbd.lang.common import Common, Standard 6 | 7 | class Russian(Common, Standard): 8 | 9 | iso_code = 'ru' 10 | 11 | class Abbreviation(Standard.Abbreviation): 12 | ABBREVIATIONS = ["y", "y.e", "а", "авт", "адм.-терр", "акад", "в", "вв", "вкз", "вост.-европ", "г", "гг", "гос", "гр", "д", "деп", "дисс", "дол", "долл", "ежедн", "ж", "жен", "з", "зап", "зап.-европ", "заруб", "и", "ин", "иностр", "инст", "к", "канд", "кв", "кг", "куб", "л", "л.h", "л.н", "м", "мин", "моск", "муж", "н", "нед", "о", "п", "пгт", "пер", "пп", "пр", "просп", "проф", "р", "руб", "с", "сек", "см", "спб", "стр", "т", "тел", "тов", "тт", "тыс", "у", "у.е", "ул", "ф", "ч"] 13 | PREPOSITIVE_ABBREVIATIONS = [] 14 | NUMBER_ABBREVIATIONS = [] 15 | 16 | class AbbreviationReplacer(AbbreviationReplacer): 17 | 18 | SENTENCE_STARTERS = [] 19 | 20 | def __init__(self, text, lang): 21 
| super().__init__(text, lang) 22 | 23 | def replace_period_of_abbr(self, txt, abbr): 24 | txt = re.sub(r'(?<=\s{abbr})\.'.format(abbr=abbr.strip()), '∯', txt) 25 | txt = re.sub(r'(?<=\A{abbr})\.'.format(abbr=abbr.strip()), '∯', txt) 26 | txt = re.sub(r'(?<=^{abbr})\.'.format(abbr=abbr.strip()), '∯', txt) 27 | return txt 28 | -------------------------------------------------------------------------------- /pysbd/lang/slovak.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.abbreviation_replacer import AbbreviationReplacer 4 | from pysbd.between_punctuation import BetweenPunctuation 5 | from pysbd.lang.common import Common, Standard 6 | from pysbd.processor import Processor 7 | from pysbd.utils import Text 8 | from pysbd.punctuation_replacer import replace_punctuation 9 | from pysbd.lists_item_replacer import ListItemReplacer 10 | 11 | 12 | class Slovak(Common, Standard): 13 | 14 | iso_code = 'sk' 15 | 16 | class ListItemReplacer(ListItemReplacer): 17 | 18 | def add_line_break(self): 19 | # We've found alphabetical lists are causing a lot of problems with abbreviations 20 | # with multiple periods and spaces, such as 'Company name s. r. o.'. Disabling 21 | # alphabetical list parsing seems like a reasonable tradeoff. 22 | 23 | # self.format_alphabetical_lists() 24 | self.format_roman_numeral_lists() 25 | self.format_numbered_list_with_periods() 26 | self.format_numbered_list_with_parens() 27 | return self.text 28 | 29 | class AbbreviationReplacer(AbbreviationReplacer): 30 | SENTENCE_STARTERS = [] 31 | 32 | def replace_period_of_abbr(self, txt, abbr): 33 | # This is a very simple version of the original function, which makes sure 34 | # all of the periods in the abbreviation get replaced, not only the last one. 35 | # In Slovak language we use a lot of abbreviations like 'Company Name s. r. o.', so it 36 | # is important to handle this properly. 
37 | 38 | abbr_new = abbr.replace(".", "∯") + "∯" 39 | txt = txt.replace(abbr + ".", abbr_new) 40 | return txt 41 | 42 | class Abbreviation(Standard.Abbreviation): 43 | ABBREVIATIONS = ['č', 'no', 'nr', 's. r. o', 'ing', 'p', 'a. d', 'o. k', 'pol. pr', 'a. s. a. p', 'p. n. l', 'red', 'o.k', 'a.d', 'm.o', 'pol.pr', 'a.s.a.p', 'p.n.l', 'pp', 'sl', 'corp', 'plgr', 'tz', 'rtg', 'o.c.p', 'o. c. p', 'c.k', 'c. k', 'n.a', 'n. a', 'a.m', 'a. m', 'vz', 'i.b', 'i. b', 'ú.p.v.o', 'ú. p. v. o', 'bros', 'rsdr', 'doc', 'tu', 'ods', 'n.w.a', 'n. w. a', 'nár', 'pedg', 'paeddr', 'rndr', 'naprk', 'a.g.p', 'a. g. p', 'prof', 'pr', 'a.v', 'a. v', 'por', 'mvdr', 'nešp', 'u.s', 'u. s', 'kt', 'vyd', 'e.t', 'e. t', 'al', 'll.m', 'll. m', 'o.f.i', 'o. f. i', 'mr', 'apod', 'súkr', 'stred', 's.e.g', 's. e. g', 'sr', 'tvz', 'ind', 'var', 'etc', 'atd', 'n.o', 'n. o', 's.a', 's. a', 'např', 'a.i.i', 'a. i. i', 'a.k.a', 'a. k. a', 'konkr', 'čsl', 'odd', 'ltd', 't.z', 't. z', 'o.z', 'o. z', 'obv', 'obr', 'pok', 'tel', 'št', 'skr', 'phdr', 'xx', 'š.p', 'š. p', 'ph.d', 'ph. d', 'm.n.m', 'm. n. m', 'zz', 'roz', 'atď.', 'ev', 'v.sp', 'v. sp', 'drsc', 'mudr', 't.č', 't. č', 'el', 'os', 'co', 'r.o', 'r. o', 'str', 'p.a', 'p. a', 'zdravot', 'prek', 'gen', 'viď', 'dr', 'cca', 'p.s', 'p. s', 'zák', 'slov', 'arm', 'inc', 'max', 'd.c', 'k.o', 'a. r. k', 'd. c', 'k. o', 'a. r. k', 'soc', 'bc', 'zs', 'akad', 'sz', 'pozn', 'tr', 'nám', 'kol', 'csc', 'ul', 'sp', 'o.i', 'jr', 'zb', 'sv', 'tj', 'čs', 'tzn', 'príp', 'iv', 'hl', 'st', 'pod', 'vi', 'tis', 'stor', 'rozh', 'mld', 'atď', 'mgr', 'a.s', 'a. s', 'phd', 'z.z', 'z. z', 'judr', 'ing', 'hod', 'vs', 'písm', 's.r.o', 'min', 'ml', 'iii', 't.j', 't. 
j', 'spol', 'mil', 'ii', 'napr', 'resp', 'tzv'] 44 | PREPOSITIVE_ABBREVIATIONS = ['st', 'p', 'dr', 'mudr', 'judr', 'ing', 'mgr', 'bc', 'drsc', 'doc', 'prof'] 45 | NUMBER_ABBREVIATIONS = ['č', 'no', 'nr'] 46 | 47 | class BetweenPunctuation(BetweenPunctuation): 48 | # Rubular: https://rubular.com/r/rImWbaYFtHHtf4 (kept for reference: Python's re does not support the (?>...) atomic group, so the _2 variant below is the one actually used) 49 | BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX = r'„(?>[^“\\]+|\\{2}|\\.)*“' 50 | BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX_2 = r'\„(?=(?P<tmp>[^“\\]+|\\{2}|\\.)*)(?P=tmp)\“' 51 | 52 | def sub_punctuation_between_slovak_double_quotes(self, txt): 53 | return re.sub(self.BETWEEN_SLOVAK_DOUBLE_QUOTES_REGEX_2, replace_punctuation, txt) 54 | 55 | def sub_punctuation_between_quotes_and_parens(self, txt): 56 | txt = self.sub_punctuation_between_single_quotes(txt) 57 | txt = self.sub_punctuation_between_single_quote_slanted(txt) 58 | txt = self.sub_punctuation_between_double_quotes(txt) 59 | txt = self.sub_punctuation_between_square_brackets(txt) 60 | txt = self.sub_punctuation_between_parens(txt) 61 | txt = self.sub_punctuation_between_quotes_arrow(txt) 62 | txt = self.sub_punctuation_between_em_dashes(txt) 63 | txt = self.sub_punctuation_between_quotes_slanted(txt) 64 | txt = self.sub_punctuation_between_slovak_double_quotes(txt) 65 | return txt 66 | 67 | class Processor(Processor): 68 | 69 | def __init__(self, text, lang, char_span=False): 70 | super().__init__(text, lang, char_span) 71 | 72 | def process(self): 73 | if not self.text: 74 | return self.text 75 | self.text = self.text.replace('\n', '\r') 76 | 77 | # Here we use the language-specific ListItemReplacer: 78 | li = self.lang.ListItemReplacer(self.text) 79 | self.text = li.add_line_break() 80 | 81 | self.replace_abbreviations() 82 | self.replace_numbers() 83 | self.replace_continuous_punctuation() 84 | self.replace_periods_before_numeric_references() 85 | self.text = Text(self.text).apply( 86 | self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule, 87 | self.lang.GeoLocationRule, self.lang.FileFormatRule) 88 | 
postprocessed_sents = self.split_into_segments() 89 | return postprocessed_sents 90 | 91 | def replace_numbers(self): 92 | self.text = Text(self.text).apply(*self.lang.Numbers.All) 93 | self.replace_period_in_slovak_dates() 94 | self.replace_period_in_ordinal_numerals() 95 | self.replace_period_in_roman_numerals() 96 | return self.text 97 | 98 | def replace_period_in_ordinal_numerals(self): 99 | # Rubular: https://rubular.com/r/0HkmvzMGTqgWs6 100 | self.text = re.sub(r'(?<=\d)\.(?=\s*[a-z]+)', '∯', self.text) 101 | 102 | def replace_period_in_roman_numerals(self): 103 | # Rubular: https://rubular.com/r/XlzTIi7aBRThSl 104 | # NOTE: flags must be passed by keyword; the fourth positional argument of re.sub is `count` 105 | self.text = re.sub(r'((\s+[VXI]+)|(^[VXI]+))(\.)(?=\s+)', r'\1∯', self.text, flags=re.IGNORECASE) 106 | 107 | def replace_period_in_slovak_dates(self): 108 | MONTHS = ['Január', 'Február', 'Marec', 'Apríl', 'Máj', 'Jún', 'Júl', 'August', 'September', 'Október', 'November', 'December', 109 | 'Januára', 'Februára', 'Marca', 'Apríla', 'Mája', 'Júna', 'Júla', 'Augusta', 'Septembra', 'Októbra', 'Novembra', 'Decembra'] 110 | for month in MONTHS: 111 | # Rubular: https://rubular.com/r/dGLZqsbjcdJvCd 112 | self.text = re.sub(r'(?<=\d)\.(?=\s*{month})'.format(month=month), '∯', self.text) 113 | -------------------------------------------------------------------------------- /pysbd/lang/spanish.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Spanish(Common, Standard): 6 | 7 | iso_code = 'es' 8 | 9 | class AbbreviationReplacer(AbbreviationReplacer): 10 | SENTENCE_STARTERS = [] 11 | 12 | class Abbreviation(Standard.Abbreviation): 13 | ABBREVIATIONS = ['a.c', 'a/c', 'abr', 'adj', 'admón', 'afmo', 'ago', 'almte', 'ap', 'apdo', 'arq', 'art', 'atte', 'av', 'avda', 'bco', 'bibl', 'bs. 
as', 'c', 'c.f', 'c.g', 'c/c', 'c/u', 'cap', 'cc.aa', 'cdad', 'cm', 'co', 'cra', 'cta', 'cv', 'd.e.p', 'da', 'dcha', 'dcho', 'dep', 'dic', 'dicc', 'dir', 'dn', 'doc', 'dom', 'dpto', 'dr', 'dra', 'dto', 'ee', 'ej', 'en', 'entlo', 'esq', 'etc', 'excmo', 'ext', 'f.c', 'fca', 'fdo', 'febr', 'ff. aa', 'ff.cc', 'fig', 'fil', 'fra', 'g.p', 'g/p', 'gob', 'gr', 'gral', 'grs', 'hnos', 'hs', 'igl', 'iltre', 'imp', 'impr', 'impto', 'incl', 'ing', 'inst', 'izdo', 'izq', 'izqdo', 'j.c', 'jue', 'jul', 'jun', 'kg', 'km', 'lcdo', 'ldo', 'let', 'lic', 'ltd', 'lun', 'mar', 'may', 'mg', 'min', 'mié', 'mm', 'máx', 'mín', 'mt', 'n. del t', 'n.b', 'no', 'nov', 'ntra. sra', 'núm', 'oct', 'p', 'p.a', 'p.d', 'p.ej', 'p.v.p', 'párrf', 'ppal', 'prev', 'prof', 'prov', 'ptas', 'pts', 'pza', 'pág', 'págs', 'párr', 'q.e.g.e', 'q.e.p.d', 'q.e.s.m', 'reg', 'rep', 'rr. hh', 'rte', 's', 's. a', 's.a.r', 's.e', 's.l', 's.r.c', 's.r.l', 's.s.s', 's/n', 'sdad', 'seg', 'sept', 'sig', 'sr', 'sra', 'sres', 'srta', 'sta', 'sto', 'sáb', 't.v.e', 'tamb', 'tel', 'tfno', 'ud', 'uu', 'uds', 'univ', 'v.b', 'v.e', 'vd', 'vds', 'vid', 'vie', 'vol', 'vs', 'vto', 'a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta'] 14 | PREPOSITIVE_ABBREVIATIONS = ['a', 'aero', 'ambi', 'an', 'anfi', 'ante', 'anti', 'archi', 'arci', 'auto', 'bi', 'bien', 'bis', 'co', 'com', 'con', 'contra', 'crio', 'cuadri', 'cuasi', 'cuatri', 'de', 
'deci', 'des', 'di', 'dis', 'dr', 'ecto', 'ee', 'en', 'endo', 'entre', 'epi', 'equi', 'ex', 'extra', 'geo', 'hemi', 'hetero', 'hiper', 'hipo', 'homo', 'i', 'im', 'in', 'infra', 'inter', 'intra', 'iso', 'lic', 'macro', 'mega', 'micro', 'mini', 'mono', 'mt', 'multi', 'neo', 'omni', 'para', 'pen', 'ph', 'ph.d', 'pluri', 'poli', 'pos', 'post', 'pre', 'pro', 'prof', 'pseudo', 're', 'retro', 'semi', 'seudo', 'sobre', 'sub', 'super', 'supra', 'sra', 'srta', 'trans', 'tras', 'tri', 'ulter', 'ultra', 'un', 'uni', 'vice', 'yuxta'] 15 | NUMBER_ABBREVIATIONS = ['cra', 'ext', 'no', 'nos', 'p', 'pp', 'tel'] 16 | -------------------------------------------------------------------------------- /pysbd/lang/urdu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.abbreviation_replacer import AbbreviationReplacer 3 | from pysbd.lang.common import Common, Standard 4 | 5 | class Urdu(Common, Standard): 6 | 7 | iso_code = 'ur' 8 | 9 | SENTENCE_BOUNDARY_REGEX = r'.*?[۔؟!\?]|.*?$' 10 | Punctuations = ['?', '!', '۔', '؟'] 11 | 12 | class AbbreviationReplacer(AbbreviationReplacer): 13 | SENTENCE_STARTERS = [] 14 | -------------------------------------------------------------------------------- /pysbd/languages.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pysbd.lang.english import English 3 | from pysbd.lang.hindi import Hindi 4 | from pysbd.lang.marathi import Marathi 5 | from pysbd.lang.chinese import Chinese 6 | from pysbd.lang.spanish import Spanish 7 | from pysbd.lang.amharic import Amharic 8 | from pysbd.lang.arabic import Arabic 9 | from pysbd.lang.armenian import Armenian 10 | from pysbd.lang.bulgarian import Bulgarian 11 | from pysbd.lang.urdu import Urdu 12 | from pysbd.lang.russian import Russian 13 | from pysbd.lang.polish import Polish 14 | from pysbd.lang.persian import Persian 15 | from pysbd.lang.dutch import Dutch 16 | from 
pysbd.lang.danish import Danish 17 | from pysbd.lang.french import French 18 | from pysbd.lang.burmese import Burmese 19 | from pysbd.lang.greek import Greek 20 | from pysbd.lang.italian import Italian 21 | from pysbd.lang.japanese import Japanese 22 | from pysbd.lang.deutsch import Deutsch 23 | from pysbd.lang.kazakh import Kazakh 24 | from pysbd.lang.slovak import Slovak 25 | 26 | LANGUAGE_CODES = { 27 | 'en': English, 28 | 'hi': Hindi, 29 | 'mr': Marathi, 30 | 'zh': Chinese, 31 | 'es': Spanish, 32 | 'am': Amharic, 33 | 'ar': Arabic, 34 | 'hy': Armenian, 35 | 'bg': Bulgarian, 36 | 'ur': Urdu, 37 | 'ru': Russian, 38 | 'pl': Polish, 39 | 'fa': Persian, 40 | 'nl': Dutch, 41 | 'da': Danish, 42 | 'fr': French, 43 | 'my': Burmese, 44 | 'el': Greek, 45 | 'it': Italian, 46 | 'ja': Japanese, 47 | 'de': Deutsch, 48 | 'kk': Kazakh, 49 | 'sk': Slovak 50 | } 51 | 52 | 53 | class Language(object): 54 | 55 | def __init__(self, code): 56 | self.code = code 57 | 58 | @classmethod 59 | def get_language_code(cls, code): 60 | try: 61 | return LANGUAGE_CODES[code] 62 | except KeyError: 63 | raise ValueError("Provide valid language ID i.e. ISO code. 
" 64 | "Available codes are : {}".format(set(LANGUAGE_CODES.keys()))) 65 | -------------------------------------------------------------------------------- /pysbd/lists_item_replacer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import string 3 | import re 4 | from pysbd.utils import Rule, Text 5 | from functools import partial 6 | 7 | 8 | class ListItemReplacer(object): 9 | 10 | ROMAN_NUMERALS = "i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx".split(' ') 11 | LATIN_NUMERALS = list(string.ascii_lowercase) 12 | 13 | # Rubular: http://rubular.com/r/XcpaJKH0sz 14 | ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)' 15 | 16 | # Rubular: http://rubular.com/r/Gu5rQapywf 17 | # TODO: Make sure below regex call is case-insensitive 18 | ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))' 19 | 20 | # (pattern, replacement) 21 | SubstituteListPeriodRule = Rule('♨', '∯') 22 | ListMarkerRule = Rule('☝', '') 23 | 24 | # Rubular: http://rubular.com/r/Wv4qLdoPx7 25 | # https://regex101.com/r/62YBlv/1 26 | SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r") 27 | 28 | # Rubular: http://rubular.com/r/AizHXC6HxK 29 | # https://regex101.com/r/62YBlv/2 30 | SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r") 31 | 32 | # Rubular: http://rubular.com/r/GE5q6yID2j 33 | # https://regex101.com/r/62YBlv/3 34 | SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r") 35 | 36 | NUMBERED_LIST_REGEX_1 = r'\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))' 37 | # 1. abcd 38 | # 2. 
xyz 39 | NUMBERED_LIST_REGEX_2 = r'(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))' 40 | # 1) abcd 41 | # 2) xyz 42 | NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)' 43 | 44 | # Rubular: http://rubular.com/r/NsNFSqrNvJ 45 | # TODO: Make sure below regex call is case-insensitive 46 | EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))' 47 | 48 | # Rubular: http://rubular.com/r/wMpnVedEIb 49 | # TODO: Make sure below regex call is case-insensitive 50 | ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.' 51 | 52 | # Rubular: http://rubular.com/r/GcnmQt4a3I 53 | ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])' 54 | 55 | def __init__(self, text): 56 | self.text = text 57 | 58 | def add_line_break(self): 59 | self.format_alphabetical_lists() 60 | self.format_roman_numeral_lists() 61 | self.format_numbered_list_with_periods() 62 | self.format_numbered_list_with_parens() 63 | return self.text 64 | 65 | def replace_parens(self): 66 | text = re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES, 67 | r'&✂&\1&⌬&', self.text) 68 | return text 69 | 70 | def format_numbered_list_with_parens(self): 71 | self.replace_parens_in_numbered_list() 72 | self.add_line_breaks_for_numbered_list_with_parens() 73 | self.text = Text(self.text).apply(self.ListMarkerRule) 74 | 75 | def replace_periods_in_numbered_list(self): 76 | self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2, 77 | '♨', strip=True) 78 | 79 | def format_numbered_list_with_periods(self): 80 | self.replace_periods_in_numbered_list() 81 | self.add_line_breaks_for_numbered_list_with_periods() 82 | self.text = 
Text(self.text).apply(self.SubstituteListPeriodRule) 83 | 84 | def format_alphabetical_lists(self): 85 | self.txt = self.add_line_breaks_for_alphabetical_list_with_periods( 86 | roman_numeral=False) 87 | self.txt = self.add_line_breaks_for_alphabetical_list_with_parens( 88 | roman_numeral=False) 89 | return self.txt 90 | 91 | def format_roman_numeral_lists(self): 92 | self.txt = self.add_line_breaks_for_alphabetical_list_with_periods( 93 | roman_numeral=True) 94 | self.txt = self.add_line_breaks_for_alphabetical_list_with_parens( 95 | roman_numeral=True) 96 | return self.txt 97 | 98 | def add_line_breaks_for_alphabetical_list_with_periods( 99 | self, roman_numeral=False): 100 | txt = self.iterate_alphabet_array( 101 | self.ALPHABETICAL_LIST_WITH_PERIODS, 102 | roman_numeral=roman_numeral) 103 | return txt 104 | 105 | def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False): 106 | txt = self.iterate_alphabet_array( 107 | self.ALPHABETICAL_LIST_WITH_PARENS, 108 | parens=True, 109 | roman_numeral=roman_numeral) 110 | return txt 111 | 112 | def scan_lists(self, regex1, regex2, replacement, strip=False): 113 | list_array = re.findall(regex1, self.text) 114 | list_array = list(map(int, list_array)) 115 | for ind, item in enumerate(list_array): 116 | # to avoid IndexError 117 | # ruby returns nil if index is out of range 118 | if (ind < len(list_array) - 1 and item + 1 == list_array[ind + 1]): 119 | self.substitute_found_list_items(regex2, item, strip, replacement) 120 | elif ind > 0: 121 | if (((item - 1) == list_array[ind - 1]) or 122 | ((item == 0) and (list_array[ind - 1] == 9)) or 123 | ((item == 9) and (list_array[ind - 1] == 0))): 124 | self.substitute_found_list_items(regex2, item, strip, replacement) 125 | 126 | def substitute_found_list_items(self, regex, each, strip, replacement): 127 | 128 | def replace_item(match, val=None, strip=False, repl='♨'): 129 | match = match.group() 130 | if strip: 131 | match = str(match).strip() 132 | 
chomped_match = match if len(match) == 1 else match.strip('.])') 133 | if str(each) == chomped_match: 134 | return "{}{}".format(each, replacement) 135 | else: 136 | return str(match) 137 | 138 | self.text = re.sub(regex, partial(replace_item, val=each, 139 | strip=strip, repl=replacement), self.text) 140 | 141 | def add_line_breaks_for_numbered_list_with_periods(self): 142 | if ('♨' in self.text) and (not re.search( 143 | '♨.+(\n|\r).+♨', self.text)) and (not re.search( 144 | r'for\s\d{1,2}♨\s[a-z]', self.text)): 145 | self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule, 146 | self.SpaceBetweenListItemsSecondRule) 147 | 148 | def replace_parens_in_numbered_list(self): 149 | self.scan_lists( 150 | self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝') 151 | self.scan_lists(self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝') 152 | 153 | def add_line_breaks_for_numbered_list_with_parens(self): 154 | if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text): 155 | self.text = Text(self.text).apply( 156 | self.SpaceBetweenListItemsThirdRule) 157 | 158 | def replace_alphabet_list(self, a): 159 | """ 160 | Input: 'a. ffegnog b. fgegkl c.' 
161 | Output: \ra∯ ffegnog \rb∯ fgegkl \rc∯ 162 | """ 163 | 164 | def replace_letter_period(match, val=None): 165 | match = match.group() 166 | match_wo_period = match.strip('.') 167 | if match_wo_period == val: 168 | return '\r{}∯'.format(match_wo_period) 169 | else: 170 | return match 171 | 172 | txt = re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX, 173 | partial(replace_letter_period, val=a), 174 | self.text, flags=re.IGNORECASE) 175 | return txt 176 | 177 | def replace_alphabet_list_parens(self, a): 178 | """ 179 | Input: "a) ffegnog (b) fgegkl c)" 180 | Output: "\ra) ffegnog \r&✂&b) fgegkl \rc)" 181 | """ 182 | 183 | def replace_alphabet_paren(match, val=None): 184 | match = match.group() 185 | if '(' in match: 186 | match_wo_paren = match.strip('(') 187 | if match_wo_paren == val: 188 | return '\r&✂&{}'.format(match_wo_paren) 189 | else: 190 | return match 191 | else: 192 | if match == val: 193 | return '\r{}'.format(match) 194 | else: 195 | return match 196 | 197 | # Make it cases-insensitive 198 | txt = re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX, 199 | partial(replace_alphabet_paren, val=a), 200 | self.text, flags=re.IGNORECASE) 201 | return txt 202 | 203 | def replace_correct_alphabet_list(self, a, parens): 204 | if parens: 205 | a = self.replace_alphabet_list_parens(a) 206 | else: 207 | a = self.replace_alphabet_list(a) 208 | return a 209 | 210 | def last_array_item_replacement(self, a, i, alphabet, list_array, parens): 211 | if (len(alphabet) == 0) & (len(list_array) == 0) or ( 212 | list_array[i - 1] not in alphabet) or (a not in alphabet): 213 | return self.text 214 | if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1: 215 | return self.text 216 | result = self.replace_correct_alphabet_list(a, parens) 217 | return result 218 | 219 | def other_items_replacement(self, a, i, alphabet, list_array, parens): 220 | if (len(alphabet) == 0) & (len(list_array) == 0) or ( 221 | list_array[i - 1] not in alphabet) or (a not in 
alphabet) or ( 222 | list_array[i + 1] not in alphabet): 223 | return self.text 224 | if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \ 225 | abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1: 226 | return self.text 227 | result = self.replace_correct_alphabet_list(a, parens) 228 | return result 229 | 230 | def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False): 231 | list_array = re.findall(regex, self.text) 232 | alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS 233 | list_array = [i for i in list_array if i in alphabet] 234 | for ind, each in enumerate(list_array): 235 | if ind == len(list_array) - 1: 236 | self.text = self.last_array_item_replacement(each, ind, alphabet, list_array, parens) 237 | else: 238 | self.text = self.other_items_replacement( 239 | each, ind, alphabet, list_array, parens) 240 | return self.text 241 | -------------------------------------------------------------------------------- /pysbd/processor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.utils import Text 4 | from pysbd.lists_item_replacer import ListItemReplacer 5 | from pysbd.exclamation_words import ExclamationWords 6 | from pysbd.between_punctuation import BetweenPunctuation 7 | from pysbd.abbreviation_replacer import AbbreviationReplacer 8 | 9 | class Processor(object): 10 | 11 | def __init__(self, text, lang, char_span=False): 12 | """Process a text - do pre and post processing - to get proper sentences 13 | 14 | Parameters 15 | ---------- 16 | text : str 17 | Original text 18 | lang : object 19 | Language module 20 | char_span : bool, optional 21 | Get start & end character offsets of each sentence 22 | within original text, by default False 23 | """ 24 | self.text = text 25 | self.lang = lang 26 | self.char_span = char_span 27 | 28 | def process(self): 29 | if not self.text: 30 | return self.text 31 | 
self.text = self.text.replace('\n', '\r') 32 | li = ListItemReplacer(self.text) 33 | self.text = li.add_line_break() 34 | self.replace_abbreviations() 35 | self.replace_numbers() 36 | self.replace_continuous_punctuation() 37 | self.replace_periods_before_numeric_references() 38 | self.text = Text(self.text).apply( 39 | self.lang.Abbreviation.WithMultiplePeriodsAndEmailRule, 40 | self.lang.GeoLocationRule, self.lang.FileFormatRule) 41 | postprocessed_sents = self.split_into_segments() 42 | return postprocessed_sents 43 | 44 | def rm_none_flatten(self, sents): 45 | """Remove None values and unpack list of list sents 46 | 47 | Parameters 48 | ---------- 49 | sents : list 50 | list of sentences 51 | 52 | Returns 53 | ------- 54 | list 55 | unpacked and None removed list of sents 56 | """ 57 | sents = list(filter(None, sents)) 58 | if not any(isinstance(s, list) for s in sents): 59 | return sents 60 | new_sents = [] 61 | for sent in sents: 62 | if isinstance(sent, list): 63 | for s in sent: 64 | new_sents.append(s) 65 | else: 66 | new_sents.append(sent) 67 | return new_sents 68 | 69 | def split_into_segments(self): 70 | self.check_for_parens_between_quotes() 71 | sents = self.text.split('\r') 72 | # remove empty and none values 73 | sents = self.rm_none_flatten(sents) 74 | sents = [ 75 | Text(s).apply(self.lang.SingleNewLineRule, *self.lang.EllipsisRules.All) 76 | for s in sents 77 | ] 78 | sents = [self.check_for_punctuation(s) for s in sents] 79 | # flatten list of list of sentences 80 | sents = self.rm_none_flatten(sents) 81 | postprocessed_sents = [] 82 | for sent in sents: 83 | sent = Text(sent).apply(*self.lang.SubSymbolsRules.All) 84 | post_process_sent = self.post_process_segments(sent) 85 | if post_process_sent and isinstance(post_process_sent, str): 86 | postprocessed_sents.append(post_process_sent) 87 | elif isinstance(post_process_sent, list): 88 | for pps in post_process_sent: 89 | postprocessed_sents.append(pps) 90 | postprocessed_sents = 
[Text(ns).apply(self.lang.SubSingleQuoteRule) 91 | for ns in postprocessed_sents] 92 | return postprocessed_sents 93 | 94 | def post_process_segments(self, txt): 95 | if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt): 96 | return txt 97 | 98 | # below condition is present in pragmatic segmenter; 99 | # don't know its significance yet. 100 | # if self.consecutive_underscore(txt) or len(txt) < 2: 101 | # return txt 102 | 103 | if re.match(r'\t', txt): 104 | pass 105 | 106 | # TODO: 107 | # Decide on keeping or removing Standard.ExtraWhiteSpaceRule 108 | # removed to retain original text spans 109 | # txt = Text(txt).apply(*ReinsertEllipsisRules.All, 110 | # Standard.ExtraWhiteSpaceRule) 111 | txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All) 112 | if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt): 113 | txt = re.split( 114 | self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt) 115 | return txt 116 | else: 117 | txt = txt.replace('\n', '') 118 | return txt.strip() 119 | 120 | def check_for_parens_between_quotes(self): 121 | def paren_replace(match): 122 | match = match.group() 123 | sub1 = re.sub(r'\s(?=\()', '\r', match) 124 | sub2 = re.sub(r'(?<=\))\s', '\r', sub1) 125 | return sub2 126 | self.text = re.sub(self.lang.PARENS_BETWEEN_DOUBLE_QUOTES_REGEX, 127 | paren_replace, self.text) 128 | 129 | def replace_continuous_punctuation(self): 130 | def continuous_puncs_replace(match): 131 | match = match.group() 132 | sub1 = re.sub(re.escape('!'), '&ᓴ&', match) 133 | sub2 = re.sub(re.escape('?'), '&ᓷ&', sub1) 134 | return sub2 135 | self.text = re.sub(self.lang.CONTINUOUS_PUNCTUATION_REGEX, 136 | continuous_puncs_replace, self.text) 137 | 138 | def replace_periods_before_numeric_references(self): 139 | # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703 140 | self.text = re.sub(self.lang.NUMBERED_REFERENCE_REGEX, 141 | r"∯\2\r\7", self.text) 142 | 143 | def consecutive_underscore(self, txt): 
144 | # Rubular: http://rubular.com/r/fTF2Ff3WBL 145 | txt = re.sub(r'_{3,}', '', txt) 146 | return len(txt) == 0 147 | 148 | def check_for_punctuation(self, txt): 149 | if any(p in txt for p in self.lang.Punctuations): 150 | sents = self.process_text(txt) 151 | return sents 152 | else: 153 | # NOTE: subsequent steps of check_for_punctuation will unpack this list 154 | return [txt] 155 | 156 | def process_text(self, txt): 157 | if txt[-1] not in self.lang.Punctuations: 158 | txt += 'ȸ' 159 | txt = ExclamationWords.apply_rules(txt) 160 | txt = self.between_punctuation(txt) 161 | # handle text consisting only of double punctuation 162 | if not re.match(self.lang.DoublePunctuationRules.DoublePunctuation, txt): 163 | txt = Text(txt).apply(*self.lang.DoublePunctuationRules.All) 164 | txt = Text(txt).apply(self.lang.QuestionMarkInQuotationRule, 165 | *self.lang.ExclamationPointRules.All) 166 | txt = ListItemReplacer(txt).replace_parens() 167 | txt = self.sentence_boundary_punctuation(txt) 168 | return txt 169 | 170 | def replace_numbers(self): 171 | self.text = Text(self.text).apply(*self.lang.Numbers.All) 172 | 173 | def abbreviations_replacer(self): 174 | if hasattr(self.lang, "AbbreviationReplacer"): 175 | return self.lang.AbbreviationReplacer(self.text, self.lang) 176 | else: 177 | return AbbreviationReplacer(self.text, self.lang) 178 | 179 | def replace_abbreviations(self): 180 | self.text = self.abbreviations_replacer().replace() 181 | 182 | def between_punctuation_processor(self, txt): 183 | if hasattr(self.lang, "BetweenPunctuation"): 184 | return self.lang.BetweenPunctuation(txt) 185 | else: 186 | return BetweenPunctuation(txt) 187 | 188 | def between_punctuation(self, txt): 189 | txt = self.between_punctuation_processor(txt).replace() 190 | return txt 191 | 192 | def sentence_boundary_punctuation(self, txt): 193 | if hasattr(self.lang, 'ReplaceColonBetweenNumbersRule'): 194 | txt = Text(txt).apply( 195 | self.lang.ReplaceColonBetweenNumbersRule) 196 | if hasattr(self.lang, 
'ReplaceNonSentenceBoundaryCommaRule'): 197 | txt = Text(txt).apply( 198 | self.lang.ReplaceNonSentenceBoundaryCommaRule) 199 | # retain exclamation mark if it is an ending character of a given text 200 | txt = re.sub(r'&ᓴ&$', '!', txt) 201 | txt = [ 202 | m.group() for m in re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt) 203 | ] 204 | return txt 205 | -------------------------------------------------------------------------------- /pysbd/punctuation_replacer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pysbd.utils import Rule, Text 4 | 5 | 6 | class EscapeRegexReservedCharacters(object): 7 | LeftParen = Rule(r'\(', '\\(') 8 | RightParen = Rule(r'\)', '\\)') 9 | # LeftParen = Rule(re.escape(r'('), '(') 10 | # RightParen = Rule(re.escape(r')'), ')') 11 | LeftBracket = Rule(r'\[', '\\[') 12 | RightBracket = Rule(r'\]', '\\]') 13 | Dash = Rule(r'\-', '\\-') 14 | 15 | All = [LeftParen, RightParen, LeftBracket, RightBracket, Dash] 16 | 17 | 18 | class SubEscapedRegexReservedCharacters(object): 19 | SubLeftParen = Rule(r'\\\(', '(') 20 | SubRightParen = Rule(r'\\\)', ')') 21 | # SubLeftParen = Rule(re.escape(r"\\("), "(") 22 | # SubRightParen = Rule(re.escape(r'\\)'), ')') 23 | SubLeftBracket = Rule(r'\\\[', '[') 24 | SubRightBracket = Rule(r'\\\]', ']') 25 | SubDash = Rule(r'\\\-', '-') 26 | 27 | All = [ 28 | SubLeftParen, SubRightParen, SubLeftBracket, SubRightBracket, SubDash 29 | ] 30 | 31 | 32 | def replace_punctuation(match, match_type=None): 33 | text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All) 34 | sub = re.sub(r'\.', '∯', text) 35 | sub_1 = re.sub(r'\。', '&ᓰ&', sub) 36 | sub_2 = re.sub(r'\.', '&ᓱ&', sub_1) 37 | sub_3 = re.sub(r'\!', '&ᓳ&', sub_2) 38 | sub_4 = re.sub(r'\!', '&ᓴ&', sub_3) 39 | sub_5 = re.sub(r'\?', '&ᓷ&', sub_4) 40 | last_sub = re.sub(r'\?', '&ᓸ&', sub_5) 41 | if match_type != 'single': 42 | last_sub = re.sub(r"'", '&⎋&', last_sub) 43 
| text = Text(last_sub).apply(*SubEscapedRegexReservedCharacters.All) 44 | return text 45 | -------------------------------------------------------------------------------- /pysbd/segmenter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from pysbd.languages import Language 5 | from pysbd.processor import Processor 6 | from pysbd.cleaner import Cleaner 7 | from pysbd.utils import TextSpan 8 | 9 | class Segmenter(object): 10 | 11 | def __init__(self, language="en", clean=False, doc_type=None, char_span=False): 12 | """Segments a text into a list of sentences, 13 | with or without character offsets from the original text 14 | 15 | Parameters 16 | ---------- 17 | language : str, required 18 | specify a language using its two-character ISO 639-1 code, 19 | by default "en" 20 | clean : bool, optional 21 | cleans the original text, by default False 22 | doc_type : str, optional 23 | Normal text or OCRed text, by default None 24 | set to `pdf` for OCRed text 25 | char_span : bool, optional 26 | Get start & end character offsets of each sentence 27 | within the original text, by default False 28 | """ 29 | self.language = language 30 | self.language_module = Language.get_language_code(language) 31 | self.clean = clean 32 | self.doc_type = doc_type 33 | self.char_span = char_span 34 | if self.clean and self.char_span: 35 | raise ValueError("char_span must be False if clean is True. 
" 36 | "Since `clean=True` will modify original text.") 37 | # when doc_type is pdf, force the user to clean the text; 38 | # char_span won't be provided with pdf doc_type either 39 | elif self.doc_type == 'pdf' and not self.clean: 40 | raise ValueError("`doc_type='pdf'` should have `clean=True` & " 41 | "`char_span` should be False since original " 42 | "text will be modified.") 43 | 44 | def cleaner(self, text): 45 | if hasattr(self.language_module, "Cleaner"): 46 | return self.language_module.Cleaner(text, self.language_module, 47 | doc_type=self.doc_type) 48 | else: 49 | return Cleaner(text, self.language_module, doc_type=self.doc_type) 50 | 51 | def processor(self, text): 52 | if hasattr(self.language_module, "Processor"): 53 | return self.language_module.Processor(text, self.language_module, 54 | char_span=self.char_span) 55 | else: 56 | return Processor(text, self.language_module, 57 | char_span=self.char_span) 58 | 59 | def sentences_with_char_spans(self, sentences): 60 | # since SENTENCE_BOUNDARY_REGEX doesn't account 61 | # for trailing whitespace, \s* is used as a suffix 62 | # to keep the text non-destructive when segments are joined 63 | sent_spans = [] 64 | prior_end_char_idx = 0 65 | for sent in sentences: 66 | for match in re.finditer(r'{0}\s*'.format(re.escape(sent)), self.original_text): 67 | match_str = match.group() 68 | match_start_idx, match_end_idx = match.span() 69 | if match_end_idx > prior_end_char_idx: 70 | # ensure the current sentence span 71 | # is either the first sentence's span 72 | # or adjacent to the prior sentence's span 73 | sent_spans.append( 74 | TextSpan(match_str, match_start_idx, match_end_idx)) 75 | prior_end_char_idx = match_end_idx 76 | break 77 | return sent_spans 78 | 79 | def segment(self, text): 80 | self.original_text = text 81 | if not text: 82 | return [] 83 | 84 | if self.clean or self.doc_type == 'pdf': 85 | text = self.cleaner(text).clean() 86 | 87 | postprocessed_sents = 
self.processor(text).process() 88 | sentence_w_char_spans = self.sentences_with_char_spans(postprocessed_sents) 89 | if self.char_span: 90 | return sentence_w_char_spans 91 | elif self.clean: 92 | # cleaned, destructively modified sentences 93 | return postprocessed_sents 94 | else: 95 | # non-destructive, with whitespace preserved 96 | return [textspan.sent for textspan in sentence_w_char_spans] 97 | -------------------------------------------------------------------------------- /pysbd/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import re 4 | import pysbd 5 | 6 | class Rule(object): 7 | 8 | def __init__(self, pattern, replacement): 9 | self.pattern = pattern 10 | self.replacement = replacement 11 | 12 | def __repr__(self): # pragma: no cover 13 | return '<{} pattern="{}" and replacement="{}">'.format( 14 | self.__class__.__name__, self.pattern, self.replacement) 15 | 16 | 17 | class Text(str): 18 | """Extending str functionality to apply regex rules 19 | 20 | https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types 21 | 22 | Parameters 23 | ---------- 24 | str : str 25 | string content 26 | 27 | Returns 28 | ------- 29 | str 30 | the input unchanged if a rule pattern doesn't match, 31 | else the text with matched patterns replaced 32 | """ 33 | def apply(self, *rules): 34 | for each_r in rules: 35 | self = re.sub(each_r.pattern, each_r.replacement, self) 36 | return self 37 | 38 | 39 | class TextSpan(object): 40 | 41 | def __init__(self, sent, start, end): 42 | """ 43 | Sentence text and its start & end character offsets within the original text 44 | 45 | Parameters 46 | ---------- 47 | sent : str 48 | Sentence text 49 | start : int 50 | start character offset of a sentence in the original text 51 | end : int 52 | end character offset of a sentence in the original text 53 | """ 54 | self.sent = sent 55 | self.start = start 56 | self.end = end 57 | 
58 | def __repr__(self): # pragma: no cover 59 | return "{0}(sent={1}, start={2}, end={3})".format( 60 | self.__class__.__name__, repr(self.sent), self.start, self.end) 61 | 62 | def __eq__(self, other): 63 | if not isinstance(other, self.__class__): 64 | return NotImplemented 65 | return self.sent == other.sent and self.start == other.start and self.end == other.end 66 | 67 | class PySBDFactory(object): 68 | """pysbd as a spaCy component through entry points""" 69 | 70 | def __init__(self, nlp, language='en'): 71 | self.nlp = nlp 72 | self.seg = pysbd.Segmenter(language=language, clean=False, 73 | char_span=True) 74 | 75 | def __call__(self, doc): 76 | sents_char_spans = self.seg.segment(doc.text_with_ws) 77 | # sentence starts are character offsets into the original text 78 | start_char_idxs = [sent.start for sent in sents_char_spans] 79 | for token in doc: 80 | token.is_sent_start = token.idx in start_char_idxs 81 | return doc 82 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests/ 3 | -------------------------------------------------------------------------------- /requirements-benchmark.txt: -------------------------------------------------------------------------------- 1 | nltk==3.5 2 | spacy==2.1.8 3 | stanza==1.0.1 4 | syntok==1.3.1 5 | blingfire==0.1.2 6 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==5.4.3 2 | pytest-cov==2.10.0 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # inspired by: 4 | # https://github.com/kennethreitz/setup.py/blob/master/setup.py 5 | # Note: To use the 'upload' functionality of this file, you must: 6 | # 
$ pipenv install twine --dev 7 | 8 | import io 9 | import os 10 | import sys 11 | from shutil import rmtree 12 | from setuptools import find_packages, setup, Command 13 | 14 | root = os.path.abspath(os.path.dirname(__file__)) 15 | 16 | REQUIRES_PYTHON = ">=3" 17 | # What packages are required for this module to be executed? 18 | REQUIRED = [] 19 | 20 | with io.open(os.path.join(root, "pysbd", "about.py"), encoding="utf8") as f: 21 | about = {} 22 | exec(f.read(), about) 23 | 24 | # Import the README and use it as the long-description. 25 | with io.open(os.path.join(root, 'README.md'), encoding='utf-8') as f: 26 | long_description = '\n' + f.read() 27 | 28 | 29 | class UploadCommand(Command): 30 | """Support setup.py upload.""" 31 | 32 | description = 'Build and publish the package.' 33 | user_options = [] 34 | 35 | @staticmethod 36 | def status(s): 37 | """Prints things in bold.""" 38 | print('\033[1m{0}\033[0m'.format(s)) 39 | 40 | def initialize_options(self): 41 | pass 42 | 43 | def finalize_options(self): 44 | pass 45 | 46 | def run(self): 47 | try: 48 | self.status('Removing previous builds…') 49 | rmtree(os.path.join(root, 'dist')) 50 | except OSError: 51 | pass 52 | 53 | self.status('Building Source and Wheel distribution…') 54 | os.system('{0} setup.py sdist bdist_wheel'.format(sys.executable)) 55 | 56 | self.status('Uploading the package to PyPI via Twine…') 57 | os.system('twine upload dist/*') 58 | 59 | self.status('Pushing git tags…') 60 | os.system('git tag v{0}'.format(about['__version__'])) 61 | os.system('git push --tags') 62 | 63 | sys.exit() 64 | 65 | 66 | setup( 67 | name='pysbd', 68 | version=about['__version__'], 69 | description=about['__summary__'], 70 | long_description=long_description, 71 | long_description_content_type='text/markdown', 72 | author=about["__author__"], 73 | author_email=about["__email__"], 74 | url=about["__uri__"], 75 | packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]), 76 | 
install_requires=REQUIRED, 77 | python_requires=REQUIRES_PYTHON, 78 | include_package_data=True, 79 | license=about["__license__"], 80 | classifiers=[ 81 | # Trove classifiers 82 | # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers 83 | 84 | # Indicate who your project is intended for 85 | 'Intended Audience :: Developers', 86 | 'Intended Audience :: Science/Research', 87 | 'Topic :: Scientific/Engineering', 88 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 89 | 'Topic :: Scientific/Engineering :: Information Analysis', 90 | 'Topic :: Text Processing', 91 | 'Topic :: Text Processing :: Linguistic', 92 | 'Topic :: Software Development', 93 | 'Topic :: Software Development :: Libraries', 94 | 'Programming Language :: Python :: 3', 95 | 'Programming Language :: Python :: 3.5', 96 | 'Programming Language :: Python :: 3.6', 97 | 'Programming Language :: Python :: 3.7', 98 | 'Programming Language :: Python :: 3.8', 99 | 'License :: OSI Approved :: MIT License' 100 | ], 101 | keywords='natural-language-processing nlp', 102 | # $ setup.py publish support. 
103 | cmdclass={ 104 | 'upload': UploadCommand, 105 | }, 106 | entry_points={ 107 | "spacy_factories": ["pysbd = pysbd.utils:PySBDFactory"] 108 | } 109 | ) 110 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pysbd 3 | 4 | @pytest.fixture() 5 | def pysbd_default_en_no_clean_no_span_fixture(): 6 | en_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False) 7 | return en_segmenter 8 | 9 | @pytest.fixture() 10 | def en_with_clean_no_span_fixture(): 11 | en_segmenter = pysbd.Segmenter(language="en", clean=True, char_span=False) 12 | return en_segmenter 13 | 14 | @pytest.fixture() 15 | def en_no_clean_with_span_fixture(): 16 | en_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True) 17 | return en_segmenter 18 | 19 | @pytest.fixture() 20 | def hi_default_fixture(): 21 | hi_segmenter = pysbd.Segmenter(language="hi", clean=False, char_span=False) 22 | return hi_segmenter 23 | 24 | @pytest.fixture() 25 | def mr_default_fixture(): 26 | mr_segmenter = pysbd.Segmenter(language="mr", clean=False, char_span=False) 27 | return mr_segmenter 28 | 29 | @pytest.fixture() 30 | def zh_default_fixture(): 31 | zh_segmenter = pysbd.Segmenter(language="zh", clean=False, char_span=False) 32 | return zh_segmenter 33 | 34 | @pytest.fixture() 35 | def es_default_fixture(): 36 | es_segmenter = pysbd.Segmenter(language="es", clean=False, char_span=False) 37 | return es_segmenter 38 | 39 | @pytest.fixture() 40 | def es_with_clean_no_span_fixture(): 41 | es_segmenter_clean = pysbd.Segmenter(language="es", 
clean=True, char_span=False) 42 | return es_segmenter_clean 43 | 44 | @pytest.fixture() 45 | def am_default_fixture(): 46 | am_segmenter = pysbd.Segmenter(language="am", clean=False, char_span=False) 47 | return am_segmenter 48 | 49 | @pytest.fixture() 50 | def ar_default_fixture(): 51 | ar_segmenter = pysbd.Segmenter(language="ar", clean=False, char_span=False) 52 | return ar_segmenter 53 | 54 | @pytest.fixture() 55 | def hy_default_fixture(): 56 | hy_segmenter = pysbd.Segmenter(language="hy", clean=False, char_span=False) 57 | return hy_segmenter 58 | 59 | @pytest.fixture() 60 | def bg_default_fixture(): 61 | bg_segmenter = pysbd.Segmenter(language="bg", clean=False, char_span=False) 62 | return bg_segmenter 63 | 64 | @pytest.fixture() 65 | def ur_default_fixture(): 66 | ur_segmenter = pysbd.Segmenter(language="ur", clean=False, char_span=False) 67 | return ur_segmenter 68 | 69 | @pytest.fixture() 70 | def ru_default_fixture(): 71 | ru_segmenter = pysbd.Segmenter(language="ru", clean=False, char_span=False) 72 | return ru_segmenter 73 | 74 | @pytest.fixture() 75 | def pl_default_fixture(): 76 | pl_segmenter = pysbd.Segmenter(language="pl", clean=False, char_span=False) 77 | return pl_segmenter 78 | 79 | @pytest.fixture() 80 | def fa_default_fixture(): 81 | fa_segmenter = pysbd.Segmenter(language="fa", clean=False, char_span=False) 82 | return fa_segmenter 83 | 84 | @pytest.fixture() 85 | def nl_default_fixture(): 86 | nl_segmenter = pysbd.Segmenter(language="nl", clean=False, char_span=False) 87 | return nl_segmenter 88 | 89 | @pytest.fixture() 90 | def da_default_fixture(): 91 | da_segmenter = pysbd.Segmenter(language="da", clean=False, char_span=False) 92 | return da_segmenter 93 | 94 | @pytest.fixture() 95 | def da_with_clean_no_span_fixture(): 96 | da_segmenter = pysbd.Segmenter(language="da", clean=True, char_span=False) 97 | return da_segmenter 98 | 99 | @pytest.fixture() 100 | def fr_default_fixture(): 101 | fr_segmenter = pysbd.Segmenter(language="fr", 
clean=False, char_span=False) 102 | return fr_segmenter 103 | 104 | @pytest.fixture() 105 | def my_default_fixture(): 106 | my_segmenter = pysbd.Segmenter(language="my", clean=False, char_span=False) 107 | return my_segmenter 108 | 109 | @pytest.fixture() 110 | def el_default_fixture(): 111 | el_segmenter = pysbd.Segmenter(language="el", clean=False, char_span=False) 112 | return el_segmenter 113 | 114 | @pytest.fixture() 115 | def it_default_fixture(): 116 | it_segmenter = pysbd.Segmenter(language="it", clean=False, char_span=False) 117 | return it_segmenter 118 | 119 | @pytest.fixture() 120 | def ja_default_fixture(): 121 | ja_segmenter = pysbd.Segmenter(language="ja", clean=False, char_span=False) 122 | return ja_segmenter 123 | 124 | @pytest.fixture() 125 | def ja_with_clean_no_span_fixture(): 126 | ja_segmenter = pysbd.Segmenter(language="ja", clean=True, char_span=False) 127 | return ja_segmenter 128 | 129 | @pytest.fixture() 130 | def de_default_fixture(): 131 | de_segmenter = pysbd.Segmenter(language="de", clean=False, char_span=False) 132 | return de_segmenter 133 | 134 | @pytest.fixture() 135 | def de_with_clean_no_span_fixture(): 136 | de_segmenter = pysbd.Segmenter(language="de", clean=True, char_span=False) 137 | return de_segmenter 138 | 139 | 140 | @pytest.fixture() 141 | def kk_default_fixture(): 142 | kk_segmenter = pysbd.Segmenter(language="kk", clean=False, char_span=False) 143 | return kk_segmenter 144 | 145 | @pytest.fixture() 146 | def sk_default_fixture(): 147 | sk_segmenter = pysbd.Segmenter(language="sk", clean=False, char_span=False) 148 | return sk_segmenter 149 | -------------------------------------------------------------------------------- /tests/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/tests/lang/__init__.py 
-------------------------------------------------------------------------------- /tests/lang/test_amharic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_AM_RULES_TEST_CASES = [ 5 | ("እንደምን አለህ፧መልካም ቀን ይሁንልህ።እባክሽ ያልሽዉን ድገሚልኝ።", 6 | ["እንደምን አለህ፧", "መልካም ቀን ይሁንልህ።", "እባክሽ ያልሽዉን ድገሚልኝ።"]), 7 | ] 8 | 9 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_AM_RULES_TEST_CASES) 10 | def test_am_sbd(am_default_fixture, text, expected_sents): 11 | """Amharic language SBD tests""" 12 | segments = am_default_fixture.segment(text) 13 | segments = [s.strip() for s in segments] 14 | assert segments == expected_sents 15 | -------------------------------------------------------------------------------- /tests/lang/test_arabic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_AR_RULES_TEST_CASES = [ 5 | ("سؤال وجواب: ماذا حدث بعد الانتخابات الايرانية؟ طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن. يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب.", 6 | ["سؤال وجواب:", "ماذا حدث بعد الانتخابات الايرانية؟", "طرح الكثير من التساؤلات غداة ظهور نتائج الانتخابات الرئاسية الايرانية التي أججت مظاهرات واسعة واعمال عنف بين المحتجين على النتائج ورجال الامن.", "يقول معارضو الرئيس الإيراني إن الطريقة التي اعلنت بها النتائج كانت مثيرة للاستغراب."]), 7 | # has unicode character. Need arabic language knowledge to resolve the issue 8 | pytest.param("وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى. 
وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير.", 9 | ["وقال د‪.‬ ديفيد ريدي و الأطباء الذين كانوا يعالجونها في مستشفى برمنجهام إنها كانت تعاني من أمراض أخرى.", "وليس معروفا ما اذا كانت قد توفيت بسبب اصابتها بأنفلونزا الخنازير."], marks=pytest.mark.xfail), 10 | ("ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار. ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية.", 11 | ["ومن المنتظر أن يكتمل مشروع خط أنابيب نابوكو البالغ طوله 3300 كليومترا في 12‪/‬08‪/‬2014 بتكلفة تُقدر بـ 7.9 مليارات يورو أي نحو 10.9 مليارات دولار.", "ومن المقرر أن تصل طاقة ضخ الغاز في المشروع 31 مليار متر مكعب انطلاقا من بحر قزوين مرورا بالنمسا وتركيا ودول البلقان دون المرور على الأراضي الروسية."]), 12 | ("الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز: رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه. 
العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي.", 13 | ["الاحد, 21 فبراير/ شباط, 2010, 05:01 GMT الصنداي تايمز:", "رئيس الموساد قد يصبح ضحية الحرب السرية التي شتنها بنفسه.", "العقل المنظم هو مئير داجان رئيس الموساد الإسرائيلي الذي يشتبه بقيامه باغتيال القائد الفلسطيني في حركة حماس محمود المبحوح في دبي."]), 14 | ("عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب، زرعها عملاء الموساد كما تقول مصادر إسرائيلية، وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية، وبدأت مراسم الحداد عليه", 15 | ["عثر في الغرفة على بعض أدوية علاج ارتفاع ضغط الدم، والقلب،", "زرعها عملاء الموساد كما تقول مصادر إسرائيلية،", "وقرر الطبيب أن الفلسطيني قد توفي وفاة طبيعية ربما إثر نوبة قلبية،", "وبدأت مراسم الحداد عليه"]) 16 | ] 17 | 18 | 19 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_AR_RULES_TEST_CASES) 20 | def test_ar_sbd(ar_default_fixture, text, expected_sents): 21 | """Arabic language SBD tests""" 22 | segments = ar_default_fixture.segment(text) 23 | segments = [s.strip() for s in segments] 24 | assert segments == expected_sents 25 | -------------------------------------------------------------------------------- /tests/lang/test_armenian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_HY_RULES_TEST_CASES = [ 5 | ("Ի՞նչ ես մտածում: Ոչինչ:", 6 | ["Ի՞նչ ես մտածում:", "Ոչինչ:"]), 7 | ("Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", 8 | ["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"]), 9 | ("Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", 10 | ["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. 
պարզություն և աշխատանք:"]) 11 | ] 12 | 13 | HY_MORE_TEST_CASES = [ 14 | ("Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը: Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:", 15 | ["Սա այն փուլն է, երբ տեղի է ունենում Համակարգի մշակումը:", "Համաձայն Փուլ 2-ի, Մատակարարը մշակում և/կամ հարմարեցնում է համապատասխան ծրագիրը, տեղադրում ծրագրի բաղկացուցիչները, կատարում առանձին բլոկի և համակարգի թեստավորում և ներառում տարբեր մոդուլներ եզակի աշխատանքային համակարգում, որը կազմում է այս Փուլի արդյունքը:"]), 16 | ("Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", 17 | ["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի հետևյալը`"]), 18 | ("Մատակարարի նախագծի 
անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար: 2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ: Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի: Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից: Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում: Մատակարարը պետք է տրամադրի հետևյալը`", 19 | ["Մատակարարի նախագծի անձնակազմի կողմից համակարգի թեստերը հաջող անցնելուց հետո, Համակարգը տրվում է Գնորդին թեստավորման համար:", "2-րդ փուլում, հիմք ընդունելով թեստային սցենարիոները, թեստերը կատարվում են Կառավարության կողմից Մատակարարի աջակցությամբ:", "Այս թեստերի թիրախը հանդիսանում է Համակարգի` որպես մեկ ամբողջության և համակարգի գործունեության ստուգումը համաձայն տեխնիկական բնութագրերի:", "Այս թեստերի հաջողակ ավարտից հետո, Համակարգը ժամանակավոր ընդունվում է Կառավարության կողմից:", "Այս թեստերի արդյունքները փաստաթղթային ձևով կներակայացվեն Թեստային Արդյունքների Հաշվետվություններում:", "Մատակարարը պետք է տրամադրի հետևյալը`"]), 20 | # "Hello world. My name is Armine." ==> ["Hello world.", "My name is Armine."] 21 | ("Բարև Ձեզ: Իմ անունն էԱրմինե:", 22 | ["Բարև Ձեզ:", "Իմ անունն էԱրմինե:"]), 23 | # "Today is Monday. I am going to work." ==> ["Today is Monday.", "I am going to work."] 24 | ("Այսօր երկուշաբթի է: Ես գնում եմ աշխատանքի:", 25 | ["Այսօր երկուշաբթի է:", "Ես գնում եմ աշխատանքի:"]), 26 | # "Tomorrow is September 1st. We are going to school." ==> ["Tomorrow is September 1st.", "We are going to school."] 27 | ("Վաղը սեպտեմբերի 1-ն է: Մենք գնում ենք դպրոց:", 28 | ["Վաղը սեպտեմբերի 1-ն է:", "Մենք գնում ենք դպրոց:"]), 29 | # "Yes, I understood. I really love you." 
==> ["Yes, I understood.", "I really love you."] 30 | ("Այո, ես հասկացա: Ես իսկապես քեզ սիրում եմ:", 31 | ["Այո, ես հասկացա:", "Ես իսկապես քեզ սիրում եմ:"]), 32 | # "Close the windows. It is raining in the evening." ==> ["Close the windows.", "It is raining in the evening."] 33 | ("Փակիր պատուհանները: Երեկոյան անձրևում է:", 34 | ["Փակիր պատուհանները:", "Երեկոյան անձրևում է:"]), 35 | # "It is dark. I should go home." ==> ["It is dark.", "I should go home."] 36 | ("Մութ է: Ես պետք է տուն վերադառնամ:", 37 | ["Մութ է:", "Ես պետք է տուն վերադառնամ:"]), 38 | # "You know, I am starting to believe. Everything is changing." ==> ["You know, I am starting to believe.", "Everything is changing."] 39 | ("Գիտես, սկսել եմ հավատալ: Ամեն ինչ փոխվում է:", 40 | ["Գիտես, սկսել եմ հավատալ:", "Ամեն ինչ փոխվում է:"]), 41 | # "It is a new Christmas tree. We should decorate it." ==> ["It is a new Christmas tree.", "We should decorate it."] 42 | ("Տոնածառը նոր է: Պետք է այն զարդարել:", 43 | ["Տոնածառը նոր է:", "Պետք է այն զարդարել:"]), 44 | # "I am in a hurry. I cannot wait for you." ==> ["I am in a hurry.", "I cannot wait for you."] 45 | ("Ես շտապում եմ: Ես քեզ չեմ կարող սպասել:", 46 | ["Ես շտապում եմ:", "Ես քեզ չեմ կարող սպասել:"]), 47 | # "Wait, we love each other. I want us to live together." ==> ["Wait, we love each other.", "I want us to live together."] 48 | ("Սպասիր, մենք իրար սիրում ենք: Ցանկանում եմ միասին ապրենք:", 49 | ["Սպասիր, մենք իրար սիրում ենք:", "Ցանկանում եմ միասին ապրենք:"]), 50 | # "No, I do not think so. It is not true." ==> ["No, I do not think so.", "It is not true."] 51 | ("Ոչ, այդպես չեմ կարծում: Դա ճիշտ չէ:", 52 | ["Ոչ, այդպես չեմ կարծում:", "Դա ճիշտ չէ:"]), 53 | # "On April 24 it started to rain... That is what I thought." ==> ["On April 24 it started to rain... That is what I thought."] 54 | ("Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:", 55 | ["Ապրիլի 24-ին սկսեց անձրևել...Այդպես էի գիտեի:"]), 56 | # "It was 1960...it was winter...it was night.
It was cold...emptiness." ==> ["It was 1960...it was winter...it was night.", "It was cold...emptiness."] 57 | ("1960 թվական…ձմեռ…գիշեր: Սառն էր…դատարկություն:", 58 | ["1960 թվական…ձմեռ…գիշեր:", "Սառն էր…դատարկություն:"]), 59 | # "Why can't a computer do what a man can do? It simply doesn't have a human brain." ==> ["Why can't a computer do what a man can do?", "It simply doesn't have a human brain."] 60 | ("Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը: Պարզապես չունի մարդկային ուղեղ:", 61 | ["Ինչ՟ու այն, ինչ անում է մարդը, չի կարող անել համակարգիչը:", "Պարզապես չունի մարդկային ուղեղ:"]), 62 | # "Name for me 3 things that are important to you - I answer: love, knowledge, sincerity." ==> ["Name for me 3 things that are important to you - I answer: love, knowledge, sincerity."] 63 | ("Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:", 64 | ["Թվարկիր ինձ համար 3 բան, որ կարևոր է քեզ համար - Պատասխանում եմ. սեր, գիտելիք, ազնվություն:"]), 65 | # "So, we are coming to the end. The logic is the following: simplicity and work." ==> ["So, we are coming to the end.", "The logic is the following: simplicity and work."] 66 | ("Այսպիսով` մոտենում ենք ավարտին: Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:", 67 | ["Այսպիսով` մոտենում ենք ավարտին:", "Տրամաբանությյունը հետևյալն է. պարզություն և աշխատանք:"]), 68 | # "What are you thinking? Nothing!" ==> ["What are you thinking?", "Nothing!"] 69 | ("Ի՞նչ ես մտածում: Ոչինչ:", 70 | ["Ի՞նչ ես մտածում:", "Ոչինչ:"]), 71 | # "Can we work together? Maybe what you are thinking, is possible." ==> ["Can we work together?", "Maybe what you are thinking, is possible."] 72 | ("Կարող ե՞նք միասին աշխատել: Գուցե այն ինչ մտածում ես, իրականանալի է:", 73 | ["Կարող ե՞նք միասին աշխատել:", "Գուցե այն ինչ մտածում ես, իրականանալի է:"]), 74 | # "Now what we have started, comes to the end. However the questions are numerous... ."
==> ["Now what we have started, comes to the end.", "However the questions are numerous... ."] 75 | ("Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում: Հարցերը սակայն շատ են...:", 76 | ["Հիմա, այն ինչ սկսել ենք, ավարտին է մոտենում:", "Հարցերը սակայն շատ են...:"]), 77 | # "Honey... I am waiting. Shall I go... or?" ==> ["Honey... I am waiting.", "Shall I go... or?"] 78 | ("Սիրելիս...սպասում եմ: Գնամ թ՟ե …:", 79 | ["Սիրելիս...սպասում եմ:", "Գնամ թ՟ե …:"]) 80 | ] 81 | 82 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_HY_RULES_TEST_CASES) 83 | def test_hy_sbd(hy_default_fixture, text, expected_sents): 84 | """Armenian language SBD tests""" 85 | segments = hy_default_fixture.segment(text) 86 | segments = [s.strip() for s in segments] 87 | assert segments == expected_sents 88 | 89 | @pytest.mark.parametrize('text,expected_sents', HY_MORE_TEST_CASES) 90 | def test_hy_sbd_more(hy_default_fixture, text, expected_sents): 91 | """Armenian language SBD tests""" 92 | segments = hy_default_fixture.segment(text) 93 | segments = [s.strip() for s in segments] 94 | assert segments == expected_sents 95 | -------------------------------------------------------------------------------- /tests/lang/test_bulgarian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_BG_RULES_TEST_CASES = [ 5 | ("В първата половина на ноември т.г. ще бъде свикан Консултативният съвет за национална сигурност, обяви държавният глава.", 6 | ["В първата половина на ноември т.г. ще бъде свикан Консултативният съвет за национална сигурност, обяви държавният глава."]), 7 | ("Компютърът е устройство с общо предназначение, което може да бъде програмирано да извършва набор от аритметични и/или логически операции. Възможността поредицата такива операции да бъде променяна позволява компютърът да се използва за решаването на теоретично всяка изчислителна/логическа задача. 
Обикновено целта на тези операции е обработката на въведена информация (данни), представена в цифров (дигитален) вид, резултатът от които може да се изведе в най-общо казано използваема форма.", 8 | ["Компютърът е устройство с общо предназначение, което може да бъде програмирано да извършва набор от аритметични и/или логически операции.", "Възможността поредицата такива операции да бъде променяна позволява компютърът да се използва за решаването на теоретично всяка изчислителна/логическа задача.", "Обикновено целта на тези операции е обработката на въведена информация (данни), представена в цифров (дигитален) вид, резултатът от които може да се изведе в най-общо казано използваема форма."]), 9 | ("Пл. \"20 Април\"", 10 | ["Пл. \"20 Април\""]), 11 | ("Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат. Стойностни, вкл. български и руски", 12 | ["Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.", "Стойностни, вкл. 
български и руски"]) 13 | ] 14 | 15 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_BG_RULES_TEST_CASES) 16 | def test_bg_sbd(bg_default_fixture, text, expected_sents): 17 | """Bulgarian language SBD tests""" 18 | segments = bg_default_fixture.segment(text) 19 | segments = [s.strip() for s in segments] 20 | assert segments == expected_sents 21 | -------------------------------------------------------------------------------- /tests/lang/test_burmese.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_MY_RULES_TEST_CASES = [ 5 | ("ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။၇ွင္ေနေကာင္းလား။", 6 | ["ခင္ဗ်ားနာမည္ဘယ္လိုေခၚလဲ။", "၇ွင္ေနေကာင္းလား။"]) 7 | ] 8 | 9 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_MY_RULES_TEST_CASES) 10 | def test_my_sbd(my_default_fixture, text, expected_sents): 11 | """Burmese language SBD tests""" 12 | segments = my_default_fixture.segment(text) 13 | segments = [s.strip() for s in segments] 14 | assert segments == expected_sents 15 | -------------------------------------------------------------------------------- /tests/lang/test_chinese.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_ZH_RULES_TEST_CASES = [ 5 | ("安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。", 6 | ["安永已聯繫周怡安親屬,協助辦理簽證相關事宜,周怡安家屬1月1日晚間搭乘東方航空班機抵達上海,他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中,去年剛從元智大學畢業,同年9月加入安永。"]), 7 | ("我们明天一起去看《摔跤吧!爸爸》好吗?好!", 8 | ["我们明天一起去看《摔跤吧!爸爸》好吗?", "好!"]) 9 | ] 10 | 11 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_ZH_RULES_TEST_CASES) 12 | def test_zh_sbd(zh_default_fixture, text, expected_sents): 13 | """Chinese language SBD tests from Pragmatic Segmenter""" 14 | segments = zh_default_fixture.segment(text) 15 | segments = [s.strip() for s in segments] 16 | assert segments == expected_sents 17 | 
-------------------------------------------------------------------------------- /tests/lang/test_danish.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | import pysbd 4 | 5 | GOLDEN_DA_RULES_TEST_CASES = [ 6 | ("Hej Verden. Mit navn er Jonas.", 7 | ["Hej Verden.", "Mit navn er Jonas."]), 8 | ("Hvad er dit navn? Mit nav er Jonas.", 9 | ["Hvad er dit navn?", "Mit nav er Jonas."]), 10 | ("There it is! I found it.", 11 | ["There it is!", "I found it."]), 12 | ("My name is Jonas E. Smith.", 13 | ["My name is Jonas E. Smith."]), 14 | ("Please turn to p. 55.", 15 | ["Please turn to p. 55."]), 16 | ("Were Jane and co. at the party?", 17 | ["Were Jane and co. at the party?"]), 18 | ("They closed the deal with Pitt, Briggs & Co. at noon.", 19 | ["They closed the deal with Pitt, Briggs & Co. at noon."]), 20 | ("Lad os spørge Jane og co. De burde vide det.", 21 | ["Lad os spørge Jane og co.", "De burde vide det."]), 22 | ("De lukkede aftalen med Pitt, Briggs & Co. Det lukkede i går.", 23 | ["De lukkede aftalen med Pitt, Briggs & Co.", "Det lukkede i går."]), 24 | ("De holdt Skt. Hans i byen.", 25 | ["De holdt Skt. Hans i byen."]), 26 | ("St. Michael's Kirke er på 5. gade nær ved lyset.", 27 | ["St. Michael's Kirke er på 5. gade nær ved lyset."]), 28 | ("That is JFK Jr.'s book.", 29 | ["That is JFK Jr.'s book."]), 30 | ("I visited the U.S.A. last year.", 31 | ["I visited the U.S.A. last year."]), 32 | ("Jeg bor i E.U. Hvad med dig?", 33 | ["Jeg bor i E.U.", "Hvad med dig?"]), 34 | ("I live in the U.S. Hvad med dig?", 35 | ["I live in the U.S.", "Hvad med dig?"]), 36 | ("I work for the U.S. Government in Virginia.", 37 | ["I work for the U.S. Government in Virginia."]), 38 | ("I have lived in the U.S. for 20 years.", 39 | ["I have lived in the U.S. for 20 years."]), 40 | ("She has $100.00 in her bag.", 41 | ["She has $100.00 in her bag."]), 42 | ("She has $100.00. 
It is in her bag.", 43 | ["She has $100.00.", "It is in her bag."]), 44 | ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.", 45 | ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]), 46 | ("Her email is Jane.Doe@example.com. I sent her an email.", 47 | ["Her email is Jane.Doe@example.com.", "I sent her an email."]), 48 | ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", 49 | ["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."]), 50 | ("She turned to him, 'This is great.' she said.", 51 | ["She turned to him, 'This is great.' she said."]), 52 | ("She turned to him, \"This is great.\" she said.", 53 | ["She turned to him, \"This is great.\" she said."]), 54 | ("She turned to him, \"This is great.\" Hun held the book out to show him.", 55 | ["She turned to him, \"This is great.\"", "Hun held the book out to show him."]), 56 | ("Hello!! Long time no see.", 57 | ["Hello!!", "Long time no see."]), 58 | ("Hello?? Who is there?", 59 | ["Hello??", "Who is there?"]), 60 | ("Hello!? Is that you?", 61 | ["Hello!?", "Is that you?"]), 62 | ("Hello?! Is that you?", 63 | ["Hello?!", "Is that you?"]), 64 | ("1.) The first item 2.) The second item", 65 | ["1.) The first item", "2.) The second item"]), 66 | ("1.) The first item. 2.) The second item.", 67 | ["1.) The first item.", "2.) The second item."]), 68 | ("1) The first item 2) The second item", 69 | ["1) The first item", "2) The second item"]), 70 | ("1) The first item. 2) The second item.", 71 | ["1) The first item.", "2) The second item."]), 72 | ("1. The first item 2. The second item", 73 | ["1. The first item", "2. The second item"]), 74 | ("1. The first item. 2. The second item.", 75 | ["1. The first item.", "2. The second item."]), 76 | ("• 9. The first item • 10. The second item", 77 | ["• 9. The first item", "• 10. 
The second item"]), 78 | ("⁃9. The first item ⁃10. The second item", 79 | ["⁃9. The first item", "⁃10. The second item"]), 80 | ("a. The first item b. The second item c. The third list item", 81 | ["a. The first item", "b. The second item", "c. The third list item"]), 82 | ("You can find it at N°. 1026.253.553. That is where the treasure is.", 83 | ["You can find it at N°. 1026.253.553.", "That is where the treasure is."]), 84 | ("She works at Yahoo! in the accounting department.", 85 | ["She works at Yahoo! in the accounting department."]), 86 | ("Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", 87 | ["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"]), 88 | ("\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55).", 89 | ["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."]), 90 | ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", 91 | ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."]), 92 | ("I never meant that.... She left the store.", 93 | ["I never meant that....", "She left the store."]), 94 | ("I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", 95 | ["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."]), 96 | ("One further habned. . . .", 97 | ["One further habned. . . 
."]) 98 | ] 99 | 100 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_DA_RULES_TEST_CASES) 101 | def test_da_sbd(da_default_fixture, text, expected_sents): 102 | """Danish language SBD tests""" 103 | segments = da_default_fixture.segment(text) 104 | segments = [s.strip() for s in segments] 105 | assert segments == expected_sents 106 | 107 | DA_RULES_CLEAN_TEST_CASES = [ 108 | ("Hello world.I dag is Tuesday.Hr. Smith went to the store and bought 1,000.That is a lot.", 109 | ["Hello world.", "I dag is Tuesday.", "Hr. Smith went to the store and bought 1,000.", "That is a lot."]), 110 | ("It was a cold \nnight in the city.", 111 | ["It was a cold night in the city."]) 112 | ] 113 | 114 | DA_PDF_TEST_DATA = [("This is a sentence\ncut off in the middle because pdf.", 115 | ["This is a sentence cut off in the middle because pdf."])] 116 | 117 | @pytest.mark.parametrize('text,expected_sents', DA_RULES_CLEAN_TEST_CASES) 118 | def test_da_sbd_clean(da_with_clean_no_span_fixture, text, expected_sents): 119 | """Danish language SBD tests with text clean""" 120 | segments = da_with_clean_no_span_fixture.segment(text) 121 | segments = [s.strip() for s in segments] 122 | assert segments == expected_sents 123 | 124 | @pytest.mark.parametrize('text,expected_sents', DA_PDF_TEST_DATA) 125 | def test_da_pdf_type(text, expected_sents): 126 | """SBD tests from Pragmatic Segmenter for doctype:pdf""" 127 | seg = pysbd.Segmenter(language="da", clean=True, doc_type='pdf') 128 | segments = seg.segment(text) 129 | segments = [s.strip() for s in segments] 130 | assert segments == expected_sents 131 | -------------------------------------------------------------------------------- /tests/lang/test_dutch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_NL_RULES_TEST_CASES = [ 5 | ("Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 
81 procent van de schoten was raak.", 6 | ["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."]), 7 | ("81 procent van de schoten was raak. ...en toen barste de hel los.", 8 | ["81 procent van de schoten was raak.", "...en toen barste de hel los."]), 9 | ("Afkorting aanw. vnw.", ["Afkorting aanw. vnw."]) 10 | ] 11 | 12 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_NL_RULES_TEST_CASES) 13 | def test_nl_sbd(nl_default_fixture, text, expected_sents): 14 | """Dutch language SBD tests""" 15 | segments = nl_default_fixture.segment(text) 16 | segments = [s.strip() for s in segments] 17 | assert segments == expected_sents 18 | -------------------------------------------------------------------------------- /tests/lang/test_english.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_EN_RULES_TEST_CASES = [ 5 | ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]), 6 | ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]), 7 | ("There it is! I found it.", ["There it is!", "I found it."]), 8 | ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]), 9 | ("Please turn to p. 55.", ["Please turn to p. 55."]), 10 | ("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]), 11 | ("They closed the deal with Pitt, Briggs & Co. at noon.", 12 | ["They closed the deal with Pitt, Briggs & Co. at noon."]), 13 | ( 14 | "Let's ask Jane and co. They should know.", 15 | ["Let's ask Jane and co.", "They should know."]), 16 | ( 17 | "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", [ 18 | "They closed the deal with Pitt, Briggs & Co.", 19 | "It closed yesterday." 20 | ], 21 | ), 22 | ("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]), 23 | ( 24 | "St. Michael's Church is on 5th st. 
near the light.", 25 | ["St. Michael's Church is on 5th st. near the light."], 26 | ), 27 | ("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]), 28 | ("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]), 29 | ( 30 | "I live in the E.U. How about you?", 31 | ["I live in the E.U.", "How about you?"], 32 | ), 33 | ( 34 | "I live in the U.S. How about you?", 35 | ["I live in the U.S.", "How about you?"], 36 | ), 37 | ("I work for the U.S. Government in Virginia.", 38 | ["I work for the U.S. Government in Virginia."]), 39 | ("I have lived in the U.S. for 20 years.", 40 | ["I have lived in the U.S. for 20 years."]), 41 | # Most difficult sentence to crack 42 | pytest.param( 43 | "At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", 44 | [ 45 | "At 5 a.m. Mr. Smith went to the bank.", 46 | "He left the bank at 6 P.M.", "Mr. Smith then went to the store." 47 | ], marks=pytest.mark.xfail), 48 | ("She has $100.00 in her bag.", ["She has $100.00 in her bag."]), 49 | ("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]), 50 | ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.", 51 | ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]), 52 | ("Her email is Jane.Doe@example.com. I sent her an email.", 53 | ["Her email is Jane.Doe@example.com.", "I sent her an email."]), 54 | ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", 55 | ["The site is: https://www.example.50.com/new-site/awesome_content.html.", 56 | "Please check it out."]), 57 | ( 58 | "She turned to him, 'This is great.' she said.", 59 | ["She turned to him, 'This is great.' she said."], 60 | ), 61 | ( 62 | 'She turned to him, "This is great." she said.', 63 | ['She turned to him, "This is great." she said.'], 64 | ), 65 | ( 66 | 'She turned to him, "This is great." 
She held the book out to show him.', 67 | [ 68 | 'She turned to him, "This is great."', 69 | "She held the book out to show him." 70 | ], 71 | ), 72 | ("Hello!! Long time no see.", ["Hello!!", "Long time no see."]), 73 | ("Hello?? Who is there?", ["Hello??", "Who is there?"]), 74 | ("Hello!? Is that you?", ["Hello!?", "Is that you?"]), 75 | ("Hello?! Is that you?", ["Hello?!", "Is that you?"]), 76 | ( 77 | "1.) The first item 2.) The second item", 78 | ["1.) The first item", "2.) The second item"], 79 | ), 80 | ( 81 | "1.) The first item. 2.) The second item.", 82 | ["1.) The first item.", "2.) The second item."], 83 | ), 84 | ( 85 | "1) The first item 2) The second item", 86 | ["1) The first item", "2) The second item"], 87 | ), 88 | ("1) The first item. 2) The second item.", 89 | ["1) The first item.", "2) The second item."]), 90 | ( 91 | "1. The first item 2. The second item", 92 | ["1. The first item", "2. The second item"], 93 | ), 94 | ( 95 | "1. The first item. 2. The second item.", 96 | ["1. The first item.", "2. The second item."], 97 | ), 98 | ( 99 | "• 9. The first item • 10. The second item", 100 | ["• 9. The first item", "• 10. The second item"], 101 | ), 102 | ( 103 | "⁃9. The first item ⁃10. The second item", 104 | ["⁃9. The first item", "⁃10. The second item"], 105 | ), 106 | ( 107 | "a. The first item b. The second item c. The third list item", 108 | ["a. The first item", "b. The second item", "c. The third list item"], 109 | ), 110 | ( 111 | "You can find it at N°. 1026.253.553. That is where the treasure is.", 112 | [ 113 | "You can find it at N°. 1026.253.553.", 114 | "That is where the treasure is." 115 | ], 116 | ), 117 | ( 118 | "She works at Yahoo! in the accounting department.", 119 | ["She works at Yahoo! in the accounting department."], 120 | ), 121 | ( 122 | "We make a good team, you and I. Did you see Albert I. Jones yesterday?", 123 | [ 124 | "We make a good team, you and I.", 125 | "Did you see Albert I. Jones yesterday?" 
126 | ], 127 | ), 128 | ( 129 | "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", 130 | [ 131 | "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”" 132 | ], 133 | ), 134 | ( 135 | """"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""", 136 | [ 137 | '"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).' 138 | ], 139 | ), 140 | ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", 141 | [ 142 | "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", 143 | "Next sentence." 144 | ]), 145 | ( 146 | "I never meant that.... She left the store.", 147 | ["I never meant that....", "She left the store."], 148 | ), 149 | ( 150 | "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", 151 | [ 152 | "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it." 153 | ], 154 | ), 155 | ( 156 | "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", 157 | [ 158 | "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", 159 | ". . . The practice was not abandoned. . . ." 
160 | ], 161 | ) 162 | ] 163 | 164 | 165 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_EN_RULES_TEST_CASES) 166 | def test_en_sbd(pysbd_default_en_no_clean_no_span_fixture, text, expected_sents): 167 | """SBD tests from Pragmatic Segmenter""" 168 | segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) 169 | segments = [s.strip() for s in segments] 170 | assert segments == expected_sents 171 | -------------------------------------------------------------------------------- /tests/lang/test_french.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_FR_RULES_TEST_CASES = [ 5 | ("Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale. L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle.", 6 | ["Après avoir été l'un des acteurs du projet génome humain, le Genoscope met aujourd'hui le cap vers la génomique environnementale.", "L'exploitation des données de séquences, prolongée par l'identification expérimentale des fonctions biologiques, notamment dans le domaine de la biocatalyse, ouvrent des perspectives de développements en biotechnologie industrielle."]), 7 | ("\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté.", 8 | ["\"Airbus livrera comme prévu 30 appareils 380 cette année avec en ligne de mire l'objectif d'équilibre financier du programme en 2015\", a-t-il ajouté."]), 9 | ("À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires. 
Elle assure que ce mouvement « n’aura aucun impact sur les livraisons ».", 10 | ["À 11 heures ce matin, la direction ne décomptait que douze grévistes en tout sur la France : ce sont ceux du site de Saran (Loiret), dont l’effectif est de 809 salariés, dont la moitié d’intérimaires.", "Elle assure que ce mouvement « n’aura aucun impact sur les livraisons »."]), 11 | ("Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle.", 12 | ["Ce modèle permet d’afficher le texte « LL.AA.II.RR. » pour l’abréviation de « Leurs Altesses impériales et royales » avec son infobulle."]), 13 | ("Les derniers ouvrages de Intercept Ltd. sont ici.", 14 | ["Les derniers ouvrages de Intercept Ltd. sont ici."]) 15 | ] 16 | 17 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_FR_RULES_TEST_CASES) 18 | def test_fr_sbd(fr_default_fixture, text, expected_sents): 19 | """French language SBD tests""" 20 | segments = fr_default_fixture.segment(text) 21 | segments = [s.strip() for s in segments] 22 | assert segments == expected_sents 23 | -------------------------------------------------------------------------------- /tests/lang/test_greek.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_EL_RULES_TEST_CASES = [ 5 | ("Με συγχωρείτε· πού είναι οι τουαλέτες; Τις Κυριακές δε δούλευε κανένας. 
το κόστος του σπιτιού ήταν £260.950,00.", 6 | ["Με συγχωρείτε· πού είναι οι τουαλέτες;", "Τις Κυριακές δε δούλευε κανένας.", "το κόστος του σπιτιού ήταν £260.950,00."]), 7 | ] 8 | 9 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_EL_RULES_TEST_CASES) 10 | def test_el_sbd(el_default_fixture, text, expected_sents): 11 | """Greek language SBD tests""" 12 | segments = el_default_fixture.segment(text) 13 | segments = [s.strip() for s in segments] 14 | assert segments == expected_sents 15 | -------------------------------------------------------------------------------- /tests/lang/test_hindi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_HI_RULES_TEST_CASES = [ 5 | ("सच्चाई यह है कि इसे कोई नहीं जानता। हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।", 6 | ["सच्चाई यह है कि इसे कोई नहीं जानता।", "हो सकता है यह फ़्रेन्को के खिलाफ़ कोई विद्रोह रहा हो, या फिर बेकाबू हो गया कोई आनंदोत्सव।"]) 7 | ] 8 | 9 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_HI_RULES_TEST_CASES) 10 | def test_hi_sbd(hi_default_fixture, text, expected_sents): 11 | """Hindi language SBD tests from Pragmatic Segmenter""" 12 | segments = hi_default_fixture.segment(text) 13 | segments = [s.strip() for s in segments] 14 | assert segments == expected_sents 15 | -------------------------------------------------------------------------------- /tests/lang/test_italian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_IT_RULES_TEST_CASES = [ 5 | ("Salve Sig.ra Mengoni! 
Come sta oggi?", 6 | ["Salve Sig.ra Mengoni!", "Come sta oggi?"]), 7 | ("Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.».", 8 | ["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a.»."]), 9 | ("La casa costa 170.500.000,00€!", 10 | ["La casa costa 170.500.000,00€!"]) 11 | ] 12 | 13 | IT_MORE_TEST_CASES = [ 14 | ("Salve Sig.ra Mengoni! Come sta oggi?", 15 | ["Salve Sig.ra Mengoni!", "Come sta oggi?"]), 16 | ("Buongiorno! Sono l'Ing. Mengozzi. È presente l'Avv. Cassioni?", 17 | ["Buongiorno!", "Sono l'Ing. Mengozzi.", "È presente l'Avv. Cassioni?"]), 18 | ("Mi fissi un appuntamento per mar. 23 Nov.. Grazie.", 19 | ["Mi fissi un appuntamento per mar. 23 Nov..", "Grazie."]), 20 | ("Ecco il mio tel.:01234567. Mi saluti la Sig.na Manelli. Arrivederci.", 21 | ["Ecco il mio tel.:01234567.", "Mi saluti la Sig.na Manelli.", "Arrivederci."]), 22 | ("La centrale meteor. si è guastata. Gli idraul. son dovuti andare a sistemarla.", 23 | ["La centrale meteor. si è guastata.", "Gli idraul. son dovuti andare a sistemarla."]), 24 | ("Hanno creato un algoritmo allo st. d. arte. Si ringrazia lo psicol. Serenti.", 25 | ["Hanno creato un algoritmo allo st. d. arte.", "Si ringrazia lo psicol. Serenti."]), 26 | ("Chiamate il V.Cte. delle F.P., adesso!", 27 | ["Chiamate il V.Cte. delle F.P., adesso!"]), 28 | ("Giancarlo ha sostenuto l'esame di econ. az..", 29 | ["Giancarlo ha sostenuto l'esame di econ. az.."]), 30 | ("Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!", 31 | ["Stava viaggiando a 90 km/h verso la provincia di TR quando il Dott. Mesini ha sentito un rumore e si fermò!"]), 32 | ("Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona.", 33 | ["Egregio Dir. Amm., le faccio sapere che l'ascensore non funziona."]), 34 | ("Stava mangiando e/o dormendo.", 35 | ["Stava mangiando e/o dormendo."]), 36 | ("Ricordatevi che dom 25 Set. 
sarà il compleanno di Maria; dovremo darle un regalo.", 37 | ["Ricordatevi che dom 25 Set. sarà il compleanno di Maria; dovremo darle un regalo."]), 38 | ("La politica è quella della austerità; quindi verranno fatti tagli agli sprechi.", 39 | ["La politica è quella della austerità; quindi verranno fatti tagli agli sprechi."]), 40 | ("Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\".", 41 | ["Nel tribunale, l'Avv. Fabrizi ha urlato \"Io, l'illustrissimo Fabrizi, vi si oppone!\"."]), 42 | ("Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW.", 43 | ["Le parti fisiche di un computer (ad es. RAM, CPU, tastiera, mouse, etc.) sono definiti HW."]), 44 | ("La parola 'casa' è sinonimo di abitazione.", 45 | ["La parola 'casa' è sinonimo di abitazione."]), 46 | ("La \"Mulino Bianco\" fa alimentari pre-confezionati.", 47 | ["La \"Mulino Bianco\" fa alimentari pre-confezionati."]), 48 | ("\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni).", 49 | ["\"Ei fu. Siccome immobile / dato il mortal sospiro / stette la spoglia immemore / orba di tanto spiro / [...]\" (Manzoni)."]), 50 | ("Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... nato/a a ...».", 51 | ["Una lettera si può iniziare in questo modo «Il/la sottoscritto/a ... 
nato/a a ...»."]), 52 | ("Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\"", 53 | ["Per casa, in uno degli esercizi per i bambini c'era \"3 + (14/7) = 5\""]), 54 | ("Ai bambini è stato chiesto di fare \"4:2*2\"", 55 | ["Ai bambini è stato chiesto di fare \"4:2*2\""]), 56 | ("La maestra esclamò: \"Bambini, quanto fa '2/3 + 4/3?'\".", 57 | ["La maestra esclamò: \"Bambini, quanto fa \'2/3 + 4/3?\'\"."]), 58 | ("Il motore misurava 120°C.", 59 | ["Il motore misurava 120°C."]), 60 | ("Il volume era di 3m³.", 61 | ["Il volume era di 3m³."]), 62 | ("La stanza misurava 20m².", 63 | ["La stanza misurava 20m²."]), 64 | ("1°C corrisponde a 33.8°F.", 65 | ["1°C corrisponde a 33.8°F."]), 66 | ("Oggi è il 27-10-14.", 67 | ["Oggi è il 27-10-14."]), 68 | ("La casa costa 170.500.000,00€!", 69 | ["La casa costa 170.500.000,00€!"]), 70 | ("Il corridore 103 è arrivato 4°.", 71 | ["Il corridore 103 è arrivato 4°."]), 72 | ("Oggi è il 27/10/2014.", 73 | ["Oggi è il 27/10/2014."]), 74 | ("Ecco l'elenco: 1.gelato, 2.carne, 3.riso.", 75 | ["Ecco l'elenco: 1.gelato, 2.carne, 3.riso."]), 76 | ("Devi comprare : 1)pesce 2)sale.", 77 | ["Devi comprare : 1)pesce 2)sale."]), 78 | ("La macchina viaggiava a 100 km/h.", 79 | ["La macchina viaggiava a 100 km/h."]) 80 | ] 81 | 82 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_IT_RULES_TEST_CASES) 83 | def test_it_sbd(it_default_fixture, text, expected_sents): 84 | """Italian language SBD tests""" 85 | segments = it_default_fixture.segment(text) 86 | segments = [s.strip() for s in segments] 87 | assert segments == expected_sents 88 | 89 | @pytest.mark.parametrize('text,expected_sents', IT_MORE_TEST_CASES) 90 | def test_it_sbd_more_cases(it_default_fixture, text, expected_sents): 91 | """Italian language SBD tests more examples""" 92 | segments = it_default_fixture.segment(text) 93 | segments = [s.strip() for s in segments] 94 | assert segments == expected_sents 95 | 
-------------------------------------------------------------------------------- /tests/lang/test_japanese.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_JA_RULES_TEST_CASES = [ 5 | ("これはペンです。それはマーカーです。", 6 | ["これはペンです。", "それはマーカーです。"]), 7 | ("それは何ですか?ペンですか?", 8 | ["それは何ですか?", "ペンですか?"]), 9 | ("良かったね!すごい!", 10 | ["良かったね!", "すごい!"]), 11 | ("自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。2%台後半を目指すとする方向で最終調整に入りました。", 12 | ["自民党税制調査会の幹部は、「引き下げ幅は3.29%以上を目指すことになる」と指摘していて、今後、公明党と合意したうえで、30日に決定する与党税制改正大綱に盛り込むことにしています。", "2%台後半を目指すとする方向で最終調整に入りました。"]), 13 | ] 14 | 15 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_JA_RULES_TEST_CASES) 16 | def test_ja_sbd(ja_default_fixture, text, expected_sents): 17 | """Japanese language SBD tests""" 18 | segments = ja_default_fixture.segment(text) 19 | segments = [s.strip() for s in segments] 20 | assert segments == expected_sents 21 | 22 | JA_TEST_CASES_CLEAN = [("これは父の\n家です。", ["これは父の家です。"])] 23 | 24 | @pytest.mark.parametrize('text,expected_sents', JA_TEST_CASES_CLEAN) 25 | def test_ja_sbd_clean(ja_with_clean_no_span_fixture, text, expected_sents): 26 | """Japanese language SBD tests with clean=True""" 27 | segments = ja_with_clean_no_span_fixture.segment(text) 28 | segments = [s.strip() for s in segments] 29 | assert segments == expected_sents 30 | -------------------------------------------------------------------------------- /tests/lang/test_kazakh.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | 5 | GOLDEN_KK_RULES_TEST_CASES = [ 6 | ("Мұхитқа тікелей шыға алмайтын мемлекеттердің ішінде Қазақстан - ең үлкені.", 7 | ["Мұхитқа тікелей шыға алмайтын мемлекеттердің ішінде Қазақстан - ең үлкені."]), 8 | ("Оқушылар үйі, Достық даңғылы, Абай даналығы, ауыл шаруашылығы – кім? 
не?", 9 | ["Оқушылар үйі, Достық даңғылы, Абай даналығы, ауыл шаруашылығы – кім?", "не?"]), 10 | ("Әр түрлі өлшемнің атауы болып табылатын м (метр), см (сантиметр), кг (киллограмм), т (тонна), га (гектар), ц (центнер), т. б. (тағы басқа), тәрізді белгілер де қысқарған сөздер болып табылады.", 11 | ["Әр түрлі өлшемнің атауы болып табылатын м (метр), см (сантиметр), кг (киллограмм), т (тонна), га (гектар), ц (центнер), т. б. (тағы басқа), тәрізді белгілер де қысқарған сөздер болып табылады."]), 12 | ("Мысалы: обкомға (облыстық комитетке) барды, ауаткомда (аудандық атқару комитетінде) болды, педучилищеге (педагогтік училищеге) түсті, медпункттің (медициналық пункттің) алдында т. б.", 13 | ["Мысалы: обкомға (облыстық комитетке) барды, ауаткомда (аудандық атқару комитетінде) болды, педучилищеге (педагогтік училищеге) түсті, медпункттің (медициналық пункттің) алдында т. б."]), 14 | ("Елдің жалпы ішкі өнімі ЖІӨ (номинал) = $225.619 млрд (2014)", 15 | ["Елдің жалпы ішкі өнімі ЖІӨ (номинал) = $225.619 млрд (2014)"]), 16 | ("Ресейдiң әлеуметтiк-экономикалық жағдайы.XVIII ғасырдың бiрiншi ширегiнде Ресейге тән нәрсе.", 17 | ["Ресейдiң әлеуметтiк-экономикалық жағдайы.", "XVIII ғасырдың бiрiншi ширегiнде Ресейге тән нәрсе."]), 18 | ("(«Егемен Қазақстан», 7 қыркүйек 2012 жыл. №590-591); Бұл туралы кеше санпедқадағалау комитетінің облыыстық департаменті хабарлады. («Айқын», 23 сəуір 2010 жыл. № 70).", 19 | ["(«Егемен Қазақстан», 7 қыркүйек 2012 жыл. №590-591); Бұл туралы кеше санпедқадағалау комитетінің облыыстық департаменті хабарлады.", "(«Айқын», 23 сəуір 2010 жыл. № 70)."]), 20 | ("Иран революциясы (1905 — 11) және азаматтық қозғалыс (1918 — 21) кезінде А. Фарахани, М. Кермани, М. Т. Бехар, т.б. ақындар демократиялық идеяның жыршысы болды.", 21 | ["Иран революциясы (1905 — 11) және азаматтық қозғалыс (1918 — 21) кезінде А. Фарахани, М. Кермани, М. Т. Бехар, т.б. 
ақындар демократиялық идеяның жыршысы болды."]), 22 | ("Владимир Федосеев: Аттар магиясы енді жоқ http://www.vremya.ru/2003/179/10/80980.html", 23 | ["Владимир Федосеев: Аттар магиясы енді жоқ http://www.vremya.ru/2003/179/10/80980.html"]), 24 | ("Бірақ оның енді не керегі бар? — деді.", 25 | ["Бірақ оның енді не керегі бар? — деді."]), 26 | ("Сондықтан шапаныма жегізіп отырғаным! - деп, жауап береді.", 27 | ["Сондықтан шапаныма жегізіп отырғаным! - деп, жауап береді."]), 28 | ("Б.з.б. 6 – 3 ғасырларда конфуцийшілдік, моизм, легизм мектептерінің қалыптасуы нәтижесінде Қытай философиясы пайда болды.", 29 | ["Б.з.б. 6 – 3 ғасырларда конфуцийшілдік, моизм, легизм мектептерінің қалыптасуы нәтижесінде Қытай философиясы пайда болды."]), 30 | ("'Та марбута' тек сөз соңында екі түрде жазылады:", 31 | ["'Та марбута' тек сөз соңында екі түрде жазылады:"]) 32 | ] 33 | 34 | 35 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_KK_RULES_TEST_CASES) 36 | def test_kk_sbd(kk_default_fixture, text, expected_sents): 37 | """Kazakh language SBD tests""" 38 | segments = kk_default_fixture.segment(text) 39 | segments = [s.strip() for s in segments] 40 | assert segments == expected_sents 41 | -------------------------------------------------------------------------------- /tests/lang/test_marathi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_MR_RULES_TEST_CASES = [ 5 | ("आज दसरा आहे. आज खूप शुभ दिवस आहे.", 6 | ["आज दसरा आहे.", "आज खूप शुभ दिवस आहे."]), 7 | ("ढग खूप गर्जत होते; पण पाऊस पडत नव्हता.", 8 | ["ढग खूप गर्जत होते; पण पाऊस पडत नव्हता."]), 9 | ("रमाची परीक्षा कधी आहे? अवकाश आहे अजून.", 10 | ["रमाची परीक्षा कधी आहे?", "अवकाश आहे अजून."]), 11 | ("शाब्बास, असाच अभ्यास कर! 
आणि मग तुला नक्की यश मिळणार.", 12 | ["शाब्बास, असाच अभ्यास कर!", "आणि मग तुला नक्की यश मिळणार."]), 13 | ("\"आपली आपण करी स्तुती तो एक मूर्ख\" असे समर्थ रामदासस्वामी म्हणतात.", 14 | ["\"आपली आपण करी स्तुती तो एक मूर्ख\" असे समर्थ रामदासस्वामी म्हणतात."]) 15 | ] 16 | 17 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_MR_RULES_TEST_CASES) 18 | def test_mr_sbd(mr_default_fixture, text, expected_sents): 19 | """Marathi language SBD tests""" 20 | segments = mr_default_fixture.segment(text) 21 | segments = [s.strip() for s in segments] 22 | assert segments == expected_sents 23 | -------------------------------------------------------------------------------- /tests/lang/test_persian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_FA_RULES_TEST_CASES = [ 5 | ("خوشبختم، آقای رضا. شما کجایی هستید؟ من از تهران هستم.", 6 | ["خوشبختم، آقای رضا.", "شما کجایی هستید؟", "من از تهران هستم."]) 7 | ] 8 | 9 | 10 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_FA_RULES_TEST_CASES) 11 | def test_fa_sbd(fa_default_fixture, text, expected_sents): 12 | """Persian language SBD tests""" 13 | segments = fa_default_fixture.segment(text) 14 | segments = [s.strip() for s in segments] 15 | assert segments == expected_sents 16 | -------------------------------------------------------------------------------- /tests/lang/test_polish.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_PL_RULES_TEST_CASES = [ 5 | ("To słowo bałt. jestskrótem.", 6 | ["To słowo bałt. 
jestskrótem."]), 7 | ] 8 | 9 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_PL_RULES_TEST_CASES) 10 | def test_pl_sbd(pl_default_fixture, text, expected_sents): 11 | """Polish language SBD tests""" 12 | segments = pl_default_fixture.segment(text) 13 | segments = [s.strip() for s in segments] 14 | assert segments == expected_sents 15 | -------------------------------------------------------------------------------- /tests/lang/test_russian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_RU_RULES_TEST_CASES = [ 5 | ("Объем составляет 5 куб.м.", 6 | ["Объем составляет 5 куб.м."]), 7 | ("Маленькая девочка бежала и кричала: «Не видали маму?».", 8 | ["Маленькая девочка бежала и кричала: «Не видали маму?»."]), 9 | ("Сегодня 27.10.14", ["Сегодня 27.10.14"]) 10 | ] 11 | 12 | RU_MORE_TEST_CASES = [("Маленькая девочка бежала и кричала: «Не видали маму?»." , 13 | ["Маленькая девочка бежала и кричала: «Не видали маму?»."]), 14 | ("«Я приду поздно», — сказал Андрей." , 15 | ["«Я приду поздно», — сказал Андрей."]), 16 | ("«К чему ты готовишься? – спросила мама. – Завтра ведь выходной»." , 17 | ["«К чему ты готовишься? – спросила мама. – Завтра ведь выходной»."]), 18 | ("По словам Пушкина, «Привычка свыше дана, замена счастью она»." , 19 | ["По словам Пушкина, «Привычка свыше дана, замена счастью она»."]), 20 | ("Он сказал: «Я очень устал», и сразу же замолчал." , 21 | ["Он сказал: «Я очень устал», и сразу же замолчал."]), 22 | ("Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей." 
, 23 | ["Мне стало как-то ужасно грустно в это мгновение; однако что-то похожее на смех зашевелилось в душе моей."]), 24 | ("Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…" , 25 | ["Шухов как был в ватных брюках, не снятых на ночь (повыше левого колена их тоже был пришит затасканный, погрязневший лоскут, и на нем выведен черной, уже поблекшей краской номер Щ-854), надел телогрейку…"]), 26 | ("Слово «дом» является синонимом жилища" , 27 | ["Слово «дом» является синонимом жилища"]), 28 | ("В Санкт-Петербург на гастроли приехал театр «Современник»" , 29 | ["В Санкт-Петербург на гастроли приехал театр «Современник»"]), 30 | ("Машина едет со скоростью 100 км/ч." , 31 | ["Машина едет со скоростью 100 км/ч."]), 32 | ("Я поем и/или лягу спать." , 33 | ["Я поем и/или лягу спать."]), 34 | ("Он не мог справиться с примером \"3 + (14:7) = 5\"" , 35 | ["Он не мог справиться с примером \"3 + (14:7) = 5\""]), 36 | ("Вот список: 1.мороженое, 2.мясо, 3.рис." , 37 | ["Вот список: 1.мороженое, 2.мясо, 3.рис."]), 38 | ("Квартира 234 находится на 4-ом этаже." , 39 | ["Квартира 234 находится на 4-ом этаже."]), 40 | ("В это время года температура может подниматься до 40°C." , 41 | ["В это время года температура может подниматься до 40°C."]), 42 | ("Объем составляет 5м³." , 43 | ["Объем составляет 5м³."]), 44 | ("Объем составляет 5 куб.м." , 45 | ["Объем составляет 5 куб.м."]), 46 | ("Площадь комнаты 14м²." , 47 | ["Площадь комнаты 14м²."]), 48 | ("Площадь комнаты 14 кв.м." , 49 | ["Площадь комнаты 14 кв.м."]), 50 | ("1°C соответствует 33.8°F." , 51 | ["1°C соответствует 33.8°F."]), 52 | ("Сегодня 27.10.14" , 53 | ["Сегодня 27.10.14"]), 54 | ("Сегодня 27 октября 2014 года." , 55 | ["Сегодня 27 октября 2014 года."]), 56 | ("Эта машина стоит 150 000 дол.!" , 57 | ["Эта машина стоит 150 000 дол.!"]), 58 | ("Эта машина стоит $150 000!" 
, 59 | ["Эта машина стоит $150 000!"]), 60 | ("Вот номер моего телефона: +39045969798. Передавайте привет г-ну Шапочкину. До свидания." , 61 | ["Вот номер моего телефона: +39045969798.", "Передавайте привет г-ну Шапочкину.", "До свидания."]), 62 | ("Постойте, разве можно указывать цены в у.е.!" , 63 | ["Постойте, разве можно указывать цены в у.е.!"]), 64 | ("Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!" , 65 | ["Едем на скорости 90 км/ч в сторону пгт. Брагиновка, о котором мы так много слышали по ТВ!"]), 66 | ("Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре." , 67 | ["Д-р ветеринарных наук А. И. Семенов и пр. выступали на этом семинаре."]), 68 | ("Уважаемый проф. Семенов! Просьба до 20.10 сдать отчет на кафедру." , 69 | ["Уважаемый проф. Семенов!", "Просьба до 20.10 сдать отчет на кафедру."]), 70 | ("Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка. Предъявите дисконтную карту, пожалуйста!" , 71 | ["Первоначальная стоимость этого комплекта 30 долл., но сейчас действует скидка.", "Предъявите дисконтную карту, пожалуйста!"]), 72 | ("Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая." , 73 | ["Виктор съел пол-лимона и ушел по-английски из дома на ул. 1 Мая."]), 74 | ("Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок." , 75 | ["Напоминаю Вам, что 25.10 день рождения у Маши К., нужно будет купить ей подарок."]), 76 | ("В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно." , 77 | ["В 2010-2012 гг. Виктор посещал г. Волгоград неоднократно."]), 78 | ("Маленькая девочка бежала и кричала: «Не видали маму?»" , 79 | ["Маленькая девочка бежала и кричала: «Не видали маму?»"]), 80 | ("Кв. 234 находится на 4 этаже." , 81 | ["Кв. 234 находится на 4 этаже."]), 82 | ("В это время года температура может подниматься до 40°C." , 83 | ["В это время года температура может подниматься до 40°C."]), 84 | ("Нужно купить 1)рыбу 2)соль." 
, 85 | ["Нужно купить 1)рыбу 2)соль."]), 86 | ("Машина едет со скоростью 100 км/ч." , 87 | ["Машина едет со скоростью 100 км/ч."]), 88 | ("Л.Н. Толстой написал \"Войну и мир\". Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами. Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое." , 89 | ["Л.Н. Толстой написал \"Войну и мир\".", "Кроме Волконских, Л. Н. Толстой состоял в близком родстве с некоторыми другими аристократическими родами.", "Дом, где родился Л.Н.Толстой, 1898 г. В 1854 году дом продан по распоряжению писателя на вывоз в село Долгое."]) 90 | ] 91 | 92 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_RU_RULES_TEST_CASES) 93 | def test_ru_sbd(ru_default_fixture, text, expected_sents): 94 | """Russian language SBD tests""" 95 | segments = ru_default_fixture.segment(text) 96 | segments = [s.strip() for s in segments] 97 | assert segments == expected_sents 98 | 99 | @pytest.mark.parametrize('text,expected_sents', RU_MORE_TEST_CASES) 100 | def test_ru_sbd_more_cases(ru_default_fixture, text, expected_sents): 101 | """Russian language SBD tests more examples""" 102 | segments = ru_default_fixture.segment(text) 103 | segments = [s.strip() for s in segments] 104 | assert segments == expected_sents 105 | -------------------------------------------------------------------------------- /tests/lang/test_slovak.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_SK_RULES_TEST_CASES = [ 5 | ("Ide o majiteľov firmy ABTrade s. r. o., ktorí stoja aj za ďalšími spoločnosťami, napr. XYZCorp a.s.", 6 | ["Ide o majiteľov firmy ABTrade s. r. o., ktorí stoja aj za ďalšími spoločnosťami, napr. XYZCorp a.s."]), 7 | ("„Prieskumy beriem na ľahkú váhu. V podstate ma to nezaujíma,“ reagoval Matovič na prieskum agentúry Focus.", 8 | ["„Prieskumy beriem na ľahkú váhu. 
V podstate ma to nezaujíma,“ reagoval Matovič na prieskum agentúry Focus."]), 9 | ("Toto sa mi podarilo až na 10. pokus, ale stálo to za to.", 10 | ["Toto sa mi podarilo až na 10. pokus, ale stálo to za to."]), 11 | ("Ide o príslušníkov XII. Pluku špeciálneho určenia.", 12 | ["Ide o príslušníkov XII. Pluku špeciálneho určenia."]), 13 | ("Spoločnosť bola založená 7. Apríla 2020, na zmluve však figuruje dátum 20. marec 2020.", 14 | ["Spoločnosť bola založená 7. Apríla 2020, na zmluve však figuruje dátum 20. marec 2020."]), 15 | ] 16 | 17 | 18 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_SK_RULES_TEST_CASES) 19 | def test_sk_sbd(sk_default_fixture, text, expected_sents): 20 | """Slovak language SBD tests""" 21 | segments = sk_default_fixture.segment(text) 22 | segments = [s.strip() for s in segments] 23 | assert segments == expected_sents 24 | -------------------------------------------------------------------------------- /tests/lang/test_urdu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pytest 3 | 4 | GOLDEN_UR_RULES_TEST_CASES = [ 5 | ("کیا حال ہے؟ ميرا نام ___ ەے۔ میں حالا تاوان دےدوں؟", 6 | ["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]), 7 | ] 8 | 9 | @pytest.mark.parametrize('text,expected_sents', GOLDEN_UR_RULES_TEST_CASES) 10 | def test_ur_sbd(ur_default_fixture, text, expected_sents): 11 | """Urdu language SBD tests""" 12 | segments = ur_default_fixture.segment(text) 13 | segments = [s.strip() for s in segments] 14 | assert segments == expected_sents 15 | -------------------------------------------------------------------------------- /tests/regression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nipunsadvilkar/pySBD/5905f13be4fc95f407b98392e0ec303617a33d86/tests/regression/__init__.py -------------------------------------------------------------------------------- 
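Every per-language file above repeats the same parametrized golden-rule pattern: a list of `(input text, expected sentence list)` pairs fed through `@pytest.mark.parametrize`. A minimal self-contained sketch of that pattern follows; it uses a naive regex splitter as a stand-in for pysbd's `Segmenter` (which the real fixtures provide), so it runs without the package installed and handles none of the abbreviation, quote, or list rules the real tests exercise.

```python
import re

def naive_segment(text):
    # Trivial stand-in for pysbd's Segmenter: split after ., !, or ?
    # when followed by whitespace. Real SBD needs far more rules.
    return [s for s in re.split(r'(?<=[.!?])\s+', text) if s]

# Same golden-rule shape as the language test files above.
GOLDEN_RULES = [
    ("It rained. We stayed in.", ["It rained.", "We stayed in."]),
    ("Really? Yes!", ["Really?", "Yes!"]),
]

for text, expected in GOLDEN_RULES:
    assert naive_segment(text) == expected
```

In the real suite the splitter is a language-specific `Segmenter` fixture and the pairs are the `GOLDEN_*_RULES_TEST_CASES` lists; only the data differs per language.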
/tests/test_cleaner.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pysbd.cleaner import Cleaner 3 | from pysbd.languages import Language 4 | 5 | TEST_TOBE_CLEANED_DATA = [ 6 | ("It was a cold \nnight in the city.", "It was a cold night in the city."), 7 | ("This is the U.S. Senate my friends. Yes. It is!", 8 | "This is the U.S. Senate my friends. Yes. It is!") 9 | ] 10 | 11 | @pytest.mark.parametrize('text,expected_cleaned_sents', TEST_TOBE_CLEANED_DATA) 12 | def test_cleaner(text, expected_cleaned_sents): 13 | """Cleaner tests from Pragmatic Segmenter""" 14 | cleaned_text = Cleaner(text, Language.get_language_code('en')).clean() 15 | assert cleaned_text == expected_cleaned_sents 16 | 17 | def test_cleaner_doesnt_mutate_input(text="It was a cold \nnight in the city."): 18 | Cleaner(text, Language.get_language_code('en')).clean() 19 | assert text == "It was a cold \nnight in the city." 20 | 21 | def test_cleaner_none_input(text=None): 22 | cleaned_text = Cleaner(text, Language.get_language_code('en')).clean() 23 | assert cleaned_text == text 24 | 25 | def test_cleaner_no_input(text=""): 26 | cleaned_text = Cleaner(text, Language.get_language_code('en')).clean() 27 | assert cleaned_text == text 28 | -------------------------------------------------------------------------------- /tests/test_languages.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pysbd.languages import LANGUAGE_CODES, Language 3 | 4 | 5 | def test_lang_code2instance_mapping(): 6 | for code, language_module in LANGUAGE_CODES.items(): 7 | assert Language.get_language_code(code) == language_module 8 | 9 | def test_exception_on_no_lang_code_provided(): 10 | with pytest.raises(ValueError) as e: 11 | Language.get_language_code('') 12 | assert "Provide valid language ID i.e. ISO code." 
in str(e.value) 13 | 14 | def test_exception_on_unsupported_lang_code_provided(): 15 | with pytest.raises(ValueError) as e: 16 | Language.get_language_code('elvish') 17 | assert "Provide valid language ID i.e. ISO code." in str(e.value) 18 | -------------------------------------------------------------------------------- /tests/test_segmenter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pysbd 3 | from pysbd.utils import TextSpan 4 | 5 | 6 | def test_no_input(pysbd_default_en_no_clean_no_span_fixture, text=""): 7 | segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) 8 | assert segments == [] 9 | 10 | def test_none_input(pysbd_default_en_no_clean_no_span_fixture, text=None): 11 | segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) 12 | assert segments == [] 13 | 14 | def test_newline_input(pysbd_default_en_no_clean_no_span_fixture, text="\n"): 15 | segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) 16 | assert segments == [] 17 | 18 | def test_segmenter_doesnt_mutate_input(pysbd_default_en_no_clean_no_span_fixture, 19 | text='My name is Jonas E. Smith. Please turn to p. 55.'): 20 | segments = pysbd_default_en_no_clean_no_span_fixture.segment(text) 21 | segments = [s.strip() for s in segments] 22 | assert text == 'My name is Jonas E. Smith. Please turn to p. 55.' 23 | 24 | @pytest.mark.parametrize('text,expected', 25 | [('My name is Jonas E. Smith. Please turn to p. 55.', 26 | [ 27 | ('My name is Jonas E. Smith. ', 0, 27), 28 | ('Please turn to p. 
55.', 27, 48), 29 | ]) 30 | ]) 31 | def test_sbd_char_span(en_no_clean_with_span_fixture, text, expected): 32 | """Test sentences with character offsets""" 33 | segments = en_no_clean_with_span_fixture.segment(text) 34 | expected_text_spans = [TextSpan(sent_w_span[0], sent_w_span[1], sent_w_span[2]) 35 | for sent_w_span in expected] 36 | assert segments == expected_text_spans 37 | # clubbing sentences and matching with original text 38 | assert text == "".join([seg.sent for seg in segments]) 39 | 40 | def test_same_sentence_different_char_span(en_no_clean_with_span_fixture): 41 | """Test same sentences with different char offsets & check for non-destruction""" 42 | text = """From the AP comes this story : 43 | President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area. 44 | *** 45 | After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say? 46 | *** 47 | "THE PRESIDENT: I appreciate that. 48 | (Laughter.) 49 | My life is too complicated right now trying to do my job. 
50 | (Laughter.)""" 51 | expected_text_spans = [TextSpan(sent='From the AP comes this story :\n', start=0, end=31), 52 | TextSpan(sent='President Bush on Tuesday nominated two individuals to replace retiring jurists on federal courts in the Washington area.\n', start=31, end=153), 53 | TextSpan(sent='***\n', start=153, end=157), 54 | TextSpan(sent='After you are elected in 2004, what will your memoirs say about you, what will the title be, and what will the main theme say?\n', start=157, end=284), 55 | TextSpan(sent='***\n', start=284, end=288), 56 | TextSpan(sent='"THE PRESIDENT: I appreciate that.\n', start=288, end=323), 57 | TextSpan(sent='(Laughter.)\n', start=323, end=335), 58 | TextSpan(sent='My life is too complicated right now trying to do my job.\n', start=335, end=393), 59 | TextSpan(sent='(Laughter.)', start=393, end=404)] 60 | segments_w_spans = en_no_clean_with_span_fixture.segment(text) 61 | assert segments_w_spans == expected_text_spans 62 | # check for non-destruction 63 | # clubbing sentences and matching with original text 64 | assert text == "".join([seg.sent for seg in segments_w_spans]) 65 | 66 | def test_exception_with_both_clean_and_span_true(): 67 | """Test to not allow clean=True and char_span=True 68 | """ 69 | with pytest.raises(ValueError) as e: 70 | seg = pysbd.Segmenter(language="en", clean=True, char_span=True) 71 | assert str(e.value) == "char_span must be False if clean is True. "\ 72 | "Since `clean=True` will modify original text." 
73 | 74 | def test_exception_with_doc_type_pdf_and_clean_false(): 75 | """ 76 | Test to force clean=True when doc_type="pdf" 77 | """ 78 | with pytest.raises(ValueError) as e: 79 | seg = pysbd.Segmenter(language="en", clean=False, doc_type='pdf') 80 | assert str(e.value) == ("`doc_type='pdf'` should have `clean=True` & " 81 | "`char_span` should be False since original" 82 | "text will be modified.") 83 | 84 | def test_exception_with_doc_type_pdf_and_both_clean_char_span_true(): 85 | """ 86 | Test to raise ValueError exception when doc_type="pdf" and 87 | both clean=True and char_span=True 88 | """ 89 | with pytest.raises(ValueError) as e: 90 | seg = pysbd.Segmenter(language="en", clean=True, 91 | doc_type='pdf', char_span=True) 92 | assert str(e.value) == "char_span must be False if clean is True. "\ 93 | "Since `clean=True` will modify original text." 94 | 95 | PDF_TEST_DATA = [ 96 | ("This is a sentence\ncut off in the middle because pdf.", 97 | ["This is a sentence cut off in the middle because pdf."]), 98 | ("Organising your care early \nmeans you'll have months to build a good relationship with your midwife or doctor, ready for \nthe birth.", 99 | ["Organising your care early means you'll have months to build a good relationship with your midwife or doctor, ready for the birth."]), 100 | ("10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines:", 101 | ["10. Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:"]), 102 | ("• 9. Stop smoking \n• 10. Get some rest \n \nYou have the best chance of having a problem-free pregnancy and a healthy baby if you follow \na few simple guidelines: \n\n1. Organise your pregnancy care early", 103 | ["• 9. Stop smoking", "• 10. 
Get some rest", "You have the best chance of having a problem-free pregnancy and a healthy baby if you follow a few simple guidelines:", "1. Organise your pregnancy care early"]), 104 | ("Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\n'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", 105 | ["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! 
How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]), 106 | ("Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.\r'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)", 107 | ["Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next.", "First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs.", "She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.", "'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! 
How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.)"]) 108 | ] 109 | 110 | @pytest.mark.parametrize('text,expected_sents', PDF_TEST_DATA) 111 | def test_en_pdf_type(text, expected_sents): 112 | """SBD tests from Pragmatic Segmenter for doctype:pdf""" 113 | seg = pysbd.Segmenter(language="en", clean=True, doc_type='pdf') 114 | segments = seg.segment(text) 115 | segments = [s.strip() for s in segments] 116 | assert segments == expected_sents 117 | --------------------------------------------------------------------------------
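The `doc_type='pdf'` cases above all hinge on rejoining sentences that PDF extraction broke across lines ("This is a sentence\ncut off in the middle because pdf."). A minimal self-contained sketch of that idea follows; the heuristic below is an illustration only, not pysbd's actual cleaning rules, which also handle bullets, numbered lists, carriage returns, and blank lines.

```python
import re

def join_pdf_linebreaks(text):
    # Naive heuristic: a newline followed by a lowercase letter is assumed
    # to split a sentence, so collapse it (with surrounding spaces) into one
    # space. Newlines before capitals, digits, or bullets are kept as breaks.
    return re.sub(r'[ \t]*\n[ \t]*(?=[a-z])', ' ', text)

broken = "This is a sentence\ncut off in the middle because pdf."
assert join_pdf_linebreaks(broken) == \
    "This is a sentence cut off in the middle because pdf."

# A break before a capital is treated as intentional and preserved.
kept = "10. Get some rest \nYou have the best chance."
assert join_pdf_linebreaks(kept) == kept
```

The real `clean=True` path runs a pipeline of such rules (see `pysbd/clean/rules.py`), which is why the tests above forbid `char_span=True` together with cleaning: once lines are rejoined, offsets into the original text no longer line up.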