├── .gitignore ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── ciseau ├── __init__.py ├── constants.py ├── quoted_expressions.py ├── regular_expressions.py ├── sentence_tokenizer.py ├── wiki_markup_processing.py └── word_tokenizer.py ├── setup.py └── tests └── test_tokenization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | 10 | # Packages # 11 | ############ 12 | # it's better to unpack these files and commit the raw source 13 | # git has its own built in compression methods 14 | *.7z 15 | *.dmg 16 | *.gz 17 | *.iso 18 | *.jar 19 | *.rar 20 | *.tar 21 | *.zip 22 | *.gem 23 | *.pem 24 | dist/ 25 | build/ 26 | 27 | # Saves # 28 | ######### 29 | saves/* 30 | imported_saves/* 31 | pvdm_snapshots/* 32 | sentiment_data/* 33 | *.npy 34 | *.mat 35 | *.vocab 36 | *.svocab 37 | text8 38 | __pycache__/* 39 | *.pyc 40 | *.egg-info 41 | 42 | # Logs and databases # 43 | ###################### 44 | *.log 45 | *.sql 46 | *.sqlite 47 | 48 | # OS generated files # 49 | ###################### 50 | .DS_Store 51 | .DS_Store? 52 | ._* 53 | .Spotlight-V100 54 | .Trashes 55 | ehthumbs.db 56 | Thumbs.db 57 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: trusty 3 | python: 4 | - '2.7' 5 | - '3.3' 6 | - '3.4' 7 | - '3.5' 8 | os: 9 | - linux 10 | install: 11 | - python setup.py install 12 | - pip install nose2 13 | script: nose2 14 | notifications: 15 | email: 16 | recipients: 17 | - jonathanraiman@gmail.com 18 | on_success: change 19 | on_failure: always 20 | after_success: 21 | - bash <(curl -s https://codecov.io/bash) 22 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Jonathan Raiman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | recursive-include ciseau *.pyx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Ciseau 2 | ------ 3 | 4 | Word and sentence tokenization in Python. 5 | 6 | [![PyPI version](https://badge.fury.io/py/ciseau.svg)](https://badge.fury.io/py/ciseau) 7 | [![Build Status](https://travis-ci.org/JonathanRaiman/ciseau.svg?branch=master)](https://travis-ci.org/JonathanRaiman/ciseau) 8 | ![Jonathan Raiman, author](https://img.shields.io/badge/Author-Jonathan%20Raiman%20-blue.svg) 9 | 10 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE.md) 11 | 12 | 13 | Usage 14 | ----- 15 | 16 | Use this package to split strings along word and sentence boundaries. 17 | For instance, to break a string into word-level tokens (whitespace stays attached to the preceding token): 18 | 19 | ``` 20 | tokenize("Joey was a great sailor.") 21 | #=> ["Joey ", "was ", "a ", "great ", "sailor", "."] 22 | ``` 23 | 24 | To also detect sentence boundaries: 25 | 26 | ``` 27 | sent_tokenize("Cat sat mat. Cat's named Cool.", keep_whitespace=True) 28 | #=> [["Cat ", "sat ", "mat", ". "], ["Cat ", "'s ", "named ", "Cool", "."]] 29 | ``` 30 | 31 | `sent_tokenize` preserves the original whitespace on each token when called with `keep_whitespace=True`, and leaves dashes and other special characters unnormalized when called with `normalize_ascii=False`. 32 | 33 | Installation 34 | ------------ 35 | 36 | ``` 37 | pip3 install ciseau 38 | ``` 39 | 40 | Testing 41 | ------- 42 | 43 | Run `nose2` from the repository root (install it first with `pip install nose2`). 44 | 45 | 46 | If you find this project useful for your work or research, here's how you can cite it: 47 | 48 | ```latex 49 | @misc{RaimanCiseau2017, 50 | author = {Raiman, Jonathan}, 51 | title = {Ciseau}, 52 | year = {2017}, 53 | publisher = {GitHub}, 54 | journal = {GitHub repository}, 55 | howpublished = {\url{https://github.com/jonathanraiman/ciseau}}, 56 | commit = {fe88b9d7f131b88bcdd2ff361df60b6d1cc64c04} 57 | } 58 | ``` 59 | 60 | -------------------------------------------------------------------------------- /ciseau/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for XML cleaning and text tokenization.
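Exposes `tokenize` and `sent_tokenize` for word- and sentence-level segmentation, along with `to_raw_text`, `to_raw_text_markupless` and `to_raw_text_pairings` for stripping Wikipedia/XML markup before tokenizing.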
3 | 4 | Usage 5 | ----- 6 | 7 | > ciseau.tokenize("Joey was a great sailor.") 8 | #=> [["Joey", "was", "a", "great", "sailor", "."]] 9 | 10 | """ 11 | 12 | from .wiki_markup_processing import ( 13 | to_raw_text, 14 | to_raw_text_markupless, 15 | to_raw_text_pairings 16 | ) 17 | from .word_tokenizer import tokenize 18 | from .sentence_tokenizer import sent_tokenize 19 | 20 | __all__ = [ 21 | "to_raw_text", 22 | "to_raw_text_markupless", 23 | "to_raw_text_pairings", 24 | "sent_tokenize", 25 | "tokenize" 26 | ] 27 | -------------------------------------------------------------------------------- /ciseau/constants.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | 4 | if sys.version_info >= (3,3): 5 | dashes = ["–", "--+"] 6 | for i in range(8208, 8214): 7 | dashes.append(chr(i)) 8 | else: 9 | dashes = [u"–", u"--+"] 10 | for i in range(8208, 8214): 11 | dashes.append(unichr(i)) 12 | 13 | 14 | UNDECIDED = 0 15 | SHOULD_SPLIT = 1 16 | SHOULD_NOT_SPLIT = 2 17 | 18 | people = [ 19 | "jr", "mr", "ms", "mrs", "dr", "prof", "esq", "sr", 20 | "sen", "sens", "rep", "reps", "gov", "attys", "attys", 21 | "supt", "det", "mssrs", "rev", "fr", "ss", "msgr" 22 | ] 23 | army = ["col", "gen", "lt", "cmdr", "adm", "capt", "sgt", "cpl", "maj", "brig", "pt"] 24 | inst = ["dept","univ", "assn", "bros", "ph.d"] 25 | place = [ 26 | "arc", "al", "ave", "blvd", "bld", "cl", "ct", 27 | "cres", "exp", "expy", "dist", "mt", "mtn", "ft", 28 | "fy", "fwy", "hwy", "hway", "la", "pde", "pd","plz", "pl", "rd", "st", 29 | "tce" 30 | ] 31 | comp = ["mfg", "inc", "ltd", "co", "corp"] 32 | state = [ 33 | "ala","ariz","ark","cal","calif","colo","col","conn", 34 | "del","fed","fla","ga","ida","id","ill","ind","ia","kans", 35 | "kan","ken","ky","la","me","md","is","mass","mich","minn", 36 | "miss","mo","mont","neb","nebr","nev","mex","okla","ok", 37 | "ore","penna","penn","pa","dak","tenn","tex","ut","vt", 38 | "va","wash","wis","wisc","wy","wyo","usafa","alta", 39 | "man","ont","que","sask","yuk" 40 | ] 41 | month = [ 42 | "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", 43 | "sept", "oct", "nov", "dec" 44 | ] 45 | misc = ["vs", "etc", "no","esp", "ed", "iv", "Oper", "op", "i.e", "e.g", "v"] 46 | website = ["www"] 47 | currency = ["rs"] 48 | ABBR = {} 49 | # create a hash of these abbreviations: 50 | for abbreviation_type in [people, army, inst, place, comp, state, month, misc, website, currency]: 51 | for abbreviation in abbreviation_type: 52 | ABBR[abbreviation] = True 53 | 54 | MONTHS = { 55 | "january", "february", "march", "april", "may", 56 | "june", "july", "august", "september", "october", 57 | "november", "december" 58 | } 59 | PUNCT_SYMBOLS = {'.', "...", "?", "!", "..", "!!", "??", "!?", "?!", u"…"} 60 | CONTINUE_PUNCT_SYMBOLS = {';', ',', '-', ':'} | set(dashes) 61 | OPENING_SYMBOLS = {'(', '[', '"', '{', '“'} 62 | CLOSING_SYMBOLS = {')', ']', '"', '}', '”'} 63 | CLOSE_2_OPEN = {')':'(', ']': '[', '"':'"', '}':'{', '”':'“'} 64 | -------------------------------------------------------------------------------- /ciseau/quoted_expressions.py: -------------------------------------------------------------------------------- 1 | from .constants import ( 2 | OPENING_SYMBOLS, 3 | CLOSING_SYMBOLS, 4 | CLOSE_2_OPEN, 5 | PUNCT_SYMBOLS, 6 | CONTINUE_PUNCT_SYMBOLS 7 | ) 8 | 9 | def group_quoted_tokens(tokens): 10 | sentences = [] 11 | opening_symbols = OPENING_SYMBOLS.copy() 12 | closing_symbols = CLOSING_SYMBOLS.copy() 13 | 14 | inside = 
[] 15 | observed_opens = 0 16 | open_closed_sections = [] 17 | 18 | for idx, word in enumerate(tokens): 19 | token_stripped = word[0] 20 | if token_stripped in opening_symbols and token_stripped == '"': 21 | # probably a closing quote since there are spaces 22 | # after it. Let's confirm by checking if there were 23 | # any spaces on the previous word: 24 | quote_has_spaces = len(word) > len(token_stripped) 25 | previous_word_has_spaces = idx > 0 and tokens[idx-1].endswith(' ') 26 | is_last_word = idx + 1 == len(tokens) 27 | if idx == 0: 28 | is_open_symbol = True 29 | is_close_symbol = False 30 | elif quote_has_spaces and previous_word_has_spaces: 31 | # 1. previous word has spaces before this symbol 32 | # so spaces are not meaningful. 33 | 34 | # 2. We find that we are already within a quoted section: 35 | if len(inside) > 0 and inside[-1][0] == '"': 36 | is_open_symbol = False 37 | is_close_symbol = True 38 | else: 39 | # we are not within a quoted section, we resort to counting 40 | # to see what is the best opening-closing strategy 41 | num_expected_future_quotes = sum(symbol == '"' for symbol, _ in inside) + 1 42 | num_future_quotes = sum(token[0] == '"' for token in tokens[idx+1:]) 43 | # find the right amount of quotes: 44 | if num_expected_future_quotes == num_future_quotes: 45 | is_open_symbol = True 46 | is_close_symbol = False 47 | else: 48 | is_open_symbol = False 49 | is_close_symbol = True 50 | elif quote_has_spaces and not previous_word_has_spaces: 51 | # 'joe" ' -> closing some quotes 52 | is_close_symbol = True 53 | is_open_symbol = False 54 | elif is_last_word: 55 | # last word may not have spaces 56 | is_open_symbol = False 57 | is_close_symbol = True 58 | else: 59 | if (not tokens[idx-1].endswith(' ') or 60 | tokens[idx+1][0] in PUNCT_SYMBOLS or 61 | tokens[idx+1][0] in CONTINUE_PUNCT_SYMBOLS): 62 | if len(inside) > 0 and inside[-1][0] == '"': 63 | # quote is followed by semicolon, comma, etc... 
64 | # or preceded by a word without a space 'joe"something"' 65 | is_open_symbol = False 66 | is_close_symbol = True 67 | else: 68 | is_open_symbol = True 69 | is_close_symbol = False 70 | else: 71 | # no spaces after this quote, can thus assume that it is opening 72 | is_open_symbol = True 73 | is_close_symbol = False 74 | else: 75 | is_open_symbol = token_stripped in opening_symbols 76 | is_close_symbol = token_stripped in closing_symbols 77 | 78 | if is_open_symbol: 79 | inside.append((token_stripped, idx)) 80 | observed_opens += 1 81 | elif is_close_symbol: 82 | if len(inside) > 0: 83 | if inside[-1][0] == CLOSE_2_OPEN[token_stripped]: 84 | open_closed_sections.append((inside[-1][1], idx + 1)) 85 | inside.pop() 86 | else: 87 | if token_stripped in closing_symbols: 88 | # this closing symbol seems to be ignored 89 | closing_symbols.remove(token_stripped) 90 | opening_symbols.remove(CLOSE_2_OPEN[token_stripped]) 91 | # from now on ignore this symbol as start or end: 92 | inside = [(symbol, start) 93 | for symbol, start in inside 94 | if symbol != CLOSE_2_OPEN[token_stripped]] 95 | else: 96 | if observed_opens > 0: 97 | if token_stripped in closing_symbols: 98 | # this closing symbol seems to be ignored 99 | closing_symbols.remove(token_stripped) 100 | opening_symbols.remove(CLOSE_2_OPEN[token_stripped]) 101 | 102 | earliest_start = len(tokens) 103 | out_tokens = [] 104 | for start, end in open_closed_sections[::-1]: 105 | if start > earliest_start: 106 | continue 107 | else: 108 | if end != earliest_start: 109 | out_tokens = tokens[end:earliest_start] + out_tokens 110 | out_tokens = [tokens[start:end]] + out_tokens 111 | earliest_start = start 112 | if earliest_start > 0: 113 | out_tokens = tokens[0:earliest_start] + out_tokens 114 | return out_tokens 115 | 116 | -------------------------------------------------------------------------------- /ciseau/regular_expressions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import sys 4 | 5 | from .constants import dashes 6 | 7 | dashes_no_repeats = dashes[:] 8 | dashes_no_repeats.remove("--+") 9 | 10 | matching_dashes = dashes_no_repeats + ["-+"] 11 | 12 | word_with_alpha_and_period = re.compile("^([^\.]+)(\.\s*)$") 13 | one_letter_long_or_repeating = re.compile("^(?:(?:[a-z])|(?:[a-z](?:\.[a-z])+))$", re.IGNORECASE) 14 | no_punctuation = re.compile("^\w+$") 15 | left_quote_shifter = re.compile(u"((`‘(?!`))|(‘(?!‘))\s*)(?=.*\w)", re.UNICODE) 16 | left_quote_converter = re.compile(u'([«"“]\s*)(?=.*\w)', re.UNICODE) 17 | left_single_quote_converter = re.compile(u"(?:(\W|^))('\s*)(?=.*\w)", re.UNICODE) 18 | right_single_quote_converter = re.compile(u"(['’]+)(?=\W|$)\s*", re.UNICODE) 19 | 20 | if sys.version_info >= (3,3): 21 | repeated_dash_converter = re.compile("--+") 22 | dash_converter = re.compile("|".join(dashes_no_repeats)) 23 | else: 24 | repeated_dash_converter = re.compile(u"--+") 25 | dash_converter = re.compile(u"|".join(dashes_no_repeats)) 26 | 27 | simple_dash_finder = re.compile("(-\s*)") 28 | advanced_dash_finder = re.compile("(" + "|".join(matching_dashes) + ")\s*") 29 | multi_single_quote_finder = re.compile("('{2,})\s*") 30 | url_file_finder = re.compile("(?:[-a-zA-Z0-9@%._\+~#=]{2,256}://)?" 
31 | "(?:www\.)?[-a-zA-Z0-9@:%\._\+~#=]{2," 32 | "256}\.[a-z]{2,6}[-a-zA-Z0-9@:%_\+.~#" 33 | "?&//=]*\s*") 34 | numerical_expression = re.compile(u"(\d+(?:,\d+)*(?:\.\d+)*(?![a-zA-ZÀ-ż])\s*)") 35 | remaining_quote_converter = re.compile(u'(.)(?=["“”»])') 36 | shifted_ellipses = re.compile("([\.\!\?¿¡]{2,})\s*") 37 | shifted_standard_punctuation = re.compile(u"([\(\[\{\}\]\)\!¡\?¿#\$%;~&+=<>|/:,—…])\s*") 38 | period_mover = re.compile(u"([a-zA-ZÀ-ż]{2})([\./])\s+([a-zA-ZÀ-ż]{2})") 39 | pure_whitespace = re.compile("\s+") 40 | english_specific_appendages = re.compile(u"(\w)(?=['’]([dms])\\b)", re.UNICODE) 41 | english_nots = re.compile(u"(.)(?=n['’]t\\b)", re.UNICODE) 42 | english_contractions = re.compile(u"(.)(?=['’](ve|ll|re)\\b)") 43 | french_appendages = re.compile(u"(\\b[tjnlsmdclTJNLSMLDC]|qu)['’](?=[^tdms])") 44 | word_with_period = re.compile("[^\s\.]+\.{0,1}") 45 | -------------------------------------------------------------------------------- /ciseau/sentence_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .regular_expressions import word_with_alpha_and_period 3 | from .quoted_expressions import group_quoted_tokens 4 | from .constants import ( 5 | PUNCT_SYMBOLS, 6 | CONTINUE_PUNCT_SYMBOLS 7 | ) 8 | from .word_tokenizer import tokenize 9 | 10 | def is_end_symbol(symbol): 11 | return ( 12 | symbol[:2] in PUNCT_SYMBOLS 13 | ) 14 | 15 | def detect_sentence_boundaries(tokens): 16 | """ 17 | Subdivide an input list of strings (tokens) 18 | into multiple lists according to detected 19 | sentence boundaries. 20 | 21 | ``` 22 | detect_sentence_boundaries( 23 | ["Cat ", "sat ", "mat", ". ", "Cat ", "'s ", "named ", "Cool", "."] 24 | ) 25 | #=> [ 26 | ["Cat ", "sat ", "mat", ". "], 27 | ["Cat ", "'s ", "named ", "Cool", "."] 28 | ] 29 | ``` 30 | 31 | Arguments: 32 | ---------- 33 | 34 | tokens : list 35 | 36 | Returns: 37 | -------- 38 | list> : original list subdivided into multiple 39 | lists according to (detected) sentence boundaries. 40 | """ 41 | tokenized = group_quoted_tokens(tokens) 42 | words = [] 43 | sentences = [] 44 | for i in range(len(tokenized)): 45 | # this is a parenthetical: 46 | end_sentence = False 47 | if isinstance(tokenized[i], list): 48 | if len(words) == 0: 49 | # end if a sentence finishes inside quoted section, 50 | # and no sentence was begun beforehand 51 | if is_end_symbol(tokenized[i][-2].rstrip()): 52 | end_sentence = True 53 | else: 54 | # end if a sentence finishes inside quote marks 55 | if (tokenized[i][0][0] == '"' and 56 | is_end_symbol(tokenized[i][-2].rstrip()) and 57 | not tokenized[i][1][0].isupper()): 58 | end_sentence = True 59 | words.extend(tokenized[i]) 60 | else: 61 | stripped_tokenized = tokenized[i].rstrip() 62 | if is_end_symbol(stripped_tokenized): 63 | words.append(tokenized[i]) 64 | not_last_word = i + 1 != len(tokenized) 65 | next_word_lowercase = ( 66 | not_last_word and 67 | tokenized[i+1][0].islower() 68 | ) 69 | next_word_continue_punct = ( 70 | not_last_word and 71 | tokenized[i+1][0] in CONTINUE_PUNCT_SYMBOLS 72 | ) 73 | end_sentence = not ( 74 | not_last_word and 75 | ( 76 | next_word_lowercase or 77 | next_word_continue_punct 78 | ) 79 | ) 80 | else: 81 | words.append(tokenized[i]) 82 | if end_sentence: 83 | sentences.append(words) 84 | words = [] 85 | 86 | # add final sentence, if it wasn't added yet. 
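# (e.g. when the input ends without terminal punctuation, the remaining tokens still form a sentence)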
87 | if len(words) > 0: 88 | sentences.append(words) 89 | 90 | # If the final word ends in a period: 91 | if len(sentences) > 0 and sentences[-1][-1]: 92 | alpha_word_piece = word_with_alpha_and_period.match(sentences[-1][-1]) 93 | if alpha_word_piece: 94 | sentences[-1][-1] = alpha_word_piece.group(1) 95 | sentences[-1].append(alpha_word_piece.group(2)) 96 | return sentences 97 | 98 | 99 | def remove_whitespace(sentences): 100 | """ 101 | Clear out spaces and newlines 102 | from the list of list of strings. 103 | 104 | Arguments: 105 | ---------- 106 | sentences : list> 107 | 108 | Returns: 109 | -------- 110 | list> : same strings as input, 111 | without spaces or newlines. 112 | """ 113 | return [[w.rstrip() for w in sent] for sent in sentences] 114 | 115 | 116 | def sent_tokenize(text, keep_whitespace=False, normalize_ascii=True): 117 | """ 118 | Perform sentence + word tokenization on the input text 119 | using regular expressions and english/french specific 120 | rules. 121 | 122 | Arguments: 123 | ---------- 124 | text : str, input string to tokenize 125 | keep_whitespace : bool, whether to strip out spaces 126 | and newlines. 127 | normalize_ascii : bool, perform some replacements 128 | on rare characters so that they become 129 | easier to process in a ascii pipeline 130 | (canonicalize dashes, replace œ -> oe, etc..) 131 | Returns: 132 | -------- 133 | list> : sentences with their content held 134 | in a list of strings for each token. 135 | """ 136 | sentences = detect_sentence_boundaries( 137 | tokenize( 138 | text, 139 | normalize_ascii 140 | ) 141 | ) 142 | if not keep_whitespace: 143 | sentences = remove_whitespace(sentences) 144 | return sentences 145 | 146 | -------------------------------------------------------------------------------- /ciseau/wiki_markup_processing.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .sentence_tokenizer import sent_tokenize 3 | 4 | bracket_parser = re.compile("\[\[(?P[^\]\|]+)(?:\|[\W]*(?P[^\]\#\|]+)(?:\#[^\]\|]+)?)*\]\]") 5 | squiggly_bracket_parser = re.compile("{{([^}]+)}}") 6 | table_parser = re.compile("{\|[^}]+\|}") 7 | mvar_parser = re.compile("{{\d*mvar\d*\|([^}]+)}}") 8 | remove_emphasis = re.compile("'{2,5}([^']+)'{2,5}") 9 | 10 | # handles links that don't have a pipe sign" 11 | double_bracket_parser = re.compile("\[\[|\]\]") 12 | # normalizes: 01/02/2003, 2005-06-07, and 2001 type dates to 7777 13 | date_remover = re.compile("((\d{4}(?:[-/]\d{2}[-/]\d{2})?)|(\d{2}(?:[-/]\d{2}[-/]\d{4})))(?=[^\d]|$)") 14 | remove_emphasis_asterix = re.compile("\*{2,5}([^\*]+)\*{2,5}") 15 | remove_emphasis_slash = re.compile("/{2,5}([^/]+)/{2,5}") 16 | remove_emphasis_low_ticks = re.compile(",{2,5}([^,]+),{2,5}") 17 | remove_emphasis_heading = re.compile("={2,5}([^=]+)={2,5}") 18 | remove_emphasis_strikethrough = re.compile("~{2}([^~]+)~{2}") 19 | remove_emphasis_underline = re.compile("_{2}([^_]+)_{2}") 20 | remove_bullets_nbsps = re.compile("(&nbsp;| |[\^\n]\*{1,}|[\^\n]\#{1,}|[\^\n]:{1,})") # remove lists, bullet points, and html no breakspace 21 | remove_wikipedia_link = re.compile("\[\W*http[^\] ]+\b*(?P[^\]]+)\]") 22 | markup_normalizer = re.compile("[',/\*_=-]{2,5}") 23 | markup_removes = [ 24 | remove_emphasis, 25 | remove_emphasis_heading, 26 | remove_emphasis_asterix, 27 | remove_emphasis_slash, 28 | remove_emphasis_low_ticks, 29 | remove_emphasis_strikethrough, 30 | remove_emphasis_underline 31 | ] 32 | replacer = lambda matches: matches.group('trigger') if 
matches.group('trigger') != None else matches.group('name') 33 | anchor_replacer = lambda matches: matches.group('anchor') if matches.group('anchor') else '' 34 | html_remover = re.compile("<[^>]+>") 35 | internal_html_remover = re.compile("{{[^(}})]+}}") 36 | math_source_sections = re.compile("<(math|source|code|sub|sup)[^>]*>([^<]" 37 | "*)") 38 | greater_than = re.compile("(\W)>(\W)") 39 | less_than = re.compile("<([^\w/])") 40 | single_internal_link = re.compile("\[\[([^\]\|]+)\]\]") 41 | category_internal_link = re.compile("\[\[Category:([^\]\|]+)\]\]") 42 | 43 | # handles links that always have a pipe sign e.g. "[[the girl|Angelina Jolie]]" 44 | anchortag_internal_link = re.compile("\[\[(?P[^\]\|]+)\|[\W]*(" 45 | "?P[^\]\#\|]+)(?:\#[^\]\|]+)?\]\]") 46 | url_remover = re.compile("http://[a-zA-Z\.&/]+") 47 | empty_space = " " 48 | empty_string = "" 49 | 50 | 51 | def remove_dates(text): 52 | return date_remover.sub("7777", text) 53 | 54 | 55 | def remove_html(text): 56 | return html_remover.sub(empty_space, text) 57 | 58 | 59 | def remove_markup(text): 60 | return markup_normalizer.sub(empty_string, text) 61 | 62 | 63 | def reintroduce_less_than(text): 64 | #return text 65 | return less_than.sub("<\g<1>", text) 66 | 67 | 68 | def reintroduce_greater_than(text): 69 | #return text 70 | return greater_than.sub("\g<1>>\g<2>", text) 71 | 72 | 73 | def reintroduce_less_than_greater_than(text): 74 | return reintroduce_less_than(reintroduce_greater_than(text)) 75 | 76 | 77 | def remove_math_sections(text): 78 | return math_source_sections.sub(empty_space, reintroduce_less_than_greater_than(text)) 79 | 80 | 81 | def _remove_brackets(text): 82 | return anchortag_internal_link.sub( 83 | "\g", 84 | single_internal_link.sub( 85 | "\g<1>", 86 | category_internal_link.sub( 87 | "\n\g<1> .\n", 88 | text 89 | ) 90 | ) 91 | ) 92 | 93 | 94 | def _remove_table(text): 95 | return table_parser.sub(empty_space, text) 96 | 97 | 98 | def _remove_squiggly_bracket(text): 99 | return squiggly_bracket_parser.sub(empty_space, text) 100 | 101 | 102 | def _remove_mvar(text): 103 | return mvar_parser.sub("\g<1>", text) 104 | 105 | 106 | def remove_remaining_double_brackets(text): 107 | return double_bracket_parser.sub(empty_space, text) 108 | 109 | 110 | def _remove_urls(text): 111 | return url_remover.sub("url", text) 112 | 113 | 114 | def remove_brackets(text): 115 | return remove_remaining_double_brackets(_remove_brackets(text)) 116 | 117 | 118 | def to_raw_text_markupless(text, keep_whitespace=False, normalize_ascii=True): 119 | """ 120 | A generator to convert raw text segments, without xml to a 121 | list of words without any markup. 122 | Additionally dates are replaced by `7777` for normalization. 123 | 124 | Arguments 125 | --------- 126 | text: str, input text to tokenize, strip of markup. 127 | keep_whitespace : bool, should the output retain the 128 | whitespace of the input (so that char offsets in the 129 | output correspond to those in the input). 130 | 131 | Returns 132 | ------- 133 | generator>>, a generator for sentences, with 134 | within each sentence a list of the words separated. 135 | """ 136 | return sent_tokenize( 137 | remove_dates(_remove_urls(text)), 138 | keep_whitespace, 139 | normalize_ascii 140 | ) 141 | 142 | 143 | def to_raw_text(text, keep_whitespace=False, normalize_ascii=True): 144 | """ 145 | A generator to convert raw text segments, with xml, and other 146 | non-textual content to a list of words without any markup. 
147 | Additionally dates are replaced by `7777` for normalization. 148 | 149 | Arguments 150 | --------- 151 | text: str, input text to tokenize, strip of markup. 152 | keep_whitespace : bool, should the output retain the 153 | whitespace of the input (so that char offsets in the 154 | output correspond to those in the input). 155 | 156 | Returns 157 | ------- 158 | generator>>, a generator for sentences, with 159 | within each sentence a list of the words separated. 160 | """ 161 | out = text 162 | out = _remove_urls(text) 163 | out = _remove_mvar(out) 164 | out = _remove_squiggly_bracket(out) 165 | out = _remove_table(out) 166 | out = _remove_brackets(out) 167 | out = remove_remaining_double_brackets(out) 168 | out = remove_markup(out) 169 | out = remove_wikipedia_link.sub(anchor_replacer, out) 170 | out = remove_bullets_nbsps.sub(empty_space, out) 171 | out = remove_dates(out) 172 | out = remove_math_sections(out) 173 | out = remove_html(out) 174 | out = sent_tokenize(out, keep_whitespace, normalize_ascii) 175 | return out 176 | 177 | 178 | def to_raw_text_pairings(text, keep_whitespace=False, normalize_ascii=True): 179 | """ 180 | A generator to convert raw text segments, with xml, and other 181 | non-textual content to a list of words without any markup. 182 | Additionally dates are replaced by `7777` for normalization, 183 | along with wikipedia anchors kept. 184 | 185 | Arguments 186 | --------- 187 | text: str, input text to tokenize, strip of markup. 188 | keep_whitespace : bool, should the output retain the 189 | whitespace of the input (so that char offsets in the 190 | output correspond to those in the input). 191 | 192 | Returns 193 | ------- 194 | generator>>, a generator for sentences, with 195 | within each sentence a list of the words separated. 196 | """ 197 | out = text 198 | out = _remove_mvar(out) 199 | out = _remove_squiggly_bracket(out) 200 | out = _remove_table(out) 201 | out = remove_markup(out) 202 | out = remove_wikipedia_link.sub(anchor_replacer, out) 203 | out = remove_bullets_nbsps.sub(empty_space, out) 204 | out = remove_math_sections(out) 205 | out = remove_html(out) 206 | for sentence in sent_tokenize(out, keep_whitespace, normalize_ascii): 207 | yield sentence 208 | -------------------------------------------------------------------------------- /ciseau/word_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from .constants import ( 4 | PUNCT_SYMBOLS, 5 | ABBR, 6 | MONTHS, 7 | UNDECIDED, 8 | SHOULD_SPLIT, 9 | SHOULD_NOT_SPLIT 10 | ) 11 | from .regular_expressions import ( 12 | word_with_period, 13 | no_punctuation, 14 | numerical_expression, 15 | repeated_dash_converter, 16 | dash_converter, 17 | pure_whitespace, 18 | left_quote_shifter, 19 | left_quote_converter, 20 | one_letter_long_or_repeating, 21 | left_single_quote_converter, 22 | remaining_quote_converter, 23 | english_nots, 24 | english_contractions, 25 | english_specific_appendages, 26 | french_appendages, 27 | right_single_quote_converter, 28 | simple_dash_finder, 29 | advanced_dash_finder, 30 | url_file_finder, 31 | shifted_ellipses, 32 | shifted_standard_punctuation, 33 | multi_single_quote_finder 34 | ) 35 | 36 | 37 | def protect_shorthand(text, split_locations): 38 | """ 39 | Annotate locations in a string that contain 40 | periods as being true periods or periods 41 | that are a part of shorthand (and thus should 42 | not be treated as punctuation marks). 
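For example, in "Mr. Smith arrived." the period in "Mr." belongs to the abbreviation and is marked SHOULD_NOT_SPLIT, while the final period genuinely ends the sentence and is marked SHOULD_SPLIT.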
43 | 44 | Arguments: 45 | ---------- 46 | text : str 47 | split_locations : list, same length as text. 48 | """ 49 | word_matches = list(re.finditer(word_with_period, text)) 50 | total_words = len(word_matches) 51 | 52 | for i, match in enumerate(word_matches): 53 | match_start = match.start() 54 | match_end = match.end() 55 | for char_pos in range(match_start, match_end): 56 | if split_locations[char_pos] == SHOULD_SPLIT and match_end - char_pos > 1: 57 | match_start = char_pos 58 | word = text[match_start:match_end] 59 | 60 | if not word.endswith('.'): 61 | # ensure that words contained within other words: 62 | # e.g. 'chocolate.Mountains of' -> 'chocolate. Mountains of' 63 | if (not word[0].isdigit() and 64 | split_locations[match_start] == UNDECIDED): 65 | split_locations[match_start] = SHOULD_SPLIT 66 | continue 67 | period_pos = match_end - 1 68 | # this is not the last word, abbreviation 69 | # is not the final period of the sentence, 70 | # moreover: 71 | word_is_in_abbr = word[:-1].lower() in ABBR 72 | is_abbr_like = ( 73 | word_is_in_abbr or 74 | one_letter_long_or_repeating.match(word[:-1]) is not None 75 | ) 76 | is_digit = False if is_abbr_like else word[:-1].isdigit() 77 | 78 | is_last_word = i == (total_words - 1) 79 | is_ending = is_last_word and (match_end == len(text) or text[match_end:].isspace()) 80 | is_not_ending = not is_ending 81 | abbreviation_and_not_end = ( 82 | len(word) > 1 and 83 | is_abbr_like and 84 | is_not_ending 85 | ) 86 | 87 | if abbreviation_and_not_end and ( 88 | (not is_last_word and word_matches[i+1].group(0)[0].islower()) or 89 | (not is_last_word and word_matches[i+1].group(0) in PUNCT_SYMBOLS) or 90 | word[0].isupper() or 91 | word_is_in_abbr or 92 | len(word) == 2): 93 | # next word is lowercase (e.g. not a new sentence?), or next word 94 | # is punctuation or next word is totally uppercase (e.g. 'Mister. 95 | # ABAGNALE called to the stand') 96 | if split_locations[period_pos] == SHOULD_SPLIT and period_pos + 1 < len(split_locations): 97 | split_locations[period_pos + 1] = SHOULD_SPLIT 98 | split_locations[period_pos] = SHOULD_NOT_SPLIT 99 | elif (is_digit and 100 | len(word[:-1]) <= 2 and 101 | not is_last_word and 102 | word_matches[i+1].group(0).lower() in MONTHS): 103 | # a date or weird number with a period: 104 | if split_locations[period_pos] == SHOULD_SPLIT and period_pos + 1 < len(split_locations): 105 | split_locations[period_pos + 1] = SHOULD_SPLIT 106 | split_locations[period_pos] = SHOULD_NOT_SPLIT 107 | elif split_locations[period_pos] == UNDECIDED: 108 | # split this period into its own segment: 109 | split_locations[period_pos] = SHOULD_SPLIT 110 | 111 | 112 | def split_with_locations(text, locations): 113 | """ 114 | Use an integer list to split the string 115 | contained in `text`. 116 | 117 | Arguments: 118 | ---------- 119 | text : str, same length as locations. 120 | locations : list, contains values 121 | 'SHOULD_SPLIT', 'UNDECIDED', and 122 | 'SHOULD_NOT_SPLIT'. Will create 123 | strings between each 'SHOULD_SPLIT' 124 | locations. 125 | Returns: 126 | -------- 127 | Generator : the substrings of text 128 | corresponding to the slices given 129 | in locations. 
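For instance, using the split markers defined in `ciseau.constants` (a minimal illustration of the generator output):

```
list(split_with_locations("a cat", [UNDECIDED, UNDECIDED, SHOULD_SPLIT, UNDECIDED, UNDECIDED]))
#=> ["a ", "cat"]
```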
130 | """ 131 | start = 0 132 | for pos, decision in enumerate(locations): 133 | if decision == SHOULD_SPLIT: 134 | if start != pos: 135 | yield text[start:pos] 136 | start = pos 137 | if start != len(text): 138 | yield text[start:] 139 | 140 | 141 | def mark_regex(regex, text, split_locations): 142 | """ 143 | Regex that adds a 'SHOULD_SPLIT' marker at the end 144 | location of each matching group of the given regex. 145 | 146 | Arguments 147 | --------- 148 | regex : re.Expression 149 | text : str, same length as split_locations 150 | split_locations : list, split decisions. 151 | """ 152 | for match in regex.finditer(text): 153 | end_match = match.end() 154 | if end_match < len(split_locations): 155 | split_locations[end_match] = SHOULD_SPLIT 156 | 157 | 158 | def mark_begin_end_regex(regex, text, split_locations): 159 | """ 160 | Regex that adds a 'SHOULD_SPLIT' marker at the end 161 | location of each matching group of the given regex, 162 | and adds a 'SHOULD_SPLIT' at the beginning of the 163 | matching group. Each character within the matching 164 | group will be marked as 'SHOULD_NOT_SPLIT'. 165 | 166 | Arguments 167 | --------- 168 | regex : re.Expression 169 | text : str, same length as split_locations 170 | split_locations : list, split decisions. 171 | """ 172 | for match in regex.finditer(text): 173 | end_match = match.end() 174 | begin_match = match.start() 175 | 176 | for i in range(begin_match+1, end_match): 177 | split_locations[i] = SHOULD_NOT_SPLIT 178 | if end_match < len(split_locations): 179 | if split_locations[end_match] == UNDECIDED: 180 | split_locations[end_match] = SHOULD_SPLIT 181 | if split_locations[begin_match] == UNDECIDED: 182 | split_locations[begin_match] = SHOULD_SPLIT 183 | 184 | 185 | def tokenize(text, normalize_ascii=True): 186 | """ 187 | Convert a single string into a list of substrings 188 | split along punctuation and word boundaries. Keep 189 | whitespace intact by always attaching it to the 190 | previous token. 191 | 192 | Arguments: 193 | ---------- 194 | text : str 195 | normalize_ascii : bool, perform some replacements 196 | on non-ascii characters to canonicalize the 197 | string (defaults to True). 198 | 199 | Returns: 200 | -------- 201 | list, list of substring tokens. 202 | """ 203 | # 1. If there's no punctuation, return immediately 204 | if no_punctuation.match(text): 205 | return [text] 206 | # 2. let's standardize the input text to ascii (if desired) 207 | # Note: this will no longer respect input-to-output character positions 208 | if normalize_ascii: 209 | # normalize these greco-roman characters to ascii: 210 | text = text.replace(u"œ", "oe").replace(u"æ", "ae") 211 | # normalize dashes: 212 | text = repeated_dash_converter.sub("-", text) 213 | # 3. let's construct an integer array of the possible split locations: 214 | split_locations = [UNDECIDED] * len(text) 215 | 216 | regexes = ( 217 | pure_whitespace, 218 | left_quote_shifter, 219 | left_quote_converter, 220 | left_single_quote_converter, 221 | remaining_quote_converter, 222 | # regex can't fix this -> regex ca n't fix this 223 | english_nots, 224 | # you'll dig this -> you 'll dig this 225 | english_contractions, 226 | # the rhino's horns -> the rhino 's horns 227 | english_specific_appendages, 228 | # qu'a tu fais au rhino -> qu ' a tu fais au rhino, 229 | french_appendages 230 | ) 231 | # 4. 
Mark end locations for specific regular expressions: 232 | for regex in regexes: 233 | mark_regex(regex, text, split_locations) 234 | 235 | begin_end_regexes = ( 236 | multi_single_quote_finder, 237 | right_single_quote_converter, 238 | # use dashes as the breakpoint: 239 | # the rhino--truck -> the rhino -- truck 240 | simple_dash_finder if normalize_ascii else advanced_dash_finder, 241 | numerical_expression, 242 | url_file_finder, 243 | shifted_ellipses, 244 | # the #rhino! -> the # rhino ! ; 245 | # the rino[sic] -> the rino [ sic ] 246 | shifted_standard_punctuation 247 | ) 248 | 249 | # 5. Mark begin and end locations for other regular expressions: 250 | for regex in begin_end_regexes: 251 | mark_begin_end_regex(regex, text, split_locations) 252 | 253 | # 6. Remove splitting on exceptional uses of periods: 254 | # I'm with Mr. -> I 'm with Mr. , I'm with Mister. -> I 'm with Mister . 255 | protect_shorthand(text, split_locations) 256 | 257 | if normalize_ascii: 258 | text = dash_converter.sub("-", text) 259 | # 7. Return the split string using the integer list: 260 | return list(split_with_locations(text, split_locations)) 261 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | def readfile(fname): 5 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 6 | 7 | setup( 8 | name='ciseau', 9 | version='1.0.1', 10 | description='Word and sentence tokenization.', 11 | long_description=readfile('README.md'), 12 | ext_modules=[], 13 | packages=find_packages(), 14 | py_modules = [], 15 | author='Jonathan Raiman', 16 | author_email='jonathanraiman@gmail.com', 17 | url='https://github.com/JonathanRaiman/ciseau', 18 | download_url='https://github.com/JonathanRaiman/ciseau', 19 | keywords='XML, tokenization, NLP', 20 | license='MIT', 21 | platforms='any', 22 | zip_safe=False, 23 | classifiers=[ 24 | 'Intended Audience :: Science/Research', 25 | 'Operating System :: OS Independent', 26 | 'Programming Language :: Python :: 3.3', 27 | 'Programming Language :: Python :: 2.7', 28 | 'Topic :: Text Processing :: Linguistic', 29 | ], 30 | setup_requires = [], 31 | install_requires=[ 32 | ], 33 | include_package_data=True, 34 | ) 35 | -------------------------------------------------------------------------------- /tests/test_tokenization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import sys 4 | from ciseau import tokenize, sent_tokenize 5 | 6 | class TokenizationTests(unittest.TestCase): 7 | def test_quoted_expressions(self): 8 | expression = [ 9 | "(", "in ", "2008", ") ", "the ", "Martians ", 10 | "arrived ", "and ", "you", "'ll ", "see ", "what ", 11 | "I ", "mean", "." 12 | ] 13 | self.assertEqual( 14 | tokenize( 15 | "".join(expression) 16 | ), 17 | expression 18 | ) 19 | 20 | def test_pre_post_quote(self): 21 | expression = [ 22 | "On ", "January ", "28", ", ", "2011 ", ", ", "''", 23 | "Hollywood ", "Reporter", "'' ", "announced ", 24 | "that ", "Paramount ", "Pictures ", "had ", "given ", 25 | "the ", "green ", "light", "." 26 | ] 27 | self.assertEqual( 28 | tokenize( 29 | "".join(expression) 30 | ), 31 | expression 32 | ) 33 | 34 | def test_weird_hybrid_expressions(self): 35 | expression = [ 36 | u"Beyoncé", u"'s ", u"1840", u"'s ", u"song ", u"<", u"3lovely", u"." 
37 | ] 38 | self.assertEqual( 39 | tokenize( 40 | "".join(expression) 41 | ), 42 | expression 43 | ) 44 | 45 | def test_numerical_commas_periods_expressions(self): 46 | expression = [ 47 | "In ", "the ", "year ", "2000", ", ", 48 | "there ", "was ", "evidence ", "that ", "100,000 ", 49 | "martians ", "came ", "to ", "see ", "us", ", ", 50 | "but ", "I ", "did", "n't ", "even ", "hear ", 51 | "98.2", ",", "98.3 ", "or ", "98.4 ", "speak ", 52 | "about ", "it", "," 53 | ] 54 | self.assertEqual( 55 | tokenize("".join(expression)), 56 | expression 57 | ) 58 | 59 | def test_abbreviations(self): 60 | expression = [ 61 | "Mr. ", "hooligan ", "and ", "his ", "brother ", "DR. ", 62 | "strange ", "know ", "each ", "other ", "well ", "said ", 63 | "d. ", "A. ", "Joe ", "the ", "sgt. ", "in ", "charge ", 64 | "of ", "all ", "this ", "bs", "." 65 | ] 66 | self.assertEqual( 67 | tokenize("".join(expression)), 68 | expression 69 | ) 70 | 71 | def test_em_dash(self): 72 | expression = [ 73 | u"The ", u"earthquake ", u"was ", 74 | u"also ", u"felt ", u"in ", u"nearby ", u"countries ", 75 | u"and ", u"as ", u"far ", u"away ", u"as ", u"both ", 76 | u"Beijing ", u"and ", u"Shanghai ", u"—", u"1,500 ", u"km ", 77 | u"(", u"930 ", u"mi ", u") ", u"and ", u"1,700 ", u"km ", u"(", 78 | u"1,060 ", u"mi ", u") ", u"away", u"—", u"where ", u"office ", 79 | u"buildings ", u"swayed ", u"with ", u"the ", u"tremor", u"." 80 | ] 81 | self.assertEqual( 82 | tokenize("".join(expression), normalize_ascii=False), 83 | expression 84 | ) 85 | 86 | def test_quoted_expressions_with_ascii(self): 87 | expression = [ 88 | "Julius ", u"Cæsar ", "declared ", "-- ", "professed ", "- ", 89 | "his ", "passion ", "for ", "wine ", "A", "." 90 | ] 91 | self.assertEqual( 92 | tokenize( 93 | "".join(expression), 94 | normalize_ascii=False 95 | ), 96 | expression 97 | ) 98 | self.assertEqual( 99 | tokenize( 100 | "".join(expression), 101 | normalize_ascii=True 102 | ), 103 | [w.replace(u"æ", "ae").replace("--", "-") for w in expression] 104 | ) 105 | 106 | def test_sentence_detection(self): 107 | expression = [ 108 | [u'Maslow', u'’s ', u'‘‘', u'Third ', u'Force ', u'Psychology ', 109 | u'Theory', u'’’ ', u'even ', u'allows ', u'literary ', u'analysts ', 110 | u'to ', u'critically ', u'understand ', u'how ', u'characters ', 111 | u'reflect ', u'the ', u'culture ', u'and ', u'the ', u'history ', 112 | u'in ', u'which ', u'they ', u'are ', u'contextualized', u'. '], 113 | [u'It ', u'also ', u'allows ', u'analysts ', u'to ', u'understand ', 114 | u'the ', u'author', u'’s ', u'intended ', u'message ', u'and ', u'to ', 115 | u'understand ', u'the ', u'author', u'’s ', u'psychology', u'. '], 116 | [u'The ', u'theory ', u'suggests ', u'that ', u'human ', u'beings ', 117 | u'possess ', u'a ', u'nature ', u'within ', u'them ', u'that ', 118 | u'demonstrates ', u'their ', u'true ', u'“', u'self', u'” ', u'and ', 119 | u'it ', u'suggests ', u'that ', u'the ', u'fulfillment ', u'of ', 120 | u'this ', u'nature ', u'is ', u'the ', u'reason ', u'for ', u'living', 121 | u'. '], 122 | [u'It ', u'also ', u'suggests ', u'that ', u'neurological ', 123 | u'development ', u'hinders ', u'actualizing ', u'the ', u'nature ', 124 | u'because ', u'a ', u'person ', u'becomes ', u'estranged ', u'from ', 125 | u'his ', u'or ', u'her ', u'true ', u'self', u'. '], 126 | [u'Therefore', u', ', u'literary ', u'devices ', u'reflect ', u'a ', 127 | u'characters', u'’s ', u'and ', u'an ', u'author', u'’s ', u'natural ', 128 | u'self', u'. 
'], 129 | [u'In ', u'his ', u'‘‘', u'Third ', u'Force ', u'Psychology ', u'and ', 130 | u'the ', u'Study ', u'of ', u'Literature', u'’’', u', ', u'Paris ', 131 | u'argues ', u'“', u'D.', u'H ', u'Lawrence', u'’s ', u'“', u'pristine ', 132 | u'unconscious', u'” ', u'is ', u'a ', u'metaphor ', u'for ', u'the ', 133 | u'real ', u'self', u'”', u'. '], 134 | [u'Thus ', u'Literature ', u'is ', u'a ', u'reputable ', u'tool ', 135 | u'that ', u'allows ', u'readers ', u'to ', u'develop ', u'and ', 136 | u'apply ', u'critical ', u'reasoning ', u'to ', u'the ', u'nature ', 137 | u'of ', u'emotions', u'.'] 138 | ] 139 | self.assertEqual( 140 | sent_tokenize( 141 | u"".join(w for sent in expression for w in sent), 142 | keep_whitespace=True 143 | ), 144 | expression 145 | ) 146 | 147 | def test_unequal_quote_detection(self): 148 | expression = [ 149 | [u"Beyoncé", u"'s ", u'vocal ', u'range ', u'spans ', u'four ', u'octaves', 150 | u'. '], 151 | [u'Jody ', u'Rosen ', u'highlights ', u'her ', u'tone ', u'and ', 152 | u'timbre ', u'as ', u'particularly ', u'distinctive', u', ', 153 | u'describing ', u'her ', u'voice ', u'as ', u'"', u'one ', u'of ', 154 | u'the ', u'most ', u'compelling ', u'instruments ', u'in ', 155 | u'popular ', u'music', u'"', u'. ' 156 | ], 157 | [u'While ', u'another ', u'critic ', u'says ', u'she ', u'is ', 158 | u'a ', u'"', u'Vocal ', u'acrobat', u', ', u'being ', u'able ', 159 | u'to ', u'sing ', u'long ', u'and ', u'complex ', u'melismas ', 160 | u'and ', u'vocal ', u'runs ', u'effortlessly', u', ', u'and ', 161 | u'in ', u'key', u'. '], 162 | [u'Her ', u'vocal ', u'abilities ', u'mean ', u'she ', u'is ', 163 | u'identified ', u'as ', u'the ', u'centerpiece ', u'of ', u'Destiny', 164 | u"'s ", u'Child', u'. '], 165 | [u'The ', u'Daily ', u'Mail ', u'calls ', u"Beyoncé", u"'s ", u'voice ', 166 | u'"', u'versatile', u'"', u', ', u'capable ', u'of ', u'exploring ', 167 | u'power ', u'ballads', u', ', u'soul', u', ', u'rock ', u'belting', 168 | u', ', u'operatic ', u'flourishes', u', ', u'and ', u'hip ', u'hop', 169 | u'. '], 170 | [u'Jon ', u'Pareles ', u'of ', u'The ', u'New ', u'York ', u'Times ', 171 | u'commented ', u'that ', u'her ', u'voice ', u'is ', u'"', u'velvety ', 172 | u'yet ', u'tart', u', ', u'with ', u'an ', u'insistent ', u'flutter ', 173 | u'and ', u'reserves ', u'of ', u'soul ', u'belting', u'"', u'. '], 174 | [u'Rosen ', u'notes ', u'that ', u'the ', u'hip ', u'hop ', u'era ', 175 | u'highly ', u'influenced ', u"Beyoncé", u"'s ", u'strange ', u'rhythmic ', 176 | u'vocal ', u'style', u', ', u'but ', u'also ', u'finds ', u'her ', 177 | u'quite ', u'traditionalist ', u'in ', u'her ', u'use ', u'of ', 178 | u'balladry', u', ', u'gospel ', u'and ', u'falsetto', u'. 
'], 179 | [u'Other ', u'critics ', u'praise ', u'her ', u'range ', u'and ', 180 | u'power', u', ', u'with ', u'Chris ', u'Richards ', u'of ', u'The ', 181 | u'Washington ', u'Post ', u'saying ', u'she ', u'was ', u'"', 182 | u'capable ', u'of ', u'punctuating ', u'any ', u'beat ', u'with ', 183 | u'goose', u'-', u'bump', u'-', u'inducing ', u'whispers ', u'or ', 184 | u'full', u'-', u'bore ', u'diva', u'-', u'roars', u'.', u'"'] 185 | ] 186 | self.assertEqual( 187 | sent_tokenize( 188 | u"".join(w for sent in expression for w in sent), 189 | keep_whitespace=True 190 | ), 191 | expression 192 | ) 193 | 194 | def test_contained_period_in_quotes(self): 195 | expression = [[ 196 | "the ", "gray ", "bird ", "(", "which ", "was ", 197 | "famous ", "for ", "its ", "colors", ".", ") ", 198 | "was ", "ressurected ", "\" ", "she ", "said", ".", "\"" 199 | ]] 200 | self.assertEqual( 201 | sent_tokenize( 202 | "".join(w for sent in expression for w in sent), 203 | keep_whitespace=True 204 | ), 205 | expression 206 | ) 207 | 208 | def test_period_sequences(self): 209 | expression = [[ 210 | "Mr. ", "Joe ", "was ", "always ", "late ", "to ", "his ", 211 | "dates", ", ", "appointments", ", ", "etc.", "." 212 | ]] 213 | self.assertEqual( 214 | sent_tokenize( 215 | "".join(w for sent in expression for w in sent), 216 | keep_whitespace=True 217 | ), 218 | expression 219 | ) 220 | 221 | def test_spanish_tokenization(self): 222 | expressions = [ 223 | [ 224 | [ 225 | u"Pero ", u"si ", u"no ", u"es ", u"el ", u"caso", u", ", u"llega ", 226 | u"el ", u"momento ", u"de ", u"hacerse ", u"la ", u"pregunta ", u"de ", 227 | u"cada ", u"año", u". " 228 | ], 229 | [ 230 | u"¿", u"Qué ", u"hago ", u"con ", u"estos ", u"sobres ", u"de ", u"jamón ", 231 | u"o ", u"este ", u"lomo ", u"ibérico", u"? " 232 | ], 233 | [ 234 | u"¿", u"Los ", u"puedo ", u"congelar ", u"o ", u"es ", u"una ", u"aberración", 235 | u"? ", 236 | ], 237 | [ 238 | u"La ", u"respuesta ", u"rápida ", u"sería ", u"un ", u"sí", u"." 239 | ] 240 | ], 241 | [ 242 | [ 243 | u"De ", u"hecho", u", ", u"es ", u"algo ", u"que ", u"lleva ", u"mucho ", u"tiempo ", 244 | u"haciéndose", u". " 245 | ], 246 | [ 247 | u"En ", u"las ", u"matanzas ", u"de ", u"los ", u"pueblos ", u"muchas ", u"piezas ", 248 | u"se ", u"congelan ", u"una ", u"vez ", u"curadas ", u"para ", u"ir ", u"luego ", 249 | u"dándoles ", u"salida ", u"a ", u"lo ", u"largo ", u"de ", u"todo ", u"el ", u"año", 250 | u". " 251 | ], 252 | [ 253 | u"Otro ", u"ejemplo ", u"clásico", u": ", u"las ", u"embarazas ", u"que ", u"quieren ", 254 | u"evitar ", u"cualquier ", u"posible ", u"riesgo ", u"de ", u"toxoplasmosis ", u"pero ", 255 | u"no ", u"quieren ", u"renunciar ", u"a ", u"los ", u"embutidos ", u"durante ", u"eso ", 256 | u"nueve ", u"meses", u". " 257 | ], 258 | [ 259 | u"¿", u"Solución", u"? " 260 | ], 261 | [ 262 | u"Congelarlo", u"." 263 | ] 264 | ], 265 | [ 266 | [ 267 | u"Que ", u"lo ", u"sepas", u", ", u"¡", u"no ", u"pienso ", u"hacerlo ", u"todo ", u"yo ", 268 | u"sola", u"!" 269 | ] 270 | ], 271 | [ 272 | [ 273 | u"¡", u"No ", u"pienso ", u"hacerlo ", u"todo ", u"yo ", u"sola", u", ", u"que ", u"lo ", 274 | u"sepas", u"!" 275 | ] 276 | ], 277 | [ 278 | [ 279 | u"¡", u"No ", u"me ", u"digas ", u"nada", u"! " 280 | ], 281 | [ 282 | u"¡", u"Te ", u"has ", u"portado ", u"fatal", u"! " 283 | ], 284 | [ 285 | u"¡", u"No ", u"quiero ", u"volver ", u"a ", u"saber ", u"nada ", u"de ", u"ti", u"!" 286 | ] 287 | ], 288 | [ 289 | [ 290 | u"¡¡¡", u"Al ", u"ladrón", u"!!!" 
291 | ] 292 | ] 293 | ] 294 | for expression in expressions: 295 | self.assertEqual( 296 | sent_tokenize( 297 | "".join(w for sent in expression for w in sent), 298 | keep_whitespace=True 299 | ), 300 | expression 301 | ) 302 | 303 | def test_german_tokenization(self): 304 | expressions = [ 305 | [ 306 | [ 307 | u"Als ", u"Vertreter ", u"des ", u"One", u"-", u"Nation", u"-", u"Konservatismus ", 308 | u"bekleidete ", u"er ", u"nach ", u"dem ", u"Wahlsieg ", u"der ", u"Tories ", u"1951 ", 309 | u"als ", u"führendes ", u"Kabinettsmitglied ", u"mehrere ", u"wichtige ", u"Regierungsämter", 310 | u", ", u"unter ", u"anderem ", u"das ", u"des ", u"Verteidigungsministers", u", ", u"des ", 311 | u"Außenministers ", u"und ", u"des ", u"Schatzkanzlers", u". " 312 | ], 313 | [ 314 | u"Seine ", u"Amtszeit ", u"als ", 315 | u"Premierminister ", u"war ", u"innenpolitisch ", u"geprägt ", u"von ", u"zahlreichen ", u"Reformen ", 316 | u"sowie ", u"einer ", u"prosperierenden ", u"Wirtschaft ", u"mit ", u"niedriger ", u"Arbeitslosigkeit ", 317 | u"und ", u"ungleichmäßigem ", u"Wirtschaftswachstum", u". ", 318 | ], 319 | [ 320 | u"Außenpolitisch ", u"behob ", u"er ", u"die ", u"durch ", u"die ", u"Sueskrise ", u"entstandene ", 321 | u"Entfremdung ", u"mit ", u"den ", u"USA", u", ", u"erreichte ", u"die ", u"Lieferung ", u"von ", 322 | u"amerikanischen ", u"Polaris", u"-", u"Mittelstreckenraketen ", u"als ", u"neuen ", u"Kern ", 323 | u"der ", u"britischen ", u"nuklearen ", u"Abschreckung ", u"und ", u"bereitete ", u"den ", u"Weg ", 324 | u"für ", u"ein ", u"partielles ", u"Atomteststoppabkommen", u"." 325 | ] 326 | ] 327 | ] 328 | for expression in expressions: 329 | self.assertEqual( 330 | sent_tokenize( 331 | "".join(w for sent in expression for w in sent), 332 | keep_whitespace=True, 333 | normalize_ascii=False 334 | ), 335 | expression 336 | ) 337 | --------------------------------------------------------------------------------