├── .gitignore ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── ciseau ├── __init__.py ├── constants.py ├── quoted_expressions.py ├── regular_expressions.py ├── sentence_tokenizer.py ├── wiki_markup_processing.py └── word_tokenizer.py ├── setup.py └── tests └── test_tokenization.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | 10 | # Packages # 11 | ############ 12 | # it's better to unpack these files and commit the raw source 13 | # git has its own built in compression methods 14 | *.7z 15 | *.dmg 16 | *.gz 17 | *.iso 18 | *.jar 19 | *.rar 20 | *.tar 21 | *.zip 22 | *.gem 23 | *.pem 24 | dist/ 25 | build/ 26 | 27 | # Saves # 28 | ######### 29 | saves/* 30 | imported_saves/* 31 | pvdm_snapshots/* 32 | sentiment_data/* 33 | *.npy 34 | *.mat 35 | *.vocab 36 | *.svocab 37 | text8 38 | __pycache__/* 39 | *.pyc 40 | *.egg-info 41 | 42 | # Logs and databases # 43 | ###################### 44 | *.log 45 | *.sql 46 | *.sqlite 47 | 48 | # OS generated files # 49 | ###################### 50 | .DS_Store 51 | .DS_Store? 52 | ._* 53 | .Spotlight-V100 54 | .Trashes 55 | ehthumbs.db 56 | Thumbs.db 57 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | dist: trusty 3 | python: 4 | - '2.7' 5 | - '3.3' 6 | - '3.4' 7 | - '3.5' 8 | os: 9 | - linux 10 | install: 11 | - python setup.py install 12 | - pip install nose2 13 | script: nose2 14 | notifications: 15 | email: 16 | recipients: 17 | - jonathanraiman@gmail.com 18 | on_success: change 19 | on_failure: always 20 | after_success: 21 | - bash <(curl -s https://codecov.io/bash) 22 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Jonathan Raiman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | recursive-include ciseau *.pyx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Ciseau 2 | ------ 3 | 4 | Word and sentence tokenization in Python. 5 | 6 | [![PyPI version](https://badge.fury.io/py/ciseau.svg)](https://badge.fury.io/py/ciseau) 7 | [![Build Status](https://travis-ci.org/JonathanRaiman/ciseau.svg?branch=master)](https://travis-ci.org/JonathanRaiman/ciseau) 8 | ![Jonathan Raiman, author](https://img.shields.io/badge/Author-Jonathan%20Raiman%20-blue.svg) 9 | 10 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE.md) 11 | 12 | 13 | Usage 14 | ----- 15 | 16 | Use this package to split strings along word and sentence boundaries. 17 | For instance, to break a string into word-level tokens (whitespace stays attached to the preceding token): 18 | 19 | ``` 20 | tokenize("Joey was a great sailor.") 21 | #=> ["Joey ", "was ", "a ", "great ", "sailor", "."] 22 | ``` 23 | 24 | To also detect sentence boundaries: 25 | 26 | ``` 27 | sent_tokenize("Cat sat mat. Cat's named Cool.", keep_whitespace=True) 28 | #=> [["Cat ", "sat ", "mat", ". "], ["Cat ", "'s ", "named ", "Cool", "."]] 29 | ``` 30 | 31 | `sent_tokenize` preserves the original whitespace on each token when called with `keep_whitespace=True`, and leaves dashes and other special characters unnormalized when called with `normalize_ascii=False`. 32 | 33 | Installation 34 | ------------ 35 | 36 | ``` 37 | pip3 install ciseau 38 | ``` 39 | 40 | Testing 41 | ------- 42 | 43 | Run `nose2` from the repository root (install it first with `pip install nose2`). 44 | 45 | 46 | If you find this project useful for your work or research, here's how you can cite it: 47 | 48 | ```latex 49 | @misc{RaimanCiseau2017, 50 | author = {Raiman, Jonathan}, 51 | title = {Ciseau}, 52 | year = {2017}, 53 | publisher = {GitHub}, 54 | journal = {GitHub repository}, 55 | howpublished = {\url{https://github.com/jonathanraiman/ciseau}}, 56 | commit = {fe88b9d7f131b88bcdd2ff361df60b6d1cc64c04} 57 | } 58 | ``` 59 | 60 | -------------------------------------------------------------------------------- /ciseau/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for XML cleaning and text tokenization.
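Exposes `tokenize` and `sent_tokenize` for word- and sentence-level segmentation, along with `to_raw_text`, `to_raw_text_markupless` and `to_raw_text_pairings` for stripping Wikipedia/XML markup before tokenizing.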
3 | 4 | Usage 5 | ----- 6 | 7 | > ciseau.tokenize("Joey was a great sailor.") 8 | #=> [["Joey", "was", "a", "great", "sailor", "."]] 9 | 10 | """ 11 | 12 | from .wiki_markup_processing import ( 13 | to_raw_text, 14 | to_raw_text_markupless, 15 | to_raw_text_pairings 16 | ) 17 | from .word_tokenizer import tokenize 18 | from .sentence_tokenizer import sent_tokenize 19 | 20 | __all__ = [ 21 | "to_raw_text", 22 | "to_raw_text_markupless", 23 | "to_raw_text_pairings", 24 | "sent_tokenize", 25 | "tokenize" 26 | ] 27 | -------------------------------------------------------------------------------- /ciseau/constants.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | 4 | if sys.version_info >= (3,3): 5 | dashes = ["–", "--+"] 6 | for i in range(8208, 8214): 7 | dashes.append(chr(i)) 8 | else: 9 | dashes = [u"–", u"--+"] 10 | for i in range(8208, 8214): 11 | dashes.append(unichr(i)) 12 | 13 | 14 | UNDECIDED = 0 15 | SHOULD_SPLIT = 1 16 | SHOULD_NOT_SPLIT = 2 17 | 18 | people = [ 19 | "jr", "mr", "ms", "mrs", "dr", "prof", "esq", "sr", 20 | "sen", "sens", "rep", "reps", "gov", "attys", "attys", 21 | "supt", "det", "mssrs", "rev", "fr", "ss", "msgr" 22 | ] 23 | army = ["col", "gen", "lt", "cmdr", "adm", "capt", "sgt", "cpl", "maj", "brig", "pt"] 24 | inst = ["dept","univ", "assn", "bros", "ph.d"] 25 | place = [ 26 | "arc", "al", "ave", "blvd", "bld", "cl", "ct", 27 | "cres", "exp", "expy", "dist", "mt", "mtn", "ft", 28 | "fy", "fwy", "hwy", "hway", "la", "pde", "pd","plz", "pl", "rd", "st", 29 | "tce" 30 | ] 31 | comp = ["mfg", "inc", "ltd", "co", "corp"] 32 | state = [ 33 | "ala","ariz","ark","cal","calif","colo","col","conn", 34 | "del","fed","fla","ga","ida","id","ill","ind","ia","kans", 35 | "kan","ken","ky","la","me","md","is","mass","mich","minn", 36 | "miss","mo","mont","neb","nebr","nev","mex","okla","ok", 37 | "ore","penna","penn","pa","dak","tenn","tex","ut","vt", 38 | "va","wash","wis","wisc","wy","wyo","usafa","alta", 39 | "man","ont","que","sask","yuk" 40 | ] 41 | month = [ 42 | "jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", 43 | "sept", "oct", "nov", "dec" 44 | ] 45 | misc = ["vs", "etc", "no","esp", "ed", "iv", "Oper", "op", "i.e", "e.g", "v"] 46 | website = ["www"] 47 | currency = ["rs"] 48 | ABBR = {} 49 | # create a hash of these abbreviations: 50 | for abbreviation_type in [people, army, inst, place, comp, state, month, misc, website, currency]: 51 | for abbreviation in abbreviation_type: 52 | ABBR[abbreviation] = True 53 | 54 | MONTHS = { 55 | "january", "february", "march", "april", "may", 56 | "june", "july", "august", "september", "october", 57 | "november", "december" 58 | } 59 | PUNCT_SYMBOLS = {'.', "...", "?", "!", "..", "!!", "??", "!?", "?!", u"…"} 60 | CONTINUE_PUNCT_SYMBOLS = {';', ',', '-', ':'} | set(dashes) 61 | OPENING_SYMBOLS = {'(', '[', '"', '{', '“'} 62 | CLOSING_SYMBOLS = {')', ']', '"', '}', '”'} 63 | CLOSE_2_OPEN = {')':'(', ']': '[', '"':'"', '}':'{', '”':'“'} 64 | -------------------------------------------------------------------------------- /ciseau/quoted_expressions.py: -------------------------------------------------------------------------------- 1 | from .constants import ( 2 | OPENING_SYMBOLS, 3 | CLOSING_SYMBOLS, 4 | CLOSE_2_OPEN, 5 | PUNCT_SYMBOLS, 6 | CONTINUE_PUNCT_SYMBOLS 7 | ) 8 | 9 | def group_quoted_tokens(tokens): 10 | sentences = [] 11 | opening_symbols = OPENING_SYMBOLS.copy() 12 | closing_symbols = CLOSING_SYMBOLS.copy() 13 | 14 | inside = 
[] 15 | observed_opens = 0 16 | open_closed_sections = [] 17 | 18 | for idx, word in enumerate(tokens): 19 | token_stripped = word[0] 20 | if token_stripped in opening_symbols and token_stripped == '"': 21 | # probably a closing quote since there are spaces 22 | # after it. Let's confirm by checking if there were 23 | # any spaces on the previous word: 24 | quote_has_spaces = len(word) > len(token_stripped) 25 | previous_word_has_spaces = idx > 0 and tokens[idx-1].endswith(' ') 26 | is_last_word = idx + 1 == len(tokens) 27 | if idx == 0: 28 | is_open_symbol = True 29 | is_close_symbol = False 30 | elif quote_has_spaces and previous_word_has_spaces: 31 | # 1. previous word has spaces before this symbol 32 | # so spaces are not meaningful. 33 | 34 | # 2. We find that we are already within a quoted section: 35 | if len(inside) > 0 and inside[-1][0] == '"': 36 | is_open_symbol = False 37 | is_close_symbol = True 38 | else: 39 | # we are not within a quoted section, we resort to counting 40 | # to see what is the best opening-closing strategy 41 | num_expected_future_quotes = sum(symbol == '"' for symbol, _ in inside) + 1 42 | num_future_quotes = sum(token[0] == '"' for token in tokens[idx+1:]) 43 | # find the right amount of quotes: 44 | if num_expected_future_quotes == num_future_quotes: 45 | is_open_symbol = True 46 | is_close_symbol = False 47 | else: 48 | is_open_symbol = False 49 | is_close_symbol = True 50 | elif quote_has_spaces and not previous_word_has_spaces: 51 | # 'joe" ' -> closing some quotes 52 | is_close_symbol = True 53 | is_open_symbol = False 54 | elif is_last_word: 55 | # last word may not have spaces 56 | is_open_symbol = False 57 | is_close_symbol = True 58 | else: 59 | if (not tokens[idx-1].endswith(' ') or 60 | tokens[idx+1][0] in PUNCT_SYMBOLS or 61 | tokens[idx+1][0] in CONTINUE_PUNCT_SYMBOLS): 62 | if len(inside) > 0 and inside[-1][0] == '"': 63 | # quote is followed by semicolon, comma, etc... 
64 | # or preceded by a word without a space 'joe"something"' 65 | is_open_symbol = False 66 | is_close_symbol = True 67 | else: 68 | is_open_symbol = True 69 | is_close_symbol = False 70 | else: 71 | # no spaces after this quote, can thus assume that it is opening 72 | is_open_symbol = True 73 | is_close_symbol = False 74 | else: 75 | is_open_symbol = token_stripped in opening_symbols 76 | is_close_symbol = token_stripped in closing_symbols 77 | 78 | if is_open_symbol: 79 | inside.append((token_stripped, idx)) 80 | observed_opens += 1 81 | elif is_close_symbol: 82 | if len(inside) > 0: 83 | if inside[-1][0] == CLOSE_2_OPEN[token_stripped]: 84 | open_closed_sections.append((inside[-1][1], idx + 1)) 85 | inside.pop() 86 | else: 87 | if token_stripped in closing_symbols: 88 | # this closing symbol seems to be ignored 89 | closing_symbols.remove(token_stripped) 90 | opening_symbols.remove(CLOSE_2_OPEN[token_stripped]) 91 | # from now on ignore this symbol as start or end: 92 | inside = [(symbol, start) 93 | for symbol, start in inside 94 | if symbol != CLOSE_2_OPEN[token_stripped]] 95 | else: 96 | if observed_opens > 0: 97 | if token_stripped in closing_symbols: 98 | # this closing symbol seems to be ignored 99 | closing_symbols.remove(token_stripped) 100 | opening_symbols.remove(CLOSE_2_OPEN[token_stripped]) 101 | 102 | earliest_start = len(tokens) 103 | out_tokens = [] 104 | for start, end in open_closed_sections[::-1]: 105 | if start > earliest_start: 106 | continue 107 | else: 108 | if end != earliest_start: 109 | out_tokens = tokens[end:earliest_start] + out_tokens 110 | out_tokens = [tokens[start:end]] + out_tokens 111 | earliest_start = start 112 | if earliest_start > 0: 113 | out_tokens = tokens[0:earliest_start] + out_tokens 114 | return out_tokens 115 | 116 | -------------------------------------------------------------------------------- /ciseau/regular_expressions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import sys 4 | 5 | from .constants import dashes 6 | 7 | dashes_no_repeats = dashes[:] 8 | dashes_no_repeats.remove("--+") 9 | 10 | matching_dashes = dashes_no_repeats + ["-+"] 11 | 12 | word_with_alpha_and_period = re.compile("^([^\.]+)(\.\s*)$") 13 | one_letter_long_or_repeating = re.compile("^(?:(?:[a-z])|(?:[a-z](?:\.[a-z])+))$", re.IGNORECASE) 14 | no_punctuation = re.compile("^\w+$") 15 | left_quote_shifter = re.compile(u"((`‘(?!`))|(‘(?!‘))\s*)(?=.*\w)", re.UNICODE) 16 | left_quote_converter = re.compile(u'([«"“]\s*)(?=.*\w)', re.UNICODE) 17 | left_single_quote_converter = re.compile(u"(?:(\W|^))('\s*)(?=.*\w)", re.UNICODE) 18 | right_single_quote_converter = re.compile(u"(['’]+)(?=\W|$)\s*", re.UNICODE) 19 | 20 | if sys.version_info >= (3,3): 21 | repeated_dash_converter = re.compile("--+") 22 | dash_converter = re.compile("|".join(dashes_no_repeats)) 23 | else: 24 | repeated_dash_converter = re.compile(u"--+") 25 | dash_converter = re.compile(u"|".join(dashes_no_repeats)) 26 | 27 | simple_dash_finder = re.compile("(-\s*)") 28 | advanced_dash_finder = re.compile("(" + "|".join(matching_dashes) + ")\s*") 29 | multi_single_quote_finder = re.compile("('{2,})\s*") 30 | url_file_finder = re.compile("(?:[-a-zA-Z0-9@%._\+~#=]{2,256}://)?" 
31 | "(?:www\.)?[-a-zA-Z0-9@:%\._\+~#=]{2," 32 | "256}\.[a-z]{2,6}[-a-zA-Z0-9@:%_\+.~#" 33 | "?&//=]*\s*") 34 | numerical_expression = re.compile(u"(\d+(?:,\d+)*(?:\.\d+)*(?![a-zA-ZÀ-ż])\s*)") 35 | remaining_quote_converter = re.compile(u'(.)(?=["“”»])') 36 | shifted_ellipses = re.compile("([\.\!\?¿¡]{2,})\s*") 37 | shifted_standard_punctuation = re.compile(u"([\(\[\{\}\]\)\!¡\?¿#\$%;~&+=<>|/:,—…])\s*") 38 | period_mover = re.compile(u"([a-zA-ZÀ-ż]{2})([\./])\s+([a-zA-ZÀ-ż]{2})") 39 | pure_whitespace = re.compile("\s+") 40 | english_specific_appendages = re.compile(u"(\w)(?=['’]([dms])\\b)", re.UNICODE) 41 | english_nots = re.compile(u"(.)(?=n['’]t\\b)", re.UNICODE) 42 | english_contractions = re.compile(u"(.)(?=['’](ve|ll|re)\\b)") 43 | french_appendages = re.compile(u"(\\b[tjnlsmdclTJNLSMLDC]|qu)['’](?=[^tdms])") 44 | word_with_period = re.compile("[^\s\.]+\.{0,1}") 45 | -------------------------------------------------------------------------------- /ciseau/sentence_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .regular_expressions import word_with_alpha_and_period 3 | from .quoted_expressions import group_quoted_tokens 4 | from .constants import ( 5 | PUNCT_SYMBOLS, 6 | CONTINUE_PUNCT_SYMBOLS 7 | ) 8 | from .word_tokenizer import tokenize 9 | 10 | def is_end_symbol(symbol): 11 | return ( 12 | symbol[:2] in PUNCT_SYMBOLS 13 | ) 14 | 15 | def detect_sentence_boundaries(tokens): 16 | """ 17 | Subdivide an input list of strings (tokens) 18 | into multiple lists according to detected 19 | sentence boundaries. 20 | 21 | ``` 22 | detect_sentence_boundaries( 23 | ["Cat ", "sat ", "mat", ". ", "Cat ", "'s ", "named ", "Cool", "."] 24 | ) 25 | #=> [ 26 | ["Cat ", "sat ", "mat", ". "], 27 | ["Cat ", "'s ", "named ", "Cool", "."] 28 | ] 29 | ``` 30 | 31 | Arguments: 32 | ---------- 33 | 34 | tokens : list 35 | 36 | Returns: 37 | -------- 38 | list> : original list subdivided into multiple 39 | lists according to (detected) sentence boundaries. 40 | """ 41 | tokenized = group_quoted_tokens(tokens) 42 | words = [] 43 | sentences = [] 44 | for i in range(len(tokenized)): 45 | # this is a parenthetical: 46 | end_sentence = False 47 | if isinstance(tokenized[i], list): 48 | if len(words) == 0: 49 | # end if a sentence finishes inside quoted section, 50 | # and no sentence was begun beforehand 51 | if is_end_symbol(tokenized[i][-2].rstrip()): 52 | end_sentence = True 53 | else: 54 | # end if a sentence finishes inside quote marks 55 | if (tokenized[i][0][0] == '"' and 56 | is_end_symbol(tokenized[i][-2].rstrip()) and 57 | not tokenized[i][1][0].isupper()): 58 | end_sentence = True 59 | words.extend(tokenized[i]) 60 | else: 61 | stripped_tokenized = tokenized[i].rstrip() 62 | if is_end_symbol(stripped_tokenized): 63 | words.append(tokenized[i]) 64 | not_last_word = i + 1 != len(tokenized) 65 | next_word_lowercase = ( 66 | not_last_word and 67 | tokenized[i+1][0].islower() 68 | ) 69 | next_word_continue_punct = ( 70 | not_last_word and 71 | tokenized[i+1][0] in CONTINUE_PUNCT_SYMBOLS 72 | ) 73 | end_sentence = not ( 74 | not_last_word and 75 | ( 76 | next_word_lowercase or 77 | next_word_continue_punct 78 | ) 79 | ) 80 | else: 81 | words.append(tokenized[i]) 82 | if end_sentence: 83 | sentences.append(words) 84 | words = [] 85 | 86 | # add final sentence, if it wasn't added yet. 
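# (e.g. when the input ends without terminal punctuation, the remaining tokens still form a sentence)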
87 | if len(words) > 0: 88 | sentences.append(words) 89 | 90 | # If the final word ends in a period: 91 | if len(sentences) > 0 and sentences[-1][-1]: 92 | alpha_word_piece = word_with_alpha_and_period.match(sentences[-1][-1]) 93 | if alpha_word_piece: 94 | sentences[-1][-1] = alpha_word_piece.group(1) 95 | sentences[-1].append(alpha_word_piece.group(2)) 96 | return sentences 97 | 98 | 99 | def remove_whitespace(sentences): 100 | """ 101 | Clear out spaces and newlines 102 | from the list of list of strings. 103 | 104 | Arguments: 105 | ---------- 106 | sentences : list> 107 | 108 | Returns: 109 | -------- 110 | list> : same strings as input, 111 | without spaces or newlines. 112 | """ 113 | return [[w.rstrip() for w in sent] for sent in sentences] 114 | 115 | 116 | def sent_tokenize(text, keep_whitespace=False, normalize_ascii=True): 117 | """ 118 | Perform sentence + word tokenization on the input text 119 | using regular expressions and english/french specific 120 | rules. 121 | 122 | Arguments: 123 | ---------- 124 | text : str, input string to tokenize 125 | keep_whitespace : bool, whether to strip out spaces 126 | and newlines. 127 | normalize_ascii : bool, perform some replacements 128 | on rare characters so that they become 129 | easier to process in a ascii pipeline 130 | (canonicalize dashes, replace œ -> oe, etc..) 131 | Returns: 132 | -------- 133 | list> : sentences with their content held 134 | in a list of strings for each token. 135 | """ 136 | sentences = detect_sentence_boundaries( 137 | tokenize( 138 | text, 139 | normalize_ascii 140 | ) 141 | ) 142 | if not keep_whitespace: 143 | sentences = remove_whitespace(sentences) 144 | return sentences 145 | 146 | -------------------------------------------------------------------------------- /ciseau/wiki_markup_processing.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .sentence_tokenizer import sent_tokenize 3 | 4 | bracket_parser = re.compile("\[\[(?P[^\]\|]+)(?:\|[\W]*(?P[^\]\#\|]+)(?:\#[^\]\|]+)?)*\]\]") 5 | squiggly_bracket_parser = re.compile("{{([^}]+)}}") 6 | table_parser = re.compile("{\|[^}]+\|}") 7 | mvar_parser = re.compile("{{\d*mvar\d*\|([^}]+)}}") 8 | remove_emphasis = re.compile("'{2,5}([^']+)'{2,5}") 9 | 10 | # handles links that don't have a pipe sign" 11 | double_bracket_parser = re.compile("\[\[|\]\]") 12 | # normalizes: 01/02/2003, 2005-06-07, and 2001 type dates to 7777 13 | date_remover = re.compile("((\d{4}(?:[-/]\d{2}[-/]\d{2})?)|(\d{2}(?:[-/]\d{2}[-/]\d{4})))(?=[^\d]|$)") 14 | remove_emphasis_asterix = re.compile("\*{2,5}([^\*]+)\*{2,5}") 15 | remove_emphasis_slash = re.compile("/{2,5}([^/]+)/{2,5}") 16 | remove_emphasis_low_ticks = re.compile(",{2,5}([^,]+),{2,5}") 17 | remove_emphasis_heading = re.compile("={2,5}([^=]+)={2,5}") 18 | remove_emphasis_strikethrough = re.compile("~{2}([^~]+)~{2}") 19 | remove_emphasis_underline = re.compile("_{2}([^_]+)_{2}") 20 | remove_bullets_nbsps = re.compile("(&nbsp;| |[\^\n]\*{1,}|[\^\n]\#{1,}|[\^\n]:{1,})") # remove lists, bullet points, and html no breakspace 21 | remove_wikipedia_link = re.compile("\[\W*http[^\] ]+\b*(?P[^\]]+)\]") 22 | markup_normalizer = re.compile("[',/\*_=-]{2,5}") 23 | markup_removes = [ 24 | remove_emphasis, 25 | remove_emphasis_heading, 26 | remove_emphasis_asterix, 27 | remove_emphasis_slash, 28 | remove_emphasis_low_ticks, 29 | remove_emphasis_strikethrough, 30 | remove_emphasis_underline 31 | ] 32 | replacer = lambda matches: matches.group('trigger') if 
matches.group('trigger') != None else matches.group('name') 33 | anchor_replacer = lambda matches: matches.group('anchor') if matches.group('anchor') else '' 34 | html_remover = re.compile("<[^>]+>") 35 | internal_html_remover = re.compile("{{[^(}})]+}}") 36 | math_source_sections = re.compile("<(math|source|code|sub|sup)[^>]*>([^<]" 37 | "*)") 38 | greater_than = re.compile("(\W)>(\W)") 39 | less_than = re.compile("<([^\w/])") 40 | single_internal_link = re.compile("\[\[([^\]\|]+)\]\]") 41 | category_internal_link = re.compile("\[\[Category:([^\]\|]+)\]\]") 42 | 43 | # handles links that always have a pipe sign e.g. "[[the girl|Angelina Jolie]]" 44 | anchortag_internal_link = re.compile("\[\[(?P[^\]\|]+)\|[\W]*(" 45 | "?P[^\]\#\|]+)(?:\#[^\]\|]+)?\]\]") 46 | url_remover = re.compile("http://[a-zA-Z\.&/]+") 47 | empty_space = " " 48 | empty_string = "" 49 | 50 | 51 | def remove_dates(text): 52 | return date_remover.sub("7777", text) 53 | 54 | 55 | def remove_html(text): 56 | return html_remover.sub(empty_space, text) 57 | 58 | 59 | def remove_markup(text): 60 | return markup_normalizer.sub(empty_string, text) 61 | 62 | 63 | def reintroduce_less_than(text): 64 | #return text 65 | return less_than.sub("<\g<1>", text) 66 | 67 | 68 | def reintroduce_greater_than(text): 69 | #return text 70 | return greater_than.sub("\g<1>>\g<2>", text) 71 | 72 | 73 | def reintroduce_less_than_greater_than(text): 74 | return reintroduce_less_than(reintroduce_greater_than(text)) 75 | 76 | 77 | def remove_math_sections(text): 78 | return math_source_sections.sub(empty_space, reintroduce_less_than_greater_than(text)) 79 | 80 | 81 | def _remove_brackets(text): 82 | return anchortag_internal_link.sub( 83 | "\g", 84 | single_internal_link.sub( 85 | "\g<1>", 86 | category_internal_link.sub( 87 | "\n\g<1> .\n", 88 | text 89 | ) 90 | ) 91 | ) 92 | 93 | 94 | def _remove_table(text): 95 | return table_parser.sub(empty_space, text) 96 | 97 | 98 | def _remove_squiggly_bracket(text): 99 | return squiggly_bracket_parser.sub(empty_space, text) 100 | 101 | 102 | def _remove_mvar(text): 103 | return mvar_parser.sub("\g<1>", text) 104 | 105 | 106 | def remove_remaining_double_brackets(text): 107 | return double_bracket_parser.sub(empty_space, text) 108 | 109 | 110 | def _remove_urls(text): 111 | return url_remover.sub("url", text) 112 | 113 | 114 | def remove_brackets(text): 115 | return remove_remaining_double_brackets(_remove_brackets(text)) 116 | 117 | 118 | def to_raw_text_markupless(text, keep_whitespace=False, normalize_ascii=True): 119 | """ 120 | A generator to convert raw text segments, without xml to a 121 | list of words without any markup. 122 | Additionally dates are replaced by `7777` for normalization. 123 | 124 | Arguments 125 | --------- 126 | text: str, input text to tokenize, strip of markup. 127 | keep_whitespace : bool, should the output retain the 128 | whitespace of the input (so that char offsets in the 129 | output correspond to those in the input). 130 | 131 | Returns 132 | ------- 133 | generator>>, a generator for sentences, with 134 | within each sentence a list of the words separated. 135 | """ 136 | return sent_tokenize( 137 | remove_dates(_remove_urls(text)), 138 | keep_whitespace, 139 | normalize_ascii 140 | ) 141 | 142 | 143 | def to_raw_text(text, keep_whitespace=False, normalize_ascii=True): 144 | """ 145 | A generator to convert raw text segments, with xml, and other 146 | non-textual content to a list of words without any markup. 
147 | Additionally dates are replaced by `7777` for normalization. 148 | 149 | Arguments 150 | --------- 151 | text: str, input text to tokenize, strip of markup. 152 | keep_whitespace : bool, should the output retain the 153 | whitespace of the input (so that char offsets in the 154 | output correspond to those in the input). 155 | 156 | Returns 157 | ------- 158 | generator>>, a generator for sentences, with 159 | within each sentence a list of the words separated. 160 | """ 161 | out = text 162 | out = _remove_urls(text) 163 | out = _remove_mvar(out) 164 | out = _remove_squiggly_bracket(out) 165 | out = _remove_table(out) 166 | out = _remove_brackets(out) 167 | out = remove_remaining_double_brackets(out) 168 | out = remove_markup(out) 169 | out = remove_wikipedia_link.sub(anchor_replacer, out) 170 | out = remove_bullets_nbsps.sub(empty_space, out) 171 | out = remove_dates(out) 172 | out = remove_math_sections(out) 173 | out = remove_html(out) 174 | out = sent_tokenize(out, keep_whitespace, normalize_ascii) 175 | return out 176 | 177 | 178 | def to_raw_text_pairings(text, keep_whitespace=False, normalize_ascii=True): 179 | """ 180 | A generator to convert raw text segments, with xml, and other 181 | non-textual content to a list of words without any markup. 182 | Additionally dates are replaced by `7777` for normalization, 183 | along with wikipedia anchors kept. 184 | 185 | Arguments 186 | --------- 187 | text: str, input text to tokenize, strip of markup. 188 | keep_whitespace : bool, should the output retain the 189 | whitespace of the input (so that char offsets in the 190 | output correspond to those in the input). 191 | 192 | Returns 193 | ------- 194 | generator>>, a generator for sentences, with 195 | within each sentence a list of the words separated. 196 | """ 197 | out = text 198 | out = _remove_mvar(out) 199 | out = _remove_squiggly_bracket(out) 200 | out = _remove_table(out) 201 | out = remove_markup(out) 202 | out = remove_wikipedia_link.sub(anchor_replacer, out) 203 | out = remove_bullets_nbsps.sub(empty_space, out) 204 | out = remove_math_sections(out) 205 | out = remove_html(out) 206 | for sentence in sent_tokenize(out, keep_whitespace, normalize_ascii): 207 | yield sentence 208 | -------------------------------------------------------------------------------- /ciseau/word_tokenizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from .constants import ( 4 | PUNCT_SYMBOLS, 5 | ABBR, 6 | MONTHS, 7 | UNDECIDED, 8 | SHOULD_SPLIT, 9 | SHOULD_NOT_SPLIT 10 | ) 11 | from .regular_expressions import ( 12 | word_with_period, 13 | no_punctuation, 14 | numerical_expression, 15 | repeated_dash_converter, 16 | dash_converter, 17 | pure_whitespace, 18 | left_quote_shifter, 19 | left_quote_converter, 20 | one_letter_long_or_repeating, 21 | left_single_quote_converter, 22 | remaining_quote_converter, 23 | english_nots, 24 | english_contractions, 25 | english_specific_appendages, 26 | french_appendages, 27 | right_single_quote_converter, 28 | simple_dash_finder, 29 | advanced_dash_finder, 30 | url_file_finder, 31 | shifted_ellipses, 32 | shifted_standard_punctuation, 33 | multi_single_quote_finder 34 | ) 35 | 36 | 37 | def protect_shorthand(text, split_locations): 38 | """ 39 | Annotate locations in a string that contain 40 | periods as being true periods or periods 41 | that are a part of shorthand (and thus should 42 | not be treated as punctuation marks). 
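For example, in "Mr. Smith arrived." the period in "Mr." belongs to the abbreviation and is marked SHOULD_NOT_SPLIT, while the final period genuinely ends the sentence and is marked SHOULD_SPLIT.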
43 | 44 | Arguments: 45 | ---------- 46 | text : str 47 | split_locations : list, same length as text. 48 | """ 49 | word_matches = list(re.finditer(word_with_period, text)) 50 | total_words = len(word_matches) 51 | 52 | for i, match in enumerate(word_matches): 53 | match_start = match.start() 54 | match_end = match.end() 55 | for char_pos in range(match_start, match_end): 56 | if split_locations[char_pos] == SHOULD_SPLIT and match_end - char_pos > 1: 57 | match_start = char_pos 58 | word = text[match_start:match_end] 59 | 60 | if not word.endswith('.'): 61 | # ensure that words contained within other words: 62 | # e.g. 'chocolate.Mountains of' -> 'chocolate. Mountains of' 63 | if (not word[0].isdigit() and 64 | split_locations[match_start] == UNDECIDED): 65 | split_locations[match_start] = SHOULD_SPLIT 66 | continue 67 | period_pos = match_end - 1 68 | # this is not the last word, abbreviation 69 | # is not the final period of the sentence, 70 | # moreover: 71 | word_is_in_abbr = word[:-1].lower() in ABBR 72 | is_abbr_like = ( 73 | word_is_in_abbr or 74 | one_letter_long_or_repeating.match(word[:-1]) is not None 75 | ) 76 | is_digit = False if is_abbr_like else word[:-1].isdigit() 77 | 78 | is_last_word = i == (total_words - 1) 79 | is_ending = is_last_word and (match_end == len(text) or text[match_end:].isspace()) 80 | is_not_ending = not is_ending 81 | abbreviation_and_not_end = ( 82 | len(word) > 1 and 83 | is_abbr_like and 84 | is_not_ending 85 | ) 86 | 87 | if abbreviation_and_not_end and ( 88 | (not is_last_word and word_matches[i+1].group(0)[0].islower()) or 89 | (not is_last_word and word_matches[i+1].group(0) in PUNCT_SYMBOLS) or 90 | word[0].isupper() or 91 | word_is_in_abbr or 92 | len(word) == 2): 93 | # next word is lowercase (e.g. not a new sentence?), or next word 94 | # is punctuation or next word is totally uppercase (e.g. 'Mister. 95 | # ABAGNALE called to the stand') 96 | if split_locations[period_pos] == SHOULD_SPLIT and period_pos + 1 < len(split_locations): 97 | split_locations[period_pos + 1] = SHOULD_SPLIT 98 | split_locations[period_pos] = SHOULD_NOT_SPLIT 99 | elif (is_digit and 100 | len(word[:-1]) <= 2 and 101 | not is_last_word and 102 | word_matches[i+1].group(0).lower() in MONTHS): 103 | # a date or weird number with a period: 104 | if split_locations[period_pos] == SHOULD_SPLIT and period_pos + 1 < len(split_locations): 105 | split_locations[period_pos + 1] = SHOULD_SPLIT 106 | split_locations[period_pos] = SHOULD_NOT_SPLIT 107 | elif split_locations[period_pos] == UNDECIDED: 108 | # split this period into its own segment: 109 | split_locations[period_pos] = SHOULD_SPLIT 110 | 111 | 112 | def split_with_locations(text, locations): 113 | """ 114 | Use an integer list to split the string 115 | contained in `text`. 116 | 117 | Arguments: 118 | ---------- 119 | text : str, same length as locations. 120 | locations : list, contains values 121 | 'SHOULD_SPLIT', 'UNDECIDED', and 122 | 'SHOULD_NOT_SPLIT'. Will create 123 | strings between each 'SHOULD_SPLIT' 124 | locations. 125 | Returns: 126 | -------- 127 | Generator : the substrings of text 128 | corresponding to the slices given 129 | in locations. 
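For instance, using the split markers defined in `ciseau.constants` (a minimal illustration of the generator output):

```
list(split_with_locations("a cat", [UNDECIDED, UNDECIDED, SHOULD_SPLIT, UNDECIDED, UNDECIDED]))
#=> ["a ", "cat"]
```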
130 | """ 131 | start = 0 132 | for pos, decision in enumerate(locations): 133 | if decision == SHOULD_SPLIT: 134 | if start != pos: 135 | yield text[start:pos] 136 | start = pos 137 | if start != len(text): 138 | yield text[start:] 139 | 140 | 141 | def mark_regex(regex, text, split_locations): 142 | """ 143 | Regex that adds a 'SHOULD_SPLIT' marker at the end 144 | location of each matching group of the given regex. 145 | 146 | Arguments 147 | --------- 148 | regex : re.Expression 149 | text : str, same length as split_locations 150 | split_locations : list, split decisions. 151 | """ 152 | for match in regex.finditer(text): 153 | end_match = match.end() 154 | if end_match < len(split_locations): 155 | split_locations[end_match] = SHOULD_SPLIT 156 | 157 | 158 | def mark_begin_end_regex(regex, text, split_locations): 159 | """ 160 | Regex that adds a 'SHOULD_SPLIT' marker at the end 161 | location of each matching group of the given regex, 162 | and adds a 'SHOULD_SPLIT' at the beginning of the 163 | matching group. Each character within the matching 164 | group will be marked as 'SHOULD_NOT_SPLIT'. 165 | 166 | Arguments 167 | --------- 168 | regex : re.Expression 169 | text : str, same length as split_locations 170 | split_locations : list, split decisions. 171 | """ 172 | for match in regex.finditer(text): 173 | end_match = match.end() 174 | begin_match = match.start() 175 | 176 | for i in range(begin_match+1, end_match): 177 | split_locations[i] = SHOULD_NOT_SPLIT 178 | if end_match < len(split_locations): 179 | if split_locations[end_match] == UNDECIDED: 180 | split_locations[end_match] = SHOULD_SPLIT 181 | if split_locations[begin_match] == UNDECIDED: 182 | split_locations[begin_match] = SHOULD_SPLIT 183 | 184 | 185 | def tokenize(text, normalize_ascii=True): 186 | """ 187 | Convert a single string into a list of substrings 188 | split along punctuation and word boundaries. Keep 189 | whitespace intact by always attaching it to the 190 | previous token. 191 | 192 | Arguments: 193 | ---------- 194 | text : str 195 | normalize_ascii : bool, perform some replacements 196 | on non-ascii characters to canonicalize the 197 | string (defaults to True). 198 | 199 | Returns: 200 | -------- 201 | list, list of substring tokens. 202 | """ 203 | # 1. If there's no punctuation, return immediately 204 | if no_punctuation.match(text): 205 | return [text] 206 | # 2. let's standardize the input text to ascii (if desired) 207 | # Note: this will no longer respect input-to-output character positions 208 | if normalize_ascii: 209 | # normalize these greco-roman characters to ascii: 210 | text = text.replace(u"œ", "oe").replace(u"æ", "ae") 211 | # normalize dashes: 212 | text = repeated_dash_converter.sub("-", text) 213 | # 3. let's construct an integer array of the possible split locations: 214 | split_locations = [UNDECIDED] * len(text) 215 | 216 | regexes = ( 217 | pure_whitespace, 218 | left_quote_shifter, 219 | left_quote_converter, 220 | left_single_quote_converter, 221 | remaining_quote_converter, 222 | # regex can't fix this -> regex ca n't fix this 223 | english_nots, 224 | # you'll dig this -> you 'll dig this 225 | english_contractions, 226 | # the rhino's horns -> the rhino 's horns 227 | english_specific_appendages, 228 | # qu'a tu fais au rhino -> qu ' a tu fais au rhino, 229 | french_appendages 230 | ) 231 | # 4. 
Mark end locations for specific regular expressions: 232 | for regex in regexes: 233 | mark_regex(regex, text, split_locations) 234 | 235 | begin_end_regexes = ( 236 | multi_single_quote_finder, 237 | right_single_quote_converter, 238 | # use dashes as the breakpoint: 239 | # the rhino--truck -> the rhino -- truck 240 | simple_dash_finder if normalize_ascii else advanced_dash_finder, 241 | numerical_expression, 242 | url_file_finder, 243 | shifted_ellipses, 244 | # the #rhino! -> the # rhino ! ; 245 | # the rino[sic] -> the rino [ sic ] 246 | shifted_standard_punctuation 247 | ) 248 | 249 | # 5. Mark begin and end locations for other regular expressions: 250 | for regex in begin_end_regexes: 251 | mark_begin_end_regex(regex, text, split_locations) 252 | 253 | # 6. Remove splitting on exceptional uses of periods: 254 | # I'm with Mr. -> I 'm with Mr. , I'm with Mister. -> I 'm with Mister . 255 | protect_shorthand(text, split_locations) 256 | 257 | if normalize_ascii: 258 | text = dash_converter.sub("-", text) 259 | # 7. Return the split string using the integer list: 260 | return list(split_with_locations(text, split_locations)) 261 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | 4 | def readfile(fname): 5 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 6 | 7 | setup( 8 | name='ciseau', 9 | version='1.0.1', 10 | description='Word and sentence tokenization.', 11 | long_description=readfile('README.md'), 12 | ext_modules=[], 13 | packages=find_packages(), 14 | py_modules = [], 15 | author='Jonathan Raiman', 16 | author_email='jonathanraiman@gmail.com', 17 | url='https://github.com/JonathanRaiman/ciseau', 18 | download_url='https://github.com/JonathanRaiman/ciseau', 19 | keywords='XML, tokenization, NLP', 20 | license='MIT', 21 | platforms='any', 22 | zip_safe=False, 23 | classifiers=[ 24 | 'Intended Audience :: Science/Research', 25 | 'Operating System :: OS Independent', 26 | 'Programming Language :: Python :: 3.3', 27 | 'Programming Language :: Python :: 2.7', 28 | 'Topic :: Text Processing :: Linguistic', 29 | ], 30 | setup_requires = [], 31 | install_requires=[ 32 | ], 33 | include_package_data=True, 34 | ) 35 | -------------------------------------------------------------------------------- /tests/test_tokenization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import sys 4 | from ciseau import tokenize, sent_tokenize 5 | 6 | class TokenizationTests(unittest.TestCase): 7 | def test_quoted_expressions(self): 8 | expression = [ 9 | "(", "in ", "2008", ") ", "the ", "Martians ", 10 | "arrived ", "and ", "you", "'ll ", "see ", "what ", 11 | "I ", "mean", "." 12 | ] 13 | self.assertEqual( 14 | tokenize( 15 | "".join(expression) 16 | ), 17 | expression 18 | ) 19 | 20 | def test_pre_post_quote(self): 21 | expression = [ 22 | "On ", "January ", "28", ", ", "2011 ", ", ", "''", 23 | "Hollywood ", "Reporter", "'' ", "announced ", 24 | "that ", "Paramount ", "Pictures ", "had ", "given ", 25 | "the ", "green ", "light", "." 26 | ] 27 | self.assertEqual( 28 | tokenize( 29 | "".join(expression) 30 | ), 31 | expression 32 | ) 33 | 34 | def test_weird_hybrid_expressions(self): 35 | expression = [ 36 | u"Beyoncé", u"'s ", u"1840", u"'s ", u"song ", u"<", u"3lovely", u"." 
37 | ] 38 | self.assertEqual( 39 | tokenize( 40 | "".join(expression) 41 | ), 42 | expression 43 | ) 44 | 45 | def test_numerical_commas_periods_expressions(self): 46 | expression = [ 47 | "In ", "the ", "year ", "2000", ", ", 48 | "there ", "was ", "evidence ", "that ", "100,000 ", 49 | "martians ", "came ", "to ", "see ", "us", ", ", 50 | "but ", "I ", "did", "n't ", "even ", "hear ", 51 | "98.2", ",", "98.3 ", "or ", "98.4 ", "speak ", 52 | "about ", "it", "," 53 | ] 54 | self.assertEqual( 55 | tokenize("".join(expression)), 56 | expression 57 | ) 58 | 59 | def test_abbreviations(self): 60 | expression = [ 61 | "Mr. ", "hooligan ", "and ", "his ", "brother ", "DR. ", 62 | "strange ", "know ", "each ", "other ", "well ", "said ", 63 | "d. ", "A. ", "Joe ", "the ", "sgt. ", "in ", "charge ", 64 | "of ", "all ", "this ", "bs", "." 65 | ] 66 | self.assertEqual( 67 | tokenize("".join(expression)), 68 | expression 69 | ) 70 | 71 | def test_em_dash(self): 72 | expression = [ 73 | u"The ", u"earthquake ", u"was ", 74 | u"also ", u"felt ", u"in ", u"nearby ", u"countries ", 75 | u"and ", u"as ", u"far ", u"away ", u"as ", u"both ", 76 | u"Beijing ", u"and ", u"Shanghai ", u"—", u"1,500 ", u"km ", 77 | u"(", u"930 ", u"mi ", u") ", u"and ", u"1,700 ", u"km ", u"(", 78 | u"1,060 ", u"mi ", u") ", u"away", u"—", u"where ", u"office ", 79 | u"buildings ", u"swayed ", u"with ", u"the ", u"tremor", u"." 80 | ] 81 | self.assertEqual( 82 | tokenize("".join(expression), normalize_ascii=False), 83 | expression 84 | ) 85 | 86 | def test_quoted_expressions_with_ascii(self): 87 | expression = [ 88 | "Julius ", u"Cæsar ", "declared ", "-- ", "professed ", "- ", 89 | "his ", "passion ", "for ", "wine ", "A", "." 90 | ] 91 | self.assertEqual( 92 | tokenize( 93 | "".join(expression), 94 | normalize_ascii=False 95 | ), 96 | expression 97 | ) 98 | self.assertEqual( 99 | tokenize( 100 | "".join(expression), 101 | normalize_ascii=True 102 | ), 103 | [w.replace(u"æ", "ae").replace("--", "-") for w in expression] 104 | ) 105 | 106 | def test_sentence_detection(self): 107 | expression = [ 108 | [u'Maslow', u'’s ', u'‘‘', u'Third ', u'Force ', u'Psychology ', 109 | u'Theory', u'’’ ', u'even ', u'allows ', u'literary ', u'analysts ', 110 | u'to ', u'critically ', u'understand ', u'how ', u'characters ', 111 | u'reflect ', u'the ', u'culture ', u'and ', u'the ', u'history ', 112 | u'in ', u'which ', u'they ', u'are ', u'contextualized', u'. '], 113 | [u'It ', u'also ', u'allows ', u'analysts ', u'to ', u'understand ', 114 | u'the ', u'author', u'’s ', u'intended ', u'message ', u'and ', u'to ', 115 | u'understand ', u'the ', u'author', u'’s ', u'psychology', u'. '], 116 | [u'The ', u'theory ', u'suggests ', u'that ', u'human ', u'beings ', 117 | u'possess ', u'a ', u'nature ', u'within ', u'them ', u'that ', 118 | u'demonstrates ', u'their ', u'true ', u'“', u'self', u'” ', u'and ', 119 | u'it ', u'suggests ', u'that ', u'the ', u'fulfillment ', u'of ', 120 | u'this ', u'nature ', u'is ', u'the ', u'reason ', u'for ', u'living', 121 | u'. '], 122 | [u'It ', u'also ', u'suggests ', u'that ', u'neurological ', 123 | u'development ', u'hinders ', u'actualizing ', u'the ', u'nature ', 124 | u'because ', u'a ', u'person ', u'becomes ', u'estranged ', u'from ', 125 | u'his ', u'or ', u'her ', u'true ', u'self', u'. '], 126 | [u'Therefore', u', ', u'literary ', u'devices ', u'reflect ', u'a ', 127 | u'characters', u'’s ', u'and ', u'an ', u'author', u'’s ', u'natural ', 128 | u'self', u'. 
'], 129 | [u'In ', u'his ', u'‘‘', u'Third ', u'Force ', u'Psychology ', u'and ', 130 | u'the ', u'Study ', u'of ', u'Literature', u'’’', u', ', u'Paris ', 131 | u'argues ', u'“', u'D.', u'H ', u'Lawrence', u'’s ', u'“', u'pristine ', 132 | u'unconscious', u'” ', u'is ', u'a ', u'metaphor ', u'for ', u'the ', 133 | u'real ', u'self', u'”', u'. '], 134 | [u'Thus ', u'Literature ', u'is ', u'a ', u'reputable ', u'tool ', 135 | u'that ', u'allows ', u'readers ', u'to ', u'develop ', u'and ', 136 | u'apply ', u'critical ', u'reasoning ', u'to ', u'the ', u'nature ', 137 | u'of ', u'emotions', u'.'] 138 | ] 139 | self.assertEqual( 140 | sent_tokenize( 141 | u"".join(w for sent in expression for w in sent), 142 | keep_whitespace=True 143 | ), 144 | expression 145 | ) 146 | 147 | def test_unequal_quote_detection(self): 148 | expression = [ 149 | [u"Beyoncé", u"'s ", u'vocal ', u'range ', u'spans ', u'four ', u'octaves', 150 | u'. '], 151 | [u'Jody ', u'Rosen ', u'highlights ', u'her ', u'tone ', u'and ', 152 | u'timbre ', u'as ', u'particularly ', u'distinctive', u', ', 153 | u'describing ', u'her ', u'voice ', u'as ', u'"', u'one ', u'of ', 154 | u'the ', u'most ', u'compelling ', u'instruments ', u'in ', 155 | u'popular ', u'music', u'"', u'. ' 156 | ], 157 | [u'While ', u'another ', u'critic ', u'says ', u'she ', u'is ', 158 | u'a ', u'"', u'Vocal ', u'acrobat', u', ', u'being ', u'able ', 159 | u'to ', u'sing ', u'long ', u'and ', u'complex ', u'melismas ', 160 | u'and ', u'vocal ', u'runs ', u'effortlessly', u', ', u'and ', 161 | u'in ', u'key', u'. '], 162 | [u'Her ', u'vocal ', u'abilities ', u'mean ', u'she ', u'is ', 163 | u'identified ', u'as ', u'the ', u'centerpiece ', u'of ', u'Destiny', 164 | u"'s ", u'Child', u'. '], 165 | [u'The ', u'Daily ', u'Mail ', u'calls ', u"Beyoncé", u"'s ", u'voice ', 166 | u'"', u'versatile', u'"', u', ', u'capable ', u'of ', u'exploring ', 167 | u'power ', u'ballads', u', ', u'soul', u', ', u'rock ', u'belting', 168 | u', ', u'operatic ', u'flourishes', u', ', u'and ', u'hip ', u'hop', 169 | u'. '], 170 | [u'Jon ', u'Pareles ', u'of ', u'The ', u'New ', u'York ', u'Times ', 171 | u'commented ', u'that ', u'her ', u'voice ', u'is ', u'"', u'velvety ', 172 | u'yet ', u'tart', u', ', u'with ', u'an ', u'insistent ', u'flutter ', 173 | u'and ', u'reserves ', u'of ', u'soul ', u'belting', u'"', u'. '], 174 | [u'Rosen ', u'notes ', u'that ', u'the ', u'hip ', u'hop ', u'era ', 175 | u'highly ', u'influenced ', u"Beyoncé", u"'s ", u'strange ', u'rhythmic ', 176 | u'vocal ', u'style', u', ', u'but ', u'also ', u'finds ', u'her ', 177 | u'quite ', u'traditionalist ', u'in ', u'her ', u'use ', u'of ', 178 | u'balladry', u', ', u'gospel ', u'and ', u'falsetto', u'. 
'], 179 | [u'Other ', u'critics ', u'praise ', u'her ', u'range ', u'and ', 180 | u'power', u', ', u'with ', u'Chris ', u'Richards ', u'of ', u'The ', 181 | u'Washington ', u'Post ', u'saying ', u'she ', u'was ', u'"', 182 | u'capable ', u'of ', u'punctuating ', u'any ', u'beat ', u'with ', 183 | u'goose', u'-', u'bump', u'-', u'inducing ', u'whispers ', u'or ', 184 | u'full', u'-', u'bore ', u'diva', u'-', u'roars', u'.', u'"'] 185 | ] 186 | self.assertEqual( 187 | sent_tokenize( 188 | u"".join(w for sent in expression for w in sent), 189 | keep_whitespace=True 190 | ), 191 | expression 192 | ) 193 | 194 | def test_contained_period_in_quotes(self): 195 | expression = [[ 196 | "the ", "gray ", "bird ", "(", "which ", "was ", 197 | "famous ", "for ", "its ", "colors", ".", ") ", 198 | "was ", "ressurected ", "\" ", "she ", "said", ".", "\"" 199 | ]] 200 | self.assertEqual( 201 | sent_tokenize( 202 | "".join(w for sent in expression for w in sent), 203 | keep_whitespace=True 204 | ), 205 | expression 206 | ) 207 | 208 | def test_period_sequences(self): 209 | expression = [[ 210 | "Mr. ", "Joe ", "was ", "always ", "late ", "to ", "his ", 211 | "dates", ", ", "appointments", ", ", "etc.", "." 212 | ]] 213 | self.assertEqual( 214 | sent_tokenize( 215 | "".join(w for sent in expression for w in sent), 216 | keep_whitespace=True 217 | ), 218 | expression 219 | ) 220 | 221 | def test_spanish_tokenization(self): 222 | expressions = [ 223 | [ 224 | [ 225 | u"Pero ", u"si ", u"no ", u"es ", u"el ", u"caso", u", ", u"llega ", 226 | u"el ", u"momento ", u"de ", u"hacerse ", u"la ", u"pregunta ", u"de ", 227 | u"cada ", u"año", u". " 228 | ], 229 | [ 230 | u"¿", u"Qué ", u"hago ", u"con ", u"estos ", u"sobres ", u"de ", u"jamón ", 231 | u"o ", u"este ", u"lomo ", u"ibérico", u"? " 232 | ], 233 | [ 234 | u"¿", u"Los ", u"puedo ", u"congelar ", u"o ", u"es ", u"una ", u"aberración", 235 | u"? ", 236 | ], 237 | [ 238 | u"La ", u"respuesta ", u"rápida ", u"sería ", u"un ", u"sí", u"." 239 | ] 240 | ], 241 | [ 242 | [ 243 | u"De ", u"hecho", u", ", u"es ", u"algo ", u"que ", u"lleva ", u"mucho ", u"tiempo ", 244 | u"haciéndose", u". " 245 | ], 246 | [ 247 | u"En ", u"las ", u"matanzas ", u"de ", u"los ", u"pueblos ", u"muchas ", u"piezas ", 248 | u"se ", u"congelan ", u"una ", u"vez ", u"curadas ", u"para ", u"ir ", u"luego ", 249 | u"dándoles ", u"salida ", u"a ", u"lo ", u"largo ", u"de ", u"todo ", u"el ", u"año", 250 | u". " 251 | ], 252 | [ 253 | u"Otro ", u"ejemplo ", u"clásico", u": ", u"las ", u"embarazas ", u"que ", u"quieren ", 254 | u"evitar ", u"cualquier ", u"posible ", u"riesgo ", u"de ", u"toxoplasmosis ", u"pero ", 255 | u"no ", u"quieren ", u"renunciar ", u"a ", u"los ", u"embutidos ", u"durante ", u"eso ", 256 | u"nueve ", u"meses", u". " 257 | ], 258 | [ 259 | u"¿", u"Solución", u"? " 260 | ], 261 | [ 262 | u"Congelarlo", u"." 263 | ] 264 | ], 265 | [ 266 | [ 267 | u"Que ", u"lo ", u"sepas", u", ", u"¡", u"no ", u"pienso ", u"hacerlo ", u"todo ", u"yo ", 268 | u"sola", u"!" 269 | ] 270 | ], 271 | [ 272 | [ 273 | u"¡", u"No ", u"pienso ", u"hacerlo ", u"todo ", u"yo ", u"sola", u", ", u"que ", u"lo ", 274 | u"sepas", u"!" 275 | ] 276 | ], 277 | [ 278 | [ 279 | u"¡", u"No ", u"me ", u"digas ", u"nada", u"! " 280 | ], 281 | [ 282 | u"¡", u"Te ", u"has ", u"portado ", u"fatal", u"! " 283 | ], 284 | [ 285 | u"¡", u"No ", u"quiero ", u"volver ", u"a ", u"saber ", u"nada ", u"de ", u"ti", u"!" 286 | ] 287 | ], 288 | [ 289 | [ 290 | u"¡¡¡", u"Al ", u"ladrón", u"!!!" 
291 | ] 292 | ] 293 | ] 294 | for expression in expressions: 295 | self.assertEqual( 296 | sent_tokenize( 297 | "".join(w for sent in expression for w in sent), 298 | keep_whitespace=True 299 | ), 300 | expression 301 | ) 302 | 303 | def test_german_tokenization(self): 304 | expressions = [ 305 | [ 306 | [ 307 | u"Als ", u"Vertreter ", u"des ", u"One", u"-", u"Nation", u"-", u"Konservatismus ", 308 | u"bekleidete ", u"er ", u"nach ", u"dem ", u"Wahlsieg ", u"der ", u"Tories ", u"1951 ", 309 | u"als ", u"führendes ", u"Kabinettsmitglied ", u"mehrere ", u"wichtige ", u"Regierungsämter", 310 | u", ", u"unter ", u"anderem ", u"das ", u"des ", u"Verteidigungsministers", u", ", u"des ", 311 | u"Außenministers ", u"und ", u"des ", u"Schatzkanzlers", u". " 312 | ], 313 | [ 314 | u"Seine ", u"Amtszeit ", u"als ", 315 | u"Premierminister ", u"war ", u"innenpolitisch ", u"geprägt ", u"von ", u"zahlreichen ", u"Reformen ", 316 | u"sowie ", u"einer ", u"prosperierenden ", u"Wirtschaft ", u"mit ", u"niedriger ", u"Arbeitslosigkeit ", 317 | u"und ", u"ungleichmäßigem ", u"Wirtschaftswachstum", u". ", 318 | ], 319 | [ 320 | u"Außenpolitisch ", u"behob ", u"er ", u"die ", u"durch ", u"die ", u"Sueskrise ", u"entstandene ", 321 | u"Entfremdung ", u"mit ", u"den ", u"USA", u", ", u"erreichte ", u"die ", u"Lieferung ", u"von ", 322 | u"amerikanischen ", u"Polaris", u"-", u"Mittelstreckenraketen ", u"als ", u"neuen ", u"Kern ", 323 | u"der ", u"britischen ", u"nuklearen ", u"Abschreckung ", u"und ", u"bereitete ", u"den ", u"Weg ", 324 | u"für ", u"ein ", u"partielles ", u"Atomteststoppabkommen", u"." 325 | ] 326 | ] 327 | ] 328 | for expression in expressions: 329 | self.assertEqual( 330 | sent_tokenize( 331 | "".join(w for sent in expression for w in sent), 332 | keep_whitespace=True, 333 | normalize_ascii=False 334 | ), 335 | expression 336 | ) 337 | --------------------------------------------------------------------------------