├── README.md
├── bindings
│   └── python
│       ├── MANIFEST.in
│       ├── .gitignore
│       ├── Cargo.toml
│       ├── pyproject.toml
│       ├── setup.py
│       ├── src
│       │   └── lib.rs
│       └── README.md
├── cutters
│   ├── src
│   │   ├── parsers
│   │   │   ├── mod.rs
│   │   │   ├── baseline.rs
│   │   │   ├── english.rs
│   │   │   └── croatian.rs
│   │   └── lib.rs
│   ├── .gitignore
│   ├── res
│   │   ├── baseline.pest
│   │   ├── hr.pest
│   │   └── en.pest
│   ├── Cargo.toml
│   └── README.md
└── LICENSE
/README.md: -------------------------------------------------------------------------------- 1 | cutters/README.md -------------------------------------------------------------------------------- /bindings/python/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include Cargo.toml 2 | recursive-include src * 3 | -------------------------------------------------------------------------------- /cutters/src/parsers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod baseline; 2 | pub mod croatian; 3 | pub mod english; 4 | -------------------------------------------------------------------------------- /cutters/.gitignore: -------------------------------------------------------------------------------- 1 | debug/ 2 | target/ 3 | 4 | Cargo.lock 5 | 6 | **/*.rs.bk 7 | 8 | *.pdb 9 | -------------------------------------------------------------------------------- /bindings/python/.gitignore: -------------------------------------------------------------------------------- 1 | debug/ 2 | target/ 3 | 4 | Cargo.lock 5 | 6 | **/*.rs.bk 7 | 8 | *.pdb 9 | -------------------------------------------------------------------------------- /cutters/res/baseline.pest: -------------------------------------------------------------------------------- 1 | complete_ending = _{ (WHITE_SPACE* ~ SENTENCE_TERMINAL+)+ } 2 | 3 | internal_sentence = _{ 4 | ((!SENTENCE_TERMINAL ~ ANY ) ~ internal_sentence) | 5 | (complete_ending) | 6 | (!WHITE_SPACE ~ ANY)+ 7 | } 8 | 9 | sentence = { internal_sentence } 10 | 11 | sentence_list = _{ (WHITE_SPACE* ~ sentence ~ WHITE_SPACE*)* } 12 | -------------------------------------------------------------------------------- /bindings/python/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cutters-python" 3 | version = "0.1.4" 4 | authors = ["cyanic-selkie <cyanic-selkie@protonmail.com>"] 5 | edition = "2021" 6 | 7 | [lib] 8 | name = "cutters" 9 | crate-type = ["cdylib"] 10 | 11 | [dependencies.cutters] 12 | version = "=0.1.4" 13 | 14 | [dependencies.pyo3] 15 | version = "=0.19.1" 16 | features = ["extension-module"] 17 | -------------------------------------------------------------------------------- /cutters/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cutters" 3 | version = "0.1.4" 4 | authors = ["cyanic-selkie <cyanic-selkie@protonmail.com>"] 5 | edition = "2021" 6 | description = "Rule based sentence segmentation library."
7 | license = "MIT" 8 | readme = "README.md" 9 | repository = "https://github.com/cyanic-selkie/cutters" 10 | categories = ["text-processing"] 11 | 12 | [dependencies] 13 | pest = "=2.5.7" 14 | pest_derive = "=2.5.7" 15 | -------------------------------------------------------------------------------- /bindings/python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=1.0,<2.0"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "cutters" 7 | version = "0.1.4" 8 | authors = [ 9 | {name = "cyanic-selkie", email = "cyanic-selkie@protonmail.com"} 10 | ] 11 | license = {text = "MIT"} 12 | description = "A rule based sentence segmentation library." 13 | readme = "README.md" 14 | classifiers=[ 15 | "Natural Language :: Croatian", 16 | "Natural Language :: English", 17 | "Topic :: Text Processing", 18 | ] 19 | 20 | [project.urls] 21 | Repository = "https://github.com/cyanic-selkie/cutters" 22 | -------------------------------------------------------------------------------- /bindings/python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools_rust import Binding, RustExtension 3 | 4 | setup( 5 | name="cutters", 6 | version="0.1.4", 7 | description="A rule based sentence segmentation library.", 8 | long_description=open("README.md", "r", encoding="utf-8").read(), 9 | long_description_content_type="text/markdown", 10 | author="cyanic-selkie", 11 | author_email="cyanic-selkie@protonmail.com", 12 | url="https://github.com/cyanic-selkie/cutters", 13 | license="MIT", 14 | rust_extensions=[ 15 | RustExtension("cutters.cutters", binding=Binding.PyO3, debug=False) 16 | ], 17 | classifiers=[ 18 | "Natural Language :: Croatian", 19 | "Natural Language :: English", 20 | "Topic :: Text Processing", 21 | ], 22 | zip_safe=False, 23 | ) 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 cyanic-selkie 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /cutters/src/parsers/baseline.rs: -------------------------------------------------------------------------------- 1 | use crate::Sentence; 2 | use pest::Parser; 3 | use pest_derive::*; 4 | 5 | #[derive(Parser)] 6 | #[grammar = "../res/baseline.pest"] 7 | pub struct BaselineParser; 8 | 9 | pub fn cut(text: &str) -> Vec<Sentence> { 10 | let ast = BaselineParser::parse(Rule::sentence_list, text).unwrap(); 11 | 12 | let mut sentences = vec![]; 13 | 14 | for sentence in ast { 15 | sentences.push(Sentence { 16 | str: sentence.as_str(), 17 | quotes: vec![], 18 | }); 19 | } 20 | 21 | sentences 22 | } 23 | 24 | #[cfg(test)] 25 | mod test { 26 | use super::*; 27 | 28 | #[test] 29 | fn basic() { 30 | let query_sentences = vec![ 31 | r#"This is a regular sentence."#, 32 | r#"This sentence ends with an exclamation mark!"#, 33 | r#"Does this sentence end with a question mark?"#, 34 | r#"This sentence drifts off..."#, 35 | r#"This sentence ends with a mix of characters ... !?"#, 36 | r#"This sentence doesn't have any characters at the end"#, 37 | ]; 38 | 39 | let text = query_sentences.join(" "); 40 | 41 | let sentences = cut(&text); 42 | 43 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 44 | assert!(query_sentence == sentence.str); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /cutters/src/lib.rs: -------------------------------------------------------------------------------- 1 | //! The `cutters` crate is a single function library used for segmenting text into sentences. 2 | //! 3 | //! # Examples 4 | //! 5 | //! The [cut] function returns a [Vec] of [Sentence] objects. 6 | //! 7 | //! ``` 8 | //! let text = "This is some example text. It contains two sentences."; 9 | //! 10 | //! let sentences = cutters::cut(text, cutters::Language::English); 11 | //! 12 | //! assert!(sentences[0].str == "This is some example text."); 13 | //! assert!(sentences[1].str == "It contains two sentences."); 14 | //! ``` 15 | //! 16 | //! If a sentence contains quotes, you can access them via the `quotes` field of the [Sentence] struct. 17 | //! 18 | //! ``` 19 | //! let text = r#"He said: "I'll be right there.""#; 20 | //! 21 | //! let sentences = cutters::cut(text, cutters::Language::English); 22 | //! 23 | //! assert!(sentences[0].quotes[0].str == "I'll be right there."); 24 | //! ``` 25 | //! 26 | //! And finally, if a quote contains multiple subsentences, you can access them via the `sentences` field of 27 | //! the [Quote] struct. 28 | //! 29 | //! ``` 30 | //! let text = r#"He said: "I'll be right there. Give me five minutes.""#; 31 | //! 32 | //! let sentences = cutters::cut(text, cutters::Language::English); 33 | //! 34 | //! assert!(sentences[0].quotes[0].sentences[0] == "I'll be right there."); 35 | //! assert!(sentences[0].quotes[0].sentences[1] == "Give me five minutes."); 36 | //!
``` 37 | 38 | mod parsers; 39 | 40 | use parsers::{baseline, croatian, english}; 41 | 42 | #[derive(Debug)] 43 | pub struct Quote<'a> { 44 | pub str: &'a str, 45 | 46 | pub sentences: Vec<&'a str>, 47 | } 48 | 49 | #[derive(Debug)] 50 | pub struct Sentence<'a> { 51 | pub str: &'a str, 52 | 53 | pub quotes: Vec<Quote<'a>>, 54 | } 55 | 56 | #[derive(Debug)] 57 | pub enum Language { 58 | Baseline, 59 | Croatian, 60 | English, 61 | } 62 | 63 | pub fn cut(text: &str, language: Language) -> Vec<Sentence> { 64 | match language { 65 | Language::Baseline => baseline::cut(text), 66 | Language::Croatian => croatian::cut(text), 67 | Language::English => english::cut(text), 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /bindings/python/src/lib.rs: -------------------------------------------------------------------------------- 1 | use ::cutters as cutters_rs; 2 | use pyo3::prelude::*; 3 | 4 | #[pyclass] 5 | #[derive(Clone, Debug)] 6 | pub struct Quote { 7 | #[pyo3(get)] 8 | pub str: String, 9 | #[pyo3(get)] 10 | pub sentences: Vec<String>, 11 | } 12 | 13 | #[pyclass] 14 | #[derive(Clone, Debug)] 15 | pub struct Sentence { 16 | #[pyo3(get)] 17 | pub str: String, 18 | #[pyo3(get)] 19 | pub quotes: Vec<Quote>, 20 | } 21 | 22 | #[pymethods] 23 | impl Quote { 24 | fn __repr__(&self) -> String { 25 | format!("{:#?}", self) 26 | } 27 | 28 | fn __str__(&self) -> String { 29 | self.__repr__() 30 | } 31 | } 32 | 33 | #[pymethods] 34 | impl Sentence { 35 | fn __repr__(&self) -> String { 36 | format!("{:#?}", self) 37 | } 38 | 39 | fn __str__(&self) -> String { 40 | self.__repr__() 41 | } 42 | } 43 | 44 | #[pyfunction] 45 | pub fn cut(text: &str, language: &str) -> PyResult<Vec<Sentence>> { 46 | let language = match language { 47 | "baseline" => cutters_rs::Language::Baseline, 48 | "hr" => cutters_rs::Language::Croatian, 49 | "en" => cutters_rs::Language::English, 50 | _ => { 51 | return Err(pyo3::exceptions::PyValueError::new_err(format!( 52 | "Language {} not supported.", 53 | language, 54 | ))) 55 | } 56 | }; 57 | 58 | let sentences = cutters_rs::cut(text, language); 59 | let mut sentences_python = vec![]; 60 | 61 | for sentence in &sentences { 62 | let str = sentence.str.to_string(); 63 | let mut quotes = vec![]; 64 | 65 | for quote in &sentence.quotes { 66 | let str = quote.str.to_string(); 67 | let mut sentences = vec![]; 68 | 69 | for sentence in &quote.sentences { 70 | sentences.push(sentence.to_string()); 71 | } 72 | 73 | quotes.push(Quote { str, sentences }); 74 | } 75 | 76 | sentences_python.push(Sentence { str, quotes }); 77 | } 78 | 79 | Ok(sentences_python) 80 | } 81 | 82 | #[pymodule] 83 | fn cutters(_py: Python, m: &PyModule) -> PyResult<()> { 84 | m.add_function(wrap_pyfunction!(cut, m)?)?; 85 | Ok(()) 86 | } 87 | -------------------------------------------------------------------------------- /bindings/python/README.md: -------------------------------------------------------------------------------- 1 |
2 | # cutters
3 |
4 | A rule based sentence segmentation library.
5 | Python bindings for the cutters library written in Rust.
6 |
7 |
8 |
9 | Badges: Release · License · Downloads
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | 🚧 This library is experimental. 🚧
19 |
20 | 21 | ## Features 22 | - Full UTF-8 support. 23 | - Robust parsing. 24 | - Language specific rules (each defined by its own [PEG](https://en.wikipedia.org/wiki/Parsing_expression_grammar)). 25 | - Fast and memory efficient parsing via the [pest](https://github.com/pest-parser/pest) library. 26 | - Sentences can contain quotes which can contain subsentences. 27 | 28 | ## Supported languages 29 | - Croatian (standard) 30 | - English (standard) 31 | 32 | There is also an additional `Baseline` "language" that simply splits the text on [sentence terminals](https://unicode.org/L2/L2003/03145-sentence-term.htm) as defined by UTF-8. Its intended use is for benchmarking. 33 | 34 | ## Example 35 | 36 | After installing the `cutters` package with `pip`, usage is simple (note that the language is specified via [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) two-letter language codes). 37 | 38 | ```python 39 | import cutters 40 | 41 | text = """ 42 | Petar Krešimir IV. je vladao od 1058. do 1074. St. Louis 9LX je događaj u svijetu šaha. To je prof.dr.sc. Ivan Horvat. Volim rock, punk, funk, pop itd. Tolstoj je napisao: "Sve sretne obitelji nalik su jedna na drugu. Svaka nesretna obitelj nesretna je na svoj način." 43 | """ 44 | 45 | sentences = cutters.cut(text, "hr") 46 | 47 | print(sentences) 48 | ``` 49 | 50 | This results in the following output (the `str` fields are regular Python strings; the printed form is the debug representation of the underlying Rust structs). 51 | ``` 52 | [Sentence { 53 | str: "Petar Krešimir IV. je vladao od 1058. do 1074. ", 54 | quotes: [], 55 | }, Sentence { 56 | str: "St. Louis 9LX je događaj u svijetu šaha.", 57 | quotes: [], 58 | }, Sentence { 59 | str: "To je prof.dr.sc. Ivan Horvat.", 60 | quotes: [], 61 | }, Sentence { 62 | str: "Volim rock, punk, funk, pop itd.", 63 | quotes: [], 64 | }, Sentence { 65 | str: "Tolstoj je napisao: \"Sve sretne obitelji nalik su jedna na drugu. Svaka nesretna obitelj nesretna je na svoj način.\"", 66 | quotes: [ 67 | Quote { 68 | str: "Sve sretne obitelji nalik su jedna na drugu. Svaka nesretna obitelj nesretna je na svoj način.", 69 | sentences: [ 70 | "Sve sretne obitelji nalik su jedna na drugu.", 71 | "Svaka nesretna obitelj nesretna je na svoj način.", 72 | ], 73 | }, 74 | ], 75 | }] 76 | ``` 77 | -------------------------------------------------------------------------------- /cutters/README.md: -------------------------------------------------------------------------------- 1 |
2 | # cutters
3 |
4 | A rule based sentence segmentation library.
5 |
6 |
7 |
8 | Badges: Release · Docs · License · Downloads
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | 🚧 This library is experimental. 🚧
21 |
22 | 23 | ## Features 24 | - Full UTF-8 support. 25 | - Robust parsing. 26 | - Language specific rules (each defined by its own [PEG](https://en.wikipedia.org/wiki/Parsing_expression_grammar)). 27 | - Fast and memory efficient parsing via the [pest](https://github.com/pest-parser/pest) library. 28 | - Sentences can contain quotes which can contain subsentences. 29 | 30 | ## Bindings 31 | 32 | Besides native Rust, bindings for the following programming languages are available: 33 | - [Python](https://pypi.org/project/cutters/) 34 | 35 | ## Supported languages 36 | - Croatian (standard) 37 | - English (standard) 38 | 39 | There is also an additional `Baseline` "language" that simply splits the text on [sentence terminals](https://unicode.org/L2/L2003/03145-sentence-term.htm) as defined by UTF-8. Its intended use is for benchmarking. 40 | 41 | ## Example 42 | 43 | After adding the `cutters` dependency to your `Cargo.toml` file, usage is simple. 44 | 45 | ```rust 46 | fn main(){ 47 | let text = r#"Petar Krešimir IV. je vladao od 1058. do 1074. St. Louis 9LX je događaj u svijetu šaha. To je prof.dr.sc. Ivan Horvat. Volim rock, punk, funk, pop itd. Tolstoj je napisao: "Sve sretne obitelji nalik su jedna na drugu. Svaka nesretna obitelj nesretna je na svoj način.""#; 48 | 49 | let sentences = cutters::cut(text, cutters::Language::Croatian); 50 | 51 | println!("{:#?}", sentences); 52 | } 53 | ``` 54 | 55 | This results in the following output (note that the `str` struct fields are `&str`). 56 | ``` 57 | [ 58 | Sentence { 59 | str: "Petar Krešimir IV. je vladao od 1058. do 1074. ", 60 | quotes: [], 61 | }, 62 | Sentence { 63 | str: "St. Louis 9LX je događaj u svijetu šaha.", 64 | quotes: [], 65 | }, 66 | Sentence { 67 | str: "To je prof.dr.sc. Ivan Horvat.", 68 | quotes: [], 69 | }, 70 | Sentence { 71 | str: "Volim rock, punk, funk, pop itd.", 72 | quotes: [], 73 | }, 74 | Sentence { 75 | str: "Tolstoj je napisao: \"Sve sretne obitelji nalik su jedna na drugu. Svaka nesretna obitelj nesretna je na svoj način.\"", 76 | quotes: [ 77 | Quote { 78 | str: "Sve sretne obitelji nalik su jedna na drugu. Svaka nesretna obitelj nesretna je na svoj način.", 79 | sentences: [ 80 | "Sve sretne obitelji nalik su jedna na drugu.", 81 | "Svaka nesretna obitelj nesretna je na svoj način.", 82 | ], 83 | }, 84 | ], 85 | }, 86 | ] 87 | ``` 88 | -------------------------------------------------------------------------------- /cutters/res/hr.pest: -------------------------------------------------------------------------------- 1 | abbreviation = _{ 2 | ( 3 | ( 4 | ("itd" | "tzv" | "tj" | "br" | "sl" | "npr") | // common, dr excluded since it more often means doctor 5 | ("prof" | "dr" | "izv" | "sc" | "doc" | "ing" | "dipl" | "bacc" ) | // titles 6 | ("Ave" | "Blvd" | "Cyn" | "Dr" | "Ln" | "Rd" | "St" | "Mr" | "Mrs" | "Ltd" | "no" | "vs" | "est" | "Jr" | "Sr" | "ca" | "cca") // foreign 7 | ) 8 | ~ WHITE_SPACE* ~ ".") | 9 | // special 10 | ("P" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "S" ~ WHITE_SPACE* ~ ".") | // P.S. 11 | ("P" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "P" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "S" ~ WHITE_SPACE* ~ ".") | // P.P.S. 12 | ("Q" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "E" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "D" ~ WHITE_SPACE* ~ ".") | // Q.E.D. 13 | ("R" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "I" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "P" ~ WHITE_SPACE* ~ ".") | // R.I.P. 14 | ("S" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "O" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "S" ~ WHITE_SPACE* ~ ".") | // S.O.S. 
15 | ("n" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "b" ~ WHITE_SPACE* ~ ".") | // n.b. 16 | ("A" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "D" ~ WHITE_SPACE* ~ ".") | // A.D. 17 | ("B" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "C" ~ WHITE_SPACE* ~ ".") | // B.C. 18 | ("O" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "K" ~ WHITE_SPACE* ~ ".") | // O.K. 19 | ("Ph" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "D") // Ph.d 20 | } 21 | 22 | // abbreviations that are most often at the end of a sentence 23 | ending_abbreviation = _{ 24 | ("itd" | "sl") ~ WHITE_SPACE* ~ "." 25 | } 26 | 27 | roman_numeral = _{ 28 | &("M" | "D" | "C" | "L" | "X" | "V" | "I") ~ 29 | ( 30 | "M"* ~ (("C" ~ ("M" | "D")) | ("D"? ~ "C"*)) ~ (("X" ~ ("C" ~ "L")) | ("L"? ~ "X"*)) ~ (("I" ~ ("X" | "V")) | ("V"? ~ "I"*)) 31 | ) 32 | } 33 | number = _{ 34 | NUMBER+ | 35 | roman_numeral 36 | } 37 | 38 | ignoreable = _{ 39 | ("(" ~ (!")" ~ ANY)* ~ ")") | 40 | ("[" ~ (!"]" ~ ANY)* ~ "]") | 41 | ("{" ~ (!"}" ~ ANY)* ~ "}") 42 | } 43 | 44 | possible_sentence_start = _{ WHITE_SPACE* ~ (UPPERCASE_LETTER | TITLECASE_LETTER | QUOTATION_MARK | NUMBER) } 45 | complete_ending = _{ (WHITE_SPACE* ~ SENTENCE_TERMINAL+)+ } 46 | 47 | quoted_internal_sentence = _{ 48 | &(NEWLINE+) | 49 | (WHITE_SPACE+ ~ ending_abbreviation ~ &possible_sentence_start) | // abbreviation at the end of a sentence 50 | ((number ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE*)+ ~ &possible_sentence_start ~ !number) | 51 | ((number ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE*)+ ~ !(WHITE_SPACE* ~ (UPPERCASE_LETTER | TITLECASE_LETTER)) ~ quoted_internal_sentence) | 52 | ((ignoreable | (WHITE_SPACE+ ~ abbreviation+) | (!(SENTENCE_TERMINAL | QUOTATION_MARK) ~ ANY ) | (SENTENCE_TERMINAL ~ !possible_sentence_start)) ~ quoted_internal_sentence) | 53 | (complete_ending) | 54 | &QUOTATION_MARK 55 | } 56 | 57 | quote_sentence = { &possible_sentence_start ~ !QUOTATION_MARK ~ WHITE_SPACE* ~ abbreviation* ~ quoted_internal_sentence } 58 | 59 | quote = { 60 | (WHITE_SPACE* ~ quote_sentence ~ (!NEWLINE ~ WHITE_SPACE)*)+ 61 | } 62 | 63 | quote_wrapper = _{ 64 | (QUOTATION_MARK ~ quote ~ (&(NEWLINE+) | QUOTATION_MARK)) 65 | } 66 | 67 | quoted_phrase = _{ 68 | (QUOTATION_MARK ~ !possible_sentence_start ~ (!QUOTATION_MARK~ ANY)* ~ (&(NEWLINE+) | QUOTATION_MARK)) 69 | } 70 | 71 | internal_sentence = _{ 72 | &(NEWLINE+) | 73 | (WHITE_SPACE+ ~ ending_abbreviation+ ~ &possible_sentence_start) | 74 | (quoted_phrase ~ internal_sentence) | 75 | (quote_wrapper ~ (&possible_sentence_start | internal_sentence | &EOI)) | 76 | ((number ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE*)+ ~ ((&possible_sentence_start ~ !number) | &EOI)) | 77 | ((number ~ WHITE_SPACE* ~ "."
~ WHITE_SPACE*)+ ~ !(WHITE_SPACE* ~ (UPPERCASE_LETTER | TITLECASE_LETTER)) ~ internal_sentence) | 78 | ((ignoreable | (WHITE_SPACE+ ~ abbreviation+) | (!SENTENCE_TERMINAL ~ ANY ) | (SENTENCE_TERMINAL ~ !possible_sentence_start)) ~ internal_sentence) | 79 | (complete_ending) | 80 | (!WHITE_SPACE ~ ANY)+ 81 | } 82 | 83 | sentence = { &possible_sentence_start ~ WHITE_SPACE* ~ abbreviation* ~ internal_sentence } 84 | 85 | sentence_list = _{ (WHITE_SPACE* ~ (sentence | ANY) ~ WHITE_SPACE*)* } 86 | -------------------------------------------------------------------------------- /cutters/res/en.pest: -------------------------------------------------------------------------------- 1 | contraction = _{ 2 | ("'" ~ 3 | ("t" | "n" | "cause" | "cept" | "ve" | "ye" | "en" | "er" | "em" | "s" | "gainst" | "d" | "ll" | "re" | "nt" | "m" | "o" | "am" | "neath" | "round" | "thout" | "til" | "tis" | "twas" | "tween" | "twere" | "all" | "ren" | "at" | "know" ) // suffixes 4 | ) | 5 | ("o'clock" | "ol'") // full 6 | } 7 | 8 | abbreviation = _{ 9 | ( 10 | ( 11 | ("ca" | "cca" | "def" | "anon" | "ed" | "no" | "vs" | "est") | // common 12 | ("Mr" | "Mrs" | "Dr" | "Esq" | "Hon" | "Jr" | "Mr" | "Mrs" | "Ms" | "Msgr" | "Prof" | "Rev" | "Rt" | "Sr") | // titles 13 | ("Ave" | "Blvd" | "Cyn" | "Dr" | "Ln" | "Rd" | "St" | "Ltd") 14 | ) 15 | ~ WHITE_SPACE* ~ ".") | 16 | // special 17 | ("P" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "S" ~ WHITE_SPACE* ~ ".") | // P.S. 18 | ("P" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "P" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "S" ~ WHITE_SPACE* ~ ".") | // P.P.S. 19 | ("Q" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "E" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "D" ~ WHITE_SPACE* ~ ".") | // Q.E.D. 20 | ("R" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "I" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "P" ~ WHITE_SPACE* ~ ".") | // R.I.P. 21 | ("S" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "O" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "S" ~ WHITE_SPACE* ~ ".") | // S.O.S. 22 | ("e" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "g" ~ WHITE_SPACE* ~ ".") | // e.g. 23 | ("i" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "e" ~ WHITE_SPACE* ~ ".") | // i.e. 24 | ("n" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "b" ~ WHITE_SPACE* ~ ".") | // n.b. 25 | ("Ph" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "D") | // Ph.d 26 | ("A" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "D" ~ WHITE_SPACE* ~ ".") | // A.D 27 | ("B" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "C" ~ WHITE_SPACE* ~ ".") | // B.C. 28 | ("a" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "m" ~ WHITE_SPACE* ~ ".") | // a.m. 29 | ("p" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "m" ~ WHITE_SPACE* ~ ".") | // p.m. 30 | ("O" ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ "K" ~ WHITE_SPACE* ~ ".") // O.K. 31 | } 32 | 33 | // abbreviations that are most often at the end of a sentence 34 | ending_abbreviation = _{ 35 | ("etc") ~ WHITE_SPACE* ~ "." 36 | } 37 | 38 | roman_numeral = _{ 39 | &("M" | "D" | "C" | "L" | "X" | "V" | "I") ~ 40 | ( 41 | "M"* ~ (("C" ~ ("M" | "D")) | ("D"? ~ "C"*)) ~ (("X" ~ ("C" ~ "L")) | ("L"? ~ "X"*)) ~ (("I" ~ ("X" | "V")) | ("V"? 
~ "I"*)) 42 | ) 43 | } 44 | number = _{ 45 | NUMBER+ | 46 | roman_numeral 47 | } 48 | 49 | ignoreable = _{ 50 | ("(" ~ (!")" ~ ANY)* ~ ")") | 51 | ("[" ~ (!"]" ~ ANY)* ~ "]") | 52 | ("{" ~ (!"}" ~ ANY)* ~ "}") 53 | } 54 | 55 | possible_sentence_start = _{ WHITE_SPACE* ~ (UPPERCASE_LETTER | TITLECASE_LETTER | QUOTATION_MARK | NUMBER) } 56 | complete_ending = _{ (WHITE_SPACE* ~ SENTENCE_TERMINAL+)+ } 57 | 58 | quoted_internal_sentence = _{ 59 | &(NEWLINE+) | 60 | (WHITE_SPACE+ ~ ending_abbreviation+ ~ &possible_sentence_start) | // abbreviation at the end of a sentence 61 | ((number ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ number) ~ &possible_sentence_start ~ !number) | 62 | ((number ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ number) ~ !(WHITE_SPACE* ~ (UPPERCASE_LETTER | TITLECASE_LETTER)) ~ quoted_internal_sentence) | 63 | ((ignoreable | contraction | (WHITE_SPACE+ ~ abbreviation+) | (!(SENTENCE_TERMINAL | QUOTATION_MARK) ~ ANY ) | (SENTENCE_TERMINAL ~ !possible_sentence_start)) ~ quoted_internal_sentence) | 64 | (complete_ending) | 65 | &QUOTATION_MARK 66 | } 67 | 68 | quote_sentence = { &possible_sentence_start ~ !QUOTATION_MARK ~ WHITE_SPACE* ~ abbreviation* ~ quoted_internal_sentence } 69 | 70 | quote = { 71 | (WHITE_SPACE* ~ quote_sentence ~ (!NEWLINE ~ WHITE_SPACE)*)+ 72 | } 73 | 74 | quote_wrapper = _{ 75 | (QUOTATION_MARK ~ quote ~ (&(NEWLINE+) | QUOTATION_MARK)) 76 | } 77 | 78 | quoted_phrase = _{ 79 | (QUOTATION_MARK ~ !possible_sentence_start ~ (!QUOTATION_MARK~ ANY)* ~ (&(NEWLINE+) | QUOTATION_MARK)) 80 | } 81 | 82 | internal_sentence = _{ 83 | &(NEWLINE+) | 84 | (WHITE_SPACE+ ~ ending_abbreviation+ ~ &possible_sentence_start) | 85 | (contraction ~ internal_sentence) | 86 | (quoted_phrase ~ internal_sentence) | 87 | ((number ~ WHITE_SPACE* ~ "." ~ WHITE_SPACE* ~ number) ~ ((&possible_sentence_start ~ !number) | &EOI)) | 88 | ((number ~ WHITE_SPACE* ~ "."
~ WHITE_SPACE* ~ number) ~ !(WHITE_SPACE* ~ (UPPERCASE_LETTER | TITLECASE_LETTER)) ~ internal_sentence) | 89 | (quote_wrapper ~ (&possible_sentence_start | internal_sentence | &EOI)) | 90 | ((ignoreable | (WHITE_SPACE+ ~ abbreviation+) | (!SENTENCE_TERMINAL ~ ANY ) | (SENTENCE_TERMINAL ~ !possible_sentence_start)) ~ internal_sentence) | 91 | (complete_ending) | 92 | (!WHITE_SPACE ~ ANY)+ 93 | } 94 | 95 | sentence = { &possible_sentence_start ~ WHITE_SPACE* ~ abbreviation* ~ internal_sentence } 96 | 97 | sentence_list = _{ (WHITE_SPACE* ~ (sentence | ANY) ~ WHITE_SPACE*)* } 98 | -------------------------------------------------------------------------------- /cutters/src/parsers/english.rs: -------------------------------------------------------------------------------- 1 | use crate::{Quote, Sentence}; 2 | use pest::Parser; 3 | use pest_derive::*; 4 | 5 | #[derive(Parser)] 6 | #[grammar = "../res/en.pest"] 7 | pub struct EnglishParser; 8 | 9 | pub fn cut(text: &str) -> Vec<Sentence> { 10 | let ast = EnglishParser::parse(Rule::sentence_list, text).unwrap(); 11 | 12 | let mut sentences = vec![]; 13 | 14 | for sentence in ast { 15 | let str = sentence.as_str(); 16 | let mut quotes = vec![]; 17 | 18 | for quote in sentence.into_inner() { 19 | let str = quote.as_str(); 20 | let mut sentences = vec![]; 21 | 22 | for sentence in quote.into_inner() { 23 | sentences.push(sentence.as_str()); 24 | } 25 | 26 | quotes.push(Quote { str, sentences }); 27 | } 28 | 29 | sentences.push(Sentence { str, quotes }); 30 | } 31 | 32 | sentences 33 | } 34 | 35 | #[cfg(test)] 36 | mod test { 37 | use super::*; 38 | 39 | #[test] 40 | fn basic() { 41 | let query_sentences = vec![ 42 | r#"This is a declarative sentence."#, 43 | r#"This is an exclamatory sentence!"#, 44 | r#"This is an interrogative sentence?"#, 45 | r#"This sentence ends with three dots..."#, 46 | r#"This sentence ends with a sequence of sentence terminals ... !?"#, 47 | r#"This sentence doesn't end with a sentence terminal"#, 48 | ]; 49 | 50 | let text = query_sentences.join(" "); 51 | 52 | let sentences = cut(&text); 53 | 54 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 55 | assert!(query_sentence == sentence.str); 56 | } 57 | } 58 | 59 | #[test] 60 | fn brackets() { 61 | let query_sentences = vec![ 62 | r#"European Union (hrv.
Europska Unija) is a political and economic union of 27 member states that are located primarily in Europe."#, 63 | ]; 64 | 65 | let text = query_sentences.join(" "); 66 | 67 | let sentences = cut(&text); 68 | 69 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 70 | assert!(query_sentence == sentence.str); 71 | } 72 | } 73 | 74 | #[test] 75 | fn quotes() { 76 | let query_quotes = vec![ 77 | vec![vec![ 78 | r#"Sve sretne obitelji nalik su jedna na drugu, svaka nesretna obitelj nesretna je na svoj način."#, 79 | ]], 80 | vec![vec![r#"Hvala."#, r#"Ja također."#]], 81 | vec![vec![r#"Pazi!"#]], 82 | vec![vec![r#"Koliko je sati?"#], vec![r#"Pola jedan."#]], 83 | vec![vec![r#"Uspjet ćemo sve napraviti na vrijeme"#]], 84 | ]; 85 | 86 | let query_sentences = vec![ 87 | format!(r#"Tolstoj je napisao: „{}”"#, query_quotes[0][0][0]), 88 | format!( 89 | r#"Rekao je: „{} {}”"#, 90 | query_quotes[1][0][0], query_quotes[1][0][1] 91 | ), 92 | format!(r#"Uzviknuo je: '{}'"#, query_quotes[2][0][0]), 93 | format!( 94 | r#"Upitao je: „{}”, a ja sam rekao: "{}""#, 95 | query_quotes[3][0][0], query_quotes[3][1][0] 96 | ), 97 | format!(r#""{}", rekao je."#, query_quotes[4][0][0]), 98 | ]; 99 | 100 | let text = query_sentences.join(" "); 101 | 102 | let sentences = cut(&text); 103 | 104 | for (i, sentence) in sentences.iter().enumerate() { 105 | assert!(query_sentences[i] == sentence.str); 106 | 107 | for (j, quote) in sentence.quotes.iter().enumerate() { 108 | for (k, quote_sentence) in quote.sentences.iter().enumerate() { 109 | assert!(&query_quotes[i][j][k] == quote_sentence); 110 | } 111 | } 112 | } 113 | } 114 | 115 | #[test] 116 | fn numbers() { 117 | let query_sentences = vec![ 118 | r#"The average is 23.42 points and 12,18% of the students failed the test."#, 119 | r#"The mode is 23.1."#, 120 | r#"Before he fell, he was 1st."#, 121 | ]; 122 | 123 | let text = query_sentences.join(" "); 124 | 125 | let sentences = cut(&text); 126 | 127 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 128 | assert!(query_sentence == sentence.str); 129 | } 130 | } 131 | 132 | #[test] 133 | fn abbreviations() { 134 | let query_sentences = vec![ 135 | r#"St. Louis 9LX is a chess event."#, 136 | r#"We listened to Beethoven, Schubert, Liszt etc."#, 137 | r#"That is Dr. 
John Smith."#, 138 | ]; 139 | 140 | let text = query_sentences.join(" "); 141 | 142 | let sentences = cut(&text); 143 | 144 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 145 | assert!(query_sentence == sentence.str); 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /cutters/src/parsers/croatian.rs: -------------------------------------------------------------------------------- 1 | use crate::{Quote, Sentence}; 2 | use pest::Parser; 3 | use pest_derive::*; 4 | 5 | #[derive(Parser)] 6 | #[grammar = "../res/hr.pest"] 7 | pub struct CroatianParser; 8 | 9 | pub fn cut(text: &str) -> Vec<Sentence> { 10 | let ast = CroatianParser::parse(Rule::sentence_list, text).unwrap(); 11 | 12 | let mut sentences = vec![]; 13 | 14 | for sentence in ast { 15 | let str = sentence.as_str(); 16 | let mut quotes = vec![]; 17 | 18 | for quote in sentence.into_inner() { 19 | let str = quote.as_str(); 20 | let mut sentences = vec![]; 21 | 22 | for sentence in quote.into_inner() { 23 | sentences.push(sentence.as_str()); 24 | } 25 | 26 | quotes.push(Quote { str, sentences }); 27 | } 28 | 29 | sentences.push(Sentence { str, quotes }); 30 | } 31 | 32 | sentences 33 | } 34 | 35 | #[cfg(test)] 36 | mod test { 37 | use super::*; 38 | 39 | #[test] 40 | fn basic() { 41 | let query_sentences = vec![ 42 | r#"Ovo je izjavna rečenica."#, 43 | r#"Ovo je usklična rečenica!"#, 44 | r#"Ovo je upitna rečenica?"#, 45 | r#"Ovo je rečenica s tri točkice..."#, 46 | r#"Ova rečenica završava s nizom znakova ... !?"#, 47 | r#"Ova rečenica nema točke na kraju"#, 48 | ]; 49 | 50 | let text = query_sentences.join(" "); 51 | 52 | let sentences = cut(&text); 53 | 54 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 55 | assert!(query_sentence == sentence.str); 56 | } 57 | } 58 | 59 | #[test] 60 | fn brackets() { 61 | let query_sentences = vec![ 62 | r#"Novi standard temelji se na smjernicama iz novog Priručnika za sastavljače i korisnike statističkih pokazatelja o inozemnoj zaduženosti (engl. External Debt Statistics - Guide for Compilers and Users), a prihvatile su ga zemlje potpisnice Posebnog standarda o statističkom izvješčivanju (engl.
Special Data Dissemination Standard - SDDS)."#, 63 | ]; 64 | 65 | let text = query_sentences.join(" "); 66 | 67 | let sentences = cut(&text); 68 | 69 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 70 | assert!(query_sentence == sentence.str); 71 | } 72 | } 73 | 74 | #[test] 75 | fn quotes() { 76 | let query_quotes = vec![ 77 | vec![vec![ 78 | r#"Sve sretne obitelji nalik su jedna na drugu, svaka nesretna obitelj nesretna je na svoj način."#, 79 | ]], 80 | vec![vec![r#"Hvala."#, r#"Ja također."#]], 81 | vec![vec![r#"Pazi!"#]], 82 | vec![vec![r#"Koliko je sati?"#], vec![r#"Pola jedan."#]], 83 | vec![vec![r#"Uspjet ćemo sve napraviti na vrijeme"#]], 84 | ]; 85 | 86 | let query_sentences = vec![ 87 | format!(r#"Tolstoj je napisao: „{}”"#, query_quotes[0][0][0]), 88 | format!( 89 | r#"Rekao je: „{} {}”"#, 90 | query_quotes[1][0][0], query_quotes[1][0][1] 91 | ), 92 | format!(r#"Uzviknuo je: '{}'"#, query_quotes[2][0][0]), 93 | format!( 94 | r#"Upitao je: „{}”, a ja sam rekao: "{}""#, 95 | query_quotes[3][0][0], query_quotes[3][1][0] 96 | ), 97 | format!(r#""{}", rekao je."#, query_quotes[4][0][0]), 98 | ]; 99 | 100 | let text = query_sentences.join(" "); 101 | 102 | let sentences = cut(&text); 103 | 104 | for (i, sentence) in sentences.iter().enumerate() { 105 | assert!(query_sentences[i] == sentence.str); 106 | 107 | for (j, quote) in sentence.quotes.iter().enumerate() { 108 | for (k, quote_sentence) in quote.sentences.iter().enumerate() { 109 | assert!(&query_quotes[i][j][k] == quote_sentence); 110 | } 111 | } 112 | } 113 | } 114 | 115 | #[test] 116 | fn numbers() { 117 | let query_sentences = vec![ 118 | r#"Završila sam 2. razred."#, 119 | r#"Sretna 2013.!"#, 120 | r#"U vrijeme rata (1991. – 1995.) sve je bilo drukčije."#, 121 | r#"Upisala se na studij 2005./2006., a diplomirala je 2009./2010."#, 122 | r#"Možeš li doći do 15.?"#, 123 | r#"Rođen je 6. XI. 1989. godine."#, 124 | r#"Petar Krešimir IV. jedan je od najslavnijih hrvatskih vladara."#, 125 | r#"Prosjek je 23.42 bodova, a 12,18% studenata je palo."#, 126 | r#"Ovo takoder, ali sa datumom npr. 28.8.1999."#, 127 | ]; 128 | 129 | let text = query_sentences.join(" "); 130 | 131 | let sentences = cut(&text); 132 | 133 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 134 | assert!(query_sentence == sentence.str); 135 | } 136 | } 137 | 138 | #[test] 139 | fn abbreviations() { 140 | let query_sentences = vec![ 141 | r#"St. Louis 9LX događaj u svijetu šaha."#, 142 | r#"Slušali smo Beethovena, Schuberta, Liszta itd."#, 143 | r#"To je izv.prof.dr.sc. Ivan Horvat i predaje na sveučilištu u Zagrebu."#, 144 | ]; 145 | 146 | let text = query_sentences.join(" "); 147 | 148 | let sentences = cut(&text); 149 | 150 | for (sentence, query_sentence) in sentences.iter().zip(query_sentences) { 151 | assert!(query_sentence == sentence.str); 152 | } 153 | } 154 | } 155 | --------------------------------------------------------------------------------