├── .gitignore ├── rustfmt.toml ├── .editorconfig ├── .gitlab-ci.yml ├── Cargo.toml ├── LICENSE-MIT ├── .travis.yml ├── README.md ├── examples └── markov.rs ├── LICENSE-APACHE └── src └── lib.rs /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | format_strings = false 2 | reorder_imports = true 3 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | charset = utf-8 6 | trim_trailing_whitespace = true 7 | insert_final_newline = true 8 | indent_style = space 9 | indent_size = 4 10 | 11 | [*.md] 12 | trim_trailing_whitespace = false 13 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | build: 2 | script: 3 | - cargo build --verbose --release 4 | - cargo test -- verbose 5 | tags: 6 | - rust 7 | pages: 8 | stage: deploy 9 | script: 10 | - cargo doc --verbose 11 | - mkdir -p public/ 12 | - cp -r target/doc/* public/ 13 | artifacts: 14 | paths: 15 | - public 16 | only: 17 | - master -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "ngrams" 3 | version = "1.0.1" 4 | authors = ["Paul Woolcock "] 5 | repository = "https://gitlab.com/pwoolcoc/ngrams.git" 6 | homepage = "https://pwoolcoc.gitlab.io/ngrams/ngrams/" 7 | license = "MIT/Apache-2.0" 8 | readme = "README.md" 9 | documentation = "https://pwoolcoc.gitlab.io/ngrams" 10 | description = """ 11 | Generate n-grams from 
sequences 12 | """ 13 | 14 | [dependencies] 15 | clippy = { version = "0.0.23", optional = true } 16 | 17 | [features] 18 | default = [] 19 | dev = ["clippy"] 20 | 21 | [dev-dependencies] 22 | rand = "^0.3.12" 23 | flate2 = "^0.2.11" 24 | curl = "^0.3.5" 25 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 The Rust Project Developers 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. 
26 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: rust 3 | rust: 4 | - nightly 5 | - beta 6 | - stable 7 | matrix: 8 | allow_failures: 9 | - rust: nightly 10 | before_script: 11 | - | 12 | pip install 'travis-cargo<0.2' --user && 13 | export PATH=$HOME/.local/bin:$PATH 14 | script: 15 | - | 16 | travis-cargo build && 17 | travis-cargo test && 18 | travis-cargo bench && 19 | travis-cargo --only stable doc 20 | addons: 21 | apt: 22 | packages: 23 | - libcurl4-openssl-dev 24 | - libelf-dev 25 | - libdw-dev 26 | after_success: 27 | - travis-cargo --only stable doc-upload 28 | - travis-cargo coveralls --no-sudo 29 | notifications: 30 | email: 31 | on_success: never 32 | env: 33 | global: 34 | - TRAVIS_CARGO_NIGHTLY_FEATURE=dev 35 | - secure: lJr+B+7MVBqEcoXUxNN60Akksbqn0JVPU56dkQ3aL5XXYQ5MJ+jH3k30GcjzmKaSkuUVYLhS2a9NWz+1PXKKV6be5qh8yA5POairLOAobe2qDNXtIQIJg5jjsXE2GodiPyuKgY7B+l6I8sackgK1Q+pYV7EN4SpjCpRtQWkhk2sL1GVDYUiNbnSiJQn41pldK52y+qB4NUySoKEOj0SFDQCIiX++PziQykveMeWEJ9XviDa6fdzB+ihQEpAD98Kcv7eQrCJFr9xJBwbewIdAmJkNXqDc4z3Qadw29BcLoWu30lwRhCZvMGWCb/zyoGpCEO3gAxyOmNmGgcSfwBChmghWEu4olaOHabPxpWx0SU6D9qszlPRfAh3NrzYnCvHOGhy9W3cLLYSs3hJWHmbrpNwLMp5CuDpu4n4OM7JTny2AWQUcDnq/5le1fntC+RZNbkpCqT1knD8jKmBmygw9JiPzC0a7Fi8JRiQoWLLqS/ho4VxlvKSWZcp6gLU08xn9V3IN0Me1zprkE+94/H6/+KEozv403w4H73iT8dEGlxXKDXWKXYIu2TbGVsn5xiKHSlZ/IWPMPtIEIrXazd+KtSVuGeFQfC+JTzIbj1q9DTJKNmxmfRhv2AHbxvPzFJ+6ZUHHDht1lbP5AHP2GIi/NjT7QrjmOv+qmVtuZwLz860= 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # N-grams 2 | 3 | [![Build Status](https://gitlab.com/pwoolcoc/ngrams/badges/master/build.svg)](https://gitlab.com/pwoolcoc/ngrams) 4 | [![Coverage 
Status](https://coveralls.io/repos/pwoolcoc/ngrams/badge.svg?branch=master&service=github)](https://coveralls.io/github/pwoolcoc/ngrams?branch=master) 5 | [![](https://meritbadge.herokuapp.com/ngrams)](https://crates.io/crates/ngrams) 6 | 7 | [Documentation](https://pwoolcoc.gitlab.io/ngrams/ngrams) 8 | 9 | This crate takes a sequence of tokens and generates the n-grams for it. 10 | For more information about n-grams, see Wikipedia: https://en.wikipedia.org/wiki/N-gram 11 | 12 | *Note*: The canonical version of this crate is hosted on [GitLab](https://gitlab.com/pwoolcoc/ngrams) 13 | 14 | ## Usage 15 | 16 | Probably the easiest way to use it is to use the iterator adaptor. If 17 | your tokens are strings (`&str`, `String`, `char`, or `Vec<u8>`), you don't have 18 | to do anything other than generate the token stream: 19 | 20 | ```rust 21 | use ngrams::Ngram; 22 | let grams: Vec<_> = "one two three".split(' ').ngrams(2).pad().collect(); 23 | // => vec![ 24 | // vec!["\u{2060}", "one"], 25 | // vec!["one", "two"], 26 | // vec!["two", "three"], 27 | // vec!["three", "\u{2060}"], 28 | // ] 29 | ``` 30 | 31 | (re: the "\u{2060}": calling `.pad()` pads the beginning and end of the token stream with the 32 | unicode `WORD JOINER` symbol; leave `.pad()` out to get only the unpadded n-grams.) 33 | 34 | If your token type isn't one of the listed types, you can still use the 35 | iterator adaptor by implementing the `ngrams::Pad` trait for your type. 36 | -------------------------------------------------------------------------------- /examples/markov.rs: -------------------------------------------------------------------------------- 1 | //! This is just an awful, awful implementation of a Markov chain generator, but 2 | //! it serves the purpose, which is to use ngrams to generate subsequences of words 3 | //! 
in a corpus 4 | 5 | extern crate ngrams; 6 | extern crate rand; 7 | extern crate flate2; 8 | extern crate curl; 9 | 10 | use ngrams::Ngrams; 11 | use std::collections::HashMap; 12 | use std::io::{Cursor, BufRead, BufReader}; 13 | use flate2::read::GzDecoder; 14 | use curl::easy::Easy; 15 | 16 | use rand::distributions::{Weighted, WeightedChoice, IndependentSample}; 17 | 18 | #[derive(Debug, Clone, PartialEq)] 19 | struct Markov { 20 | data: HashMap>, 21 | } 22 | 23 | fn extract_sentence(s: String) -> String { 24 | s.split('\t').nth(1).unwrap().into() 25 | } 26 | 27 | fn tokenize_sentence(sentence: String) -> Vec { 28 | let tokens: Vec = sentence.split(|c| { 29 | match c { 30 | '"' | ',' | ';' | ':' => true, 31 | a if a.is_whitespace() => true, 32 | _ => false 33 | } 34 | }).filter(|a| a.len() > 0).map(|a| a.trim().to_owned()).collect(); 35 | tokens 36 | } 37 | 38 | impl Markov { 39 | fn new(url: &'static str) -> Markov { 40 | let mut map: HashMap> = HashMap::new(); 41 | let mut handle = Easy::new(); 42 | let mut data = Vec::new(); 43 | handle.url(url).unwrap(); 44 | { 45 | let mut transfer = handle.transfer(); 46 | transfer.write_function(|new_data| { 47 | data.extend_from_slice(new_data); 48 | Ok(new_data.len()) 49 | }).unwrap(); 50 | transfer.perform().unwrap(); 51 | } 52 | let file = BufReader::new( 53 | GzDecoder::new(Cursor::new(data) 54 | ).unwrap()); 55 | println!("extracting sentences..."); 56 | let sentences = file.lines().map(|a| a.unwrap()).map(extract_sentence).map(tokenize_sentence); 57 | print!("Building map of ngrams..."); 58 | for tokenized in sentences { 59 | let grams = Ngrams::new(tokenized.into_iter(), 2).pad(); 60 | for gram in grams { 61 | let first = gram[0].clone(); 62 | let second = gram[1].clone(); 63 | let entry = map.entry(first).or_insert(HashMap::new()); 64 | let entry = entry.entry(second).or_insert(0); 65 | *entry += 1; 66 | } 67 | } 68 | println!("Done!"); 69 | Markov { 70 | data: map 71 | } 72 | } 73 | 74 | fn random_word(&self, s: 
&str) -> String { 75 | match self.data.get(s) { 76 | Some(h) => { 77 | let mut choices = vec![]; 78 | for (word, count) in h { 79 | choices.push(Weighted { weight: *count, item: word }); 80 | } 81 | let wc = WeightedChoice::new(&mut choices); 82 | let mut rng = rand::thread_rng(); 83 | wc.ind_sample(&mut rng).clone() 84 | }, 85 | None => "whoops...".to_owned() 86 | } 87 | } 88 | 89 | fn sentence_generator(&self) -> SentenceGenerator { 90 | SentenceGenerator { 91 | markov: self.clone(), 92 | state: "\u{2060}".to_owned(), 93 | } 94 | } 95 | } 96 | 97 | struct SentenceGenerator { 98 | markov: Markov, 99 | state: String, 100 | } 101 | 102 | impl Iterator for SentenceGenerator { 103 | type Item = String; 104 | 105 | fn next(&mut self) -> Option { 106 | if self.state.ends_with('.') { 107 | return None; 108 | } 109 | self.state = self.markov.random_word(&self.state); 110 | Some(self.state.clone()) 111 | } 112 | } 113 | 114 | fn main() { 115 | let url = "https://gitlab.com/pwoolcoc/ngrams/raw/master/examples/eng_news_2005_1M-sentences.gz"; 116 | println!("Generating markov chain from input data\n\n\t{}\n\nThis is gonna take a while...", url); 117 | let chain = Markov::new(url); 118 | for _ in 0..10 { 119 | println!("{:?}", chain.sentence_generator().collect::>().join(" ")); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Ngrams 2 | //! 3 | //! Produce n-gram sequences from a sequence of tokens 4 | //! 5 | //! ## Examples 6 | //! 7 | //! ```rust 8 | //! use ngrams::Ngram; 9 | //! 10 | //! let grams: Vec<_> = "foo".chars().ngrams(2).pad().collect(); 11 | //! assert_eq!( 12 | //! grams, 13 | //! vec![ 14 | //! vec!['\u{2060}', 'f'], 15 | //! vec!['f', 'o'], 16 | //! vec!['o', 'o'], 17 | //! vec!['o', '\u{2060}'] 18 | //! ] 19 | //! ); 20 | //! ``` 21 | //! 22 | //! ```rust 23 | //! use ngrams::Ngrams; // notice `Ngram` vs `Ngrams` 24 | //! 25 | //! let iter = "one two three".split(' '); 26 | //! let grams: Vec<_> = Ngrams::new(iter, 3).pad().collect(); 27 | //! assert_eq!( 28 | //! grams, 29 | //! vec![ 30 | //! vec!["\u{2060}", "\u{2060}", "one"], 31 | //! vec!["\u{2060}", "one", "two"], 32 | //! vec!["one", "two", "three"], 33 | //! vec!["two", "three", "\u{2060}"], 34 | //! vec!["three", "\u{2060}", "\u{2060}"], 35 | //! ] 36 | //! ); 37 | //! 
//! ```

#![deny(missing_docs,
        missing_debug_implementations, missing_copy_implementations,
        trivial_casts, trivial_numeric_casts,
        unsafe_code,
        unstable_features,
        unused_import_braces, unused_qualifications)]
#![cfg_attr(feature = "dev", allow(unstable_features))]
#![cfg_attr(feature = "dev", feature(plugin))]
#![cfg_attr(feature = "dev", plugin(clippy))]
#![cfg_attr(feature = "dev", deny(clippy))]

use std::collections::VecDeque;
use std::fmt;

/// Padding token used by the built-in `Pad` implementations: the unicode
/// `WORD JOINER` character.
const WORD_SEP: &str = "\u{2060}";

/// Iterator adaptor, allows you to call the method `.ngrams(n)` on your iterator, as long as the
/// `Item` of the `Iterator` fits the trait bound
///
/// ## Example
///
/// ```rust
/// use ngrams::Ngram;
/// let s: Vec<_> = "hello".chars().ngrams(2).collect();
/// assert_eq!(s, vec![
///     vec!['h', 'e'],
///     vec!['e', 'l'],
///     vec!['l', 'l'],
///     vec!['l', 'o'],
/// ]);
/// ```
pub trait Ngram<'a, T: 'a + Pad + fmt::Debug + Clone>: Iterator<Item = T>
    where Self: Sized
{
    /// Consumes the iterator and returns an `Ngrams` that yields every window of `n`
    /// consecutive tokens. Equivalent to `Ngrams::new(self, n)`.
    fn ngrams(self, n: usize) -> Ngrams<'a, T>;
}

impl<'a, T: 'a + Pad + fmt::Debug + Clone, U: 'a + Iterator<Item = T>> Ngram<'a, T> for U {
    fn ngrams(self, n: usize) -> Ngrams<'a, T> {
        Ngrams::new(self, n)
    }
}

/// Main data type, implements the logic on splitting and grouping n-grams
pub struct Ngrams<'a, T: 'a + Pad + fmt::Debug + Clone> {
    /// Token stream (wrapped in `Padded` once `pad()` has been called).
    source: Box<dyn Iterator<Item = T> + 'a>,
    /// Width of every emitted n-gram (the `n` given to `new`).
    num: usize,
    /// Number of previous tokens that have to be remembered: `n - 1`.
    memsize: usize,
    /// The last `memsize` tokens seen, oldest first.
    memory: VecDeque<T>,
    /// Set by `pad()`; recorded only, nothing reads it back yet.
    pad: bool,
}

impl<'a, T: 'a + Pad + fmt::Debug + Clone> fmt::Debug for Ngrams<'a, T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Ngrams(tokens, N)")
    }
}

impl<'a, T: 'a + Pad + fmt::Debug + Clone + Sized> Ngrams<'a, T> {
    /// The source for the `Ngrams` is expected to be pre-tokenized, this library
    /// does not make any decisions regarding how your input should be tokenized.
    ///
    /// `n` is the width of each n-gram. With `n == 0` the iterator yields nothing,
    /// and so does an unpadded source that holds fewer than `n` tokens.
    pub fn new<V: 'a + Iterator<Item = T>>(source: V, n: usize) -> Ngrams<'a, T> {
        // `saturating_sub` keeps `n == 0` from underflowing.
        let memsize = n.saturating_sub(1);
        Ngrams {
            source: Box::new(source),
            num: n,
            memsize: memsize,
            memory: VecDeque::with_capacity(memsize),
            pad: false,
        }
    }

    /// Include padding at the beginning and end of the input. By default, this crate includes
    /// implementations for some common data structures, that prepends and appends the "WORD_SEP"
    /// unicode character onto the input.
    pub fn pad(mut self) -> Self {
        self.pad = true;
        self.source = Box::new(Padded::new(self.source, self.num));
        self
    }

    /// Pulls tokens from the source until `memsize` of them are buffered.
    ///
    /// Returns `false` when the source runs dry first, i.e. when not even one
    /// complete n-gram can be produced.
    fn fill_memory(&mut self) -> bool {
        while self.memory.len() < self.memsize {
            match self.source.next() {
                Some(token) => self.memory.push_back(token),
                None => return false,
            }
        }
        true
    }
}

impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Ngrams<'a, T> {
    type Item = Vec<T>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.num == 0 || !self.fill_memory() {
            return None;
        }

        let token = self.source.next()?;

        let mut gram = Vec::with_capacity(self.num);
        gram.extend(self.memory.iter().cloned());

        if self.memsize > 0 {
            // Slide the window: forget the oldest token, remember the newest.
            gram.push(token.clone());
            let _ = self.memory.pop_front();
            self.memory.push_back(token);
        } else {
            // Unigrams need no history; buffering here would leak the previous
            // token into the next result.
            gram.push(token);
        }

        Some(gram)
    }
}

/// Implement this so `ngrams` knows how to pad the beginning and end of your input.
///
/// There are default implementations for `&str`, `String`, `Vec<u8>`, and `char`
pub trait Pad {
    /// The item returned from this method will be used to pad the beginning and end of each n-gram
    fn symbol() -> Self;

    /// Specifies how many characters of padding to add. Defaults to N - 1
    /// (and to 0 when N is 0).
    fn len(n: usize) -> usize {
        n.saturating_sub(1)
    }
}

impl<'a> Pad for &'a str {
    fn symbol() -> Self {
        WORD_SEP
    }
}

impl Pad for String {
    fn symbol() -> Self {
        WORD_SEP.to_owned()
    }
}

impl Pad for Vec<u8> {
    fn symbol() -> Self {
        WORD_SEP.to_owned().into()
    }
}

impl Pad for char {
    fn symbol() -> Self {
        WORD_SEP.chars().next().expect("WORD_SEP is a non-empty literal")
    }
}

/// Iterator wrapper that emits `len` copies of the padding symbol before the
/// first and after the last token of `source`.
struct Padded<'a, T: 'a + Pad + fmt::Debug + Clone> {
    source: Box<dyn Iterator<Item = T> + 'a>,
    /// Number of padding symbols per side.
    len: usize,
    symbol: T,
    /// Padding symbols still owed on the side currently being emitted.
    remaining: usize,
    /// Set once `source` has returned `None`.
    end: bool,
}

impl<'a, T: 'a + Pad + fmt::Debug + Clone> Padded<'a, T> {
    /// Wraps an already boxed token stream; `n` is the n-gram width the padding
    /// length is derived from via `Pad::len`.
    fn new(source: Box<dyn Iterator<Item = T> + 'a>, n: usize) -> Padded<'a, T> {
        let l = T::len(n);
        Padded {
            source: source,
            len: l,
            symbol: T::symbol(),
            remaining: l,
            end: false,
        }
    }
}

impl<'a, T: 'a + Pad + fmt::Debug + Clone> Iterator for Padded<'a, T> {
    type Item = T;

    fn next(&mut self) -> Option<Self::Item> {
        if self.remaining > 0 {
            self.remaining -= 1;
            return Some(self.symbol.clone());
        }

        if self.end {
            // Trailing padding is done; do not poll an exhausted source again.
            return None;
        }

        match self.source.next() {
            Some(token) => Some(token),
            None => {
                // First time the source reports exhaustion: switch to the
                // trailing padding and hand out its first symbol right away.
                self.end = true;
                if self.len == 0 {
                    return None;
                }
                self.remaining = self.len - 1;
                Some(self.symbol.clone())
            }
        }
    }
}

#[cfg(test)]
mod tests {

    use super::{Ngram, Ngrams};
    use std::string::ToString;

    #[test]
    fn test_words_iter_adaptor_padded() {
        let result: Vec<_> = "one two three four five".split(' ').ngrams(4).pad().collect();
        assert_eq!(
            result,
            vec![
                vec!["\u{2060}", "\u{2060}", "\u{2060}", "one"],
                vec!["\u{2060}", "\u{2060}", "one", "two"],
                vec!["\u{2060}", "one", "two", "three"],
                vec!["one", "two", "three", "four"],
                vec!["two", "three", "four", "five"],
                vec!["three", "four", "five", "\u{2060}"],
                vec!["four", "five", "\u{2060}", "\u{2060}"],
                vec!["five", "\u{2060}", "\u{2060}", "\u{2060}"],
            ]
        );
    }

    #[test]
    fn test_words_padded() {
        let seq = "one two three four".split(' ');
        let result: Vec<_> = Ngrams::new(seq, 2).pad().collect();
        assert_eq!(result,
                   vec![
                       vec!["\u{2060}", "one"],
                       vec!["one", "two"],
                       vec!["two", "three"],
                       vec!["three", "four"],
                       vec!["four", "\u{2060}"],
                   ]);
    }

    #[test]
    fn test_chars_padded() {
        let seq = "test string".chars().map(|c| c.to_string());
        let result: Vec<_> = Ngrams::new(seq, 4).pad().collect();
        assert_eq!(result,
                   vec![
                       vec!["\u{2060}", "\u{2060}", "\u{2060}", "t"],
                       vec!["\u{2060}", "\u{2060}", "t", "e"],
                       vec!["\u{2060}", "t", "e", "s"],
                       vec!["t", "e", "s", "t"],
                       vec!["e", "s", "t", " "],
                       vec!["s", "t", " ", "s"],
                       vec!["t", " ", "s", "t"],
                       vec![" ", "s", "t", "r"],
                       vec!["s", "t", "r", "i"],
                       vec!["t", "r", "i", "n"],
                       vec!["r", "i", "n", "g"],
                       vec!["i", "n", "g", "\u{2060}"],
                       vec!["n", "g", "\u{2060}", "\u{2060}"],
                       vec!["g", "\u{2060}", "\u{2060}", "\u{2060}"],
                   ]);
    }

    #[test]
    fn test_words_iter_adaptor() {
        let result: Vec<_> = "one two three four five".split(' ').ngrams(4).collect();
        assert_eq!(
            result,
            vec![
                vec!["one", "two", "three", "four"],
                vec!["two", "three", "four", "five"],
            ]
        );
    }

    #[test]
    fn test_words() {
        let seq = "one two three four".split(' ');
        let result: Vec<_> = Ngrams::new(seq, 2).collect();
        assert_eq!(result,
                   vec![
                       vec!["one", "two"],
                       vec!["two", "three"],
                       vec!["three", "four"],
                   ]);
    }

    #[test]
    fn test_chars() {
        let seq = "test string".chars().map(|c| c.to_string());
        let result: Vec<_> = Ngrams::new(seq, 4).collect();
        assert_eq!(result,
                   vec![
                       vec!["t", "e", "s", "t"],
                       vec!["e", "s", "t", " "],
                       vec!["s", "t", " ", "s"],
                       vec!["t", " ", "s", "t"],
                       vec![" ", "s", "t", "r"],
                       vec!["s", "t", "r", "i"],
                       vec!["t", "r", "i", "n"],
                       vec!["r", "i", "n", "g"],
                   ]);
    }

    #[test]
    fn test_unigrams() {
        let result: Vec<_> = "abc".chars().ngrams(1).collect();
        assert_eq!(result, vec![vec!['a'], vec!['b'], vec!['c']]);
    }

    #[test]
    fn test_input_shorter_than_n() {
        let result: Vec<Vec<char>> = "ab".chars().ngrams(4).collect();
        assert!(result.is_empty());
    }

    #[test]
    fn test_zero_n() {
        let result: Vec<Vec<char>> = "abc".chars().ngrams(0).pad().collect();
        assert!(result.is_empty());
    }
}