├── .github └── workflows │ └── rust.yml ├── Cargo.lock ├── Cargo.toml ├── README.md ├── experiments └── rsnltk-experiment │ ├── Cargo.lock │ ├── Cargo.toml │ ├── README.md │ ├── examples │ ├── data │ │ ├── classes.txt │ │ └── dicts │ │ │ ├── 30wdict.txt │ │ │ ├── common_words.txt │ │ │ ├── domain_words │ │ │ ├── THUOCL_IT.txt │ │ │ ├── THUOCL_animal.txt │ │ │ ├── THUOCL_caijing.txt │ │ │ ├── THUOCL_car.txt │ │ │ ├── THUOCL_chengyu.txt │ │ │ ├── THUOCL_diming.txt │ │ │ ├── THUOCL_food.txt │ │ │ ├── THUOCL_law.txt │ │ │ ├── THUOCL_lishimingren.txt │ │ │ ├── THUOCL_medical.txt │ │ │ └── THUOCL_poem.txt │ │ │ └── stopwords │ │ │ ├── baidu_stopwords.txt │ │ │ ├── cn_stopwords.txt │ │ │ ├── hit_stopwords.txt │ │ │ └── scu_stopwords.txt │ ├── how_to_solve_moved_data │ │ ├── test.rs │ │ ├── test1.rs │ │ └── test3.rs │ ├── json_test.rs │ └── test │ │ └── test1.rs │ ├── rsnltk.iml │ ├── src │ ├── api │ │ ├── mod.rs │ │ ├── natural.rs │ │ ├── whatlang.rs │ │ └── yn.rs │ ├── lib.rs │ ├── main.rs │ ├── native │ │ ├── chardata.rs │ │ ├── mod.rs │ │ ├── nlpsvc │ │ │ ├── annotated_document.rs │ │ │ ├── english_rules.rs │ │ │ ├── mod.rs │ │ │ ├── node_label.rs │ │ │ ├── readme.rs │ │ │ ├── regex │ │ │ │ ├── mod.rs │ │ │ │ ├── reinterp.rs │ │ │ │ ├── reparse.rs │ │ │ │ ├── reprog.rs │ │ │ │ ├── reterm.rs │ │ │ │ ├── retrans.rs │ │ │ │ ├── sparse.rs │ │ │ │ └── util.rs │ │ │ ├── regex_tokenizer.rs │ │ │ ├── text_source.rs │ │ │ └── tree_sequence.rs │ │ ├── segmentation.rs │ │ ├── summarizer.rs │ │ ├── text.rs │ │ ├── token.rs │ │ ├── toksiter.rs │ │ └── word2vec.rs │ ├── stanza.rs │ └── wordnet.rs │ └── tests │ ├── 3rdparty_test.rs │ ├── nlpsvc_test.rs │ ├── segmentation_test.rs │ ├── stanza_test.rs │ ├── text_test.rs │ └── wordnet_test.rs ├── rsnltk.iml ├── src ├── api │ ├── mod.rs │ ├── natural.rs │ ├── whatlang.rs │ └── yn.rs ├── lib.rs ├── main.rs ├── native │ ├── chardata.rs │ ├── mod.rs │ ├── segmentation.rs │ ├── summarizer.rs │ ├── token.rs │ ├── toksiter.rs │ └── word2vec.rs ├── stanza.rs └── wordnet.rs └── tests ├── 3rdparty_test.rs ├── native_rust_test.rs ├── segmentation_test.rs ├── stanza_test.rs └── wordnet_test.rs /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Build 20 | run: cargo build --verbose 21 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [[bin]] 2 | name="rsnltk_main" 3 | path="src/main.rs" 4 | 5 | [package] 6 | name = "rsnltk" 7 | version = "0.1.3" 8 | edition = "2021" 9 | description = "Rust-based Natural Language Toolkit" 10 | readme = "README.md" 11 | repository = "https://github.com/dhchenx/rsnltk/" 12 | documentation = "https://docs.rs/crate/rsnltk/latest" 13 | license = "MIT" 14 | keywords = [ "nltk", "Stanza","natural-language","text-analysis","semantics"] 15 | categories = ["text-processing","parsing"] 16 | exclude=[ 17 | "data", 18 | ".idea", 19 | "target", 20 | "examples", 21 | "experiments" 22 | ] 23 | 24 | [lib] 25 | crate-type=["cdylib","rlib"] 26 | name="rsnltk" 27 | path= "src/lib.rs" 28 | 29 | [dependencies] 30 | natural = { version = "0.4.0", features = ["serde_support"]} 31 | serde = "1.0" 32 | whatlang = 
"0.12.0" 33 | yn = "0.1.1" 34 | unicode-segmentation = "1.8.0" 35 | ndarray = "0.15.4" 36 | word2vec = "0.3.3" 37 | 38 | 39 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 40 | 41 | [dependencies.pyo3] 42 | version = "0.15.1" 43 | features = ["auto-initialize"] 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rust-based Natural Language Toolkit (rsnltk) 2 | A Rust library to support natural language processing with pure Rust implementation and Python bindings 3 | 4 | [Rust Docs](https://docs.rs/rsnltk/0.1.1) | [Crates Home Page](https://crates.io/crates/rsnltk) | [Tests](https://github.com/dhchenx/rsnltk/tree/main/tests) | [NER-Kit](https://pypi.org/project/ner-kit/) 5 | 6 | ![example workflow](https://github.com/dhchenx/rsnltk/actions/workflows/rust.yml/badge.svg) 7 | 8 | ## Features 9 | The `rsnltk` library integrates various existing Python-written NLP toolkits for powerful text analysis in Rust-based applications. 10 | 11 | ## Functions 12 | This toolkit is based on the Python-written [Stanza](https://stanfordnlp.github.io/stanza/) and other important NLP crates. 13 | 14 | A list of functions from Stanza and others we bind here include: 15 | - Tokenize 16 | - Sentence Segmentation 17 | - Multi-Word Token Expansion 18 | - Part-of-Speech & Morphological Features 19 | - Named Entity Recognition 20 | - Sentiment Analysis 21 | - Language Identification 22 | - Dependency Tree Analysis 23 | 24 | Some amazing crates are also included in `rsnltk` but with simplified APIs for actual use: 25 | - [word2vec](https://crates.io/crates/word2vec) 26 | - [natural](https://crates.io/crates/natural), [yn](https://crates.io/crates/yn), [whatlang](https://crates.io/crates/whatlang). 27 | 28 | Additionally, we can calculate the similarity between words based on WordNet though the `semantic-kit` PyPI project via `pip install semantic-kit`. 29 | 30 | ## Installation 31 | 32 | 1. Make sure you install Python 3.6.6+ and PIP environment in your computer. Type `python -V` in the Terminal should print no error message; 33 | 34 | 2. Install our Python-based [ner-kit](https://pypi.org/project/ner-kit/) (version>=0.0.5a2) for binding the `Stanza` package via `pip install ner-kit==0.0.5a2`; 35 | 36 | 3. Then, Rust should be also installed in your computer. I use IntelliJ to develop Rust-based applications, where you can write Rust codes; 37 | 38 | 4. Create a simple Rust application project with a `main()` function. 39 | 40 | 5. Add the `rsnltk` dependency to the `Cargo.toml` file, keep up the Latest version. 41 | 42 | 6. After you add the `rsnltk` dependency in the `toml file`, install necessary language models from Stanza using the following Rust code for the first time you use this package. 43 | 44 | ```rust 45 | fn init_rsnltk_and_test(){ 46 | // 1. first install the necessary language models 47 | // using language codes 48 | let list_lang=vec!["en","zh"]; 49 | //e.g. you install two language models, 50 | // namely, for English and Chinese text analysis. 51 | download_langs(list_lang); 52 | // 2. then do test NLP tasks 53 | let text="I like Beijing!"; 54 | let lang="en"; 55 | // 2. 
Uncomment the below codes for Chinese NER 56 | // let text="我喜欢北京、上海和纽约!"; 57 | // let lang="zh"; 58 | let list_ner=ner(text,lang); 59 | for ner in list_ner{ 60 | println!("{:?}",ner); 61 | } 62 | } 63 | ``` 64 | 65 | Or you can manually install those [language models](https://stanfordnlp.github.io/stanza/available_models.html) via the Python-written `ner-kit` package which provides more features in using Stanza. Go to: [ner-kit](https://pypi.org/project/ner-kit/) 66 | 67 | If no error occurs in the above example, then it works. Finally, you can try the following advanced example usage. 68 | 69 | Currently, we tested the use of English and Chinese language models; however, other language models should work as well. 70 | 71 | ## Examples with Stanza Bindings 72 | 73 | Example 1: Part-of-speech Analysis 74 | 75 | ```rust 76 | fn test_pos(){ 77 | //let text="我喜欢北京、上海和纽约!"; 78 | //let lang="zh"; 79 | let text="I like apple"; 80 | let lang="en"; 81 | let list_result=pos(text,lang); 82 | for word in list_result{ 83 | println!("{:?}",word); 84 | } 85 | } 86 | ``` 87 | 88 | Example 2: Sentiment Analysis 89 | ```rust 90 | fn test_sentiment(){ 91 | //let text="I like Beijing!"; 92 | //let lang="en"; 93 | let text="我喜欢北京"; 94 | let lang="zh"; 95 | let sentiments=sentiment(text,lang); 96 | for sen in sentiments{ 97 | println!("{:?}",sen); 98 | } 99 | } 100 | ``` 101 | 102 | Example 3: Named Entity Recognition 103 | 104 | ```rust 105 | fn test_ner(){ 106 | // 1. for English NER 107 | let text="I like Beijing!"; 108 | let lang="en"; 109 | // 2. Uncomment the below codes for Chinese NER 110 | // let text="我喜欢北京、上海和纽约!"; 111 | // let lang="zh"; 112 | let list_ner=ner(text,lang); 113 | for ner in list_ner{ 114 | println!("{:?}",ner); 115 | } 116 | } 117 | ``` 118 | 119 | Example 4: Tokenize for Multiple Languages 120 | 121 | ```rust 122 | fn test_tokenize(){ 123 | let text="我喜欢北京、上海和纽约!"; 124 | let lang="zh"; 125 | let list_result=tokenize(text,lang); 126 | for ner in list_result{ 127 | println!("{:?}",ner); 128 | } 129 | } 130 | ``` 131 | 132 | Example 5: Tokenize Sentence 133 | 134 | ```rust 135 | fn test_tokenize_sentence(){ 136 | let text="I like apple. Do you like it? No, I am not sure!"; 137 | let lang="en"; 138 | let list_sentences=tokenize_sentence(text,lang); 139 | for sentence in list_sentences{ 140 | println!("Sentence: {}",sentence); 141 | } 142 | } 143 | ``` 144 | 145 | Example 6: Language Identification 146 | 147 | ```rust 148 | fn test_lang(){ 149 | let list_text = vec!["I like Beijing!", 150 | "我喜欢北京!", 151 | "Bonjour le monde!"]; 152 | let list_result=lang(list_text); 153 | for lang in list_result{ 154 | println!("{:?}",lang); 155 | } 156 | } 157 | ``` 158 | 159 | Example 7: MWT expand 160 | 161 | ```rust 162 | fn test_mwt_expand(){ 163 | let text="Nous avons atteint la fin du sentier."; 164 | let lang="fr"; 165 | let list_result=mwt_expand(text,lang); 166 | } 167 | ``` 168 | 169 | Example 8: Estimate the similarity between words in WordNet 170 | 171 | You need to firstly install `semantic-kit` PyPI package! 172 | 173 | ```rust 174 | fn test_wordnet_similarity(){ 175 | let s1="dog.n.1"; 176 | let s2="cat.n.2"; 177 | let sims=wordnet_similarity(s1,s2); 178 | for sim in sims{ 179 | println!("{:?}",sim); 180 | } 181 | } 182 | ``` 183 | 184 | Example 9: Obtain a dependency tree from a text 185 | ```rust 186 | fn test_dependency_tree(){ 187 | let text="I like you. 
Do you like me?"; 188 | let lang="en"; 189 | let list_results=dependency_tree(text,lang); 190 | for list_token in list_results{ 191 | for token in list_token{ 192 | println!("{:?}",token) 193 | } 194 | 195 | } 196 | } 197 | ``` 198 | 199 | ## Examples in Pure Rust 200 | 201 | Example 1: Word2Vec similarity 202 | 203 | ```rust 204 | fn test_open_wv_bin(){ 205 | let wv_model=wv_get_model("GoogleNews-vectors-negative300.bin"); 206 | let positive = vec!["woman", "king"]; 207 | let negative = vec!["man"]; 208 | println!("analogy: {:?}", wv_analogy(&wv_model,positive, negative, 10)); 209 | println!("cosine: {:?}", wv_cosine(&wv_model,"man", 10)); 210 | } 211 | ``` 212 | 213 | Example 2: Text summarization 214 | 215 | ```rust 216 | use rsnltk::native::summarizer::*; 217 | fn test_summarize(){ 218 | let text="Some large txt..."; 219 | let stopwords=&[]; 220 | let summarized_text=summarize(text,stopwords,5); 221 | println!("{}",summarized_text); 222 | } 223 | ``` 224 | 225 | Example 3: Get token list from English strings 226 | ```rust 227 | use rsnltk::native::token::get_token_list; 228 | fn test_get_token_list(){ 229 | let s="Hello, Rust. How are you?"; 230 | let result=get_token_list(s); 231 | for r in result{ 232 | println!("{}\t{:?}",r.text,r); 233 | } 234 | } 235 | ``` 236 | 237 | Example 4: Word segmentation for some language where no space exists between terms, e.g. Chinese text. 238 | 239 | We implement three word segmentation methods in this version: 240 | 241 | - Forward Maximum Matching (fmm), which is baseline method 242 | - Backward Maximum Matching (bmm), which is considered better 243 | - Bidirectional Maximum Matching (bimm), high accuracy but low speed 244 | 245 | ```rust 246 | use rsnltk::native::segmentation::*; 247 | fn test_real_word_segmentation(){ 248 | let dict_path="30wdict.txt"; // empty if only for tokenizing 249 | let stop_path="baidu_stopwords.txt";// empty when no stop words 250 | let _sentence="美国太空总署希望,在深海的探险发现将有助于解开一些外太空的秘密,\ 251 | 同时也可以测试前往太阳系其他星球探险所需的一些设备和实验。"; 252 | let meaningful_words=get_segmentation(_sentence,dict_path,stop_path, "bimm"); 253 | // bimm can be changed to fmm or bmm. 254 | println!("Result: {:?}",meaningful_words); 255 | } 256 | ``` 257 | 258 | ## Credits 259 | 260 | Thank [Stanford NLP Group](https://github.com/stanfordnlp/stanza) for their hard work in [Stanza](https://stanfordnlp.github.io/stanza/). 261 | 262 | ## License 263 | The `rsnltk` library with MIT License is provided by [Donghua Chen](https://github.com/dhchenx). 
264 | 265 | 266 | 267 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/Cargo.toml: -------------------------------------------------------------------------------- 1 | [[bin]] 2 | name="rsnltk_main" 3 | path="src/main.rs" 4 | 5 | [package] 6 | name = "rsnltk-experiment" 7 | version = "0.1.1" 8 | edition = "2021" 9 | description = "Rust-based Natural Language Toolkit using Python Bindings" 10 | readme = "README.md" 11 | repository = "https://github.com/dhchenx/rsnltk/" 12 | documentation = "https://docs.rs/crate/rsnltk/latest" 13 | license = "MIT" 14 | keywords = [ "nltk", "Stanza","CoreNLP","text-analysis","semantics"] 15 | categories = ["text-processing","parsing"] 16 | exclude=[ 17 | "data", 18 | ".idea", 19 | "target", 20 | "examples" 21 | ] 22 | publish=false 23 | 24 | [lib] 25 | crate-type=["cdylib","rlib"] 26 | name="rsnltk" 27 | path= "src/lib.rs" 28 | 29 | [dependencies] 30 | natural = { version = "0.4.0", features = ["serde_support"]} 31 | serde = "1.0" 32 | whatlang = "0.12.0" 33 | yn = "0.1.1" 34 | unicode-segmentation = "1.8.0" 35 | ndarray = "0.15.4" 36 | word2vec = "0.3.3" 37 | serde_json = "1.0.78" 38 | indextree = "1.0.1" 39 | getopts = "0.2.21" 40 | 41 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 42 | 43 | [dependencies.pyo3] 44 | version = "0.15.1" 45 | features = ["auto-initialize"] 46 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/README.md: -------------------------------------------------------------------------------- 1 | # Rust-based Natural Language Toolkit (rsnltk) 2 | A Rust library to support natural language processing with Python bindings 3 | 4 | [Rust Docs](https://docs.rs/rsnltk/0.1.1) | [Crates Home Page](https://crates.io/crates/rsnltk) | [Tests](https://github.com/dhchenx/rsnltk/tree/main/tests) | [NER-Kit](https://pypi.org/project/ner-kit/) 5 | 6 | ## Features 7 | The `rsnltk` library integrates various existing Python-written NLP toolkits for powerful text analysis in Rust-based applications. 8 | 9 | ## Functions 10 | This toolkit is based on the Python-written [Stanza](https://stanfordnlp.github.io/stanza/) and other important NLP crates. 11 | 12 | A list of functions from Stanza and others we bind here include: 13 | - Tokenize 14 | - Sentence Segmentation 15 | - Multi-Word Token Expansion 16 | - Part-of-Speech & Morphological Features 17 | - Named Entity Recognition 18 | - Sentiment Analysis 19 | - Language Identification 20 | - Dependency Tree Analysis 21 | 22 | Some amazing crates are also included in `rsnltk` but with simplified APIs for actual use: 23 | - [word2vec](https://crates.io/crates/word2vec) 24 | - [natural](https://crates.io/crates/natural), [yn](https://crates.io/crates/yn), [whatlang](https://crates.io/crates/whatlang). 25 | 26 | Additionally, we can calculate the similarity between words based on WordNet though the `semantic-kit` PyPI project via `pip install semantic-kit`. 27 | 28 | ## Installation 29 | 30 | 1. Make sure you install Python 3.6.6+ and PIP environment in your computer. Type `python -V` in the Terminal should print no error message; 31 | 32 | 2. Install our Python-based [ner-kit](https://pypi.org/project/ner-kit/) (version>=0.0.5a2) for binding the `Stanza` package via `pip install ner-kit==0.0.5a2`; 33 | 34 | 3. Then, Rust should be also installed in your computer. 
I use IntelliJ to develop Rust-based applications, where you can write Rust codes; 35 | 36 | 4. Create a simple Rust application project with a `main()` function. 37 | 38 | 5. Add the `rsnltk` dependency to the `Cargo.toml` file, keep up the Latest version. 39 | 40 | 6. After you add the `rsnltk` dependency in the `toml file`, install necessary language models from Stanza using the following Rust code for the first time you use this package. 41 | 42 | ```rust 43 | fn init_rsnltk_and_test(){ 44 | // 1. first install the necessary language models using language codes 45 | let list_lang=vec!["en","zh"]; 46 | //e.g. you install two language models, 47 | // namely, for English and Chinese text analysis. 48 | download_langs(list_lang); 49 | // 2. then do test NLP tasks 50 | let text="I like Beijing!"; 51 | let lang="en"; 52 | // 2. Uncomment the below codes for Chinese NER 53 | // let text="我喜欢北京、上海和纽约!"; 54 | // let lang="zh"; 55 | let list_ner=ner(text,lang); 56 | for ner in list_ner{ 57 | println!("{:?}",ner); 58 | } 59 | } 60 | ``` 61 | 62 | Or you can manually install those [language models](https://stanfordnlp.github.io/stanza/available_models.html) via the Python-written `ner-kit` package which provides more features in using Stanza. Go to: [ner-kit](https://pypi.org/project/ner-kit/) 63 | 64 | If no error occurs in the above example, then it works. Finally, you can try the following advanced example usage. 65 | 66 | Currently, we tested the use of English and Chinese language models; however, other language models should work as well. 67 | 68 | ## Examples with Stanza Bindings 69 | 70 | Example 1: Part-of-speech Analysis 71 | 72 | ```rust 73 | fn test_pos(){ 74 | //let text="我喜欢北京、上海和纽约!"; 75 | //let lang="zh"; 76 | let text="I like apple"; 77 | let lang="en"; 78 | 79 | let list_result=pos(text,lang); 80 | for word in list_result{ 81 | println!("{:?}",word); 82 | } 83 | } 84 | ``` 85 | 86 | Example 2: Sentiment Analysis 87 | ```rust 88 | fn test_sentiment(){ 89 | //let text="I like Beijing!"; 90 | //let lang="en"; 91 | let text="我喜欢北京"; 92 | let lang="zh"; 93 | 94 | let sentiments=sentiment(text,lang); 95 | for sen in sentiments{ 96 | println!("{:?}",sen); 97 | } 98 | } 99 | ``` 100 | 101 | Example 3: Named Entity Recognition 102 | 103 | ```rust 104 | fn test_ner(){ 105 | // 1. for English NER 106 | let text="I like Beijing!"; 107 | let lang="en"; 108 | // 2. Uncomment the below codes for Chinese NER 109 | // let text="我喜欢北京、上海和纽约!"; 110 | // let lang="zh"; 111 | let list_ner=ner(text,lang); 112 | for ner in list_ner{ 113 | println!("{:?}",ner); 114 | } 115 | } 116 | ``` 117 | 118 | Example 4: Tokenize for Multiple Languages 119 | 120 | ```rust 121 | fn test_tokenize(){ 122 | 123 | let text="我喜欢北京、上海和纽约!"; 124 | let lang="zh"; 125 | 126 | let list_result=tokenize(text,lang); 127 | for ner in list_result{ 128 | println!("{:?}",ner); 129 | } 130 | } 131 | ``` 132 | 133 | Example 5: Tokenize Sentence 134 | 135 | ```rust 136 | fn test_tokenize_sentence(){ 137 | let text="I like apple. Do you like it? 
No, I am not sure!"; 138 | let lang="en"; 139 | let list_sentences=tokenize_sentence(text,lang); 140 | for sentence in list_sentences{ 141 | println!("Sentence: {}",sentence); 142 | } 143 | } 144 | ``` 145 | 146 | Example 6: Language Identification 147 | 148 | ```rust 149 | fn test_lang(){ 150 | let list_text = vec!["I like Beijing!", 151 | "我喜欢北京!", 152 | "Bonjour le monde!"]; 153 | let list_result=lang(list_text); 154 | for lang in list_result{ 155 | println!("{:?}",lang); 156 | } 157 | } 158 | ``` 159 | 160 | Example 7: MWT expand 161 | 162 | ```rust 163 | fn test_mwt_expand(){ 164 | let text="Nous avons atteint la fin du sentier."; 165 | let lang="fr"; 166 | let list_result=mwt_expand(text,lang); 167 | } 168 | ``` 169 | 170 | Example 8: Estimate the similarity between words in WordNet 171 | 172 | You need to firstly install `semantic-kit` PyPI package! 173 | 174 | ```rust 175 | fn test_wordnet_similarity(){ 176 | let s1="dog.n.1"; 177 | let s2="cat.n.2"; 178 | let sims=wordnet_similarity(s1,s2); 179 | for sim in sims{ 180 | println!("{:?}",sim); 181 | } 182 | } 183 | ``` 184 | 185 | Example 9: Obtain a dependency tree from a text 186 | ```rust 187 | fn test_dependency_tree(){ 188 | let text="I like you. Do you like me?"; 189 | let lang="en"; 190 | let list_results=dependency_tree(text,lang); 191 | for list_token in list_results{ 192 | for token in list_token{ 193 | println!("{:?}",token) 194 | } 195 | 196 | } 197 | } 198 | ``` 199 | 200 | ## Examples in Pure Rust 201 | 202 | Example 1: Word2Vec similarity 203 | 204 | ```rust 205 | fn test_open_wv_bin(){ 206 | let wv_model=wv_get_model("GoogleNews-vectors-negative300.bin"); 207 | let positive = vec!["woman", "king"]; 208 | let negative = vec!["man"]; 209 | println!("analogy: {:?}", wv_analogy(&wv_model,positive, negative, 10)); 210 | println!("cosine: {:?}", wv_cosine(&wv_model,"man", 10)); 211 | } 212 | ``` 213 | 214 | Example 2: Text summarization 215 | 216 | ```rust 217 | use rsnltk::native::summarizer::*; 218 | fn test_summarize(){ 219 | let text="Some large txt..."; 220 | let stopwords=&[]; 221 | let summarized_text=summarize(text,stopwords,5); 222 | println!("{}",summarized_text); 223 | } 224 | ``` 225 | 226 | Example 3: Get token list from English strings 227 | ```rust 228 | use rsnltk::native::token::get_token_list; 229 | fn test_get_token_list(){ 230 | let s="Hello, Rust. How are you?"; 231 | let result=get_token_list(s); 232 | for r in result{ 233 | println!("{}\t{:?}",r.text,r); 234 | } 235 | } 236 | ``` 237 | 238 | Example 4: Word segmentation for some language where no space exists between terms, e.g. Chinese text. 239 | 240 | We implement three word segmentation methods in this version: 241 | 242 | - Forward Maximum Matching (fmm), which is baseline method 243 | - Backward Maximum Matching (bmm), which is considered better 244 | - Bidirectional Maximum Matching (bimm), high accuracy but low speed 245 | 246 | ```rust 247 | use rsnltk::native::segmentation::*; 248 | fn test_real_word_segmentation(){ 249 | let dict_path="30wdict.txt"; // empty if only for tokenizing 250 | let stop_path="baidu_stopwords.txt";// empty when no stop words 251 | let _sentence="美国太空总署希望,在深海的探险发现将有助于解开一些外太空的秘密,\ 252 | 同时也可以测试前往太阳系其他星球探险所需的一些设备和实验。"; 253 | let meaningful_words=get_segmentation(_sentence,dict_path,stop_path, "bimm"); 254 | // bimm can be changed to fmm or bmm. 
255 | println!("Result: {:?}",meaningful_words); 256 | } 257 | ``` 258 | 259 | ## Credits 260 | 261 | Thank [Stanford NLP Group](https://github.com/stanfordnlp/stanza) for their hard work in [Stanza](https://stanfordnlp.github.io/stanza/). 262 | 263 | ## License 264 | MIT 265 | 266 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/data/classes.txt: -------------------------------------------------------------------------------- 1 | Over time, I've written a few mdbooks/gitbooks using Rust to explain interesting concepts or try to take a deep dive into some of Rust's inner workings. Common to all of them is that they relied a lot on unstable features, most notably the "original" asm! macro (later renamed to llvm_asm!). 2 | 3 | Due to frequent changes to these features, several examples didn't compile or work as they did when the books were written, and it's been a problem for quite a long time. 4 | 5 | I just wanted to let everyone know that the books have now been updated with the new asm! syntax, the dialect changed from AT&T to Intel (which is the new default) and all examples are now working in addition to other minor improvements. -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/data/dicts/stopwords/cn_stopwords.txt: -------------------------------------------------------------------------------- 1 | $ 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | ? 13 | _ 14 | “ 15 | ” 16 | 、 17 | 。 18 | 《 19 | 》 20 | 一 21 | 一些 22 | 一何 23 | 一切 24 | 一则 25 | 一方面 26 | 一旦 27 | 一来 28 | 一样 29 | 一般 30 | 一转眼 31 | 万一 32 | 上 33 | 上下 34 | 下 35 | 不 36 | 不仅 37 | 不但 38 | 不光 39 | 不单 40 | 不只 41 | 不外乎 42 | 不如 43 | 不妨 44 | 不尽 45 | 不尽然 46 | 不得 47 | 不怕 48 | 不惟 49 | 不成 50 | 不拘 51 | 不料 52 | 不是 53 | 不比 54 | 不然 55 | 不特 56 | 不独 57 | 不管 58 | 不至于 59 | 不若 60 | 不论 61 | 不过 62 | 不问 63 | 与 64 | 与其 65 | 与其说 66 | 与否 67 | 与此同时 68 | 且 69 | 且不说 70 | 且说 71 | 两者 72 | 个 73 | 个别 74 | 临 75 | 为 76 | 为了 77 | 为什么 78 | 为何 79 | 为止 80 | 为此 81 | 为着 82 | 乃 83 | 乃至 84 | 乃至于 85 | 么 86 | 之 87 | 之一 88 | 之所以 89 | 之类 90 | 乌乎 91 | 乎 92 | 乘 93 | 也 94 | 也好 95 | 也罢 96 | 了 97 | 二来 98 | 于 99 | 于是 100 | 于是乎 101 | 云云 102 | 云尔 103 | 些 104 | 亦 105 | 人 106 | 人们 107 | 人家 108 | 什么 109 | 什么样 110 | 今 111 | 介于 112 | 仍 113 | 仍旧 114 | 从 115 | 从此 116 | 从而 117 | 他 118 | 他人 119 | 他们 120 | 以 121 | 以上 122 | 以为 123 | 以便 124 | 以免 125 | 以及 126 | 以故 127 | 以期 128 | 以来 129 | 以至 130 | 以至于 131 | 以致 132 | 们 133 | 任 134 | 任何 135 | 任凭 136 | 似的 137 | 但 138 | 但凡 139 | 但是 140 | 何 141 | 何以 142 | 何况 143 | 何处 144 | 何时 145 | 余外 146 | 作为 147 | 你 148 | 你们 149 | 使 150 | 使得 151 | 例如 152 | 依 153 | 依据 154 | 依照 155 | 便于 156 | 俺 157 | 俺们 158 | 倘 159 | 倘使 160 | 倘或 161 | 倘然 162 | 倘若 163 | 借 164 | 假使 165 | 假如 166 | 假若 167 | 傥然 168 | 像 169 | 儿 170 | 先不先 171 | 光是 172 | 全体 173 | 全部 174 | 兮 175 | 关于 176 | 其 177 | 其一 178 | 其中 179 | 其二 180 | 其他 181 | 其余 182 | 其它 183 | 其次 184 | 具体地说 185 | 具体说来 186 | 兼之 187 | 内 188 | 再 189 | 再其次 190 | 再则 191 | 再有 192 | 再者 193 | 再者说 194 | 再说 195 | 冒 196 | 冲 197 | 况且 198 | 几 199 | 几时 200 | 凡 201 | 凡是 202 | 凭 203 | 凭借 204 | 出于 205 | 出来 206 | 分别 207 | 则 208 | 则甚 209 | 别 210 | 别人 211 | 别处 212 | 别是 213 | 别的 214 | 别管 215 | 别说 216 | 到 217 | 前后 218 | 前此 219 | 前者 220 | 加之 221 | 加以 222 | 即 223 | 即令 224 | 即使 225 | 即便 226 | 即如 227 | 即或 228 | 即若 229 | 却 230 | 去 231 | 又 232 | 又及 233 | 及 234 | 及其 235 | 及至 236 | 反之 237 | 反而 238 | 反过来 239 | 反过来说 240 | 受到 241 | 另 242 | 另一方面 243 | 另外 244 | 另悉 245 | 只 246 | 只当 247 | 只怕 248 | 只是 249 | 只有 
250 | 只消 251 | 只要 252 | 只限 253 | 叫 254 | 叮咚 255 | 可 256 | 可以 257 | 可是 258 | 可见 259 | 各 260 | 各个 261 | 各位 262 | 各种 263 | 各自 264 | 同 265 | 同时 266 | 后 267 | 后者 268 | 向 269 | 向使 270 | 向着 271 | 吓 272 | 吗 273 | 否则 274 | 吧 275 | 吧哒 276 | 吱 277 | 呀 278 | 呃 279 | 呕 280 | 呗 281 | 呜 282 | 呜呼 283 | 呢 284 | 呵 285 | 呵呵 286 | 呸 287 | 呼哧 288 | 咋 289 | 和 290 | 咚 291 | 咦 292 | 咧 293 | 咱 294 | 咱们 295 | 咳 296 | 哇 297 | 哈 298 | 哈哈 299 | 哉 300 | 哎 301 | 哎呀 302 | 哎哟 303 | 哗 304 | 哟 305 | 哦 306 | 哩 307 | 哪 308 | 哪个 309 | 哪些 310 | 哪儿 311 | 哪天 312 | 哪年 313 | 哪怕 314 | 哪样 315 | 哪边 316 | 哪里 317 | 哼 318 | 哼唷 319 | 唉 320 | 唯有 321 | 啊 322 | 啐 323 | 啥 324 | 啦 325 | 啪达 326 | 啷当 327 | 喂 328 | 喏 329 | 喔唷 330 | 喽 331 | 嗡 332 | 嗡嗡 333 | 嗬 334 | 嗯 335 | 嗳 336 | 嘎 337 | 嘎登 338 | 嘘 339 | 嘛 340 | 嘻 341 | 嘿 342 | 嘿嘿 343 | 因 344 | 因为 345 | 因了 346 | 因此 347 | 因着 348 | 因而 349 | 固然 350 | 在 351 | 在下 352 | 在于 353 | 地 354 | 基于 355 | 处在 356 | 多 357 | 多么 358 | 多少 359 | 大 360 | 大家 361 | 她 362 | 她们 363 | 好 364 | 如 365 | 如上 366 | 如上所述 367 | 如下 368 | 如何 369 | 如其 370 | 如同 371 | 如是 372 | 如果 373 | 如此 374 | 如若 375 | 始而 376 | 孰料 377 | 孰知 378 | 宁 379 | 宁可 380 | 宁愿 381 | 宁肯 382 | 它 383 | 它们 384 | 对 385 | 对于 386 | 对待 387 | 对方 388 | 对比 389 | 将 390 | 小 391 | 尔 392 | 尔后 393 | 尔尔 394 | 尚且 395 | 就 396 | 就是 397 | 就是了 398 | 就是说 399 | 就算 400 | 就要 401 | 尽 402 | 尽管 403 | 尽管如此 404 | 岂但 405 | 己 406 | 已 407 | 已矣 408 | 巴 409 | 巴巴 410 | 并 411 | 并且 412 | 并非 413 | 庶乎 414 | 庶几 415 | 开外 416 | 开始 417 | 归 418 | 归齐 419 | 当 420 | 当地 421 | 当然 422 | 当着 423 | 彼 424 | 彼时 425 | 彼此 426 | 往 427 | 待 428 | 很 429 | 得 430 | 得了 431 | 怎 432 | 怎么 433 | 怎么办 434 | 怎么样 435 | 怎奈 436 | 怎样 437 | 总之 438 | 总的来看 439 | 总的来说 440 | 总的说来 441 | 总而言之 442 | 恰恰相反 443 | 您 444 | 惟其 445 | 慢说 446 | 我 447 | 我们 448 | 或 449 | 或则 450 | 或是 451 | 或曰 452 | 或者 453 | 截至 454 | 所 455 | 所以 456 | 所在 457 | 所幸 458 | 所有 459 | 才 460 | 才能 461 | 打 462 | 打从 463 | 把 464 | 抑或 465 | 拿 466 | 按 467 | 按照 468 | 换句话说 469 | 换言之 470 | 据 471 | 据此 472 | 接着 473 | 故 474 | 故此 475 | 故而 476 | 旁人 477 | 无 478 | 无宁 479 | 无论 480 | 既 481 | 既往 482 | 既是 483 | 既然 484 | 时候 485 | 是 486 | 是以 487 | 是的 488 | 曾 489 | 替 490 | 替代 491 | 最 492 | 有 493 | 有些 494 | 有关 495 | 有及 496 | 有时 497 | 有的 498 | 望 499 | 朝 500 | 朝着 501 | 本 502 | 本人 503 | 本地 504 | 本着 505 | 本身 506 | 来 507 | 来着 508 | 来自 509 | 来说 510 | 极了 511 | 果然 512 | 果真 513 | 某 514 | 某个 515 | 某些 516 | 某某 517 | 根据 518 | 欤 519 | 正值 520 | 正如 521 | 正巧 522 | 正是 523 | 此 524 | 此地 525 | 此处 526 | 此外 527 | 此时 528 | 此次 529 | 此间 530 | 毋宁 531 | 每 532 | 每当 533 | 比 534 | 比及 535 | 比如 536 | 比方 537 | 没奈何 538 | 沿 539 | 沿着 540 | 漫说 541 | 焉 542 | 然则 543 | 然后 544 | 然而 545 | 照 546 | 照着 547 | 犹且 548 | 犹自 549 | 甚且 550 | 甚么 551 | 甚或 552 | 甚而 553 | 甚至 554 | 甚至于 555 | 用 556 | 用来 557 | 由 558 | 由于 559 | 由是 560 | 由此 561 | 由此可见 562 | 的 563 | 的确 564 | 的话 565 | 直到 566 | 相对而言 567 | 省得 568 | 看 569 | 眨眼 570 | 着 571 | 着呢 572 | 矣 573 | 矣乎 574 | 矣哉 575 | 离 576 | 竟而 577 | 第 578 | 等 579 | 等到 580 | 等等 581 | 简言之 582 | 管 583 | 类如 584 | 紧接着 585 | 纵 586 | 纵令 587 | 纵使 588 | 纵然 589 | 经 590 | 经过 591 | 结果 592 | 给 593 | 继之 594 | 继后 595 | 继而 596 | 综上所述 597 | 罢了 598 | 者 599 | 而 600 | 而且 601 | 而况 602 | 而后 603 | 而外 604 | 而已 605 | 而是 606 | 而言 607 | 能 608 | 能否 609 | 腾 610 | 自 611 | 自个儿 612 | 自从 613 | 自各儿 614 | 自后 615 | 自家 616 | 自己 617 | 自打 618 | 自身 619 | 至 620 | 至于 621 | 至今 622 | 至若 623 | 致 624 | 般的 625 | 若 626 | 若夫 627 | 若是 628 | 若果 629 | 若非 630 | 莫不然 631 | 莫如 632 | 莫若 633 | 虽 634 | 虽则 635 | 虽然 636 | 虽说 637 | 被 638 | 要 639 | 要不 640 | 要不是 641 | 要不然 642 | 要么 643 | 要是 644 | 譬喻 645 | 譬如 646 | 让 647 | 许多 648 | 论 649 | 设使 650 | 设或 651 | 设若 652 | 诚如 653 | 诚然 654 | 该 655 | 说来 
656 | 诸 657 | 诸位 658 | 诸如 659 | 谁 660 | 谁人 661 | 谁料 662 | 谁知 663 | 贼死 664 | 赖以 665 | 赶 666 | 起 667 | 起见 668 | 趁 669 | 趁着 670 | 越是 671 | 距 672 | 跟 673 | 较 674 | 较之 675 | 边 676 | 过 677 | 还 678 | 还是 679 | 还有 680 | 还要 681 | 这 682 | 这一来 683 | 这个 684 | 这么 685 | 这么些 686 | 这么样 687 | 这么点儿 688 | 这些 689 | 这会儿 690 | 这儿 691 | 这就是说 692 | 这时 693 | 这样 694 | 这次 695 | 这般 696 | 这边 697 | 这里 698 | 进而 699 | 连 700 | 连同 701 | 逐步 702 | 通过 703 | 遵循 704 | 遵照 705 | 那 706 | 那个 707 | 那么 708 | 那么些 709 | 那么样 710 | 那些 711 | 那会儿 712 | 那儿 713 | 那时 714 | 那样 715 | 那般 716 | 那边 717 | 那里 718 | 都 719 | 鄙人 720 | 鉴于 721 | 针对 722 | 阿 723 | 除 724 | 除了 725 | 除外 726 | 除开 727 | 除此之外 728 | 除非 729 | 随 730 | 随后 731 | 随时 732 | 随着 733 | 难道说 734 | 非但 735 | 非徒 736 | 非特 737 | 非独 738 | 靠 739 | 顺 740 | 顺着 741 | 首先 742 | ! 743 | , 744 | : 745 | ; 746 | ? 747 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/data/dicts/stopwords/hit_stopwords.txt: -------------------------------------------------------------------------------- 1 | ——— 2 | 》), 3 | )÷(1- 4 | ”, 5 | )、 6 | =( 7 | : 8 | → 9 | ℃ 10 | & 11 | * 12 | 一一 13 | ~~~~ 14 | ’ 15 | . 16 | 『 17 | .一 18 | ./ 19 | -- 20 | 』 21 | =″ 22 | 【 23 | [*] 24 | }> 25 | [⑤]] 26 | [①D] 27 | c] 28 | ng昉 29 | * 30 | // 31 | [ 32 | ] 33 | [②e] 34 | [②g] 35 | ={ 36 | } 37 | ,也 38 | ‘ 39 | A 40 | [①⑥] 41 | [②B] 42 | [①a] 43 | [④a] 44 | [①③] 45 | [③h] 46 | ③] 47 | 1. 48 | -- 49 | [②b] 50 | ’‘ 51 | ××× 52 | [①⑧] 53 | 0:2 54 | =[ 55 | [⑤b] 56 | [②c] 57 | [④b] 58 | [②③] 59 | [③a] 60 | [④c] 61 | [①⑤] 62 | [①⑦] 63 | [①g] 64 | ∈[ 65 | [①⑨] 66 | [①④] 67 | [①c] 68 | [②f] 69 | [②⑧] 70 | [②①] 71 | [①C] 72 | [③c] 73 | [③g] 74 | [②⑤] 75 | [②②] 76 | 一. 77 | [①h] 78 | .数 79 | [] 80 | [①B] 81 | 数/ 82 | [①i] 83 | [③e] 84 | [①①] 85 | [④d] 86 | [④e] 87 | [③b] 88 | [⑤a] 89 | [①A] 90 | [②⑧] 91 | [②⑦] 92 | [①d] 93 | [②j] 94 | 〕〔 95 | ][ 96 | :// 97 | ′∈ 98 | [②④ 99 | [⑤e] 100 | 12% 101 | b] 102 | ... 103 | ................... 104 | …………………………………………………③ 105 | ZXFITL 106 | [③F] 107 | 」 108 | [①o] 109 | ]∧′=[ 110 | ∪φ∈ 111 | ′| 112 | {- 113 | ②c 114 | } 115 | [③①] 116 | R.L. 117 | [①E] 118 | Ψ 119 | -[*]- 120 | ↑ 121 | .日 122 | [②d] 123 | [② 124 | [②⑦] 125 | [②②] 126 | [③e] 127 | [①i] 128 | [①B] 129 | [①h] 130 | [①d] 131 | [①g] 132 | [①②] 133 | [②a] 134 | f] 135 | [⑩] 136 | a] 137 | [①e] 138 | [②h] 139 | [②⑥] 140 | [③d] 141 | [②⑩] 142 | e] 143 | 〉 144 | 】 145 | 元/吨 146 | [②⑩] 147 | 2.3% 148 | 5:0 149 | [①] 150 | :: 151 | [②] 152 | [③] 153 | [④] 154 | [⑤] 155 | [⑥] 156 | [⑦] 157 | [⑧] 158 | [⑨] 159 | …… 160 | —— 161 | ? 162 | 、 163 | 。 164 | “ 165 | ” 166 | 《 167 | 》 168 | ! 169 | , 170 | : 171 | ; 172 | ? 173 | . 174 | , 175 | . 176 | ' 177 | ? 178 | · 179 | ——— 180 | ── 181 | ? 182 | — 183 | < 184 | > 185 | ( 186 | ) 187 | 〔 188 | 〕 189 | [ 190 | ] 191 | ( 192 | ) 193 | - 194 | + 195 | ~ 196 | × 197 | / 198 | / 199 | ① 200 | ② 201 | ③ 202 | ④ 203 | ⑤ 204 | ⑥ 205 | ⑦ 206 | ⑧ 207 | ⑨ 208 | ⑩ 209 | Ⅲ 210 | В 211 | " 212 | ; 213 | # 214 | @ 215 | γ 216 | μ 217 | φ 218 | φ. 219 | × 220 | Δ 221 | ■ 222 | ▲ 223 | sub 224 | exp 225 | sup 226 | sub 227 | Lex 228 | # 229 | % 230 | & 231 | ' 232 | + 233 | +ξ 234 | ++ 235 | - 236 | -β 237 | < 238 | <± 239 | <Δ 240 | <λ 241 | <φ 242 | << 243 | = 244 | = 245 | =☆ 246 | =- 247 | > 248 | >λ 249 | _ 250 | ~± 251 | ~+ 252 | [⑤f] 253 | [⑤d] 254 | [②i] 255 | ≈ 256 | [②G] 257 | [①f] 258 | LI 259 | ㈧ 260 | [- 261 | ...... 
262 | 〉 263 | [③⑩] 264 | 第二 265 | 一番 266 | 一直 267 | 一个 268 | 一些 269 | 许多 270 | 种 271 | 有的是 272 | 也就是说 273 | 末##末 274 | 啊 275 | 阿 276 | 哎 277 | 哎呀 278 | 哎哟 279 | 唉 280 | 俺 281 | 俺们 282 | 按 283 | 按照 284 | 吧 285 | 吧哒 286 | 把 287 | 罢了 288 | 被 289 | 本 290 | 本着 291 | 比 292 | 比方 293 | 比如 294 | 鄙人 295 | 彼 296 | 彼此 297 | 边 298 | 别 299 | 别的 300 | 别说 301 | 并 302 | 并且 303 | 不比 304 | 不成 305 | 不单 306 | 不但 307 | 不独 308 | 不管 309 | 不光 310 | 不过 311 | 不仅 312 | 不拘 313 | 不论 314 | 不怕 315 | 不然 316 | 不如 317 | 不特 318 | 不惟 319 | 不问 320 | 不只 321 | 朝 322 | 朝着 323 | 趁 324 | 趁着 325 | 乘 326 | 冲 327 | 除 328 | 除此之外 329 | 除非 330 | 除了 331 | 此 332 | 此间 333 | 此外 334 | 从 335 | 从而 336 | 打 337 | 待 338 | 但 339 | 但是 340 | 当 341 | 当着 342 | 到 343 | 得 344 | 的 345 | 的话 346 | 等 347 | 等等 348 | 地 349 | 第 350 | 叮咚 351 | 对 352 | 对于 353 | 多 354 | 多少 355 | 而 356 | 而况 357 | 而且 358 | 而是 359 | 而外 360 | 而言 361 | 而已 362 | 尔后 363 | 反过来 364 | 反过来说 365 | 反之 366 | 非但 367 | 非徒 368 | 否则 369 | 嘎 370 | 嘎登 371 | 该 372 | 赶 373 | 个 374 | 各 375 | 各个 376 | 各位 377 | 各种 378 | 各自 379 | 给 380 | 根据 381 | 跟 382 | 故 383 | 故此 384 | 固然 385 | 关于 386 | 管 387 | 归 388 | 果然 389 | 果真 390 | 过 391 | 哈 392 | 哈哈 393 | 呵 394 | 和 395 | 何 396 | 何处 397 | 何况 398 | 何时 399 | 嘿 400 | 哼 401 | 哼唷 402 | 呼哧 403 | 乎 404 | 哗 405 | 还是 406 | 还有 407 | 换句话说 408 | 换言之 409 | 或 410 | 或是 411 | 或者 412 | 极了 413 | 及 414 | 及其 415 | 及至 416 | 即 417 | 即便 418 | 即或 419 | 即令 420 | 即若 421 | 即使 422 | 几 423 | 几时 424 | 己 425 | 既 426 | 既然 427 | 既是 428 | 继而 429 | 加之 430 | 假如 431 | 假若 432 | 假使 433 | 鉴于 434 | 将 435 | 较 436 | 较之 437 | 叫 438 | 接着 439 | 结果 440 | 借 441 | 紧接着 442 | 进而 443 | 尽 444 | 尽管 445 | 经 446 | 经过 447 | 就 448 | 就是 449 | 就是说 450 | 据 451 | 具体地说 452 | 具体说来 453 | 开始 454 | 开外 455 | 靠 456 | 咳 457 | 可 458 | 可见 459 | 可是 460 | 可以 461 | 况且 462 | 啦 463 | 来 464 | 来着 465 | 离 466 | 例如 467 | 哩 468 | 连 469 | 连同 470 | 两者 471 | 了 472 | 临 473 | 另 474 | 另外 475 | 另一方面 476 | 论 477 | 嘛 478 | 吗 479 | 慢说 480 | 漫说 481 | 冒 482 | 么 483 | 每 484 | 每当 485 | 们 486 | 莫若 487 | 某 488 | 某个 489 | 某些 490 | 拿 491 | 哪 492 | 哪边 493 | 哪儿 494 | 哪个 495 | 哪里 496 | 哪年 497 | 哪怕 498 | 哪天 499 | 哪些 500 | 哪样 501 | 那 502 | 那边 503 | 那儿 504 | 那个 505 | 那会儿 506 | 那里 507 | 那么 508 | 那么些 509 | 那么样 510 | 那时 511 | 那些 512 | 那样 513 | 乃 514 | 乃至 515 | 呢 516 | 能 517 | 你 518 | 你们 519 | 您 520 | 宁 521 | 宁可 522 | 宁肯 523 | 宁愿 524 | 哦 525 | 呕 526 | 啪达 527 | 旁人 528 | 呸 529 | 凭 530 | 凭借 531 | 其 532 | 其次 533 | 其二 534 | 其他 535 | 其它 536 | 其一 537 | 其余 538 | 其中 539 | 起 540 | 起见 541 | 起见 542 | 岂但 543 | 恰恰相反 544 | 前后 545 | 前者 546 | 且 547 | 然而 548 | 然后 549 | 然则 550 | 让 551 | 人家 552 | 任 553 | 任何 554 | 任凭 555 | 如 556 | 如此 557 | 如果 558 | 如何 559 | 如其 560 | 如若 561 | 如上所述 562 | 若 563 | 若非 564 | 若是 565 | 啥 566 | 上下 567 | 尚且 568 | 设若 569 | 设使 570 | 甚而 571 | 甚么 572 | 甚至 573 | 省得 574 | 时候 575 | 什么 576 | 什么样 577 | 使得 578 | 是 579 | 是的 580 | 首先 581 | 谁 582 | 谁知 583 | 顺 584 | 顺着 585 | 似的 586 | 虽 587 | 虽然 588 | 虽说 589 | 虽则 590 | 随 591 | 随着 592 | 所 593 | 所以 594 | 他 595 | 他们 596 | 他人 597 | 它 598 | 它们 599 | 她 600 | 她们 601 | 倘 602 | 倘或 603 | 倘然 604 | 倘若 605 | 倘使 606 | 腾 607 | 替 608 | 通过 609 | 同 610 | 同时 611 | 哇 612 | 万一 613 | 往 614 | 望 615 | 为 616 | 为何 617 | 为了 618 | 为什么 619 | 为着 620 | 喂 621 | 嗡嗡 622 | 我 623 | 我们 624 | 呜 625 | 呜呼 626 | 乌乎 627 | 无论 628 | 无宁 629 | 毋宁 630 | 嘻 631 | 吓 632 | 相对而言 633 | 像 634 | 向 635 | 向着 636 | 嘘 637 | 呀 638 | 焉 639 | 沿 640 | 沿着 641 | 要 642 | 要不 643 | 要不然 644 | 要不是 645 | 要么 646 | 要是 647 | 也 648 | 也罢 649 | 也好 650 | 一 651 | 一般 652 | 一旦 653 | 一方面 654 | 一来 655 | 一切 656 | 一样 657 | 一则 658 | 依 659 | 依照 660 | 矣 661 | 以 662 | 以便 663 | 以及 664 | 以免 665 | 以至 666 | 以至于 667 | 以致 668 | 
抑或 669 | 因 670 | 因此 671 | 因而 672 | 因为 673 | 哟 674 | 用 675 | 由 676 | 由此可见 677 | 由于 678 | 有 679 | 有的 680 | 有关 681 | 有些 682 | 又 683 | 于 684 | 于是 685 | 于是乎 686 | 与 687 | 与此同时 688 | 与否 689 | 与其 690 | 越是 691 | 云云 692 | 哉 693 | 再说 694 | 再者 695 | 在 696 | 在下 697 | 咱 698 | 咱们 699 | 则 700 | 怎 701 | 怎么 702 | 怎么办 703 | 怎么样 704 | 怎样 705 | 咋 706 | 照 707 | 照着 708 | 者 709 | 这 710 | 这边 711 | 这儿 712 | 这个 713 | 这会儿 714 | 这就是说 715 | 这里 716 | 这么 717 | 这么点儿 718 | 这么些 719 | 这么样 720 | 这时 721 | 这些 722 | 这样 723 | 正如 724 | 吱 725 | 之 726 | 之类 727 | 之所以 728 | 之一 729 | 只是 730 | 只限 731 | 只要 732 | 只有 733 | 至 734 | 至于 735 | 诸位 736 | 着 737 | 着呢 738 | 自 739 | 自从 740 | 自个儿 741 | 自各儿 742 | 自己 743 | 自家 744 | 自身 745 | 综上所述 746 | 总的来看 747 | 总的来说 748 | 总的说来 749 | 总而言之 750 | 总之 751 | 纵 752 | 纵令 753 | 纵然 754 | 纵使 755 | 遵照 756 | 作为 757 | 兮 758 | 呃 759 | 呗 760 | 咚 761 | 咦 762 | 喏 763 | 啐 764 | 喔唷 765 | 嗬 766 | 嗯 767 | 嗳 768 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/data/dicts/stopwords/scu_stopwords.txt: -------------------------------------------------------------------------------- 1 | 打开天窗说亮话 2 | 到目前为止 3 | 赶早不赶晚 4 | 常言说得好 5 | 何乐而不为 6 | 毫无保留地 7 | 由此可见 8 | 这就是说 9 | 这么点儿 10 | 综上所述 11 | 总的来看 12 | 总的来说 13 | 总的说来 14 | 总而言之 15 | 相对而言 16 | 除此之外 17 | 反过来说 18 | 恰恰相反 19 | 如上所述 20 | 换句话说 21 | 具体地说 22 | 具体说来 23 | 另一方面 24 | 与此同时 25 | 一则通过 26 | 毫无例外 27 | 不然的话 28 | 从此以后 29 | 从古到今 30 | 从古至今 31 | 从今以后 32 | 大张旗鼓 33 | 从无到有 34 | 从早到晚 35 | 弹指之间 36 | 不亦乐乎 37 | 不知不觉 38 | 不止一次 39 | 不择手段 40 | 不可开交 41 | 不可抗拒 42 | 不仅仅是 43 | 不管怎样 44 | 挨家挨户 45 | 长此下去 46 | 长话短说 47 | 除此而外 48 | 除此以外 49 | 除此之外 50 | 得天独厚 51 | 川流不息 52 | 长期以来 53 | 挨门挨户 54 | 挨门逐户 55 | 多多少少 56 | 多多益善 57 | 二话不说 58 | 更进一步 59 | 二话没说 60 | 分期分批 61 | 风雨无阻 62 | 归根到底 63 | 归根结底 64 | 反之亦然 65 | 大面儿上 66 | 倒不如说 67 | 成年累月 68 | 换句话说 69 | 或多或少 70 | 简而言之 71 | 接连不断 72 | 尽如人意 73 | 尽心竭力 74 | 尽心尽力 75 | 尽管如此 76 | 据我所知 77 | 具体地说 78 | 具体来说 79 | 具体说来 80 | 近几年来 81 | 每时每刻 82 | 屡次三番 83 | 三番两次 84 | 三番五次 85 | 三天两头 86 | 另一方面 87 | 老老实实 88 | 年复一年 89 | 恰恰相反 90 | 顷刻之间 91 | 穷年累月 92 | 千万千万 93 | 日复一日 94 | 如此等等 95 | 如前所述 96 | 如上所述 97 | 一方面 98 | 切不可 99 | 顷刻间 100 | 全身心 101 | 另方面 102 | 另一个 103 | 猛然间 104 | 默默地 105 | 就是说 106 | 近年来 107 | 尽可能 108 | 接下来 109 | 简言之 110 | 急匆匆 111 | 即是说 112 | 基本上 113 | 换言之 114 | 充其极 115 | 充其量 116 | 暗地里 117 | 反之则 118 | 比如说 119 | 背地里 120 | 背靠背 121 | 并没有 122 | 不得不 123 | 不得了 124 | 不得已 125 | 不仅仅 126 | 不经意 127 | 不能不 128 | 不外乎 129 | 不由得 130 | 不怎么 131 | 不至于 132 | 策略地 133 | 差不多 134 | 常言道 135 | 常言说 136 | 多年来 137 | 多年前 138 | 差一点 139 | 敞开儿 140 | 抽冷子 141 | 大不了 142 | 反倒是 143 | 反过来 144 | 大体上 145 | 当口儿 146 | 倒不如 147 | 怪不得 148 | 动不动 149 | 看起来 150 | 看上去 151 | 看样子 152 | 够瞧的 153 | 到了儿 154 | 呆呆地 155 | 来不及 156 | 来得及 157 | 到头来 158 | 连日来 159 | 于是乎 160 | 为什么 161 | 这会儿 162 | 换言之 163 | 那会儿 164 | 那么些 165 | 那么样 166 | 什么样 167 | 反过来 168 | 紧接着 169 | 就是说 170 | 要不然 171 | 要不是 172 | 一方面 173 | 以至于 174 | 自个儿 175 | 自各儿 176 | 之所以 177 | 这么些 178 | 这么样 179 | 怎么办 180 | 怎么样 181 | 谁知 182 | 顺着 183 | 似的 184 | 虽然 185 | 虽说 186 | 虽则 187 | 随着 188 | 所以 189 | 他们 190 | 他人 191 | 它们 192 | 她们 193 | 倘或 194 | 倘然 195 | 倘若 196 | 倘使 197 | 要么 198 | 要是 199 | 也罢 200 | 也好 201 | 以便 202 | 依照 203 | 以及 204 | 以免 205 | 以至 206 | 以致 207 | 抑或 208 | 因此 209 | 因而 210 | 因为 211 | 由于 212 | 有的 213 | 有关 214 | 有些 215 | 于是 216 | 与否 217 | 与其 218 | 越是 219 | 云云 220 | 一般 221 | 一旦 222 | 一来 223 | 一切 224 | 一样 225 | 同时 226 | 万一 227 | 为何 228 | 为了 229 | 为着 230 | 嗡嗡 231 | 我们 232 | 呜呼 233 | 乌乎 234 | 无论 235 | 无宁 236 | 沿着 237 | 毋宁 238 | 向着 239 | 照着 240 | 怎么 241 | 咱们 242 | 在下 243 | 再说 244 | 再者 245 | 怎样 246 | 这边 247 | 这儿 248 | 这个 249 | 这里 
250 | 这么 251 | 这时 252 | 这些 253 | 这样 254 | 正如 255 | 之类 256 | 之一 257 | 只是 258 | 只限 259 | 只要 260 | 只有 261 | 至于 262 | 诸位 263 | 着呢 264 | 纵令 265 | 纵然 266 | 纵使 267 | 遵照 268 | 作为 269 | 喔唷 270 | 自从 271 | 自己 272 | 自家 273 | 自身 274 | 总之 275 | 要不 276 | 哎呀 277 | 哎哟 278 | 俺们 279 | 按照 280 | 吧哒 281 | 罢了 282 | 本着 283 | 比方 284 | 比如 285 | 鄙人 286 | 彼此 287 | 别的 288 | 别说 289 | 并且 290 | 不比 291 | 不成 292 | 不单 293 | 不但 294 | 不独 295 | 不管 296 | 不光 297 | 不过 298 | 不仅 299 | 不拘 300 | 不论 301 | 不怕 302 | 不然 303 | 不如 304 | 不特 305 | 不惟 306 | 不问 307 | 不只 308 | 朝着 309 | 趁着 310 | 除非 311 | 除了 312 | 此间 313 | 此外 314 | 从而 315 | 但是 316 | 当着 317 | 的话 318 | 等等 319 | 叮咚 320 | 对于 321 | 多少 322 | 而况 323 | 而且 324 | 而是 325 | 而外 326 | 而言 327 | 而已 328 | 尔后 329 | 反之 330 | 非但 331 | 非徒 332 | 否则 333 | 嘎登 334 | 各个 335 | 各位 336 | 各种 337 | 各自 338 | 根据 339 | 故此 340 | 固然 341 | 关于 342 | 果然 343 | 果真 344 | 哈哈 345 | 何处 346 | 何况 347 | 何时 348 | 哼唷 349 | 呼哧 350 | 还是 351 | 还有 352 | 或是 353 | 或者 354 | 极了 355 | 及其 356 | 及至 357 | 即便 358 | 即或 359 | 即令 360 | 即若 361 | 即使 362 | 既然 363 | 既是 364 | 继而 365 | 加之 366 | 假如 367 | 假若 368 | 假使 369 | 鉴于 370 | 几时 371 | 较之 372 | 接着 373 | 结果 374 | 进而 375 | 尽管 376 | 经过 377 | 就是 378 | 可见 379 | 可是 380 | 可以 381 | 况且 382 | 开始 383 | 开外 384 | 来着 385 | 例如 386 | 连同 387 | 两者 388 | 另外 389 | 慢说 390 | 漫说 391 | 每当 392 | 莫若 393 | 某个 394 | 某些 395 | 哪边 396 | 哪儿 397 | 哪个 398 | 哪里 399 | 哪年 400 | 哪怕 401 | 哪天 402 | 哪些 403 | 哪样 404 | 那边 405 | 那儿 406 | 那个 407 | 那里 408 | 那么 409 | 那时 410 | 那些 411 | 那样 412 | 乃至 413 | 宁可 414 | 宁肯 415 | 宁愿 416 | 你们 417 | 啪达 418 | 旁人 419 | 凭借 420 | 其次 421 | 其二 422 | 其他 423 | 其它 424 | 其一 425 | 其余 426 | 其中 427 | 起见 428 | 起见 429 | 岂但 430 | 前后 431 | 前者 432 | 然而 433 | 然后 434 | 然则 435 | 人家 436 | 任何 437 | 任凭 438 | 如此 439 | 如果 440 | 如何 441 | 如其 442 | 如若 443 | 若非 444 | 若是 445 | 上下 446 | 尚且 447 | 设若 448 | 设使 449 | 甚而 450 | 甚么 451 | 甚至 452 | 省得 453 | 时候 454 | 什么 455 | 使得 456 | 是的 457 | 首先 458 | 首先 459 | 其次 460 | 再次 461 | 最后 462 | 您们 463 | 它们 464 | 她们 465 | 他们 466 | 我们 467 | 你是 468 | 您是 469 | 我是 470 | 他是 471 | 她是 472 | 它是 473 | 不是 474 | 你们 475 | 啊哈 476 | 啊呀 477 | 啊哟 478 | 挨次 479 | 挨个 480 | 挨着 481 | 哎呀 482 | 哎哟 483 | 俺们 484 | 按理 485 | 按期 486 | 默然 487 | 按时 488 | 按说 489 | 按照 490 | 暗中 491 | 暗自 492 | 昂然 493 | 八成 494 | 倍感 495 | 倍加 496 | 本人 497 | 本身 498 | 本着 499 | 并非 500 | 别人 501 | 必定 502 | 比起 503 | 比如 504 | 比照 505 | 鄙人 506 | 毕竟 507 | 必将 508 | 必须 509 | 并肩 510 | 并没 511 | 并排 512 | 并且 513 | 并无 514 | 勃然 515 | 不必 516 | 不常 517 | 不大 518 | 不单 519 | 不但 520 | 而且 521 | 不得 522 | 不迭 523 | 不定 524 | 不独 525 | 不对 526 | 不妨 527 | 不管 528 | 不光 529 | 不过 530 | 不会 531 | 不仅 532 | 不拘 533 | 不力 534 | 不了 535 | 不料 536 | 不论 537 | 不满 538 | 不免 539 | 不起 540 | 不巧 541 | 不然 542 | 不日 543 | 不少 544 | 不胜 545 | 不时 546 | 不是 547 | 不同 548 | 不能 549 | 不要 550 | 不外 551 | 不下 552 | 不限 553 | 不消 554 | 不已 555 | 不再 556 | 不曾 557 | 不止 558 | 不只 559 | 才能 560 | 彻夜 561 | 趁便 562 | 趁机 563 | 趁热 564 | 趁势 565 | 趁早 566 | 趁着 567 | 成心 568 | 乘机 569 | 乘势 570 | 乘隙 571 | 乘虚 572 | 诚然 573 | 迟早 574 | 充分 575 | 出来 576 | 出去 577 | 除此 578 | 除非 579 | 除开 580 | 除了 581 | 除去 582 | 除却 583 | 除外 584 | 处处 585 | 传说 586 | 传闻 587 | 纯粹 588 | 此后 589 | 此间 590 | 此外 591 | 此中 592 | 次第 593 | 匆匆 594 | 从不 595 | 从此 596 | 从而 597 | 从宽 598 | 从来 599 | 从轻 600 | 从速 601 | 从头 602 | 从未 603 | 从小 604 | 从新 605 | 从严 606 | 从优 607 | 从中 608 | 从重 609 | 凑巧 610 | 存心 611 | 达旦 612 | 打从 613 | 大大 614 | 大抵 615 | 大都 616 | 大多 617 | 大凡 618 | 大概 619 | 大家 620 | 大举 621 | 大略 622 | 大约 623 | 大致 624 | 待到 625 | 单纯 626 | 单单 627 | 但是 628 | 但愿 629 | 当场 630 | 当儿 631 | 当即 632 | 当然 633 | 当庭 634 | 当头 635 | 当下 636 | 当真 637 | 当中 638 | 当着 639 | 倒是 640 | 到处 641 | 到底 642 | 到头 643 | 得起 644 | 的话 
645 | 的确 646 | 等到 647 | 等等 648 | 顶多 649 | 动辄 650 | 陡然 651 | 独自 652 | 断然 653 | 对于 654 | 顿时 655 | 多次 656 | 多多 657 | 多亏 658 | 而后 659 | 而论 660 | 而且 661 | 而是 662 | 而外 663 | 而言 664 | 而已 665 | 而又 666 | 尔等 667 | 反倒 668 | 反而 669 | 反手 670 | 反之 671 | 方才 672 | 方能 673 | 非常 674 | 非但 675 | 非得 676 | 分头 677 | 奋勇 678 | 愤然 679 | 更为 680 | 更加 681 | 根据 682 | 个人 683 | 各式 684 | 刚才 685 | 敢情 686 | 该当 687 | 嘎嘎 688 | 否则 689 | 赶快 690 | 敢于 691 | 刚好 692 | 刚巧 693 | 高低 694 | 格外 695 | 隔日 696 | 隔夜 697 | 公然 698 | 过于 699 | 果然 700 | 果真 701 | 光是 702 | 关于 703 | 共总 704 | 姑且 705 | 故此 706 | 故而 707 | 故意 708 | 固然 709 | 惯常 710 | 毫不 711 | 毫无 712 | 很多 713 | 何须 714 | 好在 715 | 何必 716 | 何尝 717 | 何妨 718 | 何苦 719 | 何况 720 | 何止 721 | 很少 722 | 轰然 723 | 后来 724 | 呼啦 725 | 哗啦 726 | 互相 727 | 忽地 728 | 忽然 729 | 话说 730 | 或是 731 | 伙同 732 | 豁然 733 | 恍然 734 | 还是 735 | 或许 736 | 或者 737 | 基本 738 | 基于 739 | 极大 740 | 极度 741 | 极端 742 | 极力 743 | 极其 744 | 极为 745 | 即便 746 | 即将 747 | 及其 748 | 及至 749 | 即刻 750 | 即令 751 | 即使 752 | 几度 753 | 几番 754 | 几乎 755 | 几经 756 | 既然 757 | 继而 758 | 继之 759 | 加上 760 | 加以 761 | 加之 762 | 假如 763 | 假若 764 | 假使 765 | 间或 766 | 将才 767 | 简直 768 | 鉴于 769 | 将近 770 | 将要 771 | 交口 772 | 较比 773 | 较为 774 | 较之 775 | 皆可 776 | 截然 777 | 截至 778 | 藉以 779 | 借此 780 | 借以 781 | 届时 782 | 尽快 783 | 近来 784 | 进而 785 | 进来 786 | 进去 787 | 尽管 788 | 尽量 789 | 尽然 790 | 就算 791 | 居然 792 | 就此 793 | 就地 794 | 竟然 795 | 究竟 796 | 经常 797 | 尽早 798 | 精光 799 | 经过 800 | 就是 801 | 局外 802 | 举凡 803 | 据称 804 | 据此 805 | 据实 806 | 据说 807 | 可好 808 | 看来 809 | 开外 810 | 绝不 811 | 决不 812 | 据悉 813 | 决非 814 | 绝顶 815 | 绝对 816 | 绝非 817 | 可见 818 | 可能 819 | 可是 820 | 可以 821 | 恐怕 822 | 来讲 823 | 来看 824 | 快要 825 | 况且 826 | 拦腰 827 | 牢牢 828 | 老是 829 | 累次 830 | 累年 831 | 理当 832 | 理该 833 | 理应 834 | 例如 835 | 立地 836 | 立刻 837 | 立马 838 | 立时 839 | 联袂 840 | 连连 841 | 连日 842 | 路经 843 | 临到 844 | 连声 845 | 连同 846 | 连袂 847 | 另外 848 | 另行 849 | 屡次 850 | 屡屡 851 | 缕缕 852 | 率尔 853 | 率然 854 | 略加 855 | 略微 856 | 略为 857 | 论说 858 | 马上 859 | 猛然 860 | 没有 861 | 每当 862 | 每逢 863 | 每每 864 | 莫不 865 | 莫非 866 | 莫如 867 | 莫若 868 | 哪怕 869 | 那么 870 | 那末 871 | 那些 872 | 乃至 873 | 难道 874 | 难得 875 | 难怪 876 | 难说 877 | 你们 878 | 凝神 879 | 宁可 880 | 宁肯 881 | 宁愿 882 | 偶而 883 | 偶尔 884 | 碰巧 885 | 譬如 886 | 偏偏 887 | 平素 888 | 迫于 889 | 扑通 890 | 其次 891 | 其后 892 | 其实 893 | 其它 894 | 起初 895 | 起来 896 | 起首 897 | 起头 898 | 起先 899 | 岂但 900 | 岂非 901 | 岂止 902 | 恰逢 903 | 恰好 904 | 恰恰 905 | 恰巧 906 | 恰如 907 | 恰似 908 | 前后 909 | 前者 910 | 切莫 911 | 切切 912 | 切勿 913 | 亲口 914 | 亲身 915 | 亲手 916 | 亲眼 917 | 亲自 918 | 顷刻 919 | 请勿 920 | 取道 921 | 权时 922 | 全都 923 | 全力 924 | 全年 925 | 全然 926 | 然而 927 | 然后 928 | 人家 929 | 人人 930 | 仍旧 931 | 仍然 932 | 日见 933 | 日渐 934 | 日益 935 | 日臻 936 | 如常 937 | 如次 938 | 如果 939 | 如今 940 | 如期 941 | 如若 942 | 如上 943 | 如下 944 | 上来 945 | 上去 946 | 瑟瑟 947 | 沙沙 948 | 啊 949 | 哎 950 | 唉 951 | 俺 952 | 按 953 | 吧 954 | 把 955 | 甭 956 | 别 957 | 嘿 958 | 很 959 | 乎 960 | 会 961 | 或 962 | 既 963 | 及 964 | 啦 965 | 了 966 | 们 967 | 你 968 | 您 969 | 哦 970 | 砰 971 | 啊 972 | 你 973 | 我 974 | 他 975 | 她 976 | 它 977 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/how_to_solve_moved_data/test.rs: -------------------------------------------------------------------------------- 1 | # https://www.cnblogs.com/dhcn/p/12152116.html 2 | 3 | use std::rc::Rc; 4 | use std::cell::RefCell; 5 | 6 | struct Info{ 7 | s:String 8 | } 9 | 10 | impl Info{ 11 | fn new(a: &str)->Info{ 12 | Info{ 13 | s:a.to_string(), 14 | } 15 | } 16 | } 17 | 18 | fn abc(a:Rc>){ 19 | a.borrow_mut().s=="bbbb".to_string(); 20 | } 21 | 22 | 23 | fn main(){ 24 | 
let bar=Rc::new(RefCell::new(Info::new("abc"))); 25 | println!("1: {}",bar.borrow().s); 26 | abc(bar.clone()); 27 | println!("2: {}",bar.borrow().s); 28 | abc(bar.clone()); 29 | println!("3: {}",bar.borrow().s); 30 | } 31 | 32 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/how_to_solve_moved_data/test1.rs: -------------------------------------------------------------------------------- 1 | 2 | struct Info{ 3 | pub s:String 4 | } 5 | 6 | impl Info{ 7 | fn fn_a(&mut self){ 8 | self.s+="1"; 9 | } 10 | } 11 | 12 | fn main(){ 13 | 14 | let mut foo=Info{s:"Hello".to_string()}; 15 | println!("1: {}",foo.s); 16 | foo.fn_a(); 17 | println!("2: {}",foo.s); 18 | foo.fn_a(); 19 | println!("3: {}",foo.s); 20 | 21 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/how_to_solve_moved_data/test3.rs: -------------------------------------------------------------------------------- 1 | 2 | 3 | struct Info{ 4 | s:i32 5 | } 6 | 7 | impl Info{ 8 | fn new(a:i32)->Info{ 9 | Info{ 10 | s:a, 11 | } 12 | } 13 | } 14 | 15 | impl Clone for Info{ 16 | fn clone(&self)->Info{ 17 | Info{s:self.s} 18 | } 19 | } 20 | 21 | fn abc(a:Info)->Info{ 22 | Info{s:a.s+1} 23 | } 24 | 25 | fn main(){ 26 | let mut foo=Info::new(111); 27 | println!("1: {}", foo.s); 28 | abc(foo.clone()); 29 | println!("2: {}",foo.s); 30 | abc(foo.clone()); 31 | println!("3: {}",foo.s); 32 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/json_test.rs: -------------------------------------------------------------------------------- 1 | #[macro_use] 2 | extern crate serde_json; 3 | 4 | fn main() { 5 | let capitals = json!({ 6 | "Cook Islands": "Avarua", 7 | "Fiji": "Suva", 8 | "Kiribati": "South Tarawa", 9 | "Niue": "Alofi", 10 | "Tonga": "Nuku'alofa", 11 | "Tuvalu": "Funafuti" 12 | }); 13 | 14 | println!("Capital of Tonga is: {}", capitals["Tonga"]) 15 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/examples/test/test1.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::*; 3 | 4 | fn main() { 5 | let things = vec!["Apple", "Banana", "Dog"]; 6 | let animals = vec![]; 7 | for thing in things { 8 | if thing == "Dog" { 9 | animals.push(thing); 10 | } 11 | } 12 | println!("{:?} ", animals); 13 | } 14 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/rsnltk.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/api/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod natural; 2 | pub mod whatlang; 3 | pub mod yn; -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/api/natural.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests{ 3 | use crate::api::natural::*; 4 | # [test] 5 | fn test_distance(){ 6 | println!("lev = {}",lev_dist("kitten", "sitting")); 7 | println!("winkler = {}",jw_dist("dixon", "dicksonx")); 8 | } 9 | 10 | # [test] 11 | fn test_soundx(){ 12 | 
println!("{}",is_soundex("robert","rupert")); 13 | } 14 | 15 | # [test] 16 | fn test_tokenize(){ 17 | let str1="hello, world!"; 18 | let srtr="My dog has fleas."; 19 | println!("{:?}",tokenize(str1)); 20 | 21 | } 22 | 23 | # [test] 24 | fn test_ngrams(){ 25 | // no padding 26 | let str1="hello my darling"; 27 | let results=get_ngram(str1,2); 28 | for l in results{ 29 | println!("{:?}",l); 30 | } 31 | println!(); 32 | // with padding 33 | let results=get_ngram_with_padding(str1,2,"---"); 34 | for l in results{ 35 | println!("{:?}",l); 36 | } 37 | } 38 | 39 | # [test] 40 | fn test_classification(){ 41 | let mut list_str=Vec::new(); 42 | list_str.push("Hello World"); 43 | list_str.push("Hello Chen's World"); 44 | list_str.push("World is Amazing"); 45 | let mut list_label=Vec::new(); 46 | list_label.push("a"); 47 | list_label.push("b"); 48 | list_label.push("c"); 49 | println!("guess = {}",nb_guess(list_str,list_label,"Hello")); 50 | } 51 | 52 | # [test] 53 | fn test_tf_idf(){ 54 | let mut list_str=Vec::new(); 55 | list_str.push("this document is about rust."); 56 | list_str.push("this document is about erlang."); 57 | list_str.push("this document is about erlang and rust."); 58 | list_str.push("this document is about rust. it has rust examples"); 59 | 60 | println!("tf-idf value = {}",get_tf_idf(list_str,"rust")); 61 | 62 | } 63 | 64 | } 65 | 66 | extern crate natural; 67 | use natural::distance::jaro_winkler_distance; 68 | use natural::distance::levenshtein_distance; 69 | use natural::phonetics::soundex; 70 | use natural::classifier::NaiveBayesClassifier; 71 | use natural::tf_idf::TfIdf; 72 | 73 | pub fn lev_dist(str1:&str,str2:&str)->usize{ 74 | return levenshtein_distance(str1, str2); 75 | } 76 | 77 | pub fn jw_dist(str1:&str,str2:&str)->f32{ 78 | return jaro_winkler_distance(str1, str2); 79 | } 80 | 81 | pub fn is_soundex(str1:&str,str2:&str)->bool{ 82 | let result=soundex(str1,str2); 83 | return result; 84 | } 85 | 86 | pub fn tokenize(str:&str)->Vec<&str>{ 87 | natural::tokenize::tokenize(&str) 88 | } 89 | 90 | 91 | pub fn get_ngram(str:&str,n:usize)->Vec>{ 92 | natural::ngram::get_ngram(str, n) 93 | } 94 | 95 | pub fn get_ngram_with_padding<'a>(str:&'a str,n:usize,padding:&'a str)->Vec>{ 96 | let result = natural::ngram::get_ngram_with_padding(str, n,padding); 97 | result 98 | } 99 | 100 | pub fn nb_guess(train_strs:Vec<&str>,labels:Vec<&str>,str_guess:&str)->String{ 101 | 102 | 103 | let mut nbc = NaiveBayesClassifier::new(); 104 | let mut idx=0; 105 | for train_str in train_strs{ 106 | nbc.train(train_str, labels[idx]); 107 | idx+=1; 108 | } 109 | 110 | nbc.guess(str_guess) //returns a label with the highest probability 111 | 112 | } 113 | 114 | pub fn get_tf_idf(strs:Vec<&str>,s:&str)->f32{ 115 | 116 | let mut tf_idf=TfIdf::new(); 117 | 118 | 119 | for str in strs{ 120 | tf_idf.add(str); 121 | } 122 | 123 | tf_idf.get(s) //0.21859923 124 | } 125 | 126 | 127 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/api/whatlang.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use crate::api::whatlang::*; 4 | # [test] 5 | fn test_whatlang(){ 6 | let text = "Ĉu vi ne volas eklerni Esperanton? Bonvolu! 
Estas unu de la plej bonaj aferoj!"; 7 | let ret=whatlang(text); 8 | println!("{:?}",ret); 9 | } 10 | 11 | } 12 | 13 | use std::collections::HashMap; 14 | 15 | 16 | extern crate whatlang; 17 | use whatlang::detect; 18 | pub fn whatlang(str:&str) ->HashMap<String,String>{ 19 | 20 | 21 | let info = detect(str).unwrap(); 22 | let mut result:HashMap<String,String>=HashMap::new(); 23 | result.insert(String::from("lang"),info.lang().to_string()); 24 | result.insert(String::from("script"),info.script().to_string()); 25 | result.insert(String::from("confidence"),info.confidence().to_string()); 26 | result.insert(String::from("is_reliable"),info.is_reliable().to_string()); 27 | 28 | result 29 | 30 | } 31 | 32 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/api/yn.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use crate::api::yn::*; 4 | # [test] 5 | fn test_yes(){ 6 | let s="yes"; 7 | println!("{:?}",yes(s)); 8 | 9 | println!("{:?}",is_somewhat_yes("this has a y so it is the word")); 10 | 11 | println!("{:?}",is_kinda_yes("very much so")); 12 | } 13 | 14 | } 15 | extern crate yn; 16 | 17 | pub fn yes(str:&str)->bool{ 18 | return yn::yes(str); 19 | } 20 | pub fn is_somewhat_yes(str:&str)->bool{ 21 | return yn::is_somewhat_yes(str); 22 | } 23 | 24 | pub fn is_kinda_yes(str:&str)->bool{ 25 | yn::is_kinda_yes(str) 26 | } 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate core; 2 | 3 | pub mod wordnet; 4 | pub mod stanza; 5 | pub mod api; 6 | pub mod native; 7 | 8 | pub use stanza::*; 9 | 10 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/main.rs: -------------------------------------------------------------------------------- 1 | 2 | fn main(){ 3 | println!("Hello, rsnltk!"); 4 | 5 | 6 | use rsnltk::native::text::similar_with_english; 7 | let text="I like you!"; 8 | println!("{}",similar_with_english(text)) 9 | //let meaningful_words=get_segmentation(); 10 | 11 | //println!("Result: {:?}",meaningful_words); 12 | 13 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/chardata.rs: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////// 2 | // CharDataIter and friends 3 | // 4 | // Probably CharDataIter could be replaced by a clever 5 | // call to map() on the underlying char iterator... 
6 | // 7 | pub static END_OF_STRING: char = '\0'; 8 | 9 | #[derive(Debug)] 10 | pub struct CharData { 11 | pub ch: char, 12 | pub byte_offset: usize, 13 | pub char_offset: usize, 14 | } 15 | 16 | pub struct CharDataIter<'a> { 17 | char_stream: &'a mut dyn Iterator, 18 | byte_offset: usize, 19 | char_offset: usize, 20 | really_done: bool, 21 | } 22 | 23 | impl<'a> CharDataIter<'a> { 24 | pub fn new(chs: &'a mut dyn Iterator) -> Self { 25 | CharDataIter { 26 | char_stream: chs, 27 | byte_offset: 0, 28 | char_offset: 0, 29 | really_done: false, 30 | } 31 | } 32 | } 33 | 34 | impl<'a> Iterator for CharDataIter<'a> { 35 | type Item = CharData; 36 | 37 | fn next(&mut self) -> Option { 38 | match self.char_stream.next() { 39 | Some(c) => { 40 | let result = CharData { 41 | ch: c, 42 | byte_offset: self.byte_offset, 43 | char_offset: self.char_offset, 44 | }; 45 | self.char_offset += 1; 46 | self.byte_offset += c.len_utf8(); 47 | Some(result) 48 | }, 49 | None => { 50 | if self.really_done { 51 | None 52 | } else { 53 | // Special marker 54 | self.really_done = true; 55 | Some ( 56 | CharData { 57 | ch: END_OF_STRING, // should be ignored! 58 | byte_offset: self.byte_offset, 59 | char_offset: self.char_offset, 60 | } 61 | ) 62 | } 63 | } 64 | } 65 | } 66 | } 67 | // 68 | // CharDataIter 69 | ///////////////////////////////////////////////////////// -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod summarizer; 2 | pub mod word2vec; 3 | pub mod token; 4 | pub mod segmentation; 5 | mod chardata; 6 | mod toksiter; 7 | pub mod text; 8 | pub mod nlpsvc; 9 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/annotated_document.rs: -------------------------------------------------------------------------------- 1 | extern crate indextree; 2 | 3 | use std::collections::HashMap; 4 | 5 | use crate::native::nlpsvc::*; 6 | 7 | 8 | pub use node_label::NodeLabel; 9 | pub use tree_sequence::TreeSequence; 10 | pub use tree_sequence::TreeCursor; 11 | pub use tree_sequence::CursorMemo; 12 | 13 | 14 | #[cfg(test)] 15 | mod tests { 16 | 17 | use crate::native::nlpsvc::annotated_document::*; 18 | 19 | fn print_label(cursor: &TreeCursor, doc: &AnnotatedDocument) { 20 | let label = cursor.get().unwrap(); 21 | let span = label.get_span().unwrap(); 22 | println!("({:>02}, {:>02}) [{}]", span.0, span.1, 23 | &doc.get_text()[span.0..span.1]); 24 | } 25 | 26 | #[test] 27 | fn push_tokens_and_traverse() { 28 | // Fake tokenizer 29 | let mut doc = AnnotatedDocument::new("01 Hello!"); 30 | let mut lbl0 = NodeLabel::new(); 31 | lbl0.set_span(0, 2) 32 | .set_sym_val("toktype", "NUMBER"); 33 | doc.get_trees_mut().push_back(lbl0); 34 | let mut lbl1 = NodeLabel::new(); 35 | lbl1.set_span(3, 8) 36 | .set_sym_val("toktype", "WORD"); 37 | doc.get_trees_mut().push_back(lbl1); 38 | let mut lbl2 = NodeLabel::new(); 39 | lbl2.set_span(8, 9) 40 | .set_sym_val("toktype", "PUNCT"); 41 | doc.get_trees_mut().push_back(lbl2); 42 | 43 | // Traverse (and print) 44 | let mut cursor = doc.get_trees().first(); 45 | while cursor.is_valid() { 46 | print_label(&cursor, &doc); 47 | cursor.next(); 48 | } 49 | } 50 | 51 | #[test] 52 | fn test_chunking() { 53 | let txt = "aa bb cc dd ee ff"; 54 | let mut doc = AnnotatedDocument::new(txt); 55 | for (i, _) in txt.split_whitespace().enumerate() { 56 | let b = i * 
3; 57 | let e = b + 2; 58 | let mut lbl = NodeLabel::new(); 59 | lbl.set_span(b, e) 60 | .set_sym_val("toktype", "WORD"); 61 | doc.get_trees_mut().push_back(lbl); 62 | } 63 | 64 | { 65 | println!("===================="); 66 | let mut cursor = doc.get_trees().first(); 67 | while cursor.is_valid() { 68 | print_label(&cursor, &doc); 69 | cursor.next(); 70 | } 71 | println!("===================="); 72 | } 73 | 74 | let (first_child, last_child) = fake_parse(&doc); 75 | let mut label = NodeLabel::new(); 76 | label.set_sym_val("cat", "cc_ee"); 77 | doc.get_trees_mut().chunk(label, first_child, last_child); 78 | doc.get_trees().print(); 79 | } 80 | 81 | fn fake_parse(doc: &AnnotatedDocument) -> (CursorMemo, CursorMemo) { 82 | let mut cursor = doc.get_trees().first(); // reset cursor 83 | cursor.next(); 84 | cursor.next(); 85 | // cursor should now be sitting on [cc] 86 | print_label(&cursor, &doc); 87 | let first_child = cursor.to_memo(); 88 | cursor.next(); 89 | cursor.next(); 90 | cursor.next(); 91 | // cursor should now be sitting on [ff] 92 | let last_child = cursor.to_memo(); 93 | (first_child, last_child) 94 | } 95 | 96 | } 97 | 98 | 99 | 100 | pub struct AnnotatedDocument { 101 | doc_string: String, 102 | tree_sequence: TreeSequence, 103 | } 104 | 105 | impl AnnotatedDocument { 106 | 107 | pub fn new(text: &str) -> AnnotatedDocument { 108 | AnnotatedDocument { 109 | doc_string: String::from(text), 110 | tree_sequence: TreeSequence::new(), 111 | } 112 | } 113 | pub fn get_text(&self) -> &str { 114 | &self.doc_string 115 | } 116 | pub fn get_trees_mut(&mut self) -> &mut TreeSequence { 117 | &mut self.tree_sequence 118 | } 119 | pub fn get_trees(&self) -> &TreeSequence { 120 | &self.tree_sequence 121 | } 122 | } 123 | 124 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/english_rules.rs: -------------------------------------------------------------------------------- 1 | /// english_rules.rs 2 | /// 3 | /// `EnglishTokenizer` wraps a ThompsonInterpreter around a set of regex 4 | /// patterns for ordinary English token types. It also implements the 5 | /// `RegexTokenizer` trait, which in turn requires it to implement 6 | /// the `TokenReactor` and `TokenRecognizer` traits. 
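// Added for illustration (a sketch mirroring the test in nlpsvc/mod.rs): the
// tokenizer is driven through the RegexTokenizer trait, which appends
// WORD/NUMBER/PUNCT nodes to the document's tree sequence.
#[allow(dead_code)]
fn english_tokenizer_example() {
    let mut tokenizer = EnglishTokenizer::new();   // compiles the three regex rules below
    let mut doc = AnnotatedDocument::new("It costs 3.50, really!");
    tokenizer.apply_to(&mut doc);                  // provided by the RegexTokenizer trait
    doc.get_trees().print();                       // dump the resulting token sequence
}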
7 | 8 | use crate::native::nlpsvc::regex::reinterp::ThompsonInterpreter; 9 | use crate::native::nlpsvc::regex::reinterp::TokenRecognizer; 10 | use crate::native::nlpsvc::regex::reinterp::MatchRecord; 11 | use crate::native::nlpsvc::regex_tokenizer::TokenReactor; 12 | use crate::native::nlpsvc::regex_tokenizer::ThompsonProgramBuilder; 13 | use crate::native::nlpsvc::regex_tokenizer::RegexTokenizer; 14 | use crate::native::nlpsvc::annotated_document::*; 15 | 16 | pub struct EnglishTokenizer { 17 | matcher: ThompsonInterpreter, 18 | } 19 | 20 | impl EnglishTokenizer { 21 | 22 | pub fn new() -> EnglishTokenizer { 23 | let english_patterns = ThompsonProgramBuilder::new() 24 | .add_rule(r"(?i)[a-z]+") // [0] words 25 | .add_rule(r"[0-9,.]*[0-9]+") // [1] numbers 26 | .add_rule(r"[.,?!]") // [2] punctuation 27 | .build(); 28 | EnglishTokenizer { 29 | matcher: ThompsonInterpreter::new(english_patterns), 30 | } 31 | } 32 | 33 | fn word_action(&mut self, _begin: usize, _end: usize, token: &mut NodeLabel) { 34 | //println!("WORD [{}] at {}", &doc.get_text()[begin..end], begin); 35 | token.set_sym_val("toktype", "WORD"); 36 | } 37 | 38 | fn num_action(&mut self, _begin:usize, _end: usize, token: &mut NodeLabel) { 39 | //println!("NUMBER [{}] at {}", &doc.get_text()[begin..end], begin); 40 | token.set_sym_val("toktype", "NUMBER"); 41 | } 42 | 43 | fn punct_action(&mut self, _begin: usize, _end: usize, token: &mut NodeLabel) { 44 | //println!("PUNCT [{}] at {}", &doc.get_text()[begin..end], begin); 45 | token.set_sym_val("toktype", "PUNCT"); 46 | } 47 | } 48 | 49 | impl TokenRecognizer for EnglishTokenizer { 50 | fn next_token(&mut self, text: &str, pos: usize) -> Option { 51 | self.matcher.next_token(text, pos) 52 | } 53 | } 54 | 55 | 56 | impl TokenReactor for EnglishTokenizer { 57 | /// Append a token 58 | /// 59 | /// Append a token starting at `begin` with text `text`, that 60 | /// matched rule #`rule_id`. 61 | fn append(&mut self, 62 | begin: usize, 63 | end:usize, 64 | rule_id: usize, 65 | doc: &mut AnnotatedDocument 66 | ) { 67 | let mut token = NodeLabel::new(); 68 | token.set_span(begin, end); 69 | match rule_id { 70 | 0 => { self.word_action(begin, end, &mut token); } 71 | 1 => { self.num_action(begin, end, &mut token); } 72 | 2 => { self.punct_action(begin, end, &mut token); } 73 | _ => { panic!("Unrecognized rule ID {} at pos {}", rule_id, begin); } 74 | }; 75 | println!( 76 | "{} [{}] at {}", 77 | token.get_sym_val("toktype"), 78 | &doc.get_text()[begin..end], 79 | begin 80 | ); 81 | doc.get_trees_mut().push_back(token); 82 | } 83 | 84 | /// Skip an unhandled character 85 | /// 86 | /// The character at `begin` is not the first character of any pattern 87 | /// that this tokenizer knows about. For symmetry with `append()`, 88 | /// the text is passed in as a &str, but in general it should only be 89 | /// one character long. 
90 | fn skip(&mut self, begin: usize, text: &str) { 91 | println!("No rule matched at pos {} ('{}')", begin, &text[0..1]); 92 | } 93 | } 94 | 95 | impl RegexTokenizer for EnglishTokenizer {} -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod annotated_document; 2 | pub mod english_rules; 3 | pub mod node_label; 4 | pub mod regex_tokenizer; 5 | pub mod tree_sequence; 6 | pub mod regex; 7 | mod readme; 8 | mod text_source; 9 | 10 | #[cfg(test)] 11 | mod test{ 12 | use crate::native::nlpsvc::annotated_document::*; 13 | use crate::native::nlpsvc::english_rules::EnglishTokenizer; 14 | use crate::native::nlpsvc::regex_tokenizer::RegexTokenizer; 15 | # [test] 16 | fn test1(){ 17 | let text="A Rust library to support natural language processing with pure Rust implementation and Python bindings!"; 18 | let mut tokenizer = EnglishTokenizer::new(); // compile regex patterns 19 | let mut doc = AnnotatedDocument::new(text); 20 | tokenizer.apply_to(&mut doc); 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/node_label.rs: -------------------------------------------------------------------------------- 1 | //! # node_label.rs 2 | //! 3 | //! Representation for data stored in tree nodes (and maybe elsewhere) 4 | 5 | use std::fmt; 6 | use std::collections::HashMap; 7 | 8 | #[derive(Debug, Clone)] 9 | pub struct NodeLabel { 10 | pub span: Option<(usize, usize)>, 11 | pub attributes: HashMap, 12 | } 13 | 14 | impl NodeLabel { 15 | pub fn new() -> NodeLabel { 16 | NodeLabel { span: None, attributes: HashMap::new(), } 17 | } 18 | 19 | pub fn set_span(&mut self, begin: usize, end: usize) -> &mut Self { 20 | // TODO: Check for end < begin, etc. 21 | self.span = Some((begin, end)); 22 | self 23 | } 24 | 25 | pub fn get_span(&self) -> Option<(usize, usize)> { 26 | self.span 27 | } 28 | 29 | pub fn set_sym_val(&mut self, attr: &str, val: &str) -> &mut Self { 30 | self.attributes.insert(attr.to_string(), val.to_string()); 31 | self 32 | } 33 | 34 | pub fn get_sym_val(&self, attr: &str) -> &str { 35 | &self.attributes[attr] 36 | } 37 | } 38 | 39 | impl fmt::Display for NodeLabel { 40 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 41 | match self.span { 42 | None => write!(f, "_ _"), 43 | Some((b, e)) => write!(f, "{} {} ", 44 | self.span.unwrap().0, 45 | self.span.unwrap().1) 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/readme.rs: -------------------------------------------------------------------------------- 1 | /// 2 | /// This code is from https://github.com/tlcornell/rs-nlpsvc but we optimized for modern day use. 
3 | /// 4 | 5 | pub fn readme(){ 6 | println!("Code from https://github.com/tlcornell/rs-nlpsvc!") 7 | } 8 | 9 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod reparse; 2 | pub mod retrans; 3 | pub mod reinterp; 4 | 5 | mod reterm; 6 | pub mod reprog; 7 | mod sparse; 8 | mod util; 9 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/reinterp.rs: -------------------------------------------------------------------------------- 1 | /** 2 | * Thompson style "breadth first" NFA interpreter. 3 | * Add dynamic programming, and you get a "just in time" DFA compiler. 4 | * 5 | * Multiple patterns: 6 | * Append all the programs? Each one has 1 start instruction and 1 match. 7 | * Ideally we want to keep track of which Match instructions we encounter, 8 | * not just which string positions we are in when we hit a Match. 9 | * Appending all programs means we still just have one clist and one nlist. 10 | */ 11 | 12 | use std::mem::swap; 13 | use std::cmp::{PartialOrd, Ordering}; 14 | use crate::native::nlpsvc::regex::reprog::*; 15 | use crate::native::nlpsvc::regex::sparse::SparseSet; // cribbed from regex crate, and from its ancestors 16 | use crate::native::nlpsvc::regex::reprog::Instruction::*; 17 | use crate::native::nlpsvc::regex::util::char_at; 18 | 19 | 20 | 21 | 22 | /// Record of candidate matches. 23 | /// 24 | /// The engine does not keep track of where the match starts, since all 25 | /// candidates start at the same place. 26 | /// 27 | #[derive(Debug, Clone, Copy, PartialEq)] 28 | pub struct MatchRecord { 29 | pub len: usize, 30 | pub rule: usize, 31 | } 32 | 33 | impl MatchRecord { 34 | pub fn new(len: usize, rule: usize) -> MatchRecord { 35 | MatchRecord { len, rule } 36 | } 37 | } 38 | 39 | impl PartialOrd for MatchRecord { 40 | 41 | /// A MatchRecord is bigger if it is longer, or same length but its rule is lower numbered 42 | fn partial_cmp(&self, other: &Self) -> Option { 43 | if self.len > other.len { 44 | return Some(Ordering::Greater); 45 | //best = m.clone(); 46 | } else if self.len == other.len { 47 | if self.rule < other.rule { 48 | return Some(Ordering::Greater); 49 | } else if self.rule == other.rule { 50 | return Some(Ordering::Equal); 51 | } 52 | } 53 | // self.len < other.len || equal && self.rule > other.rule 54 | return Some(Ordering::Less); 55 | } 56 | } 57 | 58 | 59 | 60 | pub trait TokenRecognizer { 61 | fn next_token(&mut self, text: &str, pos: usize) -> Option; 62 | } 63 | 64 | 65 | 66 | struct TaskList { 67 | t: SparseSet, 68 | } 69 | 70 | impl TaskList { 71 | pub fn new(len: usize) -> TaskList { 72 | TaskList { t: SparseSet::new(len) } 73 | } 74 | 75 | pub fn clear(&mut self) { 76 | self.t.clear(); 77 | } 78 | 79 | pub fn len(&self) -> usize { 80 | self.t.len() 81 | } 82 | 83 | pub fn is_empty(&self) -> bool { 84 | self.t.is_empty() 85 | } 86 | 87 | pub fn add_task(&mut self, pc: Label) { 88 | //println!("Adding task with pc = {}", pc); 89 | if !self.t.contains(pc) { 90 | self.t.insert(pc); 91 | } 92 | } 93 | } 94 | 95 | 96 | 97 | pub struct ThompsonInterpreter { 98 | pub matches: Vec, 99 | prog: Program, 100 | } 101 | 102 | impl ThompsonInterpreter { 103 | 104 | /// Make a new ThompsonInterpreter, with program `p` and no matches. 
105 | pub fn new(p: Program) -> ThompsonInterpreter { 106 | ThompsonInterpreter { 107 | matches: vec![], 108 | prog: p, 109 | } 110 | } 111 | 112 | /// Return the best match at our current position 113 | /// 114 | /// Where "best" means "longest". Ties are broken according to the 115 | /// order of rules: earlier (lower-numbered) rules win. 116 | /// So clients should put the special cases first, 117 | /// and default rules later on. 118 | fn best_match(&self) -> Option { 119 | if self.matches.is_empty() { 120 | return None; 121 | } 122 | let mut best = MatchRecord {len: 0, rule: 0}; 123 | for m in &self.matches { 124 | if m > &best { 125 | best = m.clone(); 126 | } 127 | } 128 | // NOTE: If no match compares better than {0,0}, we will end up 129 | // returning that. This could happen if (1) a rule matched the 130 | // empty string (BAD IDEA!), and (2) it was not rule #0. 131 | Some(best) 132 | } 133 | 134 | /// Execute tasks in clist 135 | /// 136 | /// Loop through clist. Epsilon transitions (Split) add new entries to clist, 137 | /// so this implements epsilon-closure. All other instructions add new 138 | /// entries to nlist. 139 | /// So this will apply all character tests to the current character, and 140 | /// return when it is done. 141 | /// There is no direct notion of failure here. If nothing is added to nlist, 142 | /// then the whole procedure will terminate very soon. There is a global 143 | /// notion of failure which can be checked then, namely were there any 144 | /// matches. 145 | fn advance( 146 | &mut self, 147 | str_pos: usize, 148 | ch: char, 149 | clist: &mut TaskList, 150 | nlist: &mut TaskList 151 | ) { 152 | //println!("advance: '{}'", ch); 153 | let mut i: usize = 0; 154 | loop { 155 | if i >= clist.len() { 156 | //println!("finished with clist, end of match advance"); 157 | return; // really we want to break out of the outer loop here... 158 | } 159 | 160 | let pc = clist.t.at(i); 161 | i += 1; 162 | 163 | //println!("Executing instruction at line {}", pc); 164 | let inst = &self.prog[pc]; 165 | match *inst { 166 | Char(ref data) => { 167 | if data.ch == ch { 168 | //println!("Matched '{}' at string pos {}", data.ch, str_pos); 169 | //println!("Add task to nlist at {}", pc + 1); 170 | nlist.add_task(data.goto); 171 | } else if data.nocase { 172 | if data.ch.to_lowercase().collect::() == 173 | ch.to_lowercase().collect::() { 174 | //println!("i-Matched '{}' at string pos {}", data.ch, str_pos); 175 | nlist.add_task(data.goto); 176 | } 177 | } 178 | // otherwise the thread dies here 179 | } 180 | AnyChar(ref data) => { 181 | nlist.add_task(data.goto); 182 | } 183 | CharClass(ref ccd) => { 184 | if ccd.data.matches(ch) { 185 | //println!("CharClass {} matches {} at {}", ccd.data, ch, str_pos); 186 | nlist.add_task(ccd.goto); 187 | } else if ccd.nocase { 188 | if ccd.data.matches(ch.to_lowercase().next().unwrap()) { 189 | //println!("CharClass {} i-matches {} at {}", ccd.data, ch, str_pos); 190 | nlist.add_task(ccd.goto); 191 | } 192 | } 193 | } 194 | Match(ref data) => { 195 | //println!("Match: {} [{}]", str_pos, data.rule_id); 196 | self.matches.push(MatchRecord::new(str_pos, data.rule_id)); 197 | } 198 | Split(l1, l2) => { 199 | //println!("Task at {} added to clist", l1); 200 | clist.add_task(l1); 201 | //println!("Task at {} added to clist", l2); 202 | clist.add_task(l2); 203 | } 204 | } 205 | } 206 | 207 | } 208 | 209 | 210 | 211 | /// Find a token starting at &text[begin..], if possible. 
212 | /// 213 | /// Results are stored in self.matches, and so "failure" is indicated 214 | /// by an empty match list. 215 | /// 216 | /// Note that we only match patterns that are prefixes of text. 217 | /// In effect, all patterns start with an implicit '^' anchor. 218 | fn all_matches_at(&mut self, text: &str) { 219 | 220 | let plen = self.prog.len(); 221 | let mut clist = TaskList::new(plen); // 'current' tasks 222 | let mut nlist = TaskList::new(plen); // 'next' tasks 223 | 224 | self.matches.clear(); 225 | 226 | for start in &self.prog.starts { 227 | //println!(">> Adding entry point {} to clist", *start); 228 | clist.add_task(*start); 229 | } 230 | let mut pos = 0; 231 | let mut nxt = 0; 232 | let mut ch: char; 233 | while !clist.is_empty() { 234 | 235 | pos += nxt; 236 | 237 | match char_at(&text[pos..]) { 238 | None => { 239 | if pos == text.len() { 240 | // At end of string. None is expected. 241 | ch = 0 as char; 242 | } else { 243 | panic!("ERROR: Could not decode character at {}", pos); 244 | } 245 | } 246 | Some((c, byte_len)) => { 247 | nxt = byte_len; 248 | ch = c; 249 | //println!("pos: {}; nxt: {}; ch: '{}'", pos, nxt, ch); 250 | } 251 | } 252 | 253 | self.advance(pos, ch, &mut clist, &mut nlist); 254 | 255 | // rebind clist and nlist 256 | swap(&mut clist, &mut nlist); 257 | nlist.clear(); 258 | } 259 | } 260 | 261 | 262 | } 263 | 264 | impl TokenRecognizer for ThompsonInterpreter { 265 | /// Find the best match for a prefix of `&text[pos..]`. 266 | fn next_token(&mut self, text:&str, pos: usize) -> Option { 267 | self.all_matches_at(&text[pos..]); 268 | self.best_match() 269 | } 270 | 271 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/reparse.rs: -------------------------------------------------------------------------------- 1 | use crate::native::nlpsvc::regex::reterm::*; 2 | 3 | 4 | struct ParseContext { 5 | no_case: bool, 6 | } 7 | 8 | impl ParseContext { 9 | pub fn new() -> ParseContext { 10 | ParseContext { no_case: false, } 11 | } 12 | } 13 | 14 | pub fn parse(text: &str) -> Term 15 | { 16 | let mut ctx = ParseContext::new(); 17 | match parse_regex(text, &mut ctx) { 18 | Some((t, s)) => { 19 | if !s.is_empty() { 20 | println!("Did not parse the whole regex string. Remainder: '{}'", s); 21 | } 22 | t 23 | }, 24 | None => panic!("Parse failed!") 25 | } 26 | } 27 | 28 | /** 29 | * ::= 30 | * ::= OR '|' 31 | * ::= OR 32 | * ::= OR '*' OR '+' OR '?' 33 | * ::= OR '(' ')' OR '\' OR '.' 
34 | */ 35 | fn parse_regex<'a>(text: &'a str, ctx: &mut ParseContext) -> Option<(Term, &'a str)> 36 | { 37 | //println!("parse_regex '{}'", text); 38 | parse_alt(text, ctx) 39 | } 40 | 41 | fn parse_alt<'a>(text: &'a str, ctx: &mut ParseContext) -> Option<(Term, &'a str)> { 42 | //println!("parse_alt '{}'", text); 43 | match parse_conc(text, ctx) { 44 | None => None, 45 | Some((t1, rmdr1)) => { 46 | if !rmdr1.starts_with("|") { 47 | Some((t1, rmdr1)) 48 | } else { 49 | match parse_alt(&rmdr1[1..], ctx) { 50 | None => None, 51 | Some((t2, rmdr2)) => 52 | Some((Term::new(TermType::Alternation, vec!(t1, t2)), rmdr2)) 53 | } 54 | } 55 | } 56 | } 57 | } 58 | 59 | fn parse_conc<'a>(text: &'a str, ctx: &mut ParseContext) -> Option<(Term, &'a str)> { 60 | //println!("parse_conc '{}'", text); 61 | match parse_iter(text, ctx) { 62 | None => None, 63 | Some((t1, rmdr1)) => { 64 | if rmdr1.is_empty() || is_operator(rmdr1.chars().next().unwrap()) { 65 | // 'is_operator' really means 'is_not_a_character_literal' 66 | Some((t1, rmdr1)) 67 | } else { 68 | match parse_conc(rmdr1, ctx) { 69 | None => None, 70 | Some((t2, rmdr2)) => 71 | Some((Term::new(TermType::Concatenation, vec!(t1, t2)), rmdr2)) 72 | } 73 | } 74 | } 75 | } 76 | } 77 | 78 | /** 79 | * Because of expressions like 'b**', the rule has to be: 80 | * -> '*' 81 | * But this is left-recursive. 82 | */ 83 | fn parse_iter<'a>(text: &'a str, ctx: &mut ParseContext) -> Option<(Term, &'a str)> { 84 | //println!("parse_iter '{}'", text); 85 | match parse_atom(text, ctx) { 86 | None => None, 87 | Some((mut t1, mut rmdr1)) => { 88 | loop { 89 | match rmdr1.chars().next() { 90 | None => break, 91 | Some(c1) => match c1 { 92 | '*' => t1 = Term::new(TermType::Iteration, vec!(t1)), 93 | '+' => t1 = Term::new(TermType::PositiveIteration, vec!(t1)), 94 | '?' => t1 = Term::new(TermType::Optional, vec!(t1)), 95 | _ => break 96 | } 97 | } 98 | rmdr1 = &rmdr1[1..]; 99 | } 100 | Some((t1, rmdr1)) 101 | } 102 | } 103 | } 104 | 105 | fn parse_atom<'a>(text: &'a str, ctx: &mut ParseContext) -> Option<(Term, &'a str)> { 106 | //println!("parse_atom '{}'", text); 107 | if text.starts_with("(") { 108 | if text[1..].starts_with("?") { 109 | let rmdr = scan_flags(&text[2..], ctx); 110 | parse_atom(rmdr, ctx) 111 | } else { 112 | match parse_regex(&text[1..], ctx) { 113 | None => None, 114 | Some((t, rmdr)) => { 115 | if !rmdr.starts_with(")") { 116 | None 117 | } else { 118 | Some((t, &rmdr[1..])) 119 | } 120 | } 121 | } 122 | } 123 | } else if text.starts_with("\\") { 124 | let optc = text.chars().nth(1); 125 | match optc { 126 | None => panic!("PARSE ERROR: String ends in a backslash"), 127 | Some(c) => Some((Term::new(TermType::Atom(c, ctx.no_case), 128 | vec!()), 129 | &text[2..])) 130 | } 131 | } else if text.starts_with("[") { 132 | parse_char_class(&text[1..], ctx) 133 | } else { 134 | let c = text.chars().next().unwrap(); 135 | if c == '.' { 136 | Some((Term::new(TermType::AnyCharTerm, vec!()), &text[1..])) 137 | } else { 138 | Some((Term::new(TermType::Atom(c, ctx.no_case), vec!()), 139 | &text[1..])) 140 | } 141 | } 142 | } 143 | 144 | /** 145 | * Used to tell when something is a boundary for concatenation. 146 | * No string that starts with one of these can be concatenated 147 | * with the preceding term. 148 | */ 149 | fn is_operator(ch: char) -> bool { 150 | match ch { 151 | '|' | '*' | '+' | '?' 
| ')' => true, 152 | _ => false 153 | } 154 | } 155 | 156 | /** 157 | * The caller has already consumed the leading '[', so text[0] is either 158 | * '^' or a single char or the start of a char range. 159 | */ 160 | fn parse_char_class<'a>(text: &'a str, ctx: &mut ParseContext) -> Option<(Term, &'a str)> { 161 | //let mut i = 0; 162 | let mut rmdr = text; 163 | let mut negated = false; 164 | if scan_given("^", rmdr) { 165 | negated = true; 166 | rmdr = &rmdr[1..]; 167 | } 168 | // There must be a character at text[i], 169 | // but we don't know whether it is a singleton, or the start of a range. 170 | let mut preds: Vec = vec![]; 171 | loop { 172 | match scan_class_elt(rmdr) { 173 | None => { break; }, 174 | Some((pred, nxt)) => { 175 | preds.push(pred); 176 | rmdr = nxt; 177 | } 178 | } 179 | } 180 | rmdr = &rmdr[1..]; 181 | 182 | let ccd = CharClassData::new(!negated, preds); 183 | Some((Term::new(TermType::CharClassTerm(ccd, ctx.no_case), vec![]), 184 | rmdr)) 185 | } 186 | 187 | /** 188 | * Scan text for singleton chars and char ranges. 189 | * Return a char range (in either case), and the position of the 190 | * next unread byte in text. 191 | * Probably needs to be wrapped in an Option. 192 | * Note that a character might be represented as an escape sequence! 193 | * E.g., to include ']' or maybe '^'. 194 | * 195 | * Someday there will be named classes, but this is not that day. 196 | */ 197 | fn scan_class_elt(text: &str) -> Option<(CharClassPredicate, &str)> { 198 | let mut rmdr = text; 199 | if scan_given("]", rmdr) { 200 | return None; 201 | } 202 | match scan_class_elt_char(rmdr) { 203 | None => { return None; } 204 | Some((ch1, rmdr1)) => { 205 | rmdr = rmdr1; 206 | if ch1 == '[' { 207 | // Might be a named character class... 208 | } 209 | if !scan_given("-", rmdr) { 210 | return Some((CharClassPredicate::Individual(ch1), rmdr)); 211 | } 212 | rmdr = &rmdr[1..]; 213 | match scan_class_elt_char(rmdr) { 214 | None => { None } 215 | Some((ch2, rmdr2)) => { 216 | Some((CharClassPredicate::Range(ch1, ch2), rmdr2)) 217 | } 218 | } 219 | } 220 | } 221 | } 222 | 223 | fn scan_class_elt_char(text: &str) -> Option<(char, &str)> { 224 | let mut bytes = text.bytes(); 225 | let mut first: u8 = bytes.next().unwrap(); 226 | let mut start = 0; 227 | let end; 228 | if first == b'\\' { 229 | first = bytes.next().unwrap(); 230 | start += 1; 231 | } 232 | if first & 0b1000_0000 == 0b0000_0000 { 233 | end = start + 1; 234 | } else if first & 0b1110_0000 == 0b1100_0000 { 235 | end = start + 2; 236 | } else if first & 0b1111_0000 == 0b1110_0000 { 237 | end = start + 3; 238 | } else if first & 0b1111_1000 == 0b1111_0000 { 239 | end = start + 4; 240 | } else { 241 | unreachable!("UTF8 char scan failed!"); 242 | } 243 | 244 | let c = text[start..].chars().next().unwrap(); 245 | 246 | Some((c, &text[end..])) 247 | } 248 | 249 | 250 | /** 251 | * When this is called, we have already consumed the "(?" prefix 252 | * 253 | * ::= '(' '?' + ')' 254 | * ::= '-'? 255 | * ::= 'i' 256 | */ 257 | fn scan_flags<'a>(text: &'a str, ctx: &mut ParseContext) -> &'a str /*-> Option<(Term, &str)>*/ { 258 | let mut unset = false; 259 | let mut i = 0; 260 | for c in text.bytes() { 261 | i += 1; 262 | match c { 263 | b')' => { break; } 264 | b'-' => { unset = true; } 265 | b'i' => { 266 | if unset { 267 | ctx.no_case = false; 268 | } else { 269 | ctx.no_case = true; 270 | } 271 | } 272 | _ => { panic!("PARSE ERROR: Unrecognized flag character {}", c); } 273 | } 274 | } 275 | &text[i..] 
276 | } 277 | 278 | 279 | /** 280 | * The expectation here is that ch will be a one ASCII-char string. 281 | * This is for scanning for syntactically active characters, not general 282 | * unicode code points. 283 | * The character is not consumed from text even if it matches. 284 | * The caller has to manage that. 285 | */ 286 | fn scan_given(ch: &str, text: &str) -> bool { 287 | return ch.as_bytes()[0] == text.as_bytes()[0]; 288 | } 289 | 290 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/reprog.rs: -------------------------------------------------------------------------------- 1 | //////////////////////////////////////////////////////////////////////////// 2 | // reprog.rs 3 | 4 | use std::ops::{Index, IndexMut}; 5 | use std::fmt; 6 | use std::collections::HashMap; 7 | use crate::native::nlpsvc::regex::reterm::CharClassData; 8 | 9 | pub type Label = usize; 10 | 11 | #[derive(Debug)] 12 | pub enum Instruction { 13 | Char(CharInstData), 14 | AnyChar(AnyCharInst), 15 | CharClass(CharClassInst), 16 | Match(MatchInst), // arg: rule# 17 | Split(Label, Label), 18 | } 19 | 20 | 21 | #[derive(Clone, Copy, Debug)] 22 | pub struct CharInstData { 23 | pub ch: char, 24 | pub nocase: bool, 25 | pub goto: Label, 26 | } 27 | 28 | #[derive(Clone, Copy, Debug)] 29 | pub struct AnyCharInst { 30 | pub goto: Label, 31 | } 32 | 33 | #[derive(Clone, Copy, Debug)] 34 | pub struct MatchInst { 35 | pub rule_id: usize, 36 | //pub goto: Label, 37 | } 38 | 39 | #[derive(Debug)] 40 | pub struct CharClassInst { 41 | pub data: CharClassData, 42 | pub nocase: bool, 43 | pub goto: Label, 44 | } 45 | 46 | 47 | 48 | 49 | impl fmt::Display for Instruction { 50 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 51 | use self::Instruction::*; 52 | match *self { 53 | Char(ref data) => write!(f, "char {} goto {} {}", data.ch, data.goto, 54 | if data.nocase { " [nocase]" } else { "" }), 55 | AnyChar(ref data) => write!(f, "any_char goto {}", data.goto), 56 | CharClass(ref cc) => write!(f, "{} goto {} {}", cc.data, cc.goto, 57 | if cc.nocase { " [nocase]" } else { "" }), 58 | Match(ref data) => write!(f, "match {}", data.rule_id), 59 | Split(l1, l2) => write!(f, "split {}, {}", l1, l2), 60 | } 61 | } 62 | } 63 | 64 | 65 | 66 | 67 | #[derive(Debug)] 68 | pub struct Program { 69 | code: Vec, 70 | pub starts: Vec, // entry points 71 | } 72 | 73 | impl Program { 74 | pub fn new() -> Program { 75 | Program { 76 | code: vec![], 77 | starts: vec![], 78 | } 79 | } 80 | pub fn len(&self) -> usize { 81 | self.code.len() 82 | } 83 | pub fn push(&mut self, instr: Instruction) { 84 | self.code.push(instr); 85 | } 86 | pub fn print(&self) { 87 | for (pos, inst) in self.code.iter().enumerate() { 88 | println!("{:03}: {}", pos, *inst); 89 | } 90 | } 91 | pub fn add_start(&mut self, start: usize) { 92 | self.starts.push(start); 93 | } 94 | pub fn ground_labels(&mut self, lblmap: &HashMap) { 95 | use self::Instruction::*; 96 | let mut code_new = Vec::with_capacity(self.code.len()); 97 | for inst in self.code.iter() { 98 | //let ref mut i: Instruction = *inst; 99 | match *inst { 100 | Char(ref data) => { 101 | code_new.push(Char(CharInstData { 102 | ch: data.ch, 103 | nocase: data.nocase, 104 | goto: lblmap[&data.goto], 105 | })); 106 | } 107 | AnyChar(ref data) => { 108 | code_new.push(AnyChar(AnyCharInst { 109 | goto: lblmap[&data.goto], 110 | })); 111 | } 112 | CharClass(ref ccdata) => { 113 | code_new.push(CharClass(CharClassInst { 114 | data: 
ccdata.data.clone(), 115 | nocase: ccdata.nocase, 116 | goto: lblmap[&ccdata.goto], 117 | })); 118 | } 119 | Match(ref data) => { 120 | code_new.push(Match(MatchInst { 121 | rule_id: data.rule_id, 122 | //goto: lblmap[&data.goto], 123 | })); 124 | } 125 | Split(l1, l2) => { 126 | let l1_new = lblmap[&l1]; 127 | let l2_new = lblmap[&l2]; 128 | code_new.push(Split(l1_new, l2_new)); 129 | } 130 | } 131 | } 132 | self.code = code_new; 133 | } 134 | } 135 | 136 | impl Index for Program { 137 | type Output = Instruction; 138 | fn index(&self, index: usize) -> &Instruction { 139 | &self.code[index] 140 | } 141 | } 142 | 143 | impl IndexMut for Program { 144 | //type Output = Instruction; 145 | fn index_mut(&mut self, index: usize) -> &mut Instruction { 146 | &mut self.code[index] 147 | } 148 | } 149 | 150 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/reterm.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | #[derive(Debug)] 4 | pub enum TermType { 5 | Alternation, 6 | Concatenation, 7 | Iteration, 8 | PositiveIteration, 9 | Optional, 10 | Atom(char, bool), 11 | CharClassTerm(CharClassData, bool), 12 | AnyCharTerm, 13 | } 14 | 15 | #[derive(Debug)] 16 | pub struct Term { 17 | pub op: TermType, 18 | pub subs: Vec, 19 | } 20 | 21 | impl Term { 22 | 23 | /** 24 | * Note that there's no arity checking between the op and the 25 | * sub-term array. So far all our operators have strict arity 26 | * requirements, so such a check should probably be added. 27 | */ 28 | pub fn new(op: TermType, subs: Vec) -> Term { 29 | Term { 30 | op: op, 31 | subs: subs 32 | } 33 | } 34 | } 35 | 36 | impl fmt::Display for Term { 37 | fn fmt(&self, _: &mut fmt::Formatter) -> fmt::Result { 38 | pretty_print(&self, 0) 39 | } 40 | } 41 | 42 | /** 43 | * There has to be a better way to do this, but for the life of me, 44 | * I can't find it. 45 | */ 46 | fn tab_over(n: usize) { 47 | for _ in 0..n { 48 | print!(" "); 49 | } 50 | } 51 | 52 | fn print_label(t: &Term) { 53 | use self::TermType::*; 54 | match t.op { 55 | Concatenation => { print!("CONCATENATION"); }, 56 | Alternation => { print!("ALTERNATION"); }, 57 | Iteration => { print!("FREE_ITERATION"); }, 58 | PositiveIteration => { print!("POSITIVE_ITERATION"); }, 59 | Optional => { print!("OPTIONAL"); }, 60 | Atom(c, nocase) => { 61 | print!("ATOM '{}'", c); 62 | if nocase { 63 | print!(" (?i)"); 64 | } 65 | }, 66 | CharClassTerm(ref ccd, nocase) => { 67 | print!("CHAR_CLASS {}", ccd); 68 | if nocase { 69 | print!(" (?i)"); 70 | } 71 | }, 72 | AnyCharTerm => { print!("ANY_CHAR"); }, 73 | } 74 | } 75 | 76 | 77 | fn pretty_print(t: &Term, tab: usize) -> fmt::Result { 78 | tab_over(tab); 79 | print_label(t); 80 | println!(""); 81 | for sb in &t.subs { 82 | pretty_print(sb, tab + 4).unwrap(); 83 | } 84 | Ok(()) 85 | } 86 | 87 | 88 | #[derive(Debug, Clone)] 89 | pub struct CharClassData { 90 | positive: bool, 91 | ranges: Vec, 92 | } 93 | 94 | 95 | /** 96 | * The implementation of matches() doesn't really belong here. 97 | * It has to harmonize with other matches() methods used by the interpreter. 98 | * So probably there needs to be a trait defined somewhere that 99 | * allows us to extend CharClassData with what we need to interpret it. 100 | * This is all because this struct is shared between the char class term 101 | * and the char class instruction. 
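 *
 * A worked example (added for illustration): the class [a-z0-9] is stored with
 * positive == true and two Range predicates, 'a'-'z' and '0'-'9'; matches('q')
 * returns true because the first range fires, while matches('!') falls through
 * the loop and returns !positive, i.e. false.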
102 | */ 103 | impl CharClassData { 104 | 105 | pub fn new(pos: bool, preds: Vec) -> CharClassData { 106 | CharClassData { 107 | positive: pos, 108 | ranges: preds, // take ownership 109 | } 110 | } 111 | 112 | pub fn matches(&self, ch: char) -> bool { 113 | use self::CharClassPredicate::*; 114 | for pred in &self.ranges { 115 | match *pred { 116 | Range(c1, c2) => { 117 | //println!("Range({}, {})", c1, c2); 118 | if ch >= c1 && ch <= c2 && self.positive { 119 | return true; 120 | } 121 | } 122 | Individual(c1) => { 123 | //println!("Individual({})", c1); 124 | if c1 == ch && self.positive { 125 | return true; 126 | } 127 | } 128 | Named(_) => { 129 | panic!("matches() unimplemented for Named"); 130 | } 131 | } 132 | } 133 | !self.positive 134 | } 135 | 136 | } 137 | 138 | impl fmt::Display for CharClassData { 139 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 140 | if !self.positive { 141 | r#try!(write!(f, "NOT ")); 142 | } 143 | for rng in &self.ranges { 144 | r#try!(write!(f, "{} ", rng)); 145 | } 146 | Ok(()) 147 | } 148 | } 149 | 150 | 151 | #[derive(Debug, Clone)] 152 | pub enum CharClassPredicate { 153 | Range(char, char), 154 | Individual(char), 155 | Named(String), 156 | } 157 | 158 | impl fmt::Display for CharClassPredicate { 159 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 160 | use self::CharClassPredicate::*; 161 | match *self { 162 | Range(c1, c2) => { 163 | write!(f, "{}-{}", c1, c2) 164 | } 165 | Individual(c) => { 166 | write!(f, "{}", c) 167 | } 168 | Named(ref nm) => { 169 | write!(f, "[:{}:]", nm) 170 | } 171 | } 172 | } 173 | } 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/retrans.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use crate::native::nlpsvc::regex::reterm::{Term, CharClassData}; 3 | use crate::native::nlpsvc::regex::reprog::{Program, Label, Instruction}; 4 | use crate::native::nlpsvc::regex::reprog::{CharInstData, AnyCharInst, MatchInst, CharClassInst}; 5 | use crate::native::nlpsvc::regex::reprog::Instruction::*; 6 | use crate::native::nlpsvc::regex::reterm::TermType::*; 7 | 8 | pub struct RegexTranslator { 9 | pub prog: Program, 10 | next_label: usize, 11 | label_map: HashMap, 12 | } 13 | 14 | impl RegexTranslator { 15 | pub fn new() -> RegexTranslator { 16 | RegexTranslator { 17 | prog: Program::new(), 18 | next_label: 0, 19 | label_map: HashMap::new(), 20 | } 21 | } 22 | 23 | pub fn get_program(&self) -> &Program { 24 | &self.prog 25 | } 26 | 27 | fn gen_label(&mut self) -> Label { 28 | let nxt = self.next_label; 29 | self.next_label += 1; 30 | nxt 31 | } 32 | 33 | /** 34 | * This method is meant to be called multiple times, so that a batch 35 | * of regular expressions can be matched in parallel, as if they were 36 | * all combined into a single disjunction. 
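 *
 * For illustration: ThompsonProgramBuilder::add_rule() calls compile() once
 * per pattern with rule numbers 0, 1, 2, ..., and build() then calls finish()
 * to ground the labels, so the finished Program carries one start address
 * per rule.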
37 | */ 38 | pub fn compile(&mut self, regex: &Term, rule_nbr: usize) { 39 | let start = self.prog.len(); 40 | self.prog.add_start(start); 41 | self.translate_root(regex, rule_nbr); 42 | } 43 | 44 | pub fn finish(&mut self) { 45 | self.prog.ground_labels(&self.label_map); 46 | } 47 | 48 | fn translate_root(&mut self, regex: &Term, rule_nbr: usize) { 49 | let l1 = self.gen_label(); 50 | let l2 = self.gen_label(); 51 | self.translate(regex, l1, l2); 52 | self.emit(Match(MatchInst {rule_id: rule_nbr, /*goto: l2*/}), l2); 53 | } 54 | 55 | fn translate(&mut self, regex: &Term, l0: Label, l: Label) { 56 | match regex.op { 57 | Alternation => self.trans_alt(regex, l0, l), 58 | Concatenation => self.trans_conc(regex, l0, l), 59 | Iteration => self.trans_iter(regex, l0, l), 60 | Optional => self.trans_opt(regex, l0, l), 61 | PositiveIteration => self.trans_pos(regex, l0, l), 62 | Atom(c, nocase) => self.trans_char(c, nocase, l0, l), 63 | CharClassTerm(ref ccd, nocase) => self.trans_chcls(ccd, nocase, l0, l), 64 | AnyCharTerm => self.trans_any_char(l0, l), 65 | } 66 | } 67 | 68 | fn emit(&mut self, instr: Instruction, at_line: Label) { 69 | self.prog.push(instr); 70 | let n = self.label_map.len(); 71 | self.label_map.insert(at_line, n); 72 | } 73 | 74 | /* 75 | translate(a, L0) --> 76 | char a goto L0 77 | */ 78 | 79 | /* 80 | translate(e1|e2, L0, L): 81 | L0: split L1, L2 82 | L1: translate(e1, L1, L) 83 | L2: translate(e2, L2, L) 84 | */ 85 | fn trans_alt(&mut self, regex: &Term, l0: Label, l: Label) { 86 | let l1 = self.gen_label(); 87 | let l2 = self.gen_label(); 88 | self.emit(Split(l1, l2), l0); 89 | self.translate(®ex.subs[0], l1, l); 90 | self.translate(®ex.subs[1], l2, l); 91 | } 92 | 93 | /* 94 | translate(e1.e2, L0, L): 95 | L0: translate(e1, L0, L1) 96 | L1: translate(e2, L1, L) 97 | */ 98 | fn trans_conc(&mut self, regex: &Term, l0: Label, l: Label) { 99 | let l1 = self.gen_label(); 100 | self.translate(®ex.subs[0], l0, l1); 101 | self.translate(®ex.subs[1], l1, l); 102 | } 103 | 104 | /* 105 | translate(e*, L0, L): 106 | L0: split L1, L 107 | L1: translate(e, L1, L0) 108 | */ 109 | fn trans_iter(&mut self, regex: &Term, l0: Label, l: Label) { 110 | let l1 = self.gen_label(); 111 | self.emit(Split(l1, l), l0); 112 | self.translate(®ex.subs[0], l1, l0); 113 | } 114 | 115 | /* 116 | translate(e?, L0, L): 117 | L0: split L1, L 118 | L1: translate(e, L1, L) 119 | */ 120 | fn trans_opt(&mut self, regex: &Term, l0: Label, l: Label) { 121 | let l1 = self.gen_label(); 122 | self.emit(Split(l1, l), l0); 123 | self.translate(®ex.subs[0], l1, l); 124 | } 125 | 126 | /* 127 | translate(e+, L0, L): 128 | L0: translate(e, L0, L1) 129 | L1: split L0, L 130 | */ 131 | fn trans_pos(&mut self, regex: &Term, l0: Label, l: Label) { 132 | let l1 = self.gen_label(); 133 | self.translate(®ex.subs[0], l0, l1); 134 | self.emit(Split(l0, l), l1); 135 | } 136 | 137 | fn trans_char(&mut self, c: char, nocase: bool, l0: Label, l: Label) { 138 | self.emit(Char(CharInstData {ch: c, nocase: nocase, goto: l} ), l0); 139 | } 140 | 141 | fn trans_any_char(&mut self, l0: Label, l: Label) { 142 | self.emit(AnyChar(AnyCharInst {goto: l}), l0); 143 | } 144 | 145 | /* 146 | translate([es], L0, L: 147 | L0: charclass es goto L 148 | */ 149 | fn trans_chcls(&mut self, 150 | clsdata: &CharClassData, nocase: bool, 151 | l0: Label, l: Label) { 152 | self.emit(CharClass(CharClassInst { 153 | data: clsdata.clone(), 154 | nocase: nocase, 155 | goto: l, 156 | }), l0); 157 | } 158 | 159 | pub fn print_prog(&self) { 160 | 
self.prog.print(); 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/sparse.rs: -------------------------------------------------------------------------------- 1 | /** 2 | * sparse.rs 3 | * 4 | * Sparse set implementation. Mainly copied from the Rust regex crate source. 5 | * Based on an implementation from RE2, and apparently many ancestors before 6 | * that. 7 | */ 8 | 9 | #[derive(Clone, Debug)] 10 | pub struct SparseSet { 11 | dense: Vec, 12 | sparse: Vec, 13 | size: usize, 14 | } 15 | 16 | impl SparseSet { 17 | 18 | /** 19 | * Note that the original implementation of this data structure 20 | * stressed NOT initializing the memory. It was a trick to save 21 | * costly initialization time, among other things. I am using it 22 | * because it has very good set iteration properties that make it 23 | * a good agenda implementation. And following std::regex and RE2, 24 | * I am only using two pre-allocated sets, and "double buffering" them. 25 | */ 26 | pub fn new(sz: usize) -> SparseSet { 27 | SparseSet { 28 | dense: vec![0; sz], 29 | sparse: vec![0; sz], 30 | size: 0, 31 | } 32 | } 33 | 34 | /** 35 | * For iteration 36 | */ 37 | pub fn at(&self, idx: usize) -> usize { 38 | if idx < self.size { 39 | self.dense[idx] 40 | } else { 41 | panic!("Sparse set index {} out of bounds ({})", idx, self.size); 42 | } 43 | } 44 | 45 | pub fn len(&self) -> usize { 46 | self.size 47 | } 48 | 49 | pub fn is_empty(&self) -> bool { 50 | self.len() == 0 51 | } 52 | 53 | // The following are mostly directly copy-pasted from std::regex: 54 | 55 | /** 56 | * Note no membership test, so you might want to guard calls to insert() 57 | * to prevent self.dense from growing and containing garbage. 58 | * That is, we assume here as a pre-condition that value is known 59 | * not to be in the set already. 60 | */ 61 | pub fn insert(&mut self, value: usize) { 62 | let i = self.size; 63 | self.dense[i] = value; 64 | self.sparse[value] = i; 65 | self.size += 1; 66 | } 67 | 68 | /** 69 | * See https://research.swtch.com/sparse 70 | */ 71 | pub fn contains(&self, value: usize) -> bool { 72 | let i = self.sparse[value]; 73 | i < self.size && self.dense[i] == value 74 | } 75 | 76 | /** 77 | * This data structure is designed to work well with uninitialized data, 78 | * so there is no need to clear everything to zero here. 79 | */ 80 | pub fn clear(&mut self) { 81 | self.size = 0; 82 | } 83 | 84 | } 85 | 86 | 87 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex/util.rs: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * Return Some((ch, len)) if there is a character at the start of text, 4 | * None otherwise. 
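 *
 * Worked examples (added for illustration): char_at("abc") returns
 * Some(('a', 1)), char_at("épée") returns Some(('é', 2)) because 'é' takes
 * two UTF-8 bytes, and char_at("") returns None.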
5 | */ 6 | pub fn char_at(text: &str) -> Option<(char, usize)> { 7 | static HI_BIT: u8 = 0b1000_0000; 8 | if text.is_empty() { 9 | return None; 10 | } 11 | let leader: u8 = text.as_bytes()[0]; 12 | let mut length = 1; 13 | if leader & HI_BIT == 0 { 14 | return Some((leader as char, length)); 15 | } 16 | let mut bits: u32; 17 | if leader >= 0b1111_0000 { 18 | bits = (leader & 0b0000_0111) as u32; 19 | length = 4; 20 | } else if leader >= 0b1110_0000 { 21 | bits = (leader & 0b0000_1111) as u32; 22 | length = 3; 23 | } else if leader >= 0b1100_0000 { 24 | bits = (leader & 0b0001_1111) as u32; 25 | length = 2; 26 | } else { 27 | unreachable!(); 28 | } 29 | 30 | if text.len() < length { 31 | panic!("UTF-8 cutoff error: String does not contain a whole character"); 32 | } 33 | 34 | for i in 1..length { 35 | let byte: u8 = text.as_bytes()[i]; 36 | bits = (bits << 6) | (byte & 0b0011_1111) as u32; 37 | } 38 | match ::std::char::from_u32(bits) { 39 | None => None, 40 | Some(ch) => Some((ch, length)) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/regex_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use crate::native::nlpsvc::regex::reinterp::TokenRecognizer; 2 | use crate::native::nlpsvc::regex::retrans::RegexTranslator; 3 | use crate::native::nlpsvc::regex::reparse; 4 | use crate::native::nlpsvc::regex::reprog::Program; 5 | 6 | use crate::native::nlpsvc::annotated_document::*; 7 | 8 | /// Trait for holding actions to take upon token recognition 9 | /// 10 | /// 11 | pub trait TokenReactor { 12 | /// Append a token 13 | /// 14 | /// Append a token starting at `begin` with text `text`, that 15 | /// matched rule #`rule_id`. 16 | fn append(&mut self, begin: usize, end: usize, rule_id: usize, doc: &mut AnnotatedDocument); 17 | 18 | /// Skip an unhandled character 19 | /// 20 | /// The character at `begin` is not the first character of any pattern 21 | /// that this tokenizer knows about. For symmetry with `append()`, 22 | /// the text is passed in as a &str, but in general it should only be 23 | /// one character long. 24 | fn skip(&mut self, begin: usize, text: &str); 25 | } 26 | 27 | 28 | pub trait RegexTokenizer: TokenRecognizer + TokenReactor { 29 | 30 | fn apply_to(&mut self, doc: &mut AnnotatedDocument) { 31 | let mut pos: usize = 0; 32 | //let text = doc.get_text(); 33 | while pos < doc.get_text().len() { 34 | match self.next_token(doc.get_text(), pos) { 35 | None => { 36 | self.skip(pos, &doc.get_text()[pos..pos + 1]); 37 | pos += 1; 38 | } 39 | Some(match_rec) => { 40 | let new_pos = pos + match_rec.len; 41 | self.append(pos, new_pos, match_rec.rule, doc); 42 | pos = new_pos; 43 | } 44 | } 45 | } 46 | } 47 | 48 | } 49 | 50 | /// Designed to apply a regex compiler to a sequence of regexes 51 | /// 52 | /// This way we can run them all in parallel, and keep track of which ones 53 | /// matched. 54 | pub struct ThompsonProgramBuilder { 55 | compiler: RegexTranslator, 56 | rule_nbr: usize, 57 | } 58 | 59 | impl ThompsonProgramBuilder { 60 | 61 | pub fn new() -> ThompsonProgramBuilder { 62 | ThompsonProgramBuilder { 63 | compiler: RegexTranslator::new(), 64 | rule_nbr: 0, 65 | } 66 | } 67 | 68 | /// Compile the pattern and add to the current program. 
69 | pub fn add_rule(mut self, pattern: &str) -> ThompsonProgramBuilder { 70 | let tree = reparse::parse(pattern); 71 | self.compiler.compile(&tree, self.rule_nbr); 72 | self.rule_nbr += 1; 73 | self 74 | } 75 | 76 | pub fn build(mut self) -> Program { 77 | self.compiler.finish(); // ground instruction labels 78 | self.compiler.print_prog(); 79 | self.compiler.prog 80 | } 81 | 82 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/text_source.rs: -------------------------------------------------------------------------------- 1 | 2 | extern crate getopts; 3 | use getopts::Options; 4 | use std::env; 5 | use std::process; 6 | 7 | use std::io; 8 | use std::io::prelude::*; 9 | use std::fs::File; 10 | use std::path::Path; 11 | 12 | struct AppConfig { 13 | text_file: Option, 14 | } 15 | 16 | impl AppConfig { 17 | fn new() -> AppConfig { 18 | AppConfig { 19 | text_file: None, 20 | } 21 | } 22 | } 23 | 24 | fn configure() -> AppConfig { 25 | let args: Vec = env::args().collect(); 26 | let mut opts = Options::new(); 27 | opts.optflag("h", "help", "print this message and exit"); 28 | opts.optopt("f", "file", "match text from file", "NAME"); 29 | let matches = match opts.parse(&args[1..]) { 30 | Ok(m) => { m } 31 | Err(f) => { panic!("{}",f.to_string()) } 32 | }; 33 | if matches.opt_present("h") { 34 | print_usage(&args[0], &opts); 35 | } 36 | 37 | let mut cfg: AppConfig = AppConfig::new(); 38 | cfg.text_file = matches.opt_str("f"); 39 | 40 | cfg 41 | } 42 | 43 | fn print_usage(program: &str, opts: &Options) { 44 | let brief = format!("\nUsage: {} [options]", program); 45 | print!("{}", opts.usage(&brief)); 46 | println!("\nIf no file is given, input will be read from stdin."); 47 | process::exit(1); 48 | } 49 | 50 | 51 | struct TextSource { 52 | text: String, 53 | } 54 | 55 | impl TextSource { 56 | pub fn new(cfg: &AppConfig) -> TextSource { 57 | // Get the text to match against (from file or stdin) 58 | let mut txt = String::new(); 59 | match cfg.text_file { 60 | None => { 61 | let stdin = io::stdin(); 62 | stdin.lock().read_to_string(&mut txt).unwrap(); 63 | }, 64 | Some(ref fname) => { 65 | let fpath = Path::new(&fname); 66 | let mut f = File::open(fpath).expect("Could not open file"); 67 | f.read_to_string(&mut txt).expect("Could not read file"); 68 | } 69 | } 70 | 71 | TextSource { text: txt } 72 | } 73 | 74 | pub fn get_text(&self) -> &str { 75 | &self.text 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/nlpsvc/tree_sequence.rs: -------------------------------------------------------------------------------- 1 | //! # tree_sequence.rs 2 | //! 3 | //! Manage a sequence of trees. 4 | //! 5 | //! Basically the many many children of a single root node that 6 | //! remains implicit. 
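// Illustrative sketch (not in the original file): build a flat sequence of
// token nodes and then group two of them under a new parent with chunk(),
// mirroring the test_chunking test in annotated_document.rs. Note that the
// `end` memo passed to chunk() is excluded from the new subtree.
#[allow(dead_code)]
fn tree_sequence_example() {
    let mut trees = TreeSequence::new();
    for (b, e) in [(0usize, 2usize), (3, 5), (6, 8), (9, 11)] {
        let mut lbl = NodeLabel::new();
        lbl.set_span(b, e).set_sym_val("toktype", "WORD");
        trees.push_back(lbl);
    }
    let mut cur = trees.first();
    cur.next();                        // now on the second token
    let first_child = cur.to_memo();
    cur.next();
    cur.next();                        // now on the fourth token (excluded)
    let last_child = cur.to_memo();
    let mut parent = NodeLabel::new();
    parent.set_sym_val("cat", "phrase");
    trees.chunk(parent, first_child, last_child);
    trees.print();                     // parent now spans (3, 8) and owns two children
}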
7 | 8 | 9 | use std::fmt; 10 | use indextree::Arena; 11 | use indextree::NodeId; 12 | use crate::native::nlpsvc::node_label::*; 13 | 14 | type TreeArena = Arena; 15 | 16 | pub struct TreeSequence { 17 | first: Option, 18 | last: Option, 19 | arena: TreeArena, 20 | } 21 | 22 | impl TreeSequence { 23 | 24 | pub fn new() -> TreeSequence { 25 | TreeSequence { 26 | first: None, 27 | last: None, 28 | arena: TreeArena::new(), 29 | } 30 | } 31 | 32 | pub fn activate(&self, memo: CursorMemo) -> TreeCursor { 33 | TreeCursor::new(memo.node, &self.arena) 34 | } 35 | 36 | pub fn first(&self) -> TreeCursor { 37 | TreeCursor::new(self.first, &self.arena) 38 | } 39 | 40 | pub fn print(&self) { 41 | print_tree_sequence(self.first, &self.arena, 0); 42 | } 43 | 44 | /// Create a new atomic tree and append it to the tree sequence 45 | /// 46 | /// This should end up being roughly equivalent to chunk() with no 47 | /// child sequence. 48 | pub fn push_back(&mut self, lbl: NodeLabel) { 49 | let node = self.arena.new_node(lbl); 50 | match self.last { 51 | None => { 52 | // then so is self.first 53 | self.last = Some(node); 54 | self.first = self.last; 55 | } 56 | Some(last_node) => { 57 | last_node.insert_after(node, &mut self.arena); 58 | self.last = Some(node); 59 | } 60 | } 61 | } 62 | 63 | /// `end` is not included in the interval. So it could be None, 64 | /// if we were working on the very tail of the tree list. 65 | /// That case has to be accounted for! 66 | /// 67 | /// This should probably return a TreeCursor wrapping the new root node. 68 | pub fn chunk(&mut self, lbl: NodeLabel, begin: CursorMemo, end: CursorMemo) { 69 | // 1. Check that begin and end are not None 70 | // 2. Check that they are not equal 71 | let root: NodeId = self.arena.new_node(lbl); 72 | let end_id: NodeId = end.node.unwrap(); 73 | let mut child: NodeId = begin.node.unwrap(); 74 | let b_off = self.arena[child].data.get_span().unwrap().0; 75 | let mut e_off = 0; 76 | child.insert_before(root, &mut self.arena); 77 | while child != end_id { 78 | e_off = self.arena[child].data.get_span().unwrap().1; 79 | //println!("DEBUG {:?} != {:?}", child, end_id); 80 | root.append(child, &mut self.arena); 81 | let next_opt = self.arena[root].next_sibling(); 82 | //println!("DEBUG next_opt = {:?}", next_opt); 83 | child = next_opt.unwrap(); 84 | } 85 | self.arena[root].data.set_span(b_off, e_off); 86 | } 87 | } 88 | 89 | 90 | 91 | pub struct TreeCursor<'a> { 92 | node: Option, 93 | arena: &'a TreeArena, 94 | } 95 | 96 | #[derive(Debug)] 97 | pub struct CursorMemo { 98 | node: Option, 99 | } 100 | 101 | /// Once you move off the edge of the tree, you can't go back, so maybe 102 | /// this struct needs some lookahead methods as well? 103 | /// Maybe always leave behind a copy? Instead of returning Option, 104 | /// return a TreeCursor? 105 | impl<'a> TreeCursor<'a> { 106 | 107 | pub fn new(node: Option, arena: &TreeArena) -> TreeCursor { 108 | TreeCursor { node: node, arena } 109 | } 110 | 111 | pub fn is_valid(&self) -> bool { 112 | self.node.is_some() 113 | } 114 | 115 | pub fn get(&self) -> Option<&NodeLabel> { 116 | match self.node { 117 | None => None, 118 | Some(node) => Some(&self.arena[node].data) 119 | } 120 | } 121 | 122 | pub fn to_memo(&self) -> CursorMemo { 123 | CursorMemo { node: self.node } 124 | } 125 | 126 | /// Move the cursor up 127 | /// 128 | /// Returns the previous value of self.node. 129 | /// It's based on an iterator. Imagine an iterator was sitting on the 130 | /// first element of a sequence. 
You would want to both get that value, 131 | /// and increment the iterator. Otherwise you would never see that value. 132 | /// There could be a `get()` method or something, but it would never 133 | /// work in a `for x in iter` pattern, I don't guess. 134 | /// 135 | /// This may not be the best behavior for a cursor, though. 136 | /// Just always remember to use cursor.up().get(), and ignore the 137 | /// return value? 138 | pub fn up(&mut self) -> Option { 139 | match self.node.take() { 140 | Some(node) => { 141 | self.node = self.arena[node].parent(); 142 | Some(node) 143 | } 144 | None => None 145 | } 146 | } 147 | 148 | /// Move the cursor to its right sibling 149 | pub fn next(&mut self) -> Option { 150 | match self.node.take() { 151 | Some(node) => { 152 | self.node = self.arena[node].next_sibling(); 153 | Some(node) 154 | } 155 | None => None 156 | } 157 | } 158 | 159 | /// Move the cursor to its leftmost child 160 | pub fn first(&mut self) -> Option { 161 | match self.node.take() { 162 | Some(node) => { 163 | self.node = self.arena[node].first_child(); 164 | Some(node) 165 | } 166 | None => None 167 | } 168 | } 169 | } 170 | 171 | 172 | impl<'a> fmt::Debug for TreeCursor<'a> { 173 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 174 | write!(f, "TreeCursor {{ node: {:?}, arena: ... }}", self.node) 175 | } 176 | } 177 | 178 | 179 | 180 | /// Used to print a root-list when there is no single root node. 181 | /// 182 | /// The root list is formed by a sequence of nodes connected as siblings, 183 | /// but with no parents. 184 | fn print_tree_sequence(node: Option, arena: &TreeArena, depth: i32) { 185 | if node.is_none() { 186 | return; 187 | } 188 | for t in node.unwrap().following_siblings(arena) { 189 | print_tree(t, arena, depth); 190 | } 191 | } 192 | 193 | /// Print a tree in outline form 194 | /// 195 | /// Indent tab size is hard-coded as 4. 196 | fn print_tree(node: NodeId, arena: &TreeArena, depth: i32) { 197 | // print label at indent 198 | let indent = depth * 4; 199 | for _ in 0..indent { 200 | print!(" "); 201 | } 202 | println!("{}", arena[node].data); 203 | // print child list at depth + 1 204 | for t in node.children(arena) { 205 | print_tree(t, arena, depth + 1); 206 | } 207 | } 208 | 209 | 210 | #[cfg(test)] 211 | mod tests { 212 | use super::*; 213 | 214 | #[test] 215 | fn it_works() { 216 | //assert_eq!(4, add_two(2)); 217 | } 218 | } 219 | 220 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/segmentation.rs: -------------------------------------------------------------------------------- 1 | 2 | 3 | use std::*; 4 | extern crate unicode_segmentation; 5 | use std::collections::HashMap; 6 | use unicode_segmentation::UnicodeSegmentation; 7 | 8 | 9 | /// 10 | /// Get word segmentation results from customized dictionaries and methods 11 | /// 12 | /// Parameters: 13 | /// 14 | /// _sentence: a string, 15 | /// 16 | /// dict_path: the dictionary file path where each line is a term, 17 | /// 18 | /// stopwords_path: the stopwords file path where each line is a stop word, 19 | /// 20 | /// method: if empty, use 'bimm', other optional values: fmm, bmm. 
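/// 
/// Example (added for illustration; the dictionary and stopword paths point at
/// the sample files under examples/data and may need adjusting):
/// 
/// ```ignore
/// let words = get_segmentation(
///     "我喜欢吃苹果,也爱打羽毛球",
///     "examples/data/dicts/30wdict.txt",
///     "examples/data/dicts/stopwords/baidu_stopwords.txt",
///     "bimm");
/// println!("{:?}", words);
/// ```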
21 | /// 22 | pub fn get_segmentation(_sentence:&str,dict_path:&str,stopwords_path:&str,method:&str)->Vec{ 23 | 24 | if dict_path.eq(""){ 25 | let result= get_word_list(_sentence); 26 | let mut result_final:Vec=Vec::new(); 27 | for r in result{ 28 | result_final.push(String::from(r)); 29 | } 30 | return result_final; 31 | } 32 | 33 | // println!("loading common dictionary"); 34 | let common_words=load_dictionary(dict_path); 35 | 36 | 37 | let stop_words:Vec; 38 | if stopwords_path.eq(""){ 39 | // println!("loading stopwords dictionary"); 40 | stop_words=Vec::new(); 41 | }else{ 42 | stop_words=load_dictionary(stopwords_path); 43 | } 44 | 45 | 46 | // println!("doing segmentation tasks"); 47 | 48 | let sentence=_sentence.graphemes(true).collect::>(); 49 | let mut list_result:Vec=Vec::new(); 50 | 51 | if method=="bimm" || method==""{ 52 | list_result=bimm(sentence,common_words); 53 | }else if method=="fmm"{ 54 | list_result=fmm(sentence,common_words); 55 | }else if method=="bmm"{ 56 | list_result=bmm(sentence,common_words); 57 | } 58 | 59 | 60 | if !stopwords_path.eq(""){ 61 | let mut meaningful_words:Vec=Vec::new(); 62 | // println!("removing stop words..."); 63 | for word in list_result{ 64 | if !stop_words.contains(&word){ 65 | meaningful_words.push(word); 66 | } 67 | } 68 | return meaningful_words; 69 | }else{ 70 | return list_result; 71 | } 72 | 73 | 74 | } 75 | 76 | 77 | /// 78 | /// Bidirection Maximum Matching Method 79 | /// 80 | pub fn bimm(sentence:Vec<&str>, words_dict:Vec)->Vec{ 81 | let s1=sentence.clone(); 82 | let s2=sentence.clone(); 83 | let dict1=words_dict.clone(); 84 | let dict2=words_dict.clone(); 85 | let forward =fmm(s1,dict1); 86 | let backward=bmm(s2,dict2); 87 | // println!("FMM: {:?}",forward); 88 | // println!("BMM: {:?}",backward); 89 | let mut f_single_word=0; 90 | let mut b_single_word=0; 91 | let mut tot_fmm=forward.len(); 92 | let mut tot_bmm=backward.len(); 93 | let mut oov_fmm=0; 94 | let mut oov_bmm=0; 95 | let mut score_fmm=0; 96 | let mut score_bmm=0; 97 | if forward==backward{ 98 | return backward; 99 | }else{ 100 | for each in forward.clone(){ 101 | if each.len()==1{ 102 | f_single_word+=1; 103 | } 104 | } 105 | for each in backward.clone(){ 106 | if each.len()==1{ 107 | b_single_word+=1; 108 | } 109 | } 110 | for each in forward.clone(){ 111 | if !words_dict.contains(&each){ 112 | oov_fmm+=1; 113 | } 114 | } 115 | for each in backward.clone(){ 116 | if !words_dict.contains(&each){ 117 | oov_bmm+=1; 118 | } 119 | } 120 | if oov_fmm>oov_bmm{ 121 | score_bmm+=1; 122 | } 123 | if oov_fmmtot_bmm{ 127 | score_bmm+=1; 128 | }else if tot_fmmb_single_word{ 133 | score_bmm+=1; 134 | }else if f_single_wordVec<&str>{ 152 | // let s = "我喜欢吃苹果,也爱打羽毛球"; 153 | let g = str.graphemes(true).collect::>(); 154 | // println!("{:?}",g); 155 | return g; 156 | } 157 | 158 | /// 159 | /// Word Segmentation Based on Backward Maximum Matching 160 | /// 161 | pub fn bmm(sentence:Vec<&str>,dict:Vec)->Vec{ 162 | 163 | let mut list_words:Vec=Vec::new(); 164 | let mut index:i32=sentence.len() as i32; 165 | let window_size:i32=4; 166 | while index>0{ 167 | let mut match_flag=false; 168 | let mut i=window_size.clone(); 169 | // println!("i={i}"); 170 | while i>=0{ 171 | // println!("i={}",i); 172 | let a; 173 | if index-i<0{ 174 | a=0 as usize; 175 | }else{ 176 | a=(index-i) as usize; 177 | } 178 | // let a = (index-i) as usize; 179 | let b =index as usize; 180 | // println!("({},{})",a,b); 181 | let sub_str=sentence[a..b].concat(); 182 | if dict.contains(&sub_str) { 183 | match_flag = 
true; 184 | list_words.push(sub_str); 185 | index -= i; 186 | break; 187 | } 188 | i-=1; 189 | } 190 | if match_flag==false{ 191 | 192 | if index-1<0{ 193 | index=1; 194 | } 195 | let a=(index-1) as usize; 196 | list_words.push(String::from(sentence[a])); 197 | index-=1; 198 | } 199 | } 200 | list_words.reverse(); 201 | return list_words; 202 | 203 | } 204 | 205 | /// 206 | /// Word Segmentation Based on Forward Maximum Matching 207 | /// 208 | pub fn fmm(sentence:Vec<&str>,dict:Vec)->Vec{ 209 | let token_len=sentence.len() as i32; 210 | // println!("token len: {}",token_len); 211 | let mut index:i32=0; 212 | 213 | let mut list_words:Vec=Vec::new(); 214 | let window_size=4; 215 | /* 216 | for char in sentence.chars(){ 217 | println!("{}",char); 218 | } 219 | */ 220 | while index=0{ 224 | //println!("i={}",i); 225 | 226 | let a=index as usize; 227 | let mut b=(index+i) as usize; 228 | // println!("({},{})",a,b); 229 | if b>(token_len) as usize{ 230 | b=token_len as usize; 231 | } 232 | let sub_str=sentence[a..b].concat(); 233 | //println!("sub_str: {}",sub_str); 234 | if dict.contains(&sub_str){ 235 | match_flag=true; 236 | list_words.push(sub_str); 237 | index+=i; 238 | break; 239 | } 240 | //println!(); 241 | i-=1; 242 | } 243 | if match_flag==false{ 244 | let a=index as usize; 245 | let v=String::from(sentence[a]); 246 | list_words.push(v); 247 | index+=1; 248 | } 249 | 250 | } 251 | return list_words; 252 | } 253 | 254 | use std::io::{self, BufRead}; 255 | use std::fs::{File, read_dir}; 256 | use std::io::prelude::*; 257 | use std::path::Path; 258 | 259 | fn _read_lines
<P>
(filename: P) -> io::Result>> 260 | where P: AsRef, { 261 | let file = File::open(filename)?; 262 | Ok(io::BufReader::new(file).lines()) 263 | } 264 | 265 | /// 266 | /// Read a list of lines from a file 267 | /// 268 | pub fn load_dictionary(filepath:&str)->Vec{ 269 | // The output is wrapped in a Result to allow matching on errors 270 | // Returns an Iterator to the Reader of the lines of the file. 271 | 272 | let mut strings=Vec::new(); 273 | 274 | if let Ok(lines) = _read_lines(filepath) { 275 | // Consumes the iterator, returns an (Optional) String 276 | for line in lines { 277 | if let Ok(line) = line { 278 | // println!("{}", ip); 279 | //let word=line.replace("\n","").trim(); 280 | strings.push(String::from(line.replace("\n","").trim())); 281 | } 282 | } 283 | } 284 | strings 285 | } 286 | 287 | 288 | 289 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/summarizer.rs: -------------------------------------------------------------------------------- 1 | // Ref: https://towardsdatascience.com/a-simple-text-summarizer-written-in-rust-4df05f9327a5 2 | // From Author: Charles Chan 3 | #[cfg(test)] 4 | mod tests{ 5 | use crate::native::summarizer::*; 6 | # [test] 7 | fn test_summarize(){ 8 | let text="As of Sunday, there were more than 58.2 million reported cases of COVID-19 worldwide, with more than 37.2 million of those cases listed as recovered, according to a COVID-19 tracking tool maintained by Johns Hopkins University. The global death toll stood at more than 1.3 million. In Asia, the daily tally of reported cases in Japan hit a record for the fourth day in a row, with 2,508 people confirmed infected, the Health Ministry said Sunday. A flurry of criticism has erupted, from opposition legislators and the public, slamming the government as having acted too slowly in halting its \"GoTo\" campaign, which encouraged travel and dining out with discounts. In Europe, French authorities ordered the culling of all minks at a farm after analysis showed a mutated version of the coronavirus was circulating among the animals. The move follows virus developments in mink farms in Denmark and other countries, including the Netherlands, Sweden and Greece. In the Americas, Chile says it will open its main border crossing and principal airport to foreign visitors on Monday after an eight-month pandemic shutdown. Arrivals will have to present evidence of a recent negative test for the novel coronavirus, as well as health insurance. They'll also have to report their whereabouts and health status for a two-week watch period. Those coming from high-risk countries will have to quarantine for 14 days. In Africa, Sudan's minister of cabinet affairs on Sunday tested positive for the coronavirus, the prime minister's office said, the latest in a string of senior officials to be infected as the country shows an increase of confirmed cases of COVID-19. 
Over the past month, acting ministers of finance and health, the central bank governor and two associates to Prime Minister Abdalla Hamdok have tested positive."; 9 | let stopwords=&[]; 10 | let summarized_text=summarize(text,stopwords,5); 11 | println!("{}",summarized_text); 12 | } 13 | } 14 | 15 | use unicode_segmentation::UnicodeSegmentation; 16 | use std::collections::BTreeSet; 17 | use ndarray::{Array1, Array2}; 18 | 19 | /// 20 | /// Summarize text 21 | /// 22 | pub fn summarize(text: &str, stop_words: &[&str], num_sentence: usize) -> String { 23 | let sentences = text.unicode_sentences().collect::>(); 24 | if num_sentence >= sentences.len() { 25 | return text.to_string(); 26 | } 27 | let mut sentences_and_words = vec![]; 28 | sentences.iter().for_each(|&sentence| { 29 | let words = split_into_words(sentence); 30 | sentences_and_words.push(words); 31 | }); 32 | let matrix = build_similarity_matrix(&sentences_and_words, stop_words); 33 | let ranks = calculate_sentence_rank(&matrix); 34 | let mut sorted_ranks = ranks.clone(); 35 | sorted_ranks.sort_by(|a, b| b.partial_cmp(a).unwrap()); 36 | let least_rank = sorted_ranks[num_sentence + 1]; 37 | let mut result: Vec<&str> = vec![]; 38 | let mut included_count = 0; 39 | for i in 0..sentences.len() { 40 | if ranks[i] >= least_rank { 41 | included_count = included_count + 1; 42 | result.push(sentences[i]); 43 | } 44 | if included_count == num_sentence { 45 | break; 46 | } 47 | } 48 | result.join("") 49 | } 50 | 51 | fn get_all_words_lc<'a>(sentence1: &[&'a str], sentence2: &[&'a str]) -> BTreeSet { 52 | let mut all_words: BTreeSet = BTreeSet::new(); 53 | 54 | sentence1.iter().for_each(|w| { 55 | all_words.insert(w.to_lowercase()); 56 | }); 57 | 58 | sentence2.iter().for_each(|w| { 59 | all_words.insert(w.to_lowercase()); 60 | }); 61 | return all_words; 62 | } 63 | 64 | /// 65 | /// Retrieve a sentence vector based on the frequency of words that appears in the all_words_lc set. 
66 | /// all_words_lc should be a sorted set of lower cased words 67 | /// The size of the resulting vector is the same as the all_words_lc set 68 | /// stop_words are skipped 69 | /// 70 | fn get_sentence_vector(sentence: &[&str], all_words_lc: &BTreeSet, stop_words: &[&str]) -> Vec { 71 | let mut vector: Vec = vec![0; all_words_lc.len()]; 72 | for word in sentence { 73 | let word_lc = word.to_lowercase(); 74 | if !stop_words.contains(&word_lc.as_str()) { 75 | let index = all_words_lc.iter().position(|x| x.eq(&word_lc)).unwrap(); 76 | vector[index] += 1; 77 | } 78 | } 79 | return vector; 80 | } 81 | 82 | /// 83 | /// Calculates the cosine distance between two vectors 84 | /// Refer to [YouTube](https://www.youtube.com/watch?v=3X0wLRwU_Ws) 85 | /// 86 | fn cosine_distance(vec1: &Vec, vec2: &Vec) -> f64 { 87 | let dot_product = dot_product(vec1, vec2); 88 | let root_sum_square1 = root_sum_square(vec1); 89 | let root_sum_square2 = root_sum_square(vec2); 90 | return dot_product as f64 / (root_sum_square1 * root_sum_square2); 91 | } 92 | 93 | fn root_sum_square(vec: &Vec) -> f64 { 94 | let mut sum_square = 0; 95 | for i in 0..vec.len() { 96 | sum_square += vec[i] * vec[i]; 97 | } 98 | (sum_square as f64).sqrt() 99 | } 100 | 101 | fn dot_product(vec1: &Vec, vec2: &Vec) -> usize { 102 | let delta = vec1.len() - vec2.len(); 103 | let shortest_vec = match delta { 104 | d if d < 0 => vec1, 105 | d if d > 0 => vec2, 106 | _ => vec1 107 | }; 108 | let mut dot_product = 0; 109 | for i in 0..shortest_vec.len() { 110 | dot_product += vec1[i] * vec2[i]; 111 | } 112 | dot_product 113 | } 114 | 115 | fn sentence_similarity(s1: &[&str], s2: &[&str], stop_words: &[&str]) -> f64 { 116 | let all_words = get_all_words_lc(s1, s2); 117 | let v1 = get_sentence_vector(s1, &all_words, stop_words); 118 | let v2 = get_sentence_vector(s2, &all_words, stop_words); 119 | 1.0 - cosine_distance(&v1, &v2) 120 | } 121 | 122 | /// 123 | /// Calculate a similarity matrix for the given sentences. 124 | /// Returns a 2-D array M_i,j such that for all 'j', sum(i, M_i,j) = 1 125 | /// We take a leap of faith here and assume that cosine similarity is similar to the probability 126 | /// that a sentence is important for summarization 127 | /// 128 | fn build_similarity_matrix(sentences: &Vec>, stop_words: &[&str]) -> Array2 { 129 | let len = sentences.len(); 130 | let mut matrix = Array2::::zeros((len, len)); 131 | let mut sum_column: Vec = vec![0.0; len]; 132 | for i in 0..len { 133 | for j in 0..len { 134 | if i == j { 135 | continue; 136 | } 137 | matrix[[i, j]] = sentence_similarity(sentences[i].as_slice(), sentences[j].as_slice(), stop_words); 138 | } 139 | } 140 | // at this point we have the cosine similarity of each sentence. 141 | // take a leap of faith and assume that the cosine similarity is the probability that a sentence 142 | // is important for summarization. 143 | // We do this by normalizing the matrix along the column. The column values should add up to 1. 144 | for j in 0..len { 145 | let mut sum: f64 = 0.0; 146 | for i in 0..len { 147 | if i == j { 148 | continue; 149 | } 150 | sum += matrix[[i, j]]; 151 | } 152 | sum_column[j] = sum; 153 | } 154 | for i in 0..len { 155 | for j in 0..len { 156 | if i == j { 157 | continue; 158 | } 159 | matrix[[i, j]] = matrix[[i, j]] / sum_column[j]; 160 | } 161 | } 162 | matrix 163 | } 164 | 165 | /// 166 | /// Calculate a sentence rank similar to a page rank. 167 | /// Please refer to [PageRank](https://en.wikipedia.org/wiki/PageRank) for more details. 
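///
/// In outline (a sketch of the iteration implemented below, not extra API):
/// start from a uniform rank vector R with entries 1/N, then repeatedly update
///     R = d * (M dot R) + (1 - d) / N
/// where M is the column-normalized similarity matrix, d = 0.85 is the damping
/// factor and N is the number of sentences; iteration stops once every
/// component of R changes by less than the 0.001 convergence threshold.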
168 | /// 169 | fn calculate_sentence_rank(similarity_matrix: &Array2) -> Vec { 170 | let num_sentence = similarity_matrix.shape()[1]; 171 | let threshold = 0.001; 172 | // Initialize a vector with the same value 1/number of sentences. Uniformly distributed across 173 | // all sentences. NOTE: perhaps we can make some sentences more important than the rest? 174 | let initial_vector: Vec = vec![1.0 / num_sentence as f64; num_sentence]; 175 | let mut result = Array1::from(initial_vector); 176 | let mut prev_result = result.clone(); 177 | let damping_factor = 0.85; 178 | let initial_m = damping_factor * similarity_matrix + (1.0 - damping_factor) / num_sentence as f64; 179 | loop { 180 | result = initial_m.dot(&result); 181 | let delta = &result - &prev_result; 182 | let mut converged = true; 183 | for i in 0..delta.len() { 184 | if delta[i] > threshold { 185 | converged = false; 186 | break; 187 | } 188 | } 189 | if converged { 190 | break; 191 | } 192 | prev_result = result.clone(); 193 | } 194 | result.into_raw_vec() 195 | } 196 | 197 | fn split_into_words(sentence: &str) -> Vec<&str> { 198 | let mut result = vec![]; 199 | let words = sentence.unicode_words(); 200 | for word in words { 201 | result.push(word); 202 | } 203 | result 204 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/text.rs: -------------------------------------------------------------------------------- 1 | use std::collections::btree_map::Entry::*; 2 | use std::collections::BTreeMap; 3 | 4 | /// 5 | /// 6 | /// Credit: https://github.com/LazyEmpiricist/text_analysis/blob/main/src/lib.rs 7 | /// 8 | 9 | #[cfg(test)] 10 | mod tests { 11 | use crate::native::text::*; 12 | use std::collections::HashMap; 13 | 14 | #[test] 15 | fn test_text_if_english() { 16 | let text = "I like you!"; 17 | println!("{}", similar_with_english(text)) 18 | } 19 | 20 | #[test] 21 | fn test_trim_to_words() { 22 | let words = "(_test] {test2!=".to_string(); 23 | let trimmed = trim_to_words(words); 24 | println!("{:?}", trimmed) 25 | } 26 | 27 | #[test] 28 | fn test_count_words() { 29 | let words = vec![ 30 | "one".to_string(), 31 | "two".to_string(), 32 | "two".to_string(), 33 | "three".to_string(), 34 | "three".to_string(), 35 | "three".to_string(), 36 | ]; 37 | let counted = count_words(&words); 38 | let mut words_map = HashMap::new(); 39 | words_map.insert("one".to_string(), 1 as u32); 40 | words_map.insert("two".to_string(), 2 as u32); 41 | words_map.insert("three".to_string(), 3 as u32); 42 | println!("{:?}",words_map); 43 | assert_eq!(counted, words_map); 44 | } 45 | } 46 | 47 | use std::collections::HashMap; 48 | 49 | /// Sort words in HashMap according to frequency into Vec. 
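///
/// Illustrative example (the words and counts are arbitrary test values):
///
/// let mut frequency = HashMap::new();
/// frequency.insert("rust".to_string(), 3u32);
/// frequency.insert("nlp".to_string(), 1u32);
/// let sorted = sort_map_to_vec(frequency);
/// assert_eq!(sorted, vec![("rust".to_string(), 3), ("nlp".to_string(), 1)]);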
50 | pub fn sort_map_to_vec( 51 | frequency: HashMap, 52 | ) -> std::vec::Vec<(std::string::String, u32)> { 53 | let mut vec_sorted: Vec<(String, u32)> = frequency.into_iter().collect(); 54 | vec_sorted.sort_by(|a, b| b.1.cmp(&a.1)); 55 | vec_sorted 56 | } 57 | 58 | /// Get mininum index and guarantee that index is alway >=0 59 | pub fn get_index_min(index: &usize) -> usize { 60 | if *index as isize - 5 < 0 { 61 | //check if index -5 would result in negative number, return 0 in case 62 | 0 63 | } else { 64 | //if index-5 > 0, return index-5 65 | index - 5 66 | } 67 | } 68 | 69 | // Get maximum index and garantee that index does not exeed total length of Vec 70 | pub fn get_index_max(index: &usize, max_len: &usize) -> usize { 71 | if index + 5 > *max_len { 72 | *max_len as usize 73 | } else { 74 | index + 5 75 | } 76 | } 77 | 78 | 79 | 80 | 81 | 82 | 83 | /// 84 | /// 85 | /// 86 | pub fn count_words(words: &[String]) -> std::collections::HashMap { 87 | let mut frequency: HashMap = HashMap::new(); 88 | for word in words { 89 | //ignore words constiting of only one char? 90 | //if word.len() > 1 { 91 | *frequency.entry(word.to_owned()).or_insert(0) += 1; 92 | //} 93 | } 94 | frequency 95 | } 96 | 97 | /// Uses the Bhattacharyya coefficient to determine if text is likely to be English. 98 | /// 99 | /// Higher is better. 100 | pub fn similar_with_english(text: &str) -> f64 { 101 | // count of the number of times a character occurs in the given text 102 | let mut count: BTreeMap = BTreeMap::new(); 103 | for letter in text.chars() { 104 | // println!("k = {}",char::to_uppercase(letter).to_string()); 105 | 106 | //let k=char::to_uppercase(letter)[0] as char; 107 | 108 | let k=letter; 109 | let k=letter.to_uppercase().collect::>()[0]; 110 | match count.entry(k) { 111 | Vacant(entry) => { entry.insert(1f64); }, 112 | Occupied(mut entry) => *entry.get_mut() += 1f64, 113 | } 114 | } 115 | 116 | // total number of characters in the given text 117 | let total = text.len() as f64; 118 | 119 | // relative frequency of letters in the English language 120 | let mut english_frequencies: BTreeMap = BTreeMap::new(); 121 | english_frequencies.insert('A', 0.0651738); 122 | english_frequencies.insert('B', 0.0124248); 123 | english_frequencies.insert('C', 0.0217339); 124 | english_frequencies.insert('D', 0.0349835); 125 | english_frequencies.insert('E', 0.1041442); 126 | english_frequencies.insert('F', 0.0197881); 127 | english_frequencies.insert('G', 0.0158610); 128 | english_frequencies.insert('H', 0.0492888); 129 | english_frequencies.insert('I', 0.0558094); 130 | english_frequencies.insert('J', 0.0009033); 131 | english_frequencies.insert('K', 0.0050529); 132 | english_frequencies.insert('L', 0.0331490); 133 | english_frequencies.insert('M', 0.0202124); 134 | english_frequencies.insert('N', 0.0564513); 135 | english_frequencies.insert('O', 0.0596302); 136 | english_frequencies.insert('P', 0.0137645); 137 | english_frequencies.insert('Q', 0.0008606); 138 | english_frequencies.insert('R', 0.0497563); 139 | english_frequencies.insert('S', 0.0515760); 140 | english_frequencies.insert('T', 0.0729357); 141 | english_frequencies.insert('U', 0.0225134); 142 | english_frequencies.insert('V', 0.0082903); 143 | english_frequencies.insert('W', 0.0171272); 144 | english_frequencies.insert('X', 0.0013692); 145 | english_frequencies.insert('Y', 0.0145984); 146 | english_frequencies.insert('Z', 0.0007836); 147 | english_frequencies.insert(' ', 0.1918182); 148 | 149 | // update the counts to be the relative frequency of 
letters in the given text 150 | // and then calculate the Bhattacharyya coefficient as our score 151 | let mut score = 0.0; 152 | for letter in english_frequencies.keys() { 153 | match count.entry(*letter) { 154 | Vacant(entry) => { entry.insert(0.0); }, 155 | Occupied(mut entry) => *entry.get_mut() /= total, 156 | } 157 | let partition_overlap = count[&*letter] * english_frequencies[&*letter]; 158 | score += partition_overlap.sqrt(); 159 | } 160 | 161 | score 162 | } 163 | 164 | 165 | 166 | /// 167 | /// 168 | /// Credits: Splits String at whitespaces and removes chars like , or ?. Change the relevant line to remove or add chars from provided String. 169 | /// 170 | pub fn trim_to_words(content: String) -> std::vec::Vec { 171 | let content: Vec = content 172 | .to_lowercase() 173 | .replace(&['-'][..], " ") 174 | //should 's be replaced? 175 | .replace("'s", "") 176 | .replace( 177 | &[ 178 | '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '-', '_', '/', '\'', 179 | '’', '?', '!', '“', '‘', 180 | ][..], 181 | "", 182 | ) 183 | .split_whitespace() 184 | .map(String::from) 185 | .collect::>(); 186 | content 187 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/token.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::prelude::*; 3 | use crate::native::toksiter::*; 4 | use crate::native::chardata::*; 5 | 6 | #[cfg(test)] 7 | mod tests { 8 | use std::borrow::Borrow; 9 | use crate::native::word2vec::*; 10 | use crate::native::toksiter::*; 11 | use crate::native::chardata::*; 12 | use crate::native::token::get_token_list; 13 | 14 | # [test] 15 | fn token_analyze(){ 16 | let mut s="hello world!"; 17 | 18 | let mut chs = s.chars(); 19 | let mut chds = CharDataIter::new(&mut chs); 20 | let mut toks = TokenIter::new(&mut chds); 21 | 22 | // Run the tokenizer, dump debug info for each token: 23 | loop { 24 | match toks.next() { 25 | Some(tok) => { println!("{:?}", tok) }, 26 | None => { println!(""); break; } 27 | } 28 | } 29 | } 30 | 31 | # [test] 32 | fn test_get_token_list(){ 33 | let s="Hello, Rust. 
How are you?"; 34 | let result=get_token_list(s); 35 | for r in result{ 36 | println!("{}\t{:?}",r.text,r); 37 | } 38 | } 39 | 40 | } 41 | 42 | pub fn get_token_list(s:&str)->Vec{ 43 | 44 | let mut chs = s.chars(); 45 | let mut chds = CharDataIter::new(&mut chs); 46 | let mut toks = TokenIter::new(&mut chds); 47 | let mut list_token:Vec=Vec::new(); 48 | // Run the tokenizer, dump debug info for each token: 49 | loop { 50 | match toks.next() { 51 | Some(tok) => { 52 | println!("{:?}", tok); 53 | list_token.push(tok); 54 | }, 55 | None => { println!(""); break; } 56 | } 57 | } 58 | list_token 59 | 60 | } 61 | 62 | 63 | 64 | fn main() { 65 | // Get stdin into a string 66 | let stdin = io::stdin(); 67 | let mut s = String::new(); 68 | stdin.lock().read_to_string(&mut s).unwrap(); 69 | println!("{}", s); 70 | 71 | // Construct a tokenizer by adapting some more primitive iterators 72 | let mut chs = s.chars(); 73 | let mut chds = CharDataIter::new(&mut chs); 74 | let mut toks = TokenIter::new(&mut chds); 75 | 76 | // Run the tokenizer, dump debug info for each token: 77 | loop { 78 | match toks.next() { 79 | Some(tok) => { println!("{:?}", tok) }, 80 | None => { println!(""); break; } 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/toksiter.rs: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////// 2 | // TokenIter 3 | // 4 | 5 | use crate::native::chardata; 6 | 7 | static IN_TOKEN: u8 = 1; 8 | static BTWN_TOKS: u8 = 0; 9 | 10 | /* TODO: Probably there should not be a String member here. 11 | We should either borrow a string slice from the original text, 12 | or else leave it out, and provide some other facility for 13 | converting Tokens to Strings, given the underlying string. 14 | In a full fledged parser, the parsed-document representation 15 | would handle that. 16 | */ 17 | #[derive(Debug)] 18 | pub struct Token { 19 | pub text: String, 20 | pub byte_offsets: (usize, usize), 21 | pub char_offsets: (usize, usize), 22 | pub token_offset: usize 23 | } 24 | 25 | impl Token { 26 | fn new() -> Token { 27 | Token { 28 | text: "".to_string(), 29 | byte_offsets: (0, 0), 30 | char_offsets: (0, 0), 31 | token_offset: 0, 32 | } 33 | } 34 | } 35 | 36 | pub struct TokenIter<'a> { 37 | chdat_stream: &'a mut chardata::CharDataIter<'a>, 38 | curr_tok_offset: usize, 39 | state: u8, 40 | } 41 | 42 | impl<'a> TokenIter<'a> { 43 | pub fn new(chdats: &'a mut chardata::CharDataIter<'a>) -> Self { 44 | TokenIter { 45 | chdat_stream: chdats, 46 | curr_tok_offset: 0, 47 | state: BTWN_TOKS, 48 | } 49 | } 50 | 51 | fn is_boundary_char(ch: char) -> bool { 52 | if ch == chardata::END_OF_STRING { 53 | true 54 | } else if ch.is_whitespace() { 55 | true 56 | } else { 57 | false 58 | } 59 | } 60 | } 61 | 62 | /* Always start out BTWN_TOKS, and therefore always end in BTWN_TOKS. 63 | Start by skipping characters until state changes to IN_TOKEN. 64 | Then (1) set the token start offsets; (2) march the char data iter forward 65 | until state changes to BTWN_TOKS, then fix the end offsets of the token 66 | under construction. Update the current token offset. 67 | Leave the resulting Token as the return value of next(). 
68 | If the underlying chardata::CharDataIter yields END_OF_SENTENCE: 69 | IN_TOKEN --> ship the current token 70 | BTWN_TOKS --> return None 71 | In the first case, the next call to next() will immediately trigger 72 | the second case. 73 | */ 74 | 75 | impl<'a> Iterator for TokenIter<'a> { 76 | type Item = Token; 77 | 78 | fn next(&mut self) -> Option { 79 | assert_eq!(self.state, BTWN_TOKS); 80 | let mut curr_tok = Token::new(); 81 | loop { 82 | match self.chdat_stream.next() { 83 | 84 | Some( chardata::CharData {ch, byte_offset, char_offset} ) => { 85 | 86 | if TokenIter::is_boundary_char(ch) { 87 | if self.state == IN_TOKEN { 88 | // ship token 89 | curr_tok.byte_offsets.1 = byte_offset; 90 | curr_tok.char_offsets.1 = char_offset; 91 | self.state = BTWN_TOKS; 92 | self.curr_tok_offset += 1; 93 | return Some(curr_tok); 94 | } 95 | // else do nothing -- skip boundary chars 96 | } else { 97 | if self.state == BTWN_TOKS { 98 | // start token 99 | curr_tok.token_offset = self.curr_tok_offset; 100 | curr_tok.byte_offsets.0 = byte_offset; 101 | curr_tok.char_offsets.0 = char_offset; 102 | self.state = IN_TOKEN; 103 | } 104 | // Accumulate characters 105 | curr_tok.text.push(ch); 106 | curr_tok.byte_offsets.1 = byte_offset; 107 | curr_tok.char_offsets.1 = char_offset; 108 | } 109 | }, 110 | 111 | None => { 112 | // May need to ship a token here! 113 | if self.state == IN_TOKEN { 114 | self.state = BTWN_TOKS; 115 | return Some(curr_tok); 116 | } 117 | return None; 118 | } 119 | } 120 | } 121 | } 122 | } 123 | // 124 | // TokenIter 125 | ///////////////////////////////////////////////////////// -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/native/word2vec.rs: -------------------------------------------------------------------------------- 1 | extern crate word2vec; 2 | 3 | use word2vec::wordclusters::WordClusters; 4 | use word2vec::wordvectors::WordVector; 5 | 6 | #[cfg(test)] 7 | mod tests{ 8 | use std::borrow::Borrow; 9 | use crate::native::word2vec::*; 10 | 11 | # [test] 12 | fn test_word2vec(){ 13 | //this is an issue to fix 14 | let clusters=wv_clusters_create("D:\\UIBEResearch\\classes.txt"); 15 | let index=wv_get_cluster_from_clusters(clusters,"problem"); 16 | println!("index = {}",index); 17 | } 18 | 19 | # [test] 20 | fn test_open_wv_bin(){ 21 | let wv_model=wv_get_model("D:\\UIBEResearch\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin"); 22 | let positive = vec!["woman", "king"]; 23 | let negative = vec!["man"]; 24 | println!("analogy: {:?}", wv_analogy(&wv_model,positive, negative, 10)); 25 | println!("cosine: {:?}", wv_cosine(&wv_model,"man", 10)); 26 | } 27 | 28 | # [test] 29 | fn test_origin(){ 30 | /* 31 | let model = word2vec::wordvectors::WordVector::load_from_binary( 32 | 33 | "D:\\UIBEResearch\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin").expect("Unable to load word vector model"); 34 | println!("{:?}", model.cosine("snow", 10)); 35 | let positive = vec!["woman", "king"]; 36 | let negative = vec!["man"]; 37 | println!("{:?}", model.analogy(positive, negative, 10)); 38 | */ 39 | 40 | 41 | let clusters = word2vec::wordclusters::WordClusters::load_from_file( 42 | "D:\\UIBEResearch\\classes1.txt").expect("Unable to load word clusters"); 43 | println!("{:?}", clusters.get_cluster("belarus")); 44 | println!("{:?}", clusters.get_words_on_cluster(6)); 45 | 46 | 47 | } 48 | 49 | } 50 | 51 | pub fn wv_get_model(bin_path:&str)->WordVector{ 52 | let model = 
word2vec::wordvectors::WordVector::load_from_binary( 53 | bin_path).expect("Unable to load word vector model"); 54 | return model 55 | } 56 | 57 | 58 | /// 59 | /// let model = word2vec::wordvectors::WordVector::load_from_binary( 60 | /// "vectors.bin").expect("Unable to load word vector model"); 61 | /// println!("{:?}", model.cosine("snow", 10)); 62 | /// 63 | /// 64 | pub fn wv_cosine(model:&WordVector,word:&str,n:usize)->Vec<(String,f32)>{ 65 | 66 | let ret=model.cosine(word,n); 67 | match ret { 68 | Some(r)=>{ 69 | r 70 | }, 71 | None=>{ 72 | Vec::new() 73 | }, 74 | } 75 | } 76 | 77 | /// 78 | /// let positive = vec!["woman", "king"]; 79 | /// let negative = vec!["man"]; 80 | /// println!("{:?}", model.analogy(positive, negative, 10)); 81 | /// 82 | pub fn wv_analogy(model:&WordVector, positive:Vec<&str>,negative:Vec<&str>,n:usize)->Vec<(String,f32)>{ 83 | let re=model.analogy(positive, negative, n); 84 | // println!("{:?}",re ); 85 | match re{ 86 | Some(v)=>v, 87 | None=>{ 88 | eprintln!("error"); 89 | Vec::new() 90 | } 91 | } 92 | } 93 | 94 | /// 95 | /// 96 | /// let clusters = word2vec::wordclusters::WordClusters::load_from_file( 97 | /// "classes.txt").expect("Unable to load word clusters"); 98 | /// println!("{:?}", clusters.get_cluster("belarus")); 99 | /// println!("{:?}", clusters.get_words_on_cluster(6)); 100 | /// 101 | /// 102 | pub fn wv_clusters_create(filepath:&str)->WordClusters{ 103 | let clusters = word2vec::wordclusters::WordClusters::load_from_file( 104 | filepath).expect("Unable to load word clusters"); 105 | return clusters; 106 | } 107 | 108 | /// 109 | /// println!("{:?}", clusters.get_words_on_cluster(6)); 110 | /// 111 | pub fn wv_get_cluster_from_clusters(clusters:WordClusters,word:&str)->i32{ 112 | match clusters.get_cluster(word){ 113 | Some(&v)=>{ 114 | v 115 | }, 116 | None=>{ 117 | println!("error"); 118 | -1 119 | } 120 | } 121 | } 122 | 123 | /// 124 | /// println!("{:?}", clusters.get_cluster("belarus")); 125 | /// 126 | pub fn wv_get_cluster_string(clusters:WordClusters,index:i32)->Vec{ 127 | match clusters.get_words_on_cluster(index){ 128 | Some(v)=>{ 129 | v.clone() 130 | }, 131 | None=>{ 132 | println!("error"); 133 | Vec::new() 134 | } 135 | } 136 | } 137 | 138 | 139 | fn main(){ 140 | 141 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/src/wordnet.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use pyo3::prelude::*; 3 | /// 4 | /// Estimate the similarity between twn synsets based on WordNet (pip install semantic-kit) required 5 | /// 6 | pub fn wordnet_similarity(s1:&str,s2:&str)->HashMap{ 7 | match _wordnet_similarity(s1,s2){ 8 | Ok(sims)=>sims, 9 | Err(e)=>{ 10 | eprintln!("{:?}",e); 11 | HashMap::new() 12 | } 13 | } 14 | } 15 | 16 | fn _wordnet_similarity(s1:&str,s2:&str)-> PyResult> { 17 | Python::with_gil(|py| { 18 | let semantickit = PyModule::import(py, "semantickit.similarity.wordnet_similarity")?; 19 | let sim: HashMap = semantickit.getattr("wordnet_similarity_all")?. 
20 | call1((s1,s2))?.extract()?; 21 | // println!("Result: {:?}",sim); 22 | Ok(sim) 23 | }) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/tests/3rdparty_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | 4 | use rsnltk::wordnet::wordnet_similarity; 5 | use rsnltk::api::natural::*; 6 | use rsnltk::api::whatlang::*; 7 | use rsnltk::api::yn::*; 8 | 9 | # [test] 10 | fn test_distance(){ 11 | println!("lev = {}",lev_dist("kitten", "sitting")); 12 | println!("winkler = {}",jw_dist("dixon", "dicksonx")); 13 | } 14 | 15 | # [test] 16 | fn test_whatlang(){ 17 | let text = "Ĉu vi ne volas eklerni Esperanton? Bonvolu! Estas unu de la plej bonaj aferoj!"; 18 | let ret=whatlang(text); 19 | println!("{:?}",ret); 20 | } 21 | 22 | # [test] 23 | fn test_yes(){ 24 | let s="yes"; 25 | println!("{:?}",yes(s)); 26 | 27 | println!("{:?}",is_somewhat_yes("this has a y so it is the word")); 28 | 29 | println!("{:?}",is_kinda_yes("very much so")); 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/tests/nlpsvc_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use rsnltk::native::nlpsvc::annotated_document::*; 4 | use rsnltk::native::nlpsvc::english_rules::EnglishTokenizer; 5 | use rsnltk::native::nlpsvc::regex_tokenizer::RegexTokenizer; 6 | 7 | // Tokenize the English text 8 | # [test] 9 | fn get_token_pos_list(){ 10 | let text="A Rust library to support natural language processing!"; 11 | let mut tokenizer = EnglishTokenizer::new(); // compile regex patterns 12 | let mut doc = AnnotatedDocument::new(text); 13 | tokenizer.apply_to(&mut doc); 14 | println!("Result is: "); 15 | let mut cursor = doc.get_trees().first(); 16 | while cursor.is_valid() { 17 | print_label(&cursor, &doc); 18 | cursor.next(); 19 | } 20 | } 21 | 22 | use rsnltk::native::nlpsvc::annotated_document::*; 23 | fn print_label(cursor: &TreeCursor, doc: &AnnotatedDocument) { 24 | let label = cursor.get().unwrap(); 25 | let span = label.get_span().unwrap(); 26 | println!("({:>02}, {:>02}) [{}]", span.0, span.1, 27 | &doc.get_text()[span.0..span.1]); 28 | } 29 | 30 | // Manually set token information 31 | #[test] 32 | fn push_tokens_and_traverse() { 33 | // Fake tokenizer 34 | let mut doc = AnnotatedDocument::new("01 Hello!"); 35 | let mut lbl0 = NodeLabel::new(); 36 | lbl0.set_span(0, 2) 37 | .set_sym_val("toktype", "NUMBER"); 38 | doc.get_trees_mut().push_back(lbl0); 39 | let mut lbl1 = NodeLabel::new(); 40 | lbl1.set_span(3, 8) 41 | .set_sym_val("toktype", "WORD"); 42 | doc.get_trees_mut().push_back(lbl1); 43 | let mut lbl2 = NodeLabel::new(); 44 | lbl2.set_span(8, 9) 45 | .set_sym_val("toktype", "PUNCT"); 46 | doc.get_trees_mut().push_back(lbl2); 47 | 48 | // Traverse (and print) 49 | let mut cursor = doc.get_trees().first(); 50 | while cursor.is_valid() { 51 | print_label(&cursor, &doc); 52 | cursor.next(); 53 | } 54 | } 55 | 56 | #[test] 57 | fn test_chunking() { 58 | let txt = "aa bb cc dd ee ff"; 59 | let mut doc = AnnotatedDocument::new(txt); 60 | // split by whitespace and iterate 61 | for (i, _) in txt.split_whitespace().enumerate() { 62 | let b = i * 3; 63 | let e = b + 2; 64 | let mut lbl = NodeLabel::new(); 65 | lbl.set_span(b, e) 66 | .set_sym_val("toktype", "WORD"); 67 | doc.get_trees_mut().push_back(lbl); 68 | } 69 | 70 | { 
71 | println!("===================="); 72 | let mut cursor = doc.get_trees().first(); 73 | while cursor.is_valid() { 74 | print_label(&cursor, &doc); 75 | cursor.next(); 76 | } 77 | println!("===================="); 78 | } 79 | 80 | let (first_child, last_child) = fake_parse(&doc); 81 | let mut label = NodeLabel::new(); 82 | label.set_sym_val("cat", "cc_ee"); 83 | doc.get_trees_mut().chunk(label, first_child, last_child); 84 | doc.get_trees().print(); 85 | } 86 | 87 | fn fake_parse(doc: &AnnotatedDocument) -> (CursorMemo, CursorMemo) { 88 | let mut cursor = doc.get_trees().first(); // reset cursor 89 | cursor.next(); 90 | cursor.next(); 91 | // cursor should now be sitting on [cc] 92 | print_label(&cursor, &doc); 93 | let first_child = cursor.to_memo(); 94 | cursor.next(); 95 | cursor.next(); 96 | cursor.next(); 97 | // cursor should now be sitting on [ff] 98 | let last_child = cursor.to_memo(); 99 | (first_child, last_child) 100 | } 101 | 102 | } 103 | 104 | -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/tests/segmentation_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests{ 3 | use rsnltk::native::segmentation::*; 4 | extern crate unicode_segmentation; 5 | use unicode_segmentation::UnicodeSegmentation; 6 | # [test] 7 | fn test_utf8(){ 8 | let s = "我喜欢吃苹果,也爱打羽毛球"; 9 | let g = s.graphemes(true).collect::>(); 10 | println!("{:?}",g); 11 | } 12 | 13 | # [test] 14 | fn test_bmm(){ 15 | let sss=String::from("我喜欢吃苹果,也爱打羽毛球"); 16 | let sentence = sss.graphemes(true).collect::>(); 17 | 18 | let dict=vec!["我".to_string(),"喜欢".to_string(),"苹果".to_string(),"羽毛球".to_string(),"爱".to_string()]; 19 | 20 | let results=bmm(sentence,dict); 21 | 22 | println!("{:?}",results); 23 | } 24 | 25 | # [test] 26 | fn test_fmm(){ 27 | let sss=String::from("我喜欢吃苹果,也爱打羽毛球"); 28 | let sentence = sss.graphemes(true).collect::>(); 29 | 30 | let dict=vec!["我".to_string(),"喜欢".to_string(),"苹果".to_string(),"羽毛球".to_string(),"爱".to_string()]; 31 | 32 | let result=fmm(sentence,dict); 33 | 34 | println!("{:?}",result) 35 | 36 | } 37 | 38 | # [test] 39 | fn test_bimm(){ 40 | let sss=String::from("我喜欢吃苹果,也爱打羽毛球"); 41 | let sentence = sss.graphemes(true).collect::>(); 42 | 43 | let dict=vec!["我".to_string(),"喜欢".to_string(),"苹果".to_string(),"羽毛球".to_string(),"爱".to_string()]; 44 | 45 | let result=bimm(sentence,dict); 46 | 47 | println!("{:?}",result) 48 | } 49 | 50 | # [test] 51 | fn test_real_word_segmentation(){ 52 | let dict_path="D:\\GitHub\\rsnltk\\experiments\\rsnltk-experiment\\examples\\data\\dicts\\30wdict.txt"; 53 | let stop_path="D:\\GitHub\\rsnltk\\experiments\\rsnltk-experiment\\examples\\data\\dicts\\stopwords\\baidu_stopwords.txt"; 54 | 55 | // let dict_path=""; 56 | // let stop_path=""; 57 | 58 | let _sentence="美国太空总署希望,在深海的探险发现将有助于解开一些外太空的秘密,同时也可以测试前往太阳系其他星球探险所需的一些设备和实验。"; 59 | let meaningful_words=get_segmentation(_sentence,dict_path,stop_path, ""); 60 | 61 | println!("Result: {:?}",meaningful_words); 62 | } 63 | 64 | # [test] 65 | fn test_segmentation_performance(){ 66 | use std::time::{Duration, Instant}; 67 | // set a dictionary 68 | let dict_path="D:\\GitHub\\rsnltk\\experiments\\rsnltk-experiment\\examples\\data\\dicts\\30wdict.txt"; 69 | let stop_path="D:\\GitHub\\rsnltk\\experiments\\rsnltk-experiment\\examples\\data\\dicts\\stopwords\\baidu_stopwords.txt"; 70 | // target sentence 71 | let _sentence="美国太空总署希望,在深海的探险发现将有助于解开一些外太空的秘密,同时也可以测试前往太阳系其他星球探险所需的一些设备和实验。"; 72 | // 
start to time recording 73 | let mut start = Instant::now(); 74 | let bimm_result=get_segmentation(_sentence,dict_path,stop_path, "bimm"); 75 | println!("bimm's time cost: {:?}",start.elapsed()); 76 | start = Instant::now(); 77 | let fmm_result=get_segmentation(_sentence,dict_path,stop_path, "fmm"); 78 | println!("fmm's time cost: {:?}",start.elapsed()); 79 | start = Instant::now(); 80 | let bmm_result=get_segmentation(_sentence,dict_path,stop_path, "bmm"); 81 | println!("bmm's time cost: {:?}",start.elapsed()); 82 | 83 | } 84 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/tests/stanza_test.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(test)] 3 | mod tests { 4 | use rsnltk::{download_lang, ner, tokenize, download_langs, tokenize_sentence, lang, sentiment, mwt_expand, pos, dependency_tree}; 5 | 6 | # [test] // before use the rsnltk library, you need to download target language package from Stanza's website. 7 | fn test_download_langs(){ 8 | // 1. first install the package 9 | let list_lang=vec!["en","zh"]; 10 | download_langs(list_lang); 11 | // 2. then do NLP tasks 12 | let text="I like Beijing!"; 13 | let lang="en"; 14 | // 2. Uncomment the below codes for Chinese NER 15 | // let text="我喜欢北京、上海和纽约!"; 16 | // let lang="zh"; 17 | let list_ner=ner(text,lang); 18 | for ner in list_ner{ 19 | println!("{:?}",ner); 20 | } 21 | 22 | } 23 | 24 | #[test] 25 | fn test_ner(){ 26 | // 1. for English NER 27 | let text="I like Beijing!"; 28 | let lang="en"; 29 | // 2. Uncomment the below codes for Chinese NER 30 | // let text="我喜欢北京、上海和纽约!"; 31 | // let lang="zh"; 32 | let list_ner=ner(text,lang); 33 | for ner in list_ner{ 34 | println!("{:?}",ner); 35 | } 36 | } 37 | # [test] 38 | fn test_tokenize(){ 39 | 40 | let text="我喜欢北京、上海和纽约!"; 41 | let lang="zh"; 42 | 43 | let list_result=tokenize(text,lang); 44 | for ner in list_result{ 45 | println!("{:?}",ner); 46 | } 47 | } 48 | # [test] 49 | fn test_tokenize_sentence(){ 50 | let text="I like apple. Do you like it? No, I am not sure!"; 51 | let lang="en"; 52 | let list_sentences=tokenize_sentence(text,lang); 53 | for sentence in list_sentences{ 54 | println!("Sentence: {}",sentence); 55 | } 56 | } 57 | # [test] 58 | fn test_lang(){ 59 | let list_text = vec!["I like Beijing!","我喜欢北京!", "Bonjour le monde!"]; 60 | let list_result=lang(list_text); 61 | for lang in list_result{ 62 | println!("{:?}",lang); 63 | } 64 | } 65 | # [test] 66 | fn test_mwt_expand(){ 67 | let text="Nous avons atteint la fin du sentier."; 68 | let lang="fr"; 69 | let list_result=mwt_expand(text,lang); 70 | } 71 | # [test] 72 | fn test_tag(){ 73 | //let text="我喜欢北京、上海和纽约!"; 74 | //let lang="zh"; 75 | let text="I like apple"; 76 | let lang="en"; 77 | 78 | let list_result=pos(text,lang); 79 | for word in list_result{ 80 | println!("{:?}",word); 81 | } 82 | } 83 | # [test] 84 | fn test_sentiment(){ 85 | //let text="I like Beijing!"; 86 | //let lang="en"; 87 | let text="我讨厌北京"; 88 | let lang="zh"; 89 | 90 | let sentiments=sentiment(text,lang); 91 | for sen in sentiments{ 92 | println!("{:?}",sen); 93 | } 94 | } 95 | 96 | # [test] 97 | fn test_dependency_tree(){ 98 | let text="I like you. 
Do you like me?"; 99 | let lang="en"; 100 | let list_results=dependency_tree(text,lang); 101 | for list_token in list_results{ 102 | for token in list_token{ 103 | println!("{:?}",token) 104 | } 105 | 106 | } 107 | } 108 | 109 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/tests/text_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use rsnltk::native::text::*; 4 | use std::collections::HashMap; 5 | 6 | #[test] 7 | fn test_text_if_english() { 8 | let text = "I like you!"; 9 | println!("English probability: {}", similar_with_english(text)) 10 | } 11 | 12 | #[test] 13 | fn test_trim_to_words() { 14 | let words = "I like you, do you like me?".to_string(); 15 | let trimmed = trim_to_words(words); 16 | println!("{:?}", trimmed) 17 | } 18 | 19 | #[test] 20 | fn test_count_words() { 21 | let words = vec![ 22 | "one".to_string(), 23 | "two".to_string(), 24 | "two".to_string(), 25 | "three".to_string(), 26 | "three".to_string(), 27 | "three".to_string(), 28 | ]; 29 | let counted = count_words(&words); 30 | 31 | println!("{:?}",counted); 32 | 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /experiments/rsnltk-experiment/tests/wordnet_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests{ 3 | use rsnltk::wordnet::wordnet_similarity; 4 | #[test] 5 | fn test_wordnet_similarity(){ 6 | let s1="dog.n.1"; 7 | let s2="cat.n.2"; 8 | let sims=wordnet_similarity(s1,s2); 9 | for sim in sims{ 10 | println!("{:?}",sim); 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /rsnltk.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/api/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod natural; 2 | pub mod whatlang; 3 | pub mod yn; -------------------------------------------------------------------------------- /src/api/natural.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests{ 3 | use crate::api::natural::*; 4 | # [test] 5 | fn test_distance(){ 6 | println!("lev = {}",lev_dist("kitten", "sitting")); 7 | println!("winkler = {}",jw_dist("dixon", "dicksonx")); 8 | } 9 | 10 | # [test] 11 | fn test_soundx(){ 12 | println!("{}",is_soundex("robert","rupert")); 13 | } 14 | 15 | # [test] 16 | fn test_tokenize(){ 17 | let str1="hello, world!"; 18 | let srtr="My dog has fleas."; 19 | println!("{:?}",tokenize(str1)); 20 | 21 | } 22 | 23 | # [test] 24 | fn test_ngrams(){ 25 | // no padding 26 | let str1="hello my darling"; 27 | let results=get_ngram(str1,2); 28 | for l in results{ 29 | println!("{:?}",l); 30 | } 31 | println!(); 32 | // with padding 33 | let results=get_ngram_with_padding(str1,2,"---"); 34 | for l in results{ 35 | println!("{:?}",l); 36 | } 37 | } 38 | 39 | # [test] 40 | fn test_classification(){ 41 | let mut list_str=Vec::new(); 42 | list_str.push("Hello World"); 43 | list_str.push("Hello Chen's World"); 44 | list_str.push("World is Amazing"); 45 | let mut list_label=Vec::new(); 46 | list_label.push("a"); 47 | list_label.push("b"); 48 | list_label.push("c"); 49 | println!("guess = 
{}",nb_guess(list_str,list_label,"Hello")); 50 | } 51 | 52 | # [test] 53 | fn test_tf_idf(){ 54 | let mut list_str=Vec::new(); 55 | list_str.push("this document is about rust."); 56 | list_str.push("this document is about erlang."); 57 | list_str.push("this document is about erlang and rust."); 58 | list_str.push("this document is about rust. it has rust examples"); 59 | 60 | println!("tf-idf value = {}",get_tf_idf(list_str,"rust")); 61 | 62 | } 63 | 64 | } 65 | 66 | extern crate natural; 67 | use natural::distance::jaro_winkler_distance; 68 | use natural::distance::levenshtein_distance; 69 | use natural::phonetics::soundex; 70 | use natural::classifier::NaiveBayesClassifier; 71 | use natural::tf_idf::TfIdf; 72 | 73 | pub fn lev_dist(str1:&str,str2:&str)->usize{ 74 | return levenshtein_distance(str1, str2); 75 | } 76 | 77 | pub fn jw_dist(str1:&str,str2:&str)->f32{ 78 | return jaro_winkler_distance(str1, str2); 79 | } 80 | 81 | pub fn is_soundex(str1:&str,str2:&str)->bool{ 82 | let result=soundex(str1,str2); 83 | return result; 84 | } 85 | 86 | pub fn tokenize(str:&str)->Vec<&str>{ 87 | natural::tokenize::tokenize(&str) 88 | } 89 | 90 | 91 | pub fn get_ngram(str:&str,n:usize)->Vec>{ 92 | natural::ngram::get_ngram(str, n) 93 | } 94 | 95 | pub fn get_ngram_with_padding<'a>(str:&'a str,n:usize,padding:&'a str)->Vec>{ 96 | let result = natural::ngram::get_ngram_with_padding(str, n,padding); 97 | result 98 | } 99 | 100 | pub fn nb_guess(train_strs:Vec<&str>,labels:Vec<&str>,str_guess:&str)->String{ 101 | 102 | 103 | let mut nbc = NaiveBayesClassifier::new(); 104 | let mut idx=0; 105 | for train_str in train_strs{ 106 | nbc.train(train_str, labels[idx]); 107 | idx+=1; 108 | } 109 | 110 | nbc.guess(str_guess) //returns a label with the highest probability 111 | 112 | } 113 | 114 | pub fn get_tf_idf(strs:Vec<&str>,s:&str)->f32{ 115 | 116 | let mut tf_idf=TfIdf::new(); 117 | 118 | 119 | for str in strs{ 120 | tf_idf.add(str); 121 | } 122 | 123 | tf_idf.get(s) //0.21859923 124 | } 125 | 126 | 127 | -------------------------------------------------------------------------------- /src/api/whatlang.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use crate::api::whatlang::*; 4 | # [test] 5 | fn test_whatlang(){ 6 | let text = "Ĉu vi ne volas eklerni Esperanton? Bonvolu! 
Estas unu de la plej bonaj aferoj!"; 7 | let ret=whatlang(text); 8 | println!("{:?}",ret); 9 | } 10 | 11 | } 12 | 13 | use std::collections::HashMap; 14 | 15 | 16 | extern crate whatlang; 17 | use whatlang::detect; 18 | pub fn whatlang(str:&str) ->HashMap{ 19 | 20 | 21 | let info = detect(str).unwrap(); 22 | let mut result:HashMap=HashMap::new(); 23 | result.insert(String::from("lang"),info.lang().to_string()); 24 | result.insert(String::from("script"),info.script().to_string()); 25 | result.insert(String::from("confidence"),info.confidence().to_string()); 26 | result.insert(String::from("is_reliable"),info.is_reliable().to_string()); 27 | 28 | result 29 | 30 | } 31 | 32 | -------------------------------------------------------------------------------- /src/api/yn.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use crate::api::yn::*; 4 | # [test] 5 | fn test_yes(){ 6 | let s="yes"; 7 | println!("{:?}",yes(s)); 8 | 9 | println!("{:?}",is_somewhat_yes("this has a y so it is the word")); 10 | 11 | println!("{:?}",is_kinda_yes("very much so")); 12 | } 13 | 14 | } 15 | extern crate yn; 16 | 17 | pub fn yes(str:&str)->bool{ 18 | return yn::yes(str); 19 | } 20 | pub fn is_somewhat_yes(str:&str)->bool{ 21 | return yn::is_somewhat_yes(str); 22 | } 23 | 24 | pub fn is_kinda_yes(str:&str)->bool{ 25 | yn::is_kinda_yes(str) 26 | } 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate core; 2 | 3 | pub mod wordnet; 4 | pub mod stanza; 5 | pub mod api; 6 | pub mod native; 7 | 8 | pub use stanza::*; 9 | 10 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | 2 | fn main(){ 3 | println!("Hello, rsnltk!") 4 | } -------------------------------------------------------------------------------- /src/native/chardata.rs: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////// 2 | // CharDataIter and friends 3 | // 4 | // Probably CharDataIter could be replaced by a clever 5 | // call to map() on the underlying char iterator... 6 | // 7 | pub static END_OF_STRING: char = '\0'; 8 | 9 | #[derive(Debug)] 10 | pub struct CharData { 11 | pub ch: char, 12 | pub byte_offset: usize, 13 | pub char_offset: usize, 14 | } 15 | 16 | pub struct CharDataIter<'a> { 17 | char_stream: &'a mut dyn Iterator, 18 | byte_offset: usize, 19 | char_offset: usize, 20 | really_done: bool, 21 | } 22 | 23 | impl<'a> CharDataIter<'a> { 24 | pub fn new(chs: &'a mut dyn Iterator) -> Self { 25 | CharDataIter { 26 | char_stream: chs, 27 | byte_offset: 0, 28 | char_offset: 0, 29 | really_done: false, 30 | } 31 | } 32 | } 33 | 34 | impl<'a> Iterator for CharDataIter<'a> { 35 | type Item = CharData; 36 | 37 | fn next(&mut self) -> Option { 38 | match self.char_stream.next() { 39 | Some(c) => { 40 | let result = CharData { 41 | ch: c, 42 | byte_offset: self.byte_offset, 43 | char_offset: self.char_offset, 44 | }; 45 | self.char_offset += 1; 46 | self.byte_offset += c.len_utf8(); 47 | Some(result) 48 | }, 49 | None => { 50 | if self.really_done { 51 | None 52 | } else { 53 | // Special marker 54 | self.really_done = true; 55 | Some ( 56 | CharData { 57 | ch: END_OF_STRING, // should be ignored! 
58 | byte_offset: self.byte_offset, 59 | char_offset: self.char_offset, 60 | } 61 | ) 62 | } 63 | } 64 | } 65 | } 66 | } 67 | // 68 | // CharDataIter 69 | ///////////////////////////////////////////////////////// -------------------------------------------------------------------------------- /src/native/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod summarizer; 2 | pub mod word2vec; 3 | pub mod token; 4 | pub mod segmentation; 5 | mod chardata; 6 | mod toksiter; 7 | -------------------------------------------------------------------------------- /src/native/segmentation.rs: -------------------------------------------------------------------------------- 1 | 2 | 3 | use std::*; 4 | extern crate unicode_segmentation; 5 | use std::collections::HashMap; 6 | use unicode_segmentation::UnicodeSegmentation; 7 | 8 | 9 | 10 | /// 11 | /// Get word segmentation results from customized dictionaries and methods 12 | /// 13 | /// Parameters: 14 | /// 15 | /// _sentence: a string, 16 | /// 17 | /// dict_path: the dictionary file path where each line is a term, 18 | /// 19 | /// stopwords_path: the stopwords file path where each line is a stop word, 20 | /// 21 | /// method: if empty, use 'bimm', other optional values: fmm, bmm. 22 | /// 23 | pub fn get_segmentation(_sentence:&str,dict_path:&str,stopwords_path:&str,method:&str)->Vec{ 24 | 25 | if dict_path.eq(""){ 26 | let result= get_word_list(_sentence); 27 | let mut result_final:Vec=Vec::new(); 28 | for r in result{ 29 | result_final.push(String::from(r)); 30 | } 31 | return result_final; 32 | } 33 | 34 | // println!("loading common dictionary"); 35 | let common_words=load_dictionary(dict_path); 36 | 37 | 38 | let stop_words:Vec; 39 | if stopwords_path.eq(""){ 40 | // println!("loading stopwords dictionary"); 41 | stop_words=Vec::new(); 42 | }else{ 43 | stop_words=load_dictionary(stopwords_path); 44 | } 45 | 46 | 47 | // println!("doing segmentation tasks"); 48 | 49 | let sentence=_sentence.graphemes(true).collect::>(); 50 | let mut list_result:Vec=Vec::new(); 51 | 52 | if method=="bimm" || method==""{ 53 | list_result=bimm(sentence,common_words); 54 | }else if method=="fmm"{ 55 | list_result=fmm(sentence,common_words); 56 | }else if method=="bmm"{ 57 | list_result=bmm(sentence,common_words); 58 | } 59 | 60 | 61 | if !stopwords_path.eq(""){ 62 | let mut meaningful_words:Vec=Vec::new(); 63 | // println!("removing stop words..."); 64 | for word in list_result{ 65 | if !stop_words.contains(&word){ 66 | meaningful_words.push(word); 67 | } 68 | } 69 | return meaningful_words; 70 | }else{ 71 | return list_result; 72 | } 73 | 74 | 75 | } 76 | 77 | /// 78 | /// Bidirection Maximum Matching Method 79 | /// 80 | pub fn bimm(sentence:Vec<&str>, words_dict:Vec)->Vec{ 81 | let s1=sentence.clone(); 82 | let s2=sentence.clone(); 83 | let dict1=words_dict.clone(); 84 | let dict2=words_dict.clone(); 85 | let forward =fmm(s1,dict1); 86 | let backward=bmm(s2,dict2); 87 | // println!("FMM: {:?}",forward); 88 | // println!("BMM: {:?}",backward); 89 | let mut f_single_word=0; 90 | let mut b_single_word=0; 91 | let mut tot_fmm=forward.len(); 92 | let mut tot_bmm=backward.len(); 93 | let mut oov_fmm=0; 94 | let mut oov_bmm=0; 95 | let mut score_fmm=0; 96 | let mut score_bmm=0; 97 | if forward==backward{ 98 | return backward; 99 | }else{ 100 | for each in forward.clone(){ 101 | if each.len()==1{ 102 | f_single_word+=1; 103 | } 104 | } 105 | for each in backward.clone(){ 106 | if each.len()==1{ 107 | b_single_word+=1; 108 | 
} 109 | } 110 | for each in forward.clone(){ 111 | if !words_dict.contains(&each){ 112 | oov_fmm+=1; 113 | } 114 | } 115 | for each in backward.clone(){ 116 | if !words_dict.contains(&each){ 117 | oov_bmm+=1; 118 | } 119 | } 120 | if oov_fmm>oov_bmm{ 121 | score_bmm+=1; 122 | } 123 | if oov_fmmtot_bmm{ 127 | score_bmm+=1; 128 | }else if tot_fmmb_single_word{ 133 | score_bmm+=1; 134 | }else if f_single_wordVec<&str>{ 152 | // let s = "我喜欢吃苹果,也爱打羽毛球"; 153 | let g = str.graphemes(true).collect::>(); 154 | // println!("{:?}",g); 155 | return g; 156 | } 157 | 158 | /// 159 | /// Word Segmentation Based on Backward Maximum Matching 160 | /// 161 | pub fn bmm(sentence:Vec<&str>,dict:Vec)->Vec{ 162 | 163 | let mut list_words:Vec=Vec::new(); 164 | let mut index:i32=sentence.len() as i32; 165 | let window_size:i32=4; 166 | while index>0{ 167 | let mut match_flag=false; 168 | let mut i=window_size.clone(); 169 | // println!("i={i}"); 170 | while i>=0{ 171 | // println!("i={}",i); 172 | let a; 173 | if index-i<0{ 174 | a=0 as usize; 175 | }else{ 176 | a=(index-i) as usize; 177 | } 178 | // let a = (index-i) as usize; 179 | let b =index as usize; 180 | // println!("({},{})",a,b); 181 | let sub_str=sentence[a..b].concat(); 182 | if dict.contains(&sub_str) { 183 | match_flag = true; 184 | list_words.push(sub_str); 185 | index -= i; 186 | break; 187 | } 188 | i-=1; 189 | } 190 | if match_flag==false{ 191 | 192 | if index-1<0{ 193 | index=1; 194 | } 195 | let a=(index-1) as usize; 196 | list_words.push(String::from(sentence[a])); 197 | index-=1; 198 | } 199 | } 200 | list_words.reverse(); 201 | return list_words; 202 | 203 | } 204 | 205 | /// 206 | /// Word Segmentation Based on Forward Maximum Matching 207 | /// 208 | pub fn fmm(sentence:Vec<&str>,dict:Vec)->Vec{ 209 | let token_len=sentence.len() as i32; 210 | // println!("token len: {}",token_len); 211 | let mut index:i32=0; 212 | 213 | let mut list_words:Vec=Vec::new(); 214 | let window_size=4; 215 | /* 216 | for char in sentence.chars(){ 217 | println!("{}",char); 218 | } 219 | */ 220 | while index=0{ 224 | //println!("i={}",i); 225 | 226 | let a=index as usize; 227 | let mut b=(index+i) as usize; 228 | // println!("({},{})",a,b); 229 | if b>(token_len) as usize{ 230 | b=token_len as usize; 231 | } 232 | let sub_str=sentence[a..b].concat(); 233 | //println!("sub_str: {}",sub_str); 234 | if dict.contains(&sub_str){ 235 | match_flag=true; 236 | list_words.push(sub_str); 237 | index+=i; 238 | break; 239 | } 240 | //println!(); 241 | i-=1; 242 | } 243 | if match_flag==false{ 244 | let a=index as usize; 245 | let v=String::from(sentence[a]); 246 | list_words.push(v); 247 | index+=1; 248 | } 249 | 250 | } 251 | return list_words; 252 | } 253 | 254 | use std::io::{self, BufRead}; 255 | use std::fs::{File, read_dir}; 256 | use std::io::prelude::*; 257 | use std::path::Path; 258 | 259 | pub fn _read_lines
<P>
(filename: P) -> io::Result>> 260 | where P: AsRef, { 261 | let file = File::open(filename)?; 262 | Ok(io::BufReader::new(file).lines()) 263 | } 264 | 265 | /// 266 | /// Read a list of lines from a file 267 | /// 268 | pub fn load_dictionary(filepath:&str)->Vec{ 269 | // The output is wrapped in a Result to allow matching on errors 270 | // Returns an Iterator to the Reader of the lines of the file. 271 | 272 | let mut strings=Vec::new(); 273 | 274 | if let Ok(lines) = _read_lines(filepath) { 275 | // Consumes the iterator, returns an (Optional) String 276 | for line in lines { 277 | if let Ok(line) = line { 278 | // println!("{}", ip); 279 | //let word=line.replace("\n","").trim(); 280 | strings.push(String::from(line.replace("\n","").trim())); 281 | } 282 | } 283 | } 284 | strings 285 | } 286 | 287 | 288 | 289 | -------------------------------------------------------------------------------- /src/native/summarizer.rs: -------------------------------------------------------------------------------- 1 | // Ref: https://towardsdatascience.com/a-simple-text-summarizer-written-in-rust-4df05f9327a5 2 | // From Author: Charles Chan 3 | #[cfg(test)] 4 | mod tests{ 5 | use crate::native::summarizer::*; 6 | # [test] 7 | fn test_summarize(){ 8 | let text="As of Sunday, there were more than 58.2 million reported cases of COVID-19 worldwide, with more than 37.2 million of those cases listed as recovered, according to a COVID-19 tracking tool maintained by Johns Hopkins University. The global death toll stood at more than 1.3 million. In Asia, the daily tally of reported cases in Japan hit a record for the fourth day in a row, with 2,508 people confirmed infected, the Health Ministry said Sunday. A flurry of criticism has erupted, from opposition legislators and the public, slamming the government as having acted too slowly in halting its \"GoTo\" campaign, which encouraged travel and dining out with discounts. In Europe, French authorities ordered the culling of all minks at a farm after analysis showed a mutated version of the coronavirus was circulating among the animals. The move follows virus developments in mink farms in Denmark and other countries, including the Netherlands, Sweden and Greece. In the Americas, Chile says it will open its main border crossing and principal airport to foreign visitors on Monday after an eight-month pandemic shutdown. Arrivals will have to present evidence of a recent negative test for the novel coronavirus, as well as health insurance. They'll also have to report their whereabouts and health status for a two-week watch period. Those coming from high-risk countries will have to quarantine for 14 days. In Africa, Sudan's minister of cabinet affairs on Sunday tested positive for the coronavirus, the prime minister's office said, the latest in a string of senior officials to be infected as the country shows an increase of confirmed cases of COVID-19. 
Over the past month, acting ministers of finance and health, the central bank governor and two associates to Prime Minister Abdalla Hamdok have tested positive."; 9 | let stopwords=&[]; 10 | let summarized_text=summarize(text,stopwords,5); 11 | println!("{}",summarized_text); 12 | } 13 | } 14 | 15 | use unicode_segmentation::UnicodeSegmentation; 16 | use std::collections::BTreeSet; 17 | use ndarray::{Array1, Array2}; 18 | 19 | /// 20 | /// Summarize text 21 | /// 22 | pub fn summarize(text: &str, stop_words: &[&str], num_sentence: usize) -> String { 23 | let sentences = text.unicode_sentences().collect::>(); 24 | if num_sentence >= sentences.len() { 25 | return text.to_string(); 26 | } 27 | let mut sentences_and_words = vec![]; 28 | sentences.iter().for_each(|&sentence| { 29 | let words = split_into_words(sentence); 30 | sentences_and_words.push(words); 31 | }); 32 | let matrix = build_similarity_matrix(&sentences_and_words, stop_words); 33 | let ranks = calculate_sentence_rank(&matrix); 34 | let mut sorted_ranks = ranks.clone(); 35 | sorted_ranks.sort_by(|a, b| b.partial_cmp(a).unwrap()); 36 | let least_rank = sorted_ranks[num_sentence + 1]; 37 | let mut result: Vec<&str> = vec![]; 38 | let mut included_count = 0; 39 | for i in 0..sentences.len() { 40 | if ranks[i] >= least_rank { 41 | included_count = included_count + 1; 42 | result.push(sentences[i]); 43 | } 44 | if included_count == num_sentence { 45 | break; 46 | } 47 | } 48 | result.join("") 49 | } 50 | 51 | fn get_all_words_lc<'a>(sentence1: &[&'a str], sentence2: &[&'a str]) -> BTreeSet { 52 | let mut all_words: BTreeSet = BTreeSet::new(); 53 | 54 | sentence1.iter().for_each(|w| { 55 | all_words.insert(w.to_lowercase()); 56 | }); 57 | 58 | sentence2.iter().for_each(|w| { 59 | all_words.insert(w.to_lowercase()); 60 | }); 61 | return all_words; 62 | } 63 | 64 | /// 65 | /// Retrieve a sentence vector based on the frequency of words that appears in the all_words_lc set. 
66 | /// all_words_lc should be a sorted set of lower cased words 67 | /// The size of the resulting vector is the same as the all_words_lc set 68 | /// stop_words are skipped 69 | /// 70 | fn get_sentence_vector(sentence: &[&str], all_words_lc: &BTreeSet, stop_words: &[&str]) -> Vec { 71 | let mut vector: Vec = vec![0; all_words_lc.len()]; 72 | for word in sentence { 73 | let word_lc = word.to_lowercase(); 74 | if !stop_words.contains(&word_lc.as_str()) { 75 | let index = all_words_lc.iter().position(|x| x.eq(&word_lc)).unwrap(); 76 | vector[index] += 1; 77 | } 78 | } 79 | return vector; 80 | } 81 | 82 | /// 83 | /// Calculates the cosine distance between two vectors 84 | /// Refer to [YouTube](https://www.youtube.com/watch?v=3X0wLRwU_Ws) 85 | /// 86 | fn cosine_distance(vec1: &Vec, vec2: &Vec) -> f64 { 87 | let dot_product = dot_product(vec1, vec2); 88 | let root_sum_square1 = root_sum_square(vec1); 89 | let root_sum_square2 = root_sum_square(vec2); 90 | return dot_product as f64 / (root_sum_square1 * root_sum_square2); 91 | } 92 | 93 | fn root_sum_square(vec: &Vec) -> f64 { 94 | let mut sum_square = 0; 95 | for i in 0..vec.len() { 96 | sum_square += vec[i] * vec[i]; 97 | } 98 | (sum_square as f64).sqrt() 99 | } 100 | 101 | fn dot_product(vec1: &Vec, vec2: &Vec) -> usize { 102 | let delta = vec1.len() - vec2.len(); 103 | let shortest_vec = match delta { 104 | d if d < 0 => vec1, 105 | d if d > 0 => vec2, 106 | _ => vec1 107 | }; 108 | let mut dot_product = 0; 109 | for i in 0..shortest_vec.len() { 110 | dot_product += vec1[i] * vec2[i]; 111 | } 112 | dot_product 113 | } 114 | 115 | fn sentence_similarity(s1: &[&str], s2: &[&str], stop_words: &[&str]) -> f64 { 116 | let all_words = get_all_words_lc(s1, s2); 117 | let v1 = get_sentence_vector(s1, &all_words, stop_words); 118 | let v2 = get_sentence_vector(s2, &all_words, stop_words); 119 | 1.0 - cosine_distance(&v1, &v2) 120 | } 121 | 122 | /// 123 | /// Calculate a similarity matrix for the given sentences. 124 | /// Returns a 2-D array M_i,j such that for all 'j', sum(i, M_i,j) = 1 125 | /// We take a leap of faith here and assume that cosine similarity is similar to the probability 126 | /// that a sentence is important for summarization 127 | /// 128 | fn build_similarity_matrix(sentences: &Vec>, stop_words: &[&str]) -> Array2 { 129 | let len = sentences.len(); 130 | let mut matrix = Array2::::zeros((len, len)); 131 | let mut sum_column: Vec = vec![0.0; len]; 132 | for i in 0..len { 133 | for j in 0..len { 134 | if i == j { 135 | continue; 136 | } 137 | matrix[[i, j]] = sentence_similarity(sentences[i].as_slice(), sentences[j].as_slice(), stop_words); 138 | } 139 | } 140 | // at this point we have the cosine similarity of each sentence. 141 | // take a leap of faith and assume that the cosine similarity is the probability that a sentence 142 | // is important for summarization. 143 | // We do this by normalizing the matrix along the column. The column values should add up to 1. 144 | for j in 0..len { 145 | let mut sum: f64 = 0.0; 146 | for i in 0..len { 147 | if i == j { 148 | continue; 149 | } 150 | sum += matrix[[i, j]]; 151 | } 152 | sum_column[j] = sum; 153 | } 154 | for i in 0..len { 155 | for j in 0..len { 156 | if i == j { 157 | continue; 158 | } 159 | matrix[[i, j]] = matrix[[i, j]] / sum_column[j]; 160 | } 161 | } 162 | matrix 163 | } 164 | 165 | /// 166 | /// Calculate a sentence rank similar to a page rank. 167 | /// Please refer to [PageRank](https://en.wikipedia.org/wiki/PageRank) for more details. 
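/// Concretely, the rank vector r starts uniform (each component 1/N for N sentences) and is
/// repeatedly updated as r <- (d*M + (1-d)/N) * r, where M is the column-normalized similarity
/// matrix and d = 0.85 is the damping factor, until no component of r grows by more than the
/// 0.001 threshold between iterations.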
168 | /// 169 | fn calculate_sentence_rank(similarity_matrix: &Array2) -> Vec { 170 | let num_sentence = similarity_matrix.shape()[1]; 171 | let threshold = 0.001; 172 | // Initialize a vector with the same value 1/number of sentences. Uniformly distributed across 173 | // all sentences. NOTE: perhaps we can make some sentences more important than the rest? 174 | let initial_vector: Vec = vec![1.0 / num_sentence as f64; num_sentence]; 175 | let mut result = Array1::from(initial_vector); 176 | let mut prev_result = result.clone(); 177 | let damping_factor = 0.85; 178 | let initial_m = damping_factor * similarity_matrix + (1.0 - damping_factor) / num_sentence as f64; 179 | loop { 180 | result = initial_m.dot(&result); 181 | let delta = &result - &prev_result; 182 | let mut converged = true; 183 | for i in 0..delta.len() { 184 | if delta[i] > threshold { 185 | converged = false; 186 | break; 187 | } 188 | } 189 | if converged { 190 | break; 191 | } 192 | prev_result = result.clone(); 193 | } 194 | result.into_raw_vec() 195 | } 196 | 197 | fn split_into_words(sentence: &str) -> Vec<&str> { 198 | let mut result = vec![]; 199 | let words = sentence.unicode_words(); 200 | for word in words { 201 | result.push(word); 202 | } 203 | result 204 | } -------------------------------------------------------------------------------- /src/native/token.rs: -------------------------------------------------------------------------------- 1 | use std::io; 2 | use std::io::prelude::*; 3 | use crate::native::toksiter::*; 4 | use crate::native::chardata::*; 5 | 6 | #[cfg(test)] 7 | mod tests { 8 | use std::borrow::Borrow; 9 | use crate::native::word2vec::*; 10 | use crate::native::toksiter::*; 11 | use crate::native::chardata::*; 12 | use crate::native::token::get_token_list; 13 | 14 | # [test] 15 | fn token_analyze(){ 16 | let mut s="hello world!"; 17 | 18 | let mut chs = s.chars(); 19 | let mut chds = CharDataIter::new(&mut chs); 20 | let mut toks = TokenIter::new(&mut chds); 21 | 22 | // Run the tokenizer, dump debug info for each token: 23 | loop { 24 | match toks.next() { 25 | Some(tok) => { println!("{:?}", tok) }, 26 | None => { println!(""); break; } 27 | } 28 | } 29 | } 30 | 31 | # [test] 32 | fn test_get_token_list(){ 33 | let s="Hello, Rust. 
How are you?"; 34 | let result=get_token_list(s); 35 | for r in result{ 36 | println!("{}\t{:?}",r.text,r); 37 | } 38 | } 39 | 40 | } 41 | 42 | pub fn get_token_list(s:&str)->Vec{ 43 | 44 | let mut chs = s.chars(); 45 | let mut chds = CharDataIter::new(&mut chs); 46 | let mut toks = TokenIter::new(&mut chds); 47 | let mut list_token:Vec=Vec::new(); 48 | // Run the tokenizer, dump debug info for each token: 49 | loop { 50 | match toks.next() { 51 | Some(tok) => { 52 | println!("{:?}", tok); 53 | list_token.push(tok); 54 | }, 55 | None => { println!(""); break; } 56 | } 57 | } 58 | list_token 59 | 60 | } 61 | 62 | 63 | 64 | fn main() { 65 | // Get stdin into a string 66 | let stdin = io::stdin(); 67 | let mut s = String::new(); 68 | stdin.lock().read_to_string(&mut s).unwrap(); 69 | println!("{}", s); 70 | 71 | // Construct a tokenizer by adapting some more primitive iterators 72 | let mut chs = s.chars(); 73 | let mut chds = CharDataIter::new(&mut chs); 74 | let mut toks = TokenIter::new(&mut chds); 75 | 76 | // Run the tokenizer, dump debug info for each token: 77 | loop { 78 | match toks.next() { 79 | Some(tok) => { println!("{:?}", tok) }, 80 | None => { println!(""); break; } 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/native/toksiter.rs: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////// 2 | // TokenIter 3 | // 4 | 5 | use crate::native::chardata; 6 | 7 | static IN_TOKEN: u8 = 1; 8 | static BTWN_TOKS: u8 = 0; 9 | 10 | /* TODO: Probably there should not be a String member here. 11 | We should either borrow a string slice from the original text, 12 | or else leave it out, and provide some other facility for 13 | converting Tokens to Strings, given the underlying string. 14 | In a full fledged parser, the parsed-document representation 15 | would handle that. 16 | */ 17 | #[derive(Debug)] 18 | pub struct Token { 19 | pub text: String, 20 | pub byte_offsets: (usize, usize), 21 | pub char_offsets: (usize, usize), 22 | pub token_offset: usize 23 | } 24 | 25 | impl Token { 26 | fn new() -> Token { 27 | Token { 28 | text: "".to_string(), 29 | byte_offsets: (0, 0), 30 | char_offsets: (0, 0), 31 | token_offset: 0, 32 | } 33 | } 34 | } 35 | 36 | pub struct TokenIter<'a> { 37 | chdat_stream: &'a mut chardata::CharDataIter<'a>, 38 | curr_tok_offset: usize, 39 | state: u8, 40 | } 41 | 42 | impl<'a> TokenIter<'a> { 43 | pub fn new(chdats: &'a mut chardata::CharDataIter<'a>) -> Self { 44 | TokenIter { 45 | chdat_stream: chdats, 46 | curr_tok_offset: 0, 47 | state: BTWN_TOKS, 48 | } 49 | } 50 | 51 | fn is_boundary_char(ch: char) -> bool { 52 | if ch == chardata::END_OF_STRING { 53 | true 54 | } else if ch.is_whitespace() { 55 | true 56 | } else { 57 | false 58 | } 59 | } 60 | } 61 | 62 | /* Always start out BTWN_TOKS, and therefore always end in BTWN_TOKS. 63 | Start by skipping characters until state changes to IN_TOKEN. 64 | Then (1) set the token start offsets; (2) march the char data iter forward 65 | until state changes to BTWN_TOKS, then fix the end offsets of the token 66 | under construction. Update the current token offset. 67 | Leave the resulting Token as the return value of next(). 68 | If the underlying chardata::CharDataIter yields END_OF_SENTENCE: 69 | IN_TOKEN --> ship the current token 70 | BTWN_TOKS --> return None 71 | In the first case, the next call to next() will immediately trigger 72 | the second case. 
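   For example (an illustrative trace): with the input "ab c", the first call to next()
   consumes 'a' and 'b', reaches the space, and ships Token { text: "ab", .. };
   the second call consumes 'c' and ships Token { text: "c", .. } once the input ends;
   a further call finds nothing while still BTWN_TOKS and returns None.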
73 | */ 74 | 75 | impl<'a> Iterator for TokenIter<'a> { 76 | type Item = Token; 77 | 78 | fn next(&mut self) -> Option { 79 | assert_eq!(self.state, BTWN_TOKS); 80 | let mut curr_tok = Token::new(); 81 | loop { 82 | match self.chdat_stream.next() { 83 | 84 | Some( chardata::CharData {ch, byte_offset, char_offset} ) => { 85 | 86 | if TokenIter::is_boundary_char(ch) { 87 | if self.state == IN_TOKEN { 88 | // ship token 89 | curr_tok.byte_offsets.1 = byte_offset; 90 | curr_tok.char_offsets.1 = char_offset; 91 | self.state = BTWN_TOKS; 92 | self.curr_tok_offset += 1; 93 | return Some(curr_tok); 94 | } 95 | // else do nothing -- skip boundary chars 96 | } else { 97 | if self.state == BTWN_TOKS { 98 | // start token 99 | curr_tok.token_offset = self.curr_tok_offset; 100 | curr_tok.byte_offsets.0 = byte_offset; 101 | curr_tok.char_offsets.0 = char_offset; 102 | self.state = IN_TOKEN; 103 | } 104 | // Accumulate characters 105 | curr_tok.text.push(ch); 106 | curr_tok.byte_offsets.1 = byte_offset; 107 | curr_tok.char_offsets.1 = char_offset; 108 | } 109 | }, 110 | 111 | None => { 112 | // May need to ship a token here! 113 | if self.state == IN_TOKEN { 114 | self.state = BTWN_TOKS; 115 | return Some(curr_tok); 116 | } 117 | return None; 118 | } 119 | } 120 | } 121 | } 122 | } 123 | // 124 | // TokenIter 125 | ///////////////////////////////////////////////////////// -------------------------------------------------------------------------------- /src/native/word2vec.rs: -------------------------------------------------------------------------------- 1 | extern crate word2vec; 2 | 3 | use word2vec::wordclusters::WordClusters; 4 | use word2vec::wordvectors::WordVector; 5 | 6 | #[cfg(test)] 7 | mod tests{ 8 | use std::borrow::Borrow; 9 | use crate::native::word2vec::*; 10 | 11 | 12 | fn test_word2vec(){ 13 | //this is an issue to fix 14 | let clusters=wv_clusters_create("D:\\UIBEResearch\\classes.txt"); 15 | let index=wv_get_cluster_from_clusters(clusters,"problem"); 16 | println!("index = {}",index); 17 | } 18 | 19 | 20 | fn test_open_wv_bin(){ 21 | let wv_model=wv_get_model("D:\\UIBEResearch\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin"); 22 | let positive = vec!["woman", "king"]; 23 | let negative = vec!["man"]; 24 | println!("analogy: {:?}", wv_analogy(&wv_model,positive, negative, 10)); 25 | println!("cosine: {:?}", wv_cosine(&wv_model,"man", 10)); 26 | } 27 | 28 | 29 | fn test_origin(){ 30 | /* 31 | let model = word2vec::wordvectors::WordVector::load_from_binary( 32 | 33 | "D:\\UIBEResearch\\GoogleNews-vectors-negative300.bin\\GoogleNews-vectors-negative300.bin").expect("Unable to load word vector model"); 34 | println!("{:?}", model.cosine("snow", 10)); 35 | let positive = vec!["woman", "king"]; 36 | let negative = vec!["man"]; 37 | println!("{:?}", model.analogy(positive, negative, 10)); 38 | */ 39 | 40 | 41 | let clusters = word2vec::wordclusters::WordClusters::load_from_file( 42 | "D:\\UIBEResearch\\classes1.txt").expect("Unable to load word clusters"); 43 | println!("{:?}", clusters.get_cluster("belarus")); 44 | println!("{:?}", clusters.get_words_on_cluster(6)); 45 | 46 | 47 | } 48 | 49 | } 50 | 51 | pub fn wv_get_model(bin_path:&str)->WordVector{ 52 | let model = word2vec::wordvectors::WordVector::load_from_binary( 53 | bin_path).expect("Unable to load word vector model"); 54 | return model 55 | } 56 | 57 | 58 | /// 59 | /// let model = word2vec::wordvectors::WordVector::load_from_binary( 60 | /// "vectors.bin").expect("Unable to load word vector model"); 61 | 
/// println!("{:?}", model.cosine("snow", 10)); 62 | /// 63 | /// 64 | pub fn wv_cosine(model:&WordVector,word:&str,n:usize)->Vec<(String,f32)>{ 65 | 66 | let ret=model.cosine(word,n); 67 | match ret { 68 | Some(r)=>{ 69 | r 70 | }, 71 | None=>{ 72 | Vec::new() 73 | }, 74 | } 75 | } 76 | 77 | /// 78 | /// let positive = vec!["woman", "king"]; 79 | /// let negative = vec!["man"]; 80 | /// println!("{:?}", model.analogy(positive, negative, 10)); 81 | /// 82 | pub fn wv_analogy(model:&WordVector, positive:Vec<&str>,negative:Vec<&str>,n:usize)->Vec<(String,f32)>{ 83 | let re=model.analogy(positive, negative, n); 84 | // println!("{:?}",re ); 85 | match re{ 86 | Some(v)=>v, 87 | None=>{ 88 | eprintln!("error"); 89 | Vec::new() 90 | } 91 | } 92 | } 93 | 94 | /// 95 | /// 96 | /// let clusters = word2vec::wordclusters::WordClusters::load_from_file( 97 | /// "classes.txt").expect("Unable to load word clusters"); 98 | /// println!("{:?}", clusters.get_cluster("belarus")); 99 | /// println!("{:?}", clusters.get_words_on_cluster(6)); 100 | /// 101 | /// 102 | pub fn wv_clusters_create(filepath:&str)->WordClusters{ 103 | let clusters = word2vec::wordclusters::WordClusters::load_from_file( 104 | filepath).expect("Unable to load word clusters"); 105 | return clusters; 106 | } 107 | 108 | /// 109 | /// println!("{:?}", clusters.get_words_on_cluster(6)); 110 | /// 111 | pub fn wv_get_cluster_from_clusters(clusters:WordClusters,word:&str)->i32{ 112 | match clusters.get_cluster(word){ 113 | Some(&v)=>{ 114 | v 115 | }, 116 | None=>{ 117 | println!("error"); 118 | -1 119 | } 120 | } 121 | } 122 | 123 | /// 124 | /// println!("{:?}", clusters.get_cluster("belarus")); 125 | /// 126 | pub fn wv_get_cluster_string(clusters:WordClusters,index:i32)->Vec{ 127 | match clusters.get_words_on_cluster(index){ 128 | Some(v)=>{ 129 | v.clone() 130 | }, 131 | None=>{ 132 | println!("error"); 133 | Vec::new() 134 | } 135 | } 136 | } 137 | 138 | 139 | fn main(){ 140 | 141 | } -------------------------------------------------------------------------------- /src/wordnet.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use pyo3::prelude::*; 3 | /// 4 | /// Estimate the similarity between twn synsets based on WordNet (pip install semantic-kit) required 5 | /// 6 | pub fn wordnet_similarity(s1:&str,s2:&str)->HashMap{ 7 | match _wordnet_similarity(s1,s2){ 8 | Ok(sims)=>sims, 9 | Err(e)=>{ 10 | eprintln!("{:?}",e); 11 | HashMap::new() 12 | } 13 | } 14 | } 15 | 16 | fn _wordnet_similarity(s1:&str,s2:&str)-> PyResult> { 17 | Python::with_gil(|py| { 18 | let semantickit = PyModule::import(py, "semantickit.similarity.wordnet_similarity")?; 19 | let sim: HashMap = semantickit.getattr("wordnet_similarity_all")?. 20 | call1((s1,s2))?.extract()?; 21 | // println!("Result: {:?}",sim); 22 | Ok(sim) 23 | }) 24 | } 25 | 26 | -------------------------------------------------------------------------------- /tests/3rdparty_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | 4 | use rsnltk::wordnet::wordnet_similarity; 5 | use rsnltk::api::natural::*; 6 | use rsnltk::api::whatlang::*; 7 | use rsnltk::api::yn::*; 8 | 9 | # [test] 10 | fn test_distance(){ 11 | println!("lev = {}",lev_dist("kitten", "sitting")); 12 | println!("winkler = {}",jw_dist("dixon", "dicksonx")); 13 | } 14 | 15 | # [test] 16 | fn test_whatlang(){ 17 | let text = "Ĉu vi ne volas eklerni Esperanton? Bonvolu! 
Estas unu de la plej bonaj aferoj!"; 18 | let ret=whatlang(text); 19 | println!("{:?}",ret); 20 | } 21 | 22 | # [test] 23 | fn test_yes(){ 24 | let s="yes"; 25 | println!("{:?}",yes(s)); 26 | 27 | println!("{:?}",is_somewhat_yes("this has a y so it is the word")); 28 | 29 | println!("{:?}",is_kinda_yes("very much so")); 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /tests/native_rust_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests { 3 | use rsnltk::native::token::*; 4 | use rsnltk::native::summarizer::*; 5 | # [test] 6 | fn test_get_token_list(){ 7 | let s="Hello, Rust. How are you?"; 8 | let result=get_token_list(s); 9 | for r in result{ 10 | println!("{}\t{:?}",r.text,r); 11 | } 12 | } 13 | 14 | # [test] 15 | fn test_summarize(){ 16 | let text="As of Sunday, there were more than 58.2 million reported cases of COVID-19 worldwide, with more than 37.2 million of those cases listed as recovered, according to a COVID-19 tracking tool maintained by Johns Hopkins University. The global death toll stood at more than 1.3 million. In Asia, the daily tally of reported cases in Japan hit a record for the fourth day in a row, with 2,508 people confirmed infected, the Health Ministry said Sunday. A flurry of criticism has erupted, from opposition legislators and the public, slamming the government as having acted too slowly in halting its \"GoTo\" campaign, which encouraged travel and dining out with discounts. In Europe, French authorities ordered the culling of all minks at a farm after analysis showed a mutated version of the coronavirus was circulating among the animals. The move follows virus developments in mink farms in Denmark and other countries, including the Netherlands, Sweden and Greece. In the Americas, Chile says it will open its main border crossing and principal airport to foreign visitors on Monday after an eight-month pandemic shutdown. Arrivals will have to present evidence of a recent negative test for the novel coronavirus, as well as health insurance. They'll also have to report their whereabouts and health status for a two-week watch period. Those coming from high-risk countries will have to quarantine for 14 days. In Africa, Sudan's minister of cabinet affairs on Sunday tested positive for the coronavirus, the prime minister's office said, the latest in a string of senior officials to be infected as the country shows an increase of confirmed cases of COVID-19. 
Over the past month, acting ministers of finance and health, the central bank governor and two associates to Prime Minister Abdalla Hamdok have tested positive."; 17 | let stopwords=&[]; 18 | let summarized_text=summarize(text,stopwords,5); 19 | println!("{}",summarized_text); 20 | } 21 | 22 | } -------------------------------------------------------------------------------- /tests/segmentation_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests{ 3 | use rsnltk::native::segmentation::*; 4 | extern crate unicode_segmentation; 5 | use unicode_segmentation::UnicodeSegmentation; 6 | # [test] 7 | fn test_utf8(){ 8 | let s = "我喜欢吃苹果,也爱打羽毛球"; 9 | let g = s.graphemes(true).collect::>(); 10 | println!("{:?}",g); 11 | } 12 | 13 | # [test] 14 | fn test_bmm(){ 15 | let sss=String::from("我喜欢吃苹果,也爱打羽毛球"); 16 | let sentence = sss.graphemes(true).collect::>(); 17 | 18 | let dict=vec!["我".to_string(),"喜欢".to_string(),"苹果".to_string(),"羽毛球".to_string(),"爱".to_string()]; 19 | 20 | let results=bmm(sentence,dict); 21 | 22 | println!("{:?}",results); 23 | } 24 | 25 | # [test] 26 | fn test_fmm(){ 27 | let sss=String::from("我喜欢吃苹果,也爱打羽毛球"); 28 | let sentence = sss.graphemes(true).collect::>(); 29 | 30 | let dict=vec!["我".to_string(),"喜欢".to_string(),"苹果".to_string(),"羽毛球".to_string(),"爱".to_string()]; 31 | 32 | let result=fmm(sentence,dict); 33 | 34 | println!("{:?}",result) 35 | 36 | } 37 | 38 | # [test] 39 | fn test_bimm(){ 40 | let sss=String::from("我喜欢吃苹果,也爱打羽毛球"); 41 | let sentence = sss.graphemes(true).collect::>(); 42 | 43 | let dict=vec!["我".to_string(),"喜欢".to_string(),"苹果".to_string(),"羽毛球".to_string(),"爱".to_string()]; 44 | 45 | let result=bimm(sentence,dict); 46 | 47 | println!("{:?}",result) 48 | } 49 | 50 | # [test] 51 | fn test_real_word_segmentation(){ 52 | let dict_path="D:\\GitHub\\rsnltk\\experiments\\rsnltk-experiment\\examples\\data\\dicts\\30wdict.txt"; 53 | let stop_path="D:\\GitHub\\rsnltk\\experiments\\rsnltk-experiment\\examples\\data\\dicts\\stopwords\\baidu_stopwords.txt"; 54 | 55 | // let dict_path=""; 56 | // let stop_path=""; 57 | 58 | let _sentence="美国太空总署希望,在深海的探险发现将有助于解开一些外太空的秘密,同时也可以测试前往太阳系其他星球探险所需的一些设备和实验。"; 59 | let meaningful_words=get_segmentation(_sentence,dict_path,stop_path, ""); 60 | 61 | println!("Result: {:?}",meaningful_words); 62 | } 63 | 64 | # [test] 65 | fn test_segmentation_performance(){ 66 | use std::time::{Duration, Instant}; 67 | // set a dictionary 68 | let dict_path="30wdict.txt"; 69 | let stop_path="baidu_stopwords.txt"; 70 | // target sentence 71 | let _sentence="美国太空总署希望,在深海的探险发现将有助于解开一些外太空的秘密,同时也可以测试前往太阳系其他星球探险所需的一些设备和实验。"; 72 | // start to time recording 73 | let mut start = Instant::now(); 74 | let bimm_result=get_segmentation(_sentence,dict_path,stop_path, "bimm"); 75 | println!("bimm's time cost: {:?}",start.elapsed()); 76 | start = Instant::now(); 77 | let fmm_result=get_segmentation(_sentence,dict_path,stop_path, "fmm"); 78 | println!("fmm's time cost: {:?}",start.elapsed()); 79 | start = Instant::now(); 80 | let bmm_result=get_segmentation(_sentence,dict_path,stop_path, "bmm"); 81 | println!("bmm's time cost: {:?}",start.elapsed()); 82 | 83 | } 84 | } -------------------------------------------------------------------------------- /tests/stanza_test.rs: -------------------------------------------------------------------------------- 1 | 2 | #[cfg(test)] 3 | mod tests { 4 | use rsnltk::{download_lang, ner, tokenize, download_langs, tokenize_sentence, lang, sentiment, mwt_expand, 
pos, dependency_tree}; 5 | 6 | # [test] // before use the rsnltk library, you need to download target language package from Stanza's website. 7 | fn test_download_langs(){ 8 | // 1. first install the package 9 | let list_lang=vec!["en","zh"]; 10 | download_langs(list_lang); 11 | // 2. then do NLP tasks 12 | let text="I like Beijing!"; 13 | let lang="en"; 14 | // 2. Uncomment the below codes for Chinese NER 15 | // let text="我喜欢北京、上海和纽约!"; 16 | // let lang="zh"; 17 | let list_ner=ner(text,lang); 18 | for ner in list_ner{ 19 | println!("{:?}",ner); 20 | } 21 | 22 | } 23 | 24 | #[test] 25 | fn test_ner(){ 26 | // 1. for English NER 27 | let text="I like Beijing!"; 28 | let lang="en"; 29 | // 2. Uncomment the below codes for Chinese NER 30 | // let text="我喜欢北京、上海和纽约!"; 31 | // let lang="zh"; 32 | let list_ner=ner(text,lang); 33 | for ner in list_ner{ 34 | println!("{:?}",ner); 35 | } 36 | } 37 | # [test] 38 | fn test_tokenize(){ 39 | 40 | let text="我喜欢北京、上海和纽约!"; 41 | let lang="zh"; 42 | 43 | let list_result=tokenize(text,lang); 44 | for ner in list_result{ 45 | println!("{:?}",ner); 46 | } 47 | } 48 | # [test] 49 | fn test_tokenize_sentence(){ 50 | let text="I like apple. Do you like it? No, I am not sure!"; 51 | let lang="en"; 52 | let list_sentences=tokenize_sentence(text,lang); 53 | for sentence in list_sentences{ 54 | println!("Sentence: {}",sentence); 55 | } 56 | } 57 | # [test] 58 | fn test_lang(){ 59 | let list_text = vec!["I like Beijing!","我喜欢北京!", "Bonjour le monde!"]; 60 | let list_result=lang(list_text); 61 | for lang in list_result{ 62 | println!("{:?}",lang); 63 | } 64 | } 65 | # [test] 66 | fn test_mwt_expand(){ 67 | let text="Nous avons atteint la fin du sentier."; 68 | let lang="fr"; 69 | let list_result=mwt_expand(text,lang); 70 | } 71 | # [test] 72 | fn test_tag(){ 73 | //let text="我喜欢北京、上海和纽约!"; 74 | //let lang="zh"; 75 | let text="I like apple"; 76 | let lang="en"; 77 | 78 | let list_result=pos(text,lang); 79 | for word in list_result{ 80 | println!("{:?}",word); 81 | } 82 | } 83 | # [test] 84 | fn test_sentiment(){ 85 | //let text="I like Beijing!"; 86 | //let lang="en"; 87 | let text="我讨厌北京"; 88 | let lang="zh"; 89 | 90 | let sentiments=sentiment(text,lang); 91 | for sen in sentiments{ 92 | println!("{:?}",sen); 93 | } 94 | } 95 | 96 | # [test] 97 | fn test_dependency_tree(){ 98 | let text="I like you. Do you like me?"; 99 | let lang="en"; 100 | let list_results=dependency_tree(text,lang); 101 | for list_token in list_results{ 102 | for token in list_token{ 103 | println!("{:?}",token) 104 | } 105 | 106 | } 107 | } 108 | 109 | } -------------------------------------------------------------------------------- /tests/wordnet_test.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | mod tests{ 3 | use rsnltk::wordnet::wordnet_similarity; 4 | #[test] 5 | fn test_wordnet_similarity(){ 6 | let s1="dog.n.1"; 7 | let s2="cat.n.2"; 8 | let sims=wordnet_similarity(s1,s2); 9 | for sim in sims{ 10 | println!("{:?}",sim); 11 | } 12 | } 13 | } --------------------------------------------------------------------------------
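The scoring step inside `bimm` in `src/native/segmentation.rs` is easy to miss when reading the dump above. The sketch below is an illustrative paraphrase, not the crate's code: when the forward (FMM) and backward (BMM) passes disagree, it keeps the candidate with fewer out-of-vocabulary words, then fewer words overall, then fewer single-character words. The `pick_segmentation` helper and the toy dictionary are invented for this example.

```rust
// Illustrative paraphrase of bimm()'s preference between FMM and BMM output;
// pick_segmentation and the toy data below are not part of rsnltk.
fn pick_segmentation(fmm: Vec<String>, bmm: Vec<String>, dict: &[String]) -> Vec<String> {
    // Lower is better: (out-of-vocabulary words, total words, single-character words),
    // compared lexicographically.
    let badness = |seg: &Vec<String>| {
        let oov = seg.iter().filter(|w| !dict.contains(*w)).count();
        let singles = seg.iter().filter(|w| w.chars().count() == 1).count();
        (oov, seg.len(), singles)
    };
    if badness(&bmm) < badness(&fmm) { bmm } else { fmm }
}

fn main() {
    let dict: Vec<String> = ["我", "喜欢", "吃", "苹果"].iter().map(|s| s.to_string()).collect();
    let fmm = vec!["我".to_string(), "喜欢".to_string(), "吃".to_string(), "苹果".to_string()];
    let bmm = vec!["我".to_string(), "喜".to_string(), "欢".to_string(), "吃苹果".to_string()];
    // FMM has no out-of-vocabulary words here, so it is preferred.
    println!("{:?}", pick_segmentation(fmm, bmm, &dict));
}
```

Note that the sketch counts single-character words by Unicode scalar values; the exact tie-breaking in `bimm` differs in detail and remains the authoritative version.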
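`calculate_sentence_rank` in `src/native/summarizer.rs` applies a damped, PageRank-style power iteration to the column-normalized similarity matrix. The standalone sketch below shows that iteration with `ndarray` (already a dependency of the crate); the 3x3 matrix values are invented for illustration, the variable names are mine, and the convergence test is simplified to an absolute difference, so this is a sketch of the idea rather than the crate's implementation.

```rust
// A minimal sketch of the damped power iteration behind calculate_sentence_rank().
// The matrix below stands in for the output of build_similarity_matrix(): each
// column sums to 1 and the diagonal is 0.
use ndarray::{arr1, arr2, Array1, Array2};

fn main() {
    let m: Array2<f64> = arr2(&[
        [0.0, 0.5, 0.3],
        [0.6, 0.0, 0.7],
        [0.4, 0.5, 0.0],
    ]);
    let n = 3usize;
    let damping = 0.85;
    let threshold = 0.001;

    // r starts uniform; each step applies r <- (d*M + (1-d)/N) * r.
    let m_prime = damping * &m + (1.0 - damping) / n as f64;
    let mut r: Array1<f64> = arr1(&[1.0 / n as f64; 3]);
    loop {
        let next = m_prime.dot(&r);
        let converged = next.iter().zip(r.iter()).all(|(a, b)| (a - b).abs() < threshold);
        r = next;
        if converged {
            break;
        }
    }
    // summarize() keeps the sentences with the largest ranks.
    println!("ranks: {:?}", r);
}
```

Because each column of the damped matrix still sums to 1, the rank vector stays a probability distribution and the iteration settles on a stationary ranking.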