├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .vscode └── settings.json ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-JS ├── LICENSE-MIT ├── LICENSE-WORDS ├── README.md ├── benches └── bench.rs ├── examples └── export_json.rs ├── fuzz ├── .gitignore ├── Cargo.toml └── fuzz_targets │ └── fuzz_en.rs ├── js └── lunr.ar.js ├── src ├── config.rs ├── document_store.rs ├── inverted_index.rs ├── lang │ ├── ar.rs │ ├── common.rs │ ├── da.rs │ ├── de.rs │ ├── du.rs │ ├── en.rs │ ├── es.rs │ ├── fi.rs │ ├── fr.rs │ ├── hu.rs │ ├── it.rs │ ├── ja.rs │ ├── ko.rs │ ├── mod.rs │ ├── no.rs │ ├── pt.rs │ ├── ro.rs │ ├── ru.rs │ ├── sv.rs │ ├── tr.rs │ └── zh.rs ├── lib.rs └── pipeline.rs └── tests ├── data ├── ar.in.txt ├── ar.out.txt ├── da.in.txt ├── da.out.txt ├── de.in.txt ├── de.out.txt ├── du.in.txt ├── du.out.txt ├── en.in.txt ├── en.out.txt ├── es.in.txt ├── es.out.txt ├── fi.in.txt ├── fi.out.txt ├── fr.in.txt ├── fr.out.txt ├── hu.in.txt ├── hu.out.txt ├── it.in.txt ├── it.out.txt ├── ja.in.txt ├── ja.out.txt ├── ko.in.txt ├── ko.out.txt ├── no.in.txt ├── no.out.txt ├── pt.in.txt ├── pt.out.txt ├── ro.in.txt ├── ro.out.txt ├── ru.in.txt ├── ru.out.txt ├── sv.in.txt ├── sv.out.txt ├── tr.in.txt ├── tr.out.txt ├── zh.in.txt └── zh.out.txt ├── lunr-fixture-gen ├── index.js ├── package-lock.json └── package.json ├── searchindex_fixture_en.json ├── searchindex_fixture_ja.json ├── test-index.rs └── test-pipeline.rs /.gitattributes: -------------------------------------------------------------------------------- 1 | * text eol=lf -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | tags: [ '*' ] 7 | workflow_dispatch: 8 | pull_request: 9 | 10 | jobs: 11 | ci: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | rust: 16 | - 
stable 17 | - beta 18 | - nightly 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Setup ${{ matrix.rust }} toolchain 24 | uses: actions-rust-lang/setup-rust-toolchain@v1 25 | with: 26 | toolchain: ${{ matrix.rust }} 27 | override: true 28 | 29 | - name: Build (all features) 30 | run: cargo build --all-features 31 | 32 | - name: Build 33 | run: cargo build 34 | 35 | - name: Test (all features) 36 | run: cargo test --all-features 37 | 38 | - name: Test 39 | run: cargo test 40 | 41 | 42 | msrv: 43 | runs-on: ubuntu-latest 44 | env: 45 | MSRV: '1.60.0' 46 | steps: 47 | - uses: actions/checkout@v4 48 | 49 | - name: Setup ${{ env.MSRV }} toolchain 50 | uses: actions-rust-lang/setup-rust-toolchain@v1 51 | with: 52 | toolchain: '${{ env.MSRV }}' 53 | override: false 54 | 55 | - name: Install cargo-hack 56 | run: cargo +stable install --locked cargo-hack 57 | 58 | - name: Install cargo-minimal-versions 59 | run: cargo +stable install --locked cargo-minimal-versions 60 | 61 | - name: Check minimal versions on stable 62 | run: cargo +stable minimal-versions check 63 | 64 | - name: Test minimal versions on stable 65 | run: cargo +stable minimal-versions test 66 | 67 | - name: Check minimal versions on MSRV 68 | run: cargo +${{ env.MSRV }} minimal-versions check 69 | 70 | - name: Test minimal versions on MSRV 71 | run: cargo +${{ env.MSRV }} minimal-versions test 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here http://doc.crates.io/guide.html#cargotoml-vs-cargolock 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | examples/out.json 12 | out.json 13 | **/node_modules/ 14 | .idea/ 
-------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "editor.formatOnSaveMode": "modificationsIfAvailable", 4 | "rust-analyzer.cargo.features": "all" 5 | } -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [3.0.3] - 2025-03-16 10 | ### Changed 11 | - Rewrote the English stemmer for improved performance ([#48](https://github.com/mattico/elasticlunr-rs/pull/48)) 12 | - Changed the rust-version in Cargo.toml to match the Minimum Supported Rust Version (1.60.0). 13 | - Added support for either Criterion 0.4.0 or 0.5.0 14 | - Updated the Cargo.toml license specification to better match the situation described in the README. 15 | 16 | ## [3.0.2] - 2023-03-17 17 | ### Changed 18 | - Updated Minimum Supported Rust Version to 1.60.0. 19 | - Updated Criterion to 0.4.0. 20 | 21 | ### Added 22 | - Language support for Korean ([#50](https://github.com/mattico/elasticlunr-rs/pull/50)) 23 | - Language support for Hungarian ([#51](https://github.com/mattico/elasticlunr-rs/pull/51)) 24 | 25 | ## [3.0.1] - 2022-07-23 26 | ### Changed 27 | - Updated dependencies and MSRVs to fix builds. ([#47](https://github.com/mattico/elasticlunr-rs/pull/47) et al.) 28 | 29 | ## [3.0.0] - 2022-06-01 30 | ### Added 31 | - Language support for Arabic ([#40](https://github.com/mattico/elasticlunr-rs/pull/40)). 32 | - Add the `Language` trait to make it easier to implement languages outside the crate. 
33 | - Add `IndexBuilder::add_field_with_tokenizer` to specify the tokenizer for a field. 34 | 35 | ### Changed 36 | - Update to 2018 edition, and bump MSRV to 1.54.0. 37 | - Change benchmarks to use Criterion. 38 | - Remove dependency on lazy_static. 39 | - Update dependencies. 40 | - Use Unicode character classes for trimmer. 41 | - `IndexBuilder` functions which add fields will now panic if the same field is added multiple times. 42 | - Fix `IndexBuilder` not respecting field insertion order. 43 | 44 | ### Removed 45 | - Remove the `default` feature. You now need to opt-in to the `languages` feature. 46 | - Remove the deprecated function `Pipeline::for_language`. 47 | - Remove the `pipeline::tokenize*` functions, which are now implemented as part of the `Language` trait. 48 | - Remove `Index::add_doc_with_tokenizer(s)`, replaced by `IndexBuilder::add_field_with_tokenizer`. 49 | - Remove the `Language` enum. Use the `Language` trait implementations in the `lang` modules, and the free functions `lang::from_name`, `lang::from_code`, and `lang::languages`. 
50 | 51 | 52 | [Unreleased]: https://github.com/mattico/elasticlunr-rs/compare/v3.0.3...HEAD 53 | [3.0.0]: https://github.com/mattico/elasticlunr-rs/compare/v2.3.14...v3.0.0 54 | [3.0.1]: https://github.com/mattico/elasticlunr-rs/compare/v3.0.0...v3.0.1 55 | [3.0.2]: https://github.com/mattico/elasticlunr-rs/compare/v3.0.1...v3.0.2 56 | [3.0.3]: https://github.com/mattico/elasticlunr-rs/compare/v3.0.2...v3.0.3 57 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Matt Ickstadt "] 3 | license = "(MIT OR Apache-2.0) AND MIT" 4 | name = "elasticlunr-rs" 5 | version = "3.0.3" 6 | description = "A partial port of elasticlunr.js to Rust for generating static document search indexes" 7 | documentation = "https://docs.rs/elasticlunr-rs" 8 | repository = "https://github.com/mattico/elasticlunr-rs" 9 | keywords = ["search", "index", "indexing", "lunr", "elasticlunr"] 10 | exclude = ["tests/lunr-fixture-gen/", "js/", ".github/", ".vscode/"] 11 | readme = "README.md" 12 | edition = "2018" 13 | rust-version = "1.60.0" 14 | 15 | [badges] 16 | maintenance = { status = "passively-maintained" } 17 | 18 | [lib] 19 | name = "elasticlunr" 20 | 21 | [[bench]] 22 | name = "bench" 23 | harness = false 24 | 25 | [dev-dependencies] 26 | criterion = ">=0.4.0,<0.6.0" 27 | maplit = "1" 28 | 29 | [dependencies] 30 | regex = "1" 31 | rust-stemmers = { version = "1.2.0", optional = true } # 1.2.0 minimum for Norwegian 32 | serde = "1" 33 | serde_derive = "1.0.34" # First version to support #[serde(flatten)] 34 | serde_json = "1" 35 | jieba-rs = { version = "0.6", optional = true } 36 | lindera = { version = "0.14", optional = true, features = ["ipadic"] } 37 | lindera-core = { version = "0.13.5", optional = true } 38 | 39 | [features] 40 | languages = ["ar", "da", "de", "du", "es", "fi", "fr", "hu", "it", "ja", "ko", "no", "pt", "ro", "ru", 
"sv", "tr", "zh"] 41 | ar = [] 42 | da = ["rust-stemmers"] 43 | de = ["rust-stemmers"] 44 | du = ["rust-stemmers"] 45 | es = ["rust-stemmers"] 46 | fi = ["rust-stemmers"] 47 | fr = ["rust-stemmers"] 48 | hu = ["rust-stemmers"] 49 | it = ["rust-stemmers"] 50 | ja = ["lindera", "lindera-core"] 51 | ko = [] 52 | no = ["rust-stemmers"] 53 | pt = ["rust-stemmers"] 54 | ro = ["rust-stemmers"] 55 | ru = ["rust-stemmers"] 56 | sv = ["rust-stemmers"] 57 | tr = ["rust-stemmers"] 58 | zh = ["jieba-rs"] 59 | -------------------------------------------------------------------------------- /LICENSE-JS: -------------------------------------------------------------------------------- 1 | Portions of this library's code is ported from elasticlunr.js 2 | Used under the terms of the MIT license. 3 | 4 | Copyright (C) 2017 by Wei Song 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Matthew Ickstadt 2 | 3 | Permission is hereby granted, free of charge, to any 4 | person obtaining a copy of this software and associated 5 | documentation files (the "Software"), to deal in the 6 | Software without restriction, including without 7 | limitation the rights to use, copy, modify, merge, 8 | publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software 10 | is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice 14 | shall be included in all copies or substantial portions 15 | of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF 18 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED 19 | TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 20 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 21 | SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 23 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR 24 | IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /LICENSE-WORDS: -------------------------------------------------------------------------------- 1 | Word lists originally from https://github.com/brenes/stopwords-filter 2 | Used under the terms of the MIT license. 3 | 4 | Copyright (c) 2012 David J. 
Brenes 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining 7 | a copy of this software and associated documentation files (the 8 | "Software"), to deal in the Software without restriction, including 9 | without limitation the rights to use, copy, modify, merge, publish, 10 | distribute, sublicense, and/or sell copies of the Software, and to 11 | permit persons to whom the Software is furnished to do so, subject to 12 | the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be 15 | included in all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 18 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 19 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 20 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 21 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 22 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 23 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # elasticlunr-rs 2 | 3 | ![Build Status](https://github.com/mattico/elasticlunr-rs/workflows/CI/badge.svg) 4 | [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs) 5 | [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs) 6 | ![Maintenance](https://img.shields.io/badge/Maintenance-Passive-yellow) 7 | ![MSRV](https://img.shields.io/badge/MSRV-1.60.0-orange) 8 | 9 | A partial port of [elasticlunr.js][eljs] to Rust. Intended to be used for generating compatible search indices. 10 | 11 | This library is passively maintained to support existing users. 
New users are encouraged to use a different library such as [tinysearch](https://github.com/tinysearch/tinysearch) or [Pagefind](https://pagefind.app/). 12 | 13 | ## Example 14 | 15 | ```Rust 16 | use std::fs::File; 17 | use std::io::Write; 18 | use elasticlunr::Index; 19 | 20 | let mut index = Index::new(&["title", "body"]); 21 | index.add_doc("1", &["This is a title", "This is body text!"]); 22 | // Add more documents... 23 | let mut file = File::create("out.json").unwrap(); 24 | file.write_all(index.to_json_pretty().as_bytes()); 25 | ``` 26 | 27 | ## Minimum Supported Rust Version 28 | 29 | 1.60.0 30 | 31 | Changing the minimum supported Rust version is not considered a breaking change for semver purposes. 32 | 33 | The supported version is constrained by the version supported by our transitive dependencies. Earlier rustc versions may 34 | work if you have older versions of these in your `Cargo.lock`, but this is not tested. 35 | 36 | ## Languages 37 | 38 | This library includes optional support for non-English languages, see the features in `Cargo.toml`. Like in the JavaScript 39 | version, the language support is designed to be compatible with the [lunr-languages plugins][lunr-languages]. Some 40 | languages use a modified version, which is included in the `js` directory of the repository. 41 | 42 | ## License 43 | 44 | This repository is offered under the terms of the 45 | 46 | - Apache License, Version 2.0, (LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0) 47 | - MIT license (LICENSE-MIT or http://opensource.org/licenses/MIT) 48 | 49 | at your option. 50 | 51 | Unless you explicitly state otherwise, any contribution intentionally submitted 52 | for inclusion in the work by you, as defined in the Apache-2.0 license, shall be 53 | dual licensed as above, without any additional terms or conditions. 54 | 55 | Includes code ported from [elasticlunr.js][eljs] Copyright (C) 2017 by Wei Song, 56 | used under license. See LICENSE-JS for details. 
57 | 58 | Includes stop word lists ported from [stopwords-filter][swft] Copyright (C) 2012 59 | David J. Brenes, used under license. See LICENSE-WORDS for details. 60 | 61 | Bundled javascript code in the repository (not included in the cargo package) may have other licenses. 62 | 63 | [lunr-languages]: https://github.com/MihaiValentin/lunr-languages 64 | [eljs]: https://github.com/weixsong/elasticlunr.js 65 | [swft]: https://github.com/brenes/stopwords-filter 66 | -------------------------------------------------------------------------------- /benches/bench.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; 2 | use elasticlunr::Index; 3 | 4 | fn bench_main(c: &mut Criterion) { 5 | // BTreeMap: 3,165,389 ns/iter (+/- 420,869) 6 | // BTreeMap: 2,920,902 ns/iter (+/- 118,729) 7 | c.bench_function("create_index", |b| { 8 | let text = include_str!("../tests/data/en.in.txt"); 9 | let sections: Vec<_> = text.split("\n\n").collect(); 10 | b.iter(|| { 11 | let mut index = Index::new(&["section"]); 12 | for (i, section) in sections.iter().enumerate() { 13 | index.add_doc(&format!("section_{}", i), &[section]); 14 | } 15 | black_box(index.to_json()); 16 | }) 17 | }); 18 | } 19 | 20 | criterion_group!(benches, bench_main); 21 | criterion_main!(benches); 22 | -------------------------------------------------------------------------------- /examples/export_json.rs: -------------------------------------------------------------------------------- 1 | use elasticlunr::Index; 2 | use std::fs::File; 3 | use std::io::Write; 4 | 5 | fn main() { 6 | let mut index = Index::new(&["title", "body"]); 7 | index.add_doc( 8 | "1", 9 | &[ 10 | "This Week in Rust 207", 11 | "Hello and welcome to another issue of This Week in Rust!", 12 | ], 13 | ); 14 | index.add_doc( 15 | "2", 16 | &[ 17 | "This Week in Rust 206", 18 | "Hello and welcome to another issue of This Week in Rust!", 19 | 
], 20 | ); 21 | let mut file = File::create("examples/out.json").unwrap(); 22 | file.write_all(index.to_json_pretty().as_bytes()).unwrap(); 23 | } 24 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | coverage 5 | -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "elasticlunr-rs-fuzz" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2018" 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies] 11 | libfuzzer-sys = "0.4" 12 | 13 | [dependencies.elasticlunr-rs] 14 | path = ".." 15 | 16 | [[bin]] 17 | name = "fuzz_en" 18 | path = "fuzz_targets/fuzz_en.rs" 19 | test = false 20 | doc = false 21 | bench = false 22 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/fuzz_en.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use elasticlunr::lang::{English, Language}; 4 | use libfuzzer_sys::fuzz_target; 5 | 6 | fuzz_target!(|data: &[u8]| { 7 | if let Ok(s) = std::str::from_utf8(data) { 8 | // libFuzzer was finding a ton of UTF-8 inputs which caused panics. That's worth addressing, but it needs to be 9 | // done with an eye towards compatibility with elasticlunr.js. Punt for now. 10 | if !s.is_ascii() { 11 | return; 12 | } 13 | let en = English::new(); 14 | let pipeline = en.make_pipeline(); 15 | let tokens = en.tokenize(s); 16 | let filtered = pipeline.run(tokens); 17 | std::hint::black_box(filtered); 18 | } 19 | }); 20 | -------------------------------------------------------------------------------- /js/lunr.ar.js: -------------------------------------------------------------------------------- 1 | /*! 
2 | * Simple Arabic stemmer based on lunr.ar.js from https://github.com/MihaiValentin/lunr-languages 3 | * 4 | * Copyright 2018, Dalia Al-Shahrabi 5 | * http://www.mozilla.org/MPL/ 6 | */ 7 | 8 | /** 9 | * export the module via AMD, CommonJS or as a browser global 10 | * Export code from https://github.com/umdjs/umd/blob/master/returnExports.js 11 | */ 12 | ; 13 | (function (root, factory) { 14 | if (typeof define === 'function' && define.amd) { 15 | // AMD. Register as an anonymous module. 16 | define(factory) 17 | } else if (typeof exports === 'object') { 18 | /** 19 | * Node. Does not work with strict CommonJS, but 20 | * only CommonJS-like environments that support module.exports, 21 | * like Node. 22 | */ 23 | module.exports = factory() 24 | } else { 25 | // Browser globals (root is window) 26 | factory()(root.lunr); 27 | } 28 | }(this, function () { 29 | /** 30 | * Just return a value to define the module export. 31 | * This example returns an object, but the module 32 | * can return a function as the exported value. 33 | */ 34 | return function (lunr) { 35 | /* throw error if lunr is not yet included */ 36 | if ('undefined' === typeof lunr) { 37 | throw new Error('Lunr is not present. 
Please include / require Lunr before this script.'); 38 | } 39 | 40 | /* register specific locale function */ 41 | lunr.ar = function () { 42 | this.pipeline.reset(); 43 | this.pipeline.add( 44 | lunr.ar.stemmer 45 | ); 46 | 47 | // for lunr version 2 48 | // this is necessary so that every searched word is also stemmed before 49 | // in lunr <= 1 this is not needed, as it is done using the normal pipeline 50 | if (this.searchPipeline) { 51 | this.searchPipeline.reset(); 52 | this.searchPipeline.add(lunr.ar.stemmer) 53 | } 54 | }; 55 | 56 | /* lunr stemmer function */ 57 | lunr.ar.stemmer = (function () { 58 | 59 | /* remove elongating character */ 60 | self.removeElongating = function (word) { 61 | return word.replace(/[\u0640]/gi, ''); 62 | } 63 | 64 | self.removeDiacritics = function (word) { 65 | return word.replace(/[\u064b-\u065b]/gi, ''); 66 | } 67 | 68 | /*Replace all variations of alef (آأإٱى) to a plain alef (ا)*/ 69 | self.cleanAlef = function (word) { 70 | return word.replace(/[\u0622\u0623\u0625\u0671\u0649]/gi, "\u0627"); 71 | } 72 | 73 | self.execArray = [ 74 | 'removeElongating', 75 | 'removeDiacritics', 76 | 'cleanAlef' 77 | ]; 78 | 79 | self.stem = function (word) { 80 | var counter = 0; 81 | while (counter < self.execArray.length) { 82 | word = self[self.execArray[counter]](word); 83 | counter++; 84 | } 85 | return word; 86 | } 87 | 88 | return function (word) { 89 | return self.stem(word); 90 | } 91 | })(); 92 | 93 | lunr.Pipeline.registerFunction(lunr.ar.stemmer, 'stemmer-ar'); 94 | }; 95 | })) -------------------------------------------------------------------------------- /src/config.rs: -------------------------------------------------------------------------------- 1 | //! These types are not used for generating `Index`es. They are provided to help with 2 | //! creating compatible JSON structures for configuring the JavaScript search 3 | //! function. 4 | //! 5 | //! *Reference:* 6 | //! 
7 | 8 | use std::collections::BTreeMap; 9 | 10 | /// Used to set the search configuration for a specific field. 11 | /// When `expand` or `bool` is `None`, elasticlunr.js will use the value from 12 | /// the global configuration. The `boost` field, if present, 13 | /// increases the importance of this field when ordering search results. 14 | #[derive(Serialize, Deserialize, Default, Debug, Copy, Clone, Eq, PartialEq)] 15 | pub struct SearchOptionsField { 16 | #[serde(skip_serializing_if = "Option::is_none")] 17 | pub boost: Option, 18 | #[serde(skip_serializing_if = "Option::is_none")] 19 | pub bool: Option, 20 | #[serde(skip_serializing_if = "Option::is_none")] 21 | pub expand: Option, 22 | } 23 | 24 | /// Sets which boolean model is used for searching with 25 | /// multiple terms. Defaults to `Or`. 26 | /// 27 | /// - *AND* requires every search term to be present in results 28 | /// - *OR* accepts results which have at least one term 29 | /// 30 | #[derive(Serialize, Deserialize, Debug, Copy, Clone, Eq, PartialEq)] 31 | #[serde(rename_all = "SCREAMING_SNAKE_CASE")] 32 | pub enum SearchBool { 33 | Or, 34 | And, 35 | } 36 | 37 | impl Default for SearchBool { 38 | fn default() -> Self { 39 | SearchBool::Or 40 | } 41 | } 42 | 43 | /// The search configuration map which is passed to the 44 | /// elasticlunr.js `Index.search()` function. 
45 | /// 46 | /// |Key |Default| 47 | /// |--------|-------| 48 | /// |`bool` |`OR` | 49 | /// |`expand`|`false`| 50 | #[derive(Serialize, Deserialize, Default, Debug, Clone, Eq, PartialEq)] 51 | pub struct SearchOptions { 52 | pub bool: SearchBool, 53 | pub expand: bool, 54 | pub fields: BTreeMap, 55 | } 56 | 57 | #[cfg(test)] 58 | mod tests { 59 | use super::*; 60 | use serde_json; 61 | 62 | #[test] 63 | fn test_normal_config() { 64 | let options = SearchOptions { 65 | fields: btreemap![ 66 | "title".into() => SearchOptionsField { 67 | boost: Some(5), 68 | ..Default::default() 69 | }, 70 | "body".into() => SearchOptionsField { 71 | boost: Some(1), 72 | ..Default::default() 73 | }, 74 | ], 75 | ..Default::default() 76 | }; 77 | let stringed = serde_json::to_string(&options).unwrap(); 78 | 79 | assert_eq!( 80 | stringed, 81 | r#"{"bool":"OR","expand":false,"fields":{"body":{"boost":1},"title":{"boost":5}}}"# 82 | ); 83 | } 84 | 85 | #[test] 86 | fn test_complex_config() { 87 | let options = SearchOptions { 88 | fields: btreemap! 
{ 89 | "title".into() => SearchOptionsField { 90 | expand: Some(true), 91 | ..Default::default() 92 | }, 93 | "body".into() => SearchOptionsField { 94 | bool: Some(SearchBool::Or), 95 | ..Default::default() 96 | }, 97 | "breadcrumbs".into() => SearchOptionsField { 98 | bool: Some(SearchBool::default()), 99 | boost: Some(200), 100 | ..Default::default() 101 | }, 102 | }, 103 | expand: false, 104 | bool: SearchBool::And, 105 | }; 106 | let stringed = serde_json::to_string_pretty(&options).unwrap(); 107 | 108 | assert_eq!( 109 | stringed, 110 | r#"{ 111 | "bool": "AND", 112 | "expand": false, 113 | "fields": { 114 | "body": { 115 | "bool": "OR" 116 | }, 117 | "breadcrumbs": { 118 | "boost": 200, 119 | "bool": "OR" 120 | }, 121 | "title": { 122 | "expand": true 123 | } 124 | } 125 | }"# 126 | ); 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/lang/ar.rs: -------------------------------------------------------------------------------- 1 | use super::Language; 2 | use crate::pipeline::{Pipeline, PipelineFn}; 3 | use regex::Regex; 4 | 5 | /// Arabic Language 6 | /// 7 | /// Designed to be compatible with the included JavaScript implementation. See `js/lunr.ar.js`. 
8 | pub struct Arabic {} 9 | 10 | impl Arabic { 11 | pub fn new() -> Self { 12 | Self {} 13 | } 14 | } 15 | 16 | impl Language for Arabic { 17 | fn name(&self) -> String { 18 | "Arabic".into() 19 | } 20 | fn code(&self) -> String { 21 | "ar".into() 22 | } 23 | 24 | fn tokenize(&self, text: &str) -> Vec { 25 | super::tokenize_whitespace(text) 26 | } 27 | 28 | fn make_pipeline(&self) -> Pipeline { 29 | Pipeline { 30 | queue: vec![Box::new(Stemmer::new())], 31 | } 32 | } 33 | } 34 | 35 | struct Stemmer { 36 | diacritics: Regex, 37 | alefs: Regex, 38 | } 39 | 40 | impl Stemmer { 41 | pub fn new() -> Self { 42 | let diacritics = Regex::new("[\u{0640}\u{064b}-\u{065b}]").unwrap(); 43 | let alefs = Regex::new("[\u{0622}\u{0623}\u{0625}\u{0671}\u{0649}]").unwrap(); 44 | Self { diacritics, alefs } 45 | } 46 | } 47 | 48 | impl PipelineFn for Stemmer { 49 | fn name(&self) -> String { 50 | "stemmer-ar".into() 51 | } 52 | 53 | fn filter(&self, token: String) -> Option { 54 | // remove every diacritic and elongating character; replace_all mirrors the /g flag used in js/lunr.ar.js (plain replace only rewrites the first match) 55 | let result = self.diacritics.replace_all(&token, ""); 56 | // replace all variations of alef (آأإٱى) to a plain alef (ا), again for every occurrence in the token 57 | let result = self.alefs.replace_all(&result, "\u{0627}"); 58 | if result.is_empty() { 59 | None 60 | } else if result == token { 61 | Some(token) 62 | } else { 63 | Some(result.into()) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/lang/common.rs: -------------------------------------------------------------------------------- 1 | use crate::pipeline::PipelineFn; 2 | use regex::Regex; 3 | use std::collections::HashSet; 4 | 5 | #[derive(Clone)] 6 | pub struct StopWordFilter { 7 | name: String, 8 | stop_words: HashSet, 9 | } 10 | 11 | impl StopWordFilter { 12 | pub fn new(name: &str, stop_words: &[&str]) -> Self { 13 | Self { 14 | name: name.into(), 15 | stop_words: stop_words.iter().map(|s| s.to_string()).collect(), 16 | } 17 | } 18 | } 19 | 20 | impl PipelineFn for 
StopWordFilter { 21 | fn name(&self) -> String { 22 | self.name.clone() 23 | } 24 | 25 | fn filter(&self, token: String) -> Option { 26 | if self.stop_words.contains(&token) { 27 | None 28 | } else { 29 | Some(token) 30 | } 31 | } 32 | } 33 | 34 | #[derive(Clone)] 35 | pub struct RegexTrimmer { 36 | name: String, 37 | trimmer: Regex, 38 | } 39 | 40 | impl RegexTrimmer { 41 | pub fn new(name: &str, word_chars: &str) -> Self { 42 | let name = name.into(); 43 | let trimmer = Regex::new(&format!("^[^{0}]+|[^{0}]+$", word_chars)).unwrap(); 44 | Self { name, trimmer } 45 | } 46 | } 47 | 48 | impl PipelineFn for RegexTrimmer { 49 | fn name(&self) -> String { 50 | self.name.clone() 51 | } 52 | 53 | fn filter(&self, token: String) -> Option { 54 | let result = self.trimmer.replace_all(&token, ""); 55 | if result.is_empty() { 56 | None 57 | } else if result == token { 58 | Some(token) 59 | } else { 60 | Some(result.into()) 61 | } 62 | } 63 | } 64 | 65 | #[cfg(feature = "rust-stemmers")] 66 | pub struct RustStemmer { 67 | name: String, 68 | stemmer: rust_stemmers::Stemmer, 69 | } 70 | 71 | #[cfg(feature = "rust-stemmers")] 72 | impl RustStemmer { 73 | pub fn new(name: &str, algo: rust_stemmers::Algorithm) -> Self { 74 | Self { 75 | name: name.into(), 76 | stemmer: rust_stemmers::Stemmer::create(algo), 77 | } 78 | } 79 | } 80 | 81 | #[cfg(feature = "rust-stemmers")] 82 | impl PipelineFn for RustStemmer { 83 | fn name(&self) -> String { 84 | self.name.clone() 85 | } 86 | 87 | fn filter(&self, token: String) -> Option { 88 | let result = self.stemmer.stem(&token); 89 | if result.is_empty() { 90 | None 91 | } else if result == token { 92 | Some(token) 93 | } else { 94 | Some(result.into()) 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/lang/da.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 
| }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Danish {} 10 | 11 | impl Danish { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Danish { 18 | fn name(&self) -> String { 19 | "Danish".into() 20 | } 21 | fn code(&self) -> String { 22 | "da".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-da", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-da", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-da", Algorithm::Danish)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", "ad", "af", "alle", "alt", "anden", "at", "blev", "blive", "bliver", "da", "de", "dem", 42 | "den", "denne", "der", "deres", "det", "dette", "dig", "din", "disse", "dog", "du", "efter", 43 | "eller", "en", "end", "er", "et", "for", "fra", "ham", "han", "hans", "har", "havde", "have", 44 | "hende", "hendes", "her", "hos", "hun", "hvad", "hvis", "hvor", "i", "ikke", "ind", "jeg", 45 | "jer", "jo", "kunne", "man", "mange", "med", "meget", "men", "mig", "min", "mine", "mit", 46 | "mod", "ned", "noget", "nogle", "nu", "når", "og", "også", "om", "op", "os", "over", "på", 47 | "selv", "sig", "sin", "sine", "sit", "skal", "skulle", "som", "sådan", "thi", "til", "ud", 48 | "under", "var", "vi", "vil", "ville", "vor", "være", "været", 49 | ]; 50 | -------------------------------------------------------------------------------- /src/lang/de.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct German {} 10 | 11 | impl German { 12 | pub fn new() -> Self 
{ 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for German { 18 | fn name(&self) -> String { 19 | "German".into() 20 | } 21 | fn code(&self) -> String { 22 | "de".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-de", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-de", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-de", Algorithm::German)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "aber", 43 | "alle", 44 | "allem", 45 | "allen", 46 | "aller", 47 | "alles", 48 | "als", 49 | "also", 50 | "am", 51 | "an", 52 | "ander", 53 | "andere", 54 | "anderem", 55 | "anderen", 56 | "anderer", 57 | "anderes", 58 | "anderm", 59 | "andern", 60 | "anderr", 61 | "anders", 62 | "auch", 63 | "auf", 64 | "aus", 65 | "bei", 66 | "bin", 67 | "bis", 68 | "bist", 69 | "da", 70 | "damit", 71 | "dann", 72 | "das", 73 | "dasselbe", 74 | "dazu", 75 | "daß", 76 | "dein", 77 | "deine", 78 | "deinem", 79 | "deinen", 80 | "deiner", 81 | "deines", 82 | "dem", 83 | "demselben", 84 | "den", 85 | "denn", 86 | "denselben", 87 | "der", 88 | "derer", 89 | "derselbe", 90 | "derselben", 91 | "des", 92 | "desselben", 93 | "dessen", 94 | "dich", 95 | "die", 96 | "dies", 97 | "diese", 98 | "dieselbe", 99 | "dieselben", 100 | "diesem", 101 | "diesen", 102 | "dieser", 103 | "dieses", 104 | "dir", 105 | "doch", 106 | "dort", 107 | "du", 108 | "durch", 109 | "ein", 110 | "eine", 111 | "einem", 112 | "einen", 113 | "einer", 114 | "eines", 115 | "einig", 116 | "einige", 117 | "einigem", 118 | "einigen", 119 | "einiger", 120 | "einiges", 121 | "einmal", 122 | "er", 123 | "es", 124 | "etwas", 125 | "euch", 126 | "euer", 127 | "eure", 128 | "eurem", 129 | "euren", 130 | "eurer", 131 | "eures", 132 | "für", 133 | "gegen", 134 | "gewesen", 135 | "hab", 136 | "habe", 137 
| "haben", 138 | "hat", 139 | "hatte", 140 | "hatten", 141 | "hier", 142 | "hin", 143 | "hinter", 144 | "ich", 145 | "ihm", 146 | "ihn", 147 | "ihnen", 148 | "ihr", 149 | "ihre", 150 | "ihrem", 151 | "ihren", 152 | "ihrer", 153 | "ihres", 154 | "im", 155 | "in", 156 | "indem", 157 | "ins", 158 | "ist", 159 | "jede", 160 | "jedem", 161 | "jeden", 162 | "jeder", 163 | "jedes", 164 | "jene", 165 | "jenem", 166 | "jenen", 167 | "jener", 168 | "jenes", 169 | "jetzt", 170 | "kann", 171 | "kein", 172 | "keine", 173 | "keinem", 174 | "keinen", 175 | "keiner", 176 | "keines", 177 | "können", 178 | "könnte", 179 | "machen", 180 | "man", 181 | "manche", 182 | "manchem", 183 | "manchen", 184 | "mancher", 185 | "manches", 186 | "mein", 187 | "meine", 188 | "meinem", 189 | "meinen", 190 | "meiner", 191 | "meines", 192 | "mich", 193 | "mir", 194 | "mit", 195 | "muss", 196 | "musste", 197 | "nach", 198 | "nicht", 199 | "nichts", 200 | "noch", 201 | "nun", 202 | "nur", 203 | "ob", 204 | "oder", 205 | "ohne", 206 | "sehr", 207 | "sein", 208 | "seine", 209 | "seinem", 210 | "seinen", 211 | "seiner", 212 | "seines", 213 | "selbst", 214 | "sich", 215 | "sie", 216 | "sind", 217 | "so", 218 | "solche", 219 | "solchem", 220 | "solchen", 221 | "solcher", 222 | "solches", 223 | "soll", 224 | "sollte", 225 | "sondern", 226 | "sonst", 227 | "um", 228 | "und", 229 | "uns", 230 | "unse", 231 | "unsem", 232 | "unsen", 233 | "unser", 234 | "unses", 235 | "unter", 236 | "viel", 237 | "vom", 238 | "von", 239 | "vor", 240 | "war", 241 | "waren", 242 | "warst", 243 | "was", 244 | "weg", 245 | "weil", 246 | "weiter", 247 | "welche", 248 | "welchem", 249 | "welchen", 250 | "welcher", 251 | "welches", 252 | "wenn", 253 | "werde", 254 | "werden", 255 | "wie", 256 | "wieder", 257 | "will", 258 | "wir", 259 | "wird", 260 | "wirst", 261 | "wo", 262 | "wollen", 263 | "wollte", 264 | "während", 265 | "würde", 266 | "würden", 267 | "zu", 268 | "zum", 269 | "zur", 270 | "zwar", 271 | "zwischen", 272 | "über", 
273 | ]; 274 | -------------------------------------------------------------------------------- /src/lang/du.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Dutch {} 10 | 11 | impl Dutch { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Dutch { 18 | fn name(&self) -> String { 19 | "Dutch".into() 20 | } 21 | fn code(&self) -> String { 22 | "du".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-du", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-du", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-du", Algorithm::Dutch)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", "aan", "al", "alles", "als", "altijd", "andere", "ben", "bij", "daar", "dan", "dat", "de", 42 | "der", "deze", "die", "dit", "doch", "doen", "door", "dus", "een", "eens", "en", "er", "ge", 43 | "geen", "geweest", "haar", "had", "heb", "hebben", "heeft", "hem", "het", "hier", "hij", "hoe", 44 | "hun", "iemand", "iets", "ik", "in", "is", "ja", "je", "kan", "kon", "kunnen", "maar", "me", 45 | "meer", "men", "met", "mij", "mijn", "moet", "na", "naar", "niet", "niets", "nog", "nu", "of", 46 | "om", "omdat", "onder", "ons", "ook", "op", "over", "reeds", "te", "tegen", "toch", "toen", 47 | "tot", "u", "uit", "uw", "van", "veel", "voor", "want", "waren", "was", "wat", "werd", "wezen", 48 | "wie", "wil", "worden", "wordt", "zal", "ze", "zelf", "zich", "zij", "zijn", "zo", "zonder", 49 | "zou", 50 | ]; 51 | -------------------------------------------------------------------------------- /src/lang/es.rs: 
-------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Spanish {} 10 | 11 | impl Spanish { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Spanish { 18 | fn name(&self) -> String { 19 | "Spanish".into() 20 | } 21 | fn code(&self) -> String { 22 | "es".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-es", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-es", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-es", Algorithm::Spanish)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "a", 43 | "al", 44 | "algo", 45 | "algunas", 46 | "algunos", 47 | "ante", 48 | "antes", 49 | "como", 50 | "con", 51 | "contra", 52 | "cual", 53 | "cuando", 54 | "de", 55 | "del", 56 | "desde", 57 | "donde", 58 | "durante", 59 | "e", 60 | "el", 61 | "ella", 62 | "ellas", 63 | "ellos", 64 | "en", 65 | "entre", 66 | "era", 67 | "erais", 68 | "eran", 69 | "eras", 70 | "eres", 71 | "es", 72 | "esa", 73 | "esas", 74 | "ese", 75 | "eso", 76 | "esos", 77 | "esta", 78 | "estaba", 79 | "estabais", 80 | "estaban", 81 | "estabas", 82 | "estad", 83 | "estada", 84 | "estadas", 85 | "estado", 86 | "estados", 87 | "estamos", 88 | "estando", 89 | "estar", 90 | "estaremos", 91 | "estará", 92 | "estarán", 93 | "estarás", 94 | "estaré", 95 | "estaréis", 96 | "estaría", 97 | "estaríais", 98 | "estaríamos", 99 | "estarían", 100 | "estarías", 101 | "estas", 102 | "este", 103 | "estemos", 104 | "esto", 105 | "estos", 106 | "estoy", 107 | "estuve", 108 | "estuviera", 109 | "estuvierais", 110 | "estuvieran", 111 | 
"estuvieras", 112 | "estuvieron", 113 | "estuviese", 114 | "estuvieseis", 115 | "estuviesen", 116 | "estuvieses", 117 | "estuvimos", 118 | "estuviste", 119 | "estuvisteis", 120 | "estuviéramos", 121 | "estuviésemos", 122 | "estuvo", 123 | "está", 124 | "estábamos", 125 | "estáis", 126 | "están", 127 | "estás", 128 | "esté", 129 | "estéis", 130 | "estén", 131 | "estés", 132 | "fue", 133 | "fuera", 134 | "fuerais", 135 | "fueran", 136 | "fueras", 137 | "fueron", 138 | "fuese", 139 | "fueseis", 140 | "fuesen", 141 | "fueses", 142 | "fui", 143 | "fuimos", 144 | "fuiste", 145 | "fuisteis", 146 | "fuéramos", 147 | "fuésemos", 148 | "ha", 149 | "habida", 150 | "habidas", 151 | "habido", 152 | "habidos", 153 | "habiendo", 154 | "habremos", 155 | "habrá", 156 | "habrán", 157 | "habrás", 158 | "habré", 159 | "habréis", 160 | "habría", 161 | "habríais", 162 | "habríamos", 163 | "habrían", 164 | "habrías", 165 | "habéis", 166 | "había", 167 | "habíais", 168 | "habíamos", 169 | "habían", 170 | "habías", 171 | "han", 172 | "has", 173 | "hasta", 174 | "hay", 175 | "haya", 176 | "hayamos", 177 | "hayan", 178 | "hayas", 179 | "hayáis", 180 | "he", 181 | "hemos", 182 | "hube", 183 | "hubiera", 184 | "hubierais", 185 | "hubieran", 186 | "hubieras", 187 | "hubieron", 188 | "hubiese", 189 | "hubieseis", 190 | "hubiesen", 191 | "hubieses", 192 | "hubimos", 193 | "hubiste", 194 | "hubisteis", 195 | "hubiéramos", 196 | "hubiésemos", 197 | "hubo", 198 | "la", 199 | "las", 200 | "le", 201 | "les", 202 | "lo", 203 | "los", 204 | "me", 205 | "mi", 206 | "mis", 207 | "mucho", 208 | "muchos", 209 | "muy", 210 | "más", 211 | "mí", 212 | "mía", 213 | "mías", 214 | "mío", 215 | "míos", 216 | "nada", 217 | "ni", 218 | "no", 219 | "nos", 220 | "nosotras", 221 | "nosotros", 222 | "nuestra", 223 | "nuestras", 224 | "nuestro", 225 | "nuestros", 226 | "o", 227 | "os", 228 | "otra", 229 | "otras", 230 | "otro", 231 | "otros", 232 | "para", 233 | "pero", 234 | "poco", 235 | "por", 236 | "porque", 237 | 
"que", 238 | "quien", 239 | "quienes", 240 | "qué", 241 | "se", 242 | "sea", 243 | "seamos", 244 | "sean", 245 | "seas", 246 | "seremos", 247 | "será", 248 | "serán", 249 | "serás", 250 | "seré", 251 | "seréis", 252 | "sería", 253 | "seríais", 254 | "seríamos", 255 | "serían", 256 | "serías", 257 | "seáis", 258 | "sido", 259 | "siendo", 260 | "sin", 261 | "sobre", 262 | "sois", 263 | "somos", 264 | "son", 265 | "soy", 266 | "su", 267 | "sus", 268 | "suya", 269 | "suyas", 270 | "suyo", 271 | "suyos", 272 | "sí", 273 | "también", 274 | "tanto", 275 | "te", 276 | "tendremos", 277 | "tendrá", 278 | "tendrán", 279 | "tendrás", 280 | "tendré", 281 | "tendréis", 282 | "tendría", 283 | "tendríais", 284 | "tendríamos", 285 | "tendrían", 286 | "tendrías", 287 | "tened", 288 | "tenemos", 289 | "tenga", 290 | "tengamos", 291 | "tengan", 292 | "tengas", 293 | "tengo", 294 | "tengáis", 295 | "tenida", 296 | "tenidas", 297 | "tenido", 298 | "tenidos", 299 | "teniendo", 300 | "tenéis", 301 | "tenía", 302 | "teníais", 303 | "teníamos", 304 | "tenían", 305 | "tenías", 306 | "ti", 307 | "tiene", 308 | "tienen", 309 | "tienes", 310 | "todo", 311 | "todos", 312 | "tu", 313 | "tus", 314 | "tuve", 315 | "tuviera", 316 | "tuvierais", 317 | "tuvieran", 318 | "tuvieras", 319 | "tuvieron", 320 | "tuviese", 321 | "tuvieseis", 322 | "tuviesen", 323 | "tuvieses", 324 | "tuvimos", 325 | "tuviste", 326 | "tuvisteis", 327 | "tuviéramos", 328 | "tuviésemos", 329 | "tuvo", 330 | "tuya", 331 | "tuyas", 332 | "tuyo", 333 | "tuyos", 334 | "tú", 335 | "un", 336 | "una", 337 | "uno", 338 | "unos", 339 | "vosotras", 340 | "vosotros", 341 | "vuestra", 342 | "vuestras", 343 | "vuestro", 344 | "vuestros", 345 | "y", 346 | "ya", 347 | "yo", 348 | "él", 349 | "éramos", 350 | ]; 351 | -------------------------------------------------------------------------------- /src/lang/fi.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, 
StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Finnish {} 10 | 11 | impl Finnish { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Finnish { 18 | fn name(&self) -> String { 19 | "Finnish".into() 20 | } 21 | fn code(&self) -> String { 22 | "fi".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-fi", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-fi", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-fi", Algorithm::Finnish)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "ei", 43 | "eivät", 44 | "emme", 45 | "en", 46 | "et", 47 | "ette", 48 | "että", 49 | "he", 50 | "heidän", 51 | "heidät", 52 | "heihin", 53 | "heille", 54 | "heillä", 55 | "heiltä", 56 | "heissä", 57 | "heistä", 58 | "heitä", 59 | "hän", 60 | "häneen", 61 | "hänelle", 62 | "hänellä", 63 | "häneltä", 64 | "hänen", 65 | "hänessä", 66 | "hänestä", 67 | "hänet", 68 | "häntä", 69 | "itse", 70 | "ja", 71 | "johon", 72 | "joiden", 73 | "joihin", 74 | "joiksi", 75 | "joilla", 76 | "joille", 77 | "joilta", 78 | "joina", 79 | "joissa", 80 | "joista", 81 | "joita", 82 | "joka", 83 | "joksi", 84 | "jolla", 85 | "jolle", 86 | "jolta", 87 | "jona", 88 | "jonka", 89 | "jos", 90 | "jossa", 91 | "josta", 92 | "jota", 93 | "jotka", 94 | "kanssa", 95 | "keiden", 96 | "keihin", 97 | "keiksi", 98 | "keille", 99 | "keillä", 100 | "keiltä", 101 | "keinä", 102 | "keissä", 103 | "keistä", 104 | "keitä", 105 | "keneen", 106 | "keneksi", 107 | "kenelle", 108 | "kenellä", 109 | "keneltä", 110 | "kenen", 111 | "kenenä", 112 | "kenessä", 113 | "kenestä", 114 | "kenet", 115 | "ketkä", 116 | "ketkä", 117 | "ketä", 118 | "koska", 119 | "kuin", 120 
| "kuka", 121 | "kun", 122 | "me", 123 | "meidän", 124 | "meidät", 125 | "meihin", 126 | "meille", 127 | "meillä", 128 | "meiltä", 129 | "meissä", 130 | "meistä", 131 | "meitä", 132 | "mihin", 133 | "miksi", 134 | "mikä", 135 | "mille", 136 | "millä", 137 | "miltä", 138 | "minkä", 139 | "minkä", 140 | "minua", 141 | "minulla", 142 | "minulle", 143 | "minulta", 144 | "minun", 145 | "minussa", 146 | "minusta", 147 | "minut", 148 | "minuun", 149 | "minä", 150 | "minä", 151 | "missä", 152 | "mistä", 153 | "mitkä", 154 | "mitä", 155 | "mukaan", 156 | "mutta", 157 | "ne", 158 | "niiden", 159 | "niihin", 160 | "niiksi", 161 | "niille", 162 | "niillä", 163 | "niiltä", 164 | "niin", 165 | "niin", 166 | "niinä", 167 | "niissä", 168 | "niistä", 169 | "niitä", 170 | "noiden", 171 | "noihin", 172 | "noiksi", 173 | "noilla", 174 | "noille", 175 | "noilta", 176 | "noin", 177 | "noina", 178 | "noissa", 179 | "noista", 180 | "noita", 181 | "nuo", 182 | "nyt", 183 | "näiden", 184 | "näihin", 185 | "näiksi", 186 | "näille", 187 | "näillä", 188 | "näiltä", 189 | "näinä", 190 | "näissä", 191 | "näistä", 192 | "näitä", 193 | "nämä", 194 | "ole", 195 | "olemme", 196 | "olen", 197 | "olet", 198 | "olette", 199 | "oli", 200 | "olimme", 201 | "olin", 202 | "olisi", 203 | "olisimme", 204 | "olisin", 205 | "olisit", 206 | "olisitte", 207 | "olisivat", 208 | "olit", 209 | "olitte", 210 | "olivat", 211 | "olla", 212 | "olleet", 213 | "ollut", 214 | "on", 215 | "ovat", 216 | "poikki", 217 | "se", 218 | "sekä", 219 | "sen", 220 | "siihen", 221 | "siinä", 222 | "siitä", 223 | "siksi", 224 | "sille", 225 | "sillä", 226 | "sillä", 227 | "siltä", 228 | "sinua", 229 | "sinulla", 230 | "sinulle", 231 | "sinulta", 232 | "sinun", 233 | "sinussa", 234 | "sinusta", 235 | "sinut", 236 | "sinuun", 237 | "sinä", 238 | "sinä", 239 | "sitä", 240 | "tai", 241 | "te", 242 | "teidän", 243 | "teidät", 244 | "teihin", 245 | "teille", 246 | "teillä", 247 | "teiltä", 248 | "teissä", 249 | "teistä", 250 | "teitä", 251 
| "tuo", 252 | "tuohon", 253 | "tuoksi", 254 | "tuolla", 255 | "tuolle", 256 | "tuolta", 257 | "tuon", 258 | "tuona", 259 | "tuossa", 260 | "tuosta", 261 | "tuota", 262 | "tähän", 263 | "täksi", 264 | "tälle", 265 | "tällä", 266 | "tältä", 267 | "tämä", 268 | "tämän", 269 | "tänä", 270 | "tässä", 271 | "tästä", 272 | "tätä", 273 | "vaan", 274 | "vai", 275 | "vaikka", 276 | "yli", 277 | ]; 278 | -------------------------------------------------------------------------------- /src/lang/fr.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct French {} 10 | 11 | impl French { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for French { 18 | fn name(&self) -> String { 19 | "French".into() 20 | } 21 | fn code(&self) -> String { 22 | "fr".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-fr", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-fr", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-fr", Algorithm::French)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", "ai", "aie", "aient", "aies", "ait", "as", "au", "aura", "aurai", "auraient", "aurais", 42 | "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aux", "avaient", "avais", 43 | "avait", "avec", "avez", "aviez", "avions", "avons", "ayant", "ayez", "ayons", "c", "ce", 44 | "ceci", "celà", "ces", "cet", "cette", "d", "dans", "de", "des", "du", "elle", "en", "es", 45 | "est", "et", "eu", "eue", "eues", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", 46 | "eussions", "eut", "eux", "eûmes", "eût", 
"eûtes", "furent", "fus", "fusse", "fussent", 47 | "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "ici", "il", "ils", "j", "je", 48 | "l", "la", "le", "les", "leur", "leurs", "lui", "m", "ma", "mais", "me", "mes", "moi", "mon", 49 | "même", "n", "ne", "nos", "notre", "nous", "on", "ont", "ou", "par", "pas", "pour", "qu", 50 | "que", "quel", "quelle", "quelles", "quels", "qui", "s", "sa", "sans", "se", "sera", "serai", 51 | "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", 52 | "ses", "soi", "soient", "sois", "soit", "sommes", "son", "sont", "soyez", "soyons", "suis", 53 | "sur", "t", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", "y", 54 | "à", "étaient", "étais", "était", "étant", "étiez", "étions", "été", "étée", "étées", "étés", 55 | "êtes", 56 | ]; 57 | -------------------------------------------------------------------------------- /src/lang/it.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Italian {} 10 | 11 | impl Italian { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Italian { 18 | fn name(&self) -> String { 19 | "Italian".into() 20 | } 21 | fn code(&self) -> String { 22 | "it".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-it", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-it", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-it", Algorithm::Italian)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "a", 43 | "abbia", 44 | "abbiamo", 45 | "abbiano", 46 | 
"abbiate", 47 | "ad", 48 | "agl", 49 | "agli", 50 | "ai", 51 | "al", 52 | "all", 53 | "alla", 54 | "alle", 55 | "allo", 56 | "anche", 57 | "avemmo", 58 | "avendo", 59 | "avesse", 60 | "avessero", 61 | "avessi", 62 | "avessimo", 63 | "aveste", 64 | "avesti", 65 | "avete", 66 | "aveva", 67 | "avevamo", 68 | "avevano", 69 | "avevate", 70 | "avevi", 71 | "avevo", 72 | "avrai", 73 | "avranno", 74 | "avrebbe", 75 | "avrebbero", 76 | "avrei", 77 | "avremmo", 78 | "avremo", 79 | "avreste", 80 | "avresti", 81 | "avrete", 82 | "avrà", 83 | "avrò", 84 | "avuta", 85 | "avute", 86 | "avuti", 87 | "avuto", 88 | "c", 89 | "che", 90 | "chi", 91 | "ci", 92 | "coi", 93 | "col", 94 | "come", 95 | "con", 96 | "contro", 97 | "cui", 98 | "da", 99 | "dagl", 100 | "dagli", 101 | "dai", 102 | "dal", 103 | "dall", 104 | "dalla", 105 | "dalle", 106 | "dallo", 107 | "degl", 108 | "degli", 109 | "dei", 110 | "del", 111 | "dell", 112 | "della", 113 | "delle", 114 | "dello", 115 | "di", 116 | "dov", 117 | "dove", 118 | "e", 119 | "ebbe", 120 | "ebbero", 121 | "ebbi", 122 | "ed", 123 | "era", 124 | "erano", 125 | "eravamo", 126 | "eravate", 127 | "eri", 128 | "ero", 129 | "essendo", 130 | "faccia", 131 | "facciamo", 132 | "facciano", 133 | "facciate", 134 | "faccio", 135 | "facemmo", 136 | "facendo", 137 | "facesse", 138 | "facessero", 139 | "facessi", 140 | "facessimo", 141 | "faceste", 142 | "facesti", 143 | "faceva", 144 | "facevamo", 145 | "facevano", 146 | "facevate", 147 | "facevi", 148 | "facevo", 149 | "fai", 150 | "fanno", 151 | "farai", 152 | "faranno", 153 | "farebbe", 154 | "farebbero", 155 | "farei", 156 | "faremmo", 157 | "faremo", 158 | "fareste", 159 | "faresti", 160 | "farete", 161 | "farà", 162 | "farò", 163 | "fece", 164 | "fecero", 165 | "feci", 166 | "fosse", 167 | "fossero", 168 | "fossi", 169 | "fossimo", 170 | "foste", 171 | "fosti", 172 | "fu", 173 | "fui", 174 | "fummo", 175 | "furono", 176 | "gli", 177 | "ha", 178 | "hai", 179 | "hanno", 180 | "ho", 181 | "i", 182 | 
"il", 183 | "in", 184 | "io", 185 | "l", 186 | "la", 187 | "le", 188 | "lei", 189 | "li", 190 | "lo", 191 | "loro", 192 | "lui", 193 | "ma", 194 | "mi", 195 | "mia", 196 | "mie", 197 | "miei", 198 | "mio", 199 | "ne", 200 | "negl", 201 | "negli", 202 | "nei", 203 | "nel", 204 | "nell", 205 | "nella", 206 | "nelle", 207 | "nello", 208 | "noi", 209 | "non", 210 | "nostra", 211 | "nostre", 212 | "nostri", 213 | "nostro", 214 | "o", 215 | "per", 216 | "perché", 217 | "più", 218 | "quale", 219 | "quanta", 220 | "quante", 221 | "quanti", 222 | "quanto", 223 | "quella", 224 | "quelle", 225 | "quelli", 226 | "quello", 227 | "questa", 228 | "queste", 229 | "questi", 230 | "questo", 231 | "sarai", 232 | "saranno", 233 | "sarebbe", 234 | "sarebbero", 235 | "sarei", 236 | "saremmo", 237 | "saremo", 238 | "sareste", 239 | "saresti", 240 | "sarete", 241 | "sarà", 242 | "sarò", 243 | "se", 244 | "sei", 245 | "si", 246 | "sia", 247 | "siamo", 248 | "siano", 249 | "siate", 250 | "siete", 251 | "sono", 252 | "sta", 253 | "stai", 254 | "stando", 255 | "stanno", 256 | "starai", 257 | "staranno", 258 | "starebbe", 259 | "starebbero", 260 | "starei", 261 | "staremmo", 262 | "staremo", 263 | "stareste", 264 | "staresti", 265 | "starete", 266 | "starà", 267 | "starò", 268 | "stava", 269 | "stavamo", 270 | "stavano", 271 | "stavate", 272 | "stavi", 273 | "stavo", 274 | "stemmo", 275 | "stesse", 276 | "stessero", 277 | "stessi", 278 | "stessimo", 279 | "steste", 280 | "stesti", 281 | "stette", 282 | "stettero", 283 | "stetti", 284 | "stia", 285 | "stiamo", 286 | "stiano", 287 | "stiate", 288 | "sto", 289 | "su", 290 | "sua", 291 | "sue", 292 | "sugl", 293 | "sugli", 294 | "sui", 295 | "sul", 296 | "sull", 297 | "sulla", 298 | "sulle", 299 | "sullo", 300 | "suo", 301 | "suoi", 302 | "ti", 303 | "tra", 304 | "tu", 305 | "tua", 306 | "tue", 307 | "tuo", 308 | "tuoi", 309 | "tutti", 310 | "tutto", 311 | "un", 312 | "una", 313 | "uno", 314 | "vi", 315 | "voi", 316 | "vostra", 317 | "vostre", 318 
| "vostri", 319 | "vostro", 320 | "è", 321 | ]; 322 | -------------------------------------------------------------------------------- /src/lang/ja.rs: -------------------------------------------------------------------------------- 1 | use super::{common::RegexTrimmer, Language}; 2 | use crate::pipeline::{FnWrapper, Pipeline}; 3 | use lindera::tokenizer::{Tokenizer, TokenizerConfig}; 4 | use lindera_core::viterbi::Mode; 5 | 6 | #[derive(Clone)] 7 | pub struct Japanese { 8 | tokenizer: Tokenizer, 9 | } 10 | 11 | impl Japanese { 12 | pub fn new() -> Self { 13 | let config = TokenizerConfig { 14 | mode: Mode::Decompose(Default::default()), 15 | ..Default::default() 16 | }; 17 | Self::with_config(config) 18 | } 19 | 20 | pub fn with_config(config: TokenizerConfig) -> Self { 21 | // NB: unwrap() is okay since the errors are only related to user-supplied dictionaries. 22 | let tokenizer = Tokenizer::with_config(config).unwrap(); 23 | Self { tokenizer } 24 | } 25 | } 26 | 27 | impl Language for Japanese { 28 | fn name(&self) -> String { 29 | "Japanese".into() 30 | } 31 | fn code(&self) -> String { 32 | "ja".into() 33 | } 34 | 35 | fn tokenize(&self, text: &str) -> Vec { 36 | self.tokenizer 37 | .tokenize(text) 38 | .unwrap() 39 | .into_iter() 40 | .filter_map(|tok| match tok.detail.get(0).map(|d| d.as_str()) { 41 | Some("助詞") | Some("助動詞") | Some("記号") | Some("UNK") => None, 42 | _ => Some(tok.text.to_string()), 43 | }) 44 | .collect() 45 | } 46 | 47 | fn make_pipeline(&self) -> Pipeline { 48 | Pipeline { 49 | queue: vec![ 50 | Box::new(RegexTrimmer::new("trimmer-ja", WORD_CHARS)), 51 | Box::new(FnWrapper("stemmer-ja".into(), stemmer)), 52 | ], 53 | } 54 | } 55 | } 56 | 57 | const WORD_CHARS: &str = r"0-9A-Za-z\p{Hiragana}\p{Katakana}\p{Unified_Ideograph}"; 58 | 59 | fn stemmer(token: String) -> Option { 60 | Some(token) 61 | } 62 | 63 | #[cfg(test)] 64 | mod tests { 65 | use crate::pipeline::PipelineFn; 66 | use super::*; 67 | 68 | #[test] 69 | fn test_trimmer() { 70 | 
let trimmer = RegexTrimmer::new("trimmer-ja".into(), WORD_CHARS); 71 | assert_eq!( 72 | trimmer.filter(" こんにちは、世界!".to_string()), 73 | Some("こんにちは、世界".to_string()) 74 | ); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/lang/mod.rs: -------------------------------------------------------------------------------- 1 | //! Intended to be compatible with . Each supported 2 | //! language has a trimmer, a stop word filter, and a stemmer. Most users will not need to use 3 | //! these modules directly. 4 | 5 | pub mod common; 6 | 7 | use crate::Pipeline; 8 | 9 | pub trait Language { 10 | /// The name of the language in English 11 | fn name(&self) -> String; 12 | 13 | /// The ISO 639-1 language code of the language 14 | fn code(&self) -> String; 15 | 16 | /// Separates the input text into individual tokens. In most languages a token is a word, separated by whitespace. 17 | fn tokenize(&self, text: &str) -> Vec; 18 | 19 | /// Returns the [`Pipeline`] to process the tokens with 20 | fn make_pipeline(&self) -> Pipeline; 21 | } 22 | 23 | /// Splits a text string into a vector of individual tokens. 24 | pub fn tokenize_whitespace(text: &str) -> Vec { 25 | text.split(|c: char| c.is_whitespace() || c == '-') 26 | .filter(|s| !s.is_empty()) 27 | .map(|s| s.trim().to_lowercase()) 28 | .collect() 29 | } 30 | 31 | macro_rules! impl_language { 32 | ($( ( $name:ident, $code:ident $(, #[$cfgs:meta] )? ), )+) => { 33 | /// Returns a list of all the [`Language`] implementations in the crate 34 | pub fn languages() -> Vec> { 35 | vec![ 36 | $( 37 | $(#[$cfgs])? 38 | Box::new($code::$name::new()), 39 | )+ 40 | ] 41 | } 42 | 43 | /// Returns the [`Language`] for the given two-character [ISO 639-1][iso] language code if the 44 | /// language is supported. Returns `None` if not supported. 45 | /// 46 | /// *Note:* 47 | /// 48 | /// The ISO 639-1 code for Dutch is "nl". 
However "du" is used for the module name 49 | /// and pipeline suffix in order to match lunr-languages. 50 | /// 51 | /// [iso]: https://en.wikipedia.org/wiki/ISO_639-1 52 | pub fn from_code(code: &str) -> Option> { 53 | match code.to_ascii_lowercase().as_str() { 54 | $( 55 | $(#[$cfgs])? 56 | stringify!($code) => Some(Box::new($code::$name::new())), 57 | )+ 58 | _ => None, 59 | } 60 | } 61 | 62 | /// Returns the [`Language`] for the given English language name if the 63 | /// language is supported. Returns `None` if not supported. The first letter must 64 | /// be capitalized. 65 | pub fn from_name(name: &str) -> Option> { 66 | match name { 67 | $( 68 | $(#[$cfgs])? 69 | stringify!($name) => Some(Box::new($code::$name::new())), 70 | )+ 71 | _ => None, 72 | } 73 | } 74 | 75 | $( 76 | $(#[$cfgs])? 77 | mod $code; 78 | 79 | $(#[$cfgs])? 80 | pub use $code::$name; 81 | )+ 82 | }; 83 | } 84 | 85 | impl_language! { 86 | (English, en), 87 | (Arabic, ar, #[cfg(feature = "ar")]), 88 | (Chinese, zh, #[cfg(feature = "zh")]), 89 | (Danish, da, #[cfg(feature = "da")]), 90 | (Dutch, du, #[cfg(feature = "du")]), 91 | (Finnish, fi, #[cfg(feature = "fi")]), 92 | (French, fr, #[cfg(feature = "fr")]), 93 | (German, de, #[cfg(feature = "de")]), 94 | (Hungarian, hu, #[cfg(feature = "hu")]), 95 | (Italian, it, #[cfg(feature = "it")]), 96 | (Japanese, ja, #[cfg(feature = "ja")]), 97 | (Korean, ko, #[cfg(feature = "ko")]), 98 | (Norwegian, no, #[cfg(feature = "no")]), 99 | (Portuguese, pt, #[cfg(feature = "pt")]), 100 | (Romanian, ro, #[cfg(feature = "ro")]), 101 | (Russian, ru, #[cfg(feature = "ru")]), 102 | (Spanish, es, #[cfg(feature = "es")]), 103 | (Swedish, sv, #[cfg(feature = "sv")]), 104 | (Turkish, tr, #[cfg(feature = "tr")]), 105 | } 106 | 107 | #[cfg(test)] 108 | mod tests { 109 | use super::tokenize_whitespace; 110 | 111 | #[test] 112 | fn split_simple_strings() { 113 | let string = "this is a simple string"; 114 | assert_eq!( 115 | &tokenize_whitespace(string), 116 | 
&["this", "is", "a", "simple", "string"] 117 | ); 118 | } 119 | 120 | #[test] 121 | fn multiple_white_space() { 122 | let string = " foo bar "; 123 | assert_eq!(&tokenize_whitespace(string), &["foo", "bar"]); 124 | } 125 | 126 | #[test] 127 | fn hyphens() { 128 | let string = "take the New York-San Francisco flight"; 129 | assert_eq!( 130 | &tokenize_whitespace(string), 131 | &["take", "the", "new", "york", "san", "francisco", "flight"] 132 | ); 133 | } 134 | 135 | #[test] 136 | fn splitting_strings_with_hyphens() { 137 | let string = "Solve for A - B"; 138 | assert_eq!(&tokenize_whitespace(string), &["solve", "for", "a", "b"]); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/lang/no.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Norwegian {} 10 | 11 | impl Norwegian { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Norwegian { 18 | fn name(&self) -> String { 19 | "Norwegian".into() 20 | } 21 | fn code(&self) -> String { 22 | "no".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-no", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-no", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-no", Algorithm::Norwegian)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "alle", 43 | "at", 44 | "av", 45 | "bare", 46 | "begge", 47 | "ble", 48 | "blei", 49 | "bli", 50 | "blir", 51 | "blitt", 52 | "både", 53 | "båe", 54 | "da", 55 | "de", 56 | "deg", 57 | "dei", 58 | "deim", 59 | "deira", 60 | "deires", 61 | 
"dem", 62 | "den", 63 | "denne", 64 | "der", 65 | "dere", 66 | "deres", 67 | "det", 68 | "dette", 69 | "di", 70 | "din", 71 | "disse", 72 | "ditt", 73 | "du", 74 | "dykk", 75 | "dykkar", 76 | "då", 77 | "eg", 78 | "ein", 79 | "eit", 80 | "eitt", 81 | "eller", 82 | "elles", 83 | "en", 84 | "enn", 85 | "er", 86 | "et", 87 | "ett", 88 | "etter", 89 | "for", 90 | "fordi", 91 | "fra", 92 | "før", 93 | "ha", 94 | "hadde", 95 | "han", 96 | "hans", 97 | "har", 98 | "hennar", 99 | "henne", 100 | "hennes", 101 | "her", 102 | "hjå", 103 | "ho", 104 | "hoe", 105 | "honom", 106 | "hoss", 107 | "hossen", 108 | "hun", 109 | "hva", 110 | "hvem", 111 | "hver", 112 | "hvilke", 113 | "hvilken", 114 | "hvis", 115 | "hvor", 116 | "hvordan", 117 | "hvorfor", 118 | "i", 119 | "ikke", 120 | "ikkje", 121 | "ikkje", 122 | "ingen", 123 | "ingi", 124 | "inkje", 125 | "inn", 126 | "inni", 127 | "ja", 128 | "jeg", 129 | "kan", 130 | "kom", 131 | "korleis", 132 | "korso", 133 | "kun", 134 | "kunne", 135 | "kva", 136 | "kvar", 137 | "kvarhelst", 138 | "kven", 139 | "kvi", 140 | "kvifor", 141 | "man", 142 | "mange", 143 | "me", 144 | "med", 145 | "medan", 146 | "meg", 147 | "meget", 148 | "mellom", 149 | "men", 150 | "mi", 151 | "min", 152 | "mine", 153 | "mitt", 154 | "mot", 155 | "mykje", 156 | "ned", 157 | "no", 158 | "noe", 159 | "noen", 160 | "noka", 161 | "noko", 162 | "nokon", 163 | "nokor", 164 | "nokre", 165 | "nå", 166 | "når", 167 | "og", 168 | "også", 169 | "om", 170 | "opp", 171 | "oss", 172 | "over", 173 | "på", 174 | "samme", 175 | "seg", 176 | "selv", 177 | "si", 178 | "si", 179 | "sia", 180 | "sidan", 181 | "siden", 182 | "sin", 183 | "sine", 184 | "sitt", 185 | "sjøl", 186 | "skal", 187 | "skulle", 188 | "slik", 189 | "so", 190 | "som", 191 | "som", 192 | "somme", 193 | "somt", 194 | "så", 195 | "sånn", 196 | "til", 197 | "um", 198 | "upp", 199 | "ut", 200 | "uten", 201 | "var", 202 | "vart", 203 | "varte", 204 | "ved", 205 | "vere", 206 | "verte", 207 | "vi", 208 | "vil", 209 | 
"ville", 210 | "vore", 211 | "vors", 212 | "vort", 213 | "vår", 214 | "være", 215 | "være", 216 | "vært", 217 | "å", 218 | ]; 219 | -------------------------------------------------------------------------------- /src/lang/pt.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Portuguese {} 10 | 11 | impl Portuguese { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Portuguese { 18 | fn name(&self) -> String { 19 | "Portuguese".into() 20 | } 21 | fn code(&self) -> String { 22 | "pt".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-pt", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-pt", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-pt", Algorithm::Portuguese)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "a", 43 | "ao", 44 | "aos", 45 | "aquela", 46 | "aquelas", 47 | "aquele", 48 | "aqueles", 49 | "aquilo", 50 | "as", 51 | "até", 52 | "com", 53 | "como", 54 | "da", 55 | "das", 56 | "de", 57 | "dela", 58 | "delas", 59 | "dele", 60 | "deles", 61 | "depois", 62 | "do", 63 | "dos", 64 | "e", 65 | "ela", 66 | "elas", 67 | "ele", 68 | "eles", 69 | "em", 70 | "entre", 71 | "era", 72 | "eram", 73 | "essa", 74 | "essas", 75 | "esse", 76 | "esses", 77 | "esta", 78 | "estamos", 79 | "estas", 80 | "estava", 81 | "estavam", 82 | "este", 83 | "esteja", 84 | "estejam", 85 | "estejamos", 86 | "estes", 87 | "esteve", 88 | "estive", 89 | "estivemos", 90 | "estiver", 91 | "estivera", 92 | "estiveram", 93 | "estiverem", 94 | "estivermos", 95 | "estivesse", 96 | 
"estivessem", 97 | "estivéramos", 98 | "estivéssemos", 99 | "estou", 100 | "está", 101 | "estávamos", 102 | "estão", 103 | "eu", 104 | "foi", 105 | "fomos", 106 | "for", 107 | "fora", 108 | "foram", 109 | "forem", 110 | "formos", 111 | "fosse", 112 | "fossem", 113 | "fui", 114 | "fôramos", 115 | "fôssemos", 116 | "haja", 117 | "hajam", 118 | "hajamos", 119 | "havemos", 120 | "hei", 121 | "houve", 122 | "houvemos", 123 | "houver", 124 | "houvera", 125 | "houveram", 126 | "houverei", 127 | "houverem", 128 | "houveremos", 129 | "houveria", 130 | "houveriam", 131 | "houvermos", 132 | "houverá", 133 | "houverão", 134 | "houveríamos", 135 | "houvesse", 136 | "houvessem", 137 | "houvéramos", 138 | "houvéssemos", 139 | "há", 140 | "hão", 141 | "isso", 142 | "isto", 143 | "já", 144 | "lhe", 145 | "lhes", 146 | "mais", 147 | "mas", 148 | "me", 149 | "mesmo", 150 | "meu", 151 | "meus", 152 | "minha", 153 | "minhas", 154 | "muito", 155 | "na", 156 | "nas", 157 | "nem", 158 | "no", 159 | "nos", 160 | "nossa", 161 | "nossas", 162 | "nosso", 163 | "nossos", 164 | "num", 165 | "numa", 166 | "não", 167 | "nós", 168 | "o", 169 | "os", 170 | "ou", 171 | "para", 172 | "pela", 173 | "pelas", 174 | "pelo", 175 | "pelos", 176 | "por", 177 | "qual", 178 | "quando", 179 | "que", 180 | "quem", 181 | "se", 182 | "seja", 183 | "sejam", 184 | "sejamos", 185 | "sem", 186 | "serei", 187 | "seremos", 188 | "seria", 189 | "seriam", 190 | "será", 191 | "serão", 192 | "seríamos", 193 | "seu", 194 | "seus", 195 | "somos", 196 | "sou", 197 | "sua", 198 | "suas", 199 | "são", 200 | "só", 201 | "também", 202 | "te", 203 | "tem", 204 | "temos", 205 | "tenha", 206 | "tenham", 207 | "tenhamos", 208 | "tenho", 209 | "terei", 210 | "teremos", 211 | "teria", 212 | "teriam", 213 | "terá", 214 | "terão", 215 | "teríamos", 216 | "teu", 217 | "teus", 218 | "teve", 219 | "tinha", 220 | "tinham", 221 | "tive", 222 | "tivemos", 223 | "tiver", 224 | "tivera", 225 | "tiveram", 226 | "tiverem", 227 | "tivermos", 228 | 
"tivesse", 229 | "tivessem", 230 | "tivéramos", 231 | "tivéssemos", 232 | "tu", 233 | "tua", 234 | "tuas", 235 | "tém", 236 | "tínhamos", 237 | "um", 238 | "uma", 239 | "você", 240 | "vocês", 241 | "vos", 242 | "à", 243 | "às", 244 | "éramos", 245 | ]; 246 | -------------------------------------------------------------------------------- /src/lang/ro.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Romanian {} 10 | 11 | impl Romanian { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Romanian { 18 | fn name(&self) -> String { 19 | "Romanian".into() 20 | } 21 | fn code(&self) -> String { 22 | "ro".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-ro", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-ro", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-ro", Algorithm::Romanian)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "acea", 43 | "aceasta", 44 | "această", 45 | "aceea", 46 | "acei", 47 | "aceia", 48 | "acel", 49 | "acela", 50 | "acele", 51 | "acelea", 52 | "acest", 53 | "acesta", 54 | "aceste", 55 | "acestea", 56 | "aceşti", 57 | "aceştia", 58 | "acolo", 59 | "acord", 60 | "acum", 61 | "ai", 62 | "aia", 63 | "aibă", 64 | "aici", 65 | "al", 66 | "ale", 67 | "alea", 68 | "altceva", 69 | "altcineva", 70 | "am", 71 | "ar", 72 | "are", 73 | "asemenea", 74 | "asta", 75 | "astea", 76 | "astăzi", 77 | "asupra", 78 | "au", 79 | "avea", 80 | "avem", 81 | "aveţi", 82 | "azi", 83 | "aş", 84 | "aşadar", 85 | "aţi", 86 | "bine", 87 | "bucur", 88 | "bună", 89 | "ca", 
90 | "care", 91 | "caut", 92 | "ce", 93 | "cel", 94 | "ceva", 95 | "chiar", 96 | "cinci", 97 | "cine", 98 | "cineva", 99 | "contra", 100 | "cu", 101 | "cum", 102 | "cumva", 103 | "curând", 104 | "curînd", 105 | "când", 106 | "cât", 107 | "câte", 108 | "câtva", 109 | "câţi", 110 | "cînd", 111 | "cît", 112 | "cîte", 113 | "cîtva", 114 | "cîţi", 115 | "că", 116 | "căci", 117 | "cărei", 118 | "căror", 119 | "cărui", 120 | "către", 121 | "da", 122 | "dacă", 123 | "dar", 124 | "datorită", 125 | "dată", 126 | "dau", 127 | "de", 128 | "deci", 129 | "deja", 130 | "deoarece", 131 | "departe", 132 | "deşi", 133 | "din", 134 | "dinaintea", 135 | "dintr-", 136 | "dintre", 137 | "doi", 138 | "doilea", 139 | "două", 140 | "drept", 141 | "după", 142 | "dă", 143 | "ea", 144 | "ei", 145 | "el", 146 | "ele", 147 | "eram", 148 | "este", 149 | "eu", 150 | "eşti", 151 | "face", 152 | "fata", 153 | "fi", 154 | "fie", 155 | "fiecare", 156 | "fii", 157 | "fim", 158 | "fiu", 159 | "fiţi", 160 | "frumos", 161 | "fără", 162 | "graţie", 163 | "halbă", 164 | "iar", 165 | "ieri", 166 | "la", 167 | "le", 168 | "li", 169 | "lor", 170 | "lui", 171 | "lângă", 172 | "lîngă", 173 | "mai", 174 | "mea", 175 | "mei", 176 | "mele", 177 | "mereu", 178 | "meu", 179 | "mi", 180 | "mie", 181 | "mine", 182 | "mult", 183 | "multă", 184 | "mulţi", 185 | "mulţumesc", 186 | "mâine", 187 | "mîine", 188 | "mă", 189 | "ne", 190 | "nevoie", 191 | "nici", 192 | "nicăieri", 193 | "nimeni", 194 | "nimeri", 195 | "nimic", 196 | "nişte", 197 | "noastre", 198 | "noastră", 199 | "noi", 200 | "noroc", 201 | "nostru", 202 | "nouă", 203 | "noştri", 204 | "nu", 205 | "opt", 206 | "ori", 207 | "oricare", 208 | "orice", 209 | "oricine", 210 | "oricum", 211 | "oricând", 212 | "oricât", 213 | "oricînd", 214 | "oricît", 215 | "oriunde", 216 | "patra", 217 | "patru", 218 | "patrulea", 219 | "pe", 220 | "pentru", 221 | "peste", 222 | "pic", 223 | "poate", 224 | "pot", 225 | "prea", 226 | "prima", 227 | "primul", 228 | "prin", 229 | 
"puţin", 230 | "puţina", 231 | "puţină", 232 | "până", 233 | "pînă", 234 | "rog", 235 | "sa", 236 | "sale", 237 | "sau", 238 | "se", 239 | "spate", 240 | "spre", 241 | "sub", 242 | "sunt", 243 | "suntem", 244 | "sunteţi", 245 | "sută", 246 | "sînt", 247 | "sîntem", 248 | "sînteţi", 249 | "să", 250 | "săi", 251 | "său", 252 | "ta", 253 | "tale", 254 | "te", 255 | "timp", 256 | "tine", 257 | "toate", 258 | "toată", 259 | "tot", 260 | "totuşi", 261 | "toţi", 262 | "trei", 263 | "treia", 264 | "treilea", 265 | "tu", 266 | "tăi", 267 | "tău", 268 | "un", 269 | "una", 270 | "unde", 271 | "undeva", 272 | "unei", 273 | "uneia", 274 | "unele", 275 | "uneori", 276 | "unii", 277 | "unor", 278 | "unora", 279 | "unu", 280 | "unui", 281 | "unuia", 282 | "unul", 283 | "vi", 284 | "voastre", 285 | "voastră", 286 | "voi", 287 | "vostru", 288 | "vouă", 289 | "voştri", 290 | "vreme", 291 | "vreo", 292 | "vreun", 293 | "vă", 294 | "zece", 295 | "zero", 296 | "zi", 297 | "zice", 298 | "îi", 299 | "îl", 300 | "îmi", 301 | "împotriva", 302 | "în", 303 | "înainte", 304 | "înaintea", 305 | "încotro", 306 | "încât", 307 | "încît", 308 | "între", 309 | "întrucât", 310 | "întrucît", 311 | "îţi", 312 | "ăla", 313 | "ălea", 314 | "ăsta", 315 | "ăstea", 316 | "ăştia", 317 | "şapte", 318 | "şase", 319 | "şi", 320 | "ştiu", 321 | "ţi", 322 | "ţie", 323 | ]; 324 | -------------------------------------------------------------------------------- /src/lang/sv.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Swedish {} 10 | 11 | impl Swedish { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Swedish { 18 | fn name(&self) -> String { 19 | "Swedish".into() 20 | } 21 | fn code(&self) -> String { 22 | "sv".into() 23 | } 24 | 25 | fn 
tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-sv", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-sv", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-sv", Algorithm::Swedish)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", "alla", "allt", "att", "av", "blev", "bli", "blir", "blivit", "de", "dem", "den", "denna", 42 | "deras", "dess", "dessa", "det", "detta", "dig", "din", "dina", "ditt", "du", "där", "då", 43 | "efter", "ej", "eller", "en", "er", "era", "ert", "ett", "från", "för", "ha", "hade", "han", 44 | "hans", "har", "henne", "hennes", "hon", "honom", "hur", "här", "i", "icke", "ingen", "inom", 45 | "inte", "jag", "ju", "kan", "kunde", "man", "med", "mellan", "men", "mig", "min", "mina", 46 | "mitt", "mot", "mycket", "ni", "nu", "när", "någon", "något", "några", "och", "om", "oss", 47 | "på", "samma", "sedan", "sig", "sin", "sina", "sitta", "själv", "skulle", "som", "så", "sådan", 48 | "sådana", "sådant", "till", "under", "upp", "ut", "utan", "vad", "var", "vara", "varför", 49 | "varit", "varje", "vars", "vart", "vem", "vi", "vid", "vilka", "vilkas", "vilken", "vilket", 50 | "vår", "våra", "vårt", "än", "är", "åt", "över", 51 | ]; 52 | -------------------------------------------------------------------------------- /src/lang/tr.rs: -------------------------------------------------------------------------------- 1 | use super::{ 2 | common::{RustStemmer, StopWordFilter, RegexTrimmer}, 3 | Language, 4 | }; 5 | use crate::pipeline::Pipeline; 6 | use rust_stemmers::Algorithm; 7 | 8 | #[derive(Clone)] 9 | pub struct Turkish {} 10 | 11 | impl Turkish { 12 | pub fn new() -> Self { 13 | Self {} 14 | } 15 | } 16 | 17 | impl Language for Turkish { 18 | fn name(&self) -> String { 19 | "Turkish".into() 20 | } 21 | fn code(&self) -> String { 22 | "tr".into() 
23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | super::tokenize_whitespace(text) 27 | } 28 | 29 | fn make_pipeline(&self) -> Pipeline { 30 | Pipeline { 31 | queue: vec![ 32 | Box::new(RegexTrimmer::new("trimmer-tr", r"\p{Latin}")), 33 | Box::new(StopWordFilter::new("stopWordFilter-tr", STOP_WORDS)), 34 | Box::new(RustStemmer::new("stemmer-tr", Algorithm::Turkish)), 35 | ], 36 | } 37 | } 38 | } 39 | 40 | const STOP_WORDS: &[&str] = &[ 41 | "", 42 | "acaba", 43 | "altmış", 44 | "altı", 45 | "ama", 46 | "ancak", 47 | "arada", 48 | "aslında", 49 | "ayrıca", 50 | "bana", 51 | "bazı", 52 | "belki", 53 | "ben", 54 | "benden", 55 | "beni", 56 | "benim", 57 | "beri", 58 | "beş", 59 | "bile", 60 | "bin", 61 | "bir", 62 | "biri", 63 | "birkaç", 64 | "birkez", 65 | "birçok", 66 | "birşey", 67 | "birşeyi", 68 | "biz", 69 | "bizden", 70 | "bize", 71 | "bizi", 72 | "bizim", 73 | "bu", 74 | "buna", 75 | "bunda", 76 | "bundan", 77 | "bunlar", 78 | "bunları", 79 | "bunların", 80 | "bunu", 81 | "bunun", 82 | "burada", 83 | "böyle", 84 | "böylece", 85 | "da", 86 | "daha", 87 | "dahi", 88 | "de", 89 | "defa", 90 | "değil", 91 | "diye", 92 | "diğer", 93 | "doksan", 94 | "dokuz", 95 | "dolayı", 96 | "dolayısıyla", 97 | "dört", 98 | "edecek", 99 | "eden", 100 | "ederek", 101 | "edilecek", 102 | "ediliyor", 103 | "edilmesi", 104 | "ediyor", 105 | "elli", 106 | "en", 107 | "etmesi", 108 | "etti", 109 | "ettiği", 110 | "ettiğini", 111 | "eğer", 112 | "gibi", 113 | "göre", 114 | "halen", 115 | "hangi", 116 | "hatta", 117 | "hem", 118 | "henüz", 119 | "hep", 120 | "hepsi", 121 | "her", 122 | "herhangi", 123 | "herkesin", 124 | "hiç", 125 | "hiçbir", 126 | "iki", 127 | "ile", 128 | "ilgili", 129 | "ise", 130 | "itibaren", 131 | "itibariyle", 132 | "için", 133 | "işte", 134 | "kadar", 135 | "karşın", 136 | "katrilyon", 137 | "kendi", 138 | "kendilerine", 139 | "kendini", 140 | "kendisi", 141 | "kendisine", 142 | "kendisini", 143 | "kez", 144 | "ki", 145 | "kim", 146 | "kimden", 147 
| "kime", 148 | "kimi", 149 | "kimse", 150 | "kırk", 151 | "milyar", 152 | "milyon", 153 | "mu", 154 | "mü", 155 | "mı", 156 | "nasıl", 157 | "ne", 158 | "neden", 159 | "nedenle", 160 | "nerde", 161 | "nerede", 162 | "nereye", 163 | "niye", 164 | "niçin", 165 | "o", 166 | "olan", 167 | "olarak", 168 | "oldu", 169 | "olduklarını", 170 | "olduğu", 171 | "olduğunu", 172 | "olmadı", 173 | "olmadığı", 174 | "olmak", 175 | "olması", 176 | "olmayan", 177 | "olmaz", 178 | "olsa", 179 | "olsun", 180 | "olup", 181 | "olur", 182 | "olursa", 183 | "oluyor", 184 | "on", 185 | "ona", 186 | "ondan", 187 | "onlar", 188 | "onlardan", 189 | "onları", 190 | "onların", 191 | "onu", 192 | "onun", 193 | "otuz", 194 | "oysa", 195 | "pek", 196 | "rağmen", 197 | "sadece", 198 | "sanki", 199 | "sekiz", 200 | "seksen", 201 | "sen", 202 | "senden", 203 | "seni", 204 | "senin", 205 | "siz", 206 | "sizden", 207 | "sizi", 208 | "sizin", 209 | "tarafından", 210 | "trilyon", 211 | "tüm", 212 | "var", 213 | "vardı", 214 | "ve", 215 | "veya", 216 | "ya", 217 | "yani", 218 | "yapacak", 219 | "yapmak", 220 | "yaptı", 221 | "yaptıkları", 222 | "yaptığı", 223 | "yaptığını", 224 | "yapılan", 225 | "yapılması", 226 | "yapıyor", 227 | "yedi", 228 | "yerine", 229 | "yetmiş", 230 | "yine", 231 | "yirmi", 232 | "yoksa", 233 | "yüz", 234 | "zaten", 235 | "çok", 236 | "çünkü", 237 | "öyle", 238 | "üzere", 239 | "üç", 240 | "şey", 241 | "şeyden", 242 | "şeyi", 243 | "şeyler", 244 | "şu", 245 | "şuna", 246 | "şunda", 247 | "şundan", 248 | "şunları", 249 | "şunu", 250 | "şöyle", 251 | ]; 252 | -------------------------------------------------------------------------------- /src/lang/zh.rs: -------------------------------------------------------------------------------- 1 | use super::{common::RegexTrimmer, Language}; 2 | use crate::pipeline::{FnWrapper, Pipeline}; 3 | 4 | #[derive(Clone)] 5 | pub struct Chinese { 6 | jieba: jieba_rs::Jieba, 7 | } 8 | 9 | impl Chinese { 10 | pub fn new() -> Self { 11 | Self { 12 | 
jieba: jieba_rs::Jieba::new(), 13 | } 14 | } 15 | } 16 | 17 | impl Language for Chinese { 18 | fn name(&self) -> String { 19 | "Chinese".into() 20 | } 21 | fn code(&self) -> String { 22 | "zh".into() 23 | } 24 | 25 | fn tokenize(&self, text: &str) -> Vec { 26 | self.jieba 27 | .cut_for_search(text, false) 28 | .iter() 29 | .map(|s| s.to_string()) 30 | .collect() 31 | } 32 | 33 | fn make_pipeline(&self) -> Pipeline { 34 | Pipeline { 35 | queue: vec![ 36 | Box::new(RegexTrimmer::new("trimmer-zh", r"\p{Unified_Ideograph}\p{Latin}")), 37 | Box::new(FnWrapper("stopWordFilter-zh".into(), stop_word_filter)), 38 | Box::new(FnWrapper("stemmer-zh".into(), stemmer)), 39 | ], 40 | } 41 | } 42 | } 43 | 44 | // TODO: lunr.zh.js has a much larger set of stop words 45 | fn stop_word_filter(token: String) -> Option { 46 | match token.as_str() { 47 | "的" | "了" => None, 48 | _ => Some(token), 49 | } 50 | } 51 | 52 | // lunr.zh.js has an empty stemmer as well 53 | fn stemmer(token: String) -> Option { 54 | Some(token) 55 | } 56 | -------------------------------------------------------------------------------- /src/pipeline.rs: -------------------------------------------------------------------------------- 1 | //! Defines the pipeline which processes text for inclusion in the index. Most users do not need 2 | //! to use this module directly. 3 | 4 | use serde::ser::{Serialize, SerializeSeq, Serializer}; 5 | 6 | pub trait PipelineFn { 7 | fn name(&self) -> String; 8 | 9 | fn filter(&self, token: String) -> Option; 10 | } 11 | 12 | #[derive(Clone)] 13 | pub struct FnWrapper(pub String, pub fn(String) -> Option); 14 | 15 | impl PipelineFn for FnWrapper { 16 | fn name(&self) -> String { 17 | self.0.clone() 18 | } 19 | 20 | fn filter(&self, token: String) -> Option { 21 | (self.1)(token) 22 | } 23 | } 24 | 25 | /// A sequence of `PipelineFn`s which are run on tokens to prepare them for searching. 
26 | #[derive(Deserialize)] 27 | pub struct Pipeline { 28 | #[serde(skip_deserializing)] 29 | pub queue: Vec>, 30 | } 31 | 32 | impl Serialize for Pipeline { 33 | fn serialize(&self, serializer: S) -> Result 34 | where 35 | S: Serializer, 36 | { 37 | let mut seq = serializer.serialize_seq(Some(self.queue.len()))?; 38 | for elem in &self.queue { 39 | seq.serialize_element(&elem.name())?; 40 | } 41 | seq.end() 42 | } 43 | } 44 | 45 | impl Pipeline { 46 | /// Run the Pipeline against the given vector of tokens. The returned vector may be shorter 47 | /// than the input if a pipeline function returns `None` for a token. 48 | pub fn run(&self, tokens: Vec) -> Vec { 49 | let mut ret = vec![]; 50 | for token in tokens { 51 | let mut token = Some(token); 52 | for func in &self.queue { 53 | if let Some(t) = token { 54 | token = func.filter(t); 55 | } else { 56 | break; 57 | } 58 | } 59 | if let Some(t) = token { 60 | ret.push(t); 61 | } 62 | } 63 | ret 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /tests/data/ar.in.txt: -------------------------------------------------------------------------------- 1 | استعار جحا مرة آنية من جاره وعندما أعادها له أعاد معها آنية صغيرة 2 | فسأله جاره لماذا أعدت مع أنيتي آنية صغيرة يا جحا؟ 3 | فقال له جحا: إنّ آنيتك ولدت في الأمس آنية صغيرة وإنّها الآن من حقك، فرح الرجل وأخذ الطنجرة ودخل بيته، 4 | وبعد فترة من الزمان ذهب جحا إلى جاره وطلب منه أنية أخرى، فأعطاه جاره ما طلب، مرّ وقت طويل ولم يُعد جحا الآنية، 5 | فذهب جاره إلى بيته ليطلبها منه، فاستقبله جحا باكياً منتحباً، 6 | فقال له الرجل: مالي أراك باكياً يا جحا؟!! فقال له جحا وهو يبكي إنّ آنيتك توفيت بالأمس يا صاحبي، 7 | فقال له جاره وهو غاضب: وكيف لآنيةٍ أن تموت يا رجل؟!! فقال جحا أتصدق أنّ إناء قد يلد ولا تصدق أنّه قد يموت؟! 
8 | -------------------------------------------------------------------------------- /tests/data/ar.out.txt: -------------------------------------------------------------------------------- 1 | استعار 2 | جحا 3 | مرة 4 | انية 5 | من 6 | جاره 7 | وعندما 8 | اعادها 9 | له 10 | اعاد 11 | معها 12 | انية 13 | صغيرة 14 | فساله 15 | جاره 16 | لماذا 17 | اعدت 18 | مع 19 | انيتي 20 | انية 21 | صغيرة 22 | يا 23 | جحا؟ 24 | فقال 25 | له 26 | جحا: 27 | ان 28 | انيتك 29 | ولدت 30 | في 31 | الامس 32 | انية 33 | صغيرة 34 | وانها 35 | الان 36 | من 37 | حقك، 38 | فرح 39 | الرجل 40 | واخذ 41 | الطنجرة 42 | ودخل 43 | بيته، 44 | وبعد 45 | فترة 46 | من 47 | الزمان 48 | ذهب 49 | جحا 50 | الى 51 | جاره 52 | وطلب 53 | منه 54 | انية 55 | اخرى، 56 | فاعطاه 57 | جاره 58 | ما 59 | طلب، 60 | مر 61 | وقت 62 | طويل 63 | ولم 64 | يعد 65 | جحا 66 | الانية، 67 | فذهب 68 | جاره 69 | الى 70 | بيته 71 | ليطلبها 72 | منه، 73 | فاستقبله 74 | جحا 75 | باكيا 76 | منتحبا، 77 | فقال 78 | له 79 | الرجل: 80 | مالي 81 | اراك 82 | باكيا 83 | يا 84 | جحا؟!! 85 | فقال 86 | له 87 | جحا 88 | وهو 89 | يبكي 90 | ان 91 | انيتك 92 | توفيت 93 | بالامس 94 | يا 95 | صاحبي، 96 | فقال 97 | له 98 | جاره 99 | وهو 100 | غاضب: 101 | وكيف 102 | لانية 103 | ان 104 | تموت 105 | يا 106 | رجل؟!! 107 | فقال 108 | جحا 109 | اتصدق 110 | ان 111 | اناء 112 | قد 113 | يلد 114 | ولا 115 | تصدق 116 | انه 117 | قد 118 | يموت؟! 119 | -------------------------------------------------------------------------------- /tests/data/da.in.txt: -------------------------------------------------------------------------------- 1 | I det lille Værtshus i Genf, hvor Russerne plejede at have deres 2 | Tilhold, nød Helene hurtigt sit beskedne Aftensmaaltid uden som 3 | sædvanlig at drikke en Kop Kaffe dertil -- en Luksus, hun ikke havde 4 | nægtet sig lige siden den Dag, hun havde faaet sine Elever i Russisk. 5 | Men i Aften maatte hun skynde sig; et længe ventet Brev fra Rusland laa 6 | gemt i hendes Lomme. 
Hun havde for et Øjeblik siden faaet det af den 7 | gamle, hvidhaarede Urmager, til hvem hele hendes udenlandske 8 | Korrespondance blev adresseret, og hun brændte af Utaalmodighed efter at 9 | erfare de Nyheder, som det i al Almindelighed maatte indeholde, og efter 10 | at faa det overbragt til sin Ven Andrey, hvem det dog fornemmelig angik. 11 | 12 | Hun vekslede nogle Ord med en anden landflygtig, krydsede imellem de 13 | mange Rækker smaa Borde, ved hvilke der overalt sad Mænd i 14 | Arbejdsbluser, og naaede ud paa Gaden. Klokken var kun halvsyv, hun var 15 | sikker paa at træffe Andrey hjemme. Han boede i Nærheden, og efter fem 16 | Minutters Forløb befandt Helene sig uden for hans Dør. Hendes smukke, 17 | noget stillestaaende Ansigt havde faaet en let Farve af den hurtige 18 | Gang. 19 | 20 | Andrey var alene, i Færd med at gøre Uddrag af en Statistik, som han 21 | benyttede til Grundlag for den Artikel, han hver Uge skrev til et 22 | russisk Provinsblad. Han vendte Hovedet og rejste sig med udstrakt Haand 23 | for at byde sin Gæst velkommen. 24 | 25 | „Her er et Brev til dig!“ sagde Helene, idet hun gav ham Haanden. 26 | 27 | „Naa, endelig!“ udbrød han. 28 | 29 | Andrey var en Mand paa seks-syv og tyve Aar med et alvorligt, godmodigt 30 | Ansigt, lidt skarpt og regelmæssigt i Trækkene. Over hans Pande laa Spor 31 | af tidlige Sorger, og hans Øjne var ualmindelig dybe og tankefulde, men 32 | dette forringede ikke det Indtryk af Ro og Bestemthed, man fik af hele 33 | hans kraftige, velformede Skikkelse. 34 | 35 | Der gled en let Rødme over hans Pande, idet hans slanke, muskelstærke 36 | Fingre med nervøs Hast rev Konvolutten op og fremdrog et stort Ark 37 | Papir, bedækket med Linier i vid Afstand fra hinanden, skrevne med en 38 | uregelmæssig, sammentrængt Haandskrift. 39 | 40 | Helene, der ikke syntes at være mindre utaalmodig end han, gik hen til 41 | ham og lagde Haanden paa hans Skulder for ogsaa at kunne læse i Brevet. 
42 | 43 | „Det er bedre, at vi sætter os ned, Helene!“ sagde den unge Mand. „Du 44 | skygger for Lyset med dine Krøller!“ 45 | 46 | Det mere end tarvelige Værelse var kun sparsomt oplyst af en eneste 47 | Lampe, dækket af en grøn Papirskærm, saaledes at kun en Del af 48 | Brædegulvet, Benene paa nogle simple Stole og den nederste Del af en 49 | Mahogni Kommode -- Værelsets fornemste Prydelse -- var helt oplyst. 50 | Væggene, som var betrukne med gult Tapetpapir og prydede med et billigt 51 | Litografi af den schweiziske General Dufour, et Landskab, et Fotografi 52 | af Værtindens afdøde Ægteherre og hendes Eksamensbevis fra Skoletiden, 53 | indfattet i Glas og Ramme, var hyllede i et diskret Tusmørke, meget 54 | klædeligt for disse Kunstværker, men umuligt at læse i. 55 | 56 | Andrey stillede endnu en Stol hen til det runde Spisebord, som var 57 | dækket med Bøger og Papirer, og drejede Lampeskærmen saaledes, at det 58 | Hjørne, han plejede at bruge som Skrivebord, var helt oplyst. Helene 59 | satte sig ved Siden af ham og saa nær, at hendes Haar undertiden berørte 60 | hans; men ingen af dem ænsede det, saa optagne var de af deres Tanker. 
-------------------------------------------------------------------------------- /tests/data/da.out.txt: -------------------------------------------------------------------------------- 1 | lil 2 | værtshus 3 | genf 4 | rus 5 | plejed 6 | tilhold 7 | nød 8 | hel 9 | hurt 10 | beskedn 11 | aftensmaaltid 12 | uden 13 | sædvan 14 | drik 15 | kop 16 | kaf 17 | dertil 18 | luksus 19 | nægt 20 | lig 21 | sid 22 | dag 23 | faaet 24 | elev 25 | russisk 26 | aft 27 | maat 28 | skynd 29 | læng 30 | vent 31 | brev 32 | rusland 33 | laa 34 | gemt 35 | lom 36 | øjeblik 37 | sid 38 | faaet 39 | gaml 40 | hvidhaared 41 | urmag 42 | hvem 43 | hel 44 | udenlandsk 45 | korrespondanc 46 | adres 47 | brænd 48 | utaalmod 49 | erfar 50 | nyhed 51 | al 52 | almind 53 | maat 54 | indehold 55 | faa 56 | overbrag 57 | ven 58 | andrey 59 | hvem 60 | fornem 61 | angik 62 | veksled 63 | ord 64 | landflyg 65 | krydsed 66 | imellem 67 | ræk 68 | smaa 69 | bord 70 | ved 71 | hvilk 72 | overalt 73 | sad 74 | mænd 75 | arbejdsblus 76 | naaed 77 | paa 78 | gad 79 | klok 80 | kun 81 | halvsyv 82 | sik 83 | paa 84 | træf 85 | andrey 86 | hjem 87 | boed 88 | nær 89 | fem 90 | minut 91 | forløb 92 | befand 93 | hel 94 | uden 95 | dør 96 | smuk 97 | stillestaa 98 | ans 99 | faaet 100 | let 101 | farv 102 | hurt 103 | gang 104 | andrey 105 | alen 106 | færd 107 | gør 108 | uddrag 109 | statistik 110 | benytted 111 | grundlag 112 | artikel 113 | hver 114 | uge 115 | skrev 116 | russisk 117 | provinsblad 118 | vend 119 | hoved 120 | rejst 121 | udstrak 122 | haand 123 | byd 124 | gæst 125 | velkom 126 | brev 127 | sagd 128 | hel 129 | idet 130 | gav 131 | haand 132 | naa 133 | end 134 | udbrød 135 | andrey 136 | mand 137 | paa 138 | sek 139 | syv 140 | tyv 141 | aar 142 | alvor 143 | godmod 144 | ans 145 | lidt 146 | skarpt 147 | regelmæs 148 | træk 149 | pand 150 | laa 151 | spor 152 | tid 153 | sorg 154 | øjn 155 | ualmind 156 | dyb 157 | tankefuld 158 | forringed 159 | indtryk 160 | ro 161 | bestemt 162 
| fik 163 | hel 164 | kraft 165 | velformed 166 | skik 167 | gled 168 | let 169 | rødm 170 | pand 171 | idet 172 | slank 173 | muskelstærk 174 | fingr 175 | nervøs 176 | hast 177 | rev 178 | konvolut 179 | fremdrog 180 | stort 181 | ark 182 | papir 183 | bedæk 184 | lini 185 | vid 186 | afstand 187 | hinand 188 | skrevn 189 | uregelmæs 190 | sammentræng 191 | haandskrift 192 | hel 193 | synt 194 | mindr 195 | utaalmod 196 | gik 197 | hen 198 | lagd 199 | haand 200 | paa 201 | skuld 202 | ogsaa 203 | læs 204 | brev 205 | bedr 206 | sæt 207 | hel 208 | sagd 209 | ung 210 | mand 211 | skyg 212 | lys 213 | din 214 | krøl 215 | mer 216 | tarv 217 | vær 218 | kun 219 | sparsomt 220 | oplyst 221 | enest 222 | lamp 223 | dæk 224 | grøn 225 | papirskærm 226 | saaled 227 | kun 228 | del 229 | brædegulv 230 | ben 231 | paa 232 | simpl 233 | stol 234 | nederst 235 | del 236 | mahogni 237 | kommod 238 | vær 239 | fornemst 240 | pryd 241 | helt 242 | oplyst 243 | væg 244 | betrukn 245 | gult 246 | tapetpapir 247 | pryded 248 | bil 249 | litografi 250 | schweizisk 251 | general 252 | dufour 253 | landskab 254 | fotografi 255 | værtind 256 | afdød 257 | ægteher 258 | eksamensbevis 259 | skoletid 260 | indfat 261 | glas 262 | ram 263 | hylled 264 | diskr 265 | tusmørk 266 | klæd 267 | kunstværk 268 | umu 269 | læs 270 | andrey 271 | stilled 272 | endnu 273 | stol 274 | hen 275 | rund 276 | spisebord 277 | dæk 278 | bøg 279 | papir 280 | drejed 281 | lampeskærm 282 | saaled 283 | hjørn 284 | plejed 285 | brug 286 | skrivebord 287 | helt 288 | oplyst 289 | hel 290 | sat 291 | ved 292 | sid 293 | saa 294 | nær 295 | haar 296 | undertid 297 | berørt 298 | ing 299 | ænsed 300 | saa 301 | optagn 302 | tank 303 | -------------------------------------------------------------------------------- /tests/data/de.in.txt: -------------------------------------------------------------------------------- 1 | Briefe und die letzten Vorbereitungen füllten den gestrigen Tag. 
2 | Müde und abgespannt, eigentlich krank und fiebernd stieg ich in Graz 3 | Abends 6 Uhr in den Eisenbahnwagen; erst da ich heute Morgens das Meer 4 | wieder sah und dem alten Lieblinge das freudige Θάλαττα! Θάλαττα! 5 | entgegenrufen konnte, ward mir wieder wohl in Leib und Seele. 6 | 7 | Die Nacht war kalt gewesen, wie wenn dem Kalender zum Trotze der Winter 8 | noch fortdauere. Oder wollte sich die Heimath nur eindringlich dem 9 | Scheidenden in’s Gedächtniß heften? Umsonst die Angst, daß ich sie 10 | vergesse! es liegt ja die Nothwendigkeit der Rückkehr vor mir. Lange 11 | konnte ich den Schlaf nicht finden; dafür fand ich in der Ungestörtheit 12 | des Alleinseins mich selbst wieder, der sich in den Sorgen und Mühen 13 | der letzten Monate verloren hatte. Es ist das ein Vortheil des 14 | Reisens, daß es uns mit der Unabhängigkeit auch die unabweisliche 15 | Selbständigkeit gibt; herausgerissen aus der Bequemlichkeit der 16 | gewöhnlichen Verhältnisse, zwingt es uns die Gedanken und die Hilfe, 17 | die wir sonst rechts und links neben uns schon hergerichtet fanden, 18 | nunmehr in uns selbst zu suchen. Menschen, die sich bisher noch gar 19 | nicht kannten, haben sich oft am ersten Reisetage erst erkennen lernen. 20 | Ein Gang in die weite Welt ist die beste Schule für das Leben, und 21 | gerade für uns Kinder der Civilisation eine um so unentbehrlichere, 22 | als wir in stubenhockerischen Gewohnheiten den Contact mit der Natur 23 | verloren haben. Diese und sich selbst findet der verzogene Mensch 24 | dort wieder und so auch die Freiheit, die nur dort ist, wo der Mensch 25 | allein, oder wo er fremd unter Hunderten seines Gleichen steht. 26 | 27 | Nach 6 Uhr erwache ich. Ich sehe den Karst, auf dessen Höhe wir 28 | fahren; die Sonne ist vom Regen versteckt, der die Steinfelder dieser 29 | Berge noch unwirthlicher als sonst erscheinen läßt. 
In Nabresina hält 30 | der Zug; die Bahn nach Italien trennt sich hier von der, welche den 31 | Karst hinab nach Triest führt. Der Bahnhof ist groß und zweckmäßig 32 | eingerichtet. Schon singt Alles das Italienische. Erfreut durch die 33 | bekannten Klänge beobachte ich das zu- und abströmende Gedränge. 34 | Ein Conducteur war mir darin aufgefallen, weil seine Blicke mich 35 | unablässig verfolgten. War der Mann ein Vertrauter der Polizei und 36 | hielt er mich für einen Flüchtling? Jetzt drängte er sich zu an die 37 | offene Wagenthüre, umfaßte meine Knie, er hatte mich erkannt! Es war 38 | Venerando, der Gondolier, der mich in Venedig immer geführt hatte. Wie 39 | aber auch hätte ich ihn, den zierlichen, schlanken Burschen, der mich 40 | so oft in der ärgsten Sommerhitze, nichts als ein Hemd und die leichte 41 | Hose an, nach dem Lido, nach den Inseln, nach Torcello oder nach San 42 | Francesco del Deserto gerudert hatte, in der steifen, zugeknöpften 43 | Eisenbahnuniform erkennen sollen? Früh Morgens schon klopfte er damals 44 | an meine Thüre. Ich wollte die Leute schonen und so verneinte ich 45 | die Absicht einer Fahrt. Er aber kannte die stille Neigung meiner 46 | Wünsche und aufopfernd wußte er mich bald zu überreden, mich ihm und 47 | seinem Genossen hinzugeben. Landeten wir dann nach stundenlanger 48 | Fahrt an einsam abgelegener Küste und hatte ich die Früchte, die ich 49 | mitgenommen, mit ihnen getheilt, so geleitete er mich in das Innere des 50 | Landes, dem Fremdlinge die herrlichen Reste einer abgestorbenen Kunst 51 | mit all’ dem Schönheitssinn und all’ der Liebe zu seinem Vaterlande 52 | zu erklären, die dem Südländer, und dem Italiener insbesondere, 53 | eigen sind. War ich müde geworden, so ruhten wir neben einander auf 54 | dem Strande aus, dem das Meer mit leicht aufschlagenden Wellen, die 55 | immer näher unsern Füßen kamen, vertraute Grüße aus entlegenen 56 | Fernen zubrachte. 
Sein fortwährendes Gelispel machte die Rede meines 57 | Venerando noch geschwätziger. Von Venedig erzählte er mir, das vor uns 58 | lag im Dufte gluthvoller Mittagssonne, von den Lagunen und von den 59 | Geheimnissen, die sich nächtlich darauf begeben; zuweilen auch, wenn 60 | ich ihm besonders geneigt schien, von sich und seinen Freunden und 61 | daß er schon einmal das Messer gezückt, weil man seinem Weibe zu nahe 62 | treten wollte. Ich hörte ihm immer mit regem Interesse zu; seine Worte 63 | waren gut gewählt und seine Stimme klang melodisch. Erst Abends, wenn 64 | die Sonne schon auf den schneeigen Gipfeln der Alpen ruhte, ruderte 65 | er mich zurück durch das purpurfarbene Meer nach der goldbethürmten, 66 | kuppelbedeckten Stadt. Mit mir trug ich kostbare Erinnerungen, die 67 | ich unvergeßlich festhalte und ihm treulich danke. Sein Gefährte hieß 68 | Beppo, aber er war vergleichsweise unbedeutend. -------------------------------------------------------------------------------- /tests/data/de.out.txt: -------------------------------------------------------------------------------- 1 | brief 2 | letzt 3 | vorbereit 4 | fullt 5 | gestrig 6 | tag 7 | mud 8 | abgespannt 9 | eigent 10 | krank 11 | fiebernd 12 | stieg 13 | graz 14 | abend 15 | uhr 16 | eisenbahnwag 17 | erst 18 | heut 19 | morg 20 | meer 21 | sah 22 | alt 23 | liebling 24 | freudig 25 | entgegenruf 26 | konnt 27 | ward 28 | wohl 29 | leib 30 | seel 31 | nacht 32 | kalt 33 | kalend 34 | trotz 35 | wint 36 | fortdau 37 | heimath 38 | eindring 39 | scheidend 40 | in’s 41 | gedachtniss 42 | heft 43 | umson 44 | angst 45 | vergess 46 | liegt 47 | ja 48 | nothwend 49 | ruckkehr 50 | lang 51 | konnt 52 | schlaf 53 | find 54 | dafur 55 | fand 56 | ungestort 57 | alleinsein 58 | sorg 59 | muh 60 | letzt 61 | monat 62 | verlor 63 | vortheil 64 | reis 65 | unabhang 66 | unabweis 67 | selbstand 68 | gibt 69 | herausgeriss 70 | bequem 71 | gewohn 72 | verhaltnis 73 | zwingt 74 | gedank 75 | hilf 76 | recht 77 | 
link 78 | neb 79 | schon 80 | hergerichtet 81 | fand 82 | nunmehr 83 | such 84 | mensch 85 | bish 86 | gar 87 | kannt 88 | oft 89 | erst 90 | reisetag 91 | erst 92 | erkenn 93 | lern 94 | gang 95 | weit 96 | welt 97 | best 98 | schul 99 | leb 100 | gerad 101 | kind 102 | civilisation 103 | unentbehr 104 | stubenhocker 105 | gewohn 106 | contact 107 | natur 108 | verlor 109 | findet 110 | verzog 111 | mensch 112 | freiheit 113 | mensch 114 | allein 115 | fremd 116 | hundert 117 | gleich 118 | steht 119 | uhr 120 | erwach 121 | seh 122 | karst 123 | hoh 124 | fahr 125 | sonn 126 | reg 127 | versteckt 128 | steinfeld 129 | berg 130 | unwirth 131 | erschein 132 | lasst 133 | nabresina 134 | halt 135 | zug 136 | bahn 137 | itali 138 | trennt 139 | karst 140 | hinab 141 | triest 142 | fuhrt 143 | bahnhof 144 | gross 145 | zweckmass 146 | eingerichtet 147 | schon 148 | singt 149 | italien 150 | erfreut 151 | bekannt 152 | klang 153 | beobacht 154 | abstrom 155 | gedrang 156 | conducteur 157 | darin 158 | aufgefall 159 | blick 160 | unablass 161 | verfolgt 162 | mann 163 | vertraut 164 | polizei 165 | hielt 166 | fluchtling 167 | drangt 168 | off 169 | wagenthur 170 | umfasst 171 | knie 172 | erkannt 173 | venerando 174 | gondoli 175 | vened 176 | imm 177 | gefuhrt 178 | hatt 179 | zierlich 180 | schlank 181 | bursch 182 | oft 183 | argst 184 | sommerhitz 185 | hemd 186 | leicht 187 | hos 188 | lido 189 | inseln 190 | torcello 191 | san 192 | francesco 193 | del 194 | deserto 195 | gerudert 196 | steif 197 | zugeknopft 198 | eisenbahnuniform 199 | erkenn 200 | soll 201 | fruh 202 | morg 203 | schon 204 | klopft 205 | damal 206 | thur 207 | leut 208 | schon 209 | verneint 210 | absicht 211 | fahrt 212 | kannt 213 | still 214 | neigung 215 | wunsch 216 | aufopfernd 217 | wusst 218 | bald 219 | uberred 220 | genoss 221 | hinzugeb 222 | landet 223 | stundenlang 224 | fahrt 225 | einsam 226 | abgeleg 227 | kust 228 | frucht 229 | mitgenomm 230 | getheilt 231 | geleitet 232 | 
inn 233 | land 234 | fremdling 235 | herrlich 236 | rest 237 | abgestorb 238 | kunst 239 | all 240 | schonheitssinn 241 | all 242 | lieb 243 | vaterland 244 | erklar 245 | sudland 246 | itali 247 | insbesond 248 | eig 249 | mud 250 | geword 251 | ruht 252 | neb 253 | einand 254 | strand 255 | meer 256 | leicht 257 | aufschlag 258 | well 259 | imm 260 | nah 261 | uns 262 | fuss 263 | kam 264 | vertraut 265 | gruss 266 | entleg 267 | fern 268 | zubracht 269 | fortwahr 270 | gelispel 271 | macht 272 | red 273 | venerando 274 | geschwatz 275 | vened 276 | erzahlt 277 | lag 278 | duft 279 | gluthvoll 280 | mittagssonn 281 | lagun 282 | geheimnis 283 | nachtlich 284 | darauf 285 | begeb 286 | zuweil 287 | besond 288 | geneigt 289 | schien 290 | freund 291 | schon 292 | mess 293 | gezuckt 294 | weib 295 | nah 296 | tret 297 | hort 298 | imm 299 | reg 300 | interess 301 | wort 302 | gut 303 | gewahlt 304 | stimm 305 | klang 306 | melod 307 | erst 308 | abend 309 | sonn 310 | schon 311 | schneeig 312 | gipfeln 313 | alp 314 | ruht 315 | rudert 316 | zuruck 317 | purpurfarb 318 | meer 319 | goldbethurmt 320 | kuppelbedeckt 321 | stadt 322 | trug 323 | kostbar 324 | erinner 325 | unvergess 326 | festhalt 327 | treulich 328 | dank 329 | gefahrt 330 | hiess 331 | beppo 332 | vergleichsweis 333 | unbedeut 334 | -------------------------------------------------------------------------------- /tests/data/du.in.txt: -------------------------------------------------------------------------------- 1 | [Doel der vertaling.] 2 | 3 | Het doel van deze vertaling is den Nederlandschen lezer in kennis te 4 | stellen met den volledigen inhoud van Dante's Gedicht. De vertaling 5 | is zooveel mogelijk woordelijk, kan dus ook als handleiding dienen bij 6 | het lezen en bestudeeren van den oorspronkelijken, Italiaanschen tekst. 7 | 8 | [Waarom in proza?] 9 | 10 | De vertaling is in Proza. 11 | 12 | Waarom? 
Omdat de woorden, waarin het Gedicht vervat is, den 13 | dichter werden ingegeven in het scheppingsoogenblik door de 14 | volheid zijner fantasieën, gevoelens en gedachten zelve. Ook in het 15 | practisch-onmogelijke, maar theoretisch stelbare geval dat de vertaler 16 | evenzeer vervuld ware als de dichter van hetgeen uitgedrukt moet 17 | worden, zoude het onmogelijk zijn, dat de tweede, de Nederlandsche 18 | dichter kwam tot een uitdrukkingsvorm, die ook maar eenigszins 19 | gelijkliep met den vorm door den eersten, den Italiaanschen dichter 20 | gevonden. Dante zelf zegt op dit stuk: "En daarom wete een ieder, 21 | dat geen enkele zaak, door den band der muziek harmonisch uitgedrukt, 22 | uit hare eigene taal in eene andere kan worden overgebracht, zonder 23 | dat men al hare zoetheid en harmonie verbreke." 24 | 25 | [Naam v.h. gedicht.] 26 | 27 | Het hier den Nederlandschen lezer aangeboden werk is het eerste 28 | van drie gedichten (Canzoni), "de Hel," "de Louteringsberg" en "het 29 | Paradijs," door Dante tezamen genoemd "Comedia", om de eenvoudige 30 | reden, dat het er in vervatte verhaal begint met 's Dichters tocht 31 | door de Hel, dus met treurigheid, vervolgens handelt van 's Dichters 32 | tocht langs den Louteringsberg en eindigt met 's Dichters tocht door 33 | den Hemel, of het Paradijs, dus een blijden afloop heeft. Comedia 34 | beteekent niet anders dan "blij-eindend Dicht." "Divina" is de Comedia 35 | eerst later door een bewonderend nageslacht genoemd. 36 | 37 | [Wat de inl. behelst.] 38 | 39 | Het Gedicht, waarin deze tocht verhaald wordt, en alles tot de kleinste 40 | bijzonderheden den lezer voor oogen wordt gesteld, kan eigenlijk 41 | geheel voor zich zelf spreken. 
Daar echter de Dichter op zijn tocht 42 | door die drie Rijken een ontzaggelijk groot aantal personen ontmoet, 43 | zoowel uit zijn eigen als uit vroegere tijden, hebben wij, vooral tot 44 | beter begrip van de gesprekken met personen uit 's Dichters eigen 45 | tijd, gemeend den lezer geen onwelkomen dienst te bewijzen, door 46 | eenige hoofdzaken aangaande 's Dichters leven en tijd mede te deelen. 47 | 48 | [Dante niet duister, wel diep.] 49 | 50 | Even wil ik nog den lezer op het hart drukken, dat het Gedicht nooit 51 | duister is, wèl op sommige plaatsen zeer diep van zin, zoodat menige 52 | plaats, behalve den eersten, bij de lezing onmiddellijk begrijpbaren 53 | zin bij nadere beschouwing blijkt nog veel meer te bevatten. Zulke 54 | plaatsen hebben dan ook aanleiding gegeven tot oneindige discussie, 55 | ten onrechte, daar er van discussie geen kwestie mocht zijn, nl. van 56 | een strijd van verschillende partijen, die ieder voor zich gelijk 57 | willen hebben, maar wel van een wedstrijd wie het diepst in den zin 58 | des dichters vermocht door te dringen. 59 | 60 | Maar vóór alles zij nog dit gezegd. Dante's Gedicht is niet maar eene 61 | schildering van zijn tijd; het is de schildering van den mensch, 62 | in al zijne vermogens en mogelijkheden, in al zijne eigenschappen, 63 | zoowel die hem tot de diepste zonde, als die hem tot den hoogsten 64 | heilstaat brengen. Daarom voert Dante, zelf alle ellende, loutering en 65 | geleidelijk-groeiend geluk doorlevend, den mensch van de gruwelijkste 66 | onvergoeilijke zonden, door die welke door boetedoeningen overwonnen 67 | kunnen worden tot het hoogste zielegeluk, d. w. z. door de Hel, 68 | langs den Louteringsberg naar den Hemel. 
69 | -------------------------------------------------------------------------------- /tests/data/du.out.txt: -------------------------------------------------------------------------------- 1 | doel 2 | vertal 3 | doel 4 | vertal 5 | den 6 | nederlandsch 7 | lezer 8 | kennis 9 | stell 10 | den 11 | volled 12 | inhoud 13 | dante' 14 | gedicht 15 | vertal 16 | zoovel 17 | mogelijk 18 | woordelijk 19 | handleid 20 | dien 21 | lez 22 | bestuder 23 | den 24 | oorspronk 25 | italiaansch 26 | tekst 27 | waarom 28 | proza 29 | vertal 30 | proza 31 | waarom 32 | woord 33 | waarin 34 | gedicht 35 | vervat 36 | den 37 | dichter 38 | werd 39 | ingegev 40 | scheppingsoogenblik 41 | volheid 42 | zijner 43 | fantasieen 44 | gevoelen 45 | gedacht 46 | zelv 47 | practisch 48 | onmog 49 | theoretisch 50 | stelbar 51 | geval 52 | vertaler 53 | evenzer 54 | vervuld 55 | war 56 | dichter 57 | hetgen 58 | uitgedrukt 59 | zoud 60 | onmog 61 | twed 62 | nederlandsch 63 | dichter 64 | kwam 65 | uitdrukkingsvorm 66 | eenigszin 67 | gelijkliep 68 | den 69 | vorm 70 | den 71 | eerst 72 | den 73 | italiaansch 74 | dichter 75 | gevond 76 | dant 77 | zegt 78 | stuk 79 | daarom 80 | wet 81 | ieder 82 | enkel 83 | zak 84 | den 85 | band 86 | muziek 87 | harmonisch 88 | uitgedrukt 89 | har 90 | eig 91 | tal 92 | een 93 | overgebracht 94 | har 95 | zoetheid 96 | harmonie 97 | verbrek 98 | nam 99 | v.h 100 | gedicht 101 | den 102 | nederlandsch 103 | lezer 104 | aangebod 105 | werk 106 | eerst 107 | drie 108 | gedicht 109 | canzoni 110 | hel 111 | louteringsberg 112 | paradijs 113 | dant 114 | tezam 115 | genoemd 116 | comedia 117 | eenvoud 118 | red 119 | vervat 120 | verhal 121 | begint 122 | s 123 | dichter 124 | tocht 125 | hel 126 | treurig 127 | vervolgen 128 | handelt 129 | s 130 | dichter 131 | tocht 132 | lang 133 | den 134 | louteringsberg 135 | eindigt 136 | s 137 | dichter 138 | tocht 139 | den 140 | hemel 141 | paradijs 142 | blijd 143 | aflop 144 | comedia 145 | beteekent 146 | ander 
147 | blij 148 | eindend 149 | dicht 150 | divina 151 | comedia 152 | eerst 153 | later 154 | bewonder 155 | nageslacht 156 | genoemd 157 | inl 158 | behelst 159 | gedicht 160 | waarin 161 | tocht 162 | verhaald 163 | kleinst 164 | bijzonder 165 | den 166 | lezer 167 | oog 168 | gesteld 169 | eigen 170 | gehel 171 | sprek 172 | echter 173 | dichter 174 | tocht 175 | drie 176 | rijk 177 | ontzagg 178 | grot 179 | aantal 180 | person 181 | ontmoet 182 | zoowel 183 | eig 184 | vroeger 185 | tijd 186 | wij 187 | vooral 188 | beter 189 | begrip 190 | gesprek 191 | person 192 | s 193 | dichter 194 | eig 195 | tijd 196 | gemeend 197 | den 198 | lezer 199 | onwelkom 200 | dienst 201 | bewijz 202 | eenig 203 | hoofdzak 204 | aangaand 205 | s 206 | dichter 207 | lev 208 | tijd 209 | med 210 | del 211 | dant 212 | duister 213 | wel 214 | diep 215 | even 216 | den 217 | lezer 218 | hart 219 | druk 220 | gedicht 221 | nooit 222 | duister 223 | wèl 224 | sommig 225 | plaats 226 | zer 227 | diep 228 | zin 229 | zoodat 230 | menig 231 | plat 232 | behalv 233 | den 234 | eerst 235 | lezing 236 | onmiddel 237 | begrijpbar 238 | zin 239 | nader 240 | beschouw 241 | blijkt 242 | bevat 243 | zulk 244 | plaats 245 | aanleid 246 | gegev 247 | oneind 248 | discussie 249 | ten 250 | onrecht 251 | discussie 252 | kwestie 253 | mocht 254 | nl 255 | strijd 256 | verschill 257 | partij 258 | ieder 259 | gelijk 260 | will 261 | wel 262 | wedstrijd 263 | diepst 264 | den 265 | zin 266 | des 267 | dichter 268 | vermocht 269 | dring 270 | vor 271 | gezegd 272 | dante' 273 | gedicht 274 | een 275 | schilder 276 | tijd 277 | schilder 278 | den 279 | mensch 280 | zijn 281 | vermogen 282 | mogelijk 283 | zijn 284 | eigenschapp 285 | zoowel 286 | diepst 287 | zond 288 | den 289 | hoogst 290 | heilstat 291 | breng 292 | daarom 293 | voert 294 | dant 295 | all 296 | ellend 297 | louter 298 | geleid 299 | groeiend 300 | geluk 301 | doorlev 302 | den 303 | mensch 304 | gruwelijkst 305 | onvergoei 306 | 
zond 307 | welk 308 | boetedoen 309 | overwonn 310 | hoogst 311 | zielegeluk 312 | d 313 | w 314 | z 315 | hel 316 | lang 317 | den 318 | louteringsberg 319 | den 320 | hemel 321 | -------------------------------------------------------------------------------- /tests/data/en.in.txt: -------------------------------------------------------------------------------- 1 | It is a truth universally acknowledged, that a single man in possession 2 | of a good fortune, must be in want of a wife. 3 | 4 | However little known the feelings or views of such a man may be on his 5 | first entering a neighbourhood, this truth is so well fixed in the minds 6 | of the surrounding families, that he is considered the rightful property 7 | of some one or other of their daughters. 8 | 9 | “My dear Mr. Bennet,” said his lady to him one day, “have you heard that 10 | Netherfield Park is let at last?” 11 | 12 | Mr. Bennet replied that he had not. 13 | 14 | “But it is,” returned she; “for Mrs. Long has just been here, and she 15 | told me all about it.” 16 | 17 | Mr. Bennet made no answer. 18 | 19 | “Do you not want to know who has taken it?” cried his wife impatiently. 20 | 21 | “_You_ want to tell me, and I have no objection to hearing it.” 22 | 23 | This was invitation enough. 24 | 25 | “Why, my dear, you must know, Mrs. Long says that Netherfield is taken 26 | by a young man of large fortune from the north of England; that he came 27 | down on Monday in a chaise and four to see the place, and was so much 28 | delighted with it, that he agreed with Mr. Morris immediately; that he 29 | is to take possession before Michaelmas, and some of his servants are to 30 | be in the house by the end of next week.” 31 | 32 | “What is his name?” 33 | 34 | “Bingley.” 35 | 36 | “Is he married or single?” 37 | 38 | “Oh! Single, my dear, to be sure! A single man of large fortune; four or 39 | five thousand a year. What a fine thing for our girls!” 40 | 41 | “How so? 
How can it affect them?” 42 | 43 | “My dear Mr. Bennet,” replied his wife, “how can you be so tiresome! You 44 | must know that I am thinking of his marrying one of them.” 45 | 46 | “Is that his design in settling here?” 47 | 48 | “Design! Nonsense, how can you talk so! But it is very likely that he 49 | _may_ fall in love with one of them, and therefore you must visit him as 50 | soon as he comes.” 51 | 52 | “I see no occasion for that. You and the girls may go, or you may send 53 | them by themselves, which perhaps will be still better, for as you are 54 | as handsome as any of them, Mr. Bingley may like you the best of the 55 | party.” 56 | 57 | “My dear, you flatter me. I certainly _have_ had my share of beauty, but 58 | I do not pretend to be anything extraordinary now. When a woman has five 59 | grown-up daughters, she ought to give over thinking of her own beauty.” 60 | 61 | “In such cases, a woman has not often much beauty to think of.” 62 | 63 | “But, my dear, you must indeed go and see Mr. Bingley when he comes into 64 | the neighbourhood.” 65 | 66 | “It is more than I engage for, I assure you.” 67 | 68 | “But consider your daughters. Only think what an establishment it would 69 | be for one of them. Sir William and Lady Lucas are determined to 70 | go, merely on that account, for in general, you know, they visit no 71 | newcomers. Indeed you must go, for it will be impossible for _us_ to 72 | visit him if you do not.” 73 | 74 | “You are over-scrupulous, surely. I dare say Mr. Bingley will be very 75 | glad to see you; and I will send a few lines by you to assure him of my 76 | hearty consent to his marrying whichever he chooses of the girls; though 77 | I must throw in a good word for my little Lizzy.” 78 | 79 | “I desire you will do no such thing. Lizzy is not a bit better than the 80 | others; and I am sure she is not half so handsome as Jane, nor half so 81 | good-humoured as Lydia. 
But you are always giving _her_ the preference.” 82 | 83 | “They have none of them much to recommend them,” replied he; “they are 84 | all silly and ignorant like other girls; but Lizzy has something more of 85 | quickness than her sisters.” 86 | 87 | “Mr. Bennet, how _can_ you abuse your own children in such a way? You 88 | take delight in vexing me. You have no compassion for my poor nerves.” 89 | 90 | “You mistake me, my dear. I have a high respect for your nerves. They 91 | are my old friends. I have heard you mention them with consideration 92 | these last twenty years at least.” 93 | 94 | “Ah, you do not know what I suffer.” 95 | 96 | “But I hope you will get over it, and live to see many young men of four 97 | thousand a year come into the neighbourhood.” 98 | 99 | “It will be no use to us, if twenty such should come, since you will not 100 | visit them.” 101 | 102 | “Depend upon it, my dear, that when there are twenty, I will visit them 103 | all.” 104 | 105 | Mr. Bennet was so odd a mixture of quick parts, sarcastic humour, 106 | reserve, and caprice, that the experience of three-and-twenty years had 107 | been insufficient to make his wife understand his character. _Her_ mind 108 | was less difficult to develop. She was a woman of mean understanding, 109 | little information, and uncertain temper. When she was discontented, 110 | she fancied herself nervous. The business of her life was to get her 111 | daughters married; its solace was visiting and news. 
-------------------------------------------------------------------------------- /tests/data/en.out.txt: -------------------------------------------------------------------------------- 1 | truth 2 | univers 3 | acknowledg 4 | singl 5 | man 6 | possess 7 | good 8 | fortun 9 | want 10 | wife 11 | littl 12 | known 13 | feel 14 | view 15 | such 16 | man 17 | first 18 | enter 19 | neighbourhood 20 | truth 21 | well 22 | fix 23 | mind 24 | surround 25 | famili 26 | consid 27 | right 28 | properti 29 | on 30 | daughter 31 | mr 32 | bennet 33 | ladi 34 | on 35 | day 36 | heard 37 | netherfield 38 | park 39 | last 40 | mr 41 | bennet 42 | repli 43 | return 44 | mr 45 | long 46 | here 47 | told 48 | mr 49 | bennet 50 | made 51 | answer 52 | want 53 | know 54 | taken 55 | cri 56 | wife 57 | impati 58 | _you_ 59 | want 60 | tell 61 | object 62 | hear 63 | invit 64 | enough 65 | know 66 | mr 67 | long 68 | netherfield 69 | taken 70 | young 71 | man 72 | larg 73 | fortun 74 | north 75 | england 76 | came 77 | down 78 | monday 79 | chais 80 | four 81 | see 82 | place 83 | much 84 | delight 85 | agre 86 | mr 87 | morri 88 | immedi 89 | take 90 | possess 91 | befor 92 | michaelma 93 | servant 94 | hous 95 | end 96 | next 97 | week 98 | name 99 | bingley 100 | marri 101 | singl 102 | oh 103 | singl 104 | sure 105 | singl 106 | man 107 | larg 108 | fortun 109 | four 110 | five 111 | thousand 112 | year 113 | fine 114 | thing 115 | girl 116 | affect 117 | mr 118 | bennet 119 | repli 120 | wife 121 | tiresom 122 | know 123 | think 124 | marri 125 | on 126 | design 127 | settl 128 | here 129 | design 130 | nonsens 131 | talk 132 | veri 133 | _may_ 134 | fall 135 | love 136 | on 137 | therefor 138 | visit 139 | soon 140 | come 141 | see 142 | occas 143 | girl 144 | go 145 | send 146 | themselv 147 | perhap 148 | still 149 | better 150 | handsom 151 | mr 152 | bingley 153 | best 154 | parti 155 | flatter 156 | certainli 157 | _have_ 158 | share 159 | beauti 160 | pretend 161 | anyth 162 
| extraordinari 163 | now 164 | woman 165 | five 166 | grown 167 | up 168 | daughter 169 | ought 170 | give 171 | over 172 | think 173 | beauti 174 | such 175 | case 176 | woman 177 | much 178 | beauti 179 | think 180 | inde 181 | go 182 | see 183 | mr 184 | bingley 185 | come 186 | neighbourhood 187 | more 188 | engag 189 | assur 190 | consid 191 | daughter 192 | think 193 | establish 194 | on 195 | sir 196 | william 197 | ladi 198 | luca 199 | determin 200 | go 201 | mere 202 | account 203 | gener 204 | know 205 | visit 206 | newcom 207 | inde 208 | go 209 | imposs 210 | _us_ 211 | visit 212 | over 213 | scrupul 214 | sure 215 | dare 216 | mr 217 | bingley 218 | veri 219 | glad 220 | see 221 | send 222 | few 223 | line 224 | assur 225 | hearti 226 | consent 227 | marri 228 | whichev 229 | choos 230 | girl 231 | though 232 | throw 233 | good 234 | word 235 | littl 236 | lizzi 237 | desir 238 | such 239 | thing 240 | lizzi 241 | bit 242 | better 243 | other 244 | sure 245 | half 246 | handsom 247 | jane 248 | half 249 | good 250 | humour 251 | lydia 252 | alway 253 | give 254 | _her_ 255 | prefer 256 | none 257 | much 258 | recommend 259 | repli 260 | silli 261 | ignor 262 | girl 263 | lizzi 264 | someth 265 | more 266 | quick 267 | sister 268 | mr 269 | bennet 270 | _can_ 271 | abus 272 | children 273 | such 274 | way 275 | take 276 | delight 277 | vex 278 | compass 279 | poor 280 | nerv 281 | mistak 282 | high 283 | respect 284 | nerv 285 | old 286 | friend 287 | heard 288 | mention 289 | consider 290 | last 291 | twenti 292 | year 293 | ah 294 | know 295 | suffer 296 | hope 297 | over 298 | live 299 | see 300 | mani 301 | young 302 | men 303 | four 304 | thousand 305 | year 306 | come 307 | neighbourhood 308 | us 309 | twenti 310 | such 311 | come 312 | visit 313 | depend 314 | upon 315 | twenti 316 | visit 317 | mr 318 | bennet 319 | odd 320 | mixtur 321 | quick 322 | part 323 | sarcast 324 | humour 325 | reserv 326 | capric 327 | experi 328 | three 329 | 
twenti 330 | year 331 | insuffici 332 | make 333 | wife 334 | understand 335 | charact 336 | _her_ 337 | mind 338 | less 339 | difficult 340 | develop 341 | woman 342 | mean 343 | understand 344 | littl 345 | inform 346 | uncertain 347 | temper 348 | discont 349 | fanci 350 | herself 351 | nervou 352 | busi 353 | life 354 | daughter 355 | marri 356 | solac 357 | visit 358 | new 359 | -------------------------------------------------------------------------------- /tests/data/es.in.txt: -------------------------------------------------------------------------------- 1 | En el piso bajo de la izquierda de una humilde pero graciosa y limpia 2 | casa de la calle de Preciados, calle muy estrecha y retorcida en aquel 3 | entonces, y teatro de la refriega en tal momento, vivían[13] solas, esto 4 | es, sin la compañía de hombre ninguno, tres buenas y piadosas[14] 5 | mujeres, que mucho se diferenciaban entre sí en cuanto al ser físico y 6 | estado social, puesto que éranse que se eran[15] una señora mayor, 7 | viuda, guipuzcoana, de aspecto grave y distinguido; una hija suya, 8 | joven, soltera, natural de Madrid, y bastante guapa, aunque de tipo 9 | diferente al de la madre (lo cual daba a entender que había salido en 10 | todo a su padre),[16] y una doméstica,[17] imposible de filiar o 11 | describir, sin edad, figura ni casi sexo determinables, bautizada, hasta 12 | cierto punto,[18] en Mondoñedo, y a la cual ya hemos hecho demasiado 13 | favor (como también se lo hizo aquel señor Cura) con reconocer que 14 | pertenecía a la especie humana... 15 | 16 | La mencionada joven parecía el símbolo o representación, viva y con 17 | faldas,[19] del sentido común: tal equilibrio había entre su hermosura 18 | y su naturalidad, entre su elegancia y su sencillez, entre su gracia y 19 | su modestia. 
Facilísimo[20] era que pasase inadvertida por la vía 20 | pública, sin alborotar a los galanteadores de oficio, pero imposible que 21 | nadie dejara de admirarla[21] y de prendarse de sus múltiples 22 | encantos,[22] luego que fijase en ella la atención.[23] 23 | 24 | No era, no (o, por mejor decir, no quería ser), una de esas beldades 25 | llamativas, aparatosas, fulminantes, que atraen todas las miradas no 26 | bien se presentan en un salón, teatro, o paseo, y que comprometen o 27 | anulan al pobrete que las acompaña, sea novio, sea marido, sea padre, 28 | sea el mismísimo Preste Juan de las Indias...[24] Era un conjunto sabio 29 | y armónico de perfecciones físicas y morales, cuya prodigiosa 30 | regularidad no entusiasmaba al pronto, como no entusiasman la paz y el 31 | orden; o como acontece con los monumentos bien proporcionados, donde 32 | nada nos choca ni maravilla hasta que[25] formamos juicio de que,[26] si 33 | todo resulta llano, fácil y natural, consiste en que todo es igualmente 34 | bello. Dijérase[27] que aquella diosa honrada de la clase media había 35 | estudiado su modo de vestirse, de peinarse, de mirar, de moverse, de 36 | conllevar, en fin, los tesoros de su espléndida juventud, en tal forma y 37 | manera, que no se la creyese pagada[28] de sí misma, ni presuntuosa, ni 38 | incitante, sino muy diferente de las deidades por casar que hacen feria 39 | de sus hechizos y van por esas calles[29] de Dios diciendo a todo el 40 | mundo: _Esta casa se vende... o se alquila_. 41 | 42 | Pero no nos detengamos en floreos ni dibujos,[30] que es mucho lo que 43 | tenemos que referir, y poquísimo el tiempo de que disponemos. 
44 | -------------------------------------------------------------------------------- /tests/data/es.out.txt: -------------------------------------------------------------------------------- 1 | pis 2 | baj 3 | izquierd 4 | humild 5 | gracios 6 | limpi 7 | cas 8 | call 9 | preci 10 | call 11 | estrech 12 | retorc 13 | aquel 14 | entonc 15 | teatr 16 | refrieg 17 | tal 18 | moment 19 | viv 20 | sol 21 | compañ 22 | hombr 23 | ningun 24 | tres 25 | buen 26 | piad 27 | mujer 28 | diferenci 29 | cuant 30 | ser 31 | fisic 32 | social 33 | puest 34 | erans 35 | señor 36 | mayor 37 | viud 38 | guipuzcoan 39 | aspect 40 | grav 41 | distingu 42 | hij 43 | jov 44 | solter 45 | natural 46 | madr 47 | bastant 48 | guap 49 | aunqu 50 | tip 51 | diferent 52 | madr 53 | dab 54 | entend 55 | sal 56 | padr 57 | domest 58 | impos 59 | fili 60 | describ 61 | edad 62 | figur 63 | casi 64 | sex 65 | determin 66 | bautiz 67 | ciert 68 | punt 69 | mondoñed 70 | hech 71 | demasi 72 | favor 73 | hiz 74 | aquel 75 | señor 76 | cur 77 | reconoc 78 | pertenec 79 | especi 80 | human 81 | mencion 82 | jov 83 | parec 84 | simbol 85 | represent 86 | viv 87 | fald 88 | sent 89 | comun 90 | tal 91 | equilibri 92 | hermosur 93 | natural 94 | eleg 95 | sencillez 96 | graci 97 | modesti 98 | facilisim 99 | pas 100 | inadvert 101 | via 102 | public 103 | alborot 104 | galant 105 | ofici 106 | impos 107 | nadi 108 | dej 109 | admir 110 | prend 111 | multipl 112 | encant 113 | lueg 114 | fij 115 | atencion 116 | mejor 117 | dec 118 | quer 119 | ser 120 | beldad 121 | llamat 122 | aparat 123 | fulmin 124 | atra 125 | tod 126 | mir 127 | bien 128 | present 129 | salon 130 | teatr 131 | pase 132 | compromet 133 | anul 134 | pobret 135 | acompañ 136 | novi 137 | mar 138 | padr 139 | mismisim 140 | prest 141 | juan 142 | indi 143 | conjunt 144 | sabi 145 | armon 146 | perfeccion 147 | fisic 148 | moral 149 | cuy 150 | prodigi 151 | regular 152 | entusiasm 153 | pront 154 | entusiasm 155 | paz 156 | orden 157 
| acontec 158 | monument 159 | bien 160 | proporcion 161 | choc 162 | maravill 163 | form 164 | juici 165 | si 166 | result 167 | llan 168 | facil 169 | natural 170 | cons 171 | igual 172 | bell 173 | dijer 174 | aquell 175 | dios 176 | honr 177 | clas 178 | medi 179 | estudi 180 | mod 181 | vest 182 | pein 183 | mir 184 | mov 185 | conllev 186 | fin 187 | tesor 188 | esplend 189 | juventud 190 | tal 191 | form 192 | maner 193 | creyes 194 | pag 195 | mism 196 | presuntu 197 | incit 198 | sin 199 | diferent 200 | deidad 201 | cas 202 | hac 203 | feri 204 | hechiz 205 | van 206 | call 207 | dios 208 | dic 209 | mund 210 | cas 211 | vend 212 | alquil 213 | deteng 214 | flore 215 | dibuj 216 | refer 217 | poquisim 218 | tiemp 219 | dispon 220 | -------------------------------------------------------------------------------- /tests/data/fi.in.txt: -------------------------------------------------------------------------------- 1 | MÄKÄRÄ. 2 | 3 | _(Kumarrellen sakastin rappusilla.)_ Hyvä herra, armollinen 4 | kirkonpalvelija... 5 | 6 | SUNTIO. 7 | 8 | Ka, mene, mene! Ei tänne saa tulla. 9 | 10 | MÄKÄRÄ. 11 | 12 | Enhän minä .. suokaa anteeksi.. Olisin vaan kaikessa nöyryydessäni 13 | tullut herran temppeliin. 14 | 15 | SUNTIO. 16 | 17 | Sinä et kuitenkaan malta olla yhdessä kohden. Juoksentelet ympäri 18 | kirkkoa ja höpiset jonkin joutavaa. 19 | 20 | MÄKÄRÄ. 21 | 22 | He, hee.. Höpisenkö minä?.. No ei sitten. Tämä pääkartano ei tahdo taas 23 | pitää kutiaan. _(Hän nauraa kähäyttää.)_ 24 | 25 | SUNTIO. 26 | 27 | Senpätähden saat pysyä poikessa. 28 | 29 | MÄKÄRÄ. 30 | 31 | _(Vedet silmissä.)_ Ni-ni-niinhän tuota pitänee.. Kas, pääsky lensi.. 32 | Mutta mitä pahaa minä sitten olen tehnyt, kun sinä minua ajelet kirkosta 33 | pois? 34 | 35 | SUNTIO. 36 | 37 | Ethän sinä pysy yhdessä kohdenkaan, ja kun pappi saarnassaan muun muassa 38 | sanoi: kyllä herra hullut hoitaa, niin silloin sinä heti ääneen huusit 39 | että kyllä se hoitaa. Sopiiko tämä nyt kirkossa? 
40 | 41 | MÄKÄRÄ. 42 | 43 | _(On hetken alakuloisen näköinen, mutta sitten iloisesti.)_ Mutta 44 | hoitaakinhan se. Mitäs pahaa siinä on? 45 | 46 | SUNTIO. 47 | 48 | Eihän kirkossa saa huutaa ja juoksennella. Ymmärräthän sen sinäkin. 49 | 50 | MÄKÄRÄ. 51 | 52 | Niin .. niinhän se on, eihän sitä saisi, vaan kun ne henget viettelevät, 53 | niin minkäpä sille taitaa. 54 | 55 | SUNTIO. 56 | 57 | Miten ne henget viettelevät? 58 | 59 | MÄKÄRÄ. 60 | 61 | Nekö? Nehän pitävät Mäkärää aivan narrillaan. Suhkavat kirkossakin 62 | korvaan: juokse, juokse!.. ja silloin täytyy juosta. Toinen tulee ja 63 | kuiskaa: elä vainenkaan juokse, elä vainenkaan juokse, mutta huuda .. ja 64 | minä huudan. Muutoin ne tekisivät kerrassa kummia, niitä täytyy totella. 65 | Kyllä ne kirkossa kumminkin vähän siivommalla ovat, mutta, vie sun, kun 66 | ne kotona vehkeilevät, niin jos siinä ei ole sen seitsemässäkin 67 | höyräkässä. Se pappa-piru on -- koira vieköön -- kaikista ilkein .. 68 | katsos.. _(Hän levittää kätensä ja sormensa, panee naamansa hyvin julman 69 | näköiseksi, irvistää ja hyppää suntioon päin.)_ Näin se tekee. 70 | 71 | _(SUNTIO ärjäsee ja vetäytyy säikähtäen taaksepäin.)_ 72 | 73 | MÄKÄRÄ. 74 | 75 | _(Nauraa viekkaasti ja räpyttää silmiään.)_ Ei tämä mitään Ole sen 76 | suhteen.. Vaan niin se tekee ja ottaa kirveen penkin alta ja huutaa: 77 | Mäkärä, pane pää pölkylle! Ja jos minä silloin en olisi tiukkana ja 78 | sukkelana, niin arvaathan sen, mitä se piru silloin tekisi. Mutta minä 79 | hyppään näin ikään .. taaksepäin, kun se tulee kirveineen ja manaan 80 | häntä jumalan nimessä menemään pois. Sitten se vähitellen vetäytyy 81 | jonkun pimeän nurkan kautta pellolle, mutta kauvanhan peijakkaan mustat 82 | silmät sieltä vielä kiiluvat. 83 | 84 | SUNTIO. 85 | 86 | Katsos peijakasta. Kyllä kai se minua peloittaisi. 87 | 88 | MÄKÄRÄ. 89 | 90 | Vaarassa niiden kanssa toki onkin tuolla kulkiessaan. 
Kun vaan kaivon 91 | kohdalle sattuu, niin silloin tuo, joka on kaikista suurin roisto, tuo 92 | Ansgaarius, joka kirkkoonkin tulee .. se tuntuu aivan kuin niskasta 93 | kiini ottavan ja suhkaa: Mäkärä, Mäkärä, hyppää kaivoon! Vaan silloin 94 | minä pyöräytän sitä tuolla lailla .. ja alan juosta sen minkä käpälästä 95 | lähtee.. Kas, kas; tuolla kun oriit tappelevat. 96 | 97 | SUNTIO. 98 | 99 | _(Juoksee katsomaan.)_ Missä .. missä? 100 | 101 | MÄKÄRÄ. 102 | 103 | _(Kiiruhtaa sakastin rappusia ylös päästäkseen kirkkoon.)_ Siellä, 104 | siellä.. He, he, he!.. He, he, he! 105 | 106 | SUNTIO. 107 | 108 | _(Rientää heti Mäkärän perästä ja tapaa hänet kiini vaatteen liepeestä 109 | juuri sakastin ovella.)_ Eläpäs menekään.. Kas peijakkaan, kun oli 110 | sukkela. 111 | 112 | MÄKÄRÄ. 113 | 114 | _(Vetäytyy siivosti takaisin ja viekkaasti nauraen räpyttää silmiään.)_ 115 | 116 | Ilmanhan minä vaan säikäytin. Pidä vasta ovesi tarkemmin kiini, ettei 117 | syntinen kirkkoon pääse.. Hähä! Jo tulen. 118 | 119 | _(SUNTIO menee sakastiin ja sukkelasti vetää oven kiini jälessään.)_ 120 | 121 | MÄKÄRÄ. 
122 | -------------------------------------------------------------------------------- /tests/data/fi.out.txt: -------------------------------------------------------------------------------- 1 | mäkär 2 | kumarrel 3 | sakast 4 | rappus 5 | hyvä 6 | her 7 | armollin 8 | kirkonpalvelij 9 | suntio 10 | ka 11 | mene 12 | mene 13 | tän 14 | saa 15 | tul 16 | mäkär 17 | en 18 | suoka 19 | ant 20 | kaike 21 | nöyryyd 22 | tulu 23 | her 24 | temppel 25 | suntio 26 | kuite 27 | mal 28 | yhd 29 | kohd 30 | juoksentel 31 | ympär 32 | kirko 33 | höpis 34 | jon 35 | joutav 36 | mäkär 37 | hee 38 | höpis 39 | no 40 | sit 41 | pääkartano 42 | tahdo 43 | taas 44 | pitä 45 | kutia 46 | naura 47 | kähäyt 48 | suntio 49 | senpätähd 50 | saat 51 | pysy 52 | poike 53 | mäkär 54 | vede 55 | silm 56 | ni 57 | ni 58 | niin 59 | pitän 60 | kas 61 | pääsky 62 | len 63 | paha 64 | sit 65 | tehny 66 | ajel 67 | kirko 68 | pois 69 | suntio 70 | et 71 | pysy 72 | yhd 73 | kohd 74 | pap 75 | saarn 76 | muun 77 | muas 78 | sanoi 79 | kyl 80 | her 81 | hulu 82 | hoita 83 | silo 84 | heti 85 | ääne 86 | huusi 87 | kyl 88 | hoita 89 | sopi 90 | kirko 91 | mäkär 92 | hetk 93 | alakulois 94 | näköin 95 | sit 96 | ilois 97 | hoitaak 98 | mitäs 99 | paha 100 | suntio 101 | eihä 102 | kirko 103 | saa 104 | huuta 105 | juoksen 106 | ymmär 107 | sinä 108 | mäkär 109 | niin 110 | eihä 111 | saisi 112 | heng 113 | viettelev 114 | mink 115 | taita 116 | suntio 117 | mite 118 | heng 119 | viettelev 120 | mäkär 121 | nekö 122 | nehä 123 | pitäv 124 | mäkär 125 | aiva 126 | nar 127 | suhkav 128 | kirko 129 | korv 130 | juoks 131 | juoks 132 | silo 133 | täytyy 134 | juos 135 | toine 136 | tule 137 | kuisk 138 | elä 139 | vaine 140 | juoks 141 | elä 142 | vaine 143 | juoks 144 | huuda 145 | huuda 146 | muuto 147 | tekisiv 148 | ker 149 | kum 150 | täytyy 151 | tote 152 | kyl 153 | kirko 154 | kum 155 | vähä 156 | siivom 157 | vie 158 | sun 159 | koto 160 | vehkeilev 161 | seitsem 162 | höyräk 163 | pap 164 | 
piru 165 | koira 166 | viekö 167 | kaik 168 | ilk 169 | katsos 170 | levit 171 | käte 172 | sorm 173 | pane 174 | naama 175 | hyv 176 | julm 177 | näköis 178 | irvist 179 | hyp 180 | suntio 181 | päin 182 | näin 183 | teke 184 | suntio 185 | ärjäs 186 | vetäytyy 187 | säikähtäe 188 | taaksep 189 | mäkär 190 | naura 191 | viekkaast 192 | räpyt 193 | silmiä 194 | mitä 195 | suht 196 | teke 197 | ot 198 | kirv 199 | pen 200 | al 201 | huuta 202 | mäkär 203 | pane 204 | pää 205 | pölky 206 | silo 207 | tiuk 208 | sukkel 209 | arv 210 | piru 211 | silo 212 | teki 213 | hyp 214 | näin 215 | ikä 216 | taaksep 217 | tule 218 | kirv 219 | mana 220 | jumal 221 | nime 222 | menem 223 | pois 224 | sit 225 | vähitel 226 | vetäytyy 227 | jonku 228 | pimeä 229 | nurk 230 | kaut 231 | pelo 232 | kauva 233 | peijak 234 | must 235 | silm 236 | siel 237 | vielä 238 | kiiluv 239 | suntio 240 | katsos 241 | peijak 242 | kyl 243 | kai 244 | peloitai 245 | mäkär 246 | vaara 247 | toki 248 | on 249 | kulkie 250 | kaivo 251 | kohd 252 | satu 253 | silo 254 | kaik 255 | suur 256 | roisto 257 | ansgaarius 258 | kirko 259 | tule 260 | tuntu 261 | aiva 262 | nisk 263 | kiini 264 | ottav 265 | suhk 266 | mäkär 267 | mäkär 268 | hyp 269 | kaivo 270 | silo 271 | pyöräyt 272 | lail 273 | ala 274 | juos 275 | käpäl 276 | läht 277 | kas 278 | kas 279 | ori 280 | tappelev 281 | suntio 282 | juoks 283 | katsom 284 | mäkär 285 | kiiruht 286 | sakast 287 | rappus 288 | ylös 289 | päästäks 290 | kirko 291 | siel 292 | siel 293 | suntio 294 | rient 295 | heti 296 | mäkär 297 | perä 298 | tapa 299 | kiini 300 | vaat 301 | liepe 302 | juuri 303 | sakast 304 | ove 305 | eläpäs 306 | mene 307 | kas 308 | peijak 309 | sukkel 310 | mäkär 311 | vetäytyy 312 | siivost 313 | takais 314 | viekkaast 315 | naurae 316 | räpyt 317 | silmiä 318 | ilm 319 | säikäyt 320 | pidä 321 | vas 322 | ove 323 | tarkem 324 | kiini 325 | etei 326 | syntin 327 | kirko 328 | pääse 329 | hähä 330 | jo 331 | tule 332 | suntio 333 | mene 
334 | sakast 335 | sukkel 336 | vetä 337 | ove 338 | kiini 339 | jäle 340 | mäkär 341 | -------------------------------------------------------------------------------- /tests/data/fr.in.txt: -------------------------------------------------------------------------------- 1 | Zanette, c'était son nom de Jeanne, de Jeannette, comme elle le 2 | prononçait en zézayant, lorsqu'elle était toute petite. Tel il lui était 3 | resté. Ce qui, aussi, lui était resté, c'était sa grâce d'enfance, on ne 4 | sait quoi de tout mignon, de plus jeune qu'elle-même. Elle était belle 5 | de ses beaux seize ans, de son profil de Grecque, et de ses cheveux 6 | noirs, qui, sous le hennin à l'arlésienne, pendaient lourdement sur la 7 | blancheur dorée de son cou. 8 | 9 | Elle avait seize ans avec l'air d'en avoir douze. Pourtant, on sentait 10 | la vie jeune et forte palpiter dans la chapelle, c'est-à-dire dans 11 | l'entre-bâillement des fichus aux plis innombrables, qui laissent voir 12 | un peu de la poitrine nue sur laquelle brille la croix d'or suspendue à 13 | la chaînette des grand'mères. 14 | 15 | Zanette vivait à la ferme de la Sirène, bien tranquille à soigner ses 16 | poules, ses lapins, auprès de son père, maître Augias, le bayle. À 17 | l'ordinaire elle allait en Arles tous les dimanches. 18 | 19 | Et bien souvent, assise au bord du Petit Rhône, seule, sous les saules 20 | et les aubes, elle rêvait en regardant l'eau, l'eau qui s'en allait vers 21 | la mer, vers la mer si grande, où des bateaux vont et viennent, comme 22 | des bêtes de rêve, comme de grands oiseaux aux ailes blanches.... Un 23 | songe d'inconnu accompagnait toujours Zanette. Ses beaux seize ans 24 | espéraient. 25 | 26 | ...N'est-ce pas qu'elle porte un joli nom, la ferme de la Sirène? La 27 | Sirène (la Sereno) si vous interrogez les paysans, ils vous le diront, 28 | est un oiseau de passage, qui jamais ne s'arrête chez nous, et qui 29 | traverse seulement notre ciel, très haut. 
Quelquefois, le laboureur, en 30 | novembre, arrête son attelage, parce qu'il a entendu une harmonie 31 | lointaine, confuse, comme un son prolongé de viole ou de mandoline.... 32 | 33 | Et il écoute, en rêvant.... 34 | 35 | Ce sont les sirènes qui passent là-haut, tout là-haut. Elles sont plus 36 | petites que des tourterelles et leurs plumes miroitantes ont toutes les 37 | couleurs de l'arc-en-ciel. On ne sait pas si la musique qu'elles font 38 | sort de leur gosier ou vient simplement de le vibration de leurs ailes. 39 | On croit plutôt que leur vol est harmonieux. Leur voix y ajoute une 40 | seule note qui, de temps en temps, scande et domine la mélodie des 41 | ailes.... Un jour, dit-on, comme on venait à peine de construire le 42 | château et sa ferme, une sirène un instant se posa sur le bouquet de 43 | tamaris en fleurs que les maçons plantent au bout d'une perche, sur la 44 | toiture, dès qu'elle est achevée. Et le château, et la ferme qui le 45 | touche, furent, voilà bien longtemps, baptisés du nom qu'ils portent 46 | encore. 47 | 48 | Entre la ferme et la château, une vieille chapelle décrépite, où jadis 49 | on disait la messe, se dresse, étroite et longue. 50 | 51 | On la dirait bâtie sur le modèle des huttes camarguaises. 52 | 53 | Les huttes sont en «tape», en argile desséchée, recouvertes de roseaux, 54 | et la chapelle est en moellons, et recouverte de pierres plates, mais 55 | les deux toits ont la même forme, celle d'un bateau long, la quille en 56 | l'air; et sur leurs toitures, les cabanes, aussi bien que la chapelle, 57 | portent toutes une croix penchée, comme renversée en arrière. Toutes ces 58 | croix penchantes font songer au mistral éternel qui incline ainsi un peu 59 | tous les arbres des plaines provençales, dans la même direction. 
Tous 60 | ils gardent un peu la marque du vent maître, «magistral», à qui les 61 | Romains avaient élevé un temple, comme à la puissance divine, 62 | protectrice de ce pays qu'il balaye et assainit sans cesse.... Elles 63 | donnent encore, les petites croix qu'on plante ainsi à dessein penchées, 64 | l'impression des choses de la religion, à la fois vaincues et 65 | résistantes. Elles sont là, tenaces mais inclinées, jamais arrachées 66 | mais toujours penchantes, et elles disent le triomphe obstiné d'une foi 67 | sans relâche battue des vents.... -------------------------------------------------------------------------------- /tests/data/fr.out.txt: -------------------------------------------------------------------------------- 1 | zanet 2 | c'et 3 | nom 4 | jeann 5 | jeannet 6 | comm 7 | prononc 8 | zézai 9 | lorsqu'el 10 | tout 11 | petit 12 | tel 13 | rest 14 | auss 15 | rest 16 | c'et 17 | grâc 18 | d'enfanc 19 | sait 20 | quoi 21 | tout 22 | mignon 23 | plus 24 | jeun 25 | qu'el 26 | bel 27 | beau 28 | seiz 29 | an 30 | profil 31 | grecqu 32 | cheveux 33 | noir 34 | sous 35 | hennin 36 | l'arlésien 37 | pend 38 | lourd 39 | blancheur 40 | dor 41 | cou 42 | seiz 43 | an 44 | l'air 45 | d'en 46 | avoir 47 | douz 48 | pourt 49 | sent 50 | vi 51 | jeun 52 | fort 53 | palpit 54 | chapel 55 | c'est 56 | dir 57 | l'entr 58 | bâill 59 | fichus 60 | plis 61 | innombr 62 | laissent 63 | voir 64 | peu 65 | poitrin 66 | nu 67 | laquel 68 | brill 69 | croix 70 | d'or 71 | suspendu 72 | chaînet 73 | grand'mer 74 | zanet 75 | viv 76 | ferm 77 | siren 78 | bien 79 | tranquill 80 | soign 81 | poul 82 | lapin 83 | aupres 84 | per 85 | maîtr 86 | augi 87 | bayl 88 | l'ordinair 89 | allait 90 | arle 91 | tous 92 | dimanch 93 | bien 94 | souvent 95 | assis 96 | bord 97 | pet 98 | rhôn 99 | seul 100 | sous 101 | saul 102 | aub 103 | rêv 104 | regard 105 | l'eau 106 | l'eau 107 | s'en 108 | allait 109 | ver 110 | mer 111 | ver 112 | mer 113 | si 114 | grand 115 | où 116 | bateau 
117 | vont 118 | viennent 119 | comm 120 | bêt 121 | rêv 122 | comm 123 | grand 124 | oiseau 125 | ail 126 | blanch 127 | song 128 | d'inconnu 129 | accompagn 130 | toujour 131 | zanet 132 | beau 133 | seiz 134 | an 135 | esper 136 | n'est 137 | qu'el 138 | port 139 | jol 140 | nom 141 | ferm 142 | siren 143 | siren 144 | sereno 145 | si 146 | interrog 147 | paysan 148 | diront 149 | oiseau 150 | passag 151 | jam 152 | s'arrêt 153 | chez 154 | travers 155 | seul 156 | ciel 157 | tres 158 | haut 159 | quelquefois 160 | laboureur 161 | novembr 162 | arrêt 163 | attelag 164 | parc 165 | qu'il 166 | a 167 | entendu 168 | harmon 169 | lointain 170 | confus 171 | comm 172 | prolong 173 | viol 174 | mandolin 175 | écout 176 | rêv 177 | siren 178 | passent 179 | là 180 | haut 181 | tout 182 | là 183 | haut 184 | elle 185 | plus 186 | petit 187 | tourterel 188 | plum 189 | miroit 190 | tout 191 | couleur 192 | l'arc 193 | ciel 194 | sait 195 | si 196 | musiqu 197 | qu'el 198 | font 199 | sort 200 | gosi 201 | vient 202 | simpl 203 | vibrat 204 | ail 205 | croit 206 | plutôt 207 | vol 208 | harmoni 209 | voix 210 | ajout 211 | seul 212 | not 213 | temp 214 | temp 215 | scand 216 | domin 217 | mélod 218 | ail 219 | jour 220 | dit 221 | comm 222 | ven 223 | pein 224 | construir 225 | château 226 | ferm 227 | siren 228 | instant 229 | pos 230 | bouquet 231 | tamar 232 | fleur 233 | maçon 234 | plantent 235 | bout 236 | d'un 237 | perch 238 | toitur 239 | des 240 | qu'el 241 | achev 242 | château 243 | ferm 244 | touch 245 | voilà 246 | bien 247 | longtemp 248 | baptis 249 | nom 250 | qu'il 251 | portent 252 | encor 253 | entre 254 | ferm 255 | château 256 | vieil 257 | chapel 258 | décrépit 259 | où 260 | jad 261 | dis 262 | mess 263 | dress 264 | étroit 265 | longu 266 | dir 267 | bât 268 | model 269 | hutt 270 | camarguais 271 | hutt 272 | tap 273 | argil 274 | dessech 275 | recouvert 276 | roseau 277 | chapel 278 | moellon 279 | recouvert 280 | pierr 281 | plat 282 | deux 
283 | toit 284 | form 285 | cel 286 | d'un 287 | bateau 288 | long 289 | quill 290 | l'air 291 | toitur 292 | caban 293 | auss 294 | bien 295 | chapel 296 | portent 297 | tout 298 | croix 299 | pench 300 | comm 301 | renvers 302 | arrier 303 | tout 304 | croix 305 | pench 306 | font 307 | song 308 | mistral 309 | éternel 310 | inclin 311 | ains 312 | peu 313 | tous 314 | arbre 315 | plain 316 | provençal 317 | direct 318 | tous 319 | gardent 320 | peu 321 | marqu 322 | vent 323 | maîtr 324 | magistral 325 | romain 326 | élev 327 | templ 328 | comm 329 | puissanc 330 | divin 331 | protectric 332 | pay 333 | qu'il 334 | balay 335 | assain 336 | cess 337 | elle 338 | donnent 339 | encor 340 | petit 341 | croix 342 | qu'on 343 | plant 344 | ains 345 | dessein 346 | pench 347 | l'impress 348 | chos 349 | religion 350 | fois 351 | vaincu 352 | résist 353 | elle 354 | là 355 | tenac 356 | inclin 357 | jam 358 | arrach 359 | toujour 360 | pench 361 | elle 362 | disent 363 | triomph 364 | obstin 365 | d'un 366 | foi 367 | relâch 368 | battu 369 | vent 370 | -------------------------------------------------------------------------------- /tests/data/hu.in.txt: -------------------------------------------------------------------------------- 1 | Színházak és újságok, mintha tartanának tőle, hogy valaki megelőzi őket az ünneplésben, nem tudják bevárni március tizenötödikét. Korán kezdik az ünneplést és valószínűleg nem sietnek majd a befejezésével sem. Nagyon szép vonás ez. Az igazi ünnep nem is férhet bele a huszonnégy órás keretbe, s a mesék is úgy tanítják, hogy a nagy lakodalmak három hétig tartanak. Hadd kezdődjék hát a nagy nemzeti ünnep és tartson el akár április végéig, legalább elhelyezkedik ebben a nagy keretben mindenkinek a választott dátuma, meg a hivatalos is. Ma hegyen-völgyön tart a széles jókedv, szépen összeolvad az ünnepi hangulatban március tizenötödike és április tizenegyedike, s ebben az ünnepi folytonosságban lesz valami költői igazság is. 
2 | 3 | A megindult ünneplésből azonban csak egy kis epizódra tereljük ezúttal a figyelmet. Csekélység az egész, nem is bevallott pont az ünnepi programban. De nekünk úgy tetszik, hogy a véletlen sokkal szebben tud ünnepet csinálni, mint a közönséges halandó. És a nemzet voltaképpen hálával tartozik annak az ismeretlen egyetemi hallgatónak (bolond diszkréció, hogy az újságok nem akarják a nevét kiírni), aki annakidején megvette hitelbe a Jókai Mór összes munkáit és a kikötött részleteket, természetesen, nem fizette meg. Kiskorú volt a diák, s a könyvkereskedő ennélfogva a szülőt fogta pörbe, de az apa nem akart fizetni, mondván, hogy kiskorú fiának csak olyan vásárlásaiért felelős, amelyek fenntartásához föltétlenül szükségesek. Így történt, hogy a bíróságnak ítélkeznie kellett a pörös dologban és Kiss Ferenc táblai bíró mint a kerületi járásbíróság vezetője a diák apját elmarasztalta, mert Jókai művei nem jelentenek fényűzési cikket, hanem oly szükséges beszerzése minden egyetemi polgárnak, amelyért, ha a diák kiskorú, az apa teljes felelősséggel tartozik. 4 | 5 | ...Hogy ez a kis diák, aki nem tud fizetni, ez az apa, aki nem akar fizetni, s ez a bíró, aki az ötvenedik márciusban ítélkezett: hárman együtt micsoda szép ünnepet csináltak Jókai Mórnak, az nem tartozik ebbe a strófába. Az agg költő szeme talán megnedvesedik tőle, és eszébe jut, hogy a negyvennyolcadiki márciustól a mostani tavaszig nem dolgozott hiába. Talán megtelik a szíve örömmel, hogy abban a fiúban és abban az apában két generáció mérkőzött miatta, és az elfogulatlan bíró döntése szerint a költő munkája nem fényűzési cikk, hanem olyan bevásárlás, amely az ifjúság fenntartásához szükséges. Ez a költő külön diadala és külön ünnepe, s mind a kettőt becsületesen megérdemelte. Nekünk pedig jólesik, hogy a nagy március ötvenedik évfordulójára a véletlen ilyen örömet tartogatott annak az embernek, aki mind a két március - nem, mind az ötven március munkájából és dicsőségéből kivehette részét. 
6 | 7 | De programon kívül szép epizódja ez az ítélet a nemzeti ünneplésnek is. Ha az a kis diák véletlenül nem csinálja ezt az adósságot, szándékosan kellett volna egyet fogni a helyébe, csak hogy ezen az emlékezetes tavaszon ilyen ítélet hangozhassék el. Mert csonka ünnep volna, ha azt az időt, mikor a magyar nemzet költői jártak legelül, a nemzeti irodalom megbecsülésének különösebb jele nélkül ünnepelnők meg. Hogy sírok és emlékszobrok koszorúkkal fognak megtelni, hogy zarándok népnek élén dicsőítő beszédek fognak ott elhangozni, az nem minden, ami az ünneplő nemzettől kitelik. Ha idegen nemzetek volnának olyan szerencsés helyzetben, hogy az ünneplésre megérett korszak szereplői közül élőket is lássanak körükben, sohasem mulasztanák el, hogy azokat tegyék az ünneplés középpontjává. S ha ilyen korszakokból egy istentől megáldott poéta, egy egész külön irodalom forrása járna közöttük, az idegen nemzetek egy percig sem haboznának, hogy az egész ünnepben ezt az írót, ezt az eleven emléket tiszteljék meg. Mert abban a poétában látnák a múlt harcos apostolát, aki ott volt a kezdet kezdetén, amikor a szabadságfa levelezni kezdett, és a végzet különös kegyelméből itt van ma is, amikor e fának árnyékában az unokák találnak pihenőt. S ha nem is kell szükségképpen a spanyolok példáját követni, akik kisebb alkalomból és kisebb poétával szemben, a vérük tüzes fellobbanásában, költőkirállyá koronázták Zorillát, talán a magyar nemzet is módját találhatta volna, hogy ünnepében Jókai is legyen egy - programpont. 8 | 9 | S mi azért találjuk a tegnapi bírói döntést olyan szép epizódnak, mert teljessé teszi az ünnepet. És függetlenül az írótól, aki véletlenül a nemzeti szabadságnak is első katonái közül való, éppen a nagy március jubileumán mondja ki, hogy a nagy nemzeti írók munkája nem fényűzési cikk, hanem az önfenntartáshoz szükséges, mint a kenyér. 
10 | 11 | -------------------------------------------------------------------------------- /tests/data/hu.out.txt: -------------------------------------------------------------------------------- 1 | színház 2 | újság 3 | tartan 4 | megelőz 5 | ünneplés 6 | tudja 7 | bevárn 8 | március 9 | tizenötöd 10 | kor 11 | kezd 12 | ünneplés 13 | valószínűleg 14 | siet 15 | befejezés 16 | szép 17 | vonás 18 | igaz 19 | ünnep 20 | férh 21 | bel 22 | huszonnégy 23 | órás 24 | keret 25 | mese 26 | tanítja 27 | lakodalm 28 | hét 29 | tart 30 | had 31 | kezdődje 32 | nemzet 33 | ünnep 34 | tarts 35 | április 36 | vég 37 | elhelyezked 38 | keret 39 | minden 40 | választot 41 | dátum 42 | hivatalos 43 | hegy 44 | völgy 45 | tar 46 | széles 47 | jókedv 48 | összeolv 49 | ünnep 50 | hangulat 51 | március 52 | tizenötöd 53 | április 54 | tizenegyed 55 | ünnep 56 | folytonosság 57 | költő 58 | igazság 59 | megindul 60 | ünneplés 61 | kis 62 | epizó 63 | terel 64 | ezú 65 | figyel 66 | csekélység 67 | bevallot 68 | pon 69 | ünnep 70 | progr 71 | tetsz 72 | véletl 73 | szeb 74 | tu 75 | ünnep 76 | csináln 77 | közönséges 78 | halandó 79 | nemz 80 | volt 81 | hál 82 | tartoz 83 | ismeretl 84 | egyetem 85 | hallgató 86 | bolon 87 | diszkréció 88 | újság 89 | akarja 90 | nev 91 | kiírn 92 | annakide 93 | megvett 94 | hitel 95 | jó 96 | mór 97 | összes 98 | munka 99 | kikötöt 100 | részlet 101 | természetes 102 | fizett 103 | kiskorú 104 | dia 105 | könyvkereskedő 106 | ennélfogv 107 | szülő 108 | fogt 109 | pör 110 | ap 111 | akar 112 | fizetn 113 | mondv 114 | kiskorú 115 | fi 116 | vásárlás 117 | felelős 118 | fenntartás 119 | föltétlen 120 | szükséges 121 | tört 122 | bíróság 123 | ítélkezn 124 | pörös 125 | dolog 126 | kiss 127 | ferenc 128 | tábl 129 | bíró 130 | kerület 131 | járásbíróság 132 | vezető 133 | dia 134 | ap 135 | elmarasztalt 136 | jó 137 | műv 138 | jelent 139 | fényűzés 140 | cik 141 | oly 142 | szükséges 143 | beszerzés 144 | egyetem 145 | polgár 146 | amely 147 | dia 
148 | kiskorú 149 | ap 150 | felelősség 151 | tartoz 152 | kis 153 | dia 154 | tu 155 | fizetn 156 | ap 157 | akar 158 | fizetn 159 | bíró 160 | ötvened 161 | március 162 | ítélkezet 163 | hár 164 | micsod 165 | szép 166 | ünnep 167 | csinált 168 | jó 169 | mór 170 | tartoz 171 | stróf 172 | agg 173 | költő 174 | szem 175 | megnedvesed 176 | esz 177 | ju 178 | negyvennyolcadi 179 | március 180 | mostan 181 | tavasz 182 | dolgozot 183 | megtel 184 | szív 185 | örö 186 | fiú 187 | ap 188 | generáció 189 | mérkőzöt 190 | elfogulatl 191 | bíró 192 | döntés 193 | költő 194 | munká 195 | fényűzés 196 | bevásárlás 197 | ifjúság 198 | fenntartás 199 | szükséges 200 | költő 201 | diadal 202 | ünnep 203 | becsületes 204 | megérdemelt 205 | március 206 | ötvened 207 | évforduló 208 | véletl 209 | örö 210 | tartogatot 211 | ember 212 | március 213 | március 214 | munká 215 | dicsőség 216 | kivehett 217 | rész 218 | progr 219 | szép 220 | epizód 221 | ítél 222 | nemzet 223 | ünneplés 224 | kis 225 | dia 226 | véletlen 227 | csinál 228 | adósság 229 | szándékos 230 | fogn 231 | hely 232 | emlékezetes 233 | tavasz 234 | ítél 235 | hangozhasse 236 | cson 237 | ünnep 238 | idő 239 | magyar 240 | nemz 241 | költő 242 | járt 243 | legel 244 | nemzet 245 | irodal 246 | megbecsülés 247 | különösebb 248 | jel 249 | ünnepelnő 250 | sír 251 | emlékszobr 252 | koszorú 253 | fog 254 | megteln 255 | zaránd 256 | nép 257 | él 258 | dicsőítő 259 | beszéd 260 | fog 261 | elhangozn 262 | ünneplő 263 | nemzet 264 | kitel 265 | ideg 266 | nemzet 267 | szerencsés 268 | helyzet 269 | ünneplés 270 | megéret 271 | korsz 272 | szereplő 273 | élő 274 | láss 275 | kör 276 | sohas 277 | mulasztana 278 | tegye 279 | ünneplés 280 | középpont 281 | korszak 282 | isten 283 | megáldot 284 | poét 285 | irodal 286 | forrás 287 | járn 288 | között 289 | ideg 290 | nemzet 291 | perc 292 | habozn 293 | ünnep 294 | író 295 | elev 296 | emle 297 | tisztelje 298 | poét 299 | látna 300 | múl 301 | harcos 302 | apostol 
303 | kez 304 | kezdet 305 | szabadságf 306 | levelezn 307 | kezdet 308 | végz 309 | különös 310 | kegyelm 311 | fá 312 | árnye 313 | unoka 314 | talál 315 | pihenő 316 | szükség 317 | spanyol 318 | példá 319 | követn 320 | kisebb 321 | alkal 322 | kisebb 323 | poét 324 | vér 325 | tüzes 326 | fellobbanás 327 | költőkirály 328 | koronázta 329 | zorill 330 | magyar 331 | nemz 332 | mód 333 | találhatt 334 | ünnep 335 | jó 336 | programpon 337 | talál 338 | tegnap 339 | bíró 340 | döntés 341 | szép 342 | epizó 343 | teljes 344 | tesz 345 | ünnep 346 | független 347 | író 348 | véletlen 349 | nemzet 350 | szabadság 351 | katona 352 | március 353 | jubileum 354 | mond 355 | nemzet 356 | író 357 | munká 358 | fényűzés 359 | önfenntartás 360 | szükséges 361 | kenyér 362 | -------------------------------------------------------------------------------- /tests/data/it.in.txt: -------------------------------------------------------------------------------- 1 | Al cadere d'una bella giornata d'aprile dell'anno 1503 la campana 2 | di San Domenico in Barletta sonava gli ultimi tocchi dell'avemaria. 3 | Sulla piazza vicina in riva al mare, luogo di ritrovo degli abitanti 4 | tranquilli che, nelle terricciuole dei climi meridionali specialmente, 5 | sogliono sulla sera essere insieme a barattar parole al sereno per 6 | riposarsi dalle faccende del giorno, stavano col fine medesimo 7 | dispersi in varj gruppi molti soldati spagnuoli ed italiani, alcuni 8 | passeggiando, altri fermi, o seduti, od appoggiati alle barche tirate 9 | a secco, delle quali era ingombra la spiaggia, e, com'è costume delle 10 | soldatesche d'ogni età e d'ogni nazione, il loro contegno era tale 11 | che pareva dire: il mondo è nostro. Di fatto, lasciato loro il campo 12 | migliore, si tenevano i terrazzani in disparte, dando così a questa 13 | loro burbanza tacita approvazione. 
Chi per figurarsi questo quadro si 14 | volesse rappresentare una simile radunata de' nostri soldati moderni 15 | nella loro misera _uniforme_, sarebbe lontano assai dall'averne una 16 | giusta immagine. L'esercito di Consalvo, le fanterie specialmente, 17 | quantunque le meglio in arnese, e le migliori di tutta cristianità, 18 | non conoscevano però, più di qualunque altra milizia del secolo XVI, 19 | la stretta disciplina moderna, che è giunta a render simili un soldato 20 | all'altro dalle scarpe al cappello. Qui invece, ogni uomo che facesse 21 | il mestier dell'arme a piede o a cavallo, poteva vestirsi, armarsi ed 22 | adornarsi come più gli piacesse; onde nasceva fra questa turba una 23 | mirabile varietà e vaghezza nelle fogge, ne' colori e nel portamento, 24 | dal quale si poteva facilmente conoscere a qual nazione appartenesse 25 | ogni individuo. Gli Spagnuoli, per lo più serii, immobili, atteggiati 26 | da bravacci, ed avvolti (o com'essi dicono _embozados_) nella _capa_ 27 | nazionale, dalla quale si vedeva uscir per di sotto la lunga e sottil 28 | lama di Toledo; gl'Italiani loquaci e pronti al gestire, in sajo od in 29 | farsetto colla daga pistolese appesa dietro le reni. 30 | 31 | Al sonare della campana era cessato il susurro, e scomparendo la 32 | maggior parte de' cappelli, le teste eran rimaste scoperte, perchè in 33 | quel tempo anche i soldati credevano in Dio, e talvolta lo pregavano. 34 | Dopo piccola pausa tornarono a luogo i cappelli, ricominciò il 35 | bisbiglio; e benchè quella turba presa insieme avesse al primo aspetto 36 | un non so che di gajo e di vivace, si poteva tuttavia facilmente 37 | avvedersi, girando fra i diversi crocchi, esservi un motivo comune di 38 | tristezza e di scoramento, al quale erano volte le menti e le parole 39 | di tutti. Infatti il motivo era vero e possente. 
La fame cominciava 40 | a farsi sentire fra i soldati ed anche fra gli abitanti di Barletta, 41 | ove il gran Capitano, aspettando i tardi ajuti di Spagna, teneva 42 | chiuso l'esercito di troppo inferiore a quello dei Francesi perchè 43 | s'arrischiasse commetter la somma delle cose alla fortuna d'una 44 | giornata. 45 | 46 | Tre lati della piazza erano chiusi da certe povere case di marinaj e 47 | pescatori, dalla chiesa e dall'osteria. Il quarto s'apriva alla marina, 48 | ingombro, com'è costume di tali luoghi, di barche, reti e di altri 49 | attrezzi pescherecci; ed all'ultima linea dell'orizzonte si vedeva 50 | sorgere dal seno delle acque la bruna forma del monte Gargano, sulla 51 | cui vetta andava morendo l'ultimo raggio del sole cadente. 52 | 53 | Nello spazio frapposto, veleggiava chetamente un legno sottile; e si 54 | volgeva tratto tratto per cercare il vento che soffiava incostante in 55 | quel golfo, increspando qua e là a lunghe strisce la superficie del 56 | mare. La distanza tuttavia della nave e la dubbia luce del crepuscolo 57 | non lasciavano distinguere qual fosse la sua bandiera. 58 | 59 | Uno Spagnuolo, che insieme con molti soldati era presso alla riva, 60 | la guardava fisso, aguzzando le ciglia, ed attorcigliandosi certi 61 | grandissimi baffi più bigi che neri. 62 | 63 | --Che cosa guardi che sembri una statua, e non dai retta a chi discorre 64 | con te?-- 65 | 66 | Quest'apostrofe d'un soldato napoletano, che non avendo ottenuta 67 | risposta ad una prima domanda, se l'aveva per male, non mosse nè punto 68 | nè poco l'imperturbabile Spagnuolo. Alla fine con un sospiro che pareva 69 | uscire più da un mantice che dal petto d'un uomo, disse: 70 | 71 | --_Voto a Dios que nuestra segnora de Gaeta_, che manda buon vento e 72 | buon cammino a tanti che la pregano in mare, potrebbe mandar ora questa 73 | fusta a noi che la preghiamo in terra, e non abbiamo da metter sotto i 74 | denti altro che il calcio dell'archibuso! 
Chi sa che non porti grano e 75 | provvisioni a quei _descomulgados_ di Francesi che ci tengono stretti 76 | in questa gabbia per farci morir di fame...... _Y mala Pasqua me de 77 | Dios y sea la primera que viniere, si a su gracia el segnor Gonzalo 78 | Hernandez_[1] quando ha ben pranzato e meglio cenato gl'importa di noi 79 | più che del _cuero de sus zapatos_[2]. 80 | 81 | --Che cosa può far Consalvo?--rispose con istizza il Napoletano, 82 | contento di contraddire:--dovrà diventar pane per entrar in corpo ad 83 | una bestia come te? Quando ne avrà, ne darà; e le navi che il malanno 84 | loro ha portate nelle secche di Manfredonia, chi l'ha divorate? 85 | Consalvo, o voi altri?-- 86 | 87 | Lo Spagnuolo un po' mutato in viso mostrava di voler rispondere, ma 88 | fu interrotto da un altro del crocchio, il quale, battendogli sulla 89 | spalla, scuotendo la testa, ed abbassando la voce, come per dar maggior 90 | peso alle parole, -------------------------------------------------------------------------------- /tests/data/it.out.txt: -------------------------------------------------------------------------------- 1 | cad 2 | d'un 3 | bell 4 | giorn 5 | d'april 6 | dell'ann 7 | campan 8 | san 9 | domen 10 | barlett 11 | son 12 | ultim 13 | tocc 14 | dell'avemar 15 | piazz 16 | vicin 17 | riv 18 | mar 19 | luog 20 | ritrov 21 | abit 22 | tranquill 23 | terricciuol 24 | clim 25 | meridional 26 | special 27 | sogl 28 | ser 29 | esser 30 | insiem 31 | baratt 32 | parol 33 | seren 34 | ripos 35 | facc 36 | giorn 37 | fin 38 | medesim 39 | disp 40 | varj 41 | grupp 42 | molt 43 | sold 44 | spagnuol 45 | italian 46 | alcun 47 | passegg 48 | altri 49 | ferm 50 | sed 51 | od 52 | appogg 53 | barc 54 | tir 55 | secc 56 | qual 57 | ingombr 58 | spiagg 59 | com' 60 | costum 61 | soldatesc 62 | d'ogn 63 | età 64 | d'ogn 65 | nazion 66 | contegn 67 | tal 68 | par 69 | dir 70 | mond 71 | fatt 72 | lasc 73 | camp 74 | miglior 75 | ten 76 | terrazzan 77 | dispart 78 | dand 79 | cos 
80 | burbanz 81 | tac 82 | approv 83 | figur 84 | quadr 85 | voless 86 | rappresent 87 | simil 88 | radun 89 | de 90 | sold 91 | modern 92 | miser 93 | uniform 94 | lont 95 | assa 96 | dall'av 97 | giust 98 | immagin 99 | l'eserc 100 | consalv 101 | fanter 102 | special 103 | quantunqu 104 | megl 105 | arnes 106 | miglior 107 | tutt 108 | cristian 109 | conosc 110 | per 111 | qualunqu 112 | altra 113 | miliz 114 | secol 115 | xvi 116 | strett 117 | disciplin 118 | modern 119 | giunt 120 | render 121 | simil 122 | sold 123 | all'altr 124 | scarp 125 | cappell 126 | qui 127 | invec 128 | ogni 129 | uom 130 | mestier 131 | dell'arm 132 | pied 133 | cavall 134 | pot 135 | vest 136 | armars 137 | adorn 138 | piacess 139 | onde 140 | nasc 141 | fra 142 | turb 143 | mirabil 144 | variet 145 | vaghezz 146 | fogg 147 | color 148 | port 149 | pot 150 | facil 151 | conosc 152 | qual 153 | nazion 154 | apparteness 155 | ogni 156 | individu 157 | spagnuol 158 | ser 159 | immobil 160 | attegg 161 | bravacc 162 | avvolt 163 | com'ess 164 | dic 165 | embozados 166 | cap 167 | nazional 168 | ved 169 | uscir 170 | sott 171 | lung 172 | sottil 173 | lam 174 | toled 175 | gl'italian 176 | loquac 177 | pront 178 | gest 179 | saj 180 | od 181 | farsett 182 | coll 183 | dag 184 | pistoles 185 | appes 186 | dietr 187 | ren 188 | son 189 | campan 190 | cess 191 | susurr 192 | scompar 193 | maggior 194 | part 195 | de 196 | cappell 197 | test 198 | eran 199 | rimast 200 | scopert 201 | perc 202 | quel 203 | temp 204 | sold 205 | cred 206 | dio 207 | talvolt 208 | preg 209 | dop 210 | piccol 211 | paus 212 | torn 213 | luog 214 | cappell 215 | ricominc 216 | bisbigl 217 | benc 218 | turb 219 | pres 220 | insiem 221 | prim 222 | aspett 223 | so 224 | gaj 225 | vivac 226 | pot 227 | tuttav 228 | facil 229 | avved 230 | gir 231 | fra 232 | div 233 | crocc 234 | esserv 235 | mot 236 | comun 237 | tristezz 238 | scor 239 | volt 240 | ment 241 | parol 242 | infatt 243 | mot 244 | ver 245 | possent 
246 | fam 247 | cominc 248 | fars 249 | sent 250 | fra 251 | sold 252 | fra 253 | abit 254 | barlett 255 | ove 256 | gran 257 | capit 258 | aspett 259 | tard 260 | ajut 261 | spagn 262 | ten 263 | chius 264 | l'eserc 265 | tropp 266 | inferior 267 | frances 268 | perc 269 | s'arrisc 270 | commetter 271 | somm 272 | cos 273 | fortun 274 | d'un 275 | giorn 276 | tre 277 | lat 278 | piazz 279 | chius 280 | cert 281 | pov 282 | cas 283 | marinaj 284 | pescator 285 | chies 286 | dall'oster 287 | quart 288 | s'apr 289 | marin 290 | ingombr 291 | com' 292 | costum 293 | tal 294 | luog 295 | barc 296 | ret 297 | altri 298 | attrezz 299 | pescherecc 300 | all'ultim 301 | line 302 | dell'orizzont 303 | ved 304 | sorg 305 | sen 306 | acque 307 | brun 308 | form 309 | mont 310 | garg 311 | vett 312 | andav 313 | mor 314 | l'ultim 315 | ragg 316 | sol 317 | cadent 318 | spaz 319 | frappost 320 | velegg 321 | chet 322 | legn 323 | sottil 324 | volg 325 | tratt 326 | tratt 327 | cerc 328 | vent 329 | soff 330 | incost 331 | quel 332 | golf 333 | incresp 334 | qua 335 | là 336 | lung 337 | strisc 338 | superfic 339 | mar 340 | distanz 341 | tuttav 342 | nav 343 | dubb 344 | luc 345 | crepuscol 346 | lasc 347 | distingu 348 | qual 349 | bandier 350 | spagnuol 351 | insiem 352 | molt 353 | sold 354 | press 355 | riv 356 | guard 357 | fiss 358 | aguzz 359 | cigl 360 | attorcigl 361 | cert 362 | grandissim 363 | baff 364 | big 365 | ner 366 | cos 367 | guard 368 | sembr 369 | statu 370 | rett 371 | discorr 372 | te 373 | quest'apostrof 374 | d'un 375 | sold 376 | napolet 377 | otten 378 | rispost 379 | prim 380 | domand 381 | l'av 382 | mal 383 | moss 384 | nè 385 | punt 386 | nè 387 | poc 388 | l'imperturb 389 | spagnuol 390 | fin 391 | sospir 392 | par 393 | uscir 394 | mantic 395 | pett 396 | d'un 397 | uom 398 | diss 399 | vot 400 | dios 401 | que 402 | nuestr 403 | segnor 404 | de 405 | gaet 406 | mand 407 | buon 408 | vent 409 | buon 410 | cammin 411 | tant 412 | preg 413 | mar 
414 | potrebb 415 | mand 416 | ora 417 | fust 418 | preg 419 | terr 420 | metter 421 | sott 422 | dent 423 | altro 424 | calc 425 | dell'archibus 426 | sa 427 | port 428 | gran 429 | provvision 430 | que 431 | descomulgados 432 | frances 433 | teng 434 | strett 435 | gabb 436 | farc 437 | mor 438 | fam 439 | y 440 | mal 441 | pasqu 442 | me 443 | de 444 | dios 445 | y 446 | sea 447 | primer 448 | que 449 | vin 450 | grac 451 | el 452 | segnor 453 | gonzal 454 | hernandez 455 | quand 456 | ben 457 | pranz 458 | megl 459 | cen 460 | gl'import 461 | cuer 462 | de 463 | sus 464 | zapatos 465 | cos 466 | può 467 | far 468 | consalv 469 | rispos 470 | istizz 471 | napolet 472 | content 473 | contradd 474 | dovr 475 | divent 476 | pan 477 | entrar 478 | corp 479 | best 480 | te 481 | quand 482 | dar 483 | nav 484 | malann 485 | port 486 | secc 487 | manfredon 488 | l'ha 489 | divor 490 | consalv 491 | altri 492 | spagnuol 493 | po 494 | mut 495 | vis 496 | mostr 497 | voler 498 | rispond 499 | interrott 500 | altro 501 | crocc 502 | batt 503 | spall 504 | scuot 505 | test 506 | abbass 507 | voc 508 | dar 509 | maggior 510 | pes 511 | parol 512 | -------------------------------------------------------------------------------- /tests/data/ja.in.txt: -------------------------------------------------------------------------------- 1 | バイト仲間で、ものすごく気の合うメンバーがいた。 2 | なぜ気が合うかというと、共通の上司がヤバいやつだったからだった。 3 | どうヤバいかここで説明するのは割愛する。主題からずれるので。 4 | そのヤバい上司の愚痴を言っている間は、お互い仲間意識を持っていたように思う。 5 | 月日は流れ、私はそこを辞め、しばらくしてその気が合うメンバーも辞めた。 6 | 何回かその後、そのメンバーと合っているうちに、意見の衝突などから、険悪な雰囲気になることが増えた。 7 | そして私も面倒なので、もう会わないようになった。 8 | それからもう何年も経つ。 9 | 10 | 「共通の敵」を作ると、結束が固まるという話はよく聞くが、それは非常に壊れやすいものと思う。 11 | 敵が去ったあとは、内紛が起こる。 12 | 人類に文明が発生してから、地球上のあらゆる場所で繰り返してきたことかもしれない。 13 | 14 | 漢の劉邦が中国を統一したとき、つまり宿敵項羽を倒したあと、敵がいなくなった。 15 | その時、今まで一緒に戦ってきた功臣の何人かは、劉邦によって降格されたり、またそれを恨みに思った功臣が謀反を起こし、一族ごと処罰を受けたりもした。 16 | ただ、軍師の張良は天下統一の後、「政治には興味ありません。オカルトの研究だけはさせてください」と自分は安全であるというアピールをしたからなのか助かったようである。 17 | 
軍師として成果を上げるくらいだから、やはり人間の性質を理解していたようだ。 18 | 19 | こんな感じで、共通の敵がいるという条件下で育まれた友情のようなものは、いずれ崩壊する儚いものであると考えておくのがいい。 20 | 喫煙所で上司の悪口を言ってる暇があったら、自分のスキルアップに時間を使う方が有益と言える。 21 | -------------------------------------------------------------------------------- /tests/data/ja.out.txt: -------------------------------------------------------------------------------- 1 | バイト 2 | 仲間 3 | ものすごく 4 | 気 5 | 合う 6 | メンバ 7 | い 8 | なぜ 9 | 気 10 | 合う 11 | いう 12 | 共通 13 | 上司 14 | い 15 | やつ 16 | どう 17 | いか 18 | ここ 19 | 説明 20 | する 21 | の 22 | 割愛 23 | する 24 | 主題 25 | ずれる 26 | その 27 | い 28 | 上司 29 | 愚痴 30 | 言っ 31 | いる 32 | 間 33 | お互い 34 | 仲間 35 | 意識 36 | 持っ 37 | い 38 | よう 39 | 思う 40 | 月日 41 | 流れ 42 | 私 43 | そこ 44 | 辞め 45 | しばらく 46 | し 47 | その 48 | 気 49 | 合う 50 | メンバ 51 | 辞め 52 | 何 53 | 回 54 | その後 55 | その 56 | メンバ 57 | 合っ 58 | いる 59 | うち 60 | 意見 61 | 衝突 62 | 険悪 63 | 雰囲気 64 | なる 65 | こと 66 | 増え 67 | そして 68 | 私 69 | 面倒 70 | もう 71 | 会わ 72 | よう 73 | なっ 74 | それから 75 | もう 76 | 何 77 | 年 78 | 経つ 79 | 共通 80 | 敵 81 | 作る 82 | 結束 83 | 固まる 84 | 話 85 | よく 86 | 聞く 87 | それ 88 | 非常 89 | 壊れ 90 | やすい 91 | もの 92 | 思う 93 | 敵 94 | 去っ 95 | あと 96 | 内紛 97 | 起こる 98 | 人類 99 | 文明 100 | 発生 101 | し 102 | 地球 103 | 上 104 | あらゆる 105 | 場所 106 | 繰り返し 107 | き 108 | こと 109 | しれ 110 | 漢 111 | 劉邦 112 | 中国 113 | 統一 114 | し 115 | とき 116 | つまり 117 | 宿敵 118 | 項羽 119 | 倒し 120 | あと 121 | 敵 122 | い 123 | なっ 124 | その 125 | 時 126 | 今 127 | 一緒 128 | 戦っ 129 | き 130 | 功臣 131 | 何 132 | 人 133 | 劉邦 134 | 降格 135 | さ 136 | れ 137 | また 138 | それ 139 | 恨み 140 | 思っ 141 | 功臣 142 | 謀反 143 | 起こし 144 | 一族 145 | ごと 146 | 処罰 147 | 受け 148 | し 149 | ただ 150 | 軍師 151 | 張 152 | 良 153 | 天下 154 | 統一 155 | 後 156 | 政治 157 | 興味 158 | あり 159 | 研究 160 | さ 161 | せ 162 | ください 163 | 自分 164 | 安全 165 | アピール 166 | し 167 | の 168 | 助かっ 169 | よう 170 | 軍師 171 | 成果 172 | 上げる 173 | やはり 174 | 人間 175 | 性質 176 | 理解 177 | し 178 | い 179 | よう 180 | こんな 181 | 感じ 182 | 共通 183 | 敵 184 | いる 185 | 条件下 186 | 育ま 187 | れ 188 | 友情 189 | よう 190 | もの 191 | いずれ 192 | 崩壊 193 | する 194 | 儚い 195 | もの 196 | 考え 
197 | おく 198 | の 199 | いい 200 | 喫煙 201 | 所 202 | 上司 203 | 悪口 204 | 言っ 205 | てる 206 | 暇 207 | あっ 208 | 自分 209 | スキル 210 | アップ 211 | 時間 212 | 使う 213 | 方 214 | 有益 215 | 言える 216 | -------------------------------------------------------------------------------- /tests/data/ko.in.txt: -------------------------------------------------------------------------------- 1 | 국회는 국가의 예산안을 심의·확정한다. 헌법재판소의 조직과 운영 기타 2 | 필요한 사항은 법률로 정한다. 국회에서 의결된 법률안은 정부에 이송되어 3 | 15일 이내에 대통령이 공포한다. 4 | 5 | 국가는 지역간의 균형있는 발전을 위하여 지역경제를 육성할 의무를 진다. 6 | 국민경제의 발전을 위한 중요정책의 수립에 관하여 대통령의 자문에 응하기 7 | 위하여 국민경제자문회의를 둘 수 있다. 8 | 9 | 국가는 전통문화의 계승·발전과 민족문화의 창달에 노력하여야 한다. 모든 10 | 국민은 인간으로서의 존엄과 가치를 가지며, 행복을 추구할 권리를 가진다. 11 | 국가는 개인이 가지는 불가침의 기본적 인권을 확인하고 이를 보장할 의무를 12 | 진다. 13 | 14 | 중앙선거관리위원회는 대통령이 임명하는 3인, 국회에서 선출하는 3인과 15 | 대법원장이 지명하는 3인의 위원으로 구성한다. 위원장은 위원중에서 16 | 호선한다. 17 | 18 | 국가는 농수산물의 수급균형과 유통구조의 개선에 노력하여 가격안정을 19 | 도모함으로써 농·어민의 이익을 보호한다. 국가원로자문회의의 의장은 20 | 직전대통령이 된다. 다만, 직전대통령이 없을 때에는 대통령이 지명한다. -------------------------------------------------------------------------------- /tests/data/ko.out.txt: -------------------------------------------------------------------------------- 1 | 국회는 2 | 국가의 3 | 예산안을 4 | 심의·확정한다 5 | 헌법재판소의 6 | 조직과 7 | 운영 8 | 필요한 9 | 사항은 10 | 법률로 11 | 정한다 12 | 국회에서 13 | 의결된 14 | 법률안은 15 | 정부에 16 | 이송되어 17 | 이내에 18 | 대통령이 19 | 공포한다 20 | 국가는 21 | 지역간의 22 | 균형있는 23 | 발전을 24 | 지역경제를 25 | 육성할 26 | 의무를 27 | 진다 28 | 국민경제의 29 | 발전을 30 | 위한 31 | 중요정책의 32 | 수립에 33 | 대통령의 34 | 자문에 35 | 응하기 36 | 국민경제자문회의를 37 | 수 38 | 국가는 39 | 전통문화의 40 | 계승·발전과 41 | 민족문화의 42 | 창달에 43 | 노력하여야 44 | 한다 45 | 모든 46 | 국민은 47 | 인간으로서의 48 | 존엄과 49 | 가치를 50 | 가지며 51 | 행복을 52 | 추구할 53 | 권리를 54 | 가진다 55 | 국가는 56 | 개인이 57 | 가지는 58 | 불가침의 59 | 기본적 60 | 인권을 61 | 확인하고 62 | 이를 63 | 보장할 64 | 의무를 65 | 진다 66 | 중앙선거관리위원회는 67 | 대통령이 68 | 임명하는 69 | 인 70 | 국회에서 71 | 선출하는 72 | 인과 73 | 대법원장이 74 | 지명하는 75 | 인의 76 | 위원으로 77 | 구성한다 78 | 위원장은 79 | 위원중에서 80 | 호선한다 81 | 국가는 82 | 농수산물의 83 | 수급균형과 84 | 유통구조의 85 | 개선에 86 | 노력하여 87 | 가격안정을 88 | 
도모함으로써 89 | 농·어민의 90 | 이익을 91 | 보호한다 92 | 국가원로자문회의의 93 | 의장은 94 | 직전대통령이 95 | 된다 96 | 직전대통령이 97 | 없을 98 | 때에는 99 | 대통령이 100 | 지명한다 -------------------------------------------------------------------------------- /tests/data/no.in.txt: -------------------------------------------------------------------------------- 1 | Hver dag blir vi litt klokere på pandemien som har snudd samfunnet 2 | vårt på hodet. Hver dag er vi ett skritt nærmere en vaksine og ett 3 | skritt nærmere det som skal være vår nye hverdag, etter krisen. Jeg 4 | tror vi alle kjenner på at det tærer litt på nå – syv måneder nærmest 5 | i unntakstilstand, med mer usikkerhet og mindre frihet enn vi noen 6 | gang trodde vi skulle oppleve. 7 | 8 | Vi fikk raskt kontroll på smitten. Og vi kunne bruke våre økonomiske 9 | muskler til å dempe tilbakeslaget. Gjeninnhentingen kom raskere enn vi 10 | så for oss. Men vi blir stadig minnet på hvor skjør situasjonen er. 11 | 12 | Å hindre nye smitteutbrudd er den viktigste jobben fremover også, 13 | antakelig langt inn i neste år. Den jobben kan ingen gjøre alene. Vi 14 | må gjøre den sammen. Hver og en av oss må fortsette å holde avstand, 15 | vi må vaske hendene og være hjemme hvis vi er syke, for å beskytte de 16 | mest sårbare blant oss, for å ta vare på arbeidsplassene, for at alt 17 | skal bli bra igjen – selv om vi kjenner at det røyner på. 18 | 19 | De siste månedene har jeg møtt folk og bedrifter fra hele Norge for å 20 | høre deres historier og deres tanker om fremtiden. Det er ett ord som 21 | går igjen: usikkerhet. Men det er ofte med en betryggende undertone av 22 | optimisme og innsatsvilje – betryggende fordi vi må ha med privat 23 | næringsliv på laget hvis vi skal ri denne stormen av uten å miste 24 | kurs. Det beste vi kan gjøre, er å legge til rette for at det kan skje 25 | – og vise vei. 26 | 27 | Med budsjettet for 2021 forsterker regjeringen det langsiktige 28 | arbeidet for å styrke bærekraften og konkurransekraften i norsk 29 | økonomi. 
Den jobben startet vi i 2013. For å trygge jobbene og fremme 30 | omstilling har vi investert mer i kunnskap, i forskning, i 31 | infrastruktur og i vekstfremmende skattelettelser. 32 | 33 | Prioriteringene i budsjettet skal bringe Norge mot seks mål som alle 34 | handler om å komme gjennom denne krisen, uten å miste de langsiktige 35 | perspektivene av syne. 36 | 37 | Det første målet handler om å få folk tilbake i jobb. I mars steg 38 | antallet permitterte dramatisk. På det meste var mer enn hver tiende 39 | person i arbeidsstyrken registrert som helt ledig hos Nav. Bildet har 40 | bedret seg siden den gang. Men fortsatt er mer enn 100 000 mennesker 41 | helt arbeidsledige i Norge. Det er altfor mange. 42 | 43 | Det viktigste for å bevare et samfunn med små forskjeller og gode 44 | velferdsordninger er at flest mulig er i jobb. Derfor må vi unngå at 45 | ledigheten nå biter seg fast på et høyt nivå. Vi må være spesielt 46 | oppmerksom på ungdom som er på vei ut i arbeidslivet. Og vi må unngå 47 | at de som i utgangspunktet hadde svak tilknytning til arbeidslivet, 48 | støtes varig ut. 49 | 50 | Å legge til rette for nye arbeidsplasser i privat sektor har vært en 51 | viktig del av den økonomiske politikken regjeringen har ført i syv 52 | år. Før krisen var sysselsettingsandelen på vei opp. Vi må tilbake til 53 | det sporet. 54 | 55 | Siden mars har vi iverksatt kraftfulle tiltak for å gi økonomisk 56 | trygghet til alle de som plutselig fikk inntektsgrunnlaget revet bort, 57 | for å sikre at kommunene og helsetjenesten har ressurser nok til å 58 | fortsette å ta vare på oss, og for å hjelpe levedyktige bedrifter 59 | gjennom denne krevende perioden. Tiltakene har virket. Aktiviteten i 60 | norsk økonomi har tatt seg opp igjen etter den dramatiske nedgangen i 61 | vår. Men gjeninnhentingen er skjør, krisen er ikke over. 
Vi må ta 62 | høyde for at mange bedrifter – og særlig i næringer som reiseliv og 63 | kultur – fortsatt vil være rammet av smitteverntiltak som begrenser 64 | aktiviteten, og at ordrebøkene til eksportbedriftene antakelig vil 65 | tynnes ut på grunn av svak etterspørsel fra landene vi handler 66 | med. Hos handelspartnerne våre har den økonomiske nedgangen vært enda 67 | dypere, og gjeninnhentingen har vært svakere. 68 | 69 | I budsjettet for 2021 legger vi opp til at oljepengebruken skal være 70 | på 313 mrd. kr. Det tilsvarer 3 pst. av fondsverdien og betyr at vi 71 | allerede neste år kan være tilbake på den langsiktige rettesnoren for 72 | bærekraftig bruk av oljeinntektene. Men det er samtidig over 60 73 | mrd. kr mer enn vi brukte i 2019. 74 | 75 | Budsjettet for neste år vil virke ekspansivt i den økonomiske 76 | politikken. Alt fra bygging av vei og bane, investeringer i forsvaret, 77 | overføringer til kommuner, fylker og helsetjenester, nye 78 | byggeprosjekter og ulike støtteordninger skaper aktivitet og arbeid 79 | over hele landet. I tillegg vil vi forsterke satsingen på 80 | arbeidsmarkedstiltak for dem som står uten jobb. 81 | 82 | Koronapandemien kommer til å endre vanene våre. Vi har blitt mer 83 | digitale. Mye tyder på at vi kommer til å reise mindre og være mer på 84 | hjemmekontor. Varige endringer er endringer som næringslivet må 85 | tilpasse seg. Vi må hjelpe næringslivet gjennom krisen – uten å svekke 86 | innovasjonskraften og uten å ødelegge omstillingsevnen. Det er en 87 | krevende balansegang. Bedrifter går konkurs – i gode tider og i 88 | vanskelige tider. Vi politikere må være kloke nok til å erkjenne at vi 89 | ikke vet hvem som er morgendagens vinnere. Markedet må avgjøre hvilke 90 | bedrifter som skal være med videre. 91 | 92 | Norge har naturressurser, teknologi og kompetanse som gir oss 93 | fantastiske muligheter. Det andre målet i regjeringens strategi er at 94 | vi må sikre flere ben å stå på. Vi må fortsette å fornye Norge. 
Vi 95 | trenger flere jobber, i flere bransjer, over hele landet. Og veksten 96 | må komme i privat næringsliv. 97 | 98 | Gjennom syv år i regjering har vi gjort mye for å bedre 99 | rammebetingelsene for bedriftene. Konkurranseevnen er kraftig bedret, 100 | bl.a. fordi bedriftene tar i bruk ny teknologi, og fordi vi har senket 101 | skattene ned mot nivået i land vi konkurrerer med. Vi har prioritert 102 | lavere selskapsskatt, lavere marginalskatt på arbeid og lavere 103 | formuesskatt fordi det gir arbeidsplasser og arbeidslyst. 104 | 105 | Nå kutter vi skattene enda mer, med brede lettelser i inntektsskatten, 106 | som lavere trinnskatt, høyere minstefradrag både på lønn og på pensjon 107 | og videre nedtrapping av skatten på arbeidende kapital. Det kan gi 108 | norske eiere muskler til å investere i norske arbeidsplasser. Og vi 109 | øker skattefordelen for dem som kjøper aksjer i bedriften de er ansatt 110 | i. Vi ønsker at flere medarbeidere skal bli medeiere i bedriftene. 111 | 112 | Vi foreslår også å endre vannkraftbeskatningen og tilfører næringen 113 | betydelig likviditet. Det vil legge til rette for investeringer og 114 | nødvendige oppgraderinger i en næring som skaper aktivitet i 115 | lokalsamfunn over hele landet. 116 | 117 | I tillegg vil vi øke frikortgrensen til 60 000 kr. Det kommer ungdom 118 | til gode, som nå kan beholde litt mer av de pengene de tjener. 119 | 120 | Fra 1. juli neste år vil vi også innføre kildeskatt på enkelte 121 | betalinger til nærstående selskap i lavskatteland. Formålet er å 122 | motvirke overskuddsflytting, unngå at inntekter som skapes i Norge, 123 | blir beskattet i et annet land med lavere skatt. 124 | 125 | Skatt og avgift er den klart største inntektskilden i 126 | statsbudsjettet. Å beskytte det norske skattegrunnlaget er derfor helt 127 | nødvendig for at vi skal kunne holde skattene lave for våre bedrifter 128 | og arbeidsplasser. 
129 | 130 | Med dette budsjettet vil en vanlig familie betale 14 000 kr mindre i 131 | skatt neste år enn om det rød-grønne skattenivået fra 2013 hadde blitt 132 | videreført – 14 000 kr for en vanlig familie. 133 | 134 | Til sammen er skatte- og avgiftsnivået redusert med nesten 30 mrd. kr 135 | i vår regjeringsperiode. 136 | 137 | Gjennom sterk satsing på forskning og utvikling skaper regjeringen 138 | grobunn for omstilling og vekst. I 2021 øker vi FoU-bevilgningene med 139 | mer enn 2 mrd. kr, til over 45 mrd. kr. Pengene skal bl.a. gå til å 140 | trappe opp langtidsplanen for forskning og høyere utdanning og til 141 | deltagelse i Horisont Europa og EUs romprogram. Norske bedrifter og 142 | forskere hevder seg bra i konkurransen om penger fra 143 | EU-programmene. Deltagelse der bidrar til nye jobber her og til at vi 144 | løser mange store samfunnsoppgaver. 145 | -------------------------------------------------------------------------------- /tests/data/pt.in.txt: -------------------------------------------------------------------------------- 1 | Eu poucas vezes canto os casos melancolicos, 2 | Os lethargos gentis, os extasis bucolicos 3 | E as desditas crueis do proprio coração; 4 | Mas não celebro o vicio e odeio o desalinho 5 | Da muza sem pudor que mostra no caminho 6 | A liga á multidão. 7 | 8 | A sagrada poesia, a peregrina eterna, 9 | Ouvi dizer que soffre uma affecção moderna, 10 | Uns fastios sem nome, uns tedios ideaes; 11 | Que ensaia, presumida, o gesto romanesco 12 | E, vaidosa de si, no collo eburneo e fresco, 13 | Põe crémes triviaes! 14 | 15 | Oh, pensam mal de ti, da tua castidade! 16 | Deslumbra-os o fulgor dos astros da cidade, 17 | Os falsos ouropeis das cortezãs gentis, 18 | E julgam já tocar-te as roçagantes vestes 19 | Ó deusa virginal das coleras celestes, 20 | Das graças juvenis! 
21 | 22 | Retine a cançoneta alegre das bachantes, 23 | Saudadas nos wagons, nos caes, nos restaurantes, 24 | Visões d'olhar travesso e provocantes pés, 25 | E julgam já escutar a voz do paraiso, 26 | Amando o que ha de falso e torpe no sorriso 27 | Das musas dos cafés! 28 | 29 | Oh, tu não és, de certo, a virgem quebradiça 30 | Estiolada e gentil, que vem depois da missa 31 | Mostrar pela cidade o seu fino desdem, 32 | Nem a fada que sente um vaporoso tedio 33 | Emquanto vae sonhando um noivo rico e nédio 34 | Que a possa pagar bem! 35 | 36 | Nem posso mesmo crêr, archanjo, que tu sejas 37 | A menina gentil que ás portas das egrejas 38 | Emquanto a multidão galante adora a cruz, 39 | A bem do pobre enfermo á turba pede esmola 40 | Nas pompas ideaes da moda, que a consola 41 | Das magoas do Jesus! 42 | 43 | E nas horas de luta emquanto os povos choram 44 | E a guerra tudo mata e os reis tudo devoram, 45 | Não posso dizer bem se acaso tu serás 46 | A senhora que espalha os languidos fastios 47 | Nos pomposos salões, sorrindo a fazer fios 48 | Á viva luz do gaz! 49 | 50 | Tu és a apparição gentil, meia selvagem, 51 | D'olhar profundo e bom, de candida roupagem, 52 | De fronte immaculada e seios virginaes, 53 | Que desenha no espaço o limpido contorno 54 | E cinge na cabeça o virginal adorno 55 | De folhas naturaes. 56 | 57 | Tens a linha ideal das candidas figuras; 58 | As curvas divinaes; as tintas sãs e puras 59 | Da austera virgindade; as bellas correcções; 60 | E segues magestosa em teu longo caminho 61 | Deixando fluctuar a tunica de linho 62 | Ás frescas virações! 63 | 64 | Quando trava batalha a tua irmã Justiça 65 | Acodes ao combate e apontas sobre a liça 66 | Uma espada de luz ao Mal dominador: 67 | E pensas na belleza harmonica das cousas 68 | Sentindo que se move um mundo sob as louzas 69 | No germen d'uma flôr! 
70 | 71 | N'um sorriso cruel, pungente d'ironia, 72 | Tambem sabes vibrar, serena, altiva e fria, 73 | O latego febril das grandes punições; 74 | E vendo-te sorrir, a geração doente, 75 | Sentir cuida, talvez, a nota decadente, 76 | Das morbidas canções! 77 | 78 | Oh, vôa sem cessar traçando nos teus hombros 79 | O manto constellado, ó deusa dos assombros, 80 | Até chegar um dia ás regiões de luz, 81 | Aonde, na poeira aurifera dos astros, 82 | Contricto, Satanaz enxugará de rastos, 83 | As chagas de Jesus! 84 | 85 | Logar á minha fada ó languidas senhoras! 86 | E vós que amaes do circo as noites tentadoras, 87 | Os fluctuantes véos, os gestos divinaes, 88 | Podeis vel-a passar n'um turbilhão fantastico, 89 | Voando no corcel febril, nervoso, elastico, 90 | Dos novos ideaes! -------------------------------------------------------------------------------- /tests/data/pt.out.txt: -------------------------------------------------------------------------------- 1 | pouc 2 | vez 3 | cant 4 | cas 5 | melancol 6 | letharg 7 | gent 8 | extas 9 | bucol 10 | desdit 11 | cru 12 | propri 13 | coraçã 14 | celebr 15 | vici 16 | odei 17 | desalinh 18 | muz 19 | pudor 20 | mostr 21 | caminh 22 | lig 23 | á 24 | multidã 25 | sagr 26 | poes 27 | peregrin 28 | etern 29 | ouv 30 | diz 31 | soffr 32 | affecçã 33 | modern 34 | uns 35 | fasti 36 | nom 37 | uns 38 | tedi 39 | idea 40 | ensa 41 | presum 42 | gest 43 | romanesc 44 | vaidos 45 | si 46 | coll 47 | eburn 48 | fresc 49 | põ 50 | crém 51 | trivia 52 | oh 53 | pens 54 | mal 55 | ti 56 | castidad 57 | deslumbr 58 | fulgor 59 | astros 60 | cidad 61 | fals 62 | ourop 63 | cortezãs 64 | gent 65 | julg 66 | toc 67 | roçag 68 | vest 69 | ó 70 | deus 71 | virginal 72 | col 73 | cel 74 | grac 75 | juven 76 | retin 77 | cançonet 78 | alegr 79 | bachant 80 | saud 81 | wagons 82 | caes 83 | restaur 84 | visõ 85 | d'olh 86 | travess 87 | provoc 88 | pés 89 | julg 90 | escut 91 | voz 92 | parais 93 | amand 94 | ha 95 | fals 96 | torp 97 | 
sorris 98 | mus 99 | cafés 100 | oh 101 | és 102 | cert 103 | virg 104 | quebradic 105 | estiol 106 | gentil 107 | vem 108 | miss 109 | mostr 110 | cidad 111 | fin 112 | desd 113 | fad 114 | sent 115 | vapor 116 | tedi 117 | emquant 118 | vae 119 | sonh 120 | noiv 121 | ric 122 | nédi 123 | poss 124 | pag 125 | bem 126 | poss 127 | crêr 128 | archanj 129 | sej 130 | menin 131 | gentil 132 | ás 133 | port 134 | egrej 135 | emquant 136 | multidã 137 | galant 138 | ador 139 | cruz 140 | bem 141 | pobr 142 | enferm 143 | á 144 | turb 145 | ped 146 | esmol 147 | pomp 148 | idea 149 | mod 150 | consol 151 | mago 152 | jesus 153 | hor 154 | lut 155 | emquant 156 | pov 157 | chor 158 | guerr 159 | tud 160 | mat 161 | reis 162 | tud 163 | devor 164 | poss 165 | diz 166 | bem 167 | acas 168 | serás 169 | senhor 170 | espalh 171 | langu 172 | fasti 173 | pompos 174 | salõ 175 | sorr 176 | faz 177 | fios 178 | á 179 | viv 180 | luz 181 | gaz 182 | és 183 | appariçã 184 | gentil 185 | mei 186 | selvag 187 | d'olh 188 | profund 189 | bom 190 | cand 191 | roupag 192 | front 193 | immacul 194 | sei 195 | virgina 196 | desenh 197 | espac 198 | limp 199 | contorn 200 | cing 201 | cabec 202 | virginal 203 | adorn 204 | folh 205 | natura 206 | tens 207 | linh 208 | ideal 209 | cand 210 | figur 211 | curv 212 | divina 213 | tint 214 | sãs 215 | pur 216 | aust 217 | virgindad 218 | bell 219 | correcçõ 220 | segu 221 | magest 222 | long 223 | caminh 224 | deix 225 | fluctu 226 | tunic 227 | linh 228 | ás 229 | fresc 230 | viraçõ 231 | trav 232 | batalh 233 | irmã 234 | justic 235 | acod 236 | combat 237 | apont 238 | sobr 239 | lic 240 | espad 241 | luz 242 | mal 243 | domin 244 | pens 245 | bellez 246 | harmon 247 | cous 248 | sent 249 | mov 250 | mund 251 | sob 252 | louz 253 | germen 254 | d'um 255 | flôr 256 | n'um 257 | sorris 258 | cruel 259 | pungent 260 | d'iron 261 | tamb 262 | sab 263 | vibr 264 | seren 265 | altiv 266 | fri 267 | lateg 268 | febril 269 | grand 270 | puniçõ 271 
| vend 272 | sorr 273 | geraçã 274 | doent 275 | sent 276 | cuid 277 | talvez 278 | not 279 | decadent 280 | morb 281 | cançõ 282 | oh 283 | vôa 284 | cess 285 | trac 286 | hombr 287 | mant 288 | constell 289 | ó 290 | deus 291 | assombr 292 | cheg 293 | dia 294 | ás 295 | regiõ 296 | luz 297 | aond 298 | poeir 299 | aurif 300 | astros 301 | contrict 302 | satanaz 303 | enxug 304 | rast 305 | chag 306 | jesus 307 | log 308 | á 309 | fad 310 | ó 311 | langu 312 | senhor 313 | vós 314 | ama 315 | circ 316 | noit 317 | tentador 318 | fluctuant 319 | véos 320 | gest 321 | divina 322 | pod 323 | vel 324 | pass 325 | n'um 326 | turbilhã 327 | fantast 328 | voand 329 | corcel 330 | febril 331 | nervos 332 | elast 333 | nov 334 | idea 335 | -------------------------------------------------------------------------------- /tests/data/ro.in.txt: -------------------------------------------------------------------------------- 1 | Aceasta carte contine teoria mea originala, numita MDT (Modeling Devices Theory), 2 | asupra functiilor hardware de baza ale unui creier (animal sau uman). 3 | 4 | Fiind o teorie stiintifica, ea este de fapt un model simbolic. Orice model 5 | simbolic trebuie sa contina un numar foarte limitat de termeni fundamentali si un 6 | numar foarte limitat de relatii fundamentale intre termenii fundamentali. Pentru 7 | termenii fundamentali si numai pentru ei, se accepta' definitii bazate pe 8 | descrieri. Toti ceilalti termeni sunt generati de model, odata cu definitiile lor, 9 | prin operatii logico-matematice. Acestea sunt caracteristicile fundamentale ale 10 | oricarei teorii stiintifice. Teoria prezentata urmeaza aceste reguli de baza. 11 | 12 | Aceasta teorie se afla' in totala opozitie cu toate stiintele actuale care 13 | studiaza functionarea creierului si care stiinte nu se bazeaza pe un singur model 14 | simbolic. 
In acest fel, aceasta teorie descalifica' din start tot ce s-a creat in 15 | ultimele citeva sute de ani in domenii cum ar fi psihologia, psihiatria, 16 | gnoseologia, epistemologia, stiintele comportamentelor animalelor, partial 17 | stiintele sociale si alte domenii conexe. 18 | 19 | Aceasta incercare de revolutie totala este necesara si justificata de urmatoarea 20 | situatie, care situayie exista' independent de existenta sau nu a teoriei mele. 21 | 22 | In psihologie, de exemplu, se folosesc o serie de termeni (constiinta, realitate, 23 | adevar, perceptii, emotii, etc.) care nu au definitii universal acceptate. In 24 | fapt, fiecare psiholog are propriile variante de definitii descriptive asupra 25 | tuturor termenilor folositi de el. Psihologia nu este o stiinta exacta, lucru 26 | universal acceptat. Atunci cind va aparea o stiinta exacta care sa acopere si 27 | domeniul psihologiei, atunci tot ce s-a scris deja in psihologie trebuie abandonat 28 | sau rescris in baza acelei teorii stintifice. 29 | 30 | Intr-o stiinta exacta cum ar fi Mecanica lui Newton, toti termenii folositi au 31 | exact aceleasi definitii pentru oricine, oriunde si oricind, fara nici o 32 | modificare de aproximativ 340 de ani de cind au fost creati. 33 | 34 | De exemplu, termenul "viteza" are o definitie generata de modelul simbolic. Acesta 35 | definitie este v=s/t (se imparte spatiul la timp). Termenul "viteza" nu este deci 36 | introdus prin descriere. 37 | 38 | Sa presupunem acum ca cineva a creat sau va crea un model simbolic fundamental (o 39 | stiinta exacta) care explica' functionarea creierului in mod acceptabil. Prima 40 | consecinta a aparitiei acestui model este ca absolut toti termenii folositi in 41 | domeniile acoperite de acel model, vor fi definiti pe baza modelului. Rezultatul 42 | este cel care a fost enuntat mai sus si anume, tot ce s-a scris in ultimii citeva 43 | sute de ani in asa zisele stiinte asociate creierului, va trebui abandonat sau 44 | rescris. 
45 | 46 | Indiferent daca teoria prezentata in aceasta carte va fi sau nu acceptata, mai 47 | repede sau mai tirziu, tot va aparea un model simbolic fundamental care sa explice 48 | functionarea creierului si deci, mai repede sau mai tirziu, tot se va intimpla 49 | aceasta revolutie. 50 | 51 | Aici apare insa o problema suplimentara. Pseudo-stiintele actuale asociate 52 | creierului sunt sustinute de o puternica structura academica si cu caracter 53 | aplicativ/lucrativ. Oamenii care sustin aceasta structura nu au cum sa accepte 54 | nici o teorie bazata pe un singur model simbolic, deoarece asta inseamna' sa ia 55 | totul de la zero. 56 | 57 | Consecinta este faptul ca, chiar daca ar aparea un model simbolic fundamental 58 | "absolut corect ", opozitia care ar aparea ar fi enorma. Nu-mi creez nici o iluzie 59 | ca cineva care deja lucreaza in domeniile acestor pseudo-stiinte va accepta sau 60 | chiar va lua in considerare aceasta teorie sau oricare alta de acest fel. 61 | 62 | Bazat pe experienta de peste 10 ani de cind exista' aceasta teorie, ea a avut 63 | succes la persoanele care lucreaza deja in domeniul stiintelor exacte 64 | (matematicieni, fizicieni,..) dar si la tinerii intre 12 si 20 de ani. Mai precis, 65 | la tinerii care nu sunt inca remorcati de sistemul social-economic actual. Un 66 | student care a primit deja o tema de lucrare de diploma, va trebui sa urmeze linia 67 | trasata de profesorii lui. El nu are cum sa-si riste viitorul aventurindu-se intr- 68 | un domeniu neinteles de profesorii lui. 69 | 70 | Sa vedem ce ofera aceasta teorie. In primul rind, fiind un model simbolic, ea este 71 | bazata pe logica. Ea da definitii extrem de precise si neinterpretabile tuturor 72 | termenilor folositi in asociatie cu functionarea creierului. 
73 | 74 | Teoria explica' principiul de functionare al creierului, animal sau uman, pina la 75 | a fi in stare sa faca un proiect logic functional, adica un proiect de dispozitiv 76 | logic, care poate sintetiza functiile de baza ale creierului animal sau uman. 77 | 78 | De fapt, creierul este tratat ca un produs tehnologic. Astfel, se definesc 79 | cerintele fundamentale dar si deficientele fundamentale de proiectare. Sunt 80 | explicate problemele si solutiile legate de implementarea tehnologica a 81 | creierului, in multiplele lui variante. 82 | 83 | Teoria sugereaza faptul ca proiectantul, in decursul zecilor de milenii, a facut 84 | mai multe variante tehnologice care se pot recunoaste in realitatea externa. Se 85 | analizeaza daca prin evolutie se poate trece sau nu, de la un creier de animal la 86 | un creier de om. 87 | 88 | Sunt tratate si problemele de proiectare sau tehnologice, cunoscute sub denumirea 89 | de deficiente/boli psihice (in forme patologice sau nu). 90 | 91 | Teoria trateaza intr-un mod stiintific si asa zisele fenomene paranormale si 92 | sugereaza metode pentru dezvoltarea abilitatilor in acest domeniu. 93 | 94 | Cartea are doua parti. Prima prezinta teoria generala impreuna cu citeva aplicatii 95 | considerate mai importante. In a doua parte sunt prezentate mai detaliat, un numar 96 | de exemple, teste si aplicatii, care sa sustina intelegerea teoriei generale. 97 | 98 | Din cauza ca teoria, numita de mine MDT (Modeling-Devices Theory), a fost scrisa 99 | initial in limba engleza (din 1997 elemente ale ei se afla pe WEB), un mare numar 100 | de termeni sunt prescurtati folosind terminologia engleza. 101 | 102 | Elementele de baza ale teoriei au aparut cam in 1993 si prima forma scrisa in 103 | 1995. De atunci teoria a fost perfectionata si dezvoltata si procesul continua. 104 | In anul 2003 o versiune foarte apropiata de aceasta a fost publicata la editura 105 | Cosmos din Sibiu. 
Aceasta versiune poate fi considerata ca o editie imbunatatita 106 | si adaugita a cartii din 2003. -------------------------------------------------------------------------------- /tests/data/ro.out.txt: -------------------------------------------------------------------------------- 1 | cart 2 | contin 3 | teor 4 | original 5 | numit 6 | mdt 7 | modeling 8 | devices 9 | theory 10 | funct 11 | hardw 12 | baz 13 | creier 14 | animal 15 | uman 16 | fiind 17 | o 18 | teor 19 | stiintif 20 | fapt 21 | model 22 | simbol 23 | model 24 | simbol 25 | trebui 26 | contin 27 | numar 28 | foart 29 | limit 30 | termen 31 | fundamental 32 | si 33 | numar 34 | foart 35 | limit 36 | relat 37 | fundamental 38 | intre 39 | termen 40 | fundamental 41 | termen 42 | fundamental 43 | si 44 | numa 45 | accept 46 | defin 47 | bazat 48 | descrier 49 | tot 50 | ceilalt 51 | termen 52 | gener 53 | model 54 | odat 55 | defin 56 | oper 57 | logico 58 | matemat 59 | caracterist 60 | fundamental 61 | oricare 62 | teor 63 | stiintif 64 | teor 65 | prezent 66 | urmeaz 67 | regul 68 | baz 69 | teor 70 | afla 71 | in 72 | total 73 | opozit 74 | stiint 75 | actual 76 | studiaz 77 | function 78 | creier 79 | si 80 | stiint 81 | bazeaz 82 | singur 83 | model 84 | simbol 85 | in 86 | fel 87 | teor 88 | descalif 89 | start 90 | s 91 | a 92 | creat 93 | in 94 | ultim 95 | citev 96 | sut 97 | ani 98 | in 99 | domen 100 | psiholog 101 | psihiatr 102 | gnoseolog 103 | epistemolog 104 | stiint 105 | comportament 106 | animal 107 | partial 108 | stiint 109 | social 110 | si 111 | alte 112 | domen 113 | conex 114 | incerc 115 | revolut 116 | total 117 | necesar 118 | si 119 | justific 120 | urmat 121 | situat 122 | situay 123 | exist 124 | independent 125 | existent 126 | a 127 | teor 128 | in 129 | psiholog 130 | exemplu 131 | folos 132 | o 133 | ser 134 | termen 135 | constiint 136 | realitat 137 | adevar 138 | percept 139 | emot 140 | etc 141 | defin 142 | universal 143 | accept 144 | in 145 | fapt 146 | 
psiholog 147 | propr 148 | variant 149 | defin 150 | descript 151 | tuturor 152 | termen 153 | folos 154 | psiholog 155 | o 156 | stiint 157 | exact 158 | lucru 159 | universal 160 | accept 161 | atunc 162 | cind 163 | va 164 | apar 165 | o 166 | stiint 167 | exact 168 | acop 169 | si 170 | domen 171 | psiholog 172 | atunc 173 | s 174 | a 175 | scris 176 | in 177 | psiholog 178 | trebui 179 | abandon 180 | rescris 181 | in 182 | baz 183 | acele 184 | teor 185 | stintif 186 | intr 187 | o 188 | stiint 189 | exact 190 | mecan 191 | newton 192 | tot 193 | termen 194 | folos 195 | exact 196 | aceleas 197 | defin 198 | si 199 | oric 200 | far 201 | o 202 | modific 203 | aproxim 204 | ani 205 | cind 206 | fost 207 | creat 208 | exemplu 209 | termen 210 | vitez 211 | o 212 | definit 213 | gener 214 | model 215 | simbol 216 | definit 217 | v=s/t 218 | impart 219 | spat 220 | termen 221 | vitez 222 | introdus 223 | descrier 224 | presupun 225 | a 226 | creat 227 | va 228 | cre 229 | model 230 | simbol 231 | fundamental 232 | o 233 | stiint 234 | exact 235 | explic 236 | function 237 | creier 238 | in 239 | mod 240 | accept 241 | consecint 242 | a 243 | apar 244 | acest 245 | model 246 | absol 247 | tot 248 | termen 249 | folos 250 | in 251 | domen 252 | acoper 253 | model 254 | vor 255 | defin 256 | baz 257 | model 258 | rezult 259 | a 260 | fost 261 | enunt 262 | sus 263 | si 264 | anum 265 | s 266 | a 267 | scris 268 | in 269 | ultim 270 | citev 271 | sut 272 | ani 273 | in 274 | asa 275 | zis 276 | stiint 277 | asoc 278 | creier 279 | va 280 | treb 281 | abandon 282 | rescris 283 | indiferent 284 | dac 285 | teor 286 | prezent 287 | in 288 | cart 289 | va 290 | accept 291 | reped 292 | tirziu 293 | va 294 | apar 295 | model 296 | simbol 297 | fundamental 298 | explic 299 | function 300 | creier 301 | si 302 | reped 303 | tirziu 304 | va 305 | intimpl 306 | revolut 307 | apar 308 | insa 309 | o 310 | problem 311 | suplimentar 312 | pseudo 313 | stiint 314 | actual 315 | 
asoc 316 | creier 317 | sustin 318 | o 319 | putern 320 | structur 321 | academ 322 | si 323 | caracter 324 | aplicativ/lucr 325 | oamen 326 | sustin 327 | structur 328 | accept 329 | o 330 | teor 331 | bazat 332 | singur 333 | model 334 | simbol 335 | inseamn 336 | ia 337 | tot 338 | consecint 339 | fapt 340 | dac 341 | apar 342 | model 343 | simbol 344 | fundamental 345 | absol 346 | corect 347 | opozit 348 | apar 349 | enorm 350 | creez 351 | o 352 | iluz 353 | lucreaz 354 | in 355 | domen 356 | acestor 357 | pseudo 358 | stiint 359 | va 360 | accept 361 | va 362 | lua 363 | in 364 | consider 365 | teor 366 | alta 367 | fel 368 | bazat 369 | experient 370 | ani 371 | cind 372 | exist 373 | teor 374 | a 375 | avut 376 | succes 377 | persoan 378 | lucreaz 379 | in 380 | domen 381 | stiint 382 | exact 383 | matematicien 384 | fizicien 385 | si 386 | tiner 387 | intre 388 | si 389 | ani 390 | precis 391 | tiner 392 | inca 393 | remorc 394 | sist 395 | social 396 | econom 397 | actual 398 | student 399 | a 400 | primit 401 | o 402 | tem 403 | lucr 404 | diplom 405 | va 406 | treb 407 | urmez 408 | lini 409 | trasat 410 | profesor 411 | si 412 | rist 413 | viitor 414 | aventur 415 | intr 416 | domeniu 417 | neinteles 418 | profesor 419 | ved 420 | ofer 421 | teor 422 | in 423 | rind 424 | fiind 425 | model 426 | simbol 427 | bazat 428 | logic 429 | defin 430 | extrem 431 | prec 432 | si 433 | neinterpret 434 | tuturor 435 | termen 436 | folos 437 | in 438 | asociat 439 | function 440 | creier 441 | teor 442 | explic 443 | princip 444 | function 445 | creier 446 | animal 447 | uman 448 | pin 449 | a 450 | in 451 | star 452 | fac 453 | proiect 454 | logic 455 | functional 456 | adic 457 | proiect 458 | dispoz 459 | logic 460 | sintetiz 461 | funct 462 | baz 463 | creier 464 | animal 465 | uman 466 | fapt 467 | creier 468 | tratat 469 | produs 470 | tehnolog 471 | astfel 472 | defin 473 | cerint 474 | fundamental 475 | si 476 | deficient 477 | fundamental 478 | proiect 
479 | explic 480 | problem 481 | si 482 | solut 483 | legat 484 | implement 485 | tehnolog 486 | a 487 | creier 488 | in 489 | multipl 490 | variant 491 | teor 492 | sugereaz 493 | fapt 494 | proiect 495 | in 496 | decurs 497 | zec 498 | milen 499 | a 500 | facut 501 | mult 502 | variant 503 | tehnolog 504 | recunoast 505 | in 506 | realitat 507 | extern 508 | analizeaz 509 | dac 510 | evolut 511 | trec 512 | creier 513 | animal 514 | creier 515 | om 516 | tratat 517 | si 518 | problem 519 | proiect 520 | tehnolog 521 | cunosc 522 | denum 523 | deficiente/bol 524 | psihic 525 | in 526 | form 527 | patolog 528 | teor 529 | trateaz 530 | intr 531 | mod 532 | stiintif 533 | si 534 | asa 535 | zis 536 | fenomen 537 | paranormal 538 | si 539 | sugereaz 540 | metod 541 | dezvolt 542 | abil 543 | in 544 | domeniu 545 | cart 546 | dou 547 | part 548 | prezint 549 | teor 550 | general 551 | impreun 552 | citev 553 | aplic 554 | consider 555 | import 556 | in 557 | a 558 | dou 559 | part 560 | prezent 561 | detal 562 | numar 563 | exempl 564 | test 565 | si 566 | aplic 567 | sustin 568 | inteleg 569 | teor 570 | general 571 | cauz 572 | teor 573 | numit 574 | mdt 575 | modeling 576 | devices 577 | theory 578 | a 579 | fost 580 | scris 581 | initial 582 | in 583 | limb 584 | englez 585 | element 586 | afla 587 | web 588 | mar 589 | numar 590 | termen 591 | prescurt 592 | folos 593 | terminolog 594 | englez 595 | element 596 | baz 597 | teor 598 | apar 599 | cam 600 | in 601 | si 602 | form 603 | scris 604 | in 605 | atunc 606 | teor 607 | a 608 | fost 609 | perfection 610 | si 611 | dezvolt 612 | si 613 | proces 614 | continu 615 | in 616 | an 617 | o 618 | versiun 619 | foart 620 | aprop 621 | a 622 | fost 623 | public 624 | editur 625 | cosmos 626 | sibiu 627 | versiun 628 | consider 629 | o 630 | edit 631 | imbunatat 632 | si 633 | adaug 634 | a 635 | cart 636 | -------------------------------------------------------------------------------- /tests/data/ru.in.txt: 
-------------------------------------------------------------------------------- 1 | Московія! Въ понятіи иностранцевъ, отправлявшихся въ этотъ далекій, 2 | загадочный, снѣжный край — центральная ли только Россія? Или и Югъ 3 | съ златоглавымъ Кіевомъ, и Архангельскъ рыбный, и Каспій мутноводный, 4 | и Сибирь съ пушнымъ звѣремъ, и Кавказъ? 5 | 6 | Все это, вмѣстѣ взятое. 7 | 8 | Границы не были четко очерчены. Гдѣ кончалась Московія въ представленіи 9 | даже ученыхъ географовъ того времени, а тѣмъ болѣе въ воображеніи 10 | художниковъ, просто туристовъ, искателей приключеній, составителей 11 | мемуаровъ? 12 | 13 | Границы страны, по которой бродятъ бѣлые медвѣди, гдѣ снѣгъ лежитъ 14 | толстой пеленой, гдѣ люди питаются сырымъ мясомъ и даже поѣдаютъ другъ 15 | друга! 16 | 17 | Но туда ѣдутъ любознательные и пытливые путешественники: Олеарій, 18 | Корбъ, Герберштейнъ и другіе, и понемногу проливается свѣтъ на невѣдомую 19 | страну. Захватывая и всѣ окраины Россіи (Сибирь, Кавказъ), путешественники, 20 | однако, смѣшиваютъ свои представленія о людяхъ, обычаяхъ, костюмахъ, 21 | почти отождествляя, напримѣръ, татарина съ великороссомъ; они искажаютъ 22 | и архитектурныя формы: на ихъ рисункахъ главки Василія Блаженнаго и 23 | русскихъ монастырей пріобрѣтаютъ формы куполовъ персидскихъ дворцовъ 24 | и самаркандскихъ мечетей. 25 | 26 | Но отъ этого экзотическій интересъ ихъ живописныхъ показаній не 27 | ослабляется, а напротивъ усиливается. Явно восточнаго типа халаты, 28 | мѣховыя шапки, длинные рукава — и тутъ же великорусскія кольчуги и 29 | чисто русскіе уборы коней. 30 | 31 | Бытъ, жизнь Московіи кажется имъ суровой. Сколько наказаній тѣлесныхъ — 32 | висѣлицъ! Лѣсомъ цѣлымъ стоятъ висѣлицы на площадяхъ, людей живьемъ 33 | зарываютъ въ могилы, а тутъ же пышные кортежи, пріемы пословъ, 34 | засѣданія Думы Боярской — вотъ картины, проходящія передъ зрителемъ 35 | этихъ изображеній. 
Заѣзжія экспедиціи и труды отдѣльныхъ современниковъ 36 | иностранцевъ даютъ богатѣйшій матеріалъ, являющійся основой разысканій 37 | о Россіи былыхъ временъ. 38 | 39 | Интересъ къ Россіи, къ ея исторіи, быту, культурѣ, всюду нарастаетъ; 40 | онъ неизбѣжно станетъ еще большимъ. Уже и сейчасъ въ Англіи, въ Германіи, 41 | въ Чехіи издаются книги о старомъ и новомъ русскомъ искусствѣ. 42 | 43 | То же наблюдается и въ Парижѣ, гдѣ усиленно коллекціонируются гравюры 44 | и книги, относящіяся къ Россіи. 45 | 46 | Изъ числа коллекцій, содержащихъ богатый подборъ книгъ, упомянемъ 47 | собранія Апостола, Катенева, Нелидова, Тищенко, Трубецкой, Гревса, 48 | Шуваловой и др. Гравюры, изображающія русскую жизнь и русскій бытъ, 49 | собираютъ всѣ, кто можетъ. 50 | 51 | Особенно цѣннымъ для вопроса, насъ занимающаго нынѣ, является собраніе 52 | П. Н. Апостола, заключающее въ себѣ рѣдкія изданія Олеарія, Корба, 53 | Герберштейна и др. Старательно подобранныя, эти изданія представляютъ 54 | собою особую рѣдкость за границей, ибо многихъ изъ нихъ не имѣетъ даже 55 | Парижская Національная Библіотека. 56 | 57 | Обзоръ хотя бы трехъ-четырехъ авторовъ изъ собранія Апостола даетъ уже 58 | такой богатый матеріалъ для характеристики Россіи XVI-XVII вѣковъ, что 59 | мы и ограничимся пока репродукціями гравюръ изъ этихъ книгъ. Когда я 60 | пересматривалъ чудесныя in quarto и in folio, въ одинъ изъ уютныхъ 61 | вечеровъ, проведенныхъ мною въ Парижѣ, въ квартирѣ П.Н. Апостола, 62 | у меня явилась мысль подобрать такіе наиболѣе интересные моменты изъ 63 | русской жизни, которые будутъ характерны для пониманія иностранцами 64 | Россіи — тогда _Московіи_. 65 | 66 | Выбравъ эти гравюры, я просилъ компетентнаго П.Н. Апостола составить 67 | къ нимъ описаніе. Нынѣ это изданіе, съ необходимыми комментаріями, 68 | предлагается на судъ читателя. 69 | 70 | Думается, _Московія_ въ этихъ, хотя бы немногихъ, «штрихахъ» 71 | закрѣплена. 
72 | -------------------------------------------------------------------------------- /tests/data/ru.out.txt: -------------------------------------------------------------------------------- 1 | москові 2 | въ 3 | поняті 4 | иностранцевъ 5 | отправля 6 | въ 7 | этотъ 8 | далекі 9 | загадочн 10 | снѣжны 11 | кра 12 | центральн 13 | россі 14 | югъ 15 | съ 16 | златоглавымъ 17 | кіевомъ 18 | архангельскъ 19 | рыбн 20 | каспі 21 | мутноводн 22 | сибир 23 | съ 24 | пушнымъ 25 | звѣремъ 26 | кавказъ 27 | вмѣстѣ 28 | взят 29 | границ 30 | четк 31 | очерч 32 | гдѣ 33 | конча 34 | москові 35 | въ 36 | представлені 37 | ученыхъ 38 | географовъ 39 | а 40 | тѣмъ 41 | болѣ 42 | въ 43 | воображені 44 | художниковъ 45 | туристовъ 46 | искател 47 | приключені 48 | составител 49 | мемуаровъ 50 | границ 51 | стран 52 | бродятъ 53 | бѣлы 54 | медвѣд 55 | гдѣ 56 | снѣгъ 57 | лежитъ 58 | толст 59 | пелен 60 | гдѣ 61 | пита 62 | сырымъ 63 | мясомъ 64 | поѣдаютъ 65 | другъ 66 | друг 67 | ѣдутъ 68 | любознательн 69 | пытлив 70 | путешественник 71 | олеарі 72 | корбъ 73 | герберштейнъ 74 | другі 75 | понемног 76 | пролива 77 | свѣтъ 78 | невѣдом 79 | стран 80 | захватыв 81 | всѣ 82 | окраин 83 | россі 84 | сибир 85 | кавказъ 86 | путешественник 87 | смѣшиваютъ 88 | представлені 89 | людяхъ 90 | обычаяхъ 91 | костюмахъ 92 | отождествл 93 | напримѣръ 94 | татарин 95 | съ 96 | великороссомъ 97 | искажаютъ 98 | архитектурны 99 | форм 100 | ихъ 101 | рисункахъ 102 | главк 103 | василі 104 | блаженнаг 105 | русскихъ 106 | монастыр 107 | пріобрѣтаютъ 108 | форм 109 | куполовъ 110 | персидскихъ 111 | дворцовъ 112 | самаркандскихъ 113 | мечет 114 | отъ 115 | экзотическі 116 | интересъ 117 | ихъ 118 | живописныхъ 119 | показані 120 | ослабля 121 | а 122 | напротивъ 123 | усилива 124 | явн 125 | восточнаг 126 | тип 127 | халат 128 | мѣховы 129 | шапк 130 | длин 131 | рукав 132 | тутъ 133 | великорусскі 134 | кольчуг 135 | чист 136 | русскі 137 | убор 138 | кон 139 | бытъ 140 | москові 141 | имъ 142 | 
суров 143 | наказані 144 | тѣлесныхъ 145 | висѣлицъ 146 | лѣсомъ 147 | цѣлымъ 148 | стоятъ 149 | висѣлиц 150 | площадяхъ 151 | люд 152 | живьемъ 153 | зарываютъ 154 | въ 155 | могил 156 | а 157 | тутъ 158 | пышн 159 | кортеж 160 | пріем 161 | пословъ 162 | засѣдані 163 | дум 164 | боярск 165 | вотъ 166 | картин 167 | проходящі 168 | передъ 169 | зрителемъ 170 | этихъ 171 | изображені 172 | заѣзжі 173 | экспедиці 174 | труд 175 | отдѣльныхъ 176 | современниковъ 177 | иностранцевъ 178 | даютъ 179 | богатѣйші 180 | матеріалъ 181 | являющі 182 | основ 183 | разыскані 184 | россі 185 | былыхъ 186 | временъ 187 | интересъ 188 | къ 189 | россі 190 | къ 191 | е 192 | исторі 193 | быт 194 | культурѣ 195 | нарастаетъ 196 | онъ 197 | неизбѣжн 198 | станетъ 199 | большимъ 200 | сейчасъ 201 | въ 202 | англі 203 | въ 204 | германі 205 | въ 206 | чехі 207 | изда 208 | книг 209 | старомъ 210 | новомъ 211 | русскомъ 212 | искусствѣ 213 | наблюда 214 | въ 215 | парижѣ 216 | гдѣ 217 | усилен 218 | коллекціонир 219 | гравюр 220 | книг 221 | относящі 222 | къ 223 | россі 224 | изъ 225 | числ 226 | коллекці 227 | содержащихъ 228 | богат 229 | подборъ 230 | книгъ 231 | упомянемъ 232 | собрані 233 | апостол 234 | катенев 235 | нелидов 236 | тищенк 237 | трубецк 238 | гревс 239 | шувалов 240 | др 241 | гравюр 242 | изображающі 243 | русск 244 | русскі 245 | бытъ 246 | собираютъ 247 | всѣ 248 | можетъ 249 | цѣннымъ 250 | вопрос 251 | насъ 252 | занимающаг 253 | нынѣ 254 | явля 255 | собрані 256 | п 257 | н 258 | апостол 259 | заключа 260 | въ 261 | себѣ 262 | рѣдкія 263 | издані 264 | олеарі 265 | корб 266 | герберштейн 267 | др 268 | старательн 269 | подобранны 270 | издані 271 | представляютъ 272 | особ 273 | рѣдкост 274 | границ 275 | иб 276 | многихъ 277 | изъ 278 | нихъ 279 | имѣетъ 280 | парижск 281 | національн 282 | библіотек 283 | обзоръ 284 | трехъ 285 | четырехъ 286 | авторовъ 287 | изъ 288 | собрані 289 | апостол 290 | даетъ 291 | богат 292 | матеріалъ 293 | характеристик 294 | 
россі 295 | вѣковъ 296 | огранич 297 | репродукці 298 | гравюръ 299 | изъ 300 | этихъ 301 | книгъ 302 | пересматривалъ 303 | чудесны 304 | въ 305 | одинъ 306 | изъ 307 | уютныхъ 308 | вечеровъ 309 | проведенныхъ 310 | въ 311 | парижѣ 312 | въ 313 | квартирѣ 314 | п.н 315 | апостол 316 | яв 317 | мысл 318 | подобра 319 | такі 320 | наиболѣ 321 | интересн 322 | момент 323 | изъ 324 | русск 325 | жизн 326 | будутъ 327 | характерн 328 | понимані 329 | иностранц 330 | россі 331 | москові 332 | выбравъ 333 | гравюр 334 | просилъ 335 | компетентнаг 336 | п.н 337 | апостол 338 | состав 339 | къ 340 | нимъ 341 | описані 342 | нынѣ 343 | издані 344 | съ 345 | необходим 346 | комментарі 347 | предлага 348 | судъ 349 | читател 350 | дума 351 | москові 352 | въ 353 | этихъ 354 | немногихъ 355 | штрихахъ 356 | закрѣпл 357 | -------------------------------------------------------------------------------- /tests/data/tr.in.txt: -------------------------------------------------------------------------------- 1 | Ah gençlik!.. Tıpkı ezeli bir baharın ilk çiçekli günlerine benzer. Yeşil kırlar, kelebek dolu bahçeler, güzel kokular içinde serçelerin şen efsanelerini doymadan dinleyerek dolaşırız. İdealimizin rüyası bize hayat kışının fırtınalarını, karlarını, tipilerini hatırlatmaz. Ben işte bu hiç bitmez sanılan baharı İzmir'de geçirdim. On dokuz yaşındaydım. Galiba on beş sene evvel... Evet, seneler nasıl bir ok gölgesi gibi uçuyor! Meşrutiyetin bu hür, bu serbest günlerinden çok uzaktık. Lâkin o eski, zalim idarenin ezici kahrını, gafletim sayesinde hiç duymuyordum. Mersinli'deki minimini evimde, kocaman çınar ağaçlarının hiç durmadan öten ninnileri içinde, kitapların dipsiz girdabına dalmış gitmiştim. Haricî kainat umrumda değildi. Sözde, felsefe feneriyle büyük bir hakikat bulacaktım. Heyhat! Şimdi bu masum hülyamı aklıma getirince, nasıl acı acı gülüyorum... 
Bir kelimeyi, bir satırı, bir sözü haftalarca, aylarca düşünür, bir cümlenin altındaki —var tevehhüm ettiğim— gizli mânâyı bulmak için birçok geceler uyuyamazdım. Filozofların pek o kadar mânâ murad etmeden yumurtladığı fikirler, bence bir "ilahi nass" gibiydi. Hatta romanlarda rasgeldiğim "ukalalık"lar bile gözümden kaçmazdı. Onları da fişlere yazar, notlarımın arasına kordum. 2 | 3 | Bu "ukalalık"lardan birisi, beni tam üç ay düşündürdü. Tam yüz beş gece gözüme uyku girmedi. Flaubert'in miydi, yoksa bir başkasının mı, iyice hatırlayamıyorum. "Le grade dégrade...", yani: "Rütbe, haysiyeti düşürtür." cümlesi! Bundan bir türlü mânâ çıkaramadım. Bilakis, fikrimce rütbe insanı herkesin seviyesinden yukarı kaldırır, yükseltir, hatta sahibine hususi bir haysiyet verirdi. Artık başka kitap, gazete falan okuyamaz oldum... Her satırın altında, mânâsını anlamadığım bu "Le grade dégrade.." cümlesi kararıyor, bir avuç istifham işaretinden yuğrulmuş sabit bir fikir gibi dimağımda düğümleniyordu. Sakin evimde oturamıyor, bulamadığım mânâyı arayarak tenha sahillerde, kalabalık caddelerde, dar sokaklarda serseri serseri dolaşıyordum. Bir "meçhul", bir "sır" insana ne kadar ıztırap verir; bâhusus masum bir iman da olursa... Bir gün yine deli gibi, içimden: "Le grade dégrade..." diye söylenerek Hükümet Konağı'nın önünden geçiyordum. İsmimi işittim. Döndüm. Bir de baktım ki, riyâziye muallimim, Logaritmacı Hasan! Askerî Kıraathanesinin ta köşesinde bir sandalyeye kurulmuş nargilesini çekiyor... 
-------------------------------------------------------------------------------- /tests/data/tr.out.txt: -------------------------------------------------------------------------------- 1 | ah 2 | gençlik 3 | tıpkı 4 | ezel 5 | bahar 6 | ilk 7 | çiçekli 8 | gün 9 | benzer 10 | yeşil 11 | kır 12 | kelebek 13 | dol 14 | bahçe 15 | güzel 16 | koku 17 | iç 18 | serçe 19 | şen 20 | efsane 21 | doyma 22 | dinleyerek 23 | dolaşır 24 | i̇deal 25 | rüyas 26 | hayat 27 | kış 28 | fırtına 29 | kar 30 | tipi 31 | hatırlatmaz 32 | bitmez 33 | sanıla 34 | bahar 35 | i̇zmir' 36 | geçir 37 | yaş 38 | galip 39 | se 40 | evvel 41 | evet 42 | sene 43 | ok 44 | gölges 45 | uçuyor 46 | meşrutiyet 47 | hür 48 | serbest 49 | gün 50 | uzak 51 | lâkin 52 | eski 53 | zal 54 | idare 55 | eziç 56 | kahr 57 | gaflet 58 | saye 59 | duymuyor 60 | mersinli' 61 | minim 62 | ev 63 | kocama 64 | çınar 65 | ağaç 66 | durma 67 | ö 68 | ninni 69 | iç 70 | kitap 71 | dipsiz 72 | girdap 73 | dal 74 | gitmiş 75 | haricî 76 | kainat 77 | umr 78 | değil 79 | söz 80 | felsef 81 | fener 82 | büyük 83 | hakikat 84 | bulacak 85 | heyhat 86 | ş 87 | mas 88 | hülya 89 | akl 90 | getir 91 | aç 92 | aç 93 | gülüyor 94 | kelime 95 | satır 96 | söz 97 | hafta 98 | ay 99 | düşünür 100 | cümle 101 | alt 102 | tevehh 103 | ettik 104 | gizli 105 | mânâyı 106 | bulmak 107 | gece 108 | uyuyamaz 109 | filozof 110 | mânâ 111 | muradı 112 | etme 113 | yumurtladık 114 | fikir 115 | be 116 | ilah 117 | nass 118 | gip 119 | roman 120 | rasgeldik 121 | ukalalık" 122 | göz 123 | kaçmaz 124 | fiş 125 | yazar 126 | not 127 | ara 128 | kor 129 | ukalalık" 130 | biris 131 | tam 132 | ay 133 | düşündür 134 | tam 135 | geç 136 | göz 137 | uyku 138 | girmedi 139 | flaubert' 140 | mi 141 | başka 142 | iyiç 143 | hatırlayamıyor 144 | le 145 | grade 146 | dégrade 147 | rütbe 148 | haysiyet 149 | düşür 150 | cümles 151 | türlü 152 | mânâ 153 | çıkaramadı 154 | bilakis 155 | fikr 156 | rütbe 157 | insa 158 | seviye 159 | yukar 160 | kaldırır 
161 | yüksel 162 | sahip 163 | hususi 164 | haysiyet 165 | verir 166 | ar 167 | başka 168 | kitap 169 | gaze 170 | fala 171 | okuyamaz 172 | ol 173 | satır 174 | alt 175 | mânâs 176 | anlamadık 177 | le 178 | grade 179 | dégrade 180 | cümles 181 | kararıyor 182 | avuç 183 | istifha 184 | işaret 185 | yuğrul 186 | sabit 187 | fikir 188 | dimak 189 | düğümleniyor 190 | sak 191 | ev 192 | oturamıyor 193 | bulamadık 194 | mânâyı 195 | arayarak 196 | tenha 197 | sahil 198 | kalabalık 199 | cadde 200 | dar 201 | sokak 202 | serser 203 | serser 204 | dolaşıyor 205 | meçhul 206 | sır 207 | insa 208 | ıztırap 209 | verir 210 | bâhusus 211 | mas 212 | ima 213 | gün 214 | del 215 | iç 216 | le 217 | grade 218 | dégrade 219 | söylenerek 220 | hükümet 221 | konağı'n 222 | ön 223 | geçiyor 224 | i̇sm 225 | işit 226 | dö 227 | bak 228 | riyâzi 229 | muall 230 | logaritmaç 231 | hasa 232 | askerî 233 | kıraathane 234 | ta 235 | köşe 236 | sandalye 237 | kurul 238 | nargile 239 | çekiyor 240 | -------------------------------------------------------------------------------- /tests/data/zh.in.txt: -------------------------------------------------------------------------------- 1 | 这条法国邮船白拉日隆子爵号(VicomtedeBragelonne)正向中国开来。早晨八点多钟,冲洗过的三等舱甲板湿意未干,但已坐满了人,法国人、德国流亡出来的犹太人、印度人、安南人,不用说还有中国人。海风里早含着燥热,胖人身体给炎风吹干了,上一层汗结的盐霜,仿佛刚在巴勒斯坦的死海里洗过澡。毕竟是清晨,人的兴致还没给太阳晒萎,烘懒,说话做事都很起劲。那几个新派到安南或中国租界当警察的法国人,正围了那年轻善撒娇的犹太女人在调情。俾斯麦曾说过,法国公使大使的特点,就是一句外国话不会讲;这几位警察并不懂德文,居然传情达意,引得犹太女人格格地笑,比他们的外交官强多了。这女人的漂亮丈夫,在旁顾而乐之,因为他几天来,香烟、啤酒、柠檬水沾光了不少。红海已过,不怕热极引火,所以等一会甲板上零星果皮、纸片、瓶塞之外,香烟头定又遍处皆是。法国人的思想是有名的清楚,他的文章也明白干净,但是他的做事,无不混乱、肮脏、喧哗,但看这船上的乱糟糟。这船,倚仗人的机巧,载满人的扰攘,寄满人的希望,热闹地行着,每分钟把沾污了人气的一小方小面,还给那无情、无尽、无际的大海。 -------------------------------------------------------------------------------- /tests/data/zh.out.txt: -------------------------------------------------------------------------------- 1 | 这 2 | 条 3 | 法国 4 | 邮船 5 | 白 6 | 拉 7 | 日隆 8 | 子爵 9 | 号 10 | VicomtedeBragelonne 11 | 正向 12 | 中国 13 | 开来 14 | 早晨 15 | 八点 16 | 多 17 | 钟 18 | 冲洗 19 | 过 20 
| 三等 21 | 三等舱 22 | 甲板 23 | 湿 24 | 意 25 | 未 26 | 干 27 | 但 28 | 已 29 | 坐满 30 | 人 31 | 法国 32 | 国人 33 | 法国人 34 | 德国 35 | 流亡 36 | 出来 37 | 犹太 38 | 犹太人 39 | 印度 40 | 印度人 41 | 安南 42 | 人 43 | 不用 44 | 不用说 45 | 还有 46 | 中国 47 | 人 48 | 海风 49 | 里 50 | 早 51 | 含 52 | 着 53 | 燥热 54 | 胖 55 | 人 56 | 身体 57 | 给 58 | 炎风 59 | 吹干 60 | 上 61 | 一层 62 | 汗 63 | 结 64 | 盐霜 65 | 仿佛 66 | 刚 67 | 在 68 | 巴勒 69 | 勒斯 70 | 巴勒斯 71 | 巴勒斯坦 72 | 死 73 | 海里 74 | 洗过 75 | 洗过澡 76 | 毕竟 77 | 是 78 | 清晨 79 | 人 80 | 兴致 81 | 还 82 | 没 83 | 给 84 | 太阳 85 | 晒 86 | 萎 87 | 烘 88 | 懒 89 | 说话 90 | 做事 91 | 都 92 | 很 93 | 起劲 94 | 那 95 | 几个 96 | 新派 97 | 到 98 | 安南 99 | 或 100 | 中国 101 | 租界 102 | 当 103 | 警察 104 | 法国 105 | 国人 106 | 法国人 107 | 正 108 | 围 109 | 那 110 | 年轻 111 | 善 112 | 撒娇 113 | 犹太 114 | 女人 115 | 在 116 | 调情 117 | 俾斯麦 118 | 曾 119 | 说 120 | 过 121 | 法国 122 | 公使 123 | 大使 124 | 特点 125 | 就是 126 | 一句 127 | 外国 128 | 话 129 | 不会 130 | 讲 131 | 这 132 | 几位 133 | 警察 134 | 并 135 | 不 136 | 懂 137 | 德文 138 | 居然 139 | 传情 140 | 达意 141 | 引得 142 | 犹太 143 | 女人 144 | 格格 145 | 地 146 | 笑 147 | 比 148 | 他们 149 | 外交 150 | 外交官 151 | 强 152 | 多 153 | 这 154 | 女人 155 | 漂亮 156 | 丈夫 157 | 在 158 | 旁 159 | 顾 160 | 而 161 | 乐 162 | 之 163 | 因为 164 | 他 165 | 几天 166 | 来 167 | 香烟 168 | 啤酒 169 | 柠檬 170 | 柠檬水 171 | 沾光 172 | 不少 173 | 红海 174 | 已 175 | 过 176 | 不怕 177 | 热 178 | 极 179 | 引火 180 | 所以 181 | 等 182 | 一会 183 | 甲板 184 | 上 185 | 零星 186 | 果皮 187 | 纸片 188 | 瓶塞 189 | 之外 190 | 香烟 191 | 烟头 192 | 香烟头 193 | 定 194 | 又 195 | 遍 196 | 处 197 | 皆 198 | 是 199 | 法国 200 | 国人 201 | 法国人 202 | 思想 203 | 是 204 | 有名 205 | 清楚 206 | 他 207 | 文章 208 | 也 209 | 明白 210 | 干净 211 | 但是 212 | 他 213 | 做事 214 | 无不 215 | 混乱 216 | 肮脏 217 | 喧哗 218 | 但 219 | 看 220 | 这 221 | 船上 222 | 乱糟 223 | 乱糟糟 224 | 这 225 | 船 226 | 倚仗 227 | 人 228 | 机巧 229 | 载满 230 | 人 231 | 扰攘 232 | 寄满 233 | 人 234 | 希望 235 | 热闹 236 | 地 237 | 行 238 | 着 239 | 分钟 240 | 每分钟 241 | 把 242 | 沾污 243 | 人气 244 | 一小 245 | 方 246 | 小 247 | 面 248 | 还给 249 | 那 250 | 无情 251 | 无尽 252 | 无际 253 | 大海 254 | 
-------------------------------------------------------------------------------- /tests/lunr-fixture-gen/index.js: -------------------------------------------------------------------------------- 1 | var lunr = require('lunr'); 2 | require("lunr-languages/lunr.stemmer.support.js")(lunr); 3 | const fs = require('fs'); 4 | 5 | for (let file of fs.readdirSync("../data")) { 6 | if (file.endsWith(".in.txt")) { 7 | let code = file.substring(0, 2); 8 | let inp = fs.readFileSync(`../data/${code}.in.txt`); 9 | let outf = fs.openSync(`../data/${code}.out.txt`, 'w'); 10 | 11 | var pipeline = new lunr.Pipeline; 12 | if (code !== "en") 13 | { 14 | require(`lunr-languages/lunr.${code}.js`)(lunr); 15 | 16 | pipeline.add(lunr[code].trimmer); 17 | pipeline.add(lunr[code].stopWordFilter); 18 | pipeline.add(lunr[code].stemmer); 19 | } else { 20 | pipeline.add(lunr.trimmer); 21 | pipeline.add(lunr.stopWordFilter); 22 | pipeline.add(lunr.stemmer); 23 | } 24 | var tokens = lunr.tokenizer(inp); 25 | tokens = pipeline.run(tokens); 26 | 27 | for (var tok of tokens) { 28 | tok = tok.toString(); 29 | if (tok && tok.length > 0) 30 | fs.writeSync(outf, tok + '\n'); 31 | } 32 | fs.closeSync(outf); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tests/lunr-fixture-gen/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lunr-fixture-gen", 3 | "requires": true, 4 | "lockfileVersion": 1, 5 | "dependencies": { 6 | "lunr": { 7 | "version": "2.1.6", 8 | "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.1.6.tgz", 9 | "integrity": "sha512-ydJpB8CX8cZ/VE+KMaYaFcZ6+o2LruM6NG76VXdflYTgluvVemz1lW4anE+pyBbLvxJHZdvD1Jy/fOqdzAEJog==" 10 | }, 11 | "lunr-languages": { 12 | "version": "1.0.0", 13 | "resolved": "https://registry.npmjs.org/lunr-languages/-/lunr-languages-1.0.0.tgz", 14 | "integrity": "sha1-gwvKL+hktxPr4T/zPtSO4G5jVyM=" 15 | } 16 | } 17 | } 18 | 
-------------------------------------------------------------------------------- /tests/lunr-fixture-gen/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lunr-fixture-gen", 3 | "main": "index.js", 4 | "dependencies": { 5 | "lunr": "^2.1.6", 6 | "lunr-languages": "^1.0.0" 7 | }, 8 | "scripts": { 9 | "test": "node index.js" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tests/test-index.rs: -------------------------------------------------------------------------------- 1 | use elasticlunr::*; 2 | use serde_json::json; 3 | use std::fs::{self, File}; 4 | use std::path::Path; 5 | 6 | fn create_index(lang: Box, docs: &'static [[&'static str; 2]]) -> serde_json::Value { 7 | let mut index = Index::with_language(lang, &["title", "body"]); 8 | for (i, doc) in docs.iter().enumerate() { 9 | index.add_doc(&(i + 1).to_string(), doc); 10 | } 11 | json!(index) 12 | } 13 | 14 | fn generate_fixture( 15 | lang: Box, 16 | docs: &'static [[&'static str; 2]], 17 | ) -> serde_json::Value { 18 | let code = lang.code(); 19 | let src = create_index(lang, docs); 20 | let dest = Path::new(env!("CARGO_MANIFEST_DIR")) 21 | .join(format!("tests/searchindex_fixture_{}.json", code)); 22 | let dest = File::create(&dest).unwrap(); 23 | serde_json::to_writer_pretty(dest, &src).unwrap(); 24 | src 25 | } 26 | 27 | fn read_fixture(lang: &dyn Language) -> serde_json::Value { 28 | let src = Path::new(env!("CARGO_MANIFEST_DIR")) 29 | .join(format!("tests/searchindex_fixture_{}.json", lang.code())); 30 | let json = fs::read_to_string(src).unwrap(); 31 | serde_json::from_str(&json).expect("Unable to deserialize the fixture") 32 | } 33 | 34 | const GENERATE_FIXTURE: bool = false; 35 | 36 | fn check_index(lang: L, docs: &'static [[&'static str; 2]]) { 37 | let new_index = create_index(Box::new(lang.clone()), docs); 38 | let name = lang.name(); 39 | let fixture_index = if GENERATE_FIXTURE { 40 
| generate_fixture(Box::new(lang), docs) 41 | } else { 42 | read_fixture(&lang) 43 | }; 44 | if new_index != fixture_index { 45 | panic!("The {} search index has changed from the fixture", name); 46 | } 47 | } 48 | 49 | #[test] 50 | fn en_search_index_hasnt_changed_accidentally() { 51 | check_index(lang::English::new(), DOCS_EN); 52 | } 53 | 54 | #[cfg(feature = "ja")] 55 | #[test] 56 | fn ja_search_index_hasnt_changed_accidentally() { 57 | check_index(lang::Japanese::new(), DOCS_JA); 58 | } 59 | 60 | const DOCS_EN: &[[&str; 2]] = &[ 61 | [ 62 | "Chapter 1", 63 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit", 64 | ], 65 | [ 66 | "Chapter 2", 67 | "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad", 68 | ], 69 | [ 70 | "Chapter 3", 71 | "minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex", 72 | ], 73 | [ 74 | "Chapter 4", 75 | "ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate", 76 | ], 77 | [ 78 | "Chapter 5", 79 | "velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur sint occaecat", 80 | ], 81 | ["Chapter 6", "Spatiëring shouldn’t cause a panic."], 82 | ]; 83 | 84 | #[cfg(feature = "ja")] 85 | const DOCS_JA: &'static [[&'static str; 2]] = &[ 86 | [ 87 | "第1章", 88 | "吾輩は猫である。名前はまだ無い。", 89 | ], 90 | [ 91 | "第2章", 92 | "どこで生れたかとんと見当がつかぬ。何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している。", 93 | ], 94 | [ 95 | "第3章", 96 | "吾輩はここで始めて人間というものを見た。しかもあとで聞くとそれは書生という人間中で一番獰悪な種族であったそうだ。この書生というのは時々我々を捕えて煮て食うという話である。しかしその当時は何という考もなかったから別段恐しいとも思わなかった。ただ彼の掌に載せられてスーと持ち上げられた時何だかフワフワした感じがあったばかりである。掌の上で少し落ちついて書生の顔を見たのがいわゆる人間というものの見始であろう。この時妙なものだと思った感じが今でも残っている。", 97 | ], 98 | [ 99 | "第4章", 100 | "第一毛をもって装飾されべきはずの顔がつるつるしてまるで薬缶だ。その後猫にもだいぶ逢ったがこんな片輪には一度も出会わした事がない。のみならず顔の真中があまりに突起している。", 101 | ], 102 | ]; 103 | -------------------------------------------------------------------------------- /tests/test-pipeline.rs: -------------------------------------------------------------------------------- 1 | // Input text is excerpted from public domain books on gutenberg.org or wikisource.org 2 | 3 | use elasticlunr::*; 4 | use std::fs::File; 5 | use std::io::{BufRead, BufReader, Read, Write}; 6 | use std::path::Path; 7 | 8 | #[allow(dead_code)] 9 | fn write_output(lang: &dyn Language) { 10 | let code = lang.code(); 11 | let base = Path::new(env!("CARGO_MANIFEST_DIR")) 12 | .join("tests") 13 | .join("data"); 14 | 15 | let input = base.join(&format!("{}.in.txt", code)); 16 | let mut input_str = String::new(); 17 | File::open(&input) 18 | .unwrap() 19 | .read_to_string(&mut input_str) 20 | .unwrap(); 21 | 22 | let output = base.join(&format!("{}.out.txt", code)); 23 | let mut output = File::create(&output).unwrap(); 24 | 25 | let pipeline = lang.make_pipeline(); 26 | let tokens = pipeline.run(lang.tokenize(&input_str)); 27 | 28 | for tok in tokens { 29 | writeln!(&mut output, "{}", tok).unwrap(); 30 | } 31 | } 32 | 33 | fn compare_to_fixture(lang: &dyn Language) { 34 | let code = lang.code(); 35 | let base = Path::new(env!("CARGO_MANIFEST_DIR")) 36 | .join("tests") 37 
| .join("data"); 38 | 39 | let input = base.join(&format!("{}.in.txt", code)); 40 | let mut input_str = String::new(); 41 | File::open(&input) 42 | .unwrap() 43 | .read_to_string(&mut input_str) 44 | .unwrap(); 45 | 46 | let output = base.join(&format!("{}.out.txt", code)); 47 | let mut output = BufReader::new(File::open(&output).unwrap()).lines(); 48 | 49 | let pipeline = lang.make_pipeline(); 50 | let tokens = pipeline.run(lang.tokenize(&input_str)); 51 | 52 | for tok in tokens { 53 | assert_eq!( 54 | tok, 55 | output.next().unwrap().unwrap(), 56 | "Comparing pipeline tokens to fixture for {}", 57 | lang.name() 58 | ); 59 | } 60 | } 61 | 62 | #[test] 63 | fn test_languages() { 64 | for lang in lang::languages() { 65 | //write_output(lang.as_ref()); 66 | compare_to_fixture(lang.as_ref()); 67 | } 68 | } 69 | --------------------------------------------------------------------------------