├── .gitignore ├── .travis.yml ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── data └── indices │ └── .gitignore └── src ├── analysis ├── filters │ ├── asciifolding.rs │ ├── lowercase.rs │ ├── mod.rs │ └── ngram.rs ├── lucene_asciifold.rs ├── mod.rs ├── ngram_generator.rs └── tokenizers │ ├── mod.rs │ ├── ngram.rs │ └── standard.rs ├── api ├── alias_api.rs ├── bulk_api.rs ├── document_api.rs ├── index_api.rs ├── mapping_api.rs ├── mod.rs ├── search_api.rs └── utils.rs ├── cluster ├── metadata │ ├── mod.rs │ └── name_registry.rs └── mod.rs ├── document.rs ├── index ├── maintenance.rs ├── metadata │ ├── file.rs │ ├── mod.rs │ └── parse │ │ ├── analysis_analyzer.rs │ │ ├── analysis_filter.rs │ │ ├── analysis_tokenizer.rs │ │ └── mod.rs └── mod.rs ├── main.rs ├── mapping ├── build.rs ├── mod.rs └── parse.rs ├── query_parser ├── and_query.rs ├── constant_score_query.rs ├── filtered_query.rs ├── match_all_query.rs ├── match_none_query.rs ├── match_query.rs ├── mod.rs ├── multi_match_query.rs ├── not_query.rs ├── or_query.rs ├── prefix_query.rs ├── term_query.rs ├── terms_query.rs └── utils.rs ├── search ├── backends │ ├── mod.rs │ └── rocksdb │ │ ├── benches │ │ ├── insert_document.rs │ │ └── merge_segments.rs │ │ ├── document_index.rs │ │ ├── key_builder.rs │ │ ├── mod.rs │ │ ├── search │ │ ├── mod.rs │ │ ├── planner │ │ │ ├── boolean_query.rs │ │ │ ├── mod.rs │ │ │ └── score_function.rs │ │ └── statistics.rs │ │ ├── segment.rs │ │ ├── segment_builder.rs │ │ ├── segment_manager.rs │ │ ├── segment_ops.rs │ │ ├── segment_stats.rs │ │ └── term_dictionary.rs ├── collectors │ ├── mod.rs │ ├── top_score.rs │ └── total_count.rs ├── document.rs ├── mod.rs ├── query │ ├── mod.rs │ ├── multi_term_selector.rs │ └── term_scorer.rs ├── schema.rs ├── segment.rs ├── similarity.rs ├── term.rs ├── term_vector.rs └── token.rs └── system.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | 3 | rust: 4 | - stable 5 | - nightly 6 | 7 | dist: trusty 8 | sudo: true 9 | 10 | addons: 11 | apt: 12 | sources: 13 | - ubuntu-toolchain-r-test 14 | packages: 15 | - gcc-5 16 | - g++-5 17 | 18 | script: 19 | - cargo test 20 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rusticsearch" 3 | version = "0.0.2" 4 | authors = ["Karl Hobley "] 5 | description = "A lightweight, Elasticsearch-compatible search server (early WIP)" 6 | readme = "README.md" 7 | license = "Apache-2.0" 8 | 9 | [workspace] 10 | 11 | [[bin]] 12 | name = "rusticsearch" 13 | 14 | [dependencies] 15 | iron = "0.4.0" 16 | router = "0.2.0" 17 | persistent = "0.2.0" 18 | url = "1.1.1" 19 | unicode-segmentation = "0.1.2" 20 | maplit = "0.1.3" 21 | chrono = { version = "0.4", features = ["serde"] } 22 | roaring = "0.5.0" 23 | byteorder = "0.5" 24 | slog = "2.0" 25 | slog-term = "2.3" 26 | slog-async = "2.1" 27 | uuid = { version = "0.3", features = ["v4"] } 28 | serde = "1.0" 29 | serde_derive = "1.0" 30 | serde_json = "1.0" 31 | atomicwrites = "0.1" 32 | fnv = "1.0" 33 | bitflags = "0.7.0" 34 | rocksdb = "0.10" 35 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Rusticsearch 2 | 3 | **Not actively developed. Please check out [toshi](https://github.com/hntd187/toshi) instead!** 4 | 5 | A lightweight, Elasticsearch-compatible search server. 6 | 7 | ## Why? 8 | 9 | A good-quality search engine is important for many websites, and Elasticsearch provides one with an easy-to-use REST API. The problem is that Elasticsearch requires a minimum of 2GB of memory, which makes it expensive to run. 10 | 11 | The aim of this project is to build a new search server that takes the powerful search features and simple API of Elasticsearch, but implements them in a language with more control over memory usage. We aim to keep memory usage below 100MB (excluding cache), so it should be very cheap to run. 12 | 13 | ## Project Goals 14 | 15 | - Decent performance with predictable resource usage 16 | - Focus on simplicity and stability over features 17 | - Elasticsearch compatibility 18 | - Simple to install and operate 19 | 20 | ## Why Rust? 21 | 22 | [Rust](https://www.rust-lang.org/) is a systems programming language from Mozilla that's designed for building fast, secure and reliable software. 23 | 24 | Rust frees memory as it goes rather than leaving unused memory to be collected later by a "garbage collector", as Java does. In Elasticsearch, this heap of garbage can waste gigabytes of memory that could otherwise be used as cache. 25 | 26 | ## Status 27 | 28 | Please consider this project pre-alpha quality. It currently only supports a subset of Elasticsearch's APIs, 29 | which is probably not enough to run most applications. 30 | 31 | It currently supports indexing, both in bulk and individually (however, the bulk indexer is quite slow at the moment), 32 | and searching using the BM25 similarity algorithm. 33 | 34 | See the [roadmap](https://github.com/kaedroho/rusticsearch/wiki/Initial-development-roadmap) for a list of things 35 | being worked on at the moment. 36 | 37 | ### TODO before first alpha release 38 | 39 | - [ ] Make the bulk indexing API faster (it currently indexes each document individually instead of batching) 40 | - [ ] Implement persistence for analyzers and aliases 41 | - [ ] Implement a method of configuring the server from an external configuration file 42 | 43 | ### Elasticsearch compatibility 44 | 45 | See [Elasticsearch query DSL support](https://github.com/kaedroho/rusticsearch/wiki/Elasticsearch-query-DSL-support). 46 | 47 | ## Running it 48 | 49 | Rusticsearch has only been officially tested on Linux and Windows, but it should also run on Mac OS X. 50 | 51 | Rusticsearch can be compiled with Rust stable 1.15 or later. You can [download it from the Rust website](https://www.rust-lang.org/en-US/downloads.html) or use [rustup](https://github.com/rust-lang-nursery/rustup.rs). 52 | 53 | Once Rust is installed, clone the repo and run ``cargo run``: 54 | 55 | ``` 56 | git clone git@github.com:kaedroho/rusticsearch.git 57 | cd rusticsearch 58 | cargo run 59 | ``` 60 | -------------------------------------------------------------------------------- /data/indices/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /src/analysis/filters/asciifolding.rs: -------------------------------------------------------------------------------- 1 | //! 
Converts any non-ASCII character into ASCII if a reasonable equivalent exists 2 | //! 3 | //! For example, "Ĥéllø" is converted to "Hello" but non-Latin scripts such as 4 | //! Arabic or Hiragana are not changed. 5 | 6 | use std::str; 7 | 8 | use search::{Term, Token}; 9 | 10 | use analysis::lucene_asciifold::fold_to_ascii; 11 | 12 | 13 | pub struct ASCIIFoldingFilter<'a> { 14 | tokens: Box<Iterator<Item=Token> + 'a>, 15 | } 16 | 17 | 18 | impl<'a> ASCIIFoldingFilter<'a> { 19 | pub fn new(tokens: Box<Iterator<Item=Token> + 'a>) -> ASCIIFoldingFilter<'a> { 20 | ASCIIFoldingFilter { 21 | tokens: tokens, 22 | } 23 | } 24 | } 25 | 26 | 27 | impl<'a> Iterator for ASCIIFoldingFilter<'a> { 28 | type Item = Token; 29 | 30 | fn next(&mut self) -> Option<Token> { 31 | match self.tokens.next() { 32 | Some(token) => { 33 | Some(Token { 34 | term: match str::from_utf8(token.term.as_bytes()) { 35 | Ok(ref string) => { 36 | Term::from_string(&fold_to_ascii(string)) 37 | } 38 | _ => token.term.clone(), 39 | }, 40 | position: token.position, 41 | }) 42 | } 43 | None => None 44 | } 45 | } 46 | } 47 | 48 | 49 | #[cfg(test)] 50 | mod tests { 51 | use search::{Term, Token}; 52 | 53 | use super::ASCIIFoldingFilter; 54 | 55 | #[test] 56 | fn test_simple() { 57 | let mut tokens: Vec<Token> = vec![ 58 | Token { term: Term::from_string("Ĥéllø"), position: 1 }, 59 | ]; 60 | 61 | let token_filter = ASCIIFoldingFilter::new(Box::new(tokens.drain(..))); 62 | let tokens = token_filter.collect::<Vec<Token>>(); 63 | 64 | assert_eq!(tokens, vec![ 65 | Token { term: Term::from_string("Hello"), position: 1 } 66 | ]); 67 | } 68 | 69 | #[test] 70 | fn test_hiragana_not_changed() { 71 | let mut tokens: Vec<Token> = vec![ 72 | Token { term: Term::from_string("こんにちは"), position: 1 }, 73 | Token { term: Term::from_string("ハチ公"), position: 2 }, 74 | ]; 75 | 76 | let token_filter = ASCIIFoldingFilter::new(Box::new(tokens.drain(..))); 77 | let tokens = token_filter.collect::<Vec<Token>>(); 78 | 79 | assert_eq!(tokens, vec![ 80 | Token { term: Term::from_string("こんにちは"), position: 1 }, 81 | Token { term: Term::from_string("ハチ公"), position: 2 }, 82 | ]); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/analysis/filters/lowercase.rs: -------------------------------------------------------------------------------- 1 | //! Converts each token into lowercase 2 | 3 | use std::str; 4 | 5 | use search::{Term, Token}; 6 | 7 | 8 | pub struct LowercaseFilter<'a> { 9 | tokens: Box<Iterator<Item=Token> + 'a>, 10 | } 11 | 12 | 13 | impl<'a> LowercaseFilter<'a> { 14 | pub fn new(tokens: Box<Iterator<Item=Token> + 'a>) -> LowercaseFilter<'a> { 15 | LowercaseFilter { 16 | tokens: tokens, 17 | } 18 | } 19 | } 20 | 21 | 22 | impl<'a> Iterator for LowercaseFilter<'a> { 23 | type Item = Token; 24 | 25 | fn next(&mut self) -> Option<Token> { 26 | match self.tokens.next() { 27 | Some(token) => { 28 | Some(Token { 29 | term: match str::from_utf8(token.term.as_bytes()) { 30 | Ok(string) => { 31 | // TODO: Can this be done in place? 
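// Note: probably not in general, since Unicode lowercasing can change a string's length (e.g. 'İ' lowercases to "i" plus a combining dot above), so a new String has to be allocated here.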
32 | Term::from_string(&string.to_lowercase()) 33 | } 34 | _ => token.term.clone(), 35 | }, 36 | position: token.position, 37 | }) 38 | } 39 | None => None 40 | } 41 | } 42 | } 43 | 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | use search::{Term, Token}; 48 | 49 | use super::LowercaseFilter; 50 | 51 | #[test] 52 | fn test_lowercase_filter() { 53 | let mut tokens: Vec = vec![ 54 | Token { term: Term::from_string("Hulk"), position: 1 }, 55 | Token { term: Term::from_string("SMASH"), position: 2 } 56 | ]; 57 | 58 | let token_filter = LowercaseFilter::new(Box::new(tokens.drain(..))); 59 | let tokens = token_filter.collect::>(); 60 | 61 | assert_eq!(tokens, vec![ 62 | Token { term: Term::from_string("hulk"), position: 1 }, 63 | Token { term: Term::from_string("smash"), position: 2 } 64 | ]); 65 | } 66 | 67 | #[test] 68 | fn test_lowercase_filter_cjk() { 69 | let mut tokens: Vec = vec![ 70 | Token { term: Term::from_string("こんにちは"), position: 1 }, 71 | Token { term: Term::from_string("ハチ公"), position: 2 }, 72 | Token { term: Term::from_string("Test"), position: 3 } 73 | ]; 74 | 75 | let token_filter = LowercaseFilter::new(Box::new(tokens.drain(..))); 76 | let tokens = token_filter.collect::>(); 77 | 78 | assert_eq!(tokens, vec![ 79 | Token { term: Term::from_string("こんにちは"), position: 1 }, 80 | Token { term: Term::from_string("ハチ公"), position: 2 }, 81 | Token { term: Term::from_string("test"), position: 3 } 82 | ]); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/analysis/filters/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod lowercase; 2 | pub mod ngram; 3 | pub mod asciifolding; 4 | 5 | use serde::{Serialize, Serializer}; 6 | use search::Token; 7 | 8 | use analysis::ngram_generator::Edge; 9 | use analysis::filters::lowercase::LowercaseFilter; 10 | use analysis::filters::ngram::NGramFilter; 11 | use analysis::filters::asciifolding::ASCIIFoldingFilter; 12 | 13 | 14 | /// Defines a token filter 15 | /// 16 | /// You can use this to define a token filter before having to bind it to any data 17 | /// 18 | /// # Examples 19 | /// 20 | /// ``` 21 | /// use search::{Term, Token}; 22 | /// use search::analysis::tokenizers::TokenizerSpec; 23 | /// use search::analysis::filters::FilterSpec; 24 | /// 25 | /// let standard_tokenizer = TokenizerSpec::Standard; 26 | /// let token_stream = standard_tokenizer.initialise("Hello, WORLD!"); 27 | /// 28 | /// // Lowercase filter 29 | /// let lowercase_filter = FilterSpec::Lowercase; 30 | /// let filtered_token_stream = lowercase_filter.initialise(token_stream); 31 | /// 32 | /// let tokens = filtered_token_stream.collect::>(); 33 | /// 34 | /// assert_eq!(tokens, vec![ 35 | /// Token { term: Term::from_string("hello"), position: 1 }, 36 | /// Token { term: Term::from_string("world"), position: 2 }, 37 | /// ]); 38 | /// ``` 39 | #[derive(Debug, Clone, PartialEq)] 40 | pub enum FilterSpec { 41 | Lowercase, 42 | NGram { 43 | min_size: usize, 44 | max_size: usize, 45 | edge: Edge, 46 | }, 47 | ASCIIFolding, 48 | } 49 | 50 | 51 | impl FilterSpec { 52 | pub fn initialise<'a>(&self, input: Box + 'a>) -> Box + 'a> { 53 | match *self { 54 | FilterSpec::Lowercase => { 55 | Box::new(LowercaseFilter::new(input)) 56 | } 57 | FilterSpec::NGram{min_size, max_size, edge} => { 58 | Box::new(NGramFilter::new(input, min_size, max_size, edge)) 59 | } 60 | FilterSpec::ASCIIFolding => { 61 | Box::new(ASCIIFoldingFilter::new(input)) 62 | } 63 | } 64 | } 65 | } 66 | 67 | 68 | 
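// Serializes the filter spec as Elasticsearch-style analysis settings JSON: a "type" key, plus "min_gram", "max_gram" and (for edge ngrams) "side" where applicable, as emitted below.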
impl Serialize for FilterSpec { 69 | fn serialize(&self, serializer: S) -> Result { 70 | let json = match *self { 71 | FilterSpec::Lowercase => { 72 | json!({ 73 | "type": "lowercase", 74 | }) 75 | } 76 | FilterSpec::NGram{min_size, max_size, edge} => { 77 | match edge { 78 | Edge::Left => { 79 | json!({ 80 | "type": "edgeNGram", 81 | "side": "front", 82 | "min_gram": min_size, 83 | "max_gram": max_size, 84 | }) 85 | } 86 | Edge::Right => { 87 | json!({ 88 | "type": "edgeNGram", 89 | "side": "back", 90 | "min_gram": min_size, 91 | "max_gram": max_size, 92 | }) 93 | } 94 | Edge::Neither => { 95 | json!({ 96 | "type": "ngram", 97 | "min_gram": min_size, 98 | "max_gram": max_size, 99 | }) 100 | } 101 | } 102 | } 103 | FilterSpec::ASCIIFolding => { 104 | json!({ 105 | "type": "asciifolding", 106 | }) 107 | } 108 | }; 109 | 110 | json.serialize(serializer) 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/analysis/filters/ngram.rs: -------------------------------------------------------------------------------- 1 | //! Generates a set of "ngram" tokens for each source token 2 | 3 | use std::collections::VecDeque; 4 | use std::str; 5 | 6 | use search::{Term, Token}; 7 | 8 | use analysis::ngram_generator::{Edge, NGramGenerator}; 9 | 10 | 11 | pub struct NGramFilter<'a> { 12 | tokens: Box + 'a>, 13 | min_size: usize, 14 | max_size: usize, 15 | edge: Edge, 16 | output_buffer: VecDeque, 17 | } 18 | 19 | 20 | impl<'a> NGramFilter<'a> { 21 | pub fn new(tokens: Box +'a >, min_size: usize, max_size: usize, edge: Edge) -> NGramFilter<'a> { 22 | NGramFilter { 23 | tokens: tokens, 24 | min_size: min_size, 25 | max_size: max_size, 26 | edge: edge, 27 | output_buffer: VecDeque::new(), 28 | } 29 | } 30 | } 31 | 32 | 33 | impl<'a> Iterator for NGramFilter<'a> { 34 | type Item = Token; 35 | 36 | fn next(&mut self) -> Option { 37 | while self.output_buffer.is_empty() { 38 | // Generate ngrams for next token 39 | let token = self.tokens.next(); 40 | 41 | match token { 42 | Some(token) => { 43 | if let Ok(ref word) = str::from_utf8(&token.term.as_bytes()) { 44 | let ngram_generator = NGramGenerator::new(&word, self.min_size, self.max_size, self.edge); 45 | 46 | for gram in ngram_generator { 47 | self.output_buffer.push_back(Token { 48 | term: Term::from_string(gram), 49 | position: token.position, 50 | }); 51 | } 52 | } 53 | } 54 | None => return None 55 | } 56 | } 57 | 58 | self.output_buffer.pop_front() 59 | } 60 | } 61 | 62 | 63 | #[cfg(test)] 64 | mod tests { 65 | use search::{Term, Token}; 66 | 67 | use analysis::ngram_generator::Edge; 68 | 69 | use super::NGramFilter; 70 | 71 | #[test] 72 | fn test_ngram_filter() { 73 | let mut tokens: Vec = vec![ 74 | Token { term: Term::from_string("hello"), position: 1 }, 75 | ]; 76 | 77 | let token_filter = NGramFilter::new(Box::new(tokens.drain(..)), 2, 3, Edge::Neither); 78 | let tokens = token_filter.collect::>(); 79 | 80 | assert_eq!(tokens, vec![ 81 | Token { term: Term::from_string("he"), position: 1 }, 82 | Token { term: Term::from_string("hel"), position: 1 }, 83 | Token { term: Term::from_string("el"), position: 1 }, 84 | Token { term: Term::from_string("ell"), position: 1 }, 85 | Token { term: Term::from_string("ll"), position: 1 }, 86 | Token { term: Term::from_string("llo"), position: 1 }, 87 | Token { term: Term::from_string("lo"), position: 1 }, 88 | ]); 89 | } 90 | 91 | #[test] 92 | fn test_edgengram_filter() { 93 | let mut tokens: Vec = vec![ 94 | Token { term: Term::from_string("hello"), position: 
1 }, 95 | Token { term: Term::from_string("world"), position: 2 } 96 | ]; 97 | 98 | let token_filter = NGramFilter::new(Box::new(tokens.drain(..)), 2, 3, Edge::Left); 99 | let tokens = token_filter.collect::>(); 100 | 101 | assert_eq!(tokens, vec![ 102 | Token { term: Term::from_string("he"), position: 1 }, 103 | Token { term: Term::from_string("hel"), position: 1 }, 104 | Token { term: Term::from_string("wo"), position: 2 }, 105 | Token { term: Term::from_string("wor"), position: 2 }, 106 | ]); 107 | } 108 | 109 | #[test] 110 | fn test_edgengram_filter_max_size() { 111 | let mut tokens: Vec = vec![ 112 | Token { term: Term::from_string("hello"), position: 1 }, 113 | ]; 114 | 115 | let token_filter = NGramFilter::new(Box::new(tokens.drain(..)), 2, 1000, Edge::Left); 116 | let tokens = token_filter.collect::>(); 117 | 118 | assert_eq!(tokens, vec![ 119 | Token { term: Term::from_string("he"), position: 1 }, 120 | Token { term: Term::from_string("hel"), position: 1 }, 121 | Token { term: Term::from_string("hell"), position: 1 }, 122 | Token { term: Term::from_string("hello"), position: 1 }, 123 | ]); 124 | } 125 | 126 | #[test] 127 | fn test_edgengram_filter_right() { 128 | let mut tokens: Vec = vec![ 129 | Token { term: Term::from_string("hello"), position: 1 }, 130 | Token { term: Term::from_string("world"), position: 2 } 131 | ]; 132 | 133 | let token_filter = NGramFilter::new(Box::new(tokens.drain(..)), 2, 3, Edge::Right); 134 | let tokens = token_filter.collect::>(); 135 | 136 | assert_eq!(tokens, vec![ 137 | Token { term: Term::from_string("lo"), position: 1 }, 138 | Token { term: Term::from_string("llo"), position: 1 }, 139 | Token { term: Term::from_string("ld"), position: 2 }, 140 | Token { term: Term::from_string("rld"), position: 2 }, 141 | ]); 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /src/analysis/mod.rs: -------------------------------------------------------------------------------- 1 | //! The analysis module 2 | //! 3 | //! This module provides a library of tools for breaking down a string of text 4 | //! into Tokens. 5 | //! 6 | //! These tools are sorted into three categories: 7 | //! 8 | //! - Tokenisers split a string of text into a stream of tokens 9 | //! - Filters apply transformations to streams of tokens 10 | //! 
- Analyzers are a combination of a tokeniser and a group of filters 11 | 12 | pub mod ngram_generator; 13 | pub mod lucene_asciifold; 14 | pub mod tokenizers; 15 | pub mod filters; 16 | 17 | use search::token::Token; 18 | 19 | use analysis::tokenizers::TokenizerSpec; 20 | use analysis::filters::FilterSpec; 21 | 22 | 23 | /// Defines an analyzer 24 | /// 25 | /// You can use this to define an analyzer before having to bind it to any data 26 | /// 27 | /// # Examples 28 | /// 29 | /// ``` 30 | /// use search::{Term, Token}; 31 | /// use search::analysis::tokenizers::TokenizerSpec; 32 | /// use search::analysis::filters::FilterSpec; 33 | /// use search::analysis::AnalyzerSpec; 34 | /// 35 | /// // Define an analyzer that splits words and converts them into lowercase 36 | /// let analyzer = AnalyzerSpec { 37 | /// tokenizer: TokenizerSpec::Standard, 38 | /// filters: vec![ 39 | /// FilterSpec::Lowercase, 40 | /// ] 41 | /// }; 42 | /// 43 | /// let token_stream = analyzer.initialise("Hello, WORLD!"); 44 | /// let tokens = token_stream.collect::>(); 45 | /// 46 | /// assert_eq!(tokens, vec![ 47 | /// Token { term: Term::from_string("hello"), position: 1 }, 48 | /// Token { term: Term::from_string("world"), position: 2 }, 49 | /// ]); 50 | /// ``` 51 | #[derive(Debug, Clone, PartialEq)] 52 | pub struct AnalyzerSpec { 53 | pub tokenizer: TokenizerSpec, 54 | pub filters: Vec, 55 | } 56 | 57 | 58 | impl AnalyzerSpec { 59 | pub fn initialise<'a>(&self, input: &'a str) -> Box + 'a> { 60 | let mut analyzer = self.tokenizer.initialise(input); 61 | 62 | for filter in self.filters.iter() { 63 | analyzer = filter.initialise(analyzer); 64 | } 65 | 66 | analyzer 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/analysis/ngram_generator.rs: -------------------------------------------------------------------------------- 1 | use std::cmp; 2 | 3 | use unicode_segmentation::UnicodeSegmentation; 4 | 5 | 6 | #[derive(Debug, PartialEq, Clone, Copy)] 7 | pub enum Edge { 8 | Neither, 9 | Left, 10 | Right, 11 | } 12 | 13 | 14 | #[derive(Debug)] 15 | pub struct NGramGenerator<'a> { 16 | word: &'a str, 17 | word_len: usize, 18 | min_size: usize, 19 | max_size: usize, 20 | edge: Edge, 21 | 22 | current_position: usize, 23 | current_size: usize, 24 | finished: bool, 25 | } 26 | 27 | 28 | impl <'a> NGramGenerator<'a> { 29 | pub fn new(word: &'a str, min_size: usize, max_size: usize, edge: Edge) -> NGramGenerator { 30 | let word_len = word.graphemes(true).count(); 31 | let max_size = cmp::min(max_size, word_len); 32 | 33 | NGramGenerator { 34 | word: word, 35 | word_len: word_len, 36 | min_size: min_size, 37 | max_size: max_size, 38 | edge: edge, 39 | 40 | current_position: 0, 41 | current_size: min_size, 42 | finished: (max_size == 0 || min_size > word_len), 43 | } 44 | } 45 | 46 | /// Retrieve the current gram 47 | #[inline] 48 | fn current_gram(&self) -> &'a str { 49 | let mut start = self.current_position; 50 | 51 | // On right edge we take from the end of the string, instead of the beginning 52 | if self.edge == Edge::Right { 53 | start += self.word_len - self.current_position - self.current_size; 54 | } 55 | 56 | // Find byte positions of the first and last graphemes we are interested in 57 | let mut grapheme_indices = self.word.grapheme_indices(true).skip(start).take(self.current_size); 58 | 59 | let first_grapheme = match grapheme_indices.next() { 60 | Some(first_grapheme) => first_grapheme, 61 | None => return "", 62 | }; 63 | 64 | let last_grapheme = match 
grapheme_indices.last() { 65 | Some(last_grapheme) => last_grapheme, 66 | None => first_grapheme, 67 | }; 68 | 69 | // Slice the original string using the byte positions of first/last grapheme 70 | let first_byte = first_grapheme.0; 71 | let last_byte = last_grapheme.0 + last_grapheme.1.len(); 72 | &self.word[first_byte..last_byte] 73 | } 74 | 75 | #[inline] 76 | fn current_max_size(&self) -> usize { 77 | cmp::min(self.max_size, self.word_len - self.current_position) 78 | } 79 | 80 | /// Advance to the next gram 81 | /// Note: this will set the "finished" attribute if there are no grams left 82 | fn next_gram(&mut self) { 83 | self.current_size += 1; 84 | 85 | if self.current_size > self.current_max_size() { 86 | if self.edge != Edge::Neither { 87 | self.finished = true; 88 | return; 89 | } 90 | 91 | self.current_size = self.min_size; 92 | self.current_position += 1; 93 | 94 | if self.current_size > self.current_max_size() { 95 | self.finished = true; 96 | return; 97 | } 98 | } 99 | } 100 | } 101 | 102 | 103 | impl <'a> Iterator for NGramGenerator<'a> { 104 | type Item = &'a str; 105 | 106 | fn next(&mut self) -> Option<&'a str> { 107 | if self.finished { 108 | return None; 109 | } 110 | 111 | let gram = self.current_gram(); 112 | self.next_gram(); 113 | Some(gram) 114 | } 115 | } 116 | 117 | 118 | #[cfg(test)] 119 | mod tests { 120 | use super::{Edge, NGramGenerator}; 121 | 122 | #[test] 123 | fn test_ngram() { 124 | let gen = NGramGenerator::new("hello", 2, 3, Edge::Neither); 125 | let ngrams = gen.collect::>(); 126 | 127 | assert_eq!(ngrams, vec![ 128 | "he", "hel", "el", "ell", "ll", "llo", "lo" 129 | ]); 130 | } 131 | 132 | #[test] 133 | fn test_ngram_left_edge() { 134 | let gen = NGramGenerator::new("hello", 2, 4, Edge::Left); 135 | let ngrams = gen.collect::>(); 136 | 137 | assert_eq!(ngrams, vec![ 138 | "he", "hel", "hell" 139 | ]); 140 | } 141 | 142 | #[test] 143 | fn test_ngram_right_edge() { 144 | let gen = NGramGenerator::new("hello", 2, 4, Edge::Right); 145 | let ngrams = gen.collect::>(); 146 | 147 | assert_eq!(ngrams, vec![ 148 | "lo", "llo", "ello" 149 | ]); 150 | } 151 | 152 | #[test] 153 | fn test_ngram_cjk() { 154 | let gen = NGramGenerator::new("こんにちは", 2, 3, Edge::Neither); 155 | let ngrams = gen.collect::>(); 156 | 157 | assert_eq!(ngrams, vec![ 158 | "こん", "こんに", "んに", "んにち", "にち", "にちは", "ちは" 159 | ]); 160 | } 161 | 162 | #[test] 163 | fn test_ngram_graphemes() { 164 | let gen = NGramGenerator::new("u͔n͈̰̎i̙̮͚̦c͚̉o̼̩̰͗d͔̆̓ͥé", 2, 3, Edge::Neither); 165 | let ngrams = gen.collect::>(); 166 | 167 | assert_eq!(ngrams, vec![ 168 | "u\u{354}n\u{30e}\u{348}\u{330}", 169 | "u\u{354}n\u{30e}\u{348}\u{330}i\u{319}\u{32e}\u{35a}\u{326}", 170 | "n\u{30e}\u{348}\u{330}i\u{319}\u{32e}\u{35a}\u{326}", 171 | "n\u{30e}\u{348}\u{330}i\u{319}\u{32e}\u{35a}\u{326}c\u{309}\u{35a}", 172 | "i\u{319}\u{32e}\u{35a}\u{326}c\u{309}\u{35a}", 173 | "i\u{319}\u{32e}\u{35a}\u{326}c\u{309}\u{35a}o\u{357}\u{33c}\u{329}\u{330}", 174 | "c\u{309}\u{35a}o\u{357}\u{33c}\u{329}\u{330}", 175 | "c\u{309}\u{35a}o\u{357}\u{33c}\u{329}\u{330}d\u{306}\u{343}\u{365}\u{354}", 176 | "o\u{357}\u{33c}\u{329}\u{330}d\u{306}\u{343}\u{365}\u{354}", 177 | "o\u{357}\u{33c}\u{329}\u{330}d\u{306}\u{343}\u{365}\u{354}e\u{301}", 178 | "d\u{306}\u{343}\u{365}\u{354}e\u{301}", 179 | ]); 180 | } 181 | 182 | #[test] 183 | fn test_ngram_blank_string() { 184 | let gen = NGramGenerator::new("", 2, 3, Edge::Neither); 185 | let ngrams = gen.collect::>(); 186 | 187 | let empty_result: Vec<&str> = vec![]; 188 | assert_eq!(ngrams, 
empty_result); 189 | } 190 | 191 | #[test] 192 | fn test_ngram_high_size() { 193 | let gen = NGramGenerator::new("hello", 20, 20, Edge::Neither); 194 | let ngrams = gen.collect::>(); 195 | 196 | let empty_result: Vec<&str> = vec![]; 197 | assert_eq!(ngrams, empty_result); 198 | } 199 | 200 | #[test] 201 | fn test_ngram_zero_size() { 202 | let gen = NGramGenerator::new("hello", 0, 0, Edge::Neither); 203 | let ngrams = gen.collect::>(); 204 | 205 | let empty_result: Vec<&str> = vec![]; 206 | assert_eq!(ngrams, empty_result); 207 | } 208 | 209 | #[test] 210 | fn test_ngram_invalid_size() { 211 | // TODO: Should this panic? 212 | let gen = NGramGenerator::new("hello", 20, 5, Edge::Neither); 213 | let ngrams = gen.collect::>(); 214 | 215 | let empty_result: Vec<&str> = vec![]; 216 | assert_eq!(ngrams, empty_result); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/analysis/tokenizers/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod standard; 2 | pub mod ngram; 3 | 4 | use serde::{Serialize, Serializer}; 5 | use search::token::Token; 6 | 7 | use analysis::ngram_generator::Edge; 8 | use analysis::filters::lowercase::LowercaseFilter; 9 | use analysis::tokenizers::standard::StandardTokenizer; 10 | use analysis::tokenizers::ngram::NGramTokenizer; 11 | 12 | 13 | /// Defines a tokenizer 14 | /// 15 | /// You can use this to define a tokenizer before having to bind it to any data 16 | /// 17 | /// # Examples 18 | /// 19 | /// ``` 20 | /// use search::{Term, Token}; 21 | /// use search::analysis::tokenizers::TokenizerSpec; 22 | /// 23 | /// let standard_tokenizer = TokenizerSpec::Standard; 24 | /// let token_stream = standard_tokenizer.initialise("Hello, world!"); 25 | /// 26 | /// let tokens = token_stream.collect::>(); 27 | /// 28 | /// assert_eq!(tokens, vec![ 29 | /// Token { term: Term::from_string("Hello"), position: 1 }, 30 | /// Token { term: Term::from_string("world"), position: 2 }, 31 | /// ]); 32 | /// ``` 33 | #[derive(Debug, Clone, PartialEq)] 34 | pub enum TokenizerSpec { 35 | Standard, 36 | Lowercase, 37 | NGram { 38 | min_size: usize, 39 | max_size: usize, 40 | edge: Edge, 41 | } 42 | } 43 | 44 | 45 | impl TokenizerSpec { 46 | pub fn initialise<'a>(&self, input: &'a str) -> Box + 'a> { 47 | match *self { 48 | TokenizerSpec::Standard => { 49 | Box::new(StandardTokenizer::new(input)) 50 | } 51 | TokenizerSpec::Lowercase => { 52 | Box::new(LowercaseFilter::new(Box::new(StandardTokenizer::new(input)))) 53 | } 54 | TokenizerSpec::NGram{min_size, max_size, edge} => { 55 | Box::new(NGramTokenizer::new(input, min_size, max_size, edge)) 56 | } 57 | } 58 | } 59 | } 60 | 61 | impl Serialize for TokenizerSpec { 62 | fn serialize(&self, serializer: S) -> Result { 63 | let json = match *self { 64 | TokenizerSpec::Standard => { 65 | json!({ 66 | "type": "standard", 67 | }) 68 | } 69 | TokenizerSpec::Lowercase => { 70 | json!({ 71 | "type": "lowercase", 72 | }) 73 | } 74 | TokenizerSpec::NGram{min_size, max_size, edge} => { 75 | match edge { 76 | Edge::Left => { 77 | json!({ 78 | "type": "edgeNGram", 79 | "side": "front", 80 | "min_gram": min_size, 81 | "max_gram": max_size, 82 | }) 83 | } 84 | Edge::Right => { 85 | json!({ 86 | "type": "edgeNGram", 87 | "side": "back", 88 | "min_gram": min_size, 89 | "max_gram": max_size, 90 | }) 91 | } 92 | Edge::Neither => { 93 | json!({ 94 | "type": "ngram", 95 | "min_gram": min_size, 96 | "max_gram": max_size, 97 | }) 98 | } 99 | } 100 | } 101 | }; 102 | 
103 | json.serialize(serializer) 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/analysis/tokenizers/ngram.rs: -------------------------------------------------------------------------------- 1 | use unicode_segmentation::{UnicodeSegmentation, UnicodeWords}; 2 | 3 | use search::{Term, Token}; 4 | 5 | use analysis::ngram_generator::{Edge, NGramGenerator}; 6 | 7 | 8 | pub struct NGramTokenizer<'a> { 9 | unicode_words: UnicodeWords<'a>, 10 | min_size: usize, 11 | max_size: usize, 12 | edge: Edge, 13 | position_counter: u32, 14 | ngram_generator: Option>, 15 | } 16 | 17 | 18 | impl<'a> NGramTokenizer<'a> { 19 | pub fn new(input: &'a str, min_size: usize, max_size: usize, edge: Edge) -> NGramTokenizer<'a> { 20 | NGramTokenizer { 21 | unicode_words: input.unicode_words(), 22 | min_size: min_size, 23 | max_size: max_size, 24 | edge: edge, 25 | position_counter: 0, 26 | ngram_generator: None 27 | } 28 | } 29 | } 30 | 31 | 32 | impl<'a> Iterator for NGramTokenizer<'a> { 33 | type Item = Token; 34 | 35 | fn next(&mut self) -> Option { 36 | loop { 37 | // Get next ngram 38 | if let Some(ref mut ngram_generator) = self.ngram_generator { 39 | if let Some(gram) = ngram_generator.next() { 40 | return Some(Token { 41 | term: Term::from_string(gram), 42 | position: self.position_counter, 43 | }); 44 | } 45 | } 46 | 47 | // No more ngrams for this word, get next word 48 | let word = self.unicode_words.next(); 49 | 50 | match word { 51 | Some(word) => { 52 | self.position_counter += 1; 53 | self.ngram_generator = Some( 54 | NGramGenerator::new(word, self.min_size, self.max_size, self.edge) 55 | ); 56 | } 57 | None => return None, 58 | } 59 | } 60 | } 61 | } 62 | 63 | 64 | #[cfg(test)] 65 | mod tests { 66 | use search::{Term, Token}; 67 | 68 | use analysis::ngram_generator::Edge; 69 | 70 | use super::NGramTokenizer; 71 | 72 | 73 | #[test] 74 | fn test_ngram_tokenizer() { 75 | let tokenizer = NGramTokenizer::new("hello", 2, 3, Edge::Neither); 76 | let tokens = tokenizer.collect::>(); 77 | 78 | assert_eq!(tokens, vec![ 79 | Token { term: Term::from_string("he"), position: 1 }, 80 | Token { term: Term::from_string("hel"), position: 1 }, 81 | Token { term: Term::from_string("el"), position: 1 }, 82 | Token { term: Term::from_string("ell"), position: 1 }, 83 | Token { term: Term::from_string("ll"), position: 1 }, 84 | Token { term: Term::from_string("llo"), position: 1 }, 85 | Token { term: Term::from_string("lo"), position: 1 }, 86 | ]); 87 | } 88 | 89 | #[test] 90 | fn test_edgengram_tokenizer() { 91 | let tokenizer = NGramTokenizer::new("hello world", 2, 3, Edge::Left); 92 | let tokens = tokenizer.collect::>(); 93 | 94 | assert_eq!(tokens, vec![ 95 | Token { term: Term::from_string("he"), position: 1 }, 96 | Token { term: Term::from_string("hel"), position: 1 }, 97 | Token { term: Term::from_string("wo"), position: 2 }, 98 | Token { term: Term::from_string("wor"), position: 2 }, 99 | ]); 100 | } 101 | 102 | #[test] 103 | fn test_edgengram_tokenizer_max_size() { 104 | let tokenizer = NGramTokenizer::new("hello", 2, 1000, Edge::Left); 105 | let tokens = tokenizer.collect::>(); 106 | 107 | assert_eq!(tokens, vec![ 108 | Token { term: Term::from_string("he"), position: 1 }, 109 | Token { term: Term::from_string("hel"), position: 1 }, 110 | Token { term: Term::from_string("hell"), position: 1 }, 111 | Token { term: Term::from_string("hello"), position: 1 }, 112 | ]); 113 | } 114 | 115 | #[test] 116 | fn test_edgengram_tokenizer_right() { 117 | let tokenizer 
= NGramTokenizer::new("hello world", 2, 3, Edge::Right); 118 | let tokens = tokenizer.collect::>(); 119 | 120 | assert_eq!(tokens, vec![ 121 | Token { term: Term::from_string("lo"), position: 1 }, 122 | Token { term: Term::from_string("llo"), position: 1 }, 123 | Token { term: Term::from_string("ld"), position: 2 }, 124 | Token { term: Term::from_string("rld"), position: 2 }, 125 | ]); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/analysis/tokenizers/standard.rs: -------------------------------------------------------------------------------- 1 | //! Splits strings by word boundaries, according to the Unicode Standard [Annex #29](http://unicode.org/reports/tr29/) rules 2 | 3 | use unicode_segmentation::{UnicodeSegmentation, UnicodeWords}; 4 | 5 | use search::{Term, Token}; 6 | 7 | 8 | pub struct StandardTokenizer<'a> { 9 | unicode_words: UnicodeWords<'a>, 10 | position_counter: u32, 11 | } 12 | 13 | 14 | impl<'a> StandardTokenizer<'a> { 15 | pub fn new(input: &'a str) -> StandardTokenizer<'a> { 16 | StandardTokenizer { 17 | unicode_words: input.unicode_words(), 18 | position_counter: 0, 19 | } 20 | } 21 | } 22 | 23 | 24 | impl<'a> Iterator for StandardTokenizer<'a> { 25 | type Item = Token; 26 | 27 | fn next(&mut self) -> Option { 28 | match self.unicode_words.next() { 29 | Some(word) => { 30 | self.position_counter += 1; 31 | 32 | Some(Token { 33 | term: Term::from_string(word), 34 | position: self.position_counter, 35 | }) 36 | } 37 | None => None, 38 | } 39 | 40 | } 41 | } 42 | 43 | 44 | #[cfg(test)] 45 | mod tests { 46 | use search::{Term, Token}; 47 | 48 | use super::StandardTokenizer; 49 | 50 | const TEXT: &'static str = "Up from the bowels of hell he sails, weilding a tankard of freshly brewed ale!"; 51 | 52 | #[test] 53 | fn test_standard_tokenizer() { 54 | let tokenizer = StandardTokenizer::new(TEXT); 55 | let tokens = tokenizer.collect::>(); 56 | 57 | assert_eq!(tokens, vec![ 58 | Token { term: Term::from_string("Up"), position: 1 }, 59 | Token { term: Term::from_string("from"), position: 2 }, 60 | Token { term: Term::from_string("the"), position: 3 }, 61 | Token { term: Term::from_string("bowels"), position: 4 }, 62 | Token { term: Term::from_string("of"), position: 5 }, 63 | Token { term: Term::from_string("hell"), position: 6 }, 64 | Token { term: Term::from_string("he"), position: 7 }, 65 | Token { term: Term::from_string("sails"), position: 8 }, 66 | Token { term: Term::from_string("weilding"), position: 9 }, 67 | Token { term: Term::from_string("a"), position: 10 }, 68 | Token { term: Term::from_string("tankard"), position: 11 }, 69 | Token { term: Term::from_string("of"), position: 12 }, 70 | Token { term: Term::from_string("freshly"), position: 13 }, 71 | Token { term: Term::from_string("brewed"), position: 14 }, 72 | Token { term: Term::from_string("ale"), position: 15 } 73 | ]); 74 | } 75 | 76 | #[test] 77 | fn test_standard_tokenizer_cjk() { 78 | let tokenizer = StandardTokenizer::new("こんにちは、ハチ公!"); 79 | let tokens = tokenizer.collect::>(); 80 | 81 | assert_eq!(tokens, vec![ 82 | Token { term: Term::from_string("こ"), position: 1 }, 83 | Token { term: Term::from_string("ん"), position: 2 }, 84 | Token { term: Term::from_string("に"), position: 3 }, 85 | Token { term: Term::from_string("ち"), position: 4 }, 86 | Token { term: Term::from_string("は"), position: 5 }, 87 | Token { term: Term::from_string("ハチ"), position: 6 }, 88 | Token { term: Term::from_string("公"), position: 7 }, 89 | ]); 90 | } 91 | } 92 | 
-------------------------------------------------------------------------------- /src/api/alias_api.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use api::persistent; 4 | use api::iron::prelude::*; 5 | use api::iron::status; 6 | use api::router::Router; 7 | use api::utils::json_response; 8 | 9 | 10 | pub fn view_get_global_alias(req: &mut Request) -> IronResult { 11 | let ref system = get_system!(req); 12 | let ref alias_name = read_path_parameter!(req, "alias").unwrap_or(""); 13 | 14 | // Lock cluster metadata 15 | let cluster_metadata = system.metadata.read().unwrap(); 16 | 17 | // Find alias 18 | let mut found_aliases = HashMap::new(); 19 | 20 | for index_ref in cluster_metadata.names.find(alias_name) { 21 | let index = match cluster_metadata.indices.get(&index_ref) { 22 | Some(index) => index, 23 | None => continue, 24 | }; 25 | 26 | let mut inner_map = HashMap::new(); 27 | let mut inner_inner_map = HashMap::new(); 28 | inner_inner_map.insert(alias_name, HashMap::::new()); 29 | inner_map.insert("aliases".to_owned(), inner_inner_map); 30 | found_aliases.insert(index.canonical_name().clone(), inner_map); 31 | } 32 | 33 | if !found_aliases.is_empty() { 34 | return Ok(json_response(status::Ok, json!(found_aliases))); 35 | } else { 36 | return Ok(json_response(status::NotFound, json!({}))); 37 | } 38 | } 39 | 40 | 41 | pub fn view_get_alias_list(_req: &mut Request) -> IronResult { 42 | // let ref system = get_system!(req); 43 | // let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 44 | 45 | // TODO 46 | 47 | return Ok(json_response(status::Ok, json!({}))); 48 | } 49 | 50 | pub fn view_get_alias(req: &mut Request) -> IronResult { 51 | let ref system = get_system!(req); 52 | let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 53 | let ref alias_name = read_path_parameter!(req, "alias").unwrap_or(""); 54 | 55 | // Lock cluster metadata 56 | let cluster_metadata = system.metadata.read().unwrap(); 57 | 58 | // Get index 59 | let index_ref = match cluster_metadata.names.find_canonical(index_name) { 60 | Some(index_ref) => index_ref, 61 | None => return Ok(json_response(status::NotFound, json!({}))), 62 | }; 63 | 64 | // Find alias 65 | if cluster_metadata.names.iter_index_aliases(index_ref).any(|name| &name == alias_name) { 66 | return Ok(json_response(status::Ok, json!({}))); 67 | } else { 68 | return Ok(json_response(status::NotFound, json!({}))); 69 | } 70 | } 71 | 72 | 73 | pub fn view_put_alias(req: &mut Request) -> IronResult { 74 | let ref system = get_system!(req); 75 | let ref index_selector = read_path_parameter!(req, "index").unwrap_or(""); 76 | let ref alias_name = read_path_parameter!(req, "alias").unwrap_or(""); 77 | 78 | // Lock cluster metadata 79 | let mut cluster_metadata = system.metadata.write().unwrap(); 80 | 81 | // Insert alias into names registry 82 | let index_refs = cluster_metadata.names.find(*index_selector); 83 | match cluster_metadata.names.insert_or_replace_alias(alias_name.to_string(), index_refs) { 84 | Ok(true) => { 85 | info!(system.log, "created alias"; "index" => *index_selector, "alias" => *alias_name); 86 | } 87 | Ok(false) => { 88 | info!(system.log, "updated alias"; "index" => *index_selector, "alias" => *alias_name); 89 | } 90 | Err(_) => { 91 | // TODO 92 | return Ok(json_response(status::Ok, json!({"acknowledged": false}))); 93 | } 94 | } 95 | 96 | Ok(json_response(status::Ok, json!({"acknowledged": true}))) 97 | } 98 | 
-------------------------------------------------------------------------------- /src/api/document_api.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | 3 | use serde_json; 4 | 5 | use document::DocumentSource; 6 | 7 | use api::persistent; 8 | use api::iron::prelude::*; 9 | use api::iron::status; 10 | use api::router::Router; 11 | use api::utils::json_response; 12 | 13 | 14 | pub fn view_get_doc(req: &mut Request) -> IronResult { 15 | let ref system = get_system!(req); 16 | let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 17 | let ref mapping_name = read_path_parameter!(req, "mapping").unwrap_or(""); 18 | // let ref doc_key = read_path_parameter!(req, "doc").unwrap_or(""); 19 | 20 | // Get index 21 | let cluster_metadata = system.metadata.read().unwrap(); 22 | let index = get_index_or_404!(cluster_metadata, *index_name); 23 | let index_metadata = index.metadata.read().unwrap(); 24 | 25 | // Check that the mapping exists 26 | if !index_metadata.mappings.contains_key(*mapping_name) { 27 | return Ok(json_response(status::NotFound, json!({"message": "Mapping not found"}))); 28 | } 29 | 30 | // Find document 31 | /* 32 | let index_reader = index.store.reader(); 33 | let doc = match index_reader.get_document_by_key(doc_key) { 34 | Some(doc) => doc, 35 | None => { 36 | return Ok(json_response(status::NotFound, "{\"message\": \"Document not found\"}")); 37 | } 38 | }; 39 | */ 40 | 41 | 42 | // Build JSON document 43 | // TODO: This is probably completely wrong 44 | // let json_object = BTreeMap::new(); 45 | // FIXME: for (field_name, field_value) in doc.fields.iter() { 46 | // FIXME: json_object.insert(field_name.clone(), Json::Array(field_value.iter().map(|v| v.term.as_json()).collect::>())); 47 | // FIXME: } 48 | 49 | return Ok(json_response(status::Ok, json!({}))); 50 | } 51 | 52 | 53 | pub fn view_put_doc(req: &mut Request) -> IronResult { 54 | let ref system = get_system!(req); 55 | let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 56 | let ref mapping_name = read_path_parameter!(req, "mapping").unwrap_or(""); 57 | let ref doc_key = read_path_parameter!(req, "doc").unwrap_or(""); 58 | 59 | // Get index 60 | let cluster_metadata = system.metadata.read().unwrap(); 61 | let index = get_index_or_404!(cluster_metadata, *index_name); 62 | let index_metadata = index.metadata.read().unwrap(); 63 | 64 | let doc = { 65 | // Find mapping 66 | let mapping = match index_metadata.mappings.get(*mapping_name) { 67 | Some(mapping) => mapping, 68 | None => { 69 | return Ok(json_response(status::NotFound, json!({"message": "Mapping not found"}))); 70 | } 71 | }; 72 | 73 | // Create document 74 | if let Some(data) = json_from_request_body!(req) { 75 | let document_source = DocumentSource { 76 | key: doc_key, 77 | data: data.as_object().unwrap(), 78 | }; 79 | document_source.prepare(mapping).unwrap() 80 | } else { 81 | return Ok(json_response(status::NotFound, json!({"message": "No data"}))); 82 | } 83 | }; 84 | 85 | index.store.insert_or_update_document(&doc).unwrap(); 86 | 87 | // TODO: {"_index":"wagtail","_type":"searchtests_searchtest","_id":"searchtests_searchtest:5378","_version":1,"created":true} 88 | return Ok(json_response(status::Ok, json!({}))); 89 | } 90 | 91 | 92 | pub fn view_delete_doc(req: &mut Request) -> IronResult { 93 | let ref system = get_system!(req); 94 | let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 95 | let ref mapping_name = read_path_parameter!(req, 
"mapping").unwrap_or(""); 96 | let ref doc_key = read_path_parameter!(req, "doc").unwrap_or(""); 97 | 98 | // Get index 99 | let cluster_metadata = system.metadata.read().unwrap(); 100 | let index = get_index_or_404!(cluster_metadata, *index_name); 101 | let index_metadata = index.metadata.read().unwrap(); 102 | 103 | // Check that the mapping exists 104 | if !index_metadata.mappings.contains_key(*mapping_name) { 105 | return Ok(json_response(status::NotFound, json!({"message": "Mapping not found"}))); 106 | } 107 | 108 | // Make sure the document exists 109 | if !index.store.reader().contains_document_key(doc_key) { 110 | return Ok(json_response(status::NotFound, json!({"message": "Document not found"}))); 111 | } 112 | 113 | // Delete document 114 | index.store.remove_document_by_key(doc_key).unwrap(); 115 | 116 | return Ok(json_response(status::Ok, json!({}))); 117 | } 118 | -------------------------------------------------------------------------------- /src/api/index_api.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | use std::io::Read; 3 | 4 | use serde_json; 5 | use search::backends::rocksdb::RocksDBStore; 6 | use uuid::Uuid; 7 | 8 | use index::Index; 9 | use index::metadata::IndexMetadata; 10 | use index::metadata::parse::parse as parse_index_metadata; 11 | 12 | use api::persistent; 13 | use api::iron::prelude::*; 14 | use api::iron::status; 15 | use api::router::Router; 16 | use api::utils::json_response; 17 | 18 | 19 | pub fn view_get_index(req: &mut Request) -> IronResult { 20 | let ref system = get_system!(req); 21 | let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 22 | 23 | // Get index 24 | let cluster_metadata = system.metadata.read().unwrap(); 25 | let index = get_index_or_404!(cluster_metadata, *index_name); 26 | 27 | // Serialise index metadata 28 | let json = { 29 | match serde_json::to_value(&index.metadata) { 30 | Ok(json) => json, 31 | Err(_) => { 32 | return Ok(json_response(status::InternalServerError, json!({ 33 | "message": "unable to serialise index metadata" 34 | }))); 35 | } 36 | } 37 | }; 38 | 39 | return Ok(json_response(status::Ok, json)); 40 | } 41 | 42 | 43 | pub fn view_put_index(req: &mut Request) -> IronResult { 44 | let ref system = get_system!(req); 45 | let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 46 | 47 | // Lock cluster metadata 48 | let mut cluster_metadata = system.metadata.write().unwrap(); 49 | 50 | // Find index 51 | let index_ref = cluster_metadata.names.find_canonical(&index_name); 52 | 53 | match index_ref { 54 | Some(_) => { 55 | // Update existing index 56 | // TODO 57 | 58 | info!(system.log, "updated index"; "index" => *index_name); 59 | } 60 | None => { 61 | // Load metadata 62 | let mut metadata = IndexMetadata::default(); 63 | match json_from_request_body!(req).map(|data| parse_index_metadata(&mut metadata, data)) { 64 | Some(Ok(())) | None => {} 65 | Some(Err(_)) => { 66 | // TODO: better error 67 | return Ok(json_response(status::BadRequest, json!({"message": "Couldn't parse index settings"}))); 68 | } 69 | } 70 | 71 | // Create index 72 | let mut indices_dir = system.get_indices_dir(); 73 | indices_dir.push(index_name); 74 | let index = Index::new(Uuid::new_v4(), index_name.clone().to_owned(), metadata, RocksDBStore::create(indices_dir).unwrap()); 75 | index.metadata.read().unwrap().save(index.metadata_path()).unwrap(); 76 | let index_ref = cluster_metadata.insert_index(index); 77 | 78 | // If there's an alias with the new 
indexes name, delete it. 79 | let alias_deleted = cluster_metadata.names.delete_alias_whole(index_name).unwrap(); 80 | if alias_deleted { 81 | info!(system.log, "deleted alias"; "alias" => format!("{}", index_name), "reason" => "replaced by index"); 82 | } 83 | 84 | // Register canonical name 85 | cluster_metadata.names.insert_canonical(index_name.clone().to_owned(), index_ref).unwrap(); 86 | 87 | info!(system.log, "created index"; "index" => *index_name); 88 | } 89 | } 90 | 91 | return Ok(json_response(status::Ok, json!({"acknowledged": true}))); 92 | } 93 | 94 | 95 | pub fn view_delete_index(req: &mut Request) -> IronResult { 96 | let ref system = get_system!(req); 97 | let ref index_selector = read_path_parameter!(req, "index").unwrap_or(""); 98 | 99 | // Lock cluster metadata 100 | let mut cluster_metadata = system.metadata.write().unwrap(); 101 | 102 | // Make sure the index exists 103 | get_index_or_404!(cluster_metadata, *index_selector); 104 | 105 | // Remove indices 106 | for index_ref in cluster_metadata.names.find(*index_selector) { 107 | // Get the index name 108 | let index_name = { 109 | if let Some(index) = cluster_metadata.indices.get(&index_ref) { 110 | index.canonical_name().to_string() 111 | } else { 112 | // Index doesn't exist 113 | continue; 114 | } 115 | }; 116 | 117 | // Remove index from array 118 | cluster_metadata.indices.remove(&index_ref); 119 | 120 | // Delete canonical name 121 | cluster_metadata.names.delete_canonical(&index_name, index_ref).unwrap(); 122 | 123 | // Delete file 124 | let mut indices_dir = system.get_indices_dir(); 125 | indices_dir.push(&index_name); 126 | match fs::remove_dir_all(&indices_dir) { 127 | Ok(()) => {}, 128 | Err(e) => { 129 | warn!(system.log, "failed to delete index data"; "index" => format!("{}", index_name), "error" => format!("{}", e)); 130 | } 131 | } 132 | 133 | info!(system.log, "deleted index"; "index" => index_name); 134 | 135 | // Delete aliases 136 | let alias_names = cluster_metadata.names.iter_index_aliases(index_ref).map(|n| n.to_string()).collect::>(); 137 | for alias_name in alias_names { 138 | let alias_deleted = cluster_metadata.names.delete_alias(&alias_name, index_ref).unwrap(); 139 | 140 | // If this was the only index being referenced by the alias, the alias would be deleted 141 | if alias_deleted { 142 | info!(system.log, "deleted alias"; "alias" => format!("{}", alias_name), "reason" => "no indices left"); 143 | } 144 | } 145 | } 146 | 147 | return Ok(json_response(status::Ok, json!({"acknowledged": true}))); 148 | } 149 | 150 | 151 | pub fn view_post_refresh_index(_req: &mut Request) -> IronResult { 152 | // let ref system = get_system!(req); 153 | // let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 154 | 155 | // Lock index array 156 | // TODO 157 | // let mut indices = system.indices.write().unwrap(); 158 | 159 | // TODO: {"_shards":{"total":10,"successful":5,"failed":0}} 160 | return Ok(json_response(status::Ok, json!({"acknowledged": true}))); 161 | } 162 | -------------------------------------------------------------------------------- /src/api/mapping_api.rs: -------------------------------------------------------------------------------- 1 | use std::io::Read; 2 | use std::collections::HashMap; 3 | 4 | use serde_json; 5 | use search::schema::{FieldType, FieldFlags, FIELD_INDEXED, FIELD_STORED}; 6 | 7 | use mapping::{self, MappingProperty}; 8 | use mapping::parse::parse as parse_mapping; 9 | 10 | use api::persistent; 11 | use api::iron::prelude::*; 12 | use api::iron::status; 
13 | use api::router::Router; 14 | use api::utils::json_response; 15 | 16 | 17 | pub fn view_put_mapping(req: &mut Request) -> IronResult { 18 | let ref system = get_system!(req); 19 | let ref index_name = read_path_parameter!(req, "index").unwrap_or(""); 20 | let ref mapping_name = read_path_parameter!(req, "mapping").unwrap_or(""); 21 | 22 | // Lock cluster metadata 23 | let mut cluster_metadata = system.metadata.write().unwrap(); 24 | 25 | // Get index 26 | let index = get_index_or_404_mut!(cluster_metadata, *index_name); 27 | 28 | // Load data from body 29 | let data = json_from_request_body!(req); 30 | 31 | let data = match data { 32 | Some(data) => data, 33 | None => { 34 | // TODO: Better error 35 | return Ok(json_response(status::BadRequest, json!({"acknowledged": false}))); 36 | } 37 | }; 38 | 39 | let data = data.as_object().unwrap().get(*mapping_name).unwrap(); 40 | 41 | // Insert mapping 42 | let mapping_builder = match parse_mapping(&data) { 43 | Ok(mapping_builder) => mapping_builder, 44 | Err(_) => { 45 | // TODO: Better error 46 | return Ok(json_response(status::BadRequest, json!({"acknowledged": false}))); 47 | } 48 | }; 49 | let mut index_metadata = index.metadata.write().unwrap(); 50 | let mut mapping = mapping_builder.build(&index_metadata); 51 | //debug!("{:#?}", mapping); 52 | let is_updating = index_metadata.mappings.contains_key(*mapping_name); 53 | 54 | // Find list of new fields that need to be added to the store 55 | let new_fields = { 56 | let index_reader = index.store.reader(); 57 | let schema = index_reader.schema(); 58 | let mut new_fields: HashMap = HashMap::new(); 59 | for (name, property) in mapping.properties.iter() { 60 | if let MappingProperty::Field(ref field_mapping) = *property { 61 | let field_type = match field_mapping.data_type { 62 | mapping::FieldType::String => FieldType::Text, 63 | mapping::FieldType::Integer => FieldType::I64, 64 | mapping::FieldType::Boolean => FieldType::Boolean, 65 | mapping::FieldType::Date => FieldType::DateTime, 66 | }; 67 | 68 | // Flags 69 | let mut field_flags = FieldFlags::empty(); 70 | 71 | if field_mapping.is_indexed { 72 | field_flags |= FIELD_INDEXED; 73 | } 74 | 75 | if field_mapping.is_stored { 76 | field_flags |= FIELD_STORED; 77 | } 78 | 79 | // Check if this field already exists 80 | if let Some(field_ref) = schema.get_field_by_name(&name) { 81 | let field_info = schema.get(&field_ref).expect("get_field_by_name returned an invalid FieldId"); 82 | 83 | // Field already exists. Check for conflicting type or flags, otherwise ignore. 84 | if field_info.field_type == field_type && field_info.field_flags == field_flags { 85 | continue; 86 | } else { 87 | // Conflict! 
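// The field already exists with a different type or different flags, which can't be changed here, so the mapping update is rejected below.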
88 | // TODO: Better error 89 | return Ok(json_response(status::BadRequest, json!({"acknowledged": false}))); 90 | } 91 | } 92 | 93 | new_fields.insert(name.clone(), (field_type, field_flags)); 94 | } 95 | } 96 | 97 | new_fields 98 | }; 99 | 100 | // Add new fields into the store 101 | for (field_name, (field_type, field_flags)) in new_fields { 102 | let indexed_yesno = if field_flags.contains(FIELD_INDEXED) { "yes" } else { "no" }; 103 | let stored_yesno = if field_flags.contains(FIELD_STORED) { "yes" } else { "no" }; 104 | info!(system.log, "adding field"; "index" => *index_name, "field" => &field_name, "type" => format!("{:?}", field_type), "indexed" => indexed_yesno, "stored" => stored_yesno); 105 | 106 | index.store.add_field(field_name, field_type, field_flags).unwrap(); 107 | } 108 | 109 | // Link the mapping 110 | { 111 | let index_reader = index.store.reader(); 112 | let schema = index_reader.schema(); 113 | 114 | for (name, property) in mapping.properties.iter_mut() { 115 | if let MappingProperty::Field(ref mut field_mapping) = *property { 116 | field_mapping.index_ref = schema.get_field_by_name(&name) 117 | } 118 | } 119 | } 120 | 121 | index_metadata.mappings.insert(mapping_name.clone().to_owned(), mapping); 122 | index_metadata.save(index.metadata_path()).unwrap(); 123 | 124 | if is_updating { 125 | // TODO: New mapping should be merged with existing one 126 | info!(system.log, "updated mapping"; "index" => *index_name, "mapping" => *mapping_name); 127 | } else { 128 | info!(system.log, "created mapping"; "index" => *index_name, "mapping" => *mapping_name); 129 | } 130 | 131 | return Ok(json_response(status::Ok, json!({"acknowledged": true}))); 132 | } 133 | -------------------------------------------------------------------------------- /src/api/mod.rs: -------------------------------------------------------------------------------- 1 | extern crate iron; 2 | extern crate router; 3 | extern crate persistent; 4 | 5 | #[macro_use] 6 | mod utils; 7 | mod search_api; 8 | mod alias_api; 9 | mod document_api; 10 | mod index_api; 11 | mod mapping_api; 12 | mod bulk_api; 13 | 14 | use std::sync::Arc; 15 | 16 | use api::iron::prelude::*; 17 | use api::iron::status; 18 | use api::iron::typemap::Key; 19 | use api::router::Router; 20 | use api::utils::json_response; 21 | 22 | use system::System; 23 | use VERSION; 24 | 25 | 26 | fn view_home(_: &mut Request) -> IronResult { 27 | Ok(json_response(status::Ok, json!({ 28 | "cluster_name": "rusticsearch", 29 | "version": { 30 | "number": VERSION 31 | } 32 | }))) 33 | } 34 | 35 | 36 | fn get_router() -> Router { 37 | router!(get "/" => view_home, 38 | get "/:index/_count" => search_api::view_count, 39 | post "/:index/_count" => search_api::view_count, 40 | get "/:index/_search" => search_api::view_search, 41 | post "/:index/_search" => search_api::view_search, 42 | get "/_alias/:alias" => alias_api::view_get_global_alias, 43 | get "/:index/_alias" => alias_api::view_get_alias_list, 44 | get "/:index/_alias/:alias" => alias_api::view_get_alias, 45 | put "/:index/_alias/:alias" => alias_api::view_put_alias, 46 | get "/:index/:mapping/:doc" => document_api::view_get_doc, 47 | put "/:index/:mapping/:doc" => document_api::view_put_doc, 48 | delete "/:index/:mapping/:doc" => document_api::view_delete_doc, 49 | get "/:index" => index_api::view_get_index, 50 | put "/:index" => index_api::view_put_index, 51 | delete "/:index" => index_api::view_delete_index, 52 | post "/:index/_refresh" => index_api::view_post_refresh_index, 53 | put 
"/:index/_mapping/:mapping" => mapping_api::view_put_mapping, 54 | post "/_bulk" => bulk_api::view_post_bulk, 55 | post "/:index/_bulk" => bulk_api::view_post_index_bulk) 56 | } 57 | 58 | 59 | // The "Context" struct just wraps Arc so we can put it into chain.link() 60 | // Workaround for: https://github.com/iron/persistent/issues/55 61 | 62 | struct Context { 63 | system: Arc, 64 | } 65 | 66 | 67 | impl Context { 68 | fn new(system: Arc) -> Context { 69 | Context { 70 | system: system, 71 | } 72 | } 73 | } 74 | 75 | 76 | impl Key for Context { 77 | type Value = Context; 78 | } 79 | 80 | 81 | pub fn api_main(system: Arc) { 82 | let router = get_router(); 83 | let mut chain = Chain::new(router); 84 | chain.link(persistent::Read::::both(Context::new(system.clone()))); 85 | info!(system.log, "listening"; "scheme" => "http", "address" => "localhost", "port" => 9200); 86 | 87 | if let Err(error) = Iron::new(chain).http("localhost:9200") { 88 | crit!(system.log, "unable to start api server"; "error" => format!("{}", error)); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/api/utils.rs: -------------------------------------------------------------------------------- 1 | use serde_json; 2 | 3 | use api::iron::prelude::*; 4 | use api::iron::status; 5 | 6 | 7 | macro_rules! get_system { 8 | ($req: expr) => {{ 9 | use api::Context; 10 | 11 | $req.get::>().unwrap().system.clone() 12 | }} 13 | } 14 | 15 | 16 | macro_rules! read_path_parameter { 17 | ($req: expr, $name: expr) => {{ 18 | $req.extensions.get::().unwrap().find($name) 19 | }} 20 | } 21 | 22 | 23 | pub fn json_response(status: status::Status, content: serde_json::Value) -> Response { 24 | let mut response = Response::with((status, format!("{}", content))); 25 | response.headers.set_raw("Content-Type", vec![b"application/json".to_vec()]); 26 | response 27 | } 28 | 29 | 30 | pub fn index_not_found_response() -> Response { 31 | json_response(status::NotFound, json!({"message": "Index not found"})) 32 | } 33 | 34 | 35 | macro_rules! get_index_or_404 { 36 | ($cluster_metadata: expr, $index_name: expr) => {{ 37 | use api::utils::index_not_found_response; 38 | 39 | let index_ref = match $cluster_metadata.names.find_canonical($index_name) { 40 | Some(index_ref) => index_ref, 41 | None => { 42 | return Ok(index_not_found_response()); 43 | } 44 | }; 45 | 46 | match $cluster_metadata.indices.get(&index_ref) { 47 | Some(index) => index, 48 | None => { 49 | return Ok(index_not_found_response()); 50 | } 51 | } 52 | }} 53 | } 54 | 55 | 56 | macro_rules! get_index_or_404_mut { 57 | ($cluster_metadata: expr, $index_name: expr) => {{ 58 | use api::utils::index_not_found_response; 59 | 60 | let index_ref = match $cluster_metadata.names.find_canonical($index_name) { 61 | Some(index_ref) => index_ref, 62 | None => { 63 | return Ok(index_not_found_response()); 64 | } 65 | }; 66 | 67 | match $cluster_metadata.indices.get_mut(&index_ref) { 68 | Some(index) => index, 69 | None => { 70 | return Ok(index_not_found_response()); 71 | } 72 | } 73 | }} 74 | } 75 | 76 | 77 | macro_rules! parse_json { 78 | ($string: expr) => {{ 79 | use api::utils::json_response; 80 | 81 | let value: serde_json::Value = match serde_json::from_str($string) { 82 | Ok(data) => data, 83 | Err(_) => { 84 | return Ok(json_response(status::BadRequest, json!({"message": "Couldn't parse JSON"}))); 85 | } 86 | }; 87 | 88 | value 89 | }} 90 | } 91 | 92 | 93 | macro_rules! 
json_from_request_body { 94 | ($req: expr) => {{ 95 | // Read request body to a string 96 | let mut payload = String::new(); 97 | $req.body.read_to_string(&mut payload).unwrap(); 98 | 99 | if !payload.is_empty() { 100 | Some(parse_json!(&payload)) 101 | } else { 102 | None 103 | } 104 | }} 105 | } 106 | -------------------------------------------------------------------------------- /src/cluster/metadata/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod name_registry; 2 | 3 | use std::collections::HashMap; 4 | 5 | use uuid::Uuid; 6 | 7 | use index::Index; 8 | 9 | use self::name_registry::NameRegistry; 10 | 11 | 12 | #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] 13 | pub struct IndexRef(Uuid); 14 | 15 | 16 | impl IndexRef { 17 | pub fn id(&self) -> &Uuid { 18 | &self.0 19 | } 20 | } 21 | 22 | 23 | #[derive(Debug)] 24 | pub struct ClusterMetadata { 25 | pub indices: HashMap, 26 | pub names: NameRegistry, 27 | } 28 | 29 | 30 | impl ClusterMetadata { 31 | pub fn new() -> ClusterMetadata { 32 | ClusterMetadata { 33 | indices: HashMap::new(), 34 | names: NameRegistry::new(), 35 | } 36 | } 37 | 38 | pub fn insert_index(&mut self, index: Index) -> IndexRef { 39 | let index_ref = IndexRef(index.id().clone()); 40 | self.indices.insert(index_ref, index); 41 | 42 | index_ref 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/cluster/metadata/name_registry.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::collections::hash_map::Iter as HashMapIter; 3 | 4 | use super::IndexRef; 5 | 6 | 7 | #[derive(Debug)] 8 | enum Name { 9 | /// This is the canonical name of an index 10 | Canonical(IndexRef), 11 | 12 | /// This is an alias 13 | Alias(Vec), 14 | } 15 | 16 | 17 | #[derive(Debug)] 18 | pub struct NameRegistry { 19 | names: HashMap, 20 | } 21 | 22 | 23 | impl NameRegistry { 24 | pub fn new() -> NameRegistry { 25 | NameRegistry { 26 | names: HashMap::new(), 27 | } 28 | } 29 | 30 | pub fn insert_canonical(&mut self, name: String, index_ref: IndexRef) -> Result<(), ()> { 31 | if let Some(_) = self.names.get(&name) { 32 | return Err(()); 33 | } 34 | 35 | self.names.insert(name, Name::Canonical(index_ref)); 36 | Ok(()) 37 | } 38 | 39 | pub fn delete_canonical(&mut self, name: &str, index_ref: IndexRef) -> Result<(), ()> { 40 | if let Some(&Name::Canonical(actual_index_ref)) = self.names.get(name) { 41 | if actual_index_ref != index_ref { 42 | return Err(()); 43 | } 44 | } else { 45 | return Err(()); 46 | } 47 | 48 | self.names.remove(name); 49 | Ok(()) 50 | } 51 | 52 | pub fn insert_alias(&mut self, name: String, indices: Vec) -> Result<(), ()> { 53 | if let Some(_) = self.names.get(&name) { 54 | return Err(()); 55 | } 56 | 57 | self.names.insert(name, Name::Alias(indices)); 58 | Ok(()) 59 | } 60 | 61 | pub fn insert_or_replace_alias(&mut self, name: String, indices: Vec) -> Result { 62 | if let Some(&Name::Canonical(_)) = self.names.get(&name) { 63 | // Cannot replace if it is a canonical name 64 | return Err(()); 65 | } 66 | 67 | let old_indices = self.names.insert(name, Name::Alias(indices)); 68 | match old_indices { 69 | Some(Name::Alias(_)) => { 70 | Ok(false) 71 | } 72 | Some(Name::Canonical(_)) => { 73 | unreachable!(); 74 | } 75 | None => { 76 | Ok(true) 77 | } 78 | } 79 | } 80 | 81 | pub fn delete_alias(&mut self, name: &str, index_ref: IndexRef) -> Result { 82 | let mut remove_alias = false; 83 | 84 | match 
self.names.get_mut(name) { 85 | Some(&mut Name::Alias(ref mut indices)) => { 86 | // Remove index from alias 87 | let index = match indices.iter().position(|ir| *ir == index_ref) { 88 | Some(index) => index, 89 | None => return Ok(false), 90 | }; 91 | 92 | indices.remove(index); 93 | 94 | if indices.is_empty() { 95 | remove_alias = true; 96 | } 97 | } 98 | Some(&mut Name::Canonical(_)) => { 99 | return Err(()); 100 | } 101 | None => {} 102 | } 103 | 104 | if remove_alias { 105 | self.names.remove(name); 106 | } 107 | 108 | Ok(remove_alias) 109 | } 110 | 111 | pub fn delete_alias_whole(&mut self, name: &str) -> Result<bool, ()> { 112 | if let Some(&Name::Canonical(_)) = self.names.get(name) { 113 | return Err(()); 114 | } 115 | 116 | let alias = self.names.remove(name); 117 | Ok(alias.is_some()) 118 | } 119 | 120 | pub fn find(&self, selector: &str) -> Vec<IndexRef> { 121 | let mut indices = Vec::new(); 122 | 123 | // Find name 124 | let name = self.names.get(selector); 125 | 126 | // Resolve the name if we have one 127 | if let Some(name) = name { 128 | match *name { 129 | Name::Canonical(ref index_ref) => indices.push(*index_ref), 130 | Name::Alias(ref alias_indices) => indices.append(&mut alias_indices.clone()), 131 | } 132 | } 133 | 134 | indices 135 | } 136 | 137 | pub fn find_canonical(&self, name: &str) -> Option<IndexRef> { 138 | let name = self.names.get(name); 139 | 140 | match name { 141 | Some(&Name::Canonical(index_ref)) => Some(index_ref), 142 | Some(&Name::Alias(_)) | None => None, 143 | } 144 | } 145 | 146 | pub fn iter_index_aliases<'a>(&'a self, index_ref: IndexRef) -> IndexAliasesIterator<'a> { 147 | IndexAliasesIterator { 148 | index_ref: index_ref, 149 | names_iterator: self.names.iter(), 150 | } 151 | } 152 | } 153 | 154 | 155 | pub struct IndexAliasesIterator<'a> { 156 | index_ref: IndexRef, 157 | names_iterator: HashMapIter<'a, String, Name>, 158 | } 159 | 160 | 161 | impl<'a> Iterator for IndexAliasesIterator<'a> { 162 | type Item = &'a str; 163 | 164 | fn next(&mut self) -> Option<&'a str> { 165 | loop { 166 | match self.names_iterator.next() { 167 | Some((name, &Name::Alias(ref indices))) => { 168 | if indices.iter().any(|ir| *ir == self.index_ref) { 169 | return Some(name); 170 | } 171 | } 172 | Some((_, &Name::Canonical(_))) => {} 173 | None => return None 174 | } 175 | } 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /src/cluster/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod metadata; 2 | -------------------------------------------------------------------------------- /src/document.rs: -------------------------------------------------------------------------------- 1 | use serde_json; 2 | use search::Document; 3 | use fnv::FnvHashMap; 4 | 5 | use mapping::{Mapping, MappingProperty, FieldValueError}; 6 | 7 | 8 | #[derive(Debug)] 9 | pub struct DocumentSource<'a> { 10 | pub key: &'a str, 11 | pub data: &'a serde_json::Map<String, serde_json::Value>, 12 | } 13 | 14 | 15 | #[derive(Debug)] 16 | pub enum PrepareDocumentError { 17 | FieldDoesntExist { 18 | field_name: String, 19 | }, 20 | FieldValueError { 21 | field_name: String, 22 | value: serde_json::Value, 23 | error: FieldValueError, 24 | }, 25 | } 26 | 27 | 28 | impl<'a> DocumentSource<'a> { 29 | pub fn prepare(&self, mapping: &Mapping) -> Result<Document, PrepareDocumentError> { 30 | let mut indexed_fields = FnvHashMap::default(); 31 | let mut stored_fields = FnvHashMap::default(); 32 | let mut all_field_strings: Vec<String> = Vec::new(); 33 | 34 | for (field_name, field_value) in self.data { 35
| if *field_value == serde_json::Value::Null { 36 | // Treat null like a missing field 37 | continue; 38 | } 39 | 40 | match mapping.properties.get(field_name) { 41 | Some(&MappingProperty::Field(ref field_mapping)) => { 42 | if field_mapping.is_indexed { 43 | let value = field_mapping.process_value_for_index(field_value); 44 | 45 | match value { 46 | Ok(Some(value)) => { 47 | // Copy the field's value into the _all field 48 | if field_mapping.is_in_all { 49 | if let serde_json::Value::String(ref string) = *field_value { 50 | all_field_strings.push(string.clone()); 51 | } 52 | } 53 | 54 | // Insert the field 55 | indexed_fields.insert(field_mapping.index_ref.unwrap(), value); 56 | } 57 | Ok(None) => {} 58 | Err(error) => { 59 | return Err(PrepareDocumentError::FieldValueError { 60 | field_name: field_name.clone(), 61 | value: field_value.clone(), 62 | error: error, 63 | }); 64 | } 65 | } 66 | } 67 | 68 | if field_mapping.is_stored { 69 | let value = field_mapping.process_value_for_store(field_value); 70 | 71 | match value { 72 | Ok(Some(value)) => { 73 | // Insert the field 74 | stored_fields.insert(field_mapping.index_ref.unwrap(), value); 75 | } 76 | Ok(None) => {} 77 | Err(error) => { 78 | return Err(PrepareDocumentError::FieldValueError { 79 | field_name: field_name.clone(), 80 | value: field_value.clone(), 81 | error: error, 82 | }); 83 | } 84 | } 85 | } 86 | } 87 | Some(&MappingProperty::NestedMapping(ref _nested_mapping)) => { 88 | // TODO 89 | } 90 | None => { 91 | // No mapping found 92 | return Err(PrepareDocumentError::FieldDoesntExist { 93 | field_name: field_name.clone(), 94 | }); 95 | } 96 | } 97 | } 98 | 99 | // Insert _all field 100 | if let Some(property) = mapping.properties.get("_all") { 101 | if let MappingProperty::Field(ref field_mapping) = *property { 102 | let strings_json = serde_json::Value::String(all_field_strings.join(" ")); 103 | let value = field_mapping.process_value_for_index(&strings_json); 104 | 105 | match value { 106 | Ok(Some(value)) => { 107 | indexed_fields.insert(field_mapping.index_ref.unwrap(), value); 108 | } 109 | Ok(None) => {} 110 | Err(error) => { 111 | return Err(PrepareDocumentError::FieldValueError { 112 | field_name: "_all".to_string(), 113 | value: strings_json.clone(), 114 | error: error, 115 | }); 116 | } 117 | } 118 | } 119 | } 120 | 121 | Ok(Document { 122 | key: self.key.to_string(), 123 | indexed_fields: indexed_fields, 124 | stored_fields: stored_fields, 125 | }) 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/index/maintenance.rs: -------------------------------------------------------------------------------- 1 | use index::Index; 2 | 3 | 4 | impl Index { 5 | /// Run a maintenance task on the index 6 | /// This must be run periodically by a background thread. It is not currently thread-safe 7 | pub fn run_maintenance_task(&self) -> Result<(), String> { 8 | let segment_stats = self.store.get_segment_statistics()?; 9 | 10 | // TODO: Deactivate segments with 100% deletions 11 | // TODO: Vacuum segments with many deletions 12 | 13 | // Merge segments 14 | 15 | // Firstly we classify each active segment into one of 5 groups (based on the number of 16 | // total documents they have): 17 | // Group 1: 1 - 9 docs 18 | // Group 2: 10 - 99 docs 19 | // Group 3: 100 - 999 docs 20 | // Group 4: 1000 - 9999 docs 21 | // Group 5: 10000 - 65536 docs 22 | 23 | // The group with the most active segments can perform a merge. 
A merge can be done on 24 | // between 5 - 1000 segments at a time. The smallest segments get merged first. 25 | 26 | let mut segments_g1 = Vec::new(); 27 | let mut segments_g2 = Vec::new(); 28 | let mut segments_g3 = Vec::new(); 29 | let mut segments_g4 = Vec::new(); 30 | let mut segments_g5 = Vec::new(); 31 | 32 | for (segment, stats) in segment_stats { 33 | match stats.total_docs() { 34 | 1 ... 9 => segments_g1.push((segment, stats)), 35 | 10 ... 99 => segments_g2.push((segment, stats)), 36 | 100 ... 999 => segments_g3.push((segment, stats)), 37 | 1000 ... 9999 => segments_g4.push((segment, stats)), 38 | 10000 ... 65536 => segments_g5.push((segment, stats)), 39 | _ => {}, 40 | } 41 | } 42 | 43 | // Now sort the groups by length in ascending order 44 | let mut segments_grouped = vec![segments_g1, segments_g2, segments_g3, segments_g4, segments_g5]; 45 | segments_grouped.sort_by_key(|group| group.len()); 46 | 47 | // The group with the most segments is our merge candidate. Check that it has above the 48 | // minimum number of documents to start a merge and truncate it to be less than the maximum. 49 | let mut group_to_merge = segments_grouped.pop().unwrap(); 50 | 51 | if group_to_merge.len() < 3 { 52 | // No point in merging these 53 | return Ok(()); 54 | } 55 | 56 | // Now we've found a group of segments to merge, we must check that all the docs will fit in a 57 | // single segment. If not, we choose the largest sub-group of segments to merge that fills the 58 | // quota as much as possible 59 | 60 | let mut current_doc_count: u32 = 0; 61 | let mut segment_ids = Vec::new(); 62 | 63 | // Sort segments total_docs in descending order 64 | // TODO: Check that this is descending order 65 | group_to_merge.sort_by_key(|&(_, ref stats)| -stats.total_docs()); 66 | 67 | for (segment, stats) in group_to_merge { 68 | if current_doc_count + stats.total_docs() as u32 > 65536 { 69 | // No space for this segment 70 | continue; 71 | } 72 | 73 | segment_ids.push(segment); 74 | current_doc_count += stats.total_docs() as u32; 75 | } 76 | 77 | // Merge segments 78 | self.store.merge_segments(&segment_ids)?; 79 | self.store.purge_segments(&segment_ids)?; 80 | 81 | Ok(()) 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/index/metadata/file.rs: -------------------------------------------------------------------------------- 1 | use std::path::Path; 2 | use std::io::{self, Read, Write}; 3 | use std::fs::File; 4 | 5 | use serde_json; 6 | use atomicwrites::{self, AtomicFile, AllowOverwrite}; 7 | 8 | use index::metadata::IndexMetadata; 9 | use index::metadata::parse::{parse, IndexMetadataParseError}; 10 | 11 | 12 | #[derive(Debug)] 13 | pub enum SaveIndexMetadataError { 14 | JsonEncoderError(serde_json::Error), 15 | IoError(atomicwrites::Error<io::Error>), 16 | } 17 | 18 | 19 | impl From<SaveIndexMetadataError> for String { 20 | fn from(e: SaveIndexMetadataError) -> String { 21 | match e { 22 | SaveIndexMetadataError::JsonEncoderError(e) => format!("failed to save index metadata: {}", e).to_string(), 23 | SaveIndexMetadataError::IoError(e) => format!("failed to save index metadata: {}", e).to_string(), 24 | } 25 | } 26 | } 27 | 28 | impl From<serde_json::Error> for SaveIndexMetadataError { 29 | fn from(e: serde_json::Error) -> SaveIndexMetadataError { 30 | SaveIndexMetadataError::JsonEncoderError(e) 31 | } 32 | } 33 | 34 | 35 | impl From<atomicwrites::Error<io::Error>> for SaveIndexMetadataError { 36 | fn from(e: atomicwrites::Error<io::Error>) -> SaveIndexMetadataError { 37 | SaveIndexMetadataError::IoError(e) 38 | } 39 | } 40 | 41 | 42 |
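// The From conversions above are what let IndexMetadata::save (defined later in this file) use `?` on the serde_json and atomicwrites results, and what allow a caller to turn the error into a plain String. A minimal hypothetical caller (sketch, assuming the caller wants a Result<(), String>):
//
//     fn persist_metadata(metadata: &IndexMetadata, path: &std::path::Path) -> Result<(), String> {
//         metadata.save(path).map_err(String::from)
//     }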
#[derive(Debug)] 43 | pub enum LoadIndexMetadataError { 44 | IndexMetadataParseError(IndexMetadataParseError), 45 | JsonParserError(serde_json::Error), 46 | IoError(io::Error), 47 | } 48 | 49 | 50 | impl From<LoadIndexMetadataError> for String { 51 | fn from(e: LoadIndexMetadataError) -> String { 52 | match e { 53 | LoadIndexMetadataError::IndexMetadataParseError(e) => format!("failed to load index metadata: {:?}", e).to_string(), 54 | LoadIndexMetadataError::JsonParserError(e) => format!("failed to load index metadata: {}", e).to_string(), 55 | LoadIndexMetadataError::IoError(e) => format!("failed to load index metadata: {}", e).to_string(), 56 | } 57 | } 58 | } 59 | 60 | 61 | impl From<IndexMetadataParseError> for LoadIndexMetadataError { 62 | fn from(e: IndexMetadataParseError) -> LoadIndexMetadataError { 63 | LoadIndexMetadataError::IndexMetadataParseError(e) 64 | } 65 | } 66 | 67 | 68 | impl From<serde_json::Error> for LoadIndexMetadataError { 69 | fn from(e: serde_json::Error) -> LoadIndexMetadataError { 70 | LoadIndexMetadataError::JsonParserError(e) 71 | } 72 | } 73 | 74 | 75 | impl From<io::Error> for LoadIndexMetadataError { 76 | fn from(e: io::Error) -> LoadIndexMetadataError { 77 | LoadIndexMetadataError::IoError(e) 78 | } 79 | } 80 | 81 | 82 | impl IndexMetadata { 83 | pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<(), SaveIndexMetadataError> { 84 | // Encode to JSON 85 | let s = format!("{}", serde_json::to_value(self)?); 86 | 87 | // Write to file 88 | let file = AtomicFile::new(path, AllowOverwrite); 89 | file.write(|f| { 90 | f.write_all(s.as_bytes()) 91 | })?; 92 | 93 | Ok(()) 94 | } 95 | 96 | pub fn load<P: AsRef<Path>>(path: P) -> Result<IndexMetadata, LoadIndexMetadataError> { 97 | let mut file = File::open(path)?; 98 | let mut s = String::new(); 99 | file.read_to_string(&mut s)?; 100 | 101 | let mut metadata = IndexMetadata::default(); 102 | parse(&mut metadata, serde_json::from_str(&s)?)?; 103 | 104 | Ok(metadata) 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/index/metadata/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod parse; 2 | pub mod file; 3 | 4 | use std::collections::{HashMap, BTreeMap}; 5 | 6 | use serde::{Serialize, Serializer}; 7 | use serde_json; 8 | 9 | use analysis::AnalyzerSpec; 10 | use analysis::tokenizers::TokenizerSpec; 11 | use analysis::filters::FilterSpec; 12 | use mapping::{Mapping, MappingProperty, FieldMapping}; 13 | 14 | 15 | #[derive(Debug)] 16 | pub struct IndexMetadata { 17 | analyzers: HashMap<String, AnalyzerSpec>, 18 | tokenizers: HashMap<String, TokenizerSpec>, 19 | filters: HashMap<String, FilterSpec>, 20 | pub mappings: HashMap<String, Mapping>, 21 | } 22 | 23 | 24 | impl Default for IndexMetadata { 25 | fn default() -> IndexMetadata { 26 | let mut metadata = IndexMetadata { 27 | analyzers: HashMap::new(), 28 | tokenizers: HashMap::new(), 29 | filters: HashMap::new(), 30 | mappings: HashMap::new(), 31 | }; 32 | 33 | // Builtin tokenizers 34 | metadata.insert_tokenizer("standard".to_string(), TokenizerSpec::Standard); 35 | metadata.insert_tokenizer("lowercase".to_string(), TokenizerSpec::Lowercase); 36 | 37 | // Builtin filters 38 | metadata.insert_filter("asciifolding".to_string(), FilterSpec::ASCIIFolding); 39 | metadata.insert_filter("lowercase".to_string(), FilterSpec::Lowercase); 40 | 41 | // Builtin analyzers 42 | metadata.insert_analyzer("standard".to_string(), AnalyzerSpec { 43 | tokenizer: TokenizerSpec::Standard, 44 | filters: vec![ 45 | FilterSpec::Lowercase, 46 | FilterSpec::ASCIIFolding, 47 | ] 48 | }); 49 | 50 | metadata 51 | } 52 | } 53 | 54 | 55 | impl IndexMetadata { 56 | // Tokenizer helpers 57 | 58 | pub fn
insert_tokenizer(&mut self, name: String, tokenizer: TokenizerSpec) -> Option<TokenizerSpec> { 59 | self.tokenizers.insert(name, tokenizer) 60 | } 61 | 62 | pub fn tokenizers(&self) -> &HashMap<String, TokenizerSpec> { 63 | &self.tokenizers 64 | } 65 | 66 | // Filter helpers 67 | 68 | pub fn insert_filter(&mut self, name: String, filter: FilterSpec) -> Option<FilterSpec> { 69 | self.filters.insert(name, filter) 70 | } 71 | 72 | pub fn filters(&self) -> &HashMap<String, FilterSpec> { 73 | &self.filters 74 | } 75 | 76 | // Analyzer helpers 77 | 78 | pub fn insert_analyzer(&mut self, name: String, analyzer: AnalyzerSpec) -> Option<AnalyzerSpec> { 79 | self.analyzers.insert(name, analyzer) 80 | } 81 | 82 | pub fn analyzers(&self) -> &HashMap<String, AnalyzerSpec> { 83 | &self.analyzers 84 | } 85 | 86 | fn get_default_analyzer(&self) -> AnalyzerSpec { 87 | self.analyzers().get("default").cloned().unwrap_or_else(|| { 88 | AnalyzerSpec { 89 | tokenizer: TokenizerSpec::Standard, 90 | filters: vec![ 91 | FilterSpec::Lowercase, 92 | FilterSpec::ASCIIFolding, 93 | ] 94 | } 95 | }) 96 | } 97 | 98 | pub fn get_default_index_analyzer(&self) -> AnalyzerSpec { 99 | self.analyzers().get("default_index").cloned().unwrap_or_else(|| { 100 | self.get_default_analyzer() 101 | }) 102 | } 103 | 104 | pub fn get_default_search_analyzer(&self) -> AnalyzerSpec { 105 | self.analyzers().get("default_search").cloned().unwrap_or_else(|| { 106 | self.get_default_analyzer() 107 | }) 108 | } 109 | 110 | // Mapping helpers 111 | 112 | pub fn get_field_mapping(&self, name: &str) -> Option<&FieldMapping> { 113 | for mapping in self.mappings.values() { 114 | if let Some(property) = mapping.properties.get(name) { 115 | if let MappingProperty::Field(ref field_mapping) = *property { 116 | return Some(field_mapping); 117 | } 118 | } 119 | } 120 | 121 | None 122 | } 123 | } 124 | 125 | 126 | impl Serialize for IndexMetadata { 127 | fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> { 128 | // Tokenizers 129 | let mut tokenizers_json = BTreeMap::new(); 130 | for (name, tokenizer) in self.tokenizers.iter() { 131 | tokenizers_json.insert(name.to_string(), serde_json::to_value(&tokenizer).unwrap()); 132 | } 133 | 134 | // Filters 135 | let mut filters_json = BTreeMap::new(); 136 | for (name, filter) in self.filters.iter() { 137 | filters_json.insert(name.to_string(), serde_json::to_value(&filter).unwrap()); 138 | } 139 | 140 | // Mappings 141 | let mut mappings_json = BTreeMap::new(); 142 | for (name, mapping) in self.mappings.iter() { 143 | mappings_json.insert(name.to_string(), serde_json::to_value(&mapping).unwrap()); 144 | } 145 | 146 | let json = json!({ 147 | "settings": { 148 | "analysis": { 149 | "tokenizers": tokenizers_json, 150 | "filters": filters_json, 151 | "analyzers": {}, // TODO 152 | }, 153 | }, 154 | "mappings": mappings_json, 155 | }); 156 | 157 | json.serialize(serializer) 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/index/metadata/parse/analysis_analyzer.rs: -------------------------------------------------------------------------------- 1 | use serde_json; 2 | 3 | use analysis::AnalyzerSpec; 4 | use index::metadata::IndexMetadata; 5 | 6 | 7 | #[derive(Debug, PartialEq)] 8 | pub enum AnalyzerParseError { 9 | ExpectedObject, 10 | ExpectedString, 11 | ExpectedArray, 12 | ExpectedKey(String), 13 | UnrecognisedAnalyzerType(String), 14 | UnrecognisedTokenizer(String), 15 | UnrecognisedFilter(String), 16 | } 17 | 18 | 19 | pub fn parse(json: &serde_json::Value, index_metadata: &IndexMetadata) -> Result<AnalyzerSpec, AnalyzerParseError> { 20 | let data =
json.as_object().ok_or(AnalyzerParseError::ExpectedObject)?; 21 | 22 | // Get type 23 | let analyzer_type_json = data.get("type").ok_or(AnalyzerParseError::ExpectedKey("type".to_string()))?; 24 | let analyzer_type = analyzer_type_json.as_str().ok_or(AnalyzerParseError::ExpectedString)?; 25 | 26 | match analyzer_type { 27 | "custom" => { 28 | // Get tokenizer 29 | let tokenizer_name = match data.get("tokenizer") { 30 | Some(tokenizer_json) => { 31 | match tokenizer_json.as_str() { 32 | Some(tokenizer) => tokenizer, 33 | None => return Err(AnalyzerParseError::ExpectedString), 34 | } 35 | } 36 | None => return Err(AnalyzerParseError::ExpectedKey("tokenizer".to_string())), 37 | }; 38 | 39 | let tokenizer_spec = match index_metadata.tokenizers().get(tokenizer_name) { 40 | Some(tokenizer_spec) => tokenizer_spec, 41 | None => return Err(AnalyzerParseError::UnrecognisedTokenizer(tokenizer_name.to_string())), 42 | }; 43 | 44 | // Build analyzer 45 | let mut analyzer_spec = AnalyzerSpec { 46 | tokenizer: tokenizer_spec.clone(), 47 | filters: Vec::new(), 48 | }; 49 | 50 | // Add filters 51 | if let Some(filter_json) = data.get("filter") { 52 | match filter_json.as_array() { 53 | Some(filter_names) => { 54 | for filter_name_json in filter_names.iter() { 55 | // Get filter 56 | match filter_name_json.as_str() { 57 | Some(filter_name) => { 58 | let filter_spec = match index_metadata.filters().get(filter_name) { 59 | Some(filter_spec) => filter_spec, 60 | None => return Err(AnalyzerParseError::UnrecognisedFilter(filter_name.to_string())), 61 | }; 62 | 63 | analyzer_spec.filters.push(filter_spec.clone()); 64 | } 65 | None => return Err(AnalyzerParseError::ExpectedString), 66 | } 67 | } 68 | }, 69 | None => return Err(AnalyzerParseError::ExpectedArray), 70 | } 71 | } 72 | 73 | Ok(analyzer_spec) 74 | } 75 | // TODO 76 | // default/standard 77 | // standard_html_strip 78 | // simple 79 | // stop 80 | // whitespace 81 | // keyword 82 | // pattern 83 | // snowball 84 | // arabic 85 | // armenian 86 | // basque 87 | // brazilian 88 | // bulgarian 89 | // catalan 90 | // chinese 91 | // cjk 92 | // czech 93 | // danish 94 | // dutch 95 | // english 96 | // finnish 97 | // french 98 | // galician 99 | // german 100 | // greek 101 | // hindi 102 | // hungarian 103 | // indonesian 104 | // irish 105 | // italian 106 | // latvian 107 | // lithuanian 108 | // norwegian 109 | // persian 110 | // portuguese 111 | // romanian 112 | // sorani 113 | // spanish 114 | // swedish 115 | // turkish 116 | // thai 117 | // fingerprint 118 | _ => Err(AnalyzerParseError::UnrecognisedAnalyzerType(analyzer_type.to_string())), 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /src/index/metadata/parse/analysis_filter.rs: -------------------------------------------------------------------------------- 1 | use serde_json; 2 | 3 | use analysis::ngram_generator::Edge; 4 | use analysis::filters::FilterSpec; 5 | 6 | 7 | #[derive(Debug, PartialEq)] 8 | pub enum FilterParseError { 9 | ExpectedObject, 10 | ExpectedString, 11 | ExpectedPositiveInteger, 12 | ExpectedKey(String), 13 | UnrecognisedType(String), 14 | InvalidSideValue, 15 | } 16 | 17 | 18 | pub fn parse(json: &serde_json::Value) -> Result { 19 | let data = json.as_object().ok_or(FilterParseError::ExpectedObject)?; 20 | 21 | // Get type 22 | let filter_type_json = data.get("type").ok_or(FilterParseError::ExpectedKey("type".to_string()))?; 23 | let filter_type = filter_type_json.as_str().ok_or(FilterParseError::ExpectedString)?; 24 
| 25 | match filter_type { 26 | "asciifolding" => { 27 | Ok(FilterSpec::ASCIIFolding) 28 | } 29 | "lowercase" => { 30 | Ok(FilterSpec::Lowercase) 31 | } 32 | "nGram" | "ngram" => { 33 | let min_gram = match data.get("min_gram") { 34 | Some(min_gram_json) => { 35 | match min_gram_json.as_u64() { 36 | Some(min_gram) => min_gram as usize, 37 | None => return Err(FilterParseError::ExpectedPositiveInteger), 38 | } 39 | } 40 | None => 1 as usize, 41 | }; 42 | 43 | let max_gram = match data.get("max_gram") { 44 | Some(max_gram_json) => { 45 | match max_gram_json.as_u64() { 46 | Some(max_gram) => max_gram as usize, 47 | None => return Err(FilterParseError::ExpectedPositiveInteger), 48 | } 49 | } 50 | None => 2 as usize, 51 | }; 52 | 53 | Ok(FilterSpec::NGram { 54 | min_size: min_gram, 55 | max_size: max_gram, 56 | edge: Edge::Neither, 57 | }) 58 | } 59 | "edgeNGram" | "edge_ngram" => { 60 | let min_gram = match data.get("min_gram") { 61 | Some(min_gram_json) => { 62 | match min_gram_json.as_u64() { 63 | Some(min_gram) => min_gram as usize, 64 | None => return Err(FilterParseError::ExpectedPositiveInteger), 65 | } 66 | } 67 | None => 1 as usize, 68 | }; 69 | 70 | let max_gram = match data.get("max_gram") { 71 | Some(max_gram_json) => { 72 | match max_gram_json.as_u64() { 73 | Some(max_gram) => max_gram as usize, 74 | None => return Err(FilterParseError::ExpectedPositiveInteger), 75 | } 76 | } 77 | None => 2 as usize, 78 | }; 79 | 80 | let edge = match data.get("side") { 81 | Some(side_json) => { 82 | match side_json.as_str() { 83 | Some(side_string) => { 84 | let side_string_lower = side_string.to_lowercase(); 85 | match side_string_lower.as_ref() { 86 | "front" => { 87 | Edge::Left 88 | } 89 | "back" => { 90 | Edge::Right 91 | } 92 | _ => return Err(FilterParseError::InvalidSideValue) 93 | } 94 | }, 95 | None => return Err(FilterParseError::ExpectedString), 96 | } 97 | } 98 | None => Edge::Left, 99 | }; 100 | 101 | Ok(FilterSpec::NGram { 102 | min_size: min_gram, 103 | max_size: max_gram, 104 | edge: edge, 105 | }) 106 | } 107 | // TODO 108 | // stop 109 | // reverse 110 | // length 111 | // uppercase 112 | // porter_stem 113 | // kstem 114 | // standard 115 | // shingle 116 | // unique 117 | // truncate 118 | // trim 119 | // limit 120 | // common_grams 121 | // snowball 122 | // stemmer 123 | // word_delimiter 124 | // delimited_payload_filter 125 | // elision 126 | // keep 127 | // keep_types 128 | // pattern_capture 129 | // pattern_replace 130 | // dictionary_decompounder 131 | // hyphenation_decompounder 132 | // arabic_stem 133 | // brazilian_stem 134 | // czech_stem 135 | // dutch_stem 136 | // french_stem 137 | // german_stem 138 | // russian_stem 139 | // keyword_marker 140 | // stemmer_override 141 | // arabic_normalization 142 | // german_normalization 143 | // hindi_normalization 144 | // indic_normalization 145 | // sorani_normalization 146 | // persian_normalization 147 | // scandinavian_normalization 148 | // scandinavian_folding 149 | // serbian_normalization 150 | // hunspell 151 | // cjk_bigram 152 | // cjk_width 153 | // apostrophe 154 | // classic 155 | // decimal_digit 156 | // fingerprint 157 | _ => Err(FilterParseError::UnrecognisedType(filter_type.to_string())), 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/index/metadata/parse/analysis_tokenizer.rs: -------------------------------------------------------------------------------- 1 | use serde_json; 2 | 3 | use analysis::ngram_generator::Edge; 4 | use 
analysis::tokenizers::TokenizerSpec; 5 | 6 | 7 | #[derive(Debug, PartialEq)] 8 | pub enum TokenizerParseError { 9 | ExpectedObject, 10 | ExpectedString, 11 | ExpectedPositiveInteger, 12 | ExpectedKey(String), 13 | UnrecognisedType(String), 14 | InvalidSideValue, 15 | } 16 | 17 | 18 | pub fn parse(json: &serde_json::Value) -> Result { 19 | let data = json.as_object().ok_or(TokenizerParseError::ExpectedObject)?; 20 | 21 | // Get type 22 | let tokenizer_type_json = data.get("type").ok_or(TokenizerParseError::ExpectedKey("type".to_string()))?; 23 | let tokenizer_type = tokenizer_type_json.as_str().ok_or(TokenizerParseError::ExpectedString)?; 24 | 25 | match tokenizer_type { 26 | "standard" => { 27 | Ok(TokenizerSpec::Standard) 28 | } 29 | "lowercase" => { 30 | Ok(TokenizerSpec::Lowercase) 31 | } 32 | "nGram" | "ngram" => { 33 | let min_gram = match data.get("min_gram") { 34 | Some(min_gram_json) => { 35 | match min_gram_json.as_u64() { 36 | Some(min_gram) => min_gram as usize, 37 | None => return Err(TokenizerParseError::ExpectedPositiveInteger), 38 | } 39 | } 40 | None => 1 as usize, 41 | }; 42 | 43 | let max_gram = match data.get("max_gram") { 44 | Some(max_gram_json) => { 45 | match max_gram_json.as_u64() { 46 | Some(max_gram) => max_gram as usize, 47 | None => return Err(TokenizerParseError::ExpectedPositiveInteger), 48 | } 49 | } 50 | None => 2 as usize, 51 | }; 52 | 53 | Ok(TokenizerSpec::NGram { 54 | min_size: min_gram, 55 | max_size: max_gram, 56 | edge: Edge::Neither, 57 | }) 58 | } 59 | "edgeNGram" | "edge_ngram" => { 60 | let min_gram = match data.get("min_gram") { 61 | Some(min_gram_json) => { 62 | match min_gram_json.as_u64() { 63 | Some(min_gram) => min_gram as usize, 64 | None => return Err(TokenizerParseError::ExpectedPositiveInteger), 65 | } 66 | } 67 | None => 1 as usize, 68 | }; 69 | 70 | let max_gram = match data.get("max_gram") { 71 | Some(max_gram_json) => { 72 | match max_gram_json.as_u64() { 73 | Some(max_gram) => max_gram as usize, 74 | None => return Err(TokenizerParseError::ExpectedPositiveInteger), 75 | } 76 | } 77 | None => 2 as usize, 78 | }; 79 | 80 | let edge = match data.get("side") { 81 | Some(side_json) => { 82 | match side_json.as_str() { 83 | Some(side_string) => { 84 | let side_string_lower = side_string.to_lowercase(); 85 | match side_string_lower.as_ref() { 86 | "front" => { 87 | Edge::Left 88 | } 89 | "back" => { 90 | Edge::Right 91 | } 92 | _ => return Err(TokenizerParseError::InvalidSideValue) 93 | } 94 | }, 95 | None => return Err(TokenizerParseError::ExpectedString), 96 | } 97 | } 98 | None => Edge::Left, 99 | }; 100 | 101 | Ok(TokenizerSpec::NGram { 102 | min_size: min_gram, 103 | max_size: max_gram, 104 | edge: edge, 105 | }) 106 | } 107 | // TODO 108 | // uax_url_email 109 | // path_hierarchy/PathHierarchy 110 | // keyword 111 | // letter 112 | // whitespace 113 | // pattern 114 | // classic 115 | // thai 116 | _ => Err(TokenizerParseError::UnrecognisedType(tokenizer_type.to_owned())), 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/index/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod maintenance; 2 | pub mod metadata; 3 | 4 | use std::sync::RwLock; 5 | use std::path::PathBuf; 6 | 7 | use search::backends::rocksdb::RocksDBStore; 8 | use uuid::Uuid; 9 | 10 | use index::metadata::IndexMetadata; 11 | 12 | 13 | #[derive(Debug)] 14 | pub struct Index { 15 | id: Uuid, 16 | canonical_name: String, 17 | pub metadata: RwLock, 18 | pub store: 
RocksDBStore, 19 | } 20 | 21 | 22 | impl Index { 23 | pub fn new(id: Uuid, canonical_name: String, metadata: IndexMetadata, store: RocksDBStore) -> Index { 24 | Index { 25 | id: id, 26 | canonical_name: canonical_name, 27 | metadata: RwLock::new(metadata), 28 | store: store, 29 | } 30 | } 31 | 32 | pub fn id(&self) -> &Uuid { 33 | &self.id 34 | } 35 | 36 | pub fn canonical_name(&self) -> &str { 37 | &self.canonical_name 38 | } 39 | 40 | pub fn metadata_path(&self) -> PathBuf { 41 | let mut path = self.store.path().to_path_buf(); 42 | path.push("metadata.json"); 43 | path 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate chrono; 2 | #[macro_use] 3 | extern crate router; 4 | extern crate url; 5 | #[macro_use] 6 | extern crate slog; 7 | extern crate slog_term; 8 | extern crate slog_async; 9 | #[macro_use] 10 | extern crate maplit; 11 | extern crate unicode_segmentation; 12 | extern crate uuid; 13 | extern crate serde; 14 | #[macro_use] 15 | extern crate serde_derive; 16 | #[macro_use] 17 | extern crate serde_json; 18 | extern crate atomicwrites; 19 | extern crate fnv; 20 | #[macro_use] 21 | extern crate bitflags; 22 | extern crate roaring; 23 | extern crate byteorder; 24 | extern crate rocksdb; 25 | 26 | pub mod search; 27 | pub mod analysis; 28 | pub mod query_parser; 29 | pub mod mapping; 30 | pub mod document; 31 | pub mod index; 32 | pub mod cluster; 33 | pub mod system; 34 | mod api; 35 | 36 | use std::path::Path; 37 | use std::sync::Arc; 38 | use std::thread; 39 | use std::time::Duration; 40 | use std::panic; 41 | 42 | use slog::Drain; 43 | 44 | use system::System; 45 | 46 | 47 | const VERSION: &'static str = env!("CARGO_PKG_VERSION"); 48 | 49 | 50 | fn main() { 51 | // Setup logging 52 | let decorator = slog_term::TermDecorator::new().build(); 53 | let drain = slog_term::CompactFormat::new(decorator).build().fuse(); 54 | let drain = slog_async::Async::new(drain).build().fuse(); 55 | let log = slog::Logger::root(drain, o!()); 56 | 57 | info!(log, "starting rusticsearch"; "version" => VERSION); 58 | 59 | let system = Arc::new(System::new(log, Path::new("data/").to_path_buf())); 60 | 61 | info!(system.log, "loading indices"); 62 | system.load_indices(); 63 | 64 | { 65 | let system = system.clone(); 66 | thread::spawn(move || { 67 | loop { 68 | { 69 | let cluster_metadata = system.metadata.read().unwrap(); 70 | for index in cluster_metadata.indices.values() { 71 | let result = panic::catch_unwind(|| { 72 | index.run_maintenance_task().unwrap(); 73 | }); 74 | 75 | if let Err(error) = result { 76 | error!(system.log, "index maintenance task panicked"; "index" => index.canonical_name(), "error" => format!("{:?}", error)); 77 | } 78 | } 79 | } 80 | 81 | thread::sleep(Duration::new(1, 0)); 82 | } 83 | }); 84 | } 85 | 86 | info!(system.log, "starting api server"); 87 | api::api_main(system); 88 | } 89 | -------------------------------------------------------------------------------- /src/query_parser/and_query.rs: -------------------------------------------------------------------------------- 1 | //! 
Parses "and" queries 2 | 3 | use serde_json::Value as Json; 4 | use search::Query; 5 | use search::schema::Schema; 6 | 7 | use query_parser::{QueryBuildContext, QueryParseError, QueryBuilder, parse as parse_query}; 8 | 9 | 10 | #[derive(Debug)] 11 | struct AndQueryBuilder { 12 | queries: Vec>, 13 | } 14 | 15 | 16 | impl QueryBuilder for AndQueryBuilder { 17 | fn build(&self, context: &QueryBuildContext, schema: &Schema) -> Query { 18 | let mut queries = Vec::new(); 19 | 20 | for query in self.queries.iter() { 21 | queries.push(query.build(context, schema)); 22 | } 23 | 24 | Query::Conjunction { queries: queries } 25 | } 26 | } 27 | 28 | 29 | pub fn parse(json: &Json) -> Result, QueryParseError> { 30 | let filters = json.as_array().ok_or(QueryParseError::ExpectedArray)?; 31 | 32 | let mut queries = Vec::new(); 33 | for filter in filters.iter() { 34 | queries.push(parse_query(filter)?); 35 | } 36 | 37 | Ok(Box::new(AndQueryBuilder { 38 | queries: queries 39 | })) 40 | } 41 | 42 | 43 | #[cfg(test)] 44 | mod tests { 45 | use serde_json; 46 | 47 | use search::{Term, Query, TermScorer}; 48 | use search::schema::{Schema, FieldType, FIELD_INDEXED}; 49 | 50 | use query_parser::{QueryBuildContext, QueryParseError}; 51 | 52 | use super::parse; 53 | 54 | #[test] 55 | fn test_and_query() { 56 | let mut schema = Schema::new(); 57 | let test_field = schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 58 | 59 | let query = parse(&serde_json::from_str(" 60 | [ 61 | { 62 | \"term\": { 63 | \"test\": \"foo\" 64 | } 65 | }, 66 | { 67 | \"term\": { 68 | \"test\": \"bar\" 69 | } 70 | } 71 | ] 72 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 73 | 74 | assert_eq!(query, Ok(Query::Conjunction { 75 | queries: vec![ 76 | Query::Term { 77 | field: test_field, 78 | term: Term::from_string("foo"), 79 | scorer: TermScorer::default(), 80 | }, 81 | Query::Term { 82 | field: test_field, 83 | term: Term::from_string("bar"), 84 | scorer: TermScorer::default(), 85 | }, 86 | ], 87 | })) 88 | } 89 | 90 | #[test] 91 | fn test_gives_error_for_incorrect_type() { 92 | // String 93 | let query = parse(&serde_json::from_str(" 94 | \"hello\" 95 | ").unwrap()); 96 | 97 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 98 | 99 | // Object 100 | let query = parse(&serde_json::from_str(" 101 | { 102 | \"foo\": \"bar\" 103 | } 104 | ").unwrap()); 105 | 106 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 107 | 108 | // Integer 109 | let query = parse(&serde_json::from_str(" 110 | 123 111 | ").unwrap()); 112 | 113 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 114 | 115 | // Float 116 | let query = parse( &serde_json::from_str(" 117 | 123.1234 118 | ").unwrap()); 119 | 120 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/query_parser/constant_score_query.rs: -------------------------------------------------------------------------------- 1 | //! 
Parses "constant_score" queries 2 | 3 | use serde_json::Value as Json; 4 | use search::Query; 5 | use search::schema::Schema; 6 | 7 | use query_parser::{QueryBuildContext, QueryParseError, QueryBuilder, parse as parse_query}; 8 | use query_parser::utils::parse_float; 9 | 10 | #[derive(Debug)] 11 | struct ConstantScoreQueryBuilder { 12 | filter: Box, 13 | score: f32, 14 | } 15 | 16 | impl QueryBuilder for ConstantScoreQueryBuilder { 17 | fn build(&self, context: &QueryBuildContext, schema: &Schema) -> Query { 18 | Query::Filter { 19 | query: Box::new(Query::All{ score: self.score }), 20 | filter: Box::new(self.filter.build(&context.clone().no_score(), schema)), 21 | } 22 | } 23 | } 24 | 25 | pub fn parse(json: &Json) -> Result, QueryParseError> { 26 | let object = json.as_object().ok_or(QueryParseError::ExpectedObject)?; 27 | 28 | let filter = match object.get("filter") { 29 | Some(inner) => parse_query(inner)?, 30 | None => return Err(QueryParseError::ExpectedKey("filter")), 31 | }; 32 | 33 | let boost = match object.get("boost") { 34 | Some(inner) => parse_float(inner)?, 35 | None => return Err(QueryParseError::ExpectedKey("boost")), 36 | }; 37 | 38 | // Check for any keys that we don't recognise 39 | for key in object.keys() { 40 | match key.as_ref() { 41 | "filter" | "boost" => {}, 42 | _ => return Err(QueryParseError::UnrecognisedKey(key.clone())) 43 | } 44 | } 45 | 46 | Ok(Box::new(ConstantScoreQueryBuilder { 47 | filter: filter, 48 | score: boost, 49 | })) 50 | } 51 | 52 | #[cfg(test)] 53 | mod tests { 54 | use search::{Term, Query, TermScorer}; 55 | use search::schema::{Schema, FieldType, FIELD_INDEXED}; 56 | 57 | use query_parser::{QueryBuildContext, QueryParseError}; 58 | 59 | use super::parse; 60 | 61 | #[test] 62 | fn test_constant_score_query() { 63 | let mut schema = Schema::new(); 64 | let test_field = schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 65 | 66 | let query = parse(&json!({ 67 | "filter": { 68 | "term": { 69 | "test": "foo" 70 | }, 71 | }, 72 | "boost": 2.0 73 | })).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 74 | 75 | assert_eq!(query, Ok(Query::Filter { 76 | query: Box::new(Query::All { score: 2.0 }), 77 | filter: Box::new(Query::Term { 78 | field: test_field, 79 | term: Term::from_string("foo"), 80 | scorer: TermScorer::default(), 81 | }) 82 | })) 83 | } 84 | 85 | #[test] 86 | fn test_missing_filter() { 87 | let mut schema = Schema::new(); 88 | schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 89 | 90 | let query = parse(&json!({ 91 | "boost": 2.0 92 | })).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 93 | 94 | assert_eq!(query, Err(QueryParseError::ExpectedKey("filter"))); 95 | } 96 | 97 | #[test] 98 | fn test_missing_boost() { 99 | let mut schema = Schema::new(); 100 | schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 101 | 102 | let query = parse(&json!({ 103 | "filter": { 104 | "term": { 105 | "test": "foo" 106 | }, 107 | } 108 | })).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 109 | 110 | assert_eq!(query, Err(QueryParseError::ExpectedKey("boost"))); 111 | } 112 | 113 | #[test] 114 | fn test_extra_key() { 115 | let mut schema = Schema::new(); 116 | schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 117 | 118 | let query = parse(&json!({ 119 | "filter": { 120 | "term": { 121 | "test": "foo" 122 | }, 123 | }, 124 | "boost": 2.0, 125 | "foo": "bar" 126 | 
})).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 127 | 128 | assert_eq!(query, Err(QueryParseError::UnrecognisedKey("foo".to_string()))); 129 | } 130 | 131 | #[test] 132 | fn test_invalid_query() { 133 | let mut schema = Schema::new(); 134 | schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 135 | 136 | let query = parse(&json!({ 137 | "filter": "foo", 138 | "boost": 2.0 139 | })).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 140 | 141 | assert_eq!(query, Err(QueryParseError::ExpectedObject)); 142 | } 143 | 144 | #[test] 145 | fn test_gives_error_for_incorrect_type() { 146 | // String 147 | let query = parse(&json!("hello")); 148 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 149 | 150 | // Array 151 | let query = parse(&json!(["hello"])); 152 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 153 | 154 | // Integer 155 | let query = parse(&json!(123)); 156 | 157 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 158 | 159 | // Float 160 | let query = parse(&json!(123.456)); 161 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/query_parser/match_all_query.rs: -------------------------------------------------------------------------------- 1 | //! Parses "match_all" queries 2 | 3 | use serde_json::Value as Json; 4 | use search::Query; 5 | use search::schema::Schema; 6 | 7 | use query_parser::{QueryBuildContext, QueryParseError, QueryBuilder}; 8 | use query_parser::utils::parse_float; 9 | 10 | 11 | #[derive(Debug)] 12 | struct MatchAllQueryBuilder { 13 | boost: f32, 14 | } 15 | 16 | 17 | impl QueryBuilder for MatchAllQueryBuilder { 18 | fn build(&self, _context: &QueryBuildContext, _schema: &Schema) -> Query { 19 | Query::all().boost(self.boost) 20 | } 21 | } 22 | 23 | 24 | pub fn parse(json: &Json) -> Result, QueryParseError> { 25 | let object = json.as_object().ok_or(QueryParseError::ExpectedObject)?; 26 | 27 | // Get configuration 28 | let mut boost = 1.0f32; 29 | 30 | for (key, value) in object.iter() { 31 | match &key[..] 
{ 32 | "boost" => { 33 | boost = parse_float(value)?; 34 | } 35 | _ => return Err(QueryParseError::UnrecognisedKey(key.clone())) 36 | } 37 | } 38 | 39 | Ok(Box::new(MatchAllQueryBuilder { 40 | boost: boost, 41 | })) 42 | } 43 | 44 | 45 | #[cfg(test)] 46 | mod tests { 47 | use serde_json; 48 | 49 | use search::Query; 50 | use search::schema::Schema; 51 | 52 | use query_parser::{QueryBuildContext, QueryParseError}; 53 | 54 | use super::parse; 55 | 56 | #[test] 57 | fn test_match_all_query() { 58 | let schema = Schema::new(); 59 | 60 | let query = parse(&serde_json::from_str(" 61 | { 62 | } 63 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 64 | 65 | assert_eq!(query, Ok(Query::All {score: 1.0f32})) 66 | } 67 | 68 | #[test] 69 | fn test_with_boost() { 70 | let schema = Schema::new(); 71 | 72 | let query = parse( &serde_json::from_str(" 73 | { 74 | \"boost\": 2.0 75 | } 76 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 77 | 78 | assert_eq!(query, Ok(Query::All {score: 2.0f32})) 79 | } 80 | 81 | #[test] 82 | fn test_with_boost_integer() { 83 | let schema = Schema::new(); 84 | 85 | let query = parse(&serde_json::from_str(" 86 | { 87 | \"boost\": 2 88 | } 89 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 90 | 91 | assert_eq!(query, Ok(Query::All {score: 2.0f32})) 92 | } 93 | 94 | #[test] 95 | fn test_gives_error_for_incorrect_type() { 96 | // Array 97 | let query = parse(&serde_json::from_str(" 98 | [ 99 | \"foo\" 100 | ] 101 | ").unwrap()); 102 | 103 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 104 | 105 | // Integer 106 | let query = parse(&serde_json::from_str(" 107 | 123 108 | ").unwrap()); 109 | 110 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 111 | 112 | // Float 113 | let query = parse(&serde_json::from_str(" 114 | 123.1234 115 | ").unwrap()); 116 | 117 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 118 | } 119 | 120 | #[test] 121 | fn test_gives_error_for_incorrect_boost_type() { 122 | // String 123 | let query = parse(&serde_json::from_str(" 124 | { 125 | \"boost\": \"2\" 126 | } 127 | ").unwrap()); 128 | 129 | assert_eq!(query.err(), Some(QueryParseError::ExpectedFloat)); 130 | 131 | // Array 132 | let query = parse(&serde_json::from_str(" 133 | { 134 | \"boost\": [2] 135 | } 136 | ").unwrap()); 137 | 138 | assert_eq!(query.err(), Some(QueryParseError::ExpectedFloat)); 139 | 140 | // Object 141 | let query = parse(&serde_json::from_str(" 142 | { 143 | \"boost\": { 144 | \"value\": 2 145 | } 146 | } 147 | ").unwrap()); 148 | 149 | assert_eq!(query.err(), Some(QueryParseError::ExpectedFloat)); 150 | } 151 | 152 | #[test] 153 | fn test_gives_error_for_unrecognised_key() { 154 | let query = parse(&serde_json::from_str(" 155 | { 156 | \"hello\": \"world\" 157 | } 158 | ").unwrap()); 159 | 160 | assert_eq!(query.err(), Some(QueryParseError::UnrecognisedKey("hello".to_string()))); 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/query_parser/match_none_query.rs: -------------------------------------------------------------------------------- 1 | //! 
Parses "match_none" queries 2 | 3 | use serde_json::Value as Json; 4 | use search::Query; 5 | use search::schema::Schema; 6 | 7 | use query_parser::{QueryBuildContext, QueryParseError, QueryBuilder}; 8 | 9 | 10 | #[derive(Debug)] 11 | struct MatchNoneQueryBuilder; 12 | 13 | 14 | impl QueryBuilder for MatchNoneQueryBuilder { 15 | fn build(&self, _context: &QueryBuildContext, _schema: &Schema) -> Query { 16 | Query::None 17 | } 18 | } 19 | 20 | 21 | pub fn parse(json: &Json) -> Result, QueryParseError> { 22 | let object = json.as_object().ok_or(QueryParseError::ExpectedObject)?; 23 | 24 | // Get configuration 25 | for (key, _value) in object.iter() { 26 | match &key[..] { 27 | _ => return Err(QueryParseError::UnrecognisedKey(key.clone())) 28 | } 29 | } 30 | 31 | Ok(Box::new(MatchNoneQueryBuilder)) 32 | } 33 | 34 | 35 | #[cfg(test)] 36 | mod tests { 37 | use serde_json; 38 | 39 | use search::Query; 40 | use search::schema::Schema; 41 | 42 | use query_parser::{QueryBuildContext, QueryParseError}; 43 | 44 | use super::parse; 45 | 46 | #[test] 47 | fn test_match_none_query() { 48 | let schema = Schema::new(); 49 | 50 | let query = parse(&serde_json::from_str(" 51 | { 52 | } 53 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 54 | 55 | assert_eq!(query, Ok(Query::None)) 56 | } 57 | 58 | #[test] 59 | fn test_gives_error_for_incorrect_type() { 60 | // Array 61 | let query = parse(&serde_json::from_str(" 62 | [ 63 | \"foo\" 64 | ] 65 | ").unwrap()); 66 | 67 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 68 | 69 | // Integer 70 | let query = parse(&serde_json::from_str(" 71 | 123 72 | ").unwrap()); 73 | 74 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 75 | 76 | // Float 77 | let query = parse(&serde_json::from_str(" 78 | 123.1234 79 | ").unwrap()); 80 | 81 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 82 | } 83 | 84 | #[test] 85 | fn test_gives_error_for_unrecognised_key() { 86 | let query = parse(&serde_json::from_str(" 87 | { 88 | \"hello\": \"world\" 89 | } 90 | ").unwrap()); 91 | 92 | assert_eq!(query.err(), Some(QueryParseError::UnrecognisedKey("hello".to_string()))); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/query_parser/mod.rs: -------------------------------------------------------------------------------- 1 | //! 
Parses Elasticsearch Query DSL 2 | 3 | pub mod utils; 4 | pub mod match_query; 5 | pub mod multi_match_query; 6 | pub mod match_all_query; 7 | pub mod match_none_query; 8 | pub mod filtered_query; 9 | pub mod terms_query; 10 | pub mod term_query; 11 | pub mod prefix_query; 12 | pub mod and_query; 13 | pub mod or_query; 14 | pub mod not_query; 15 | pub mod constant_score_query; 16 | 17 | use std::fmt::Debug; 18 | 19 | use serde_json::Value as Json; 20 | use search::Query; 21 | use search::schema::Schema; 22 | 23 | use index::metadata::IndexMetadata; 24 | 25 | 26 | #[derive(Debug, Clone)] 27 | pub struct QueryBuildContext<'a> { 28 | pub index_metadata: Option<&'a IndexMetadata>, 29 | score_required: bool, 30 | } 31 | 32 | 33 | impl<'a> QueryBuildContext<'a> { 34 | pub fn new() -> QueryBuildContext<'a> { 35 | QueryBuildContext { 36 | index_metadata: None, 37 | score_required: true 38 | } 39 | } 40 | 41 | #[inline] 42 | pub fn set_index_metadata(mut self, index_metadata: &'a IndexMetadata) -> QueryBuildContext<'a> { 43 | self.index_metadata = Some(index_metadata); 44 | self 45 | } 46 | 47 | #[inline] 48 | pub fn no_score(mut self) -> QueryBuildContext<'a> { 49 | self.score_required = false; 50 | self 51 | } 52 | } 53 | 54 | 55 | #[derive(Debug, PartialEq)] 56 | pub enum QueryParseError { 57 | UnrecognisedQueryType(String), 58 | FieldDoesntExist(String), 59 | UnrecognisedKey(String), 60 | ExpectedKey(&'static str), 61 | ExpectedObject, 62 | ExpectedArray, 63 | ExpectedString, 64 | ExpectedFloat, 65 | ExpectedObjectOrString, 66 | InvalidValue, 67 | ExpectedSingleKey, 68 | InvalidOperator, 69 | } 70 | 71 | 72 | pub trait QueryBuilder: Debug { 73 | fn build(&self, context: &QueryBuildContext, schema: &Schema) -> Query; 74 | } 75 | 76 | 77 | fn get_query_parser(query_name: &str) -> Option<fn(&Json) -> Result<Box<QueryBuilder>, QueryParseError>> { 78 | match query_name { 79 | "match" => Some(match_query::parse), 80 | "multi_match" => Some(multi_match_query::parse), 81 | "match_all" => Some(match_all_query::parse), 82 | "match_none" => Some(match_none_query::parse), 83 | "filtered" => Some(filtered_query::parse), 84 | "terms" => Some(terms_query::parse), 85 | "in" => Some(terms_query::parse), 86 | "term" => Some(term_query::parse), 87 | "prefix" => Some(prefix_query::parse), 88 | "and" => Some(and_query::parse), 89 | "or" => Some(or_query::parse), 90 | "not" => Some(not_query::parse), 91 | "constant_score" => Some(constant_score_query::parse), 92 | _ => None 93 | } 94 | } 95 | 96 | 97 | pub fn parse(json: &Json) -> Result<Box<QueryBuilder>, QueryParseError> { 98 | let object = json.as_object().ok_or(QueryParseError::ExpectedObject)?; 99 | 100 | let query_type = if object.len() == 1 { 101 | object.keys().collect::<Vec<_>>()[0] 102 | } else { 103 | return Err(QueryParseError::ExpectedSingleKey) 104 | }; 105 | 106 | match get_query_parser(&query_type) { 107 | Some(parse) => parse(object.get(query_type).unwrap()), 108 | None => Err(QueryParseError::UnrecognisedQueryType(query_type.clone())), 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/query_parser/not_query.rs: -------------------------------------------------------------------------------- 1 | //!
Parses "not" queries 2 | 3 | use serde_json::Value as Json; 4 | use search::Query; 5 | use search::schema::Schema; 6 | 7 | use query_parser::{QueryBuildContext, QueryParseError, QueryBuilder, parse as parse_query}; 8 | 9 | 10 | #[derive(Debug)] 11 | struct NotQueryBuilder { 12 | query: Box, 13 | } 14 | 15 | 16 | impl QueryBuilder for NotQueryBuilder { 17 | fn build(&self, context: &QueryBuildContext, schema: &Schema) -> Query { 18 | Query::Exclude { 19 | query: Box::new(Query::all()), 20 | exclude: Box::new(self.query.build(&context.clone().no_score(), schema)), 21 | } 22 | } 23 | } 24 | 25 | 26 | pub fn parse(json: &Json) -> Result, QueryParseError> { 27 | Ok(Box::new(NotQueryBuilder { 28 | query: parse_query(json)?, 29 | })) 30 | } 31 | 32 | 33 | #[cfg(test)] 34 | mod tests { 35 | use serde_json; 36 | 37 | use search::{Term, Query, TermScorer}; 38 | use search::schema::{Schema, FieldType, FIELD_INDEXED}; 39 | 40 | use query_parser::{QueryBuildContext, QueryParseError}; 41 | 42 | use super::parse; 43 | 44 | #[test] 45 | fn test_not_query() { 46 | let mut schema = Schema::new(); 47 | let test_field = schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 48 | 49 | let query = parse(&&serde_json::from_str(" 50 | { 51 | \"term\": { 52 | \"test\": \"foo\" 53 | } 54 | } 55 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 56 | 57 | assert_eq!(query, Ok(Query::Exclude { 58 | query: Box::new(Query::all()), 59 | exclude: Box::new(Query::Term { 60 | field: test_field, 61 | term: Term::from_string("foo"), 62 | scorer: TermScorer::default(), 63 | }), 64 | })) 65 | } 66 | 67 | #[test] 68 | fn test_gives_error_for_incorrect_type() { 69 | // String 70 | let query = parse(&serde_json::from_str(" 71 | \"hello\" 72 | ").unwrap()); 73 | 74 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 75 | 76 | // Array 77 | let query = parse(&serde_json::from_str(" 78 | [ 79 | \"foo\" 80 | ] 81 | ").unwrap()); 82 | 83 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 84 | 85 | // Integer 86 | let query = parse(&serde_json::from_str(" 87 | 123 88 | ").unwrap()); 89 | 90 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 91 | 92 | // Float 93 | let query = parse(&serde_json::from_str(" 94 | 123.1234 95 | ").unwrap()); 96 | 97 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/query_parser/or_query.rs: -------------------------------------------------------------------------------- 1 | //! 
Parses "or" queries 2 | 3 | use serde_json::Value as Json; 4 | use search::Query; 5 | use search::schema::Schema; 6 | 7 | use query_parser::{QueryBuildContext, QueryParseError, QueryBuilder, parse as parse_query}; 8 | 9 | 10 | #[derive(Debug)] 11 | struct OrQueryBuilder { 12 | queries: Vec>, 13 | } 14 | 15 | 16 | impl QueryBuilder for OrQueryBuilder { 17 | fn build(&self, context: &QueryBuildContext, schema: &Schema) -> Query { 18 | let mut queries = Vec::new(); 19 | 20 | for query in self.queries.iter() { 21 | queries.push(query.build(context, schema)); 22 | } 23 | 24 | Query::Disjunction { queries: queries } 25 | } 26 | } 27 | 28 | 29 | 30 | pub fn parse(json: &Json) -> Result, QueryParseError> { 31 | let filters = json.as_array().ok_or(QueryParseError::ExpectedArray)?; 32 | 33 | let mut queries = Vec::new(); 34 | for filter in filters.iter() { 35 | queries.push(parse_query(filter)?); 36 | } 37 | 38 | Ok(Box::new(OrQueryBuilder { 39 | queries: queries 40 | })) 41 | } 42 | 43 | 44 | #[cfg(test)] 45 | mod tests { 46 | use serde_json; 47 | 48 | use search::{Term, Query, TermScorer}; 49 | use search::schema::{Schema, FieldType, FIELD_INDEXED}; 50 | 51 | use query_parser::{QueryBuildContext, QueryParseError}; 52 | 53 | use super::parse; 54 | 55 | #[test] 56 | fn test_or_query() { 57 | let mut schema = Schema::new(); 58 | let test_field = schema.add_field("test".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 59 | 60 | let query = parse(&serde_json::from_str(" 61 | [ 62 | { 63 | \"term\": { 64 | \"test\": \"foo\" 65 | } 66 | }, 67 | { 68 | \"term\": { 69 | \"test\": \"bar\" 70 | } 71 | } 72 | ] 73 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 74 | 75 | assert_eq!(query, Ok(Query::Disjunction { 76 | queries: vec![ 77 | Query::Term { 78 | field: test_field, 79 | term: Term::from_string("foo"), 80 | scorer: TermScorer::default(), 81 | }, 82 | Query::Term { 83 | field: test_field, 84 | term: Term::from_string("bar"), 85 | scorer: TermScorer::default(), 86 | }, 87 | ], 88 | })) 89 | } 90 | 91 | #[test] 92 | fn test_gives_error_for_incorrect_type() { 93 | // String 94 | let query = parse(&serde_json::from_str(" 95 | \"hello\" 96 | ").unwrap()); 97 | 98 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 99 | 100 | // Object 101 | let query = parse(&serde_json::from_str(" 102 | { 103 | \"foo\": \"bar\" 104 | } 105 | ").unwrap()); 106 | 107 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 108 | 109 | // Integer 110 | let query = parse(&serde_json::from_str(" 111 | 123 112 | ").unwrap()); 113 | 114 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 115 | 116 | // Float 117 | let query = parse(&serde_json::from_str(" 118 | 123.1234 119 | ").unwrap()); 120 | 121 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/query_parser/terms_query.rs: -------------------------------------------------------------------------------- 1 | //! 
Parses "match" queries 2 | 3 | use serde_json::Value as Json; 4 | use search::{Term, Query, TermScorer}; 5 | use search::schema::Schema; 6 | 7 | use query_parser::{QueryBuildContext, QueryParseError, QueryBuilder}; 8 | use query_parser::utils::json_value_to_term; 9 | 10 | #[derive(Debug)] 11 | struct TermsQueryBuilder { 12 | field: String, 13 | terms: Vec, 14 | } 15 | 16 | 17 | impl QueryBuilder for TermsQueryBuilder { 18 | fn build(&self, _context: &QueryBuildContext, schema: &Schema) -> Query { 19 | // Create a term query for each token 20 | let mut queries = Vec::new(); 21 | for term in self.terms.iter() { 22 | queries.push(Query::Term { 23 | field: schema.get_field_by_name(&self.field).unwrap(), 24 | term: term.clone(), 25 | scorer: TermScorer::default(), 26 | }); 27 | } 28 | 29 | Query::Disjunction { queries: queries } 30 | } 31 | } 32 | 33 | 34 | pub fn parse(json: &Json) -> Result, QueryParseError> { 35 | let object = json.as_object().ok_or(QueryParseError::ExpectedObject)?; 36 | 37 | let field_name = if object.len() == 1 { 38 | object.keys().collect::>()[0] 39 | } else { 40 | return Err(QueryParseError::ExpectedSingleKey); 41 | }; 42 | 43 | // Get configuration 44 | let terms: Vec = if let &Json::Array(ref arr) = object.get(field_name).unwrap() { 45 | arr.iter().filter_map(|term| json_value_to_term(&term)).collect() 46 | } else { 47 | return Err(QueryParseError::ExpectedArray); 48 | }; 49 | 50 | Ok(Box::new(TermsQueryBuilder { 51 | field: field_name.clone(), 52 | terms: terms, 53 | })) 54 | } 55 | 56 | 57 | #[cfg(test)] 58 | mod tests { 59 | use serde_json; 60 | 61 | use search::{Term, Query, TermScorer}; 62 | 63 | use query_parser::{QueryBuildContext, QueryParseError}; 64 | use search::schema::{Schema, FieldType, FIELD_INDEXED}; 65 | 66 | use super::parse; 67 | 68 | #[test] 69 | fn test_terms_query() { 70 | let mut schema = Schema::new(); 71 | let foo_field = schema.add_field("foo".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 72 | 73 | let query = parse(&serde_json::from_str(" 74 | { 75 | \"foo\": [\"bar\", \"baz\"] 76 | } 77 | ").unwrap()).and_then(|builder| Ok(builder.build(&QueryBuildContext::new(), &schema))); 78 | 79 | assert_eq!(query, Ok(Query::Disjunction { 80 | queries: vec![ 81 | Query::Term { 82 | field: foo_field, 83 | term: Term::from_string("bar"), 84 | scorer: TermScorer::default(), 85 | }, 86 | Query::Term { 87 | field: foo_field, 88 | term: Term::from_string("baz"), 89 | scorer: TermScorer::default(), 90 | } 91 | ], 92 | })) 93 | } 94 | 95 | #[test] 96 | fn test_gives_error_for_incorrect_type() { 97 | // Array 98 | let query = parse(&serde_json::from_str(" 99 | [ 100 | \"foo\" 101 | ] 102 | ").unwrap()); 103 | 104 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 105 | 106 | // Integer 107 | let query = parse(&serde_json::from_str(" 108 | 123 109 | ").unwrap()); 110 | 111 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 112 | 113 | // Float 114 | let query = parse(&serde_json::from_str(" 115 | 123.1234 116 | ").unwrap()); 117 | 118 | assert_eq!(query.err(), Some(QueryParseError::ExpectedObject)); 119 | } 120 | 121 | #[test] 122 | fn test_gives_error_for_incorrect_query_type() { 123 | // Object 124 | let query = parse(&serde_json::from_str(" 125 | { 126 | \"foo\": { 127 | \"query\": [\"bar\", \"baz\"] 128 | } 129 | } 130 | ").unwrap()); 131 | 132 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 133 | 134 | // String 135 | let query = parse(&serde_json::from_str(" 136 | { 137 | \"foo\": \"bar baz\" 138 | } 
139 | ").unwrap()); 140 | 141 | assert_eq!(query.err(), Some(QueryParseError::ExpectedArray)); 142 | } 143 | 144 | #[test] 145 | fn test_gives_error_for_missing_query() { 146 | let query = parse(&serde_json::from_str(" 147 | { 148 | } 149 | ").unwrap()); 150 | 151 | assert_eq!(query.err(), Some(QueryParseError::ExpectedSingleKey)); 152 | } 153 | 154 | #[test] 155 | fn test_gives_error_for_extra_key() { 156 | let query = parse(&serde_json::from_str(" 157 | { 158 | \"foo\": [\"bar\", \"baz\"], 159 | \"hello\": \"world\" 160 | } 161 | ").unwrap()); 162 | 163 | assert_eq!(query.err(), Some(QueryParseError::ExpectedSingleKey)); 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/query_parser/utils.rs: -------------------------------------------------------------------------------- 1 | use serde_json::Value as Json; 2 | use search::term::Term; 3 | 4 | use query_parser::QueryParseError; 5 | 6 | 7 | pub fn parse_string(json: &Json) -> Result { 8 | match *json { 9 | Json::String(ref string) => Ok(string.clone()), 10 | _ => Err(QueryParseError::ExpectedString), 11 | } 12 | } 13 | 14 | 15 | pub fn parse_float(json: &Json) -> Result { 16 | match json { 17 | &Json::Number(ref number) => { 18 | match number.as_f64() { 19 | Some(val) => Ok(val as f32), 20 | None => Err(QueryParseError::ExpectedFloat), 21 | } 22 | } 23 | _ => Err(QueryParseError::ExpectedFloat), 24 | } 25 | } 26 | 27 | 28 | #[derive(Debug)] 29 | pub enum Operator { 30 | Or, 31 | And, 32 | } 33 | 34 | 35 | pub fn parse_operator(json: &Json) -> Result { 36 | match *json { 37 | Json::String(ref value) => { 38 | match value.as_ref() { 39 | "or" => Ok(Operator::Or), 40 | "and" => Ok(Operator::And), 41 | _ => return Err(QueryParseError::InvalidOperator), 42 | } 43 | } 44 | _ => return Err(QueryParseError::InvalidOperator), 45 | } 46 | } 47 | 48 | 49 | pub fn parse_field_and_boost(json: &Json) -> Result<(String, f32), QueryParseError> { 50 | let string = parse_string(json)?; 51 | 52 | let split = string.split('^').collect::>(); 53 | if split.len() == 1 { 54 | return Ok((string.clone(), 1.0f32)); 55 | } else { 56 | let field_name = split[0].to_owned(); 57 | let boost: f32 = split[1].parse().unwrap_or(1.0f32); 58 | return Ok((field_name, boost)); 59 | } 60 | } 61 | 62 | 63 | pub fn json_value_to_term(json: &Json) -> Option { 64 | match json { 65 | &Json::String(ref string) => Some(Term::from_string(string)), 66 | &Json::Bool(value) => Some(Term::from_boolean(value)), 67 | &Json::Number(ref value) => { 68 | match value.as_i64() { 69 | Some(value) => Some(Term::from_integer(value)), 70 | None => None, 71 | } 72 | } 73 | &Json::Null => None, 74 | &Json::Array(_) => None, 75 | &Json::Object(_) => None, 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/search/backends/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod rocksdb; -------------------------------------------------------------------------------- /src/search/backends/rocksdb/benches/insert_document.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | extern crate kite; 5 | extern crate kite_rocksdb; 6 | extern crate rayon; 7 | extern crate fnv; 8 | 9 | use test::Bencher; 10 | use std::fs::remove_dir_all; 11 | 12 | use rayon::par_iter::{ParallelIterator, IntoParallelRefIterator}; 13 | use fnv::FnvHashMap; 14 | 15 | use search::term::Term; 16 | use 
search::token::Token; 17 | use search::schema::{FieldType, FIELD_INDEXED, FIELD_STORED}; 18 | use search::document::{Document, FieldValue}; 19 | 20 | use search::backends::rocksdb::RocksDBStore; 21 | 22 | #[bench] 23 | fn bench_insert_single_document(b: &mut Bencher) { 24 | remove_dir_all("test_indices/bench_insert_single_document"); 25 | 26 | let mut store = RocksDBStore::create("test_indices/bench_insert_single_document").unwrap(); 27 | let title_field = store.add_field("title".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 28 | let body_field = store.add_field("body".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 29 | let id_field = store.add_field("id".to_string(), FieldType::I64, FIELD_STORED).unwrap(); 30 | 31 | let mut tokens = Vec::new(); 32 | for t in 0..500 { 33 | tokens.push(Token { 34 | term: Term::from_string(&t.to_string()), 35 | position: t 36 | }); 37 | } 38 | 39 | let mut i = 0; 40 | b.iter(|| { 41 | i += 1; 42 | 43 | let mut indexed_fields = FnvHashMap::default(); 44 | indexed_fields.insert(body_field, tokens.clone().into()); 45 | indexed_fields.insert(title_field, vec![Token { term: Term::from_string(&i.to_string()), position: 1}].into()); 46 | 47 | let mut stored_fields = FnvHashMap::default(); 48 | stored_fields.insert(id_field, FieldValue::Integer(i)); 49 | 50 | store.insert_or_update_document(&Document { 51 | key: i.to_string(), 52 | indexed_fields: indexed_fields, 53 | stored_fields: stored_fields, 54 | }); 55 | }); 56 | } 57 | 58 | #[bench] 59 | fn bench_insert_documents_parallel(b: &mut Bencher) { 60 | remove_dir_all("test_indices/bench_insert_single_document_parallel"); 61 | 62 | let mut store = RocksDBStore::create("test_indices/bench_insert_single_document_parallel").unwrap(); 63 | let title_field = store.add_field("title".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 64 | let body_field = store.add_field("body".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 65 | let id_field = store.add_field("id".to_string(), FieldType::I64, FIELD_STORED).unwrap(); 66 | 67 | let mut tokens = Vec::new(); 68 | for t in 0..500 { 69 | tokens.push(Token { 70 | term: Term::from_string(&t.to_string()), 71 | position: t 72 | }); 73 | } 74 | 75 | let mut docs = Vec::new(); 76 | for i in 0..8 { 77 | let mut indexed_fields = FnvHashMap::default(); 78 | indexed_fields.insert(body_field, tokens.clone().into()); 79 | indexed_fields.insert(title_field, vec![Token { term: Term::from_string(&(i + 1).to_string()), position: 1}].into()); 80 | 81 | let mut stored_fields = FnvHashMap::default(); 82 | stored_fields.insert(id_field, FieldValue::Integer(i)); 83 | 84 | docs.push(Document { 85 | key: (i + 1).to_string(), 86 | indexed_fields: indexed_fields, 87 | stored_fields: stored_fields, 88 | }); 89 | } 90 | 91 | b.iter(move|| { 92 | docs.par_iter().for_each(|doc| { 93 | store.insert_or_update_document(doc); 94 | }); 95 | }); 96 | } 97 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/benches/merge_segments.rs: -------------------------------------------------------------------------------- 1 | #![feature(test)] 2 | 3 | extern crate test; 4 | extern crate kite; 5 | extern crate kite_rocksdb; 6 | extern crate fnv; 7 | 8 | use test::Bencher; 9 | use std::fs::remove_dir_all; 10 | 11 | use fnv::FnvHashMap; 12 | 13 | use search::term::Term; 14 | use search::token::Token; 15 | use search::schema::{FieldType, FIELD_INDEXED, FIELD_STORED}; 16 | use search::document::{Document, FieldValue}; 17 | 18 | use 
search::backends::rocksdb::RocksDBStore; 19 | 20 | #[bench] 21 | fn bench_merge_segments(b: &mut Bencher) { 22 | remove_dir_all("test_indices/bench_merge_segments"); 23 | 24 | let mut store = RocksDBStore::create("test_indices/bench_merge_segments").unwrap(); 25 | let title_field = store.add_field("title".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 26 | let body_field = store.add_field("body".to_string(), FieldType::Text, FIELD_INDEXED).unwrap(); 27 | let id_field = store.add_field("id".to_string(), FieldType::I64, FIELD_STORED).unwrap(); 28 | 29 | let mut tokens = Vec::new(); 30 | for t in 0..500 { 31 | tokens.push(Token { 32 | term: Term::from_string(&t.to_string()), 33 | position: t 34 | }); 35 | } 36 | 37 | // Make 1000 single-document segments 38 | for i in 0..1000 { 39 | let mut indexed_fields = FnvHashMap::default(); 40 | indexed_fields.insert(body_field, tokens.clone().into()); 41 | indexed_fields.insert(title_field, vec![Token { term: Term::from_string(&i.to_string()), position: 1}].into()); 42 | 43 | let mut stored_fields = FnvHashMap::default(); 44 | stored_fields.insert(id_field, FieldValue::Integer(i)); 45 | 46 | store.insert_or_update_document(&Document { 47 | key: i.to_string(), 48 | indexed_fields: indexed_fields, 49 | stored_fields: stored_fields, 50 | }); 51 | } 52 | 53 | // Merge them together in groups of 100 54 | // This is only run about 5 times so only half of the documents will be merged 55 | let mut i = 0; 56 | b.iter(|| { 57 | let start = i * 100; 58 | let stop = start + 100; 59 | let segments = (start..stop).collect::>(); 60 | 61 | store.merge_segments(&segments); 62 | store.purge_segments(&segments); 63 | 64 | i += 1; 65 | }); 66 | } 67 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/document_index.rs: -------------------------------------------------------------------------------- 1 | use std::sync::RwLock; 2 | use std::collections::HashMap; 3 | use std::io::Cursor; 4 | 5 | use rocksdb::{self, DB, WriteBatch}; 6 | use roaring::RoaringBitmap; 7 | use search::document::DocId; 8 | use search::segment::SegmentId; 9 | use byteorder::{ByteOrder, LittleEndian}; 10 | use fnv::FnvHashMap; 11 | 12 | use super::key_builder::KeyBuilder; 13 | use super::segment_ops::SegmentMergeError; 14 | 15 | /// Manages the index's "document index" 16 | pub struct DocumentIndexManager { 17 | primary_key_index: RwLock, DocId>>, 18 | } 19 | 20 | impl DocumentIndexManager { 21 | /// Generates a new document index 22 | pub fn new(_db: &DB) -> Result { 23 | Ok(DocumentIndexManager { 24 | primary_key_index: RwLock::new(HashMap::new()), 25 | }) 26 | } 27 | 28 | /// Loads the document index from an index 29 | pub fn open(db: &DB) -> Result { 30 | // Read primary key index 31 | let mut primary_key_index = HashMap::new(); 32 | let mut iter = db.raw_iterator(); 33 | iter.seek(b"k"); 34 | while iter.valid() { 35 | let k = iter.key().unwrap(); 36 | 37 | if k[0] != b'k' { 38 | break; 39 | } 40 | 41 | let v = iter.value().unwrap(); 42 | let segment = LittleEndian::read_u32(&v[0..4]); 43 | let ord = LittleEndian::read_u16(&v[4..6]); 44 | let doc_id = DocId(SegmentId(segment), ord); 45 | 46 | primary_key_index.insert(k[1..].to_vec(), doc_id); 47 | 48 | iter.next(); 49 | } 50 | 51 | Ok(DocumentIndexManager { 52 | primary_key_index: RwLock::new(primary_key_index), 53 | }) 54 | } 55 | 56 | fn delete_document_by_id_unchecked(&self, write_batch: &mut WriteBatch, doc_id: DocId) -> Result<(), rocksdb::Error> { 57 | let kb = 
KeyBuilder::segment_del_list((doc_id.0).0); 58 | let mut previous_doc_id_bytes = [0; 2]; 59 | LittleEndian::write_u16(&mut previous_doc_id_bytes, doc_id.1); 60 | try!(write_batch.merge(&kb.key(), &previous_doc_id_bytes)); 61 | 62 | // Increment deleted docs 63 | let kb = KeyBuilder::segment_stat((doc_id.0).0, b"deleted_docs"); 64 | let mut inc_bytes = [0; 8]; 65 | LittleEndian::write_i64(&mut inc_bytes, 1); 66 | try!(write_batch.merge(&kb.key(), &inc_bytes)); 67 | 68 | Ok(()) 69 | } 70 | 71 | pub fn insert_or_replace_key(&self, db: &DB, key: &Vec, doc_id: DocId) -> Result, rocksdb::Error> { 72 | // Update primary_key_index 73 | let mut write_batch = WriteBatch::default(); 74 | let previous_doc_id = self.primary_key_index.write().unwrap().insert(key.clone(), doc_id); 75 | 76 | let kb = KeyBuilder::primary_key_index(key); 77 | let mut doc_id_bytes = [0; 6]; 78 | LittleEndian::write_u32(&mut doc_id_bytes, (doc_id.0).0); 79 | LittleEndian::write_u16(&mut doc_id_bytes[4..], doc_id.1); 80 | try!(write_batch.put(&kb.key(), &doc_id_bytes)); 81 | 82 | // If there was a document there previously, delete it 83 | if let Some(previous_doc_id) = previous_doc_id { 84 | try!(self.delete_document_by_id_unchecked(&mut write_batch, previous_doc_id)); 85 | } 86 | 87 | // Write document data 88 | try!(db.write(write_batch)); 89 | 90 | Ok(previous_doc_id) 91 | } 92 | 93 | pub fn delete_document_by_key(&self, db: &DB, key: &Vec) -> Result, rocksdb::Error> { 94 | // Remove document from index 95 | let doc_id = self.primary_key_index.write().unwrap().remove(key); 96 | 97 | if let Some(doc_id) = doc_id { 98 | let mut write_batch = WriteBatch::default(); 99 | 100 | try!(self.delete_document_by_id_unchecked(&mut write_batch, doc_id)); 101 | 102 | try!(db.write(write_batch)); 103 | } 104 | 105 | Ok(doc_id) 106 | } 107 | 108 | pub fn contains_document_key(&self, key: &Vec) -> bool { 109 | self.primary_key_index.read().unwrap().contains_key(key) 110 | } 111 | 112 | pub fn commit_segment_merge(&self, db: &DB, mut write_batch: WriteBatch, source_segments: &Vec, dest_segment: u32, doc_id_mapping: &FnvHashMap) -> Result<(), SegmentMergeError> { 113 | // Lock the primary key index 114 | let mut primary_key_index = self.primary_key_index.write().unwrap(); 115 | 116 | // Update primary keys to point to their new locations 117 | let mut keys_to_update: HashMap, DocId> = HashMap::with_capacity(doc_id_mapping.len()); 118 | for (key, doc_id) in primary_key_index.iter() { 119 | if doc_id_mapping.contains_key(&doc_id) { 120 | keys_to_update.insert(key.clone(), *doc_id); 121 | } 122 | } 123 | 124 | for (key, doc_id) in keys_to_update { 125 | let new_doc_local_id = doc_id_mapping.get(&doc_id).unwrap(); 126 | let new_doc_id = DocId(SegmentId(dest_segment), *new_doc_local_id); 127 | 128 | let kb = KeyBuilder::primary_key_index(&key); 129 | let mut doc_id_bytes = [0; 6]; 130 | LittleEndian::write_u32(&mut doc_id_bytes, (new_doc_id.0).0); 131 | LittleEndian::write_u16(&mut doc_id_bytes[4..], new_doc_id.1); 132 | try!(write_batch.put(&kb.key(), &doc_id_bytes)); 133 | 134 | primary_key_index.insert(key, new_doc_id); 135 | } 136 | 137 | // Merge deletion lists 138 | // Must be done while the primary_key_index is locked as this prevents any more documents being deleted 139 | let mut deletion_list = RoaringBitmap::new(); 140 | for source_segment in source_segments { 141 | let kb = KeyBuilder::segment_del_list(*source_segment); 142 | match try!(db.get(&kb.key())) { 143 | Some(bitmap) => { 144 | let bitmap = 
RoaringBitmap::deserialize_from(Cursor::new(&bitmap[..])).unwrap(); 145 | for doc_id in bitmap.iter() { 146 | let doc_id = DocId(SegmentId(*source_segment), doc_id as u16); 147 | let new_doc_id = doc_id_mapping.get(&doc_id).unwrap(); 148 | deletion_list.insert(*new_doc_id as u32); 149 | } 150 | } 151 | None => {}, 152 | } 153 | } 154 | 155 | let mut dl_vec = Vec::new(); 156 | deletion_list.serialize_into(&mut dl_vec).unwrap(); 157 | 158 | let kb = KeyBuilder::segment_del_list(dest_segment); 159 | try!(db.put(&kb.key(), &dl_vec)); 160 | 161 | // Commit! 162 | try!(db.write_without_wal(write_batch)); 163 | 164 | Ok(()) 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/key_builder.rs: -------------------------------------------------------------------------------- 1 | pub struct KeyBuilder { 2 | key: Vec, 3 | } 4 | 5 | impl KeyBuilder { 6 | pub fn new() -> KeyBuilder { 7 | KeyBuilder { 8 | key: Vec::new(), 9 | } 10 | } 11 | 12 | pub fn with_capacity(size: usize) -> KeyBuilder { 13 | KeyBuilder { 14 | key: Vec::with_capacity(size), 15 | } 16 | } 17 | 18 | pub fn stored_field_value(segment: u32, doc_local_id: u16, field_id: u32, value_type: &[u8]) -> KeyBuilder { 19 | let mut kb = KeyBuilder::new(); 20 | kb.push_char(b'v'); 21 | kb.push_string(segment.to_string().as_bytes()); 22 | kb.separator(); 23 | kb.push_string(doc_local_id.to_string().as_bytes()); 24 | kb.separator(); 25 | kb.push_string(field_id.to_string().as_bytes()); 26 | kb.separator(); 27 | kb.push_string(value_type); 28 | kb 29 | } 30 | 31 | pub fn segment_stored_values_prefix(segment: u32) -> KeyBuilder { 32 | let mut kb = KeyBuilder::new(); 33 | kb.push_char(b'v'); 34 | kb.push_string(segment.to_string().as_bytes()); 35 | kb.separator(); 36 | kb 37 | } 38 | 39 | pub fn primary_key_index(key: &[u8]) -> KeyBuilder { 40 | let mut kb = KeyBuilder::with_capacity(1 + key.len()); 41 | kb.push_char(b'k'); 42 | kb.push_string(key); 43 | kb 44 | } 45 | 46 | pub fn term_dict_mapping(term: &[u8]) -> KeyBuilder { 47 | let mut kb = KeyBuilder::with_capacity(1 + term.len()); 48 | kb.push_char(b't'); 49 | kb.push_string(term); 50 | kb 51 | } 52 | 53 | pub fn segment_active(segment: u32) -> KeyBuilder { 54 | let mut kb = KeyBuilder::new(); 55 | kb.push_char(b'a'); 56 | kb.push_string(segment.to_string().as_bytes()); 57 | kb 58 | } 59 | 60 | pub fn segment_postings_list(segment: u32, field_id: u32, term_id: u32) -> KeyBuilder { 61 | let mut kb = KeyBuilder::new(); 62 | kb.push_char(b'd'); 63 | kb.push_string(field_id.to_string().as_bytes()); 64 | kb.separator(); 65 | kb.push_string(term_id.to_string().as_bytes()); 66 | kb.separator(); 67 | kb.push_string(segment.to_string().as_bytes()); 68 | kb 69 | } 70 | 71 | pub fn segment_stat_prefix(segment: u32) -> KeyBuilder { 72 | let mut kb = KeyBuilder::new(); 73 | kb.push_char(b's'); 74 | kb.push_string(segment.to_string().as_bytes()); 75 | kb.separator(); 76 | kb 77 | } 78 | 79 | pub fn segment_stat(segment: u32, name: &[u8]) -> KeyBuilder { 80 | let mut kb = KeyBuilder::segment_stat_prefix(segment); 81 | kb.push_string(name); 82 | kb 83 | } 84 | 85 | pub fn segment_stat_term_doc_frequency_stat_name(field_id: u32, term_id: u32) -> Vec { 86 | let mut stat_name = Vec::new(); 87 | for c in b"tdf" { 88 | stat_name.push(*c); 89 | } 90 | 91 | stat_name.push(b'-'); 92 | 93 | for c in field_id.to_string().as_bytes() { 94 | stat_name.push(*c); 95 | } 96 | 97 | stat_name.push(b'-'); 98 | 99 | for c in term_id.to_string().as_bytes() 
{ 100 | stat_name.push(*c); 101 | } 102 | 103 | stat_name 104 | } 105 | 106 | pub fn segment_stat_total_field_tokens_stat_name(field_id: u32) -> Vec { 107 | let mut stat_name = Vec::new(); 108 | for c in b"fttok" { 109 | stat_name.push(*c); 110 | } 111 | 112 | stat_name.push(b'-'); 113 | 114 | for c in field_id.to_string().as_bytes() { 115 | stat_name.push(*c); 116 | } 117 | 118 | stat_name 119 | } 120 | 121 | pub fn segment_stat_total_field_docs_stat_name(field_id: u32) -> Vec { 122 | let mut stat_name = Vec::new(); 123 | for c in b"ftdoc" { 124 | stat_name.push(*c); 125 | } 126 | 127 | stat_name.push(b'-'); 128 | 129 | for c in field_id.to_string().as_bytes() { 130 | stat_name.push(*c); 131 | } 132 | 133 | stat_name 134 | } 135 | 136 | pub fn segment_del_list(segment: u32) -> KeyBuilder { 137 | let mut kb = KeyBuilder::new(); 138 | kb.push_char(b'x'); 139 | kb.push_string(segment.to_string().as_bytes()); 140 | kb 141 | } 142 | 143 | #[inline] 144 | pub fn key(&self) -> &[u8] { 145 | &self.key[..] 146 | } 147 | 148 | #[inline] 149 | pub fn push_char(&mut self, c: u8) { 150 | if c == b'/' || c == b'\\' { 151 | self.key.push(b'\\'); 152 | } 153 | self.key.push(c); 154 | } 155 | 156 | pub fn push_string(&mut self, s: &[u8]) { 157 | for c in s { 158 | self.push_char(*c); 159 | } 160 | } 161 | 162 | pub fn separator(&mut self) { 163 | self.key.push(b'/'); 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/search/planner/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod boolean_query; 2 | pub mod score_function; 3 | 4 | use search::Query; 5 | 6 | use super::super::RocksDBReader; 7 | use self::boolean_query::{BooleanQueryOp, BooleanQueryBuilder, plan_boolean_query}; 8 | use self::score_function::{ScoreFunctionOp, plan_score_function}; 9 | 10 | #[derive(Debug)] 11 | pub struct SearchPlan { 12 | pub boolean_query: Vec, 13 | pub boolean_query_is_negated: bool, 14 | pub score_function: Vec, 15 | } 16 | 17 | impl SearchPlan { 18 | pub fn new() -> SearchPlan { 19 | SearchPlan { 20 | boolean_query: Vec::new(), 21 | boolean_query_is_negated: false, 22 | score_function: Vec::new(), 23 | } 24 | } 25 | } 26 | 27 | pub fn plan_query(index_reader: &RocksDBReader, query: &Query, score: bool) -> SearchPlan { 28 | let mut plan = SearchPlan::new(); 29 | 30 | // Plan boolean query 31 | let mut builder = BooleanQueryBuilder::new(); 32 | plan_boolean_query(index_reader, &mut builder, query); 33 | 34 | // Add operations to exclude deleted documents to boolean query 35 | builder.push_deletion_list(); 36 | builder.andnot_combinator(); 37 | 38 | let (boolean_query, boolean_query_is_negated) = builder.build(); 39 | plan.boolean_query = boolean_query; 40 | plan.boolean_query_is_negated = boolean_query_is_negated; 41 | 42 | // Plan score function 43 | if score { 44 | plan_score_function(index_reader, &mut plan.score_function, query); 45 | } else { 46 | plan.score_function.push(ScoreFunctionOp::Literal(0.0f32)); 47 | } 48 | 49 | plan 50 | } 51 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/search/planner/score_function.rs: -------------------------------------------------------------------------------- 1 | use search::schema::FieldId; 2 | use search::term::TermId; 3 | use search::Query; 4 | use search::query::term_scorer::TermScorer; 5 | 6 | use super::super::RocksDBReader; 7 | 8 | #[derive(Debug, Clone)] 9 | pub enum 
CombinatorScorer { 10 | Avg, 11 | Max, 12 | } 13 | 14 | #[derive(Debug, Clone)] 15 | pub enum ScoreFunctionOp { 16 | Literal(f32), 17 | TermScorer(FieldId, TermId, TermScorer), 18 | CombinatorScorer(u32, CombinatorScorer), 19 | } 20 | 21 | fn plan_score_function_combinator(index_reader: &RocksDBReader, mut score_function: &mut Vec, queries: &Vec, scorer: CombinatorScorer) { 22 | match queries.len() { 23 | 0 => { 24 | score_function.push(ScoreFunctionOp::Literal(0.0f32)); 25 | } 26 | 1 => plan_score_function(index_reader, &mut score_function, &queries[0]), 27 | _ => { 28 | let mut query_iter = queries.iter(); 29 | plan_score_function(index_reader, &mut score_function, query_iter.next().unwrap()); 30 | 31 | for query in query_iter { 32 | plan_score_function(index_reader, &mut score_function, query); 33 | } 34 | } 35 | } 36 | 37 | score_function.push(ScoreFunctionOp::CombinatorScorer(queries.len() as u32, scorer)); 38 | } 39 | 40 | pub fn plan_score_function(index_reader: &RocksDBReader, mut score_function: &mut Vec, query: &Query) { 41 | match *query { 42 | Query::All{ref score} => { 43 | score_function.push(ScoreFunctionOp::Literal(*score)); 44 | } 45 | Query::None => { 46 | score_function.push(ScoreFunctionOp::Literal(0.0f32)); 47 | } 48 | Query::Term{field, ref term, ref scorer} => { 49 | // Get term 50 | let term_id = match index_reader.store.term_dictionary.get(term) { 51 | Some(term_id) => term_id, 52 | None => { 53 | // Term doesn't exist, so will never match 54 | score_function.push(ScoreFunctionOp::Literal(0.0f32)); 55 | return 56 | } 57 | }; 58 | 59 | score_function.push(ScoreFunctionOp::TermScorer(field, term_id, scorer.clone())); 60 | } 61 | Query::MultiTerm{field, ref term_selector, ref scorer} => { 62 | // Get terms 63 | let mut total_terms = 0; 64 | for term_id in index_reader.store.term_dictionary.select(term_selector) { 65 | score_function.push(ScoreFunctionOp::TermScorer(field, term_id, scorer.clone())); 66 | total_terms += 1; 67 | } 68 | 69 | // This query must push only one score value onto the stack. 70 | // If we haven't pushed any score operations, Push a literal 0.0 71 | // If we have pushed more than one score operations, which will lead to more 72 | // than one score value being pushed to the stack, combine the score values 73 | // with a combinator operation. 
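            // For example, a prefix selector that matched three terms has pushed three
            // TermScorer ops above, so we append CombinatorScorer(3, Avg) to collapse
            // them back into a single score value on the stack.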
74 | match total_terms { 75 | 0 => score_function.push(ScoreFunctionOp::Literal(0.0f32)), 76 | 1 => {}, 77 | _ => score_function.push(ScoreFunctionOp::CombinatorScorer(total_terms, CombinatorScorer::Avg)), 78 | } 79 | } 80 | Query::Conjunction{ref queries} => { 81 | plan_score_function_combinator(index_reader, &mut score_function, queries, CombinatorScorer::Avg); 82 | } 83 | Query::Disjunction{ref queries} => { 84 | plan_score_function_combinator(index_reader, &mut score_function, queries, CombinatorScorer::Avg); 85 | } 86 | Query::DisjunctionMax{ref queries} => { 87 | plan_score_function_combinator(index_reader, &mut score_function, queries, CombinatorScorer::Max); 88 | } 89 | Query::Filter{ref query, ..} => { 90 | plan_score_function(index_reader, &mut score_function, query); 91 | } 92 | Query::Exclude{ref query, ..} => { 93 | plan_score_function(index_reader, &mut score_function, query); 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/search/statistics.rs: -------------------------------------------------------------------------------- 1 | use fnv::FnvHashMap; 2 | 3 | use search::schema::FieldId; 4 | use search::term::TermId; 5 | use search::segment::Segment; 6 | 7 | use super::super::RocksDBReader; 8 | use super::super::key_builder::KeyBuilder; 9 | 10 | pub trait StatisticsReader { 11 | fn total_docs(&mut self, field_id: FieldId) -> Result; 12 | fn total_tokens(&mut self, field_id: FieldId) -> Result; 13 | fn term_document_frequency(&mut self, field_id: FieldId, term_id: TermId) -> Result; 14 | } 15 | 16 | pub struct RocksDBStatisticsReader<'a> { 17 | index_reader: &'a RocksDBReader<'a>, 18 | total_docs: FnvHashMap, 19 | total_tokens: FnvHashMap, 20 | term_document_frequencies: FnvHashMap<(FieldId, TermId), i64>, 21 | } 22 | 23 | impl<'a> RocksDBStatisticsReader<'a> { 24 | pub fn new(index_reader: &'a RocksDBReader) -> RocksDBStatisticsReader<'a> { 25 | RocksDBStatisticsReader { 26 | index_reader: index_reader, 27 | total_docs: FnvHashMap::default(), 28 | total_tokens: FnvHashMap::default(), 29 | term_document_frequencies: FnvHashMap::default(), 30 | } 31 | } 32 | 33 | fn get_statistic(&self, name: &[u8]) -> Result { 34 | let mut val = 0; 35 | 36 | for segment in self.index_reader.store.segments.iter_active(&self.index_reader) { 37 | if let Some(new_val) = try!(segment.load_statistic(name)) { 38 | val += new_val; 39 | } 40 | } 41 | 42 | Ok(val) 43 | } 44 | } 45 | 46 | impl<'a> StatisticsReader for RocksDBStatisticsReader<'a> { 47 | fn total_docs(&mut self, field_id: FieldId) -> Result { 48 | if let Some(val) = self.total_docs.get(&field_id) { 49 | return Ok(*val); 50 | } 51 | 52 | let stat_name = KeyBuilder::segment_stat_total_field_docs_stat_name(field_id.0); 53 | let val = try!(self.get_statistic(&stat_name)); 54 | self.total_docs.insert(field_id, val); 55 | Ok(val) 56 | } 57 | 58 | fn total_tokens(&mut self, field_id: FieldId) -> Result { 59 | if let Some(val) = self.total_tokens.get(&field_id) { 60 | return Ok(*val); 61 | } 62 | 63 | let stat_name = KeyBuilder::segment_stat_total_field_tokens_stat_name(field_id.0); 64 | let val = try!(self.get_statistic(&stat_name)); 65 | self.total_tokens.insert(field_id, val); 66 | Ok(val) 67 | } 68 | 69 | fn term_document_frequency(&mut self, field_id: FieldId, term_id: TermId) -> Result { 70 | if let Some(val) = self.term_document_frequencies.get(&(field_id, term_id)) { 71 | return Ok(*val); 72 | } 73 | 74 | let stat_name = 
KeyBuilder::segment_stat_term_doc_frequency_stat_name(field_id.0, term_id.0); 75 | let val = try!(self.get_statistic(&stat_name)); 76 | self.term_document_frequencies.insert((field_id, term_id), val); 77 | Ok(val) 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/segment.rs: -------------------------------------------------------------------------------- 1 | use std::io::Cursor; 2 | 3 | use search::segment::{SegmentId, Segment}; 4 | use search::schema::FieldId; 5 | use search::term::TermId; 6 | use roaring::RoaringBitmap; 7 | use byteorder::{ByteOrder, LittleEndian}; 8 | 9 | use super::RocksDBReader; 10 | use super::key_builder::KeyBuilder; 11 | 12 | pub struct RocksDBSegment<'a> { 13 | reader: &'a RocksDBReader<'a>, 14 | id: u32, 15 | } 16 | 17 | impl<'a> RocksDBSegment<'a> { 18 | pub fn new(reader: &'a RocksDBReader, id: u32) -> RocksDBSegment<'a> { 19 | RocksDBSegment { 20 | reader: reader, 21 | id: id, 22 | } 23 | } 24 | } 25 | 26 | impl<'a> Segment for RocksDBSegment<'a> { 27 | fn id(&self) -> SegmentId { 28 | SegmentId(self.id) 29 | } 30 | 31 | fn load_statistic(&self, stat_name: &[u8]) -> Result, String> { 32 | let kb = KeyBuilder::segment_stat(self.id, stat_name); 33 | let val = try!(self.reader.snapshot.get(&kb.key())).map(|val| LittleEndian::read_i64(&val)); 34 | Ok(val) 35 | } 36 | 37 | fn load_stored_field_value_raw(&self, doc_local_id: u16, field_id: FieldId, value_type: &[u8]) -> Result>, String> { 38 | let kb = KeyBuilder::stored_field_value(self.id, doc_local_id, field_id.0, value_type); 39 | let val = try!(self.reader.snapshot.get(&kb.key())); 40 | Ok(val.map(|v| v.to_vec())) 41 | } 42 | 43 | fn load_postings_list(&self, field_id: FieldId, term_id: TermId) -> Result, String> { 44 | let kb = KeyBuilder::segment_postings_list(self.id, field_id.0, term_id.0); 45 | let doc_id_set = try!(self.reader.snapshot.get(&kb.key())).map(|doc_id_set| RoaringBitmap::deserialize_from(Cursor::new(&doc_id_set[..])).unwrap()); 46 | Ok(doc_id_set) 47 | } 48 | 49 | fn load_deletion_list(&self) -> Result, String> { 50 | let kb = KeyBuilder::segment_del_list(self.id); 51 | let doc_id_set = try!(self.reader.snapshot.get(&kb.key())).map(|doc_id_set| RoaringBitmap::deserialize_from(Cursor::new(&doc_id_set[..])).unwrap()); 52 | Ok(doc_id_set) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/segment_builder.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use search::{Document, Term, TermId}; 4 | use search::schema::FieldId; 5 | use search::segment::{SegmentId, Segment}; 6 | use byteorder::{LittleEndian, WriteBytesExt}; 7 | use roaring::RoaringBitmap; 8 | use fnv::FnvHashMap; 9 | 10 | use super::key_builder::KeyBuilder; 11 | 12 | #[derive(Debug)] 13 | pub struct SegmentBuilder { 14 | current_doc: u16, 15 | pub term_dictionary: HashMap, 16 | current_term_id: u32, 17 | pub postings_lists: FnvHashMap<(FieldId, TermId), RoaringBitmap>, 18 | pub statistics: FnvHashMap, i64>, 19 | pub stored_field_values: FnvHashMap<(FieldId, u16, Vec), Vec>, 20 | } 21 | 22 | #[derive(Debug)] 23 | pub enum DocumentInsertError { 24 | /// Segment couldn't hold any more docs 25 | SegmentFull, 26 | } 27 | 28 | impl SegmentBuilder { 29 | pub fn new() -> SegmentBuilder { 30 | SegmentBuilder { 31 | current_doc: 0, 32 | term_dictionary: HashMap::new(), 33 | current_term_id: 0, 34 | postings_lists: 
FnvHashMap::default(), 35 | statistics: FnvHashMap::default(), 36 | stored_field_values: FnvHashMap::default(), 37 | } 38 | } 39 | 40 | fn get_term_id(&mut self, term: &Term) -> TermId { 41 | if let Some(term_id) = self.term_dictionary.get(term) { 42 | return *term_id; 43 | } 44 | 45 | // Add the term to the dictionary 46 | let term_id = TermId(self.current_term_id); 47 | self.current_term_id += 1; 48 | self.term_dictionary.insert(term.clone(), term_id); 49 | 50 | term_id 51 | } 52 | 53 | pub fn add_document(&mut self, doc: &Document) -> Result { 54 | // Get document ord 55 | let doc_id = self.current_doc; 56 | self.current_doc += 1; 57 | try!(self.current_doc.checked_add(1).ok_or(DocumentInsertError::SegmentFull)); 58 | 59 | // Insert indexed fields 60 | let mut term_frequencies = FnvHashMap::default(); 61 | for (field_id, tokens) in doc.indexed_fields.iter() { 62 | let mut field_token_count = 0; 63 | 64 | for (term, positions) in tokens.iter() { 65 | let frequency = positions.len(); 66 | field_token_count += frequency; 67 | 68 | // Get term id 69 | let term_id = self.get_term_id(term); 70 | 71 | // Term frequency 72 | let term_frequency = term_frequencies.entry(term_id).or_insert(0); 73 | *term_frequency += frequency; 74 | 75 | // Write postings list 76 | self.postings_lists.entry((*field_id, term_id)).or_insert_with(RoaringBitmap::new).insert(doc_id as u32); 77 | 78 | // Write term frequency 79 | // 1 is by far the most common frequency. At search time, we interpret a missing 80 | // key as meaning there is a term frequency of 1 81 | if frequency != 1 { 82 | let mut value_type = vec![b't', b'f']; 83 | value_type.extend(term_id.0.to_string().as_bytes()); 84 | 85 | let mut frequency_bytes: Vec = Vec::new(); 86 | frequency_bytes.write_i64::(frequency as i64).unwrap(); 87 | 88 | self.stored_field_values.insert((*field_id, doc_id, value_type), frequency_bytes); 89 | } 90 | 91 | // Increment term document frequency 92 | let stat_name = KeyBuilder::segment_stat_term_doc_frequency_stat_name(field_id.0, term_id.0); 93 | let stat = self.statistics.entry(stat_name).or_insert(0); 94 | *stat += 1; 95 | } 96 | 97 | // Field length 98 | // Used by the BM25 similarity model 99 | let length = ((field_token_count as f32).sqrt() - 1.0) * 3.0; 100 | let length = if length > 255.0 { 255.0 } else { length } as u8; 101 | if length != 0 { 102 | self.stored_field_values.insert((*field_id, doc_id, b"len".to_vec()), vec![length]); 103 | } 104 | 105 | // Increment total field docs 106 | { 107 | let stat_name = KeyBuilder::segment_stat_total_field_docs_stat_name(field_id.0); 108 | let stat = self.statistics.entry(stat_name).or_insert(0); 109 | *stat += 1; 110 | } 111 | 112 | // Increment total field tokens 113 | { 114 | let stat_name = KeyBuilder::segment_stat_total_field_tokens_stat_name(field_id.0); 115 | let stat = self.statistics.entry(stat_name).or_insert(0); 116 | *stat += field_token_count as i64; 117 | } 118 | } 119 | 120 | // Insert stored fields 121 | for (field, value) in doc.stored_fields.iter() { 122 | self.stored_field_values.insert((*field, doc_id, b"val".to_vec()), value.to_bytes()); 123 | } 124 | 125 | // Increment total docs 126 | { 127 | let stat = self.statistics.entry(b"total_docs".to_vec()).or_insert(0); 128 | *stat += 1; 129 | } 130 | 131 | Ok(doc_id) 132 | } 133 | } 134 | 135 | impl Segment for SegmentBuilder { 136 | fn id(&self) -> SegmentId { 137 | SegmentId(0) 138 | } 139 | 140 | fn load_statistic(&self, stat_name: &[u8]) -> Result, String> { 141 | 
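        // The builder accumulates statistics in memory as documents are added,
        // so reading one back is a plain map lookup rather than a RocksDB read.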
Ok(self.statistics.get(stat_name).cloned()) 142 | } 143 | 144 | fn load_stored_field_value_raw(&self, doc_local_id: u16, field_id: FieldId, value_type: &[u8]) -> Result>, String> { 145 | Ok(self.stored_field_values.get(&(field_id, doc_local_id, value_type.to_vec())).cloned()) 146 | } 147 | 148 | fn load_postings_list(&self, field_id: FieldId, term_id: TermId) -> Result, String> { 149 | Ok(self.postings_lists.get(&(field_id, term_id)).cloned()) 150 | } 151 | 152 | fn load_deletion_list(&self) -> Result, String> { 153 | Ok(None) 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/segment_manager.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | use std::sync::atomic::{AtomicUsize, Ordering}; 3 | 4 | use rocksdb::{self, DB, DBRawIterator}; 5 | 6 | use super::RocksDBReader; 7 | use super::segment::RocksDBSegment; 8 | 9 | /// Manages "segments" within the index 10 | /// 11 | /// The index is partitioned into immutable segments. This manager is responsible 12 | /// for allocating segments keeping track of which segments are active and 13 | /// controlling routine tasks such as merging and vacuuming 14 | pub struct SegmentManager { 15 | next_segment: AtomicUsize, 16 | } 17 | 18 | impl SegmentManager { 19 | /// Generates a new segment manager 20 | pub fn new(db: &DB) -> Result { 21 | // TODO: Raise error if .next_segment already exists 22 | // Next segment 23 | try!(db.put(b".next_segment", b"1")); 24 | 25 | Ok(SegmentManager { 26 | next_segment: AtomicUsize::new(1), 27 | }) 28 | } 29 | 30 | /// Loads the segment manager from an index 31 | pub fn open(db: &DB) -> Result { 32 | let next_segment = match try!(db.get(b".next_segment")) { 33 | Some(next_segment) => { 34 | next_segment.to_utf8().unwrap().parse::().unwrap() 35 | } 36 | None => 1, // TODO: error 37 | }; 38 | 39 | Ok(SegmentManager { 40 | next_segment: AtomicUsize::new(next_segment as usize), 41 | }) 42 | } 43 | 44 | /// Allocates a new (inactive) segment 45 | pub fn new_segment(&self, db: &DB) -> Result { 46 | let next_segment = self.next_segment.fetch_add(1, Ordering::SeqCst) as u32; 47 | try!(db.put(b".next_segment", (next_segment + 1).to_string().as_bytes())); 48 | Ok(next_segment) 49 | } 50 | 51 | /// Iterates currently active segments 52 | pub fn iter_active<'a>(&self, reader: &'a RocksDBReader) -> ActiveSegmentsIterator<'a> { 53 | let mut iter = reader.snapshot.raw_iterator(); 54 | iter.seek(b"a"); 55 | ActiveSegmentsIterator { 56 | reader: reader, 57 | iter: iter, 58 | fused: false, 59 | } 60 | } 61 | } 62 | 63 | pub struct ActiveSegmentsIterator<'a> { 64 | reader: &'a RocksDBReader<'a>, 65 | iter: DBRawIterator, 66 | fused: bool, 67 | } 68 | 69 | impl<'a> Iterator for ActiveSegmentsIterator<'a> { 70 | type Item = RocksDBSegment<'a>; 71 | 72 | fn next(&mut self) -> Option> { 73 | if !self.fused && self.iter.valid() { 74 | let segment_id = { 75 | let k = self.iter.key().unwrap(); 76 | 77 | if k[0] != b'a' { 78 | self.fused = true; 79 | return None; 80 | } 81 | 82 | str::from_utf8(&k[1..]).unwrap().parse::().unwrap() 83 | }; 84 | 85 | self.iter.next(); 86 | 87 | Some(RocksDBSegment::new(self.reader, segment_id)) 88 | } else { 89 | None 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/segment_stats.rs: -------------------------------------------------------------------------------- 1 | use search::segment::Segment; 
2 | 3 | use super::RocksDBStore; 4 | 5 | #[derive(Debug)] 6 | pub struct SegmentStatistics { 7 | total_docs: i64, 8 | deleted_docs: i64, 9 | } 10 | 11 | impl SegmentStatistics { 12 | fn read(segment: &S) -> Result { 13 | let total_docs = try!(segment.load_statistic(b"total_docs")).unwrap_or(0); 14 | let deleted_docs = try!(segment.load_statistic(b"deleted_docs")).unwrap_or(0); 15 | 16 | Ok(SegmentStatistics { 17 | total_docs: total_docs, 18 | deleted_docs: deleted_docs, 19 | }) 20 | } 21 | 22 | #[inline] 23 | pub fn total_docs(&self) -> i64 { 24 | self.total_docs 25 | } 26 | 27 | #[inline] 28 | pub fn deleted_docs(&self) -> i64 { 29 | self.deleted_docs 30 | } 31 | } 32 | 33 | impl RocksDBStore { 34 | pub fn get_segment_statistics(&self) -> Result, String> { 35 | let mut segment_stats = Vec::new(); 36 | let reader = self.reader(); 37 | 38 | for segment in self.segments.iter_active(&reader) { 39 | let stats = try!(SegmentStatistics::read(&segment)); 40 | segment_stats.push((segment.id().0, stats)); 41 | } 42 | 43 | Ok(segment_stats) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/search/backends/rocksdb/term_dictionary.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | use std::sync::{Mutex, RwLock}; 3 | use std::sync::atomic::{AtomicUsize, Ordering}; 4 | use std::collections::HashMap; 5 | 6 | use rocksdb::{self, DB}; 7 | use search::{Term, TermId}; 8 | use search::query::multi_term_selector::MultiTermSelector; 9 | 10 | use super::key_builder::KeyBuilder; 11 | 12 | /// Manages the index's "term dictionary" 13 | /// 14 | /// Because terms can be very long, we don't use their byte-representations as 15 | /// keys. We generate a unique number for each one to use instead. 16 | /// 17 | /// The term dictionary is a mapping between terms and their internal IDs 18 | /// (aka. TermId). It is entirely held in memory and persisted to the disk. 
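/// For example, a term such as `rust` might be assigned `TermId(42)`; lookups go
/// through the in-memory map, while new entries are also written to RocksDB under
/// a `t`-prefixed key (see `KeyBuilder::term_dict_mapping`) so they survive restarts.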
19 | pub struct TermDictionaryManager { 20 | next_term_id: AtomicUsize, 21 | terms: RwLock>, 22 | write_lock: Mutex, 23 | } 24 | 25 | impl TermDictionaryManager { 26 | /// Generates a new term dictionary 27 | pub fn new(db: &DB) -> Result { 28 | // TODO: Raise error if .next_term_id already exists 29 | // Next term id 30 | try!(db.put(b".next_term_id", b"1")); 31 | 32 | Ok(TermDictionaryManager { 33 | next_term_id: AtomicUsize::new(1), 34 | terms: RwLock::new(HashMap::new()), 35 | write_lock: Mutex::new(0), 36 | }) 37 | } 38 | 39 | /// Loads the term dictionary from an index 40 | pub fn open(db: &DB) -> Result { 41 | let next_term_id = match try!(db.get(b".next_term_id")) { 42 | Some(next_term_id) => { 43 | next_term_id.to_utf8().unwrap().parse::().unwrap() 44 | } 45 | None => 1, // TODO: error 46 | }; 47 | 48 | // Read dictionary 49 | let mut terms = HashMap::new(); 50 | let mut iter = db.raw_iterator(); 51 | iter.seek(b"t"); 52 | while iter.valid() { 53 | let k = iter.key().unwrap(); 54 | 55 | if k[0] != b't' { 56 | break; 57 | } 58 | 59 | let term_id = TermId(str::from_utf8(unsafe { &iter.value_inner().unwrap() }).unwrap().parse::().unwrap()); 60 | terms.insert(Term::from_bytes(&k[1..]), term_id); 61 | 62 | iter.next(); 63 | } 64 | 65 | Ok(TermDictionaryManager { 66 | next_term_id: AtomicUsize::new(next_term_id as usize), 67 | terms: RwLock::new(terms), 68 | write_lock: Mutex::new(0), 69 | }) 70 | } 71 | 72 | /// Retrieves the TermId for the given term 73 | pub fn get(&self, term: &Term) -> Option { 74 | self.terms.read().unwrap().get(term).cloned() 75 | } 76 | 77 | /// Iterates over terms in the dictionary which match the selector 78 | pub fn select(&self, term_selector: &MultiTermSelector) -> Vec { 79 | self.terms.read().unwrap().iter() 80 | .filter(|&(term, _term_id)| { 81 | term_selector.matches(term) 82 | }) 83 | .map(|(_term, term_id)| *term_id) 84 | .collect() 85 | } 86 | 87 | /// Retrieves the TermId for the given term, adding the term to the 88 | /// dictionary if it doesn't exist 89 | pub fn get_or_create(&self, db: &DB, term: &Term) -> Result { 90 | if let Some(term_id) = self.get(term) { 91 | return Ok(term_id); 92 | } 93 | 94 | // Term doesn't exist in the term dictionary 95 | 96 | // Increment next_term_id 97 | let next_term_id = self.next_term_id.fetch_add(1, Ordering::SeqCst) as u32; 98 | try!(db.put(b".next_term_id", (next_term_id + 1).to_string().as_bytes())); 99 | 100 | // Create term id 101 | let term_id = TermId(next_term_id); 102 | 103 | // Get write lock 104 | // Note: We have a separate lock so we don't need to keep an exclusive 105 | // lock on the in-memory term dictionary while writing to disk, as this 106 | // blocks readers. 107 | let _guard = self.write_lock.lock().unwrap(); 108 | 109 | // It's possible that another thread has written the term to the dictionary 110 | // since we checked earlier. If this is the case, We should forget about 111 | // writing our TermId and use the one that has been inserted already. 
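        // (Second half of a double-checked pattern: the lookup at the top of this
        // function ran without `write_lock`; this one runs while holding it.)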
112 | if let Some(term_id) = self.terms.read().unwrap().get(term) { 113 | return Ok(*term_id); 114 | } 115 | 116 | // Write it to the on-disk term dictionary 117 | let kb = KeyBuilder::term_dict_mapping(term.as_bytes()); 118 | try!(db.put(kb.key(), next_term_id.to_string().as_bytes())); 119 | 120 | // Write it to the term dictionary 121 | self.terms.write().unwrap().insert(term.clone(), term_id);; 122 | 123 | Ok(term_id) 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /src/search/collectors/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod total_count; 2 | pub mod top_score; 3 | 4 | #[derive(Debug)] 5 | pub struct DocumentMatch { 6 | id: u64, 7 | score: Option, 8 | } 9 | 10 | impl DocumentMatch { 11 | pub fn new_unscored(id: u64) -> DocumentMatch { 12 | DocumentMatch { 13 | id: id, 14 | score: None, 15 | } 16 | } 17 | 18 | pub fn new_scored(id: u64, score: f32) -> DocumentMatch { 19 | DocumentMatch { 20 | id: id, 21 | score: Some(score), 22 | } 23 | } 24 | 25 | #[inline] 26 | pub fn doc_id(&self) -> u64 { 27 | self.id 28 | } 29 | 30 | #[inline] 31 | pub fn score(&self) -> Option { 32 | self.score 33 | } 34 | } 35 | 36 | pub trait Collector { 37 | fn needs_score(&self) -> bool; 38 | fn collect(&mut self, doc: DocumentMatch); 39 | } 40 | -------------------------------------------------------------------------------- /src/search/collectors/top_score.rs: -------------------------------------------------------------------------------- 1 | use std::cmp::Ordering; 2 | use std::collections::BinaryHeap; 3 | 4 | use search::collectors::{Collector, DocumentMatch}; 5 | 6 | /// An f32 that cannot be NaN. 7 | /// We need to order documents by score but NaN cannot be ordered, so we convert all scores into 8 | /// Realf32 first, handling any invalid values while doing that conversion 9 | #[derive(Debug, Copy, Clone, PartialEq, PartialOrd)] 10 | struct RealF32(f32); 11 | 12 | impl RealF32 { 13 | fn new(val: f32) -> Option { 14 | if val.is_nan() { 15 | None 16 | } else { 17 | Some(RealF32(val)) 18 | } 19 | } 20 | } 21 | 22 | impl Eq for RealF32 {} 23 | 24 | impl Ord for RealF32 { 25 | fn cmp(&self, other: &RealF32) -> Ordering { 26 | self.partial_cmp(other).unwrap() 27 | } 28 | } 29 | 30 | #[derive(Debug, Copy, Clone, PartialEq, Eq)] 31 | struct ScoredDocument { 32 | id: u64, 33 | score: RealF32, 34 | } 35 | 36 | impl Ord for ScoredDocument { 37 | fn cmp(&self, other: &ScoredDocument) -> Ordering { 38 | self.score.cmp(&other.score) 39 | } 40 | } 41 | 42 | impl PartialOrd for ScoredDocument { 43 | fn partial_cmp(&self, other: &ScoredDocument) -> Option { 44 | Some(self.cmp(other)) 45 | } 46 | } 47 | 48 | #[derive(Debug)] 49 | pub struct TopScoreCollector { 50 | max_docs: usize, 51 | heap: BinaryHeap, 52 | } 53 | 54 | impl TopScoreCollector { 55 | pub fn new(max_docs: usize) -> TopScoreCollector { 56 | TopScoreCollector { 57 | max_docs: max_docs, 58 | heap: BinaryHeap::with_capacity(max_docs + 1), 59 | } 60 | } 61 | 62 | pub fn into_sorted_vec(self) -> Vec { 63 | self.heap.into_sorted_vec().iter() 64 | .map(|scored_document| { 65 | DocumentMatch::new_scored(scored_document.id, -scored_document.score.0) 66 | }) 67 | .collect() 68 | } 69 | } 70 | 71 | impl Collector for TopScoreCollector { 72 | fn needs_score(&self) -> bool { 73 | true 74 | } 75 | 76 | fn collect(&mut self, doc: DocumentMatch) { 77 | let doc_id = doc.doc_id(); 78 | let score = doc.score(); 79 | 80 | // Build a ScoredDocument object, 
checking that the score is set and not NaN 81 | let scored_document = match score { 82 | Some(score) => { 83 | // Convert to RealF32 which is orderable but does not support NaN 84 | match RealF32::new(-score) { 85 | Some(real_score) => { 86 | ScoredDocument { 87 | id: doc_id, 88 | score: real_score, 89 | } 90 | } 91 | None => { 92 | // Score was NaN 93 | panic!("document with 'NaN' score was passed into TopScoreCollector"); 94 | } 95 | } 96 | } 97 | None => { 98 | panic!("unscored document was passed into TopScoreCollector"); 99 | } 100 | }; 101 | 102 | // Now insert the document into the heap 103 | self.heap.push(scored_document); 104 | 105 | // Now reduce the heap size if it's too big 106 | if self.heap.len() > self.max_docs { 107 | self.heap.pop(); 108 | } 109 | } 110 | } 111 | 112 | #[cfg(test)] 113 | mod tests { 114 | use search::collectors::{Collector, DocumentMatch}; 115 | use super::TopScoreCollector; 116 | 117 | #[test] 118 | fn test_top_score_collector_inital_state() { 119 | let collector = TopScoreCollector::new(10); 120 | 121 | let docs = collector.into_sorted_vec(); 122 | assert_eq!(docs.len(), 0); 123 | } 124 | 125 | #[test] 126 | fn test_top_score_collector_needs_score() { 127 | let collector = TopScoreCollector::new(10); 128 | 129 | assert_eq!(collector.needs_score(), true); 130 | } 131 | 132 | #[test] 133 | fn test_top_score_collector_collect() { 134 | let mut collector = TopScoreCollector::new(10); 135 | 136 | collector.collect(DocumentMatch::new_scored(0, 1.0f32)); 137 | collector.collect(DocumentMatch::new_scored(1, 0.5f32)); 138 | collector.collect(DocumentMatch::new_scored(2, 2.0f32)); 139 | collector.collect(DocumentMatch::new_scored(3, 1.5f32)); 140 | 141 | let docs = collector.into_sorted_vec(); 142 | assert_eq!(docs.len(), 4); 143 | assert_eq!(docs[0].id, 2); 144 | assert_eq!(docs[1].id, 3); 145 | assert_eq!(docs[2].id, 0); 146 | assert_eq!(docs[3].id, 1); 147 | } 148 | 149 | #[test] 150 | fn test_top_score_collector_truncate() { 151 | let mut collector = TopScoreCollector::new(2); 152 | 153 | collector.collect(DocumentMatch::new_scored(0, 1.0f32)); 154 | collector.collect(DocumentMatch::new_scored(1, 0.5f32)); 155 | collector.collect(DocumentMatch::new_scored(2, 2.0f32)); 156 | 157 | let docs = collector.into_sorted_vec(); 158 | assert_eq!(docs.len(), 2); 159 | assert_eq!(docs[0].id, 2); 160 | assert_eq!(docs[1].id, 0); 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/search/collectors/total_count.rs: -------------------------------------------------------------------------------- 1 | use search::collectors::{Collector, DocumentMatch}; 2 | 3 | #[derive(Debug)] 4 | pub struct TotalCountCollector { 5 | total_count: u64, 6 | } 7 | 8 | impl TotalCountCollector { 9 | pub fn new() -> TotalCountCollector { 10 | TotalCountCollector { 11 | total_count: 0, 12 | } 13 | } 14 | 15 | pub fn get_total_count(&self) -> u64 { 16 | self.total_count 17 | } 18 | } 19 | 20 | impl Collector for TotalCountCollector { 21 | fn needs_score(&self) -> bool { 22 | false 23 | } 24 | 25 | fn collect(&mut self, _doc: DocumentMatch) { 26 | self.total_count += 1; 27 | } 28 | } 29 | 30 | #[cfg(test)] 31 | mod tests { 32 | use search::collectors::{Collector, DocumentMatch}; 33 | use super::TotalCountCollector; 34 | 35 | #[test] 36 | fn test_total_count_collector_inital_state() { 37 | let collector = TotalCountCollector::new(); 38 | 39 | assert_eq!(collector.get_total_count(), 0); 40 | } 41 | 42 | #[test] 43 | fn 
test_total_count_collector_needs_score() { 44 | let collector = TotalCountCollector::new(); 45 | 46 | assert_eq!(collector.needs_score(), false); 47 | } 48 | 49 | #[test] 50 | fn test_total_count_collector_collect() { 51 | let mut collector = TotalCountCollector::new(); 52 | 53 | collector.collect(DocumentMatch::new_unscored(0)); 54 | collector.collect(DocumentMatch::new_unscored(1)); 55 | collector.collect(DocumentMatch::new_unscored(2)); 56 | 57 | assert_eq!(collector.get_total_count(), 3); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/search/document.rs: -------------------------------------------------------------------------------- 1 | use chrono::{DateTime, Utc, Timelike}; 2 | use byteorder::{WriteBytesExt, LittleEndian}; 3 | use fnv::FnvHashMap; 4 | 5 | use search::term_vector::TermVector; 6 | use search::schema::FieldId; 7 | use search::segment::SegmentId; 8 | 9 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] 10 | pub struct DocId(pub SegmentId, pub u16); 11 | 12 | impl DocId { 13 | pub fn as_u64(&self) -> u64 { 14 | ((self.0).0 as u64) << 16 | (self.1 as u64) 15 | } 16 | 17 | pub fn from_u64(val: u64) -> DocId { 18 | let segment = (val >> 16) & 0xFFFFFFFF; 19 | let local_id = val & 0xFFFF; 20 | DocId(SegmentId(segment as u32), local_id as u16) 21 | } 22 | } 23 | 24 | #[derive(Debug, Clone)] 25 | pub enum FieldValue { 26 | String(String), 27 | Integer(i64), 28 | Boolean(bool), 29 | DateTime(DateTime), 30 | } 31 | 32 | impl FieldValue { 33 | pub fn to_bytes(&self) -> Vec { 34 | match *self { 35 | FieldValue::String(ref string) => { 36 | let mut bytes = Vec::with_capacity(string.len()); 37 | 38 | for byte in string.as_bytes() { 39 | bytes.push(*byte); 40 | } 41 | 42 | bytes 43 | } 44 | FieldValue::Integer(value) => { 45 | let mut bytes = Vec::with_capacity(8); 46 | bytes.write_i64::(value).unwrap(); 47 | bytes 48 | } 49 | FieldValue::Boolean(value) => { 50 | if value { 51 | vec![b't'] 52 | } else { 53 | vec![b'f'] 54 | } 55 | } 56 | FieldValue::DateTime(value) => { 57 | let mut bytes = Vec::with_capacity(0); 58 | let timestamp = value.timestamp(); 59 | let micros = value.nanosecond() / 1000; 60 | let timestamp_with_micros = timestamp * 1000000 + micros as i64; 61 | bytes.write_i64::(timestamp_with_micros).unwrap(); 62 | bytes 63 | } 64 | } 65 | } 66 | } 67 | 68 | #[derive(Debug, Clone)] 69 | pub struct Document { 70 | pub key: String, 71 | pub indexed_fields: FnvHashMap, 72 | pub stored_fields: FnvHashMap, 73 | } 74 | -------------------------------------------------------------------------------- /src/search/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod term; 2 | pub mod token; 3 | pub mod term_vector; 4 | pub mod schema; 5 | pub mod document; 6 | pub mod segment; 7 | pub mod similarity; 8 | pub mod query; 9 | pub mod collectors; 10 | pub mod backends; 11 | 12 | pub use search::term::{Term, TermId}; 13 | pub use search::token::Token; 14 | pub use search::document::{Document, DocId}; 15 | pub use search::query::multi_term_selector::MultiTermSelector; 16 | pub use search::query::term_scorer::TermScorer; 17 | pub use search::query::Query; 18 | -------------------------------------------------------------------------------- /src/search/query/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod multi_term_selector; 2 | pub mod term_scorer; 3 | 4 | use search::term::Term; 5 | use search::schema::FieldId; 6 | use 
search::query::multi_term_selector::MultiTermSelector; 7 | use search::query::term_scorer::TermScorer; 8 | 9 | #[derive(Debug, PartialEq)] 10 | pub enum Query { 11 | /// Matches all documents, assigning the specified score to each one 12 | All { 13 | /// The score to assign to each document 14 | score: f32, 15 | }, 16 | 17 | /// Matches nothing 18 | None, 19 | 20 | /// Matches documents that contain the specified term in the specified field 21 | Term { 22 | /// The field being searched 23 | field: FieldId, 24 | 25 | /// The term to search for 26 | term: Term, 27 | 28 | /// The method of scoring each match 29 | scorer: TermScorer, 30 | }, 31 | 32 | /// Matches documents by a multi term selector 33 | /// Used for prefix, fuzzy and regex queries 34 | MultiTerm { 35 | /// The field being searched 36 | field: FieldId, 37 | 38 | /// The term selector to use. All terms that match this selector will be searched 39 | term_selector: MultiTermSelector, 40 | 41 | /// The method of scoring each match. 42 | scorer: TermScorer, 43 | }, 44 | 45 | /// Joins multiple queries with an AND operator 46 | /// This intersects the results of the queries. The scores are combined by average 47 | Conjunction { 48 | queries: Vec<Query>, 49 | }, 50 | 51 | /// Joins multiple queries with an OR operator 52 | /// This unites the results of the queries. The scores are combined by average 53 | Disjunction { 54 | queries: Vec<Query>, 55 | }, 56 | 57 | /// Joins multiple queries with an OR operator 58 | /// This unites the results of the queries. 59 | /// Unlike a regular Disjunction query, this takes the highest score of each query for a particular match 60 | DisjunctionMax { 61 | queries: Vec<Query>, 62 | }, 63 | 64 | /// Removes documents that do not match the "filter" query from the results 65 | /// Basically the same as a Conjunction query except that the "filter" query does not affect the score 66 | Filter { 67 | query: Box<Query>, 68 | filter: Box<Query> 69 | }, 70 | 71 | /// Removes documents that match the "exclude" query from the results 72 | Exclude { 73 | query: Box<Query>, 74 | exclude: Box<Query> 75 | }, 76 | } 77 | 78 | impl Query { 79 | /// Creates a new All query 80 | pub fn all() -> Query { 81 | Query::All { 82 | score: 1.0f32, 83 | } 84 | } 85 | 86 | /// Creates a new Term query 87 | pub fn term(field: FieldId, term: Term) -> Query { 88 | Query::Term { 89 | field: field, 90 | term: term, 91 | scorer: TermScorer::default(), 92 | } 93 | } 94 | 95 | /// Filters the query by another query 96 | /// Only documents that match the other query will remain in the results but the other query will not affect the score 97 | pub fn filter(self, filter: Query) -> Query { 98 | Query::Filter { 99 | query: Box::new(self), 100 | filter: Box::new(filter), 101 | } 102 | } 103 | 104 | /// Filters the query to exclude documents that match the other query 105 | pub fn exclude(self, exclude: Query) -> Query { 106 | Query::Exclude { 107 | query: Box::new(self), 108 | exclude: Box::new(exclude), 109 | } 110 | } 111 | 112 | #[inline] 113 | /// Multiplies the score of documents that match the query by the specified "boost" value 114 | pub fn boost(mut self, boost: f32) -> Query { 115 | self.add_boost(boost); 116 | self 117 | } 118 | 119 | fn add_boost(&mut self, add_boost: f32) { 120 | if add_boost == 1.0f32 { 121 | // This boost query won't have any effect 122 | return; 123 | } 124 | 125 | match *self { 126 | Query::All{ref mut score} => { 127 | *score *= add_boost; 128 | }, 129 | Query::None => (), 130 | Query::Term{ref mut scorer, ..} => { 131 | scorer.boost *= add_boost; 132 | } 133 | 
Query::MultiTerm{ref mut scorer, ..} => { 134 | scorer.boost *= add_boost; 135 | } 136 | Query::Conjunction{ref mut queries} => { 137 | for query in queries { 138 | query.add_boost(add_boost); 139 | } 140 | } 141 | Query::Disjunction{ref mut queries} => { 142 | for query in queries { 143 | query.add_boost(add_boost); 144 | } 145 | } 146 | Query::DisjunctionMax{ref mut queries} => { 147 | for query in queries { 148 | query.add_boost(add_boost); 149 | } 150 | } 151 | Query::Filter{ref mut query, ..} => { 152 | query.add_boost(add_boost); 153 | } 154 | Query::Exclude{ref mut query, ..} => { 155 | query.add_boost(add_boost); 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/search/query/multi_term_selector.rs: -------------------------------------------------------------------------------- 1 | use search::term::Term; 2 | 3 | #[derive(Debug, PartialEq)] 4 | pub enum MultiTermSelector { 5 | Prefix(String), 6 | } 7 | 8 | impl MultiTermSelector { 9 | pub fn matches(&self, term: &Term) -> bool { 10 | match *self { 11 | MultiTermSelector::Prefix(ref prefix) => { 12 | return term.as_bytes().starts_with(prefix.as_bytes()); 13 | } 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/search/query/term_scorer.rs: -------------------------------------------------------------------------------- 1 | use search::similarity::SimilarityModel; 2 | 3 | #[derive(Debug, Clone, PartialEq)] 4 | pub struct TermScorer { 5 | pub similarity_model: SimilarityModel, 6 | pub boost: f32, 7 | } 8 | 9 | impl TermScorer { 10 | pub fn default_with_boost(boost: f32) -> TermScorer { 11 | TermScorer { 12 | similarity_model: SimilarityModel::Bm25 { 13 | k1: 1.2, 14 | b: 0.75, 15 | }, 16 | boost: boost, 17 | } 18 | } 19 | } 20 | 21 | impl Default for TermScorer { 22 | fn default() -> TermScorer { 23 | TermScorer::default_with_boost(1.0f32) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/search/schema.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::ops::Deref; 3 | use std::fmt; 4 | 5 | use serde::{Serialize, Deserialize, Serializer, Deserializer}; 6 | use fnv::FnvHashMap; 7 | 8 | bitflags! 
{ 9 | pub flags FieldFlags: u32 { 10 | const FIELD_INDEXED = 0b00000001, 11 | const FIELD_STORED = 0b00000010, 12 | } 13 | } 14 | 15 | impl Serialize for FieldFlags { 16 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 17 | where S: Serializer 18 | { 19 | let mut flag_strings = Vec::new(); 20 | 21 | if self.contains(FIELD_INDEXED) { 22 | flag_strings.push("INDEXED"); 23 | } 24 | 25 | if self.contains(FIELD_STORED) { 26 | flag_strings.push("STORED"); 27 | } 28 | 29 | serializer.serialize_str(&flag_strings.join("|")) 30 | } 31 | } 32 | 33 | impl<'a> Deserialize<'a> for FieldFlags { 34 | fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 35 | where D: Deserializer<'a> 36 | { 37 | struct Visitor; 38 | 39 | impl<'a> ::serde::de::Visitor<'a> for Visitor { 40 | type Value = FieldFlags; 41 | 42 | fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { 43 | formatter.write_str("a string of flag names separated by a '|' character") 44 | } 45 | 46 | fn visit_str<E>(self, value: &str) -> Result<Self::Value, E> 47 | where E: ::serde::de::Error 48 | { 49 | let mut flags = FieldFlags::empty(); 50 | 51 | for flag_s in value.split("|") { 52 | match flag_s { 53 | "INDEXED" => { 54 | flags |= FIELD_INDEXED; 55 | } 56 | "STORED" => { 57 | flags |= FIELD_STORED; 58 | } 59 | _ => {} // TODO: error 60 | } 61 | } 62 | 63 | Ok(flags) 64 | } 65 | } 66 | 67 | deserializer.deserialize_str(Visitor) 68 | } 69 | } 70 | 71 | #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] 72 | pub enum FieldType { 73 | Text, 74 | PlainString, 75 | I64, 76 | Boolean, 77 | DateTime, 78 | } 79 | 80 | #[derive(Debug, Clone, Serialize, Deserialize)] 81 | pub struct FieldInfo { 82 | name: String, 83 | pub field_type: FieldType, 84 | pub field_flags: FieldFlags, 85 | } 86 | 87 | impl FieldInfo { 88 | pub fn new(name: String, field_type: FieldType, field_flags: FieldFlags) -> FieldInfo { 89 | FieldInfo { 90 | name: name, 91 | field_type: field_type, 92 | field_flags: field_flags, 93 | } 94 | } 95 | } 96 | 97 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] 98 | pub struct FieldId(pub u32); 99 | 100 | // FieldId needs to be serialised as a string as it's used as a mapping key 101 | impl Serialize for FieldId { 102 | fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> 103 | where S: Serializer 104 | { 105 | serializer.serialize_str(&self.0.to_string()) 106 | } 107 | } 108 | 109 | impl<'a> Deserialize<'a> for FieldId { 110 | fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> 111 | where D: Deserializer<'a> 112 | { 113 | struct Visitor; 114 | 115 | impl<'a> ::serde::de::Visitor<'a> for Visitor { 116 | type Value = FieldId; 117 | 118 | fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { 119 | formatter.write_str("a string containing an integer") 120 | } 121 | 122 | fn visit_str<E>(self, value: &str) -> Result<Self::Value, E> 123 | where E: ::serde::de::Error 124 | { 125 | match value.parse() { 126 | Ok(value) => Ok(FieldId(value)), 127 | Err(_) => Err(E::invalid_value(::serde::de::Unexpected::Str(value), &"a string containing an integer")), 128 | } 129 | } 130 | } 131 | 132 | deserializer.deserialize_str(Visitor) 133 | } 134 | } 135 | 136 | #[derive(Debug)] 137 | pub enum AddFieldError { 138 | FieldAlreadyExists(String), 139 | } 140 | 141 | #[derive(Debug, Clone, Serialize, Deserialize)] 142 | pub struct Schema { 143 | next_field_id: u32, 144 | fields: FnvHashMap<FieldId, FieldInfo>, 145 | field_names: HashMap<String, FieldId>, 146 | } 147 | 148 | impl Schema { 149 | pub fn new() -> Schema { 150 | Schema { 151 | next_field_id: 1, 152 | fields: FnvHashMap::default(), 153 | field_names: HashMap::new(), 154 | 
} 155 | } 156 | 157 | fn new_field_id(&mut self) -> FieldId { 158 | let field_id = FieldId(self.next_field_id); 159 | self.next_field_id += 1; 160 | 161 | field_id 162 | } 163 | 164 | pub fn get_field_by_name(&self, name: &str) -> Option<FieldId> { 165 | self.field_names.get(name).cloned() 166 | } 167 | 168 | pub fn add_field(&mut self, name: String, field_type: FieldType, field_flags: FieldFlags) -> Result<FieldId, AddFieldError> { 169 | if self.field_names.contains_key(&name) { 170 | return Err(AddFieldError::FieldAlreadyExists(name)); 171 | } 172 | 173 | let field_id = self.new_field_id(); 174 | let field_info = FieldInfo::new(name.clone(), field_type, field_flags); 175 | 176 | self.fields.insert(field_id, field_info); 177 | self.field_names.insert(name, field_id); 178 | 179 | Ok(field_id) 180 | } 181 | 182 | pub fn remove_field(&mut self, field_id: &FieldId) -> bool { 183 | match self.fields.remove(field_id) { 184 | Some(removed_field) => { 185 | self.field_names.remove(&removed_field.name); 186 | true 187 | } 188 | None => false 189 | } 190 | } 191 | } 192 | 193 | impl Deref for Schema { 194 | type Target = FnvHashMap<FieldId, FieldInfo>; 195 | 196 | fn deref(&self) -> &FnvHashMap<FieldId, FieldInfo> { 197 | &self.fields 198 | } 199 | } 200 | -------------------------------------------------------------------------------- /src/search/segment.rs: -------------------------------------------------------------------------------- 1 | use roaring::RoaringBitmap; 2 | 3 | use search::schema::FieldId; 4 | use search::term::TermId; 5 | use search::document::DocId; 6 | 7 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] 8 | pub struct SegmentId(pub u32); 9 | 10 | pub trait Segment { 11 | fn load_statistic(&self, stat_name: &[u8]) -> Result<Option<i64>, String>; 12 | fn load_stored_field_value_raw(&self, doc_local_id: u16, field_id: FieldId, value_type: &[u8]) -> Result<Option<Vec<u8>>, String>; 13 | fn load_postings_list(&self, field_id: FieldId, term_id: TermId) -> Result<Option<RoaringBitmap>, String>; 14 | fn load_deletion_list(&self) -> Result<Option<RoaringBitmap>, String>; 15 | fn id(&self) -> SegmentId; 16 | 17 | fn doc_id(&self, local_id: u16) -> DocId { 18 | DocId(self.id(), local_id) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/search/similarity.rs: -------------------------------------------------------------------------------- 1 | #[derive(Debug, Clone, PartialEq)] 2 | pub enum SimilarityModel { 3 | TfIdf, 4 | Bm25{k1: f32, b: f32}, 5 | } 6 | 7 | /// tf(term_frequency) = log(term_frequency + 1.0) + 1.0 8 | #[inline] 9 | fn tf(term_frequency: u32) -> f32 { 10 | (term_frequency as f32 + 1.0f32).ln() + 1.0 11 | } 12 | 13 | /// idf(term_docs, total_docs) = log((total_docs + 1.0) / (term_docs + 1.0)) + 1.0 14 | #[inline] 15 | fn idf(term_docs: u64, total_docs: u64) -> f32 { 16 | ((total_docs as f32 + 1.0) / (term_docs as f32 + 1.0)).ln() + 1.0 17 | } 18 | 19 | impl SimilarityModel { 20 | pub fn score(&self, term_frequency: u32, length: f32, total_tokens: u64, total_docs: u64, total_docs_with_term: u64) -> f32 { 21 | match *self { 22 | SimilarityModel::TfIdf => { 23 | let tf = tf(term_frequency); 24 | let idf = idf(total_docs_with_term, total_docs); 25 | 26 | tf * idf 27 | } 28 | SimilarityModel::Bm25{k1, b} => { 29 | let tf = tf(term_frequency); 30 | let idf = idf(total_docs_with_term, total_docs); 31 | let average_length = (total_tokens as f32 + 1.0f32) / (total_docs as f32 + 1.0f32); 32 | 33 | idf * (k1 + 1.0) * (tf / (tf + (k1 * ((1.0 - b) + b * length.sqrt() / average_length.sqrt())) + 1.0f32)) 34 | } 35 | } 36 | } 37 | } 38 | 39 | #[cfg(test)] 40 | mod tests { 41 
| use super::SimilarityModel; 42 | 43 | #[test] 44 | fn test_tf_idf_higher_term_freq_increases_score() { 45 | let similarity = SimilarityModel::TfIdf; 46 | 47 | assert!(similarity.score(2, 40.0, 100, 10, 5) > similarity.score(1, 40.0, 100, 10, 5)); 48 | } 49 | 50 | #[test] 51 | fn test_tf_idf_lower_term_docs_increases_score() { 52 | let similarity = SimilarityModel::TfIdf; 53 | 54 | assert!(similarity.score(1, 40.0, 100, 10, 5) > similarity.score(1, 40.0, 100, 10, 10)); 55 | } 56 | 57 | #[test] 58 | fn test_tf_idf_field_length_doesnt_affect_score() { 59 | let similarity = SimilarityModel::TfIdf; 60 | 61 | assert!(similarity.score(1, 100.0, 100, 20, 5) == similarity.score(1, 40.0, 100, 20, 5)); 62 | } 63 | 64 | #[test] 65 | fn test_tf_idf_total_tokens_doesnt_affect_score() { 66 | let similarity = SimilarityModel::TfIdf; 67 | 68 | assert!(similarity.score(1, 40.0, 1000, 20, 5) == similarity.score(1, 40.0, 100, 20, 5)); 69 | } 70 | 71 | #[test] 72 | fn test_tf_idf_handles_zeros() { 73 | let similarity = SimilarityModel::TfIdf; 74 | 75 | assert!(similarity.score(0, 0.0, 0, 0, 0).is_finite()); 76 | } 77 | 78 | #[test] 79 | fn test_bm25_higher_term_freq_increases_score() { 80 | let similarity = SimilarityModel::Bm25 { 81 | k1: 1.2, 82 | b: 0.75, 83 | }; 84 | 85 | assert!(similarity.score(2, 40.0, 100, 10, 5) > similarity.score(1, 40.0, 100, 10, 5)); 86 | } 87 | 88 | #[test] 89 | fn test_bm25_lower_term_docs_increases_score() { 90 | let similarity = SimilarityModel::Bm25 { 91 | k1: 1.2, 92 | b: 0.75, 93 | }; 94 | 95 | assert!(similarity.score(1, 40.0, 100, 10, 5) > similarity.score(1, 40.0, 100, 10, 10)); 96 | } 97 | 98 | #[test] 99 | fn test_bm25_lower_field_length_increases_score() { 100 | let similarity = SimilarityModel::Bm25 { 101 | k1: 1.2, 102 | b: 0.75, 103 | }; 104 | 105 | assert!(similarity.score(1, 40.0, 100, 20, 5) > similarity.score(1, 100.0, 100, 20, 5)); 106 | } 107 | 108 | #[test] 109 | fn test_bm25_higher_total_tokens_increases_score() { 110 | let similarity = SimilarityModel::Bm25 { 111 | k1: 1.2, 112 | b: 0.75, 113 | }; 114 | 115 | assert!(similarity.score(1, 40.0, 1000, 20, 5) > similarity.score(1, 40.0, 100, 20, 5)); 116 | } 117 | 118 | #[test] 119 | fn test_bm25_handles_zeros() { 120 | let similarity = SimilarityModel::Bm25 { 121 | k1: 0.0, 122 | b: 0.0, 123 | }; 124 | 125 | assert!(similarity.score(0, 0.0, 0, 0, 0).is_finite()); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/search/term.rs: -------------------------------------------------------------------------------- 1 | use chrono::{DateTime, Utc, Timelike}; 2 | use byteorder::{WriteBytesExt, LittleEndian}; 3 | 4 | 5 | #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] 6 | pub struct TermId(pub u32); 7 | 8 | 9 | #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Hash)] 10 | pub struct Term(Vec<u8>); 11 | 12 | impl Term { 13 | pub fn from_bytes(bytes: &[u8]) -> Term { 14 | Term(bytes.to_vec()) 15 | } 16 | 17 | pub fn from_string(string: &str) -> Term { 18 | let mut bytes = Vec::with_capacity(string.len()); 19 | 20 | for byte in string.as_bytes() { 21 | bytes.push(*byte); 22 | } 23 | 24 | Term(bytes) 25 | } 26 | 27 | pub fn from_boolean(value: bool) -> Term { 28 | if value { 29 | Term(vec![b't']) 30 | } else { 31 | Term(vec![b'f']) 32 | } 33 | } 34 | 35 | pub fn from_integer(value: i64) -> Term { 36 | let mut bytes = Vec::with_capacity(8); 37 | bytes.write_i64::<LittleEndian>(value).unwrap(); 38 | Term(bytes) 39 | } 40 | 41 | pub fn from_datetime(value: &DateTime<Utc>) -> Term { 
42 | let mut bytes = Vec::with_capacity(0); 43 | let timestamp = value.timestamp(); 44 | let micros = value.nanosecond() / 1000; 45 | let timestamp_with_micros = timestamp * 1000000 + micros as i64; 46 | bytes.write_i64::<LittleEndian>(timestamp_with_micros).unwrap(); 47 | Term(bytes) 48 | } 49 | 50 | pub fn as_bytes(&self) -> &[u8] { 51 | &self.0 52 | } 53 | } 54 | 55 | #[cfg(test)] 56 | mod tests { 57 | use chrono::{DateTime, Utc, Timelike}; 58 | use super::Term; 59 | 60 | #[test] 61 | fn test_string_to_bytes() { 62 | let term = Term::from_string("foo"); 63 | 64 | assert_eq!(term.as_bytes().to_vec(), vec![102, 111, 111]) 65 | } 66 | 67 | #[test] 68 | fn test_hiragana_string_to_bytes() { 69 | let term = Term::from_string("こんにちは"); 70 | 71 | assert_eq!(term.as_bytes().to_vec(), vec![227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175]) 72 | } 73 | 74 | #[test] 75 | fn test_blank_string_to_bytes() { 76 | let term = Term::from_string(""); 77 | 78 | assert_eq!(term.as_bytes().to_vec(), vec![] as Vec<u8>) 79 | } 80 | 81 | #[test] 82 | fn test_boolean_true_to_bytes() { 83 | let term = Term::from_boolean(true); 84 | 85 | // 116 = 't' in ASCII 86 | assert_eq!(term.as_bytes().to_vec(), vec![116]) 87 | } 88 | 89 | #[test] 90 | fn test_boolean_false_to_bytes() { 91 | let term = Term::from_boolean(false); 92 | 93 | // 102 = 'f' in ASCII 94 | assert_eq!(term.as_bytes().to_vec(), vec![102]) 95 | } 96 | 97 | #[test] 98 | fn test_integer_to_bytes() { 99 | let term = Term::from_integer(123); 100 | 101 | assert_eq!(term.as_bytes().to_vec(), vec![123, 0, 0, 0, 0, 0, 0, 0]) 102 | } 103 | 104 | #[test] 105 | fn test_negative_integer_to_bytes() { 106 | let term = Term::from_integer(-123); 107 | 108 | assert_eq!(term.as_bytes().to_vec(), vec![133, 255, 255, 255, 255, 255, 255, 255]) 109 | } 110 | 111 | #[test] 112 | fn test_datetime_to_bytes() { 113 | let date = "2016-07-23T16:15:00+01:00".parse::<DateTime<Utc>>().unwrap(); 114 | let term = Term::from_datetime(&date); 115 | 116 | assert_eq!(term.as_bytes().to_vec(), vec![0, 101, 191, 3, 79, 56, 5, 0]) 117 | } 118 | 119 | #[test] 120 | fn test_datetime_with_microseconds_to_bytes() { 121 | let mut date = "2016-07-23T16:15:00+01:00".parse::<DateTime<Utc>>().unwrap(); 122 | date = date.with_nanosecond(123123123).unwrap(); 123 | let term = Term::from_datetime(&date); 124 | 125 | // This is exactly 123123 higher than the result of "test_datetime_to_bytes" 126 | assert_eq!(term.as_bytes().to_vec(), vec![243, 69, 193, 3, 79, 56, 5, 0]) 127 | } 128 | 129 | #[test] 130 | fn test_datetime_with_different_timezone_to_bytes() { 131 | let date = "2016-07-23T16:15:00+02:00".parse::<DateTime<Utc>>().unwrap(); 132 | let term = Term::from_datetime(&date); 133 | 134 | // This is exactly 3_600_000_000 lower than the result of "test_datetime_to_bytes" 135 | assert_eq!(term.as_bytes().to_vec(), vec![0, 193, 43, 45, 78, 56, 5, 0]) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/search/term_vector.rs: -------------------------------------------------------------------------------- 1 | use std::ops::{Deref, DerefMut}; 2 | use std::collections::HashMap; 3 | 4 | use roaring::RoaringBitmap; 5 | 6 | use search::term::Term; 7 | use search::token::Token; 8 | 9 | #[derive(Debug, Clone, PartialEq)] 10 | pub struct TermVector(HashMap<Term, RoaringBitmap>); 11 | 12 | impl TermVector { 13 | pub fn new() -> TermVector { 14 | TermVector(HashMap::new()) 15 | } 16 | } 17 | 18 | impl Deref for TermVector { 19 | type Target = HashMap<Term, RoaringBitmap>; 20 | 21 | fn deref(&self) -> &HashMap<Term, RoaringBitmap> { 22 | &self.0 23 | } 24 | } 
25 | 26 | impl DerefMut for TermVector { 27 | fn deref_mut(&mut self) -> &mut HashMap<Term, RoaringBitmap> { 28 | &mut self.0 29 | } 30 | } 31 | 32 | impl Into<TermVector> for Vec<Token> { 33 | fn into(self) -> TermVector { 34 | let mut map = HashMap::new(); 35 | 36 | for token in self { 37 | let positions = map.entry(token.term).or_insert_with(RoaringBitmap::new); 38 | positions.insert(token.position); 39 | } 40 | 41 | TermVector(map) 42 | } 43 | } 44 | 45 | impl Into<Vec<Token>> for TermVector { 46 | fn into(self) -> Vec<Token> { 47 | let mut vec = Vec::new(); 48 | 49 | for (term, positions) in self.0 { 50 | for position in positions { 51 | vec.push(Token { term: term.clone(), position: position }); 52 | } 53 | } 54 | 55 | vec.sort_by_key(|token| token.position); 56 | 57 | vec 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/search/token.rs: -------------------------------------------------------------------------------- 1 | use search::term::Term; 2 | 3 | #[derive(Debug, Clone, PartialEq)] 4 | pub struct Token { 5 | pub term: Term, 6 | pub position: u32, 7 | } 8 | -------------------------------------------------------------------------------- /src/system.rs: -------------------------------------------------------------------------------- 1 | use std::sync::RwLock; 2 | use std::path::{Path, PathBuf}; 3 | use std::fs; 4 | 5 | use slog::Logger; 6 | use search::backends::rocksdb::RocksDBStore; 7 | use uuid::Uuid; 8 | 9 | use index::Index; 10 | use index::metadata::IndexMetadata; 11 | use cluster::metadata::ClusterMetadata; 12 | 13 | 14 | pub struct System { 15 | pub log: Logger, 16 | data_dir: PathBuf, 17 | pub metadata: RwLock<ClusterMetadata>, 18 | } 19 | 20 | 21 | impl System { 22 | pub fn new(log: Logger, data_dir: PathBuf) -> System { 23 | System { 24 | log: log, 25 | data_dir: data_dir, 26 | metadata: RwLock::new(ClusterMetadata::new()), 27 | } 28 | } 29 | 30 | pub fn get_indices_dir(&self) -> PathBuf { 31 | let mut dir = self.data_dir.clone(); 32 | dir.push("indices"); 33 | dir 34 | } 35 | 36 | fn load_index(&self, id: Uuid, name: String, path: &Path) -> Result<Index, String> { 37 | let store = RocksDBStore::open(path)?; 38 | 39 | // Load metadata 40 | let mut metadata_path = path.to_path_buf(); 41 | metadata_path.push("metadata.json"); 42 | let metadata = IndexMetadata::load(metadata_path)?; 43 | 44 | Ok(Index::new(id, name, metadata, store)) 45 | } 46 | 47 | pub fn load_indices(&self) { 48 | let indices_dir = self.get_indices_dir(); 49 | match fs::read_dir(indices_dir.clone()) { 50 | Ok(files) => { 51 | for file in files { 52 | let path = file.unwrap().path(); 53 | if path.is_dir() { 54 | let index_name: String = path.file_name().unwrap().to_str().unwrap().to_owned(); 55 | 56 | match self.load_index(Uuid::new_v4(), index_name.clone().to_owned(), path.as_path()) { 57 | Ok(index) => { 58 | let mut cluster_metadata = self.metadata.write().unwrap(); 59 | let index_ref = cluster_metadata.insert_index(index); 60 | cluster_metadata.names.insert_canonical(index_name.clone(), index_ref).unwrap(); 61 | 62 | info!(self.log, "loaded index"; "index" => index_name); 63 | } 64 | Err(e) => { 65 | error!(self.log, "load index failed"; "index" => index_name, "error" => e); 66 | } 67 | } 68 | } 69 | } 70 | } 71 | Err(error) => { 72 | error!(self.log, "could not open indices directory"; "dir" => indices_dir.to_str().unwrap(), "error" => format!("{}", error)); 73 | } 74 | } 75 | } 76 | } 77 | --------------------------------------------------------------------------------
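
A closing note on how the pieces above compose: `Query` (src/search/query/mod.rs) is a plain enum with a small builder-style API, `Schema` (src/search/schema.rs) maps field names to `FieldId`s, and `Term` (src/search/term.rs) holds the encoded bytes that are actually matched. The sketch below is illustrative only and is not part of the crate: `build_example_query` is a hypothetical helper, the `FieldId` is assumed to come from an already-populated schema, and the term values ("rust", "search", "draft") are made up.

use search::query::Query;
use search::schema::FieldId;
use search::term::Term;

// Builds a term query for the given field, doubles its boost, keeps only
// documents that also match a second term (without letting that term affect
// the score), and excludes documents that match a third term.
fn build_example_query(field: FieldId) -> Query {
    Query::term(field, Term::from_string("rust"))
        .boost(2.0)
        .filter(Query::term(field, Term::from_string("search")))
        .exclude(Query::term(field, Term::from_string("draft")))
}

Because `boost` only multiplies the `TermScorer` boost (or the `All` score) and `filter`/`exclude` wrap the receiver in the `Query::Filter`/`Query::Exclude` variants, the chain above produces a single nested `Query` value.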