├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── dist └── training-data.7z ├── pom.xml └── src ├── main ├── kotlin │ └── be │ │ └── rlab │ │ ├── nlp │ │ ├── Distance.kt │ │ ├── MultiLanguageStemmer.kt │ │ ├── Normalizer.kt │ │ ├── SentimentAnalyzer.kt │ │ ├── StopWordTokenizer.kt │ │ ├── TextClassifier.kt │ │ ├── Tokenizer.kt │ │ ├── Tokenizers.kt │ │ ├── WordTokenizer.kt │ │ └── model │ │ │ ├── ClassificationResult.kt │ │ │ ├── Language.kt │ │ │ ├── Sentiment.kt │ │ │ ├── SentimentResult.kt │ │ │ ├── Token.kt │ │ │ └── TrainingDataSet.kt │ │ ├── search │ │ ├── AnalyzerFactory.kt │ │ ├── DocumentBuilder.kt │ │ ├── Hashes.kt │ │ ├── IndexManager.kt │ │ ├── IndexMapper.kt │ │ ├── LuceneFieldUtils.kt │ │ ├── LuceneIndex.kt │ │ ├── annotation │ │ │ ├── IndexDocument.kt │ │ │ ├── IndexField.kt │ │ │ ├── IndexFieldType.kt │ │ │ ├── Indexed.kt │ │ │ └── Stored.kt │ │ ├── mapper │ │ │ ├── FieldTypeMapper.kt │ │ │ ├── ListTypeMapper.kt │ │ │ └── SimpleTypeMapper.kt │ │ ├── model │ │ │ ├── BoolValue.kt │ │ │ ├── Cursor.kt │ │ │ ├── Document.kt │ │ │ ├── DocumentSchema.kt │ │ │ ├── Field.kt │ │ │ ├── FieldMetadata.kt │ │ │ ├── FieldSchema.kt │ │ │ ├── FieldType.kt │ │ │ ├── IndexConfig.kt │ │ │ ├── QueryBuilder.kt │ │ │ ├── SearchResult.kt │ │ │ └── TypedSearchResult.kt │ │ ├── query │ │ │ ├── DoubleRange.kt │ │ │ ├── DoubleTerm.kt │ │ │ ├── FloatRange.kt │ │ │ ├── FloatTerm.kt │ │ │ ├── Fuzzy.kt │ │ │ ├── IntRange.kt │ │ │ ├── IntTerm.kt │ │ │ ├── LongRange.kt │ │ │ ├── LongTerm.kt │ │ │ ├── Parser.kt │ │ │ ├── Phrase.kt │ │ │ ├── Regex.kt │ │ │ ├── SortBy.kt │ │ │ ├── StringRange.kt │ │ │ ├── StringTerm.kt │ │ │ └── Wildcard.kt │ │ └── schema │ │ │ ├── DocumentSchemaBuilder.kt │ │ │ └── FieldSchemaBuilder.kt │ │ ├── support │ │ ├── ResourceLoader.kt │ │ └── csv │ │ │ ├── Field.kt │ │ │ ├── Parser.kt │ │ │ ├── ParserConfig.kt │ │ │ └── Position.kt │ │ └── training │ │ ├── DataSet.kt │ │ ├── DataSetLoader.kt │ │ └── SentimentLoader.kt └── resources │ ├── 
logback.xml │ └── nlp │ └── stopwords │ ├── arabic.txt │ ├── armenian.txt │ ├── basque.txt │ ├── bengali.txt │ ├── brazilian.txt │ ├── bulgarian.txt │ ├── catalan.txt │ ├── chinese.txt │ ├── czech.txt │ ├── danish.txt │ ├── dutch.txt │ ├── english.txt │ ├── estonian.txt │ ├── finnish.txt │ ├── french.txt │ ├── galician.txt │ ├── german.txt │ ├── greek.txt │ ├── hindi.txt │ ├── hungarian.txt │ ├── indonesian.txt │ ├── irish.txt │ ├── italian.txt │ ├── latvian.txt │ ├── lithuanian.txt │ ├── norwegian.txt │ ├── persian.txt │ ├── polish.txt │ ├── portuguese.txt │ ├── romanian.txt │ ├── russian.txt │ ├── sorani.txt │ ├── spanish.txt │ ├── swedish.txt │ ├── thai.txt │ └── turkish.txt └── test └── kotlin └── be └── rlab ├── nlp ├── NormalizerTest.kt └── UpdateStopWordsTest.kt ├── search ├── Book.kt ├── IndexManagerTest.kt └── mock │ └── TestBook.kt └── support └── SearchTestUtils.kt /.gitattributes: -------------------------------------------------------------------------------- 1 | *.7z filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.nar 17 | *.ear 18 | *.zip 19 | *.tar.gz 20 | *.rar 21 | 22 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 23 | hs_err_pid* 24 | *.iml 25 | .idea 26 | target 27 | /data 28 | -------------------------------------------------------------------------------- /dist/training-data.7z: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:01c94e61814f977cbecfcd84956042c93994740bc64332ee1e1a37a5505eb969 3 | size 42814965 4 | 
/** Text comparison helpers backed by the Lucene spell-checker distance classes.
 *
 * Each function builds a fresh Lucene distance object and delegates to its `getDistance` method.
 */
object Distance {

    /** Compares two texts with Lucene's [JaroWinklerDistance].
     *
     * This algorithm works very well on terms that share the same prefixes.
     *
     * @param text A text to measure.
     * @param otherText Other text to measure.
     * @return the value reported by Lucene, a float between 0 and 1.
     */
    fun jaroWinkler(
        text: String,
        otherText: String
    ): Float = JaroWinklerDistance().getDistance(text, otherText)

    /** Compares two texts with Lucene's [LevenshteinDistance].
     *
     * This algorithm works fine to detect misspellings.
     *
     * @param text A text to measure.
     * @param otherText Other text to measure.
     * @return the value reported by Lucene, a float between 0 and 1.
     */
    fun levenshtein(
        text: String,
        otherText: String
    ): Float = LevenshteinDistance().getDistance(text, otherText)

    /** Compares two texts with Lucene's [NGramDistance].
     *
     * It works well for predicting the next token in a text.
     *
     * @param text A text to measure.
     * @param otherText Other text to measure.
     * @param size Size of the N-Grams groups.
     * @return the value reported by Lucene, a float between 0 and 1.
     */
    fun ngram(
        text: String,
        otherText: String,
        size: Int = 2
    ): Float = NGramDistance(size).getDistance(text, otherText)
}
/** String normalizer.
 *
 * By default, it removes diacritics, removes punctuation, applies the stemmer for the specified language,
 * converts all terms to lowercase, and joins the terms with a single space.
 *
 * The instance is a mutable builder: every configuration method updates this normalizer and returns it.
 *
 * @param text Text to normalize.
 * @param language Text language. Required when stop words removal or stemming is enabled.
 * @param caseSensitive When false, the text is converted to lowercase.
 * @param form Unicode normalization form applied before any other step.
 * @param removeDiacritics Indicates whether to remove diacritics.
 * @param removePunctuation Indicates whether the word tokenizer strips punctuation.
 * @param removeStopWords Indicates whether to strip out stop words.
 * @param stemming Indicates whether to apply the stemmer to each term.
 * @param joinWith String to join the terms.
 */
class Normalizer(
    private val text: String,
    private var language: Language? = null,
    private var caseSensitive: Boolean = false,
    private var form: JavaNormalizer.Form = JavaNormalizer.Form.NFD,
    private var removeDiacritics: Boolean = true,
    private var removePunctuation: Boolean = true,
    private var removeStopWords: Boolean = false,
    private var stemming: Boolean = true,
    private var joinWith: String = " "
) {
    companion object {
        /** Matches runs of combining diacritical marks. */
        private val REGEX_UNACCENT: Regex = Regex("\\p{InCombiningDiacriticalMarks}+")

        /** Creates a new normalizer for the specified text.
         * @param text Text to normalize.
         * @param language Text language.
         * @return a new normalizer.
         */
        fun new(text: String, language: Language? = null): Normalizer = Normalizer(
            text = text,
            language = language
        )
    }

    /** Sets the language used for stop words and stemming. */
    fun forLanguage(newLanguage: Language): Normalizer = apply {
        language = newLanguage
    }

    /** Keeps the original case when [isCaseSensitive] is true. */
    fun caseSensitive(isCaseSensitive: Boolean = true): Normalizer = apply {
        caseSensitive = isCaseSensitive
    }

    /** Converts the text to lowercase when [isCaseSensitive] is false. */
    fun caseInsensitive(isCaseSensitive: Boolean = false): Normalizer = apply {
        caseSensitive = isCaseSensitive
    }

    /** Sets the Unicode normalization form. */
    fun form(newForm: JavaNormalizer.Form): Normalizer = apply {
        form = newForm
    }

    /** Enables or disables diacritics removal. */
    fun removeDiacritics(remove: Boolean = true): Normalizer = apply {
        removeDiacritics = remove
    }

    /** Inverse of [removeDiacritics]. */
    fun keepDiacritics(keep: Boolean = true): Normalizer = apply {
        removeDiacritics = !keep
    }

    /** Enables or disables stop words removal. */
    fun removeStopWords(remove: Boolean = true): Normalizer = apply {
        removeStopWords = remove
    }

    /** Inverse of [removeStopWords]. */
    fun keepStopWords(keep: Boolean = true): Normalizer = apply {
        removeStopWords = !keep
    }

    /** Enables or disables punctuation removal. */
    fun removePunctuation(remove: Boolean = true): Normalizer = apply {
        removePunctuation = remove
    }

    /** Inverse of [removePunctuation]. */
    fun keepPunctuation(keep: Boolean = true): Normalizer = apply {
        removePunctuation = !keep
    }

    /** Enables or disables stemming. */
    fun applyStemming(apply: Boolean = true): Normalizer = apply {
        stemming = apply
    }

    /** Disables stemming. */
    fun skipStemming(): Normalizer = apply {
        stemming = false
    }

    /** Sets the separator placed between the resulting terms. */
    fun joinWith(joinText: String): Normalizer = apply {
        joinWith = joinText
    }

    /** Applies normalizations and returns the normalized text.
     *
     * Steps, in order: Unicode normalization, optional diacritics removal, optional lowercasing,
     * word tokenization, optional stop words removal, optional stemming, and a final join with [joinWith].
     *
     * @return a valid text.
     * @throws IllegalArgumentException if stop words removal or stemming is enabled and no language is set.
     */
    fun normalize(): String {
        var normalizedText: String = JavaNormalizer.normalize(text, form)

        // A single pass is enough; the regex only matches combining marks, so it has an effect
        // only when the selected form leaves the marks decomposed.
        if (removeDiacritics) {
            normalizedText = normalizedText.replace(REGEX_UNACCENT, "")
        }
        if (!caseSensitive) {
            normalizedText = normalizedText.lowercase()
        }

        // Terms are kept as a list until the very end so a custom joinWith (for instance "" or "_")
        // can never glue terms together before stop words removal and stemming see them.
        var terms: List<String> = WordTokenizer(removePunctuation)
            .tokenize(normalizedText.reader())
            .map { token -> token.toString() }

        if (removeStopWords) {
            val stopWordTokenizer = StopWordTokenizer.new(requireNotNull(language) {
                "language is required for the stop words tokenizer"
            })
            terms = stopWordTokenizer
                .tokenize(terms.joinToString(" ").reader())
                .map { token -> token.toString() }
        }

        if (stemming) {
            val stemmer = MultiLanguageStemmer.new(requireNotNull(language) { "language is required for the stemmer" })
            terms = terms.map { term -> stemmer.stem(term) }
        }

        return terms.joinToString(joinWith)
    }
}
/** Analyzer that uses a pre-trained index to detect sentiments.
 *
 * The training set must be initialized by the [be.rlab.training.SentimentLoader].
 */
class SentimentAnalyzer(
    private val indexManager: IndexManager
) {
    companion object {
        /** Index namespace queried by [analyze]. */
        const val NAMESPACE: String = "sentiments"
        /** Field matched against every normalized word. */
        const val VALUE_FIELD: String = "value"
        const val SENTIMENT_FIELD: String = "sentiment"
        // The two values below used to be swapped ("positive" for NEG, "negative" for POS).
        // NOTE(review): an index written through these constants while they were swapped must be
        // rebuilt — confirm against SentimentLoader.
        /** Field read as the negative weight of a word. */
        const val SENTIMENT_NEG_FIELD: String = "negative"
        /** Field read as the positive weight of a word. */
        const val SENTIMENT_POS_FIELD: String = "positive"
    }

    /** Analyzes a text to determine the average sentiment of the text.
     *
     * The text is normalized (stop words removed, no stemming) and every resulting word is looked up
     * in the index. The score of a word is the sum of `positive - negative` over its matching documents.
     *
     * @param text Text to analyze.
     * @param language Text language.
     * @return returns the sentiment analysis result; score 0 and [Sentiment.UNKNOWN] when no word remains
     *  after normalization.
     */
    fun analyze(
        text: String,
        language: Language
    ): SentimentResult {
        val words: Map<String, Int> = Normalizer.new(
            text = text,
            language = language,
        ).removeStopWords().skipStemming().normalize()
            .split(" ")
            // An empty normalized text splits into a single blank entry; do not query the index for it.
            .filter { word -> word.isNotBlank() }
            .associateWith { word ->
                indexManager.find(NAMESPACE, language) {
                    term(VALUE_FIELD, word)
                }.fold(0) { score, doc ->
                    val positive: Int = doc[SENTIMENT_POS_FIELD]!!
                    val negative: Int = doc[SENTIMENT_NEG_FIELD]!!

                    score + positive - negative
                }
            }

        if (words.isEmpty()) {
            return SentimentResult(score = 0.0F, sentiment = Sentiment.UNKNOWN)
        }

        val negativeValues: List<Int> = words.values.filter { value -> value < 0 }
        val positiveValues: List<Int> = words.values.filter { value -> value > 0 }
        val negativeScore: Int = negativeValues.sumOf { value -> abs(value) }
        val positiveScore: Int = positiveValues.sum()

        val size: Float = words.size.toFloat()
        val count: Float = max(negativeValues.size, positiveValues.size).toFloat()
        // Capped at the number of words so the final ratio stays within 0..1.
        val score: Float = min(size, count + max(negativeScore, positiveScore).toFloat())

        return SentimentResult(
            score = score / size,
            sentiment = when {
                positiveScore > negativeScore -> Sentiment.POSITIVE
                positiveScore < negativeScore -> Sentiment.NEGATIVE
                else -> Sentiment.UNKNOWN
            }
        )
    }
}
/** This class allows to train and query text classifiers.
 *
 * This implementation is not designed for performance. The classification retrieves all features from the index
 * for a specific namespace and it evaluates distances on runtime. The features are normalized before storing them.
 *
 * It uses the [IndexManager] to store the training data set.
 */
class TextClassifier(
    private val indexManager: IndexManager,
    private val namespace: String
) {

    companion object {
        private const val CATEGORY_FIELD: String = "category"
        private const val TEXT_FIELD: String = "text"
        /** Maximum number of indexed features retrieved by [classifyAll]. */
        private const val MAX_FEATURES: Int = 10000
    }

    /** Analyzes and sets the category for a text.
     * It stores the normalized text and the category into the index.
     *
     * @param category Text category.
     * @param text Text to assign the specified category.
     * @param language Text language.
     */
    fun train(
        category: String,
        text: String,
        language: Language
    ) {
        val normalizedText: String = Normalizer.new(text, language = language)
            .applyStemming()
            .removeStopWords()
            .normalize()

        indexManager.index(namespace, language) {
            text(CATEGORY_FIELD, category)
            text(TEXT_FIELD, normalizedText)
        }
    }

    /** Trains the classifier from data sets.
     * Every value of a data set is stored once for each of the data set categories.
     *
     * @param dataSets Data sets used to train this classifier.
     */
    fun train(dataSets: List<TrainingDataSet>) {
        dataSets.forEach { dataSet ->
            dataSet.categories.forEach { category ->
                dataSet.values.forEach { value ->
                    train(category, value, dataSet.language)
                }
            }
        }
    }

    /** Resolves the top category for a text.
     * @param text Text to resolve category.
     * @param language Text language.
     * @return the category with the highest score, or null if there's no matching category.
     */
    fun classify(
        text: String,
        language: Language
    ): String? {
        return classifyAll(text, language).firstOrNull()?.assignedClass
    }

    /** Resolves all categories for a text.
     *
     * The score of a category is the highest Jaro-Winkler value between the normalized text and
     * any of the features stored for that category.
     *
     * @param text Text to search categories for.
     * @param language Text language.
     * @return the list of matching categories, ordered from the highest to the lowest score.
     */
    fun classifyAll(
        text: String,
        language: Language
    ): List<ClassificationResult> {

        val features: SearchResult = indexManager.search(namespace, language, limit = MAX_FEATURES) {
            wildcard(CATEGORY_FIELD, "*")
        }

        val normalizedText: String = Normalizer.new(text, language)
            .applyStemming()
            .removeStopWords()
            .normalize()

        return features.docs.groupBy { document ->
            val value: String = document[CATEGORY_FIELD]!!
            value
        }.map { (category, documents) ->
            val distance: Float = documents.maxOfOrNull { document ->
                jaroWinkler(document[TEXT_FIELD]!!, normalizedText)
            } ?: 0.0F

            ClassificationResult(
                assignedClass = category,
                score = distance.toDouble()
            )
        }.sortedByDescending { result ->
            // Best match first: classify() relies on the first element being the top category.
            result.score
        }
    }
}
/** Helpers to consume Lucene token streams.
 */
object Tokenizers {

    /** Drains a Lucene token stream into a list of [Token]s.
     *
     * The stream is reset, read until exhausted, ended and finally closed.
     *
     * @param tokenStream Lucene tokenizer stream.
     * @return the tokens generated by the tokenizer.
     */
    fun tokenize(tokenStream: TokenStream): List<Token> = tokenStream.use { stream ->
        val term: CharTermAttribute = stream.addAttribute(CharTermAttribute::class.java)
        val offset: OffsetAttribute = stream.addAttribute(OffsetAttribute::class.java)

        stream.reset()

        val tokens: List<Token> = buildList {
            while (stream.incrementToken()) {
                add(
                    Token(
                        start = offset.startOffset(),
                        end = offset.endOffset(),
                        data = term.toString().toCharArray()
                    )
                )
            }
        }

        stream.end()

        tokens
    }
}
/** Outcome of classifying a text into a single category.
 *
 * [be.rlab.nlp.TextClassifier] produces one instance per matching category, together with the
 * score computed for that category.
 */
data class ClassificationResult(
    /** Text category. */
    val assignedClass: String,
    /** Score within the result set. */
    val score: Double,
)
/** A single token produced by a Lucene tokenizer.
 *
 * Equality and hash code compare the content of [data], not the array reference.
 *
 * @param start Start position in the document.
 * @param end End position in the document.
 * @param data Token data.
 */
data class Token(
    val start: Int,
    val end: Int,
    val data: CharArray
) {

    /** Tells whether the token holds no characters. */
    fun isEmpty(): Boolean = data.isEmpty()

    /** Returns the token characters as a string. */
    override fun toString(): String = String(data)

    override fun equals(other: Any?): Boolean {
        if (this === other) {
            return true
        }
        if (other !is Token || javaClass != other.javaClass) {
            return false
        }
        return start == other.start &&
            end == other.end &&
            data.contentEquals(other.data)
    }

    override fun hashCode(): Int = 31 * (31 * start + end) + data.contentHashCode()
}
-------------------------------------------------------------------------------- 1 | package be.rlab.search 2 | 3 | import be.rlab.nlp.StopWordTokenizer 4 | import be.rlab.nlp.model.Language 5 | import org.apache.lucene.analysis.Analyzer 6 | import org.apache.lucene.analysis.CharArraySet 7 | import org.apache.lucene.analysis.bg.BulgarianAnalyzer 8 | import org.apache.lucene.analysis.bn.BengaliAnalyzer 9 | import org.apache.lucene.analysis.br.BrazilianAnalyzer 10 | import org.apache.lucene.analysis.ca.CatalanAnalyzer 11 | import org.apache.lucene.analysis.ckb.SoraniAnalyzer 12 | import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer 13 | import org.apache.lucene.analysis.cz.CzechAnalyzer 14 | import org.apache.lucene.analysis.da.DanishAnalyzer 15 | import org.apache.lucene.analysis.de.GermanAnalyzer 16 | import org.apache.lucene.analysis.el.GreekAnalyzer 17 | import org.apache.lucene.analysis.en.EnglishAnalyzer 18 | import org.apache.lucene.analysis.es.SpanishAnalyzer 19 | import org.apache.lucene.analysis.et.EstonianAnalyzer 20 | import org.apache.lucene.analysis.eu.BasqueAnalyzer 21 | import org.apache.lucene.analysis.fa.PersianAnalyzer 22 | import org.apache.lucene.analysis.fi.FinnishAnalyzer 23 | import org.apache.lucene.analysis.fr.FrenchAnalyzer 24 | import org.apache.lucene.analysis.ga.IrishAnalyzer 25 | import org.apache.lucene.analysis.gl.GalicianAnalyzer 26 | import org.apache.lucene.analysis.hi.HindiAnalyzer 27 | import org.apache.lucene.analysis.hu.HungarianAnalyzer 28 | import org.apache.lucene.analysis.hy.ArmenianAnalyzer 29 | import org.apache.lucene.analysis.id.IndonesianAnalyzer 30 | import org.apache.lucene.analysis.it.ItalianAnalyzer 31 | import org.apache.lucene.analysis.lt.LithuanianAnalyzer 32 | import org.apache.lucene.analysis.lv.LatvianAnalyzer 33 | import org.apache.lucene.analysis.ne.NepaliAnalyzer 34 | import org.apache.lucene.analysis.nl.DutchAnalyzer 35 | import org.apache.lucene.analysis.no.NorwegianAnalyzer 36 | import 
/** Creates Lucene analyzers for the supported languages.
 */
object AnalyzerFactory {
    /** Creates the Lucene analyzer for a language.
     *
     * When the embedded stop words list for the language is not empty, the analyzer is created through
     * its single-argument [CharArraySet] constructor; otherwise it is created with its no-argument constructor.
     *
     * @param language Language of the text to analyze.
     * @return a new analyzer.
     * @throws RuntimeException if stop words are available but the analyzer has no stop words constructor.
     */
    fun newAnalyzer(language: Language): Analyzer {
        // The when is exhaustive over Language on purpose: a new language must be mapped explicitly.
        val klass = when (language) {
            // Fully qualified because the file has no import for it; it used to map to EnglishAnalyzer.
            Language.ARABIC -> org.apache.lucene.analysis.ar.ArabicAnalyzer::class
            Language.ARMENIAN -> ArmenianAnalyzer::class
            Language.BASQUE -> BasqueAnalyzer::class
            Language.BENGALI -> BengaliAnalyzer::class
            Language.BRAZILIAN -> BrazilianAnalyzer::class
            Language.BULGARIAN -> BulgarianAnalyzer::class
            Language.CATALAN -> CatalanAnalyzer::class
            Language.CHINESE -> SmartChineseAnalyzer::class
            Language.CZECH -> CzechAnalyzer::class
            Language.DANISH -> DanishAnalyzer::class
            Language.DUTCH -> DutchAnalyzer::class
            Language.ENGLISH -> EnglishAnalyzer::class
            Language.ESTONIAN -> EstonianAnalyzer::class
            Language.FINNISH -> FinnishAnalyzer::class
            Language.FRENCH -> FrenchAnalyzer::class
            Language.GALICIAN -> GalicianAnalyzer::class
            Language.GERMAN -> GermanAnalyzer::class
            Language.GREEK -> GreekAnalyzer::class
            Language.HINDI -> HindiAnalyzer::class
            Language.HUNGARIAN -> HungarianAnalyzer::class
            Language.INDONESIAN -> IndonesianAnalyzer::class
            Language.IRISH -> IrishAnalyzer::class
            Language.ITALIAN -> ItalianAnalyzer::class
            Language.LATVIAN -> LatvianAnalyzer::class
            Language.LITHUANIAN -> LithuanianAnalyzer::class
            Language.NEPALI -> NepaliAnalyzer::class
            Language.NORWEGIAN -> NorwegianAnalyzer::class
            Language.PERSIAN -> PersianAnalyzer::class
            Language.POLISH -> PolishAnalyzer::class
            Language.PORTUGUESE -> PortugueseAnalyzer::class
            Language.ROMANIAN -> RomanianAnalyzer::class
            Language.RUSSIAN -> RussianAnalyzer::class
            Language.SERBIAN -> SerbianAnalyzer::class
            Language.SORANI -> SoraniAnalyzer::class
            Language.SPANISH -> SpanishAnalyzer::class
            Language.SWEDISH -> SwedishAnalyzer::class
            Language.THAI -> ThaiAnalyzer::class
            Language.TURKISH -> TurkishAnalyzer::class
        }

        val stopWords = StopWordTokenizer.stopWords(language)

        return if (stopWords.isEmpty()) {
            klass.createInstance()
        } else {
            val ctor = klass.constructors.find { ctor ->
                ctor.parameters.size == 1 && ctor.parameters.first().type.classifier == CharArraySet::class
            } ?: throw RuntimeException("constructor with stop words not found")
            ctor.call(CharArraySet(stopWords, false))
        }
    }
}
20 | */ 21 | fun generateId( 22 | id: UUID, 23 | language: Language 24 | ): String { 25 | val langHash: String = languageHashes.getValue(language) 26 | val timestamp: String = java.lang.Long.toHexString(System.currentTimeMillis()).padStart(12, '0') 27 | val idHash: String = Integer.toHexString(MurmurHash3.hash32(id.toString())).padStart(8, '0') 28 | return "$langHash$timestamp$idHash" 29 | } 30 | 31 | /** Returns the language for an identifier generated with [generateId]. 32 | * @param id Id to retrieve language. 33 | * @return The id language. 34 | */ 35 | fun getLanguage(id: String): Language { 36 | return reverseLanguageHashes.getValue(id.substring(0..7)) 37 | } 38 | } -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/IndexMapper.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search 2 | 3 | import be.rlab.nlp.model.Language 4 | import be.rlab.search.model.* 5 | import be.rlab.search.schema.DocumentSchemaBuilder 6 | import be.rlab.search.mapper.FieldTypeMapper 7 | import be.rlab.search.mapper.ListTypeMapper 8 | import be.rlab.search.mapper.SimpleTypeMapper 9 | import kotlin.reflect.KClass 10 | import kotlin.reflect.full.primaryConstructor 11 | 12 | class IndexMapper( 13 | val indexManager: IndexManager, 14 | fieldTypeMappers: List = emptyList() 15 | ) { 16 | val fieldTypeMappers: List = fieldTypeMappers + listOf( 17 | SimpleTypeMapper(), 18 | ListTypeMapper() 19 | ) 20 | 21 | /** Analyzes and indexes a document reading the configuration from annotations. 22 | * @param source Document to index. Must be annotated with the proper annotations. 
23 | */ 24 | fun index(source: T, language: Language) { 25 | val schema = DocumentSchemaBuilder.buildFromClass(source::class, fieldTypeMappers) 26 | val builder = DocumentBuilder.fromObject(schema, language, source, LuceneIndex.CURRENT_VERSION) 27 | indexManager.index(builder.build()) 28 | } 29 | 30 | /** Search for documents in a specific language. 31 | * 32 | * The query builder provides a flexible interface to build Lucene queries. 33 | * 34 | * The cursor and the limit allow to paginate the search results. If you provide a cursor returned 35 | * in a previous [SearchResult], this method resumes the search from there. 36 | * 37 | * @param language Language of the index to search. 38 | * @param cursor Cursor to resume a paginated search. 39 | * @param limit Max number of results to retrieve. 40 | * @param builder Query builder. 41 | */ 42 | inline fun search( 43 | language: Language, 44 | cursor: Cursor = Cursor.first(), 45 | limit: Int = IndexManager.DEFAULT_LIMIT, 46 | builder: QueryBuilder.() -> Unit 47 | ): TypedSearchResult { 48 | val schema: DocumentSchema = DocumentSchemaBuilder.buildFromClass(T::class, fieldTypeMappers) 49 | val query = QueryBuilder.forSchema(schema, language).apply(builder) 50 | val result = indexManager.search(query, cursor, limit) 51 | 52 | return TypedSearchResult( 53 | docs = result.docs.map { source -> convert(source, T::class) }, 54 | total = result.total, 55 | next = result.next 56 | ) 57 | } 58 | 59 | fun convert(source: Document, targetType: KClass): T { 60 | val docSchema = DocumentSchemaBuilder.buildFromClass(targetType, fieldTypeMappers) 61 | val constructor = targetType.primaryConstructor ?: throw RuntimeException("no primary constructor found") 62 | val values: List = constructor.parameters.map { param -> 63 | val mapper = fieldTypeMappers.firstOrNull { mapper -> mapper.supports(param.type) } 64 | ?: throw RuntimeException("no mapper found for type: ${param.type}") 65 | val value: Any? 
= mapper.mapValue(param.type, param.name!!, docSchema, source) 66 | require(param.type.isMarkedNullable || value != null) { "field value cannot be null: name=${param.name}" } 67 | value 68 | } 69 | return constructor.call(*values.toTypedArray()) 70 | } 71 | 72 | inline fun convert(source: Document): T { 73 | return convert(source, T::class) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/LuceneFieldUtils.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search 2 | 3 | import be.rlab.search.LuceneIndex.Companion.METADATA_FIELD 4 | import be.rlab.search.model.Field 5 | import be.rlab.search.model.FieldMetadata 6 | import be.rlab.search.model.FieldType 7 | import org.apache.lucene.document.* 8 | import org.apache.lucene.util.BytesRef 9 | import org.apache.lucene.document.Document as LuceneDocument 10 | import org.apache.lucene.document.Field as LuceneField 11 | 12 | object LuceneFieldUtils { 13 | const val PRIVATE_FIELD_PREFIX: String = "private!!" 
14 | 15 | fun LuceneDocument.addField( 16 | field: Field, 17 | version: String 18 | ) { 19 | require(field.values.isNotEmpty()) { "field value cannot be null" } 20 | 21 | if (field.docValues) { 22 | docValuesFromField(field).forEach(::add) 23 | newFields(field).map(::newStoredField).forEach(::add) 24 | } else { 25 | val newFields = newFields(field) 26 | if (field.stored) { 27 | newFields 28 | .filter { newField -> newField.numericValue() != null } 29 | .map(::newStoredField) 30 | .forEach(::add) 31 | } 32 | newFields.forEach(::add) 33 | } 34 | 35 | add( 36 | StringField( 37 | privateField("${field.name}!!$METADATA_FIELD", version), 38 | FieldMetadata( 39 | type = field.type, 40 | stored = field.stored, 41 | indexed = field.indexed, 42 | docValues = field.docValues 43 | ).serialize(), 44 | LuceneField.Store.YES 45 | ) 46 | ) 47 | } 48 | 49 | fun privateField( 50 | name: String, 51 | version: String = LuceneIndex.CURRENT_VERSION 52 | ): String { 53 | return when (version) { 54 | "1" -> name 55 | "2" -> "${PRIVATE_FIELD_PREFIX}$name" 56 | else -> throw RuntimeException("invalid document version: $version") 57 | } 58 | } 59 | 60 | private fun docValuesFromField(field: Field): List { 61 | require(field.docValues) { "the field is not marked as doc value" } 62 | require(field.values.isNotEmpty()) { "at least one value is required" } 63 | 64 | return when (field.type) { 65 | FieldType.STRING, FieldType.TEXT -> if (field.values.size > 1) 66 | field.values.map { SortedSetDocValuesField(field.name, BytesRef(it as String)) } 67 | else 68 | listOf(SortedDocValuesField(field.name, BytesRef(field.values.first() as String))) 69 | FieldType.INT, FieldType.LONG, FieldType.FLOAT, FieldType.DOUBLE -> if (field.values.size > 1) { 70 | toArray(field.values).map { value -> 71 | SortedNumericDocValuesField(field.name, value.toLong()) 72 | } 73 | } else 74 | listOf(NumericDocValuesField(field.name, (field.values.first() as Number).toLong())) 75 | } 76 | } 77 | 78 | private fun 
newFields(field: Field): List { 79 | return when (field.type) { 80 | FieldType.STRING -> toArray(field.values).map { value -> 81 | StringField(field.name, value, if (field.stored) { 82 | LuceneField.Store.YES 83 | } else { 84 | LuceneField.Store.NO 85 | }) 86 | } 87 | 88 | FieldType.TEXT -> toArray(field.values).map { value -> 89 | TextField(field.name, value, if (field.stored) { 90 | LuceneField.Store.YES 91 | } else { 92 | LuceneField.Store.NO 93 | }) 94 | } 95 | 96 | FieldType.INT -> listOf(IntPoint(field.name, *toArray(field.values).toIntArray())) 97 | FieldType.LONG -> listOf(LongPoint(field.name, *toArray(field.values).toLongArray())) 98 | FieldType.FLOAT -> listOf(FloatPoint(field.name, *toArray(field.values).toFloatArray())) 99 | FieldType.DOUBLE -> listOf(DoublePoint(field.name, *toArray(field.values).toDoubleArray())) 100 | } 101 | } 102 | 103 | private fun newStoredField(field: LuceneField): LuceneField { 104 | return field.numericValue()?.let { value -> 105 | when (field) { 106 | is IntPoint -> StoredField(field.name(), value.toInt()) 107 | is LongPoint -> StoredField(field.name(), value.toLong()) 108 | is FloatPoint -> StoredField(field.name(), value.toFloat()) 109 | is DoublePoint -> StoredField(field.name(), value.toDouble()) 110 | else -> field 111 | } 112 | } ?: field 113 | } 114 | 115 | private inline fun toArray(source: List): Array { 116 | return source.map { item -> item as T }.toTypedArray() 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/annotation/IndexDocument.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.annotation 2 | 3 | @Target(AnnotationTarget.CLASS) 4 | @Retention(AnnotationRetention.RUNTIME) 5 | annotation class IndexDocument( 6 | val namespace: String 7 | ) 8 | -------------------------------------------------------------------------------- 
/** Configures how a property is written to the index.
 *
 * The annotation applies to properties and it is retained at runtime.
 *
 * NOTE(review): the component that reads this annotation is not visible from here, so the
 * attribute descriptions below are inferred from their names and types; confirm them against
 * the schema builder.
 */
@Target(AnnotationTarget.PROPERTY)
@Retention(AnnotationRetention.RUNTIME)
annotation class IndexField(
    /** Deprecated alias of [name]. */
    @Deprecated("use name instead")
    val fieldName: String = "",
    /** Presumably the name of the field in the index. Empty by default. */
    val name: String = "",
    /** Three-state flag, presumably to index the field. [BoolValue.DEFAULT] by default. */
    val index: BoolValue = BoolValue.DEFAULT,
    /** Presumably enables doc values for the field. Disabled by default. */
    val docValues: Boolean = false,
    /** Three-state flag, presumably to store the field. [BoolValue.DEFAULT] by default. */
    val store: BoolValue = BoolValue.DEFAULT,
)
/** Mark this field to be stored in the index.
 *
 * The annotation applies to properties and it is retained at runtime. It is deprecated in favor
 * of the `store` attribute of `@IndexField`.
 */
@Target(AnnotationTarget.PROPERTY)
@Retention(AnnotationRetention.RUNTIME)
@Deprecated("Use @IndexField(store = BoolValue.YES | BoolValue.NO) instead")
annotation class Stored(
    /** True to store the field value, false otherwise. It is true by default. */
    val value: Boolean = true
)
/** Maps Kotlin [List] properties to index fields and back.
 *
 * The field type is taken from the type of the list elements.
 */
class ListTypeMapper : FieldTypeMapper {
    /** Tells whether the type is a [List]. */
    override fun supports(sourceType: KType): Boolean =
        sourceType.classifier == List::class

    /** Creates the schema of a single field whose type is resolved from the list element type.
     * It fails if the type is not a non-nullable list of String, Int, Long, Double or Float.
     */
    override fun mapSchema(sourceType: KType, builder: FieldSchemaBuilder): List<FieldSchema> {
        require(sourceType.classifier == List::class) { "source type not supported: $sourceType" }
        require(sourceType.arguments.isNotEmpty()) { "list type cannot be resolved" }
        require(!sourceType.isMarkedNullable) { "List type cannot be null" }

        val elementType = sourceType.arguments.first().type?.classifier
        val fieldType: FieldType = when (elementType) {
            String::class -> FieldType.TEXT
            Int::class -> FieldType.INT
            Long::class -> FieldType.LONG
            Double::class -> FieldType.DOUBLE
            Float::class -> FieldType.FLOAT
            else -> throw RuntimeException(
                "Unsupported property type '${sourceType}' for field '${builder.name}'"
            )
        }
        val fieldSchema = builder.type(fieldType).build()

        return listOf(fieldSchema)
    }

    /** Reads all the values of a field from the document.
     * @return the list of values, or null if the field is not in the schema or in the document.
     */
    @Suppress("UNCHECKED_CAST")
    override fun <T> mapValue(targetType: KType, fieldName: String, schema: DocumentSchema, document: Document): T? {
        require(targetType.classifier == List::class) { "target type not supported: $targetType" }

        val fieldSchema = schema.findField(fieldName) ?: return null
        return document.getValues<Any>(fieldSchema.name) as T
    }
}
/** Three-state boolean for places where a nullable value cannot be used as the third state,
 * like in annotations.
 */
enum class BoolValue {
    YES,
    NO,
    DEFAULT;

    /** Converts this value to a boolean.
     * @param defaultValue Value to return if this is [DEFAULT].
     * @return true for [YES], false for [NO], the default value for [DEFAULT].
     */
    fun resolve(defaultValue: Boolean): Boolean = when (this) {
        YES -> true
        NO -> false
        DEFAULT -> defaultValue
    }
}
/** A document in the Lucene index.
 */
data class Document(
    /** Document id generated by [generateId] */
    val id: String,
    /** Namespace of the collection this document belongs to. */
    val namespace: String,
    /** Fields of the document. */
    val fields: List<Field<*>>,
    /** Document version, used to keep backward compatibility between releases. */
    val version: String
) {
    companion object {
        /** Creates a document with an existing id. */
        fun new(
            id: String,
            namespace: String,
            fields: List<Field<*>>,
            version: String
        ): Document {
            return Document(id, namespace, fields, version)
        }

        /** Creates a document with a new id generated for the language from a random UUID. */
        fun new(
            namespace: String,
            language: Language,
            fields: List<Field<*>>,
            version: String
        ): Document {
            val newId: String = generateId(UUID.randomUUID(), language)
            return Document(newId, namespace, fields, version)
        }
    }

    /** Returns all the values of the first field with the required name.
     * @param fieldName Name of the field.
     * @return the field values, or null if the field does not exist.
     */
    @Suppress("UNCHECKED_CAST")
    fun <T> getValues(fieldName: String): List<T>? {
        val match = fields.firstOrNull { candidate -> candidate.name == fieldName }
            ?: return null
        return match.values as List<T>
    }

    /** Returns the value of the first field with the required name.
     * If the target type is [List] it returns all the values, otherwise it returns the first one.
     * @param name Name of the field.
     * @return the field value, or null if the field does not exist or it has no values.
     */
    inline operator fun <reified T> get(name: String): T? {
        val values = fields.firstOrNull { candidate -> candidate.name == name }?.values
        return when (T::class) {
            List::class -> values as T?
            else -> values?.firstOrNull() as T?
        }
    }
}
/** Field metadata stored in the index.
 *
 * It is serialized as a comma-separated list of `key=value` entries, where the booleans are
 * written as `1` and `0`.
 */
data class FieldMetadata(
    val type: FieldType,
    val stored: Boolean,
    val indexed: Boolean,
    val docValues: Boolean
) {
    companion object {
        /** Reads the metadata from the format written by [serialize].
         * @param metadata Serialized metadata.
         * @return the field metadata.
         */
        fun deserialize(metadata: String): FieldMetadata {
            val properties = mutableMapOf<String, String>()
            for (property in metadata.split(",")) {
                val parts = property.split("=")
                properties[parts[0]] = parts[1]
            }

            val type = FieldType.valueOf(properties.getValue("type"))
            val stored = properties.getValue("stored") == "1"
            val indexed = properties.getValue("indexed") == "1"
            val docValues = properties.getValue("docValues") == "1"

            return FieldMetadata(type, stored, indexed, docValues)
        }

        /** Writes a boolean as `1` or `0`. */
        private fun mapBool(value: Boolean): String = when (value) {
            true -> "1"
            false -> "0"
        }
    }

    /** Writes the metadata as a comma-separated list of `key=value` entries. */
    fun serialize(): String {
        val properties = listOf(
            "type" to type.name,
            "stored" to mapBool(stored),
            "indexed" to mapBool(indexed),
            "docValues" to mapBool(docValues)
        )
        return properties.joinToString(",") { (key, value) -> "$key=$value" }
    }
}
18 | * @param stored True to store the field, false otherwise. 19 | * @param indexed True to index the field, false otherwise. 20 | * @param propertyName Kotlin's property name related to this field, if it applies. 21 | */ 22 | fun new( 23 | name: String, 24 | type: FieldType, 25 | stored: Boolean, 26 | indexed: Boolean, 27 | docValues: Boolean, 28 | propertyName: String? 29 | ): FieldSchema = FieldSchema( 30 | name = name, 31 | type = type, 32 | indexed = indexed, 33 | stored = stored, 34 | docValues = docValues, 35 | propertyName = propertyName 36 | ) 37 | 38 | fun validate(name: String, type: FieldType, values: List<*>) { 39 | require(values.isNotEmpty()) { "the field must have a value: name=$name" } 40 | 41 | when (type) { 42 | FieldType.TEXT, FieldType.STRING -> 43 | require(values.all { it is String }) { "one or more values are not String: name=$name" } 44 | FieldType.INT -> 45 | require(values.all { it is Int }) { "one or more values are not Int: name=$name" } 46 | FieldType.LONG -> 47 | require(values.all { it is Long }) { "one or more values are not Long: name=$name" } 48 | FieldType.FLOAT -> 49 | require(values.all { it is Float }) { "one or more values are not Float: name=$name" } 50 | FieldType.DOUBLE -> 51 | require(values.all { it is Double }) { "one or more values are not Double: name=$name" } 52 | } 53 | } 54 | } 55 | 56 | fun validate(values: List<*>): FieldSchema = apply { 57 | validate(name, type, values) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/model/FieldType.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.model 2 | 3 | /** Index supported field types. 4 | */ 5 | enum class FieldType( 6 | val stored: Boolean, 7 | val indexed: Boolean 8 | ) { 9 | /** A String field that is stored but not tokenized. 
/** Index configuration.
 */
data class IndexConfig(
    /** Languages supported by the index. */
    val supportedLanguages: List<Language>,
    /** Similarity set for the index. */
    val similarity: Similarity
) {
    companion object {
        /** Creates a configuration with the required languages and similarity. */
        fun new(
            supportedLanguages: List<Language>,
            similarity: Similarity
        ): IndexConfig {
            return IndexConfig(supportedLanguages, similarity)
        }

        /** Creates a configuration with all the languages and a new [BM25Similarity]. */
        fun default(): IndexConfig {
            return IndexConfig(
                supportedLanguages = Language.entries,
                similarity = BM25Similarity()
            )
        }
    }
}
/** Builder to create Lucene [Query]s.
 *
 * All the clauses are added to a single root boolean query. A builder is created with [query]
 * or [forSchema]; both add a term for the namespace private field.
 */
class QueryBuilder private constructor (
    /** Language of the index to search. */
    val language: Language,
    /** Fields of the schema, empty if the builder was not created from a schema. */
    val fields: List<FieldSchema>
) {

    /** Modifiers that can be applied to a clause. */
    class QueryModifiers(
        /** Boost for the clause; a negative value means that the clause is not boosted. */
        internal var boost: Float = -1.0F,
        /** Names of the fields to search by. */
        internal val searchBy: MutableList<String> = mutableListOf()
    ) {
        /** Sets the boost of the clause. */
        fun boost(score: Float) {
            boost = score
        }

        /** Restricts a search by all fields to the required fields. */
        fun by(vararg fields: String) {
            searchBy.addAll(fields.toList())
        }
    }

    // The root builder is never replaced, the clauses are added to this instance.
    private val root: BooleanQuery.Builder = BooleanQuery.Builder()
    private val sortFields: MutableList<SortField> = mutableListOf()

    /** Builds the Lucene query with all the clauses added so far. */
    fun build(): Query {
        return root.build()
    }

    /** Returns the sort criteria, or null if no sort field was added. */
    fun sort(): Sort? {
        return if (sortFields.isNotEmpty()) {
            Sort(*sortFields.toTypedArray())
        } else {
            null
        }
    }

    /** Adds a clause for a field selected by the caller.
     * @param occur Indicates how the clause occurs in the root query.
     * @param callback Configures the modifiers of the clause.
     * @param builder Creates the query for the clause.
     */
    fun findByField(
        occur: BooleanClause.Occur,
        callback: QueryModifiers.() -> Unit,
        builder: () -> Query
    ): QueryBuilder = apply {
        root.add(withModifiers(
            builder(),
            callback
        ), occur)
    }

    /** Adds a clause for the field related to a property.
     * It fails if the builder has no schema fields or the property has no field in the schema.
     * @param property Property to resolve the field.
     * @param occur Indicates how the clause occurs in the root query.
     * @param callback Configures the modifiers of the clause.
     * @param builder Creates the query for the resolved field.
     */
    fun findByProperty(
        property: KProperty1<*, *>,
        occur: BooleanClause.Occur,
        callback: QueryModifiers.() -> Unit,
        builder: (FieldSchema) -> Query
    ): QueryBuilder = apply {
        require(fields.isNotEmpty()) { "QueryBuilder does not support search by multiple fields" }

        val field = requireNotNull(getFieldSchema(property.name)) {
            "property '${property.name}' not annotated with @IndexField"
        }

        root.add(withModifiers(
            builder(field),
            callback
        ), occur)
    }

    /** Adds a clause that searches by all the fields in the schema.
     *
     * The queries for each field are grouped as optional clauses of a child boolean query. If
     * the modifiers select fields with [QueryModifiers.by], only those fields are searched;
     * the selection matches either the field name or the property name, like [getFieldSchema].
     *
     * It fails if the builder has no schema fields.
     *
     * @param occur Indicates how the child query occurs in the root query.
     * @param callback Configures the modifiers applied to each field query.
     * @param builder Creates the query for each field.
     */
    fun findByAllFields(
        occur: BooleanClause.Occur,
        callback: QueryModifiers.() -> Unit,
        builder: (FieldSchema) -> Query
    ): QueryBuilder = apply {
        require(fields.isNotEmpty()) { "QueryBuilder does not support search by multiple fields" }
        val modifiers = QueryModifiers().apply(callback)
        val selectedFields = if (modifiers.searchBy.isNotEmpty()) {
            fields.filter { field ->
                modifiers.searchBy.any { name -> name == field.name || name == field.propertyName }
            }
        } else {
            fields
        }
        val queries = selectedFields.map(builder)
        val child: BooleanQuery.Builder = queries.fold(BooleanQuery.Builder()) { aggregate, query ->
            aggregate.add(withModifiers(query, modifiers), BooleanClause.Occur.SHOULD)
        }
        root.add(child.build(), occur)
    }

    /** Gives direct access to the root boolean query builder. */
    fun custom(callback: (BooleanQuery.Builder) -> Unit): QueryBuilder = apply {
        callback(root)
    }

    /** Normalizes a value with the [Normalizer] for the builder language, only if required.
     * @param value Value to normalize.
     * @param normalize True to normalize the value, false to return it as is.
     */
    fun normalizeIfRequired(value: String, normalize: Boolean = false): String {
        return if (normalize) {
            Normalizer.new(value, language).normalize()
        } else {
            value
        }
    }

    /** Returns a field schema by name or by property name.
     * @param name Field name or property name.
     * @return the required schema, if exists.
     */
    fun getFieldSchema(name: String): FieldSchema? {
        return fields.find { field -> field.name == name || field.propertyName == name }
    }

    /** Adds a sorting criteria to the list of existing criteria.
     * @param sortField Sorting criteria.
     */
    fun addSortField(sortField: SortField): QueryBuilder = apply {
        sortFields += sortField
    }

    private fun withModifiers(
        query: Query,
        callback: QueryModifiers.() -> Unit
    ): Query =
        withModifiers(query, QueryModifiers().apply(callback))

    /** Wraps the query into a [BoostQuery] if the modifiers have a non-negative boost. */
    private fun withModifiers(
        query: Query,
        modifiers: QueryModifiers
    ): Query {
        return if (modifiers.boost >= 0) {
            BoostQuery(query, modifiers.boost)
        } else {
            query
        }
    }

    companion object {
        /** Creates a builder for a namespace without a schema.
         * The builder has no fields, so [findByProperty] and [findByAllFields] are not supported.
         */
        fun query(
            namespace: String,
            language: Language
        ): QueryBuilder {
            val builder = QueryBuilder(language, fields = emptyList())

            return builder.apply {
                term(privateField(NAMESPACE_FIELD), namespace, normalize = false)
            }
        }

        /** Creates a builder for the namespace and the fields of a schema. */
        fun forSchema(
            schema: DocumentSchema,
            language: Language
        ): QueryBuilder {
            val builder = QueryBuilder(language, fields = schema.fields)

            return builder.apply {
                term(privateField(NAMESPACE_FIELD), schema.namespace, normalize = false)
            }
        }
    }
}
= null 18 | ): SearchResult = 19 | SearchResult( 20 | docs = results, 21 | total = total, 22 | next = next 23 | ) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/model/TypedSearchResult.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.model 2 | 3 | /** Contains the paginated search results mapped to the target type. 4 | */ 5 | data class TypedSearchResult( 6 | /** Search results. */ 7 | val docs: List, 8 | /** Total number of documents in the search. */ 9 | val total: Long, 10 | /** Cursor to retrieve the next page. */ 11 | val next: Cursor? 12 | ) { 13 | companion object { 14 | fun new( 15 | results: List, 16 | total: Long, 17 | next: Cursor? = null 18 | ): TypedSearchResult = 19 | TypedSearchResult( 20 | docs = results, 21 | total = total, 22 | next = next 23 | ) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/query/DoubleRange.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.query 2 | 3 | import be.rlab.search.model.QueryBuilder 4 | import org.apache.lucene.document.DoublePoint 5 | import org.apache.lucene.search.BooleanClause 6 | import kotlin.reflect.KProperty1 7 | 8 | fun QueryBuilder.range( 9 | fieldName: String, 10 | lowerValue: Double, 11 | upperValue: Double, 12 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 13 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 14 | ): QueryBuilder = findByField(occur, callback) { 15 | DoublePoint.newRangeQuery(fieldName, lowerValue, upperValue) 16 | } 17 | 18 | fun QueryBuilder.range( 19 | lowerValue: Double, 20 | upperValue: Double, 21 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 22 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 23 | ): QueryBuilder = findByAllFields(occur, callback) { field 
-> 24 | DoublePoint.newRangeQuery(field.name, lowerValue, upperValue) 25 | } 26 | 27 | fun QueryBuilder.range( 28 | property: KProperty1, 29 | lowerValue: Double, 30 | upperValue: Double, 31 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 32 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 33 | ): QueryBuilder = findByProperty(property, occur, callback) { field -> 34 | DoublePoint.newRangeQuery(field.name, lowerValue, upperValue) 35 | } 36 | 37 | fun QueryBuilder.range( 38 | fieldName: String, 39 | lowerValue: DoubleArray, 40 | upperValue: DoubleArray, 41 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 42 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 43 | ): QueryBuilder = findByField(occur, callback) { 44 | DoublePoint.newRangeQuery(fieldName, lowerValue, upperValue) 45 | } 46 | 47 | fun QueryBuilder.range( 48 | lowerValue: DoubleArray, 49 | upperValue: DoubleArray, 50 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 51 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 52 | ): QueryBuilder = findByAllFields(occur, callback) { field -> 53 | DoublePoint.newRangeQuery(field.name, lowerValue, upperValue) 54 | } 55 | 56 | fun QueryBuilder.range( 57 | property: KProperty1, 58 | lowerValue: DoubleArray, 59 | upperValue: DoubleArray, 60 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 61 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 62 | ): QueryBuilder = findByProperty(property, occur, callback) { field -> 63 | DoublePoint.newRangeQuery(field.name, lowerValue, upperValue) 64 | } 65 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/query/DoubleTerm.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.query 2 | 3 | import be.rlab.search.model.QueryBuilder 4 | import org.apache.lucene.document.DoublePoint 5 | import org.apache.lucene.search.BooleanClause 6 | import 
kotlin.reflect.KProperty1 7 | 8 | fun QueryBuilder.term( 9 | fieldName: String, 10 | value: Double, 11 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 12 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 13 | ): QueryBuilder = findByField(occur, callback) { 14 | DoublePoint.newExactQuery(fieldName, value) 15 | } 16 | 17 | fun QueryBuilder.term( 18 | value: Double, 19 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 20 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 21 | ): QueryBuilder = findByAllFields(occur, callback) { field -> 22 | DoublePoint.newExactQuery(field.name, value) 23 | } 24 | 25 | fun QueryBuilder.term( 26 | property: KProperty1, 27 | value: Double, 28 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 29 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 30 | ): QueryBuilder = findByProperty(property, occur, callback) { field -> 31 | DoublePoint.newExactQuery(field.name, value) 32 | } 33 | 34 | fun QueryBuilder.term( 35 | fieldName: String, 36 | value: DoubleArray, 37 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 38 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 39 | ): QueryBuilder = findByField(occur, callback) { 40 | DoublePoint.newSetQuery(fieldName, *value) 41 | } 42 | 43 | fun QueryBuilder.term( 44 | value: DoubleArray, 45 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 46 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 47 | ): QueryBuilder = findByAllFields(occur, callback) { field -> 48 | DoublePoint.newSetQuery(field.name, *value) 49 | } 50 | 51 | fun QueryBuilder.term( 52 | property: KProperty1, 53 | value: DoubleArray, 54 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 55 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 56 | ): QueryBuilder = findByProperty(property, occur, callback) { field -> 57 | DoublePoint.newSetQuery(field.name, *value) 58 | } 59 | -------------------------------------------------------------------------------- 
/src/main/kotlin/be/rlab/search/query/FloatRange.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.document.FloatPoint
import org.apache.lucene.search.BooleanClause
import kotlin.reflect.KProperty1

/** Adds a range clause on the float point field [fieldName].
 *
 * Both bounds are inclusive, as defined by [FloatPoint.newRangeQuery].
 * @param fieldName Name of the field to search.
 * @param lowerValue Lower bound of the range.
 * @param upperValue Upper bound of the range.
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.range(
    fieldName: String,
    lowerValue: Float,
    upperValue: Float,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    FloatPoint.newRangeQuery(fieldName, lowerValue, upperValue)
}

/** Adds a float range clause on the fields of the schema.
 *
 * One range query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @param lowerValue Lower bound of the range, inclusive.
 * @param upperValue Upper bound of the range, inclusive.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.range(
    lowerValue: Float,
    upperValue: Float,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    FloatPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a float range clause on the field mapped to [property].
 *
 * The field is looked up in the schema by the property name.
 * @param lowerValue Lower bound of the range, inclusive.
 * @param upperValue Upper bound of the range, inclusive.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.range(
    property: KProperty1,
    lowerValue: Float,
    upperValue: Float,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    FloatPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a multi-dimensional range clause on the float point field [fieldName].
 *
 * [lowerValue] and [upperValue] are handed to [FloatPoint.newRangeQuery] as the per-dimension
 * bounds of the range.
 * @return this builder.
 */
fun QueryBuilder.range(
    fieldName: String,
    lowerValue: FloatArray,
    upperValue: FloatArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    FloatPoint.newRangeQuery(fieldName, lowerValue, upperValue)
}

/** Adds a multi-dimensional float range clause on the fields of the schema.
 *
 * One range query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.range(
    lowerValue: FloatArray,
    upperValue: FloatArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    FloatPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a multi-dimensional float range clause on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.range(
    property: KProperty1,
    lowerValue: FloatArray,
    upperValue: FloatArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    FloatPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/FloatTerm.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.document.FloatPoint
import org.apache.lucene.search.BooleanClause
import kotlin.reflect.KProperty1

/** Adds a clause matching documents whose float point field [fieldName] equals [value].
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.term(
    fieldName: String,
    value: Float,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    FloatPoint.newExactQuery(fieldName, value)
}

/** Adds an exact float match on the fields of the schema.
 *
 * One exact query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.term(
    value: Float,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    FloatPoint.newExactQuery(schema.name, value)
}

/** Adds an exact float match on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.term(
    property: KProperty1,
    value: Float,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    FloatPoint.newExactQuery(schema.name, value)
}

/** Adds a clause matching documents whose float point field [fieldName] holds any of the
 * values in [value].
 * @return this builder.
 */
fun QueryBuilder.term(
    fieldName: String,
    value: FloatArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    FloatPoint.newSetQuery(fieldName, *value)
}

/** Adds an "any of these floats" match on the fields of the schema.
 *
 * One set query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.term(
    value: FloatArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    FloatPoint.newSetQuery(schema.name, *value)
}

/** Adds an "any of these floats" match on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.term(
    property: KProperty1,
    value: FloatArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    FloatPoint.newSetQuery(schema.name, *value)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/Fuzzy.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.index.Term
import org.apache.lucene.search.BooleanClause
import org.apache.lucene.search.FuzzyQuery
import kotlin.reflect.KProperty1

/** Adds a fuzzy clause on the field [fieldName].
 *
 * The remaining arguments are forwarded unchanged to the [FuzzyQuery] constructor.
 * @param fieldName Name of the field to search.
 * @param value Term to search for.
 * @param normalize Whether [value] is normalized for the builder's language before searching.
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.fuzzy(
    fieldName: String,
    value: String,
    normalize: Boolean = true,
    maxEdits: Int = FuzzyQuery.defaultMaxEdits,
    prefixLength: Int = FuzzyQuery.defaultPrefixLength,
    maxExpansions: Int = FuzzyQuery.defaultMaxExpansions,
    transpositions: Boolean = FuzzyQuery.defaultTranspositions,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    val text = normalizeIfRequired(value, normalize)
    FuzzyQuery(Term(fieldName, text), maxEdits, prefixLength, maxExpansions, transpositions)
}

/** Adds a fuzzy clause on the fields of the schema.
 *
 * One [FuzzyQuery] is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @param value Term to search for.
 * @param normalize Whether [value] is normalized for the builder's language before searching.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.fuzzy(
    value: String,
    normalize: Boolean = true,
    maxEdits: Int = FuzzyQuery.defaultMaxEdits,
    prefixLength: Int = FuzzyQuery.defaultPrefixLength,
    maxExpansions: Int = FuzzyQuery.defaultMaxExpansions,
    transpositions: Boolean = FuzzyQuery.defaultTranspositions,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder {
    // The normalized text does not depend on the field, so it is computed once
    // instead of once per field inside the per-field builder.
    val text = normalizeIfRequired(value, normalize)
    return findByAllFields(occur = occur, callback = callback) { schema ->
        FuzzyQuery(Term(schema.name, text), maxEdits, prefixLength, maxExpansions, transpositions)
    }
}

/** Adds a fuzzy clause on the field mapped to [property].
 * @param value Term to search for.
 * @param normalize Whether [value] is normalized for the builder's language before searching.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.fuzzy(
    property: KProperty1,
    value: String,
    normalize: Boolean = true,
    maxEdits: Int = FuzzyQuery.defaultMaxEdits,
    prefixLength: Int = FuzzyQuery.defaultPrefixLength,
    maxExpansions: Int = FuzzyQuery.defaultMaxExpansions,
    transpositions: Boolean = FuzzyQuery.defaultTranspositions,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    val text = normalizeIfRequired(value, normalize)
    FuzzyQuery(Term(schema.name, text), maxEdits, prefixLength, maxExpansions, transpositions)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/IntRange.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.document.IntPoint
import org.apache.lucene.search.BooleanClause
import kotlin.reflect.KProperty1

/** Adds a range clause on the int point field [fieldName].
 *
 * Both bounds are inclusive, as defined by [IntPoint.newRangeQuery].
 * @param fieldName Name of the field to search.
 * @param lowerValue Lower bound of the range.
 * @param upperValue Upper bound of the range.
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.range(
    fieldName: String,
    lowerValue: Int,
    upperValue: Int,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    IntPoint.newRangeQuery(fieldName, lowerValue, upperValue)
}

/** Adds an int range clause on the fields of the schema.
 *
 * One range query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @param lowerValue Lower bound of the range, inclusive.
 * @param upperValue Upper bound of the range, inclusive.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.range(
    lowerValue: Int,
    upperValue: Int,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    IntPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds an int range clause on the field mapped to [property].
 * @param lowerValue Lower bound of the range, inclusive.
 * @param upperValue Upper bound of the range, inclusive.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.range(
    property: KProperty1,
    lowerValue: Int,
    upperValue: Int,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    IntPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a multi-dimensional range clause on the int point field [fieldName].
 *
 * [lowerValue] and [upperValue] are handed to [IntPoint.newRangeQuery] as the per-dimension
 * bounds of the range.
 * @return this builder.
 */
fun QueryBuilder.range(
    fieldName: String,
    lowerValue: IntArray,
    upperValue: IntArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    IntPoint.newRangeQuery(fieldName, lowerValue, upperValue)
}

/** Adds a multi-dimensional int range clause on the fields of the schema.
 *
 * One range query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.range(
    lowerValue: IntArray,
    upperValue: IntArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    IntPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a multi-dimensional int range clause on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.range(
    property: KProperty1,
    lowerValue: IntArray,
    upperValue: IntArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    IntPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/IntTerm.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.document.IntPoint
import org.apache.lucene.search.BooleanClause
import kotlin.reflect.KProperty1

/** Adds a clause matching documents whose int point field [fieldName] equals [value].
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.term(
    fieldName: String,
    value: Int,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    IntPoint.newExactQuery(fieldName, value)
}

/** Adds an exact int match on the fields of the schema.
 *
 * One exact query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.term(
    value: Int,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    IntPoint.newExactQuery(schema.name, value)
}

/** Adds an exact int match on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.term(
    property: KProperty1,
    value: Int,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    IntPoint.newExactQuery(schema.name, value)
}

/** Adds a clause matching documents whose int point field [fieldName] holds any of the
 * values in [value].
 * @return this builder.
 */
fun QueryBuilder.term(
    fieldName: String,
    value: IntArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    IntPoint.newSetQuery(fieldName, *value)
}

/** Adds an "any of these ints" match on the fields of the schema.
 *
 * One set query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.term(
    value: IntArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    IntPoint.newSetQuery(schema.name, *value)
}

/** Adds an "any of these ints" match on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.term(
    property: KProperty1,
    value: IntArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    IntPoint.newSetQuery(schema.name, *value)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/LongRange.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.document.LongPoint
import org.apache.lucene.search.BooleanClause
import kotlin.reflect.KProperty1

/** Adds a range clause on the long point field [fieldName].
 *
 * Both bounds are inclusive, as defined by [LongPoint.newRangeQuery].
 * @param fieldName Name of the field to search.
 * @param lowerValue Lower bound of the range.
 * @param upperValue Upper bound of the range.
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.range(
    fieldName: String,
    lowerValue: Long,
    upperValue: Long,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    LongPoint.newRangeQuery(fieldName, lowerValue, upperValue)
}

/** Adds a long range clause on the fields of the schema.
 *
 * One range query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @param lowerValue Lower bound of the range, inclusive.
 * @param upperValue Upper bound of the range, inclusive.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.range(
    lowerValue: Long,
    upperValue: Long,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    LongPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a long range clause on the field mapped to [property].
 * @param lowerValue Lower bound of the range, inclusive.
 * @param upperValue Upper bound of the range, inclusive.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.range(
    property: KProperty1,
    lowerValue: Long,
    upperValue: Long,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    LongPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a multi-dimensional range clause on the long point field [fieldName].
 *
 * [lowerValue] and [upperValue] are handed to [LongPoint.newRangeQuery] as the per-dimension
 * bounds of the range.
 * @return this builder.
 */
fun QueryBuilder.range(
    fieldName: String,
    lowerValue: LongArray,
    upperValue: LongArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    LongPoint.newRangeQuery(fieldName, lowerValue, upperValue)
}

/** Adds a multi-dimensional long range clause on the fields of the schema.
 *
 * One range query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.range(
    lowerValue: LongArray,
    upperValue: LongArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    LongPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}

/** Adds a multi-dimensional long range clause on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.range(
    property: KProperty1,
    lowerValue: LongArray,
    upperValue: LongArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    LongPoint.newRangeQuery(schema.name, lowerValue, upperValue)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/LongTerm.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.document.LongPoint
import org.apache.lucene.search.BooleanClause
import kotlin.reflect.KProperty1

/** Adds a clause matching documents whose long point field [fieldName] equals [value].
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.term(
    fieldName: String,
    value: Long,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    LongPoint.newExactQuery(fieldName, value)
}

/** Adds an exact long match on the fields of the schema.
 *
 * One exact query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.term(
    value: Long,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    LongPoint.newExactQuery(schema.name, value)
}

/** Adds an exact long match on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.term(
    property: KProperty1,
    value: Long,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    LongPoint.newExactQuery(schema.name, value)
}

/** Adds a clause matching documents whose long point field [fieldName] holds any of the
 * values in [value].
 * @return this builder.
 */
fun QueryBuilder.term(
    fieldName: String,
    value: LongArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    LongPoint.newSetQuery(fieldName, *value)
}

/** Adds an "any of these longs" match on the fields of the schema.
 *
 * One set query is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.term(
    value: LongArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur = occur, callback = callback) { schema ->
    LongPoint.newSetQuery(schema.name, *value)
}

/** Adds an "any of these longs" match on the field mapped to [property].
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.term(
    property: KProperty1,
    value: LongArray,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    LongPoint.newSetQuery(schema.name, *value)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/Parser.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.AnalyzerFactory
import be.rlab.search.model.QueryBuilder
import org.apache.lucene.queryparser.classic.QueryParser
import org.apache.lucene.search.BooleanClause

/** Adds a clause parsed from a query string in the Lucene classic query syntax.
 *
 * The string is parsed by a [QueryParser] that uses an analyzer created for the builder's
 * language.
 * @param defaultFieldName Field given to the parser as its default field.
 * @param query Query string to parse.
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.parse(
    defaultFieldName: String,
    query: String,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    QueryParser(defaultFieldName, AnalyzerFactory.newAnalyzer(language)).parse(query)
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/Phrase.kt:
--------------------------------------------------------------------------------
package be.rlab.search.query

import be.rlab.search.model.QueryBuilder
import org.apache.lucene.search.BooleanClause
import org.apache.lucene.search.PhraseQuery
import kotlin.reflect.KProperty1

/** Returns the phrase terms in their original order, each one normalized for the builder's
 * language when [normalize] is set.
 */
private fun QueryBuilder.phraseTerms(values: Array<out String>, normalize: Boolean): Array<String> =
    Array(values.size) { index -> normalizeIfRequired(values[index], normalize) }

/** Adds a phrase clause on the field [fieldName].
 * @param fieldName Name of the field to search.
 * @param values Terms of the phrase, in order.
 * @param normalize Whether each term is normalized for the builder's language before searching.
 * @param maxEditDistance Passed to [PhraseQuery] as its slop.
 * @param occur How the clause is combined with the rest of the query.
 * @param callback Configures the modifiers (e.g. the boost) applied to the clause.
 * @return this builder.
 */
fun QueryBuilder.phrase(
    fieldName: String,
    vararg values: String,
    normalize: Boolean = true,
    maxEditDistance: Int = 0,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur = occur, callback = callback) {
    PhraseQuery(maxEditDistance, fieldName, *phraseTerms(values, normalize))
}

/** Adds a phrase clause on the fields of the schema.
 *
 * One [PhraseQuery] is built per field (only the `searchBy` fields when the modifiers declare
 * any); the queries are grouped as optional alternatives and the group is added with [occur].
 * @param values Terms of the phrase, in order.
 * @param normalize Whether each term is normalized for the builder's language before searching.
 * @param maxEditDistance Passed to [PhraseQuery] as its slop.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields.
 */
fun QueryBuilder.phrase(
    vararg values: String,
    normalize: Boolean = true,
    maxEditDistance: Int = 0,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder {
    // The normalized terms are the same for every field, so they are computed once
    // instead of being rebuilt inside the per-field builder.
    val terms = phraseTerms(values, normalize)
    return findByAllFields(occur = occur, callback = callback) { schema ->
        PhraseQuery(maxEditDistance, schema.name, *terms)
    }
}

/** Adds a phrase clause on the field mapped to [property].
 * @param values Terms of the phrase, in order.
 * @param normalize Whether each term is normalized for the builder's language before searching.
 * @param maxEditDistance Passed to [PhraseQuery] as its slop.
 * @return this builder.
 * @throws IllegalArgumentException if the builder has no fields or [property] is not mapped
 * to an index field.
 */
fun QueryBuilder.phrase(
    property: KProperty1,
    vararg values: String,
    normalize: Boolean = true,
    maxEditDistance: Int = 0,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property = property, occur = occur, callback = callback) { schema ->
    PhraseQuery(maxEditDistance, schema.name, *phraseTerms(values, normalize))
}
--------------------------------------------------------------------------------
/src/main/kotlin/be/rlab/search/query/Regex.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.query 2 | 3 | import be.rlab.search.model.QueryBuilder 4 | import org.apache.lucene.index.Term 5 | import org.apache.lucene.search.BooleanClause 6 | import org.apache.lucene.search.RegexpQuery 7 | import org.apache.lucene.util.automaton.RegExp 8 | import kotlin.reflect.KProperty1 9 | 10 | fun QueryBuilder.regex( 11 | fieldName: String, 12 | value: Regex, 13 | flags: Int = RegExp.ALL, 14 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 15 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 16 | ): QueryBuilder = findByField(occur, callback) { 17 | RegexpQuery(Term(fieldName, value.pattern), flags) 18 | } 19 | 20 | fun QueryBuilder.regex( 21 | value: Regex, 22 | flags: Int = RegExp.ALL, 23 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 24 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 25 | ): QueryBuilder = findByAllFields(occur, callback) { field -> 26 | RegexpQuery(Term(field.name, value.pattern), flags) 27 | } 28 | 29 | fun QueryBuilder.regex( 30 | property: KProperty1, 31 | value: Regex, 32 | flags: Int = RegExp.ALL, 33 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 34 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 35 | ): QueryBuilder = findByProperty(property, occur, callback) { field -> 36 | RegexpQuery(Term(field.name, value.pattern), flags) 37 | } 38 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/query/SortBy.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.query 2 | 3 | import be.rlab.search.model.FieldType 4 | import be.rlab.search.model.QueryBuilder 5 | import org.apache.lucene.search.SortField 6 | import org.apache.lucene.search.SortedNumericSortField 7 | import kotlin.reflect.KProperty1 8 | 9 | fun QueryBuilder.sortBy( 10 
| vararg fieldsNames: String, 11 | reverse: Boolean = false 12 | ): QueryBuilder = apply { 13 | fieldsNames 14 | .map { name -> requireNotNull(getFieldSchema(name)) { "field schema not found: name=$name" } } 15 | .forEach { fieldSchema -> 16 | require(fieldSchema.docValues) { "The field must be stored as docValues to enable sorting." } 17 | val sortField = when (fieldSchema.type) { 18 | FieldType.TEXT, FieldType.STRING -> SortField(fieldSchema.name, SortField.Type.STRING, reverse) 19 | FieldType.INT -> SortedNumericSortField(fieldSchema.name, SortField.Type.INT, reverse) 20 | FieldType.LONG -> SortedNumericSortField(fieldSchema.name, SortField.Type.LONG, reverse) 21 | FieldType.FLOAT -> SortedNumericSortField(fieldSchema.name, SortField.Type.FLOAT, reverse) 22 | FieldType.DOUBLE -> SortedNumericSortField(fieldSchema.name, SortField.Type.DOUBLE, reverse) 23 | } 24 | addSortField(sortField) 25 | } 26 | } 27 | 28 | fun QueryBuilder.sortBy( 29 | vararg properties: KProperty1, 30 | reverse: Boolean = false 31 | ): QueryBuilder = apply { 32 | sortBy(*properties.map { property -> property.name }.toTypedArray(), reverse = reverse) 33 | } 34 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/query/StringRange.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.query 2 | 3 | import be.rlab.search.model.QueryBuilder 4 | import org.apache.lucene.search.BooleanClause 5 | import org.apache.lucene.search.TermRangeQuery 6 | import kotlin.reflect.KProperty1 7 | 8 | fun QueryBuilder.range( 9 | fieldName: String, 10 | lowerTerm: String, 11 | upperTerm: String, 12 | includeLower: Boolean = true, 13 | includeUpper: Boolean = true, 14 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 15 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 16 | ): QueryBuilder = findByField(occur, callback) { 17 | TermRangeQuery.newStringRange(fieldName, lowerTerm, 
upperTerm, includeLower, includeUpper) 18 | } 19 | 20 | fun QueryBuilder.range( 21 | lowerTerm: String, 22 | upperTerm: String, 23 | includeLower: Boolean = true, 24 | includeUpper: Boolean = true, 25 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 26 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 27 | ): QueryBuilder = findByAllFields(occur, callback) { field -> 28 | TermRangeQuery.newStringRange(field.name, lowerTerm, upperTerm, includeLower, includeUpper) 29 | } 30 | 31 | fun QueryBuilder.range( 32 | property: KProperty1, 33 | lowerTerm: String, 34 | upperTerm: String, 35 | includeLower: Boolean = true, 36 | includeUpper: Boolean = true, 37 | occur: BooleanClause.Occur = BooleanClause.Occur.SHOULD, 38 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 39 | ): QueryBuilder = findByProperty(property, occur, callback) { field -> 40 | TermRangeQuery.newStringRange(field.name, lowerTerm, upperTerm, includeLower, includeUpper) 41 | } 42 | -------------------------------------------------------------------------------- /src/main/kotlin/be/rlab/search/query/StringTerm.kt: -------------------------------------------------------------------------------- 1 | package be.rlab.search.query 2 | 3 | import be.rlab.search.model.QueryBuilder 4 | import org.apache.lucene.index.Term 5 | import org.apache.lucene.search.BooleanClause 6 | import org.apache.lucene.search.TermQuery 7 | import kotlin.reflect.KProperty1 8 | 9 | fun QueryBuilder.term( 10 | fieldName: String, 11 | value: String, 12 | normalize: Boolean = true, 13 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 14 | callback: QueryBuilder.QueryModifiers.() -> Unit = {} 15 | ): QueryBuilder = findByField(occur, callback) { 16 | TermQuery(Term(fieldName, normalizeIfRequired(value, normalize))) 17 | } 18 | 19 | fun QueryBuilder.term( 20 | value: String, 21 | normalize: Boolean = true, 22 | occur: BooleanClause.Occur = BooleanClause.Occur.MUST, 23 | callback: QueryBuilder.QueryModifiers.() -> 
/** Adds a [WildcardQuery] on a single field, addressed by its name.
 *
 * The value is placed in the [Term] exactly as received; no normalization is applied here.
 *
 * @param fieldName Name of the field to search.
 * @param value Wildcard expression.
 * @param occur Boolean clause used to combine this query; defaults to MUST.
 * @param callback Optional query modifiers.
 * @return the query builder returned by `findByField`.
 */
fun QueryBuilder.wildcard(
    fieldName: String,
    value: String,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByField(occur, callback) {
    WildcardQuery(Term(fieldName, value))
}

/** Adds a [WildcardQuery] through `findByAllFields`, using the name of each field handed
 * to the lambda (presumably every field of the schema — verify against QueryBuilder).
 *
 * @param value Wildcard expression, used as-is.
 * @param occur Boolean clause used to combine this query; defaults to MUST.
 * @param callback Optional query modifiers.
 * @return the query builder returned by `findByAllFields`.
 */
fun QueryBuilder.wildcard(
    value: String,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByAllFields(occur, callback) { field ->
    WildcardQuery(Term(field.name, value))
}

/** Adds a [WildcardQuery] on the field resolved from a Kotlin property by `findByProperty`.
 *
 * NOTE(review): the type arguments of KProperty1 are not visible in this view of the
 * file — confirm them against the original source.
 *
 * @param property Property that identifies the field to search.
 * @param value Wildcard expression, used as-is.
 * @param occur Boolean clause used to combine this query; defaults to MUST.
 * @param callback Optional query modifiers.
 * @return the query builder returned by `findByProperty`.
 */
fun QueryBuilder.wildcard(
    property: KProperty1,
    value: String,
    occur: BooleanClause.Occur = BooleanClause.Occur.MUST,
    callback: QueryBuilder.QueryModifiers.() -> Unit = {}
): QueryBuilder = findByProperty(property, occur, callback) { field ->
    WildcardQuery(Term(field.name, value))
}
/** Builder to define a [DocumentSchema].
 *
 * A schema can be declared programmatically with [new] and the field methods
 * ([text], [string], [int], [long], [float], [double]), or derived from an annotated
 * class with [buildFromClass].
 */
class DocumentSchemaBuilder private constructor (
    private val namespace: String
){
    companion object {
        /** Creates a builder for the given namespace and applies the callback to it.
         *
         * @param namespace Namespace of the resulting schema.
         * @param callback Block that declares the fields.
         * @return the configured builder; call [build] to obtain the schema.
         */
        fun new(
            namespace: String,
            callback: DocumentSchemaBuilder.() -> Unit
        ): DocumentSchemaBuilder =
            DocumentSchemaBuilder(namespace).apply(callback)

        /** Derives a [DocumentSchema] from a class annotated with [IndexDocument].
         *
         * The namespace is read from the annotation, and every member property that passes
         * the annotation check is converted by [FieldSchemaBuilder.buildFromProperty].
         *
         * NOTE(review): several type arguments (List, hasAnnotation, findAnnotation and the
         * KProperty1 cast) are not visible in this view of the file — confirm them against
         * the original source.
         *
         * @param documentType Annotated document class.
         * @param fieldTypeMappers Mappers handed to [FieldSchemaBuilder.buildFromProperty].
         * @return the schema that describes the class.
         * @throws IllegalArgumentException if the annotation check on the class fails.
         */
        @Suppress("UNCHECKED_CAST")
        fun buildFromClass(
            documentType: KClass<*>,
            fieldTypeMappers: List
        ): DocumentSchema {
            require(documentType.hasAnnotation()) {
                "@IndexDocument annotation not found in class: ${documentType.qualifiedName}."
            }
            // The elvis branch is a second guard for the same condition checked above.
            val docInfo: IndexDocument = documentType.findAnnotation()
                ?: throw RuntimeException("@IndexDocument annotation not found")
            val fields = documentType.members
                .filter { member -> member is KProperty1<*, *> && member.hasAnnotation() }
                .flatMap { member ->
                    // A single property may produce several field schemas, hence flatMap.
                    FieldSchemaBuilder.buildFromProperty(member as KProperty1, fieldTypeMappers)
                }

            return DocumentSchema(
                namespace = docInfo.namespace,
                fields = fields
            )
        }
    }

    /** Field schemas declared so far, in declaration order. */
    private val fields: MutableList = mutableListOf()

    /** Defines a new text field.
     *
     * By default text fields are indexed and stored.
     *
     * @param name Field name.
     * @param callback Optional configuration applied to the field builder before it is built.
     */
    fun text(name: String, callback: FieldSchemaBuilder.() -> Unit = {}): DocumentSchemaBuilder = apply {
        fields += FieldSchemaBuilder.new(name, FieldType.TEXT, callback).build()
    }

    /** Creates a new string field.
     *
     * String fields are saved as single terms and they're not indexed.
     * By default String fields are stored.
     *
     * @param name Field name.
     * @param callback Optional configuration applied to the field builder before it is built.
     */
    fun string(name: String, callback: FieldSchemaBuilder.() -> Unit = {}): DocumentSchemaBuilder = apply {
        fields += FieldSchemaBuilder.new(name, FieldType.STRING, callback).build()
    }

    /** Creates a new int field.
     * By default numeric fields are not stored.
     *
     * @param name Field name.
     * @param callback Optional configuration applied to the field builder before it is built.
     */
    fun int(name: String, callback: FieldSchemaBuilder.() -> Unit = {}): DocumentSchemaBuilder = apply {
        fields += FieldSchemaBuilder.new(name, FieldType.INT, callback).build()
    }

    /** Creates a new long field.
     * By default numeric fields are not stored.
     *
     * @param name Field name.
     * @param callback Optional configuration applied to the field builder before it is built.
     */
    fun long(name: String, callback: FieldSchemaBuilder.() -> Unit = {}): DocumentSchemaBuilder = apply {
        fields += FieldSchemaBuilder.new(name, FieldType.LONG, callback).build()
    }

    /** Creates a new float field.
     * By default numeric fields are not stored.
     *
     * @param name Field name.
     * @param callback Optional configuration applied to the field builder before it is built.
     */
    fun float(name: String, callback: FieldSchemaBuilder.() -> Unit = {}): DocumentSchemaBuilder = apply {
        fields += FieldSchemaBuilder.new(name, FieldType.FLOAT, callback).build()
    }

    /** Creates a new double field.
     * By default numeric fields are not stored.
     *
     * @param name Field name.
     * @param callback Optional configuration applied to the field builder before it is built.
     */
    fun double(name: String, callback: FieldSchemaBuilder.() -> Unit = {}): DocumentSchemaBuilder = apply {
        fields += FieldSchemaBuilder.new(name, FieldType.DOUBLE, callback).build()
    }

    /** Builds the document schema from the namespace and the fields declared so far.
     *
     * NOTE(review): the builder's own mutable list is handed to [DocumentSchema.new] as-is;
     * whether the schema copies it is not visible from here.
     */
    fun build(): DocumentSchema {
        return DocumentSchema.new(
            namespace = namespace,
            fields = fields
        )
    }
}
= property.findAnnotation() 38 | val name = field.name.takeIf { it.isNotBlank() } 39 | ?: field.fieldName.takeIf { it.isNotBlank() } 40 | ?: property.name 41 | val mapper = fieldTypeMappers.firstOrNull { mapper -> mapper.supports(property.returnType) } 42 | ?: throw RuntimeException("no type mapper found for property: $name") 43 | val stored: Boolean? = property.findAnnotation()?.value 44 | ?: if (field.store == BoolValue.DEFAULT) null else field.store == BoolValue.YES 45 | val indexed: Boolean? = property.findAnnotation()?.value 46 | ?: if (field.index == BoolValue.DEFAULT) null else field.index == BoolValue.YES 47 | val docValues: Boolean = field.docValues 48 | 49 | val builder = FieldSchemaBuilder(name) 50 | .propertyName(property.name) 51 | .type(typeMetadata?.type) 52 | .store(stored) 53 | .index(indexed) 54 | .docValues(docValues) 55 | return mapper.mapSchema(property.returnType, builder) 56 | } 57 | } 58 | 59 | var propertyName: String? = null 60 | private set 61 | var name: String = name 62 | private set 63 | var type: FieldType? = null 64 | private set 65 | var store: Boolean? = null 66 | private set 67 | var index: Boolean? = null 68 | private set 69 | var docValues: Boolean = false 70 | private set 71 | 72 | /** Sets the Kotlin property name, if it applies. 73 | * This is only used by the IndexMapper. 74 | * @param name Kotlin object property name. 75 | */ 76 | fun propertyName(name: String): FieldSchemaBuilder = apply { 77 | propertyName = name 78 | } 79 | 80 | /** Sets the Lucene field name. 81 | * @param fieldName Lucene field name. 82 | */ 83 | fun name(fieldName: String): FieldSchemaBuilder = apply { 84 | name = fieldName 85 | } 86 | 87 | /** Sets the Lucene type of this field. 88 | * @param fieldType Lucene type. 89 | */ 90 | fun type(fieldType: FieldType?): FieldSchemaBuilder = apply { 91 | type = fieldType 92 | } 93 | 94 | /** Configures whether the value of this field must be stored in the index or not. 
95 | * @param stored True to store, false to prevent from storing the field. 96 | */ 97 | fun store(stored: Boolean? = true): FieldSchemaBuilder = apply { 98 | store = stored 99 | } 100 | 101 | /** Configures whether this field must be indexed or not. 102 | * @param indexed True to index, false to prevent from indexing the field. 103 | */ 104 | fun index(indexed: Boolean? = true): FieldSchemaBuilder = apply { 105 | index = indexed 106 | } 107 | 108 | /** Marks this field to be stored as DocValues. 109 | * DocValues are a document-level fields, and they are much faster for sorting and faceting. 110 | * 111 | * @param isDocValues true to store this field as DocValues, false otherwise. 112 | * 113 | * @see https://solr.apache.org/guide/6_6/docvalues.html 114 | */ 115 | fun docValues(isDocValues: Boolean = true): FieldSchemaBuilder = apply { 116 | docValues = isDocValues 117 | } 118 | 119 | /** Builds the field schema. 120 | */ 121 | fun build(): FieldSchema { 122 | val resolvedType = requireNotNull(type) { "The Lucene type is required and it is not set." 
/** CSV parser that reads a file in memory-mapped chunks and reports one record per line.
 *
 * The line break size (1 for LF or CR, 2 for CRLF) is detected once from the beginning
 * of the file and assumed to be uniform for the whole file. Fields are split on the
 * configured separator byte; a separator inside double quotes or preceded by a backslash
 * does not split the field.
 *
 * @param config Buffer size and separator used by this parser.
 */
class Parser(
    config: ParserConfig = ParserConfig.default()
) {

    companion object {
        private const val DOUBLE_QUOTE: Byte = 34
        private const val LINE_FEED: Byte = 10
        private const val CARRIAGE_RETURN: Byte = 13
        private const val ESCAPE: Byte = 92
        /** Size of the buffer used to detect file format. */
        private const val PROBE_BUFFER_SIZE: Long = 1024 * 1024
    }

    private val logger: Logger = LoggerFactory.getLogger(Parser::class.java)
    private val bufferSize: Long = config.bufferSize
    private val separator: Byte = config.separator

    /** Parses a CSV file and invokes the callback once per line, in file order.
     *
     * The file is closed when parsing finishes, including when the callback throws.
     * A last line without a trailing line break is reported as well; an empty file
     * produces no records.
     *
     * @param csvFile Path of the file to parse.
     * @param callback Receives the byte range of the line within the file (line break
     *   excluded) and the fields of the record.
     * @throws IllegalArgumentException if the configured buffer size cannot be mapped.
     */
    fun parse(
        csvFile: String,
        callback: (Position, List<Field>) -> Unit
    ) {
        // A non-positive size would never advance the pointer; sizes above Int.MAX_VALUE cannot be mapped.
        require(bufferSize in 1..Int.MAX_VALUE.toLong()) {
            "The buffer size must be between 1 and ${Int.MAX_VALUE}: $bufferSize"
        }

        logger.info("parsing csv started")

        RandomAccessFile(csvFile, "r").use { handle ->
            val fileSize: Long = handle.length()
            val probeSize: Long = minOf(fileSize, PROBE_BUFFER_SIZE)
            val lineBreakSize: Int = lineSeparatorLength(
                handle.channel.map(MapMode.READ_ONLY, 0, probeSize),
                probeSize
            )
            var pointer: Long = 0
            // Bytes of an unfinished line carried over from the previous chunks.
            var tail = ByteArray(0)
            // Bytes of a line break that spilled over the end of the previous chunk.
            var pendingSkip = 0

            logger.info("csv line reader ready to send records")

            while (pointer < fileSize) {
                val bytesRead: Int = minOf(bufferSize, fileSize - pointer).toInt()
                logger.info("reading $bytesRead bytes")

                val buffer: ByteBuffer = handle.channel.map(
                    MapMode.READ_ONLY, pointer, bytesRead.toLong()
                )
                var offset: Int = minOf(pendingSkip, bytesRead)
                pendingSkip -= offset
                var lineStart: Int = offset

                while (offset < bytesRead) {
                    val char: Byte = buffer[offset]

                    if (char == LINE_FEED || char == CARRIAGE_RETURN) {
                        val line: ByteArray = tail + readLine(buffer, lineStart, offset)
                        val lineEnd: Long = pointer + offset
                        tail = ByteArray(0)

                        // The tail is contiguous with this chunk, so the real start is end - length.
                        callback(Position(
                            start = lineEnd - line.size,
                            end = lineEnd
                        ), parseRecord(line))

                        // Skip the line break without leaving the chunk; the rest is skipped in the next one.
                        val next: Int = offset + lineBreakSize
                        pendingSkip = maxOf(0, next - bytesRead)
                        offset = minOf(next, bytesRead)
                        lineStart = offset
                    } else {
                        offset += 1
                    }
                }

                tail += readLine(buffer, lineStart, bytesRead)
                pointer += bytesRead
            }

            // Last line of a file that does not end with a line break.
            if (tail.isNotEmpty()) {
                callback(Position(
                    start = fileSize - tail.size,
                    end = fileSize
                ), parseRecord(tail))
            }
        }

        logger.info("parsing csv finished")
    }

    /** Detects the size of the line break from the first line break found in the buffer.
     *
     * @param buffer Buffer with the beginning of the file.
     * @param size Number of bytes to inspect; capped to the buffer limit.
     * @return 2 if the first line break is CR followed by LF, 1 otherwise (also when no
     *   line break is found).
     */
    private fun lineSeparatorLength(
        buffer: ByteBuffer,
        size: Long
    ): Int {
        val limit: Int = minOf(size, buffer.limit().toLong()).toInt()
        var offset = 0

        while (offset < limit && buffer[offset] != LINE_FEED && buffer[offset] != CARRIAGE_RETURN) {
            offset += 1
        }

        val isCrLf: Boolean = offset + 1 < limit &&
            buffer[offset] == CARRIAGE_RETURN &&
            buffer[offset + 1] == LINE_FEED

        return if (isCrLf) 2 else 1
    }

    /** Splits a raw line into fields.
     *
     * @param rawRecord Bytes of a single line, without the line break.
     * @return the fields of the record; always at least one.
     */
    private fun parseRecord(rawRecord: ByteArray): List<Field> {
        var withinField = false
        var escape = false
        var startIndex = 0
        val record: MutableList<Field> = mutableListOf()
        // Number of opening quotes seen in the current field; trimmed from both ends.
        var addend = 0

        for (index in rawRecord.indices) {
            val char = rawRecord[index]

            when {
                !escape && !withinField && char == separator -> {
                    record.add(extractField(rawRecord, startIndex, index, addend))
                    startIndex = index + 1
                    addend = 0
                }
                !escape && char == DOUBLE_QUOTE -> {
                    if (!withinField) {
                        addend += 1
                    }

                    withinField = !withinField
                }
                !escape && char == ESCAPE ->
                    escape = true
                escape -> {
                    // The escaped byte is consumed as plain content.
                    escape = false
                }
            }
        }

        record.add(extractField(rawRecord, startIndex, rawRecord.size, addend))

        return record
    }

    /** Copies a field out of the record, trimming the quote bytes from both ends. */
    private fun extractField(
        rawRecord: ByteArray,
        start: Int,
        end: Int,
        quotes: Int
    ): Field {
        val from: Int = minOf(start + quotes, end)
        // Unbalanced quotes could put the end before the start; yield an empty field instead of failing.
        val to: Int = maxOf(from, end - quotes)
        return Field(rawRecord.copyOfRange(from, to))
    }

    /** Copies the bytes in `[start, end)` of the buffer into a new array. */
    private fun readLine(
        buffer: ByteBuffer,
        start: Int,
        end: Int
    ): ByteArray {
        val lineBuffer = ByteArray(end - start)
        // Position explicitly so the read does not depend on previous reads.
        buffer.position(start)
        buffer.get(lineBuffer)
        return lineBuffer
    }
}
/** Configuration of the CSV parser.
 *
 * @property bufferSize Size in bytes of each chunk read from the file; must be positive.
 * @property separator Byte that separates the fields of a record.
 */
data class ParserConfig(
    val bufferSize: Long,
    val separator: Byte
) {
    init {
        // A non-positive chunk size would keep the parser from ever advancing.
        require(bufferSize > 0) { "The buffer size must be greater than zero: $bufferSize" }
    }

    companion object {
        private const val DEFAULT_SEPARATOR: Byte = 44
        private const val DEFAULT_BUFFER_SIZE: Long = 1024 * 1024 * 50

        /** Creates the default configuration: comma separator and 50 MiB chunks. */
        fun default(): ParserConfig = ParserConfig(
            bufferSize = DEFAULT_BUFFER_SIZE,
            separator = DEFAULT_SEPARATOR
        )

        /** Creates a configuration with a custom separator.
         *
         * @param separator String whose first character is the separator; the rest is ignored.
         * @param bufferSize Size in bytes of each chunk read from the file.
         * @throws IllegalArgumentException if the separator is empty or its first character
         *   does not fit in a single byte.
         */
        fun new(
            separator: String,
            bufferSize: Long = DEFAULT_BUFFER_SIZE
        ): ParserConfig {
            require(separator.isNotEmpty()) { "The separator must not be empty." }
            // Characters above U+00FF would be truncated to an unrelated byte.
            require(separator[0] <= '\u00FF') {
                "The separator must fit in a single byte: ${separator[0]}"
            }

            return ParserConfig(
                bufferSize = bufferSize,
                separator = separator[0].toByte()
            )
        }
    }
}
/** Base class for loaders that feed a file-based [DataSet] into the index once.
 *
 * A marker document, keyed by the data set file name, is written after a successful
 * load; its presence makes later calls a no-op.
 */
abstract class DataSetLoader(
    protected val indexManager: IndexManager
) {
    companion object {
        private val logger: Logger = LoggerFactory.getLogger(DataSetLoader::class.java)
        private const val PROBE_FIELD: String = "INITIALIZED"
    }

    /** Parses the data set file and hands every record to the callback, unless the
     * marker document for this data set is already in the index.
     *
     * @param dataSet Data set to load.
     * @param callback Invoked once per parsed record.
     */
    protected fun loadIfRequired(
        dataSet: DataSet,
        callback: (List<Field>) -> Unit
    ) {
        val dataSetId: String = dataSet.file.nameWithoutExtension
        // Field that marks the data set as loaded; queried first and written at the end.
        val probeField = "$dataSetId::$PROBE_FIELD"

        logger.info("verifying if data set $dataSetId is already loaded")

        val probeDocuments = indexManager.find(dataSet.namespace, dataSet.language, limit = 1) {
            term(probeField, "true")
        }.toList()

        if (probeDocuments.isNotEmpty()) {
            logger.info("data set $dataSetId is already loaded")
            return
        }

        logger.info("data set $dataSetId is not loaded, parsing")

        val parser = Parser(dataSet.parserConfig)
        parser.parse(dataSet.file.absolutePath) { _, record -> callback(record) }

        logger.info("synchronizing index")

        indexManager.index(dataSet.namespace, dataSet.language) {
            string(probeField, "true")
        }
        indexManager.sync()
    }
}
/** Loads the sentiment training data sets into the index.
 *
 * Every data set file is paired, by file name, with a function that turns a CSV record
 * into an index document.
 *
 * @param basePath Directory that contains the data set files.
 * @param indexManager Index that receives the documents.
 */
class SentimentLoader(
    private val basePath: String,
    indexManager: IndexManager
) : DataSetLoader(indexManager) {

    companion object {
        private const val LEXICON: String = "lexicon"
        private const val TWEETS: String = "tweets"
        private val TABS: ParserConfig = ParserConfig.new("\t")
        // Fully qualified to avoid touching the import list of the file.
        private val logger: org.slf4j.Logger =
            org.slf4j.LoggerFactory.getLogger(SentimentLoader::class.java)
    }

    /** Data sets loaded by [loadDataIfRequired], in order. */
    private val dataSets: List<DataSet> = listOf(
        DataSet(NAMESPACE, SPANISH, LEXICON, file("michigan-lexicons-medium.es.csv"), TABS),
        DataSet(NAMESPACE, SPANISH, LEXICON, file("michigan-lexicons-full.es.csv"), TABS),
        DataSet(NAMESPACE, SPANISH, TWEETS, file("sentistrength-1600_tweets_dev_complete.es.csv"), TABS),
        DataSet(NAMESPACE, SPANISH, TWEETS, file("sentistrength-1600_tweets_test_average_complete.es.tsv"), TABS),
        DataSet(NAMESPACE, ENGLISH, TWEETS, file("michigan-tweets-complete.en.csv"))
    )

    /** Record-to-document functions, keyed by the file names they apply to. */
    private val parsers: Map<List<String>, DocumentBuilder.(List<Field>) -> Unit> = mapOf(
        listOf(
            "michigan-lexicons-medium.es.csv",
            "michigan-lexicons-full.es.csv"
        ) to { record ->
            val sentiment: String = record[2].value
            val value: String = Normalizer.new(text = record[0].value, language = SPANISH)
                .removeStopWords()
                .skipStemming()
                .normalize()

            text(VALUE_FIELD, value)

            when(sentiment) {
                "pos" -> addSentiment(negative = 1, positive = 2)
                "neg" -> addSentiment(negative = 2, positive = 1)
                else -> throw RuntimeException("unknown sentiment: $sentiment")
            }
        },
        listOf(
            "sentistrength-1600_tweets_dev_complete.es.csv",
            "sentistrength-1600_tweets_test_average_complete.es.tsv"
        ) to { record ->
            val value: String = Normalizer.new(text = record[2].value, language = SPANISH)
                .removeStopWords()
                .skipStemming()
                .normalize()

            text(VALUE_FIELD, value)
            addSentiment(negative = record[1].value.toInt(), positive = record[0].value.toInt())
        },
        listOf("michigan-tweets-complete.en.csv") to { record ->
            val sentiment: Int = record[1].value.toInt()
            val value: String = record[3].value
            text(VALUE_FIELD, value)

            when(sentiment) {
                0 -> addSentiment(negative = 2, positive = 1)
                1 -> addSentiment(negative = 1, positive = 2)
                else -> throw RuntimeException("unknown sentiment: $sentiment")
            }
        }
    )

    /** Loads every data set that is not in the index yet.
     *
     * A record that cannot be parsed is logged with its cause and does not stop the load.
     *
     * @throws NoSuchElementException if a data set file has no parser registered.
     * @throws IllegalArgumentException if a data set file matches more than one parser.
     */
    fun loadDataIfRequired() {
        dataSets.forEach { dataSet ->
            // Resolved once per data set instead of once per record.
            val parser: DocumentBuilder.(List<Field>) -> Unit = parsers.entries
                .single { (fileNames, _) -> dataSet.file.name in fileNames }
                .value

            loadIfRequired(dataSet) { record ->
                indexManager.index(dataSet.namespace, dataSet.language) {
                    try {
                        parser(record)
                    } catch (cause: Exception) {
                        // Field wraps a ByteArray, so the values are logged instead of the fields.
                        logger.error("error parsing record: ${record.map { field -> field.value }}", cause)
                    }
                }
            }
        }
    }

    /** Adds the sentiment fields to the document: the combined field with both values,
     * plus the negative and positive values as separate stored fields.
     */
    private fun DocumentBuilder.addSentiment(
        negative: Int,
        positive: Int
    ) {
        int(SENTIMENT_FIELD, negative, positive)
        int(SENTIMENT_NEG_FIELD, negative) {
            store()
        }
        int(SENTIMENT_POS_FIELD, positive) {
            store()
        }
    }

    /** Resolves a data set file name against the base path. */
    private fun file(name: String): File {
        return File(basePath, name)
    }
}
-------------------------------------------------------------------------------- 1 | 2 | 3 | true 4 | 5 | 6 | 7 | 9 | 10 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/arabic.txt: -------------------------------------------------------------------------------- 1 | ، 2 | آض 3 | آمينَ 4 | آه 5 | آهاً 6 | آي 7 | أ 8 | أب 9 | أجل 10 | أجمع 11 | أخ 12 | أخذ 13 | أصبح 14 | أضحى 15 | أقبل 16 | أقل 17 | أكثر 18 | ألا 19 | أم 20 | أما 21 | أمامك 22 | أمامكَ 23 | أمسى 24 | أمّا 25 | أن 26 | أنا 27 | أنت 28 | أنتم 29 | أنتما 30 | أنتن 31 | أنتِ 32 | أنشأ 33 | أنّى 34 | أو 35 | أوشك 36 | أولئك 37 | أولئكم 38 | أولاء 39 | أولالك 40 | أوّهْ 41 | أي 42 | أيا 43 | أين 44 | أينما 45 | أيّ 46 | أَنَّ 47 | أََيُّ 48 | أُفٍّ 49 | إذ 50 | إذا 51 | إذاً 52 | إذما 53 | إذن 54 | إلى 55 | إليكم 56 | إليكما 57 | إليكنّ 58 | إليكَ 59 | إلَيْكَ 60 | إلّا 61 | إمّا 62 | إن 63 | إنّما 64 | إي 65 | إياك 66 | إياكم 67 | إياكما 68 | إياكن 69 | إيانا 70 | إياه 71 | إياها 72 | إياهم 73 | إياهما 74 | إياهن 75 | إياي 76 | إيهٍ 77 | إِنَّ 78 | ا 79 | ابتدأ 80 | اثر 81 | اجل 82 | احد 83 | اخرى 84 | اخلولق 85 | اذا 86 | اربعة 87 | ارتدّ 88 | استحال 89 | اطار 90 | اعادة 91 | اعلنت 92 | اف 93 | اكثر 94 | اكد 95 | الألاء 96 | الألى 97 | الا 98 | الاخيرة 99 | الان 100 | الاول 101 | الاولى 102 | التى 103 | التي 104 | الثاني 105 | الثانية 106 | الذاتي 107 | الذى 108 | الذي 109 | الذين 110 | السابق 111 | الف 112 | اللائي 113 | اللاتي 114 | اللتان 115 | اللتيا 116 | اللتين 117 | اللذان 118 | اللذين 119 | اللواتي 120 | الماضي 121 | المقبل 122 | الوقت 123 | الى 124 | اليوم 125 | اما 126 | امام 127 | امس 128 | ان 129 | انبرى 130 | انقلب 131 | انه 132 | انها 133 | او 134 | اول 135 | اي 136 | ايار 137 | ايام 138 | ايضا 139 | ب 140 | بات 141 | باسم 142 | بان 143 | بخٍ 144 | برس 145 | بسبب 146 | بسّ 147 | بشكل 148 | بضع 149 | بطآن 150 | بعد 151 | بعض 152 | بك 
153 | بكم 154 | بكما 155 | بكن 156 | بل 157 | بلى 158 | بما 159 | بماذا 160 | بمن 161 | بن 162 | بنا 163 | به 164 | بها 165 | بي 166 | بيد 167 | بين 168 | بَسْ 169 | بَلْهَ 170 | بِئْسَ 171 | تانِ 172 | تانِك 173 | تبدّل 174 | تجاه 175 | تحوّل 176 | تلقاء 177 | تلك 178 | تلكم 179 | تلكما 180 | تم 181 | تينك 182 | تَيْنِ 183 | تِه 184 | تِي 185 | ثلاثة 186 | ثم 187 | ثمّ 188 | ثمّة 189 | ثُمَّ 190 | جعل 191 | جلل 192 | جميع 193 | جير 194 | حار 195 | حاشا 196 | حاليا 197 | حاي 198 | حتى 199 | حرى 200 | حسب 201 | حم 202 | حوالى 203 | حول 204 | حيث 205 | حيثما 206 | حين 207 | حيَّ 208 | حَبَّذَا 209 | حَتَّى 210 | حَذارِ 211 | خلا 212 | خلال 213 | دون 214 | دونك 215 | ذا 216 | ذات 217 | ذاك 218 | ذانك 219 | ذانِ 220 | ذلك 221 | ذلكم 222 | ذلكما 223 | ذلكن 224 | ذو 225 | ذوا 226 | ذواتا 227 | ذواتي 228 | ذيت 229 | ذينك 230 | ذَيْنِ 231 | ذِه 232 | ذِي 233 | راح 234 | رجع 235 | رويدك 236 | ريث 237 | رُبَّ 238 | زيارة 239 | سبحان 240 | سرعان 241 | سنة 242 | سنوات 243 | سوف 244 | سوى 245 | سَاءَ 246 | سَاءَمَا 247 | شبه 248 | شخصا 249 | شرع 250 | شَتَّانَ 251 | صار 252 | صباح 253 | صفر 254 | صهٍ 255 | صهْ 256 | ضد 257 | ضمن 258 | طاق 259 | طالما 260 | طفق 261 | طَق 262 | ظلّ 263 | عاد 264 | عام 265 | عاما 266 | عامة 267 | عدا 268 | عدة 269 | عدد 270 | عدم 271 | عسى 272 | عشر 273 | عشرة 274 | علق 275 | على 276 | عليك 277 | عليه 278 | عليها 279 | علًّ 280 | عن 281 | عند 282 | عندما 283 | عوض 284 | عين 285 | عَدَسْ 286 | عَمَّا 287 | غدا 288 | غير 289 | ـ 290 | ف 291 | فان 292 | فلان 293 | فو 294 | فى 295 | في 296 | فيم 297 | فيما 298 | فيه 299 | فيها 300 | قال 301 | قام 302 | قبل 303 | قد 304 | قطّ 305 | قلما 306 | قوة 307 | كأنّما 308 | كأين 309 | كأيّ 310 | كأيّن 311 | كاد 312 | كان 313 | كانت 314 | كذا 315 | كذلك 316 | كرب 317 | كل 318 | كلا 319 | كلاهما 320 | كلتا 321 | كلم 322 | كليكما 323 | كليهما 324 | كلّما 325 | كلَّا 326 | كم 327 | كما 328 | كي 329 | كيت 330 | كيف 331 | كيفما 332 | كَأَنَّ 333 | كِخ 334 | لئن 335 | لا 336 | لات 337 | لاسيما 338 | لدن 339 | لدى 340 
| لعمر 341 | لقاء 342 | لك 343 | لكم 344 | لكما 345 | لكن 346 | لكنَّما 347 | لكي 348 | لكيلا 349 | للامم 350 | لم 351 | لما 352 | لمّا 353 | لن 354 | لنا 355 | له 356 | لها 357 | لو 358 | لوكالة 359 | لولا 360 | لوما 361 | لي 362 | لَسْتَ 363 | لَسْتُ 364 | لَسْتُم 365 | لَسْتُمَا 366 | لَسْتُنَّ 367 | لَسْتِ 368 | لَسْنَ 369 | لَعَلَّ 370 | لَكِنَّ 371 | لَيْتَ 372 | لَيْسَ 373 | لَيْسَا 374 | لَيْسَتَا 375 | لَيْسَتْ 376 | لَيْسُوا 377 | لَِسْنَا 378 | ما 379 | ماانفك 380 | مابرح 381 | مادام 382 | ماذا 383 | مازال 384 | مافتئ 385 | مايو 386 | متى 387 | مثل 388 | مذ 389 | مساء 390 | مع 391 | معاذ 392 | مقابل 393 | مكانكم 394 | مكانكما 395 | مكانكنّ 396 | مكانَك 397 | مليار 398 | مليون 399 | مما 400 | ممن 401 | من 402 | منذ 403 | منها 404 | مه 405 | مهما 406 | مَنْ 407 | مِن 408 | نحن 409 | نحو 410 | نعم 411 | نفس 412 | نفسه 413 | نهاية 414 | نَخْ 415 | نِعِمّا 416 | نِعْمَ 417 | ها 418 | هاؤم 419 | هاكَ 420 | هاهنا 421 | هبّ 422 | هذا 423 | هذه 424 | هكذا 425 | هل 426 | هلمَّ 427 | هلّا 428 | هم 429 | هما 430 | هن 431 | هنا 432 | هناك 433 | هنالك 434 | هو 435 | هي 436 | هيا 437 | هيت 438 | هيّا 439 | هَؤلاء 440 | هَاتانِ 441 | هَاتَيْنِ 442 | هَاتِه 443 | هَاتِي 444 | هَجْ 445 | هَذا 446 | هَذانِ 447 | هَذَيْنِ 448 | هَذِه 449 | هَذِي 450 | هَيْهَاتَ 451 | و 452 | و6 453 | وا 454 | واحد 455 | واضاف 456 | واضافت 457 | واكد 458 | وان 459 | واهاً 460 | واوضح 461 | وراءَك 462 | وفي 463 | وقال 464 | وقالت 465 | وقد 466 | وقف 467 | وكان 468 | وكانت 469 | ولا 470 | ولم 471 | ومن 472 | وهو 473 | وهي 474 | ويكأنّ 475 | وَيْ 476 | وُشْكَانََ 477 | يكون 478 | يمكن 479 | يوم 480 | ّأيّان -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/armenian.txt: -------------------------------------------------------------------------------- 1 | այդ 2 | այլ 3 | այն 4 | այս 5 | դու 6 | դուք 7 | եմ 8 | են 9 | ենք 10 | ես 11 | եք 12 | է 13 | էի 14 | էին 15 | էինք 16 | էիր 17 | էիք 18 | էր 19 | ըստ 20 | թ 21 | ի 22 | ին 23 | 
իսկ 24 | իր 25 | կամ 26 | համար 27 | հետ 28 | հետո 29 | մենք 30 | մեջ 31 | մի 32 | ն 33 | նա 34 | նաև 35 | նրա 36 | նրանք 37 | որ 38 | որը 39 | որոնք 40 | որպես 41 | ու 42 | ում 43 | պիտի 44 | վրա 45 | և -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/basque.txt: -------------------------------------------------------------------------------- 1 | al 2 | anitz 3 | arabera 4 | asko 5 | baina 6 | bat 7 | batean 8 | batek 9 | bati 10 | batzuei 11 | batzuek 12 | batzuetan 13 | batzuk 14 | bera 15 | beraiek 16 | berau 17 | berauek 18 | bere 19 | berori 20 | beroriek 21 | beste 22 | bezala 23 | da 24 | dago 25 | dira 26 | ditu 27 | du 28 | dute 29 | edo 30 | egin 31 | ere 32 | eta 33 | eurak 34 | ez 35 | gainera 36 | gu 37 | gutxi 38 | guzti 39 | haiei 40 | haiek 41 | haietan 42 | hainbeste 43 | hala 44 | han 45 | handik 46 | hango 47 | hara 48 | hari 49 | hark 50 | hartan 51 | hau 52 | hauei 53 | hauek 54 | hauetan 55 | hemen 56 | hemendik 57 | hemengo 58 | hi 59 | hona 60 | honek 61 | honela 62 | honetan 63 | honi 64 | hor 65 | hori 66 | horiei 67 | horiek 68 | horietan 69 | horko 70 | horra 71 | horrek 72 | horrela 73 | horretan 74 | horri 75 | hortik 76 | hura 77 | izan 78 | ni 79 | noiz 80 | nola 81 | non 82 | nondik 83 | nongo 84 | nor 85 | nora 86 | ze 87 | zein 88 | zen 89 | zenbait 90 | zenbat 91 | zer 92 | zergatik 93 | ziren 94 | zituen 95 | zu 96 | zuek 97 | zuen 98 | zuten -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/bengali.txt: -------------------------------------------------------------------------------- 1 | অতএব 2 | অথচ 3 | অথবা 4 | অনুযায়ী 5 | অনেক 6 | অনেকে 7 | অনেকেই 8 | অন্তত 9 | অন্য 10 | অবধি 11 | অবশ্য 12 | অর্থাত 13 | আই 14 | আগামী 15 | আগে 16 | আগেই 17 | আছে 18 | আজ 19 | আদ্যভাগে 20 | আপনার 21 | আপনি 22 | আবার 23 | আমরা 24 | আমাকে 25 | আমাদের 26 | আমার 27 | আমি 28 | আর 29 | আরও 30 | ই 31 | ইত্যাদি 32 | ইহা 33 
| উচিত 34 | উত্তর 35 | উনি 36 | উপর 37 | উপরে 38 | এ 39 | এঁদের 40 | এঁরা 41 | এই 42 | একই 43 | একটি 44 | একবার 45 | একে 46 | এক্ 47 | এখন 48 | এখনও 49 | এখানে 50 | এখানেই 51 | এটা 52 | এটাই 53 | এটি 54 | এত 55 | এতটাই 56 | এতে 57 | এদের 58 | এব 59 | এবং 60 | এবার 61 | এমন 62 | এমনকী 63 | এমনি 64 | এর 65 | এরা 66 | এল 67 | এস 68 | এসে 69 | ঐ 70 | ও 71 | ওঁদের 72 | ওঁর 73 | ওঁরা 74 | ওই 75 | ওকে 76 | ওখানে 77 | ওদের 78 | ওর 79 | ওরা 80 | কখনও 81 | কত 82 | কবে 83 | কমনে 84 | কয়েক 85 | কয়েকটি 86 | করছে 87 | করছেন 88 | করতে 89 | করবে 90 | করবেন 91 | করলে 92 | করলেন 93 | করা 94 | করাই 95 | করায় 96 | করার 97 | করি 98 | করিতে 99 | করিয়া 100 | করিয়ে 101 | করে 102 | করেই 103 | করেছিলেন 104 | করেছে 105 | করেছেন 106 | করেন 107 | কাউকে 108 | কাছ 109 | কাছে 110 | কাজ 111 | কাজে 112 | কারও 113 | কারণ 114 | কি 115 | কিংবা 116 | কিছু 117 | কিছুই 118 | কিন্তু 119 | কী 120 | কে 121 | কেউ 122 | কেউই 123 | কেখা 124 | কেন 125 | কোটি 126 | কোন 127 | কোনও 128 | কোনো 129 | ক্ষেত্রে 130 | কয়েক 131 | খুব 132 | গিয়ে 133 | গিয়েছে 134 | গিয়ে 135 | গুলি 136 | গেছে 137 | গেল 138 | গেলে 139 | গোটা 140 | চলে 141 | চান 142 | চায় 143 | চার 144 | চালু 145 | চেয়ে 146 | চেষ্টা 147 | ছাড়া 148 | ছাড়াও 149 | ছিল 150 | ছিলেন 151 | জন 152 | জনকে 153 | জনের 154 | জন্য 155 | জন্যওজে 156 | জানতে 157 | জানা 158 | জানানো 159 | জানায় 160 | জানিয়ে 161 | জানিয়েছে 162 | জে 163 | জ্নজন 164 | টি 165 | ঠিক 166 | তখন 167 | তত 168 | তথা 169 | তবু 170 | তবে 171 | তা 172 | তাঁকে 173 | তাঁদের 174 | তাঁর 175 | তাঁরা 176 | তাঁাহারা 177 | তাই 178 | তাও 179 | তাকে 180 | তাতে 181 | তাদের 182 | তার 183 | তারপর 184 | তারা 185 | তারৈ 186 | তাহলে 187 | তাহা 188 | তাহাতে 189 | তাহার 190 | তিনঐ 191 | তিনি 192 | তিনিও 193 | তুমি 194 | তুলে 195 | তেমন 196 | তো 197 | তোমার 198 | থাকবে 199 | থাকবেন 200 | থাকা 201 | থাকায় 202 | থাকে 203 | থাকেন 204 | থেকে 205 | থেকেই 206 | থেকেও 207 | দিকে 208 | দিতে 209 | দিন 210 | দিয়ে 211 | দিয়েছে 212 | দিয়েছেন 213 | দিলেন 214 | দু 215 | দুই 216 | দুটি 217 | দুটো 218 | দেওয়া 219 | 
দেওয়ার 220 | দেওয়া 221 | দেখতে 222 | দেখা 223 | দেখে 224 | দেন 225 | দেয় 226 | দ্বারা 227 | ধরা 228 | ধরে 229 | ধামার 230 | নতুন 231 | নয় 232 | না 233 | নাই 234 | নাকি 235 | নাগাদ 236 | নানা 237 | নিজে 238 | নিজেই 239 | নিজেদের 240 | নিজের 241 | নিতে 242 | নিয়ে 243 | নিয়ে 244 | নেই 245 | নেওয়া 246 | নেওয়ার 247 | নেওয়া 248 | নয় 249 | পক্ষে 250 | পর 251 | পরে 252 | পরেই 253 | পরেও 254 | পর্যন্ত 255 | পাওয়া 256 | পাচ 257 | পারি 258 | পারে 259 | পারেন 260 | পি 261 | পেয়ে 262 | পেয়্র্ 263 | প্রতি 264 | প্রথম 265 | প্রভৃতি 266 | প্রযন্ত 267 | প্রাথমিক 268 | প্রায় 269 | প্রায় 270 | ফলে 271 | ফিরে 272 | ফের 273 | বক্তব্য 274 | বদলে 275 | বন 276 | বরং 277 | বলতে 278 | বলল 279 | বললেন 280 | বলা 281 | বলে 282 | বলেছেন 283 | বলেন 284 | বসে 285 | বহু 286 | বা 287 | বাদে 288 | বার 289 | বি 290 | বিনা 291 | বিভিন্ন 292 | বিশেষ 293 | বিষয়টি 294 | বেশ 295 | বেশি 296 | ব্যবহার 297 | ব্যাপারে 298 | ভাবে 299 | ভাবেই 300 | মতো 301 | মতোই 302 | মধ্যভাগে 303 | মধ্যে 304 | মধ্যেই 305 | মধ্যেও 306 | মনে 307 | মাত্র 308 | মাধ্যমে 309 | মোট 310 | মোটেই 311 | যখন 312 | যত 313 | যতটা 314 | যথেষ্ট 315 | যদি 316 | যদিও 317 | যা 318 | যাঁর 319 | যাঁরা 320 | যাওয়া 321 | যাওয়ার 322 | যাওয়া 323 | যাকে 324 | যাচ্ছে 325 | যাতে 326 | যাদের 327 | যান 328 | যাবে 329 | যায় 330 | যার 331 | যারা 332 | যিনি 333 | যে 334 | যেখানে 335 | যেতে 336 | যেন 337 | যেমন 338 | র 339 | রকম 340 | রয়েছে 341 | রাখা 342 | রেখে 343 | লক্ষ 344 | শুধু 345 | শুরু 346 | সঙ্গে 347 | সঙ্গেও 348 | সব 349 | সবার 350 | সমস্ত 351 | সম্প্রতি 352 | সহ 353 | সহিত 354 | সাধারণ 355 | সামনে 356 | সি 357 | সুতরাং 358 | সে 359 | সেই 360 | সেখান 361 | সেখানে 362 | সেটা 363 | সেটাই 364 | সেটাও 365 | সেটি 366 | স্পষ্ট 367 | স্বয়ং 368 | হইতে 369 | হইবে 370 | হইয়া 371 | হওয়া 372 | হওয়ায় 373 | হওয়ার 374 | হচ্ছে 375 | হত 376 | হতে 377 | হতেই 378 | হন 379 | হবে 380 | হবেন 381 | হয় 382 | হয়তো 383 | হয়নি 384 | হয়ে 385 | হয়েই 386 | হয়েছিল 387 | হয়েছে 388 | হয়েছেন 389 | হল 390 | হলে 391 | হলেই 392 | হলেও 393 | হলো 394 | 
হাজার 395 | হিসাবে 396 | হৈলে 397 | হোক 398 | হয় -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/brazilian.txt: -------------------------------------------------------------------------------- 1 | a 2 | acerca 3 | adeus 4 | agora 5 | ainda 6 | alem 7 | algmas 8 | algo 9 | algumas 10 | alguns 11 | ali 12 | além 13 | ambas 14 | ambos 15 | ano 16 | anos 17 | antes 18 | ao 19 | aonde 20 | aos 21 | apenas 22 | apoio 23 | apontar 24 | apos 25 | após 26 | aquela 27 | aquelas 28 | aquele 29 | aqueles 30 | aqui 31 | aquilo 32 | as 33 | assim 34 | através 35 | atrás 36 | até 37 | aí 38 | baixo 39 | bastante 40 | bem 41 | boa 42 | boas 43 | bom 44 | bons 45 | breve 46 | cada 47 | caminho 48 | catorze 49 | cedo 50 | cento 51 | certamente 52 | certeza 53 | cima 54 | cinco 55 | coisa 56 | com 57 | como 58 | comprido 59 | conhecido 60 | conselho 61 | contra 62 | contudo 63 | corrente 64 | cuja 65 | cujas 66 | cujo 67 | cujos 68 | custa 69 | cá 70 | da 71 | daquela 72 | daquelas 73 | daquele 74 | daqueles 75 | dar 76 | das 77 | de 78 | debaixo 79 | dela 80 | delas 81 | dele 82 | deles 83 | demais 84 | dentro 85 | depois 86 | desde 87 | desligado 88 | dessa 89 | dessas 90 | desse 91 | desses 92 | desta 93 | destas 94 | deste 95 | destes 96 | deve 97 | devem 98 | deverá 99 | dez 100 | dezanove 101 | dezasseis 102 | dezassete 103 | dezoito 104 | dia 105 | diante 106 | direita 107 | dispoe 108 | dispoem 109 | diversa 110 | diversas 111 | diversos 112 | diz 113 | dizem 114 | dizer 115 | do 116 | dois 117 | dos 118 | doze 119 | duas 120 | durante 121 | dá 122 | dão 123 | dúvida 124 | e 125 | ela 126 | elas 127 | ele 128 | eles 129 | em 130 | embora 131 | enquanto 132 | entao 133 | entre 134 | então 135 | era 136 | eram 137 | essa 138 | essas 139 | esse 140 | esses 141 | esta 142 | estado 143 | estamos 144 | estar 145 | estará 146 | estas 147 | estava 148 | estavam 149 | este 150 | esteja 151 | estejam 152 | estejamos 
153 | estes 154 | esteve 155 | estive 156 | estivemos 157 | estiver 158 | estivera 159 | estiveram 160 | estiverem 161 | estivermos 162 | estivesse 163 | estivessem 164 | estiveste 165 | estivestes 166 | estivéramos 167 | estivéssemos 168 | estou 169 | está 170 | estás 171 | estávamos 172 | estão 173 | eu 174 | exemplo 175 | falta 176 | fará 177 | favor 178 | faz 179 | fazeis 180 | fazem 181 | fazemos 182 | fazer 183 | fazes 184 | fazia 185 | faço 186 | fez 187 | fim 188 | final 189 | foi 190 | fomos 191 | for 192 | fora 193 | foram 194 | forem 195 | forma 196 | formos 197 | fosse 198 | fossem 199 | foste 200 | fostes 201 | fui 202 | fôramos 203 | fôssemos 204 | geral 205 | grande 206 | grandes 207 | grupo 208 | ha 209 | haja 210 | hajam 211 | hajamos 212 | havemos 213 | havia 214 | hei 215 | hoje 216 | hora 217 | horas 218 | houve 219 | houvemos 220 | houver 221 | houvera 222 | houveram 223 | houverei 224 | houverem 225 | houveremos 226 | houveria 227 | houveriam 228 | houvermos 229 | houverá 230 | houverão 231 | houveríamos 232 | houvesse 233 | houvessem 234 | houvéramos 235 | houvéssemos 236 | há 237 | hão 238 | iniciar 239 | inicio 240 | ir 241 | irá 242 | isso 243 | ista 244 | iste 245 | isto 246 | já 247 | lado 248 | lhe 249 | lhes 250 | ligado 251 | local 252 | logo 253 | longe 254 | lugar 255 | lá 256 | maior 257 | maioria 258 | maiorias 259 | mais 260 | mal 261 | mas 262 | me 263 | mediante 264 | meio 265 | menor 266 | menos 267 | meses 268 | mesma 269 | mesmas 270 | mesmo 271 | mesmos 272 | meu 273 | meus 274 | mil 275 | minha 276 | minhas 277 | momento 278 | muito 279 | muitos 280 | máximo 281 | mês 282 | na 283 | nada 284 | nao 285 | naquela 286 | naquelas 287 | naquele 288 | naqueles 289 | nas 290 | nem 291 | nenhuma 292 | nessa 293 | nessas 294 | nesse 295 | nesses 296 | nesta 297 | nestas 298 | neste 299 | nestes 300 | no 301 | noite 302 | nome 303 | nos 304 | nossa 305 | nossas 306 | nosso 307 | nossos 308 | nova 309 | novas 310 | nove 311 | novo 
312 | novos 313 | num 314 | numa 315 | numas 316 | nunca 317 | nuns 318 | não 319 | nível 320 | nós 321 | número 322 | o 323 | obra 324 | obrigada 325 | obrigado 326 | oitava 327 | oitavo 328 | oito 329 | onde 330 | ontem 331 | onze 332 | os 333 | ou 334 | outra 335 | outras 336 | outro 337 | outros 338 | para 339 | parece 340 | parte 341 | partir 342 | paucas 343 | pegar 344 | pela 345 | pelas 346 | pelo 347 | pelos 348 | perante 349 | perto 350 | pessoas 351 | pode 352 | podem 353 | poder 354 | poderá 355 | podia 356 | pois 357 | ponto 358 | pontos 359 | por 360 | porque 361 | porquê 362 | portanto 363 | posição 364 | possivelmente 365 | posso 366 | possível 367 | pouca 368 | pouco 369 | poucos 370 | povo 371 | primeira 372 | primeiras 373 | primeiro 374 | primeiros 375 | promeiro 376 | propios 377 | proprio 378 | própria 379 | próprias 380 | próprio 381 | próprios 382 | próxima 383 | próximas 384 | próximo 385 | próximos 386 | puderam 387 | pôde 388 | põe 389 | põem 390 | quais 391 | qual 392 | qualquer 393 | quando 394 | quanto 395 | quarta 396 | quarto 397 | quatro 398 | que 399 | quem 400 | quer 401 | quereis 402 | querem 403 | queremas 404 | queres 405 | quero 406 | questão 407 | quieto 408 | quinta 409 | quinto 410 | quinze 411 | quáis 412 | quê 413 | relação 414 | sabe 415 | sabem 416 | saber 417 | se 418 | segunda 419 | segundo 420 | sei 421 | seis 422 | seja 423 | sejam 424 | sejamos 425 | sem 426 | sempre 427 | sendo 428 | ser 429 | serei 430 | seremos 431 | seria 432 | seriam 433 | será 434 | serão 435 | seríamos 436 | sete 437 | seu 438 | seus 439 | sexta 440 | sexto 441 | sim 442 | sistema 443 | sob 444 | sobre 445 | sois 446 | somente 447 | somos 448 | sou 449 | sua 450 | suas 451 | são 452 | sétima 453 | sétimo 454 | só 455 | tal 456 | talvez 457 | tambem 458 | também 459 | tanta 460 | tantas 461 | tanto 462 | tarde 463 | te 464 | tem 465 | temos 466 | tempo 467 | tendes 468 | tenha 469 | tenham 470 | tenhamos 471 | tenho 472 | tens 473 | tentar 
474 | tentaram 475 | tente 476 | tentei 477 | ter 478 | terceira 479 | terceiro 480 | terei 481 | teremos 482 | teria 483 | teriam 484 | terá 485 | terão 486 | teríamos 487 | teu 488 | teus 489 | teve 490 | tinha 491 | tinham 492 | tipo 493 | tive 494 | tivemos 495 | tiver 496 | tivera 497 | tiveram 498 | tiverem 499 | tivermos 500 | tivesse 501 | tivessem 502 | tiveste 503 | tivestes 504 | tivéramos 505 | tivéssemos 506 | toda 507 | todas 508 | todo 509 | todos 510 | trabalhar 511 | trabalho 512 | treze 513 | três 514 | tu 515 | tua 516 | tuas 517 | tudo 518 | tão 519 | tém 520 | têm 521 | tínhamos 522 | um 523 | uma 524 | umas 525 | uns 526 | usa 527 | usar 528 | vai 529 | vais 530 | valor 531 | veja 532 | vem 533 | vens 534 | ver 535 | verdade 536 | verdadeiro 537 | vez 538 | vezes 539 | viagem 540 | vindo 541 | vinte 542 | você 543 | vocês 544 | vos 545 | vossa 546 | vossas 547 | vosso 548 | vossos 549 | vários 550 | vão 551 | vêm 552 | vós 553 | zero 554 | à 555 | às 556 | área 557 | é 558 | éramos 559 | és 560 | último -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/bulgarian.txt: -------------------------------------------------------------------------------- 1 | а 2 | автентичен 3 | аз 4 | ако 5 | ала 6 | бе 7 | без 8 | беше 9 | би 10 | бивш 11 | бивша 12 | бившо 13 | бил 14 | била 15 | били 16 | било 17 | благодаря 18 | близо 19 | бъдат 20 | бъде 21 | бяха 22 | в 23 | вас 24 | ваш 25 | ваша 26 | вероятно 27 | вече 28 | взема 29 | ви 30 | вие 31 | винаги 32 | внимава 33 | време 34 | все 35 | всеки 36 | всички 37 | всичко 38 | всяка 39 | във 40 | въпреки 41 | върху 42 | г 43 | ги 44 | главен 45 | главна 46 | главно 47 | глас 48 | го 49 | година 50 | години 51 | годишен 52 | д 53 | да 54 | дали 55 | два 56 | двама 57 | двамата 58 | две 59 | двете 60 | ден 61 | днес 62 | дни 63 | до 64 | добра 65 | добре 66 | добро 67 | добър 68 | докато 69 | докога 70 | дори 71 | досега 72 | доста 73 | друг 74 
| друга 75 | други 76 | е 77 | евтин 78 | едва 79 | един 80 | една 81 | еднаква 82 | еднакви 83 | еднакъв 84 | едно 85 | екип 86 | ето 87 | живот 88 | за 89 | забавям 90 | зад 91 | заедно 92 | заради 93 | засега 94 | заспал 95 | затова 96 | защо 97 | защото 98 | и 99 | из 100 | или 101 | им 102 | има 103 | имат 104 | иска 105 | й 106 | каза 107 | как 108 | каква 109 | какво 110 | както 111 | какъв 112 | като 113 | кога 114 | когато 115 | което 116 | които 117 | кой 118 | който 119 | колко 120 | която 121 | къде 122 | където 123 | към 124 | лесен 125 | лесно 126 | ли 127 | лош 128 | м 129 | май 130 | малко 131 | ме 132 | между 133 | мек 134 | мен 135 | месец 136 | ми 137 | много 138 | мнозина 139 | мога 140 | могат 141 | може 142 | мокър 143 | моля 144 | момента 145 | му 146 | н 147 | на 148 | над 149 | назад 150 | най 151 | направи 152 | напред 153 | например 154 | нас 155 | не 156 | него 157 | нещо 158 | нея 159 | ни 160 | ние 161 | никой 162 | нито 163 | нищо 164 | но 165 | нов 166 | нова 167 | нови 168 | новина 169 | някои 170 | някой 171 | няколко 172 | няма 173 | обаче 174 | около 175 | освен 176 | особено 177 | от 178 | отгоре 179 | отново 180 | още 181 | пак 182 | по 183 | повече 184 | повечето 185 | под 186 | поне 187 | поради 188 | после 189 | почти 190 | прави 191 | пред 192 | преди 193 | през 194 | при 195 | пък 196 | първата 197 | първи 198 | първо 199 | пъти 200 | равен 201 | равна 202 | с 203 | са 204 | сам 205 | само 206 | се 207 | сега 208 | си 209 | син 210 | скоро 211 | след 212 | следващ 213 | сме 214 | смях 215 | според 216 | сред 217 | срещу 218 | сте 219 | съм 220 | със 221 | също 222 | т 223 | т.н. 
224 | тази 225 | така 226 | такива 227 | такъв 228 | там 229 | твой 230 | те 231 | тези 232 | ти 233 | то 234 | това 235 | тогава 236 | този 237 | той 238 | толкова 239 | точно 240 | три 241 | трябва 242 | тук 243 | тъй 244 | тя 245 | тях 246 | у 247 | утре 248 | харесва 249 | хиляди 250 | ч 251 | часа 252 | че 253 | често 254 | чрез 255 | ще 256 | щом 257 | юмрук 258 | я 259 | як -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/catalan.txt: -------------------------------------------------------------------------------- 1 | a 2 | abans 3 | ací 4 | ah 5 | així 6 | això 7 | al 8 | aleshores 9 | algun 10 | alguna 11 | algunes 12 | alguns 13 | alhora 14 | allà 15 | allí 16 | allò 17 | als 18 | altra 19 | altre 20 | altres 21 | amb 22 | ambdues 23 | ambdós 24 | anar 25 | ans 26 | apa 27 | aquell 28 | aquella 29 | aquelles 30 | aquells 31 | aquest 32 | aquesta 33 | aquestes 34 | aquests 35 | aquí 36 | baix 37 | bastant 38 | bé 39 | cada 40 | cadascuna 41 | cadascunes 42 | cadascuns 43 | cadascú 44 | com 45 | consegueixo 46 | conseguim 47 | conseguir 48 | consigueix 49 | consigueixen 50 | consigueixes 51 | contra 52 | d'un 53 | d'una 54 | d'unes 55 | d'uns 56 | dalt 57 | de 58 | del 59 | dels 60 | des 61 | des de 62 | després 63 | dins 64 | dintre 65 | donat 66 | doncs 67 | durant 68 | e 69 | eh 70 | el 71 | elles 72 | ells 73 | els 74 | em 75 | en 76 | encara 77 | ens 78 | entre 79 | era 80 | erem 81 | eren 82 | eres 83 | es 84 | esta 85 | estan 86 | estat 87 | estava 88 | estaven 89 | estem 90 | esteu 91 | estic 92 | està 93 | estàvem 94 | estàveu 95 | et 96 | etc 97 | ets 98 | fa 99 | faig 100 | fan 101 | fas 102 | fem 103 | fer 104 | feu 105 | fi 106 | fins 107 | fora 108 | gairebé 109 | ha 110 | han 111 | has 112 | haver 113 | havia 114 | he 115 | hem 116 | heu 117 | hi 118 | ho 119 | i 120 | igual 121 | iguals 122 | inclòs 123 | ja 124 | jo 125 | l'hi 126 | la 127 | les 128 | li 129 | li'n 130 | 
llarg 131 | llavors 132 | m'he 133 | ma 134 | mal 135 | malgrat 136 | mateix 137 | mateixa 138 | mateixes 139 | mateixos 140 | me 141 | mentre 142 | meu 143 | meus 144 | meva 145 | meves 146 | mode 147 | molt 148 | molta 149 | moltes 150 | molts 151 | mon 152 | mons 153 | més 154 | n'he 155 | n'hi 156 | ne 157 | ni 158 | no 159 | nogensmenys 160 | només 161 | nosaltres 162 | nostra 163 | nostre 164 | nostres 165 | o 166 | oh 167 | oi 168 | on 169 | pas 170 | pel 171 | pels 172 | per 173 | per que 174 | perquè 175 | però 176 | poc 177 | poca 178 | pocs 179 | podem 180 | poden 181 | poder 182 | podeu 183 | poques 184 | potser 185 | primer 186 | propi 187 | puc 188 | qual 189 | quals 190 | quan 191 | quant 192 | que 193 | quelcom 194 | qui 195 | quin 196 | quina 197 | quines 198 | quins 199 | què 200 | s'ha 201 | s'han 202 | sa 203 | sabem 204 | saben 205 | saber 206 | sabeu 207 | sap 208 | saps 209 | semblant 210 | semblants 211 | sense 212 | ser 213 | ses 214 | seu 215 | seus 216 | seva 217 | seves 218 | si 219 | sobre 220 | sobretot 221 | soc 222 | solament 223 | sols 224 | som 225 | son 226 | sons 227 | sota 228 | sou 229 | sóc 230 | són 231 | t'ha 232 | t'han 233 | t'he 234 | ta 235 | tal 236 | també 237 | tampoc 238 | tan 239 | tant 240 | tanta 241 | tantes 242 | te 243 | tene 244 | tenim 245 | tenir 246 | teniu 247 | teu 248 | teus 249 | teva 250 | teves 251 | tinc 252 | ton 253 | tons 254 | tot 255 | tota 256 | totes 257 | tots 258 | un 259 | una 260 | unes 261 | uns 262 | us 263 | va 264 | vaig 265 | vam 266 | van 267 | vas 268 | veu 269 | vosaltres 270 | vostra 271 | vostre 272 | vostres 273 | érem 274 | éreu 275 | és 276 | éssent 277 | últim 278 | ús -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/czech.txt: -------------------------------------------------------------------------------- 1 | a 2 | aby 3 | ahoj 4 | aj 5 | ale 6 | anebo 7 | ani 8 | aniž 9 | ano 10 | asi 11 | aspoň 12 | atd 13 
| atp 14 | az 15 | ačkoli 16 | až 17 | bez 18 | beze 19 | blízko 20 | bohužel 21 | brzo 22 | bude 23 | budem 24 | budeme 25 | budes 26 | budete 27 | budeš 28 | budou 29 | budu 30 | by 31 | byl 32 | byla 33 | byli 34 | bylo 35 | byly 36 | bys 37 | byt 38 | být 39 | během 40 | chce 41 | chceme 42 | chcete 43 | chceš 44 | chci 45 | chtít 46 | chtějí 47 | chut' 48 | chuti 49 | ci 50 | clanek 51 | clanku 52 | clanky 53 | co 54 | coz 55 | což 56 | cz 57 | daleko 58 | dalsi 59 | další 60 | den 61 | deset 62 | design 63 | devatenáct 64 | devět 65 | dnes 66 | do 67 | dobrý 68 | docela 69 | dva 70 | dvacet 71 | dvanáct 72 | dvě 73 | dál 74 | dále 75 | děkovat 76 | děkujeme 77 | děkuji 78 | email 79 | ho 80 | hodně 81 | i 82 | jak 83 | jakmile 84 | jako 85 | jakož 86 | jde 87 | je 88 | jeden 89 | jedenáct 90 | jedna 91 | jedno 92 | jednou 93 | jedou 94 | jeho 95 | jehož 96 | jej 97 | jeji 98 | jejich 99 | její 100 | jelikož 101 | jemu 102 | jen 103 | jenom 104 | jenž 105 | jeste 106 | jestli 107 | jestliže 108 | ještě 109 | jež 110 | ji 111 | jich 112 | jimi 113 | jinak 114 | jine 115 | jiné 116 | jiz 117 | již 118 | jsem 119 | jses 120 | jseš 121 | jsi 122 | jsme 123 | jsou 124 | jste 125 | já 126 | jí 127 | jím 128 | jíž 129 | jšte 130 | k 131 | kam 132 | každý 133 | kde 134 | kdo 135 | kdy 136 | kdyz 137 | když 138 | ke 139 | kolik 140 | kromě 141 | ktera 142 | ktere 143 | kteri 144 | kterou 145 | ktery 146 | která 147 | které 148 | který 149 | kteři 150 | kteří 151 | ku 152 | kvůli 153 | ma 154 | mají 155 | mate 156 | me 157 | mezi 158 | mi 159 | mit 160 | mne 161 | mnou 162 | mně 163 | moc 164 | mohl 165 | mohou 166 | moje 167 | moji 168 | možná 169 | muj 170 | musí 171 | muze 172 | my 173 | má 174 | málo 175 | mám 176 | máme 177 | máte 178 | máš 179 | mé 180 | mí 181 | mít 182 | mě 183 | můj 184 | může 185 | na 186 | nad 187 | nade 188 | nam 189 | napiste 190 | napište 191 | naproti 192 | nas 193 | nasi 194 | načež 195 | naše 196 | naši 197 | ne 198 | nebo 199 | nebyl 
200 | nebyla 201 | nebyli 202 | nebyly 203 | nechť 204 | nedělají 205 | nedělá 206 | nedělám 207 | neděláme 208 | neděláte 209 | neděláš 210 | neg 211 | nejsi 212 | nejsou 213 | nemají 214 | nemáme 215 | nemáte 216 | neměl 217 | neni 218 | není 219 | nestačí 220 | nevadí 221 | nez 222 | než 223 | nic 224 | nich 225 | nimi 226 | nove 227 | novy 228 | nové 229 | nový 230 | nula 231 | ná 232 | nám 233 | námi 234 | nás 235 | náš 236 | ní 237 | ním 238 | ně 239 | něco 240 | nějak 241 | někde 242 | někdo 243 | němu 244 | němuž 245 | o 246 | od 247 | ode 248 | on 249 | ona 250 | oni 251 | ono 252 | ony 253 | osm 254 | osmnáct 255 | pak 256 | patnáct 257 | po 258 | pod 259 | podle 260 | pokud 261 | potom 262 | pouze 263 | pozdě 264 | pořád 265 | prave 266 | pravé 267 | pred 268 | pres 269 | pri 270 | pro 271 | proc 272 | prostě 273 | prosím 274 | proti 275 | proto 276 | protoze 277 | protože 278 | proč 279 | prvni 280 | první 281 | práve 282 | pta 283 | pět 284 | před 285 | přede 286 | přes 287 | přese 288 | při 289 | přičemž 290 | re 291 | rovně 292 | s 293 | se 294 | sedm 295 | sedmnáct 296 | si 297 | sice 298 | skoro 299 | smí 300 | smějí 301 | snad 302 | spolu 303 | sta 304 | sto 305 | strana 306 | sté 307 | sve 308 | svych 309 | svym 310 | svymi 311 | své 312 | svých 313 | svým 314 | svými 315 | svůj 316 | ta 317 | tady 318 | tak 319 | take 320 | takhle 321 | taky 322 | takze 323 | také 324 | takže 325 | tam 326 | tamhle 327 | tamhleto 328 | tamto 329 | tato 330 | te 331 | tebe 332 | tebou 333 | ted' 334 | tedy 335 | tema 336 | ten 337 | tento 338 | teto 339 | ti 340 | tim 341 | timto 342 | tipy 343 | tisíc 344 | tisíce 345 | to 346 | tobě 347 | tohle 348 | toho 349 | tohoto 350 | tom 351 | tomto 352 | tomu 353 | tomuto 354 | toto 355 | trošku 356 | tu 357 | tuto 358 | tvoje 359 | tvá 360 | tvé 361 | tvůj 362 | ty 363 | tyto 364 | téma 365 | této 366 | tím 367 | tímto 368 | tě 369 | těm 370 | těma 371 | těmu 372 | třeba 373 | tři 374 | třináct 375 | u 376 | určitě 377 
| uz 378 | už 379 | v 380 | vam 381 | vas 382 | vase 383 | vaše 384 | vaši 385 | ve 386 | vedle 387 | večer 388 | vice 389 | vlastně 390 | vsak 391 | vy 392 | vám 393 | vámi 394 | vás 395 | váš 396 | více 397 | však 398 | všechen 399 | všechno 400 | všichni 401 | vůbec 402 | vždy 403 | z 404 | za 405 | zatímco 406 | zač 407 | zda 408 | zde 409 | ze 410 | zpet 411 | zpravy 412 | zprávy 413 | zpět 414 | čau 415 | či 416 | článek 417 | článku 418 | články 419 | čtrnáct 420 | čtyři 421 | šest 422 | šestnáct 423 | že -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/danish.txt: -------------------------------------------------------------------------------- 1 | ad 2 | af 3 | aldrig 4 | alle 5 | alt 6 | anden 7 | andet 8 | andre 9 | at 10 | bare 11 | begge 12 | blev 13 | blive 14 | bliver 15 | da 16 | de 17 | dem 18 | den 19 | denne 20 | der 21 | deres 22 | det 23 | dette 24 | dig 25 | din 26 | dine 27 | disse 28 | dit 29 | dog 30 | du 31 | efter 32 | ej 33 | eller 34 | en 35 | end 36 | ene 37 | eneste 38 | enhver 39 | er 40 | et 41 | far 42 | fem 43 | fik 44 | fire 45 | flere 46 | fleste 47 | for 48 | fordi 49 | forrige 50 | fra 51 | få 52 | får 53 | før 54 | god 55 | godt 56 | ham 57 | han 58 | hans 59 | har 60 | havde 61 | have 62 | hej 63 | helt 64 | hende 65 | hendes 66 | her 67 | hos 68 | hun 69 | hvad 70 | hvem 71 | hver 72 | hvilken 73 | hvis 74 | hvor 75 | hvordan 76 | hvorfor 77 | hvornår 78 | i 79 | ikke 80 | ind 81 | ingen 82 | intet 83 | ja 84 | jeg 85 | jer 86 | jeres 87 | jo 88 | kan 89 | kom 90 | komme 91 | kommer 92 | kun 93 | kunne 94 | lad 95 | lav 96 | lidt 97 | lige 98 | lille 99 | man 100 | mand 101 | mange 102 | med 103 | meget 104 | men 105 | mens 106 | mere 107 | mig 108 | min 109 | mine 110 | mit 111 | mod 112 | må 113 | ned 114 | nej 115 | ni 116 | nogen 117 | noget 118 | nogle 119 | nu 120 | ny 121 | nyt 122 | når 123 | nær 124 | næste 125 | næsten 126 | og 127 | også 128 | okay 
129 | om 130 | op 131 | os 132 | otte 133 | over 134 | på 135 | se 136 | seks 137 | selv 138 | ser 139 | ses 140 | sig 141 | sige 142 | sin 143 | sine 144 | sit 145 | skal 146 | skulle 147 | som 148 | stor 149 | store 150 | syv 151 | så 152 | sådan 153 | tag 154 | tage 155 | thi 156 | ti 157 | til 158 | to 159 | tre 160 | ud 161 | under 162 | var 163 | ved 164 | vi 165 | vil 166 | ville 167 | vor 168 | vores 169 | være 170 | været -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/dutch.txt: -------------------------------------------------------------------------------- 1 | aan 2 | aangaande 3 | aangezien 4 | achte 5 | achter 6 | achterna 7 | af 8 | afgelopen 9 | al 10 | aldaar 11 | aldus 12 | alhoewel 13 | alias 14 | alle 15 | allebei 16 | alleen 17 | alles 18 | als 19 | alsnog 20 | altijd 21 | altoos 22 | ander 23 | andere 24 | anders 25 | anderszins 26 | beetje 27 | behalve 28 | behoudens 29 | beide 30 | beiden 31 | ben 32 | beneden 33 | bent 34 | bepaald 35 | betreffende 36 | bij 37 | bijna 38 | bijv 39 | binnen 40 | binnenin 41 | blijkbaar 42 | blijken 43 | boven 44 | bovenal 45 | bovendien 46 | bovengenoemd 47 | bovenstaand 48 | bovenvermeld 49 | buiten 50 | bv 51 | daar 52 | daardoor 53 | daarheen 54 | daarin 55 | daarna 56 | daarnet 57 | daarom 58 | daarop 59 | daaruit 60 | daarvanlangs 61 | dan 62 | dat 63 | de 64 | deden 65 | deed 66 | der 67 | derde 68 | derhalve 69 | dertig 70 | deze 71 | dhr 72 | die 73 | dikwijls 74 | dit 75 | doch 76 | doe 77 | doen 78 | doet 79 | door 80 | doorgaand 81 | drie 82 | duizend 83 | dus 84 | echter 85 | een 86 | eens 87 | eer 88 | eerdat 89 | eerder 90 | eerlang 91 | eerst 92 | eerste 93 | eigen 94 | eigenlijk 95 | elk 96 | elke 97 | en 98 | enig 99 | enige 100 | enigszins 101 | enkel 102 | er 103 | erdoor 104 | erg 105 | ergens 106 | etc 107 | etcetera 108 | even 109 | eveneens 110 | evenwel 111 | gauw 112 | ge 113 | gedurende 114 | geen 115 | gehad 116 | 
gekund 117 | geleden 118 | gelijk 119 | gemoeten 120 | gemogen 121 | genoeg 122 | geweest 123 | gewoon 124 | gewoonweg 125 | haar 126 | haarzelf 127 | had 128 | hadden 129 | hare 130 | heb 131 | hebben 132 | hebt 133 | hedden 134 | heeft 135 | heel 136 | hem 137 | hemzelf 138 | hen 139 | het 140 | hetzelfde 141 | hier 142 | hierbeneden 143 | hierboven 144 | hierin 145 | hierna 146 | hierom 147 | hij 148 | hijzelf 149 | hoe 150 | hoewel 151 | honderd 152 | hun 153 | hunne 154 | ieder 155 | iedere 156 | iedereen 157 | iemand 158 | iets 159 | ik 160 | ikzelf 161 | in 162 | inderdaad 163 | inmiddels 164 | intussen 165 | inzake 166 | is 167 | ja 168 | je 169 | jezelf 170 | jij 171 | jijzelf 172 | jou 173 | jouw 174 | jouwe 175 | juist 176 | jullie 177 | kan 178 | klaar 179 | kon 180 | konden 181 | krachtens 182 | kun 183 | kunnen 184 | kunt 185 | laatst 186 | later 187 | liever 188 | lijken 189 | lijkt 190 | maak 191 | maakt 192 | maakte 193 | maakten 194 | maar 195 | mag 196 | maken 197 | me 198 | meer 199 | meest 200 | meestal 201 | men 202 | met 203 | mevr 204 | mezelf 205 | mij 206 | mijn 207 | mijnent 208 | mijner 209 | mijzelf 210 | minder 211 | miss 212 | misschien 213 | missen 214 | mits 215 | mocht 216 | mochten 217 | moest 218 | moesten 219 | moet 220 | moeten 221 | mogen 222 | mr 223 | mrs 224 | mw 225 | na 226 | naar 227 | nadat 228 | nam 229 | namelijk 230 | nee 231 | neem 232 | negen 233 | nemen 234 | nergens 235 | net 236 | niemand 237 | niet 238 | niets 239 | niks 240 | noch 241 | nochtans 242 | nog 243 | nogal 244 | nooit 245 | nu 246 | nv 247 | of 248 | ofschoon 249 | om 250 | omdat 251 | omhoog 252 | omlaag 253 | omstreeks 254 | omtrent 255 | omver 256 | ondanks 257 | onder 258 | ondertussen 259 | ongeveer 260 | ons 261 | onszelf 262 | onze 263 | onzeker 264 | ooit 265 | ook 266 | op 267 | opnieuw 268 | opzij 269 | over 270 | overal 271 | overeind 272 | overige 273 | overigens 274 | paar 275 | pas 276 | per 277 | precies 278 | recent 279 | redelijk 
280 | reeds 281 | rond 282 | rondom 283 | samen 284 | sedert 285 | sinds 286 | sindsdien 287 | slechts 288 | sommige 289 | spoedig 290 | steeds 291 | tamelijk 292 | te 293 | tegen 294 | tegenover 295 | tenzij 296 | terwijl 297 | thans 298 | tien 299 | tiende 300 | tijdens 301 | tja 302 | toch 303 | toe 304 | toen 305 | toenmaals 306 | toenmalig 307 | tot 308 | totdat 309 | tussen 310 | twee 311 | tweede 312 | u 313 | uit 314 | uitgezonderd 315 | uw 316 | vaak 317 | vaakwat 318 | van 319 | vanaf 320 | vandaan 321 | vanuit 322 | vanwege 323 | veel 324 | veeleer 325 | veertig 326 | verder 327 | verscheidene 328 | verschillende 329 | vervolgens 330 | via 331 | vier 332 | vierde 333 | vijf 334 | vijfde 335 | vijftig 336 | vol 337 | volgend 338 | volgens 339 | voor 340 | vooraf 341 | vooral 342 | vooralsnog 343 | voorbij 344 | voordat 345 | voordezen 346 | voordien 347 | voorheen 348 | voorop 349 | voorts 350 | vooruit 351 | vrij 352 | vroeg 353 | waar 354 | waarom 355 | waarschijnlijk 356 | wanneer 357 | want 358 | waren 359 | was 360 | wat 361 | we 362 | wederom 363 | weer 364 | weg 365 | wegens 366 | weinig 367 | wel 368 | weldra 369 | welk 370 | welke 371 | werd 372 | werden 373 | werder 374 | wezen 375 | whatever 376 | wie 377 | wiens 378 | wier 379 | wij 380 | wijzelf 381 | wil 382 | wilden 383 | willen 384 | word 385 | worden 386 | wordt 387 | zal 388 | ze 389 | zei 390 | zeker 391 | zelf 392 | zelfde 393 | zelfs 394 | zes 395 | zeven 396 | zich 397 | zichzelf 398 | zij 399 | zijn 400 | zijne 401 | zijzelf 402 | zo 403 | zoals 404 | zodat 405 | zodra 406 | zonder 407 | zou 408 | zouden 409 | zowat 410 | zulk 411 | zulke 412 | zullen 413 | zult -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/estonian.txt: -------------------------------------------------------------------------------- 1 | aga 2 | ei 3 | et 4 | ja 5 | jah 6 | kas 7 | kui 8 | kõik 9 | ma 10 | me 11 | mida 12 | midagi 13 | mind 14 | 
minu 15 | mis 16 | mu 17 | mul 18 | mulle 19 | nad 20 | nii 21 | oled 22 | olen 23 | oli 24 | oma 25 | on 26 | pole 27 | sa 28 | seda 29 | see 30 | selle 31 | siin 32 | siis 33 | ta 34 | te 35 | ära -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/galician.txt: -------------------------------------------------------------------------------- 1 | a 2 | alí 3 | ao 4 | aos 5 | aquel 6 | aquela 7 | aquelas 8 | aqueles 9 | aquilo 10 | aquí 11 | as 12 | así 13 | aínda 14 | ben 15 | cando 16 | che 17 | co 18 | coa 19 | coas 20 | comigo 21 | con 22 | connosco 23 | contigo 24 | convosco 25 | cos 26 | cun 27 | cunha 28 | cunhas 29 | cuns 30 | da 31 | dalgunha 32 | dalgunhas 33 | dalgún 34 | dalgúns 35 | das 36 | de 37 | del 38 | dela 39 | delas 40 | deles 41 | desde 42 | deste 43 | do 44 | dos 45 | dun 46 | dunha 47 | dunhas 48 | duns 49 | e 50 | el 51 | ela 52 | elas 53 | eles 54 | en 55 | era 56 | eran 57 | esa 58 | esas 59 | ese 60 | eses 61 | esta 62 | estaba 63 | estar 64 | este 65 | estes 66 | estiven 67 | estou 68 | está 69 | están 70 | eu 71 | facer 72 | foi 73 | foron 74 | fun 75 | había 76 | hai 77 | iso 78 | isto 79 | la 80 | las 81 | lle 82 | lles 83 | lo 84 | los 85 | mais 86 | me 87 | meu 88 | meus 89 | min 90 | miña 91 | miñas 92 | moi 93 | na 94 | nas 95 | neste 96 | nin 97 | no 98 | non 99 | nos 100 | nosa 101 | nosas 102 | noso 103 | nosos 104 | nun 105 | nunha 106 | nunhas 107 | nuns 108 | nós 109 | o 110 | os 111 | ou 112 | para 113 | pero 114 | pode 115 | pois 116 | pola 117 | polas 118 | polo 119 | polos 120 | por 121 | que 122 | se 123 | senón 124 | ser 125 | seu 126 | seus 127 | sexa 128 | sido 129 | sobre 130 | súa 131 | súas 132 | tamén 133 | tan 134 | te 135 | ten 136 | ter 137 | teu 138 | teus 139 | teñen 140 | teño 141 | ti 142 | tido 143 | tiven 144 | tiña 145 | túa 146 | túas 147 | un 148 | unha 149 | unhas 150 | uns 151 | vos 152 | vosa 153 | vosas 154 | voso 155 | vosos 156 | 
vós 157 | á 158 | é 159 | ó 160 | ós -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/hindi.txt: -------------------------------------------------------------------------------- 1 | अंदर 2 | अत 3 | अदि 4 | अप 5 | अपना 6 | अपनि 7 | अपनी 8 | अपने 9 | अभि 10 | अभी 11 | आदि 12 | आप 13 | इंहिं 14 | इंहें 15 | इंहों 16 | इतयादि 17 | इत्यादि 18 | इन 19 | इनका 20 | इन्हीं 21 | इन्हें 22 | इन्हों 23 | इस 24 | इसका 25 | इसकि 26 | इसकी 27 | इसके 28 | इसमें 29 | इसि 30 | इसी 31 | इसे 32 | उंहिं 33 | उंहें 34 | उंहों 35 | उन 36 | उनका 37 | उनकि 38 | उनकी 39 | उनके 40 | उनको 41 | उन्हीं 42 | उन्हें 43 | उन्हों 44 | उस 45 | उसके 46 | उसि 47 | उसी 48 | उसे 49 | एक 50 | एवं 51 | एस 52 | एसे 53 | ऐसे 54 | ओर 55 | और 56 | कइ 57 | कई 58 | कर 59 | करता 60 | करते 61 | करना 62 | करने 63 | करें 64 | कहते 65 | कहा 66 | का 67 | काफि 68 | काफ़ी 69 | कि 70 | किंहें 71 | किंहों 72 | कितना 73 | किन्हें 74 | किन्हों 75 | किया 76 | किर 77 | किस 78 | किसि 79 | किसी 80 | किसे 81 | की 82 | कुछ 83 | कुल 84 | के 85 | को 86 | कोइ 87 | कोई 88 | कोन 89 | कोनसा 90 | कौन 91 | कौनसा 92 | गया 93 | घर 94 | जब 95 | जहाँ 96 | जहां 97 | जा 98 | जिंहें 99 | जिंहों 100 | जितना 101 | जिधर 102 | जिन 103 | जिन्हें 104 | जिन्हों 105 | जिस 106 | जिसे 107 | जीधर 108 | जेसा 109 | जेसे 110 | जैसा 111 | जैसे 112 | जो 113 | तक 114 | तब 115 | तरह 116 | तिंहें 117 | तिंहों 118 | तिन 119 | तिन्हें 120 | तिन्हों 121 | तिस 122 | तिसे 123 | तो 124 | था 125 | थि 126 | थी 127 | थे 128 | दबारा 129 | दवारा 130 | दिया 131 | दुसरा 132 | दुसरे 133 | दूसरे 134 | दो 135 | द्वारा 136 | न 137 | नहिं 138 | नहीं 139 | ना 140 | निचे 141 | निहायत 142 | नीचे 143 | ने 144 | पर 145 | पहले 146 | पुरा 147 | पूरा 148 | पे 149 | फिर 150 | बनि 151 | बनी 152 | बहि 153 | बही 154 | बहुत 155 | बाद 156 | बाला 157 | बिलकुल 158 | भि 159 | भितर 160 | भी 161 | भीतर 162 | मगर 163 | मानो 164 | मे 165 | में 166 | यदि 167 | यह 168 | यहाँ 169 | यहां 170 | यहि 171 | यही 172 | या 173 | यिह 174 | ये 175 
| रखें 176 | रवासा 177 | रहा 178 | रहे 179 | ऱ्वासा 180 | लिए 181 | लिये 182 | लेकिन 183 | व 184 | वगेरह 185 | वरग 186 | वर्ग 187 | वह 188 | वहाँ 189 | वहां 190 | वहिं 191 | वहीं 192 | वाले 193 | वुह 194 | वे 195 | वग़ैरह 196 | संग 197 | सकता 198 | सकते 199 | सबसे 200 | सभि 201 | सभी 202 | साथ 203 | साबुत 204 | साभ 205 | सारा 206 | से 207 | सो 208 | हि 209 | ही 210 | हुअ 211 | हुआ 212 | हुइ 213 | हुई 214 | हुए 215 | हे 216 | हें 217 | है 218 | हैं 219 | हो 220 | होता 221 | होति 222 | होती 223 | होते 224 | होना 225 | होने -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/irish.txt: -------------------------------------------------------------------------------- 1 | a 2 | ach 3 | ag 4 | agus 5 | an 6 | aon 7 | ar 8 | arna 9 | as 10 | b' 11 | ba 12 | beirt 13 | bhúr 14 | caoga 15 | ceathair 16 | ceathrar 17 | chomh 18 | chtó 19 | chuig 20 | chun 21 | cois 22 | céad 23 | cúig 24 | cúigear 25 | d' 26 | daichead 27 | dar 28 | de 29 | deich 30 | deichniúr 31 | den 32 | dhá 33 | do 34 | don 35 | dtí 36 | dá 37 | dár 38 | dó 39 | faoi 40 | faoin 41 | faoina 42 | faoinár 43 | fara 44 | fiche 45 | gach 46 | gan 47 | go 48 | gur 49 | haon 50 | hocht 51 | i 52 | iad 53 | idir 54 | in 55 | ina 56 | ins 57 | inár 58 | is 59 | le 60 | leis 61 | lena 62 | lenár 63 | m' 64 | mar 65 | mo 66 | mé 67 | na 68 | nach 69 | naoi 70 | naonúr 71 | ná 72 | ní 73 | níor 74 | nó 75 | nócha 76 | ocht 77 | ochtar 78 | os 79 | roimh 80 | sa 81 | seacht 82 | seachtar 83 | seachtó 84 | seasca 85 | seisear 86 | siad 87 | sibh 88 | sinn 89 | sna 90 | sé 91 | sí 92 | tar 93 | thar 94 | thú 95 | triúr 96 | trí 97 | trína 98 | trínár 99 | tríocha 100 | tú 101 | um 102 | ár 103 | é 104 | éis 105 | í 106 | ó 107 | ón 108 | óna 109 | ónár -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/latvian.txt: -------------------------------------------------------------------------------- 
1 | aiz 2 | ap 3 | apakš 4 | apakšpus 5 | ar 6 | arī 7 | augšpus 8 | bet 9 | bez 10 | bija 11 | biji 12 | biju 13 | bijām 14 | bijāt 15 | būs 16 | būsi 17 | būsiet 18 | būsim 19 | būt 20 | būšu 21 | caur 22 | diemžēl 23 | diezin 24 | droši 25 | dēļ 26 | esam 27 | esat 28 | esi 29 | esmu 30 | gan 31 | gar 32 | iekam 33 | iekams 34 | iekām 35 | iekāms 36 | iekš 37 | iekšpus 38 | ik 39 | ir 40 | it 41 | itin 42 | iz 43 | ja 44 | jau 45 | jeb 46 | jebšu 47 | jel 48 | jo 49 | jā 50 | ka 51 | kamēr 52 | kaut 53 | kolīdz 54 | kopš 55 | kā 56 | kļuva 57 | kļuvi 58 | kļuvu 59 | kļuvām 60 | kļuvāt 61 | kļūs 62 | kļūsi 63 | kļūsiet 64 | kļūsim 65 | kļūst 66 | kļūstam 67 | kļūstat 68 | kļūsti 69 | kļūstu 70 | kļūt 71 | kļūšu 72 | labad 73 | lai 74 | lejpus 75 | līdz 76 | līdzko 77 | ne 78 | nebūt 79 | nedz 80 | nekā 81 | nevis 82 | nezin 83 | no 84 | nu 85 | nē 86 | otrpus 87 | pa 88 | par 89 | pat 90 | pie 91 | pirms 92 | pret 93 | priekš 94 | pār 95 | pēc 96 | starp 97 | tad 98 | tak 99 | tapi 100 | taps 101 | tapsi 102 | tapsiet 103 | tapsim 104 | tapt 105 | tapāt 106 | tapšu 107 | taču 108 | te 109 | tiec 110 | tiek 111 | tiekam 112 | tiekat 113 | tieku 114 | tik 115 | tika 116 | tikai 117 | tiki 118 | tikko 119 | tiklab 120 | tiklīdz 121 | tiks 122 | tiksiet 123 | tiksim 124 | tikt 125 | tiku 126 | tikvien 127 | tikām 128 | tikāt 129 | tikšu 130 | tomēr 131 | topat 132 | turpretim 133 | turpretī 134 | tā 135 | tādēļ 136 | tālab 137 | tāpēc 138 | un 139 | uz 140 | vai 141 | var 142 | varat 143 | varēja 144 | varēji 145 | varēju 146 | varējām 147 | varējāt 148 | varēs 149 | varēsi 150 | varēsiet 151 | varēsim 152 | varēt 153 | varēšu 154 | vien 155 | virs 156 | virspus 157 | vis 158 | viņpus 159 | zem 160 | ārpus 161 | šaipus -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/lithuanian.txt: -------------------------------------------------------------------------------- 1 | abi 2 | abidvi 3 | abiejose 4 | 
abiejuose 5 | abiejø 6 | abiem 7 | abigaliai 8 | abipus 9 | abu 10 | abudu 11 | ai 12 | ana 13 | anaiptol 14 | anaisiais 15 | anajai 16 | anajam 17 | anajame 18 | anapus 19 | anas 20 | anasai 21 | anasis 22 | anei 23 | aniedvi 24 | anieji 25 | aniesiems 26 | anoji 27 | anojo 28 | anojoje 29 | anokia 30 | anoks 31 | anosiomis 32 | anosioms 33 | anosios 34 | anosiose 35 | anot 36 | ant 37 | antai 38 | anuodu 39 | anuoju 40 | anuosiuose 41 | anuosius 42 | anàja 43 | anàjà 44 | anàjá 45 | anàsias 46 | anøjø 47 | apie 48 | aplink 49 | ar 50 | arba 51 | argi 52 | arti 53 | aukðèiau 54 | að 55 | be 56 | bei 57 | beje 58 | bemaþ 59 | bent 60 | bet 61 | betgi 62 | beveik 63 | dar 64 | dargi 65 | daugmaþ 66 | deja 67 | dëka 68 | dël 69 | dëlei 70 | dëlto 71 | ech 72 | et 73 | gal 74 | galbût 75 | galgi 76 | gan 77 | gana 78 | gi 79 | greta 80 | idant 81 | iki 82 | ir 83 | irgi 84 | it 85 | itin 86 | ið 87 | iðilgai 88 | iðvis 89 | jaisiais 90 | jajai 91 | jajam 92 | jajame 93 | jei 94 | jeigu 95 | ji 96 | jiedu 97 | jiedvi 98 | jieji 99 | jiesiems 100 | jinai 101 | jis 102 | jisai 103 | jog 104 | joji 105 | jojo 106 | jojoje 107 | jokia 108 | joks 109 | josiomis 110 | josioms 111 | josios 112 | josiose 113 | judu 114 | judvi 115 | juk 116 | jumis 117 | jums 118 | jumyse 119 | juodu 120 | juoju 121 | juosiuose 122 | juosius 123 | jus 124 | jàja 125 | jàjà 126 | jàsias 127 | jájá 128 | jøjø 129 | jûs 130 | jûsiðkis 131 | jûsiðkë 132 | jûsø 133 | kad 134 | kada 135 | kadangi 136 | kai 137 | kaip 138 | kaipgi 139 | kas 140 | katra 141 | katras 142 | katriedvi 143 | katruodu 144 | kaþin 145 | kaþkas 146 | kaþkatra 147 | kaþkatras 148 | kaþkokia 149 | kaþkoks 150 | kaþkuri 151 | kaþkuris 152 | kiaurai 153 | kiek 154 | kiekvienas 155 | kieno 156 | kita 157 | kitas 158 | kitokia 159 | kitoks 160 | kodël 161 | kokia 162 | koks 163 | kol 164 | kolei 165 | kone 166 | kuomet 167 | kur 168 | kurgi 169 | kuri 170 | kuriedvi 171 | kuris 172 | kuriuodu 173 | lai 174 | lig 175 | ligi 176 | 
link 177 | lyg 178 | man 179 | manaisiais 180 | manajai 181 | manajam 182 | manajame 183 | manas 184 | manasai 185 | manasis 186 | mane 187 | manieji 188 | maniesiems 189 | manim 190 | manimi 191 | maniðkis 192 | maniðkë 193 | mano 194 | manoji 195 | manojo 196 | manojoje 197 | manosiomis 198 | manosioms 199 | manosios 200 | manosiose 201 | manuoju 202 | manuosiuose 203 | manuosius 204 | manyje 205 | manàja 206 | manàjà 207 | manàjá 208 | manàsias 209 | manæs 210 | manøjø 211 | mat 212 | maþdaug 213 | maþne 214 | mes 215 | mudu 216 | mudvi 217 | mumis 218 | mums 219 | mumyse 220 | mus 221 | mûsiðkis 222 | mûsiðkë 223 | mûsø 224 | na 225 | nagi 226 | ne 227 | nebe 228 | nebent 229 | negi 230 | negu 231 | nei 232 | nejau 233 | nejaugi 234 | nekaip 235 | nelyginant 236 | nes 237 | net 238 | netgi 239 | netoli 240 | neva 241 | nors 242 | nuo 243 | në 244 | o 245 | ogi 246 | oi 247 | paeiliui 248 | pagal 249 | pakeliui 250 | palaipsniui 251 | palei 252 | pas 253 | pasak 254 | paskos 255 | paskui 256 | paskum 257 | pat 258 | pati 259 | patiems 260 | paties 261 | pats 262 | patys 263 | patá 264 | paèiais 265 | paèiam 266 | paèiame 267 | paèiu 268 | paèiuose 269 | paèius 270 | paèiø 271 | per 272 | pernelyg 273 | pirm 274 | pirma 275 | pirmiau 276 | po 277 | prie 278 | prieð 279 | prieðais 280 | pro 281 | pusiau 282 | rasi 283 | rodos 284 | sau 285 | savaisiais 286 | savajai 287 | savajam 288 | savajame 289 | savas 290 | savasai 291 | savasis 292 | save 293 | savieji 294 | saviesiems 295 | savimi 296 | saviðkis 297 | saviðkë 298 | savo 299 | savoji 300 | savojo 301 | savojoje 302 | savosiomis 303 | savosioms 304 | savosios 305 | savosiose 306 | savuoju 307 | savuosiuose 308 | savuosius 309 | savyje 310 | savàja 311 | savàjà 312 | savàjá 313 | savàsias 314 | savæs 315 | savøjø 316 | skersai 317 | skradþiai 318 | staèiai 319 | su 320 | sulig 321 | ta 322 | tad 323 | tai 324 | taigi 325 | taip 326 | taipogi 327 | taisiais 328 | tajai 329 | tajam 330 | tajame 331 | tamsta 332 
| tarp 333 | tarsi 334 | tartum 335 | tarytum 336 | tas 337 | tasai 338 | tau 339 | tavaisiais 340 | tavajai 341 | tavajam 342 | tavajame 343 | tavas 344 | tavasai 345 | tavasis 346 | tave 347 | tavieji 348 | taviesiems 349 | tavimi 350 | taviðkis 351 | taviðkë 352 | tavo 353 | tavoji 354 | tavojo 355 | tavojoje 356 | tavosiomis 357 | tavosioms 358 | tavosios 359 | tavosiose 360 | tavuoju 361 | tavuosiuose 362 | tavuosius 363 | tavyje 364 | tavàja 365 | tavàjà 366 | tavàjá 367 | tavàsias 368 | tavæs 369 | tavøjø 370 | taèiau 371 | te 372 | tegu 373 | tegul 374 | tiedvi 375 | tieji 376 | ties 377 | tiesiems 378 | tiesiog 379 | tik 380 | tikriausiai 381 | tiktai 382 | toji 383 | tojo 384 | tojoje 385 | tokia 386 | toks 387 | tol 388 | tolei 389 | toliau 390 | tosiomis 391 | tosioms 392 | tosios 393 | tosiose 394 | tu 395 | tuodu 396 | tuoju 397 | tuosiuose 398 | tuosius 399 | turbût 400 | tàja 401 | tàjà 402 | tàjá 403 | tàsias 404 | tøjø 405 | tûlas 406 | uþ 407 | uþtat 408 | uþvis 409 | va 410 | vai 411 | viduj 412 | vidury 413 | vien 414 | vienas 415 | vienokia 416 | vienoks 417 | vietoj 418 | virð 419 | virðuj 420 | virðum 421 | vis 422 | vis dëlto 423 | visa 424 | visas 425 | visgi 426 | visokia 427 | visoks 428 | vos 429 | vël 430 | vëlgi 431 | ypaè 432 | á 433 | ákypai 434 | ástriþai 435 | ðalia 436 | ðe 437 | ði 438 | ðiaisiais 439 | ðiajai 440 | ðiajam 441 | ðiajame 442 | ðiapus 443 | ðiedvi 444 | ðieji 445 | ðiesiems 446 | ðioji 447 | ðiojo 448 | ðiojoje 449 | ðiokia 450 | ðioks 451 | ðiosiomis 452 | ðiosioms 453 | ðiosios 454 | ðiosiose 455 | ðis 456 | ðisai 457 | ðit 458 | ðita 459 | ðitas 460 | ðitiedvi 461 | ðitokia 462 | ðitoks 463 | ðituodu 464 | ðiuodu 465 | ðiuoju 466 | ðiuosiuose 467 | ðiuosius 468 | ðiàja 469 | ðiàjà 470 | ðiàsias 471 | ðiøjø 472 | ðtai 473 | ðájá 474 | þemiau -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/norwegian.txt: 
-------------------------------------------------------------------------------- 1 | alle 2 | andre 3 | arbeid 4 | at 5 | av 6 | bare 7 | begge 8 | ble 9 | blei 10 | bli 11 | blir 12 | blitt 13 | bort 14 | bra 15 | bruke 16 | både 17 | båe 18 | da 19 | de 20 | deg 21 | dei 22 | deim 23 | deira 24 | deires 25 | dem 26 | den 27 | denne 28 | der 29 | dere 30 | deres 31 | det 32 | dette 33 | di 34 | din 35 | disse 36 | ditt 37 | du 38 | dykk 39 | dykkar 40 | då 41 | eg 42 | ein 43 | eit 44 | eitt 45 | eller 46 | elles 47 | en 48 | ene 49 | eneste 50 | enhver 51 | enn 52 | er 53 | et 54 | ett 55 | etter 56 | folk 57 | for 58 | fordi 59 | forsûke 60 | fra 61 | få 62 | før 63 | fûr 64 | fûrst 65 | gjorde 66 | gjûre 67 | god 68 | gå 69 | ha 70 | hadde 71 | han 72 | hans 73 | har 74 | hennar 75 | henne 76 | hennes 77 | her 78 | hjå 79 | ho 80 | hoe 81 | honom 82 | hoss 83 | hossen 84 | hun 85 | hva 86 | hvem 87 | hver 88 | hvilke 89 | hvilken 90 | hvis 91 | hvor 92 | hvordan 93 | hvorfor 94 | i 95 | ikke 96 | ikkje 97 | ingen 98 | ingi 99 | inkje 100 | inn 101 | innen 102 | inni 103 | ja 104 | jeg 105 | kan 106 | kom 107 | korleis 108 | korso 109 | kun 110 | kunne 111 | kva 112 | kvar 113 | kvarhelst 114 | kven 115 | kvi 116 | kvifor 117 | lage 118 | lang 119 | lik 120 | like 121 | makt 122 | man 123 | mange 124 | me 125 | med 126 | medan 127 | meg 128 | meget 129 | mellom 130 | men 131 | mens 132 | mer 133 | mest 134 | mi 135 | min 136 | mine 137 | mitt 138 | mot 139 | mye 140 | mykje 141 | må 142 | måte 143 | navn 144 | ned 145 | nei 146 | no 147 | noe 148 | noen 149 | noka 150 | noko 151 | nokon 152 | nokor 153 | nokre 154 | ny 155 | nå 156 | når 157 | og 158 | også 159 | om 160 | opp 161 | oss 162 | over 163 | part 164 | punkt 165 | på 166 | rett 167 | riktig 168 | samme 169 | sant 170 | seg 171 | selv 172 | si 173 | sia 174 | sidan 175 | siden 176 | sin 177 | sine 178 | sist 179 | sitt 180 | sjøl 181 | skal 182 | skulle 183 | slik 184 | slutt 185 | so 186 | som 187 | 
somme 188 | somt 189 | start 190 | stille 191 | så 192 | sånn 193 | tid 194 | til 195 | tilbake 196 | tilstand 197 | um 198 | under 199 | upp 200 | ut 201 | uten 202 | var 203 | vart 204 | varte 205 | ved 206 | verdi 207 | vere 208 | verte 209 | vi 210 | vil 211 | ville 212 | vite 213 | vore 214 | vors 215 | vort 216 | vår 217 | være 218 | vært 219 | vöre 220 | vört 221 | å -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/polish.txt: -------------------------------------------------------------------------------- 1 | a 2 | aby 3 | ach 4 | acz 5 | aczkolwiek 6 | aj 7 | albo 8 | ale 9 | ależ 10 | ani 11 | aż 12 | bardziej 13 | bardzo 14 | bez 15 | bo 16 | bowiem 17 | by 18 | byli 19 | bym 20 | bynajmniej 21 | być 22 | był 23 | była 24 | było 25 | były 26 | będzie 27 | będą 28 | cali 29 | cała 30 | cały 31 | chce 32 | choć 33 | ci 34 | ciebie 35 | cię 36 | co 37 | cokolwiek 38 | coraz 39 | coś 40 | czasami 41 | czasem 42 | czemu 43 | czy 44 | czyli 45 | często 46 | daleko 47 | dla 48 | dlaczego 49 | dlatego 50 | do 51 | dobrze 52 | dokąd 53 | dość 54 | dr 55 | dużo 56 | dwa 57 | dwaj 58 | dwie 59 | dwoje 60 | dzisiaj 61 | dziś 62 | gdy 63 | gdyby 64 | gdyż 65 | gdzie 66 | gdziekolwiek 67 | gdzieś 68 | go 69 | godz 70 | hab 71 | i 72 | ich 73 | ii 74 | iii 75 | ile 76 | im 77 | inna 78 | inne 79 | inny 80 | innych 81 | inż 82 | iv 83 | ix 84 | iż 85 | ja 86 | jak 87 | jakaś 88 | jakby 89 | jaki 90 | jakichś 91 | jakie 92 | jakiś 93 | jakiż 94 | jakkolwiek 95 | jako 96 | jakoś 97 | je 98 | jeden 99 | jedna 100 | jednak 101 | jednakże 102 | jedno 103 | jednym 104 | jedynie 105 | jego 106 | jej 107 | jemu 108 | jest 109 | jestem 110 | jeszcze 111 | jeśli 112 | jeżeli 113 | już 114 | ją 115 | każdy 116 | kiedy 117 | kierunku 118 | kilka 119 | kilku 120 | kimś 121 | kto 122 | ktokolwiek 123 | ktoś 124 | która 125 | które 126 | którego 127 | której 128 | który 129 | których 130 | którym 131 | którzy 132 | ku 
133 | lat 134 | lecz 135 | lub 136 | ma 137 | mają 138 | mam 139 | mamy 140 | mało 141 | mgr 142 | mi 143 | miał 144 | mimo 145 | między 146 | mnie 147 | mną 148 | mogą 149 | moi 150 | moim 151 | moja 152 | moje 153 | może 154 | możliwe 155 | można 156 | mu 157 | musi 158 | my 159 | mój 160 | na 161 | nad 162 | nam 163 | nami 164 | nas 165 | nasi 166 | nasz 167 | nasza 168 | nasze 169 | naszego 170 | naszych 171 | natomiast 172 | natychmiast 173 | nawet 174 | nic 175 | nich 176 | nie 177 | niech 178 | niego 179 | niej 180 | niemu 181 | nigdy 182 | nim 183 | nimi 184 | nią 185 | niż 186 | no 187 | nowe 188 | np 189 | nr 190 | o 191 | o.o. 192 | obok 193 | od 194 | ok 195 | około 196 | on 197 | ona 198 | one 199 | oni 200 | ono 201 | oraz 202 | oto 203 | owszem 204 | pan 205 | pana 206 | pani 207 | pl 208 | po 209 | pod 210 | podczas 211 | pomimo 212 | ponad 213 | ponieważ 214 | powinien 215 | powinna 216 | powinni 217 | powinno 218 | poza 219 | prawie 220 | prof 221 | przecież 222 | przed 223 | przede 224 | przedtem 225 | przez 226 | przy 227 | raz 228 | razie 229 | roku 230 | również 231 | sam 232 | sama 233 | się 234 | skąd 235 | sobie 236 | sobą 237 | sposób 238 | swoje 239 | są 240 | ta 241 | tak 242 | taka 243 | taki 244 | takich 245 | takie 246 | także 247 | tam 248 | te 249 | tego 250 | tej 251 | tel 252 | temu 253 | ten 254 | teraz 255 | też 256 | to 257 | tobie 258 | tobą 259 | toteż 260 | totobą 261 | trzeba 262 | tu 263 | tutaj 264 | twoi 265 | twoim 266 | twoja 267 | twoje 268 | twym 269 | twój 270 | ty 271 | tych 272 | tylko 273 | tym 274 | tys 275 | tzw 276 | tę 277 | u 278 | ul 279 | vi 280 | vii 281 | viii 282 | vol 283 | w 284 | wam 285 | wami 286 | was 287 | wasi 288 | wasz 289 | wasza 290 | wasze 291 | we 292 | według 293 | wie 294 | wiele 295 | wielu 296 | więc 297 | więcej 298 | wszyscy 299 | wszystkich 300 | wszystkie 301 | wszystkim 302 | wszystko 303 | wtedy 304 | www 305 | wy 306 | właśnie 307 | wśród 308 | xi 309 | xii 310 | xiii 311 | xiv 
312 | xv 313 | z 314 | za 315 | zapewne 316 | zawsze 317 | zaś 318 | ze 319 | zeznowu 320 | znowu 321 | znów 322 | został 323 | zł 324 | żaden 325 | żadna 326 | żadne 327 | żadnych 328 | że 329 | żeby -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/portuguese.txt: -------------------------------------------------------------------------------- 1 | a 2 | acerca 3 | adeus 4 | agora 5 | ainda 6 | alem 7 | algmas 8 | algo 9 | algumas 10 | alguns 11 | ali 12 | além 13 | ambas 14 | ambos 15 | ano 16 | anos 17 | antes 18 | ao 19 | aonde 20 | aos 21 | apenas 22 | apoio 23 | apontar 24 | apos 25 | após 26 | aquela 27 | aquelas 28 | aquele 29 | aqueles 30 | aqui 31 | aquilo 32 | as 33 | assim 34 | através 35 | atrás 36 | até 37 | aí 38 | baixo 39 | bastante 40 | bem 41 | boa 42 | boas 43 | bom 44 | bons 45 | breve 46 | cada 47 | caminho 48 | catorze 49 | cedo 50 | cento 51 | certamente 52 | certeza 53 | cima 54 | cinco 55 | coisa 56 | com 57 | como 58 | comprido 59 | conhecido 60 | conselho 61 | contra 62 | contudo 63 | corrente 64 | cuja 65 | cujas 66 | cujo 67 | cujos 68 | custa 69 | cá 70 | da 71 | daquela 72 | daquelas 73 | daquele 74 | daqueles 75 | dar 76 | das 77 | de 78 | debaixo 79 | dela 80 | delas 81 | dele 82 | deles 83 | demais 84 | dentro 85 | depois 86 | desde 87 | desligado 88 | dessa 89 | dessas 90 | desse 91 | desses 92 | desta 93 | destas 94 | deste 95 | destes 96 | deve 97 | devem 98 | deverá 99 | dez 100 | dezanove 101 | dezasseis 102 | dezassete 103 | dezoito 104 | dia 105 | diante 106 | direita 107 | dispoe 108 | dispoem 109 | diversa 110 | diversas 111 | diversos 112 | diz 113 | dizem 114 | dizer 115 | do 116 | dois 117 | dos 118 | doze 119 | duas 120 | durante 121 | dá 122 | dão 123 | dúvida 124 | e 125 | ela 126 | elas 127 | ele 128 | eles 129 | em 130 | embora 131 | enquanto 132 | entao 133 | entre 134 | então 135 | era 136 | eram 137 | essa 138 | essas 139 | esse 140 | esses 141 | 
esta 142 | estado 143 | estamos 144 | estar 145 | estará 146 | estas 147 | estava 148 | estavam 149 | este 150 | esteja 151 | estejam 152 | estejamos 153 | estes 154 | esteve 155 | estive 156 | estivemos 157 | estiver 158 | estivera 159 | estiveram 160 | estiverem 161 | estivermos 162 | estivesse 163 | estivessem 164 | estiveste 165 | estivestes 166 | estivéramos 167 | estivéssemos 168 | estou 169 | está 170 | estás 171 | estávamos 172 | estão 173 | eu 174 | exemplo 175 | falta 176 | fará 177 | favor 178 | faz 179 | fazeis 180 | fazem 181 | fazemos 182 | fazer 183 | fazes 184 | fazia 185 | faço 186 | fez 187 | fim 188 | final 189 | foi 190 | fomos 191 | for 192 | fora 193 | foram 194 | forem 195 | forma 196 | formos 197 | fosse 198 | fossem 199 | foste 200 | fostes 201 | fui 202 | fôramos 203 | fôssemos 204 | geral 205 | grande 206 | grandes 207 | grupo 208 | ha 209 | haja 210 | hajam 211 | hajamos 212 | havemos 213 | havia 214 | hei 215 | hoje 216 | hora 217 | horas 218 | houve 219 | houvemos 220 | houver 221 | houvera 222 | houveram 223 | houverei 224 | houverem 225 | houveremos 226 | houveria 227 | houveriam 228 | houvermos 229 | houverá 230 | houverão 231 | houveríamos 232 | houvesse 233 | houvessem 234 | houvéramos 235 | houvéssemos 236 | há 237 | hão 238 | iniciar 239 | inicio 240 | ir 241 | irá 242 | isso 243 | ista 244 | iste 245 | isto 246 | já 247 | lado 248 | lhe 249 | lhes 250 | ligado 251 | local 252 | logo 253 | longe 254 | lugar 255 | lá 256 | maior 257 | maioria 258 | maiorias 259 | mais 260 | mal 261 | mas 262 | me 263 | mediante 264 | meio 265 | menor 266 | menos 267 | meses 268 | mesma 269 | mesmas 270 | mesmo 271 | mesmos 272 | meu 273 | meus 274 | mil 275 | minha 276 | minhas 277 | momento 278 | muito 279 | muitos 280 | máximo 281 | mês 282 | na 283 | nada 284 | nao 285 | naquela 286 | naquelas 287 | naquele 288 | naqueles 289 | nas 290 | nem 291 | nenhuma 292 | nessa 293 | nessas 294 | nesse 295 | nesses 296 | nesta 297 | nestas 298 | neste 
299 | nestes 300 | no 301 | noite 302 | nome 303 | nos 304 | nossa 305 | nossas 306 | nosso 307 | nossos 308 | nova 309 | novas 310 | nove 311 | novo 312 | novos 313 | num 314 | numa 315 | numas 316 | nunca 317 | nuns 318 | não 319 | nível 320 | nós 321 | número 322 | o 323 | obra 324 | obrigada 325 | obrigado 326 | oitava 327 | oitavo 328 | oito 329 | onde 330 | ontem 331 | onze 332 | os 333 | ou 334 | outra 335 | outras 336 | outro 337 | outros 338 | para 339 | parece 340 | parte 341 | partir 342 | paucas 343 | pegar 344 | pela 345 | pelas 346 | pelo 347 | pelos 348 | perante 349 | perto 350 | pessoas 351 | pode 352 | podem 353 | poder 354 | poderá 355 | podia 356 | pois 357 | ponto 358 | pontos 359 | por 360 | porque 361 | porquê 362 | portanto 363 | posição 364 | possivelmente 365 | posso 366 | possível 367 | pouca 368 | pouco 369 | poucos 370 | povo 371 | primeira 372 | primeiras 373 | primeiro 374 | primeiros 375 | promeiro 376 | propios 377 | proprio 378 | própria 379 | próprias 380 | próprio 381 | próprios 382 | próxima 383 | próximas 384 | próximo 385 | próximos 386 | puderam 387 | pôde 388 | põe 389 | põem 390 | quais 391 | qual 392 | qualquer 393 | quando 394 | quanto 395 | quarta 396 | quarto 397 | quatro 398 | que 399 | quem 400 | quer 401 | quereis 402 | querem 403 | queremas 404 | queres 405 | quero 406 | questão 407 | quieto 408 | quinta 409 | quinto 410 | quinze 411 | quáis 412 | quê 413 | relação 414 | sabe 415 | sabem 416 | saber 417 | se 418 | segunda 419 | segundo 420 | sei 421 | seis 422 | seja 423 | sejam 424 | sejamos 425 | sem 426 | sempre 427 | sendo 428 | ser 429 | serei 430 | seremos 431 | seria 432 | seriam 433 | será 434 | serão 435 | seríamos 436 | sete 437 | seu 438 | seus 439 | sexta 440 | sexto 441 | sim 442 | sistema 443 | sob 444 | sobre 445 | sois 446 | somente 447 | somos 448 | sou 449 | sua 450 | suas 451 | são 452 | sétima 453 | sétimo 454 | só 455 | tal 456 | talvez 457 | tambem 458 | também 459 | tanta 460 | tantas 461 | 
tanto 462 | tarde 463 | te 464 | tem 465 | temos 466 | tempo 467 | tendes 468 | tenha 469 | tenham 470 | tenhamos 471 | tenho 472 | tens 473 | tentar 474 | tentaram 475 | tente 476 | tentei 477 | ter 478 | terceira 479 | terceiro 480 | terei 481 | teremos 482 | teria 483 | teriam 484 | terá 485 | terão 486 | teríamos 487 | teu 488 | teus 489 | teve 490 | tinha 491 | tinham 492 | tipo 493 | tive 494 | tivemos 495 | tiver 496 | tivera 497 | tiveram 498 | tiverem 499 | tivermos 500 | tivesse 501 | tivessem 502 | tiveste 503 | tivestes 504 | tivéramos 505 | tivéssemos 506 | toda 507 | todas 508 | todo 509 | todos 510 | trabalhar 511 | trabalho 512 | treze 513 | três 514 | tu 515 | tua 516 | tuas 517 | tudo 518 | tão 519 | tém 520 | têm 521 | tínhamos 522 | um 523 | uma 524 | umas 525 | uns 526 | usa 527 | usar 528 | vai 529 | vais 530 | valor 531 | veja 532 | vem 533 | vens 534 | ver 535 | verdade 536 | verdadeiro 537 | vez 538 | vezes 539 | viagem 540 | vindo 541 | vinte 542 | você 543 | vocês 544 | vos 545 | vossa 546 | vossas 547 | vosso 548 | vossos 549 | vários 550 | vão 551 | vêm 552 | vós 553 | zero 554 | à 555 | às 556 | área 557 | é 558 | éramos 559 | és 560 | último -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/romanian.txt: -------------------------------------------------------------------------------- 1 | a 2 | abia 3 | acea 4 | aceasta 5 | această 6 | aceea 7 | aceeasi 8 | acei 9 | aceia 10 | acel 11 | acela 12 | acelasi 13 | acele 14 | acelea 15 | acest 16 | acesta 17 | aceste 18 | acestea 19 | acestei 20 | acestia 21 | acestui 22 | aceşti 23 | aceştia 24 | acolo 25 | acord 26 | acum 27 | adica 28 | ai 29 | aia 30 | aibă 31 | aici 32 | aiurea 33 | al 34 | ala 35 | alaturi 36 | ale 37 | alea 38 | alt 39 | alta 40 | altceva 41 | altcineva 42 | alte 43 | altfel 44 | alti 45 | altii 46 | altul 47 | am 48 | anume 49 | apoi 50 | ar 51 | are 52 | as 53 | asa 54 | asemenea 55 | asta 56 | astazi 
57 | astea 58 | astfel 59 | astăzi 60 | asupra 61 | atare 62 | atat 63 | atata 64 | atatea 65 | atatia 66 | ati 67 | atit 68 | atita 69 | atitea 70 | atitia 71 | atunci 72 | au 73 | avea 74 | avem 75 | aveţi 76 | avut 77 | azi 78 | aş 79 | aşadar 80 | aţi 81 | b 82 | ba 83 | bine 84 | bucur 85 | bună 86 | c 87 | ca 88 | cam 89 | cand 90 | capat 91 | care 92 | careia 93 | carora 94 | caruia 95 | cat 96 | catre 97 | caut 98 | ce 99 | cea 100 | ceea 101 | cei 102 | ceilalti 103 | cel 104 | cele 105 | celor 106 | ceva 107 | chiar 108 | ci 109 | cinci 110 | cind 111 | cine 112 | cineva 113 | cit 114 | cita 115 | cite 116 | citeva 117 | citi 118 | citiva 119 | conform 120 | contra 121 | cu 122 | cui 123 | cum 124 | cumva 125 | curând 126 | curînd 127 | când 128 | cât 129 | câte 130 | câtva 131 | câţi 132 | cînd 133 | cît 134 | cîte 135 | cîtva 136 | cîţi 137 | că 138 | căci 139 | cărei 140 | căror 141 | cărui 142 | către 143 | d 144 | da 145 | daca 146 | dacă 147 | dar 148 | dat 149 | datorită 150 | dată 151 | dau 152 | de 153 | deasupra 154 | deci 155 | decit 156 | degraba 157 | deja 158 | deoarece 159 | departe 160 | desi 161 | despre 162 | deşi 163 | din 164 | dinaintea 165 | dintr 166 | dintr- 167 | dintre 168 | doar 169 | doi 170 | doilea 171 | două 172 | drept 173 | dupa 174 | după 175 | dă 176 | e 177 | ea 178 | ei 179 | el 180 | ele 181 | era 182 | eram 183 | este 184 | eu 185 | exact 186 | eşti 187 | f 188 | face 189 | fara 190 | fata 191 | fel 192 | fi 193 | fie 194 | fiecare 195 | fii 196 | fim 197 | fiu 198 | fiţi 199 | foarte 200 | fost 201 | frumos 202 | fără 203 | g 204 | geaba 205 | graţie 206 | h 207 | halbă 208 | i 209 | ia 210 | iar 211 | ieri 212 | ii 213 | il 214 | imi 215 | in 216 | inainte 217 | inapoi 218 | inca 219 | incit 220 | insa 221 | intr 222 | intre 223 | isi 224 | iti 225 | j 226 | k 227 | l 228 | la 229 | le 230 | li 231 | lor 232 | lui 233 | lângă 234 | lîngă 235 | m 236 | ma 237 | mai 238 | mare 239 | mea 240 | mei 241 | mele 242 | 
mereu 243 | meu 244 | mi 245 | mie 246 | mine 247 | mod 248 | mult 249 | multa 250 | multe 251 | multi 252 | multă 253 | mulţi 254 | mulţumesc 255 | mâine 256 | mîine 257 | mă 258 | n 259 | ne 260 | nevoie 261 | ni 262 | nici 263 | niciodata 264 | nicăieri 265 | nimeni 266 | nimeri 267 | nimic 268 | niste 269 | nişte 270 | noastre 271 | noastră 272 | noi 273 | noroc 274 | nostri 275 | nostru 276 | nou 277 | noua 278 | nouă 279 | noştri 280 | nu 281 | numai 282 | o 283 | opt 284 | or 285 | ori 286 | oricare 287 | orice 288 | oricine 289 | oricum 290 | oricând 291 | oricât 292 | oricînd 293 | oricît 294 | oriunde 295 | p 296 | pai 297 | parca 298 | patra 299 | patru 300 | patrulea 301 | pe 302 | pentru 303 | peste 304 | pic 305 | pina 306 | plus 307 | poate 308 | pot 309 | prea 310 | prima 311 | primul 312 | prin 313 | printr- 314 | putini 315 | puţin 316 | puţina 317 | puţină 318 | până 319 | pînă 320 | r 321 | rog 322 | s 323 | sa 324 | sa-mi 325 | sa-ti 326 | sai 327 | sale 328 | sau 329 | se 330 | si 331 | sint 332 | sintem 333 | spate 334 | spre 335 | sub 336 | sunt 337 | suntem 338 | sunteţi 339 | sus 340 | sută 341 | sînt 342 | sîntem 343 | sînteţi 344 | să 345 | săi 346 | său 347 | t 348 | ta 349 | tale 350 | te 351 | ti 352 | timp 353 | tine 354 | toata 355 | toate 356 | toată 357 | tocmai 358 | tot 359 | toti 360 | totul 361 | totusi 362 | totuşi 363 | toţi 364 | trei 365 | treia 366 | treilea 367 | tu 368 | tuturor 369 | tăi 370 | tău 371 | u 372 | ul 373 | ului 374 | un 375 | una 376 | unde 377 | undeva 378 | unei 379 | uneia 380 | unele 381 | uneori 382 | unii 383 | unor 384 | unora 385 | unu 386 | unui 387 | unuia 388 | unul 389 | v 390 | va 391 | vi 392 | voastre 393 | voastră 394 | voi 395 | vom 396 | vor 397 | vostru 398 | vouă 399 | voştri 400 | vreme 401 | vreo 402 | vreun 403 | vă 404 | x 405 | z 406 | zece 407 | zero 408 | zi 409 | zice 410 | îi 411 | îl 412 | îmi 413 | împotriva 414 | în 415 | înainte 416 | înaintea 417 | încotro 418 | încât 419 
| încît 420 | între 421 | întrucât 422 | întrucît 423 | îţi 424 | ăla 425 | ălea 426 | ăsta 427 | ăstea 428 | ăştia 429 | şapte 430 | şase 431 | şi 432 | ştiu 433 | ţi 434 | ţie -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/russian.txt: -------------------------------------------------------------------------------- 1 | c 2 | а 3 | алло 4 | без 5 | белый 6 | близко 7 | более 8 | больше 9 | большой 10 | будем 11 | будет 12 | будете 13 | будешь 14 | будто 15 | буду 16 | будут 17 | будь 18 | бы 19 | бывает 20 | бывь 21 | был 22 | была 23 | были 24 | было 25 | быть 26 | в 27 | важная 28 | важное 29 | важные 30 | важный 31 | вам 32 | вами 33 | вас 34 | ваш 35 | ваша 36 | ваше 37 | ваши 38 | вверх 39 | вдали 40 | вдруг 41 | ведь 42 | везде 43 | вернуться 44 | весь 45 | вечер 46 | взгляд 47 | взять 48 | вид 49 | видел 50 | видеть 51 | вместе 52 | вне 53 | вниз 54 | внизу 55 | во 56 | вода 57 | война 58 | вокруг 59 | вон 60 | вообще 61 | вопрос 62 | восемнадцатый 63 | восемнадцать 64 | восемь 65 | восьмой 66 | вот 67 | впрочем 68 | времени 69 | время 70 | все 71 | все еще 72 | всегда 73 | всего 74 | всем 75 | всеми 76 | всему 77 | всех 78 | всею 79 | всю 80 | всюду 81 | вся 82 | всё 83 | второй 84 | вы 85 | выйти 86 | г 87 | где 88 | главный 89 | глаз 90 | говорил 91 | говорит 92 | говорить 93 | год 94 | года 95 | году 96 | голова 97 | голос 98 | город 99 | да 100 | давать 101 | давно 102 | даже 103 | далекий 104 | далеко 105 | дальше 106 | даром 107 | дать 108 | два 109 | двадцатый 110 | двадцать 111 | две 112 | двенадцатый 113 | двенадцать 114 | дверь 115 | двух 116 | девятнадцатый 117 | девятнадцать 118 | девятый 119 | девять 120 | действительно 121 | дел 122 | делал 123 | делать 124 | делаю 125 | дело 126 | день 127 | деньги 128 | десятый 129 | десять 130 | для 131 | до 132 | довольно 133 | долго 134 | должен 135 | должно 136 | должный 137 | дом 138 | дорога 139 | друг 140 | другая 141 | другие 142 | 
других 143 | друго 144 | другое 145 | другой 146 | думать 147 | душа 148 | е 149 | его 150 | ее 151 | ей 152 | ему 153 | если 154 | есть 155 | еще 156 | ещё 157 | ею 158 | её 159 | ж 160 | ждать 161 | же 162 | жена 163 | женщина 164 | жизнь 165 | жить 166 | за 167 | занят 168 | занята 169 | занято 170 | заняты 171 | затем 172 | зато 173 | зачем 174 | здесь 175 | земля 176 | знать 177 | значит 178 | значить 179 | и 180 | иди 181 | идти 182 | из 183 | или 184 | им 185 | имеет 186 | имел 187 | именно 188 | иметь 189 | ими 190 | имя 191 | иногда 192 | их 193 | к 194 | каждая 195 | каждое 196 | каждые 197 | каждый 198 | кажется 199 | казаться 200 | как 201 | какая 202 | какой 203 | кем 204 | книга 205 | когда 206 | кого 207 | ком 208 | комната 209 | кому 210 | конец 211 | конечно 212 | которая 213 | которого 214 | которой 215 | которые 216 | который 217 | которых 218 | кроме 219 | кругом 220 | кто 221 | куда 222 | лежать 223 | лет 224 | ли 225 | лицо 226 | лишь 227 | лучше 228 | любить 229 | люди 230 | м 231 | маленький 232 | мало 233 | мать 234 | машина 235 | между 236 | меля 237 | менее 238 | меньше 239 | меня 240 | место 241 | миллионов 242 | мимо 243 | минута 244 | мир 245 | мира 246 | мне 247 | много 248 | многочисленная 249 | многочисленное 250 | многочисленные 251 | многочисленный 252 | мной 253 | мною 254 | мог 255 | могу 256 | могут 257 | мож 258 | может 259 | может быть 260 | можно 261 | можхо 262 | мои 263 | мой 264 | мор 265 | москва 266 | мочь 267 | моя 268 | моё 269 | мы 270 | на 271 | наверху 272 | над 273 | надо 274 | назад 275 | наиболее 276 | найти 277 | наконец 278 | нам 279 | нами 280 | народ 281 | нас 282 | начала 283 | начать 284 | наш 285 | наша 286 | наше 287 | наши 288 | не 289 | него 290 | недавно 291 | недалеко 292 | нее 293 | ней 294 | некоторый 295 | нельзя 296 | нем 297 | немного 298 | нему 299 | непрерывно 300 | нередко 301 | несколько 302 | нет 303 | нею 304 | неё 305 | ни 306 | нибудь 307 | ниже 308 | низко 309 | никакой 310 | никогда 
311 | никто 312 | никуда 313 | ним 314 | ними 315 | них 316 | ничего 317 | ничто 318 | но 319 | новый 320 | нога 321 | ночь 322 | ну 323 | нужно 324 | нужный 325 | нх 326 | о 327 | об 328 | оба 329 | обычно 330 | один 331 | одиннадцатый 332 | одиннадцать 333 | однажды 334 | однако 335 | одного 336 | одной 337 | оказаться 338 | окно 339 | около 340 | он 341 | она 342 | они 343 | оно 344 | опять 345 | особенно 346 | остаться 347 | от 348 | ответить 349 | отец 350 | откуда 351 | отовсюду 352 | отсюда 353 | очень 354 | первый 355 | перед 356 | писать 357 | плечо 358 | по 359 | под 360 | подойди 361 | подумать 362 | пожалуйста 363 | позже 364 | пойти 365 | пока 366 | пол 367 | получить 368 | помнить 369 | понимать 370 | понять 371 | пор 372 | пора 373 | после 374 | последний 375 | посмотреть 376 | посреди 377 | потом 378 | потому 379 | почему 380 | почти 381 | правда 382 | прекрасно 383 | при 384 | про 385 | просто 386 | против 387 | процентов 388 | путь 389 | пятнадцатый 390 | пятнадцать 391 | пятый 392 | пять 393 | работа 394 | работать 395 | раз 396 | разве 397 | рано 398 | раньше 399 | ребенок 400 | решить 401 | россия 402 | рука 403 | русский 404 | ряд 405 | рядом 406 | с 407 | с кем 408 | сам 409 | сама 410 | сами 411 | самим 412 | самими 413 | самих 414 | само 415 | самого 416 | самой 417 | самом 418 | самому 419 | саму 420 | самый 421 | свет 422 | свое 423 | своего 424 | своей 425 | свои 426 | своих 427 | свой 428 | свою 429 | сделать 430 | сеаой 431 | себе 432 | себя 433 | сегодня 434 | седьмой 435 | сейчас 436 | семнадцатый 437 | семнадцать 438 | семь 439 | сидеть 440 | сила 441 | сих 442 | сказал 443 | сказала 444 | сказать 445 | сколько 446 | слишком 447 | слово 448 | случай 449 | смотреть 450 | сначала 451 | снова 452 | со 453 | собой 454 | собою 455 | советский 456 | совсем 457 | спасибо 458 | спросить 459 | сразу 460 | стал 461 | старый 462 | стать 463 | стол 464 | сторона 465 | стоять 466 | страна 467 | суть 468 | считать 469 | т 470 | та 471 | так 472 | 
такая 473 | также 474 | таки 475 | такие 476 | такое 477 | такой 478 | там 479 | твои 480 | твой 481 | твоя 482 | твоё 483 | те 484 | тебе 485 | тебя 486 | тем 487 | теми 488 | теперь 489 | тех 490 | то 491 | тобой 492 | тобою 493 | товарищ 494 | тогда 495 | того 496 | тоже 497 | только 498 | том 499 | тому 500 | тот 501 | тою 502 | третий 503 | три 504 | тринадцатый 505 | тринадцать 506 | ту 507 | туда 508 | тут 509 | ты 510 | тысяч 511 | у 512 | увидеть 513 | уж 514 | уже 515 | улица 516 | уметь 517 | утро 518 | хороший 519 | хорошо 520 | хотел бы 521 | хотеть 522 | хоть 523 | хотя 524 | хочешь 525 | час 526 | часто 527 | часть 528 | чаще 529 | чего 530 | человек 531 | чем 532 | чему 533 | через 534 | четвертый 535 | четыре 536 | четырнадцатый 537 | четырнадцать 538 | что 539 | чтоб 540 | чтобы 541 | чуть 542 | шестнадцатый 543 | шестнадцать 544 | шестой 545 | шесть 546 | эта 547 | эти 548 | этим 549 | этими 550 | этих 551 | это 552 | этого 553 | этой 554 | этом 555 | этому 556 | этот 557 | эту 558 | я 559 | являюсь -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/sorani.txt: -------------------------------------------------------------------------------- 1 | ئێمە 2 | ئێوە 3 | ئەم 4 | ئەو 5 | ئەوان 6 | ئەوەی 7 | بۆ 8 | بێ 9 | بێجگە 10 | بە 11 | بەبێ 12 | بەدەم 13 | بەردەم 14 | بەرلە 15 | بەرەوی 16 | بەرەوە 17 | بەلای 18 | بەپێی 19 | تۆ 20 | تێ 21 | جگە 22 | دوای 23 | دوو 24 | دە 25 | دەکات 26 | دەگەڵ 27 | سەر 28 | لێ 29 | لە 30 | لەبابەت 31 | لەباتی 32 | لەبارەی 33 | لەبرێتی 34 | لەبن 35 | لەبەر 36 | لەبەینی 37 | لەدەم 38 | لەرێ 39 | لەرێگا 40 | لەرەوی 41 | لەسەر 42 | لەلایەن 43 | لەناو 44 | لەنێو 45 | لەو 46 | لەپێناوی 47 | لەژێر 48 | لەگەڵ 49 | من 50 | ناو 51 | نێوان 52 | هەر 53 | هەروەها 54 | و 55 | وەک 56 | پاش 57 | پێ 58 | پێش 59 | چەند 60 | کرد 61 | کە 62 | ی -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/swedish.txt: 
-------------------------------------------------------------------------------- 1 | aderton 2 | adertonde 3 | adjö 4 | aldrig 5 | alla 6 | allas 7 | allt 8 | alltid 9 | alltså 10 | andra 11 | andras 12 | annan 13 | annat 14 | artonde 15 | artonn 16 | att 17 | av 18 | bakom 19 | bara 20 | behöva 21 | behövas 22 | behövde 23 | behövt 24 | beslut 25 | beslutat 26 | beslutit 27 | bland 28 | blev 29 | bli 30 | blir 31 | blivit 32 | bort 33 | borta 34 | bra 35 | bäst 36 | bättre 37 | båda 38 | bådas 39 | dag 40 | dagar 41 | dagarna 42 | dagen 43 | de 44 | del 45 | delen 46 | dem 47 | den 48 | denna 49 | deras 50 | dess 51 | dessa 52 | det 53 | detta 54 | dig 55 | din 56 | dina 57 | dit 58 | ditt 59 | dock 60 | dom 61 | du 62 | där 63 | därför 64 | då 65 | e 66 | efter 67 | eftersom 68 | ej 69 | elfte 70 | eller 71 | elva 72 | emot 73 | en 74 | enkel 75 | enkelt 76 | enkla 77 | enligt 78 | ens 79 | er 80 | era 81 | ers 82 | ert 83 | ett 84 | ettusen 85 | fanns 86 | fem 87 | femte 88 | femtio 89 | femtionde 90 | femton 91 | femtonde 92 | fick 93 | fin 94 | finnas 95 | finns 96 | fjorton 97 | fjortonde 98 | fjärde 99 | fler 100 | flera 101 | flesta 102 | fram 103 | framför 104 | från 105 | fyra 106 | fyrtio 107 | fyrtionde 108 | få 109 | får 110 | fått 111 | följande 112 | för 113 | före 114 | förlåt 115 | förra 116 | första 117 | genast 118 | genom 119 | gick 120 | gjorde 121 | gjort 122 | god 123 | goda 124 | godare 125 | godast 126 | gott 127 | gälla 128 | gäller 129 | gällt 130 | gärna 131 | gå 132 | går 133 | gått 134 | gör 135 | göra 136 | ha 137 | hade 138 | haft 139 | han 140 | hans 141 | har 142 | heller 143 | hellre 144 | helst 145 | helt 146 | henne 147 | hennes 148 | hit 149 | hon 150 | honom 151 | hundra 152 | hundraen 153 | hundraett 154 | hur 155 | här 156 | hög 157 | höger 158 | högre 159 | högst 160 | i 161 | ibland 162 | icke 163 | idag 164 | igen 165 | igår 166 | imorgon 167 | in 168 | inför 169 | inga 170 | ingen 171 | ingenting 172 | inget 173 | innan 
174 | inne 175 | inom 176 | inte 177 | inuti 178 | ja 179 | jag 180 | jo 181 | ju 182 | just 183 | jämfört 184 | kan 185 | kanske 186 | knappast 187 | kom 188 | komma 189 | kommer 190 | kommit 191 | kr 192 | kunde 193 | kunna 194 | kunnat 195 | kvar 196 | legat 197 | ligga 198 | ligger 199 | lika 200 | likställd 201 | likställda 202 | lilla 203 | lite 204 | liten 205 | litet 206 | länge 207 | längre 208 | längst 209 | lätt 210 | lättare 211 | lättast 212 | långsam 213 | långsammare 214 | långsammast 215 | långsamt 216 | långt 217 | låt 218 | man 219 | med 220 | mej 221 | mellan 222 | men 223 | mer 224 | mera 225 | mest 226 | mig 227 | min 228 | mina 229 | mindre 230 | minst 231 | mitt 232 | mittemot 233 | mot 234 | mycket 235 | många 236 | måste 237 | möjlig 238 | möjligen 239 | möjligt 240 | möjligtvis 241 | ned 242 | nederst 243 | nedersta 244 | nedre 245 | nej 246 | ner 247 | ni 248 | nio 249 | nionde 250 | nittio 251 | nittionde 252 | nitton 253 | nittonde 254 | nog 255 | noll 256 | nr 257 | nu 258 | nummer 259 | när 260 | nästa 261 | någon 262 | någonting 263 | något 264 | några 265 | nån 266 | nånting 267 | nåt 268 | nödvändig 269 | nödvändiga 270 | nödvändigt 271 | nödvändigtvis 272 | och 273 | också 274 | ofta 275 | oftast 276 | olika 277 | olikt 278 | om 279 | oss 280 | på 281 | rakt 282 | redan 283 | rätt 284 | sa 285 | sade 286 | sagt 287 | samma 288 | sedan 289 | senare 290 | senast 291 | sent 292 | sex 293 | sextio 294 | sextionde 295 | sexton 296 | sextonde 297 | sig 298 | sin 299 | sina 300 | sist 301 | sista 302 | siste 303 | sitt 304 | sitta 305 | sju 306 | sjunde 307 | sjuttio 308 | sjuttionde 309 | sjutton 310 | sjuttonde 311 | själv 312 | sjätte 313 | ska 314 | skall 315 | skulle 316 | slutligen 317 | små 318 | smått 319 | snart 320 | som 321 | stor 322 | stora 323 | stort 324 | större 325 | störst 326 | säga 327 | säger 328 | sämre 329 | sämst 330 | så 331 | sådan 332 | sådana 333 | sådant 334 | ta 335 | tack 336 | tar 337 | tidig 338 | 
tidigare 339 | tidigast 340 | tidigt 341 | till 342 | tills 343 | tillsammans 344 | tio 345 | tionde 346 | tjugo 347 | tjugoen 348 | tjugoett 349 | tjugonde 350 | tjugotre 351 | tjugotvå 352 | tjungo 353 | tolfte 354 | tolv 355 | tre 356 | tredje 357 | trettio 358 | trettionde 359 | tretton 360 | trettonde 361 | två 362 | tvåhundra 363 | under 364 | upp 365 | ur 366 | ursäkt 367 | ut 368 | utan 369 | utanför 370 | ute 371 | va 372 | vad 373 | var 374 | vara 375 | varför 376 | varifrån 377 | varit 378 | varje 379 | varken 380 | vars 381 | varsågod 382 | vart 383 | vem 384 | vems 385 | verkligen 386 | vi 387 | vid 388 | vidare 389 | viktig 390 | viktigare 391 | viktigast 392 | viktigt 393 | vilka 394 | vilkas 395 | vilken 396 | vilket 397 | vill 398 | väl 399 | vänster 400 | vänstra 401 | värre 402 | vår 403 | våra 404 | vårt 405 | än 406 | ännu 407 | är 408 | även 409 | åt 410 | åtminstone 411 | åtta 412 | åttio 413 | åttionde 414 | åttonde 415 | över 416 | övermorgon 417 | överst 418 | övre -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/thai.txt: -------------------------------------------------------------------------------- 1 | กล่าว 2 | กว่า 3 | กัน 4 | กับ 5 | การ 6 | ก็ 7 | ก่อน 8 | ขณะ 9 | ขอ 10 | ของ 11 | ขึ้น 12 | คง 13 | ครั้ง 14 | ความ 15 | คือ 16 | จะ 17 | จัด 18 | จาก 19 | จึง 20 | ช่วง 21 | ซึ่ง 22 | ดัง 23 | ด้วย 24 | ด้าน 25 | ตั้ง 26 | ตั้งแต่ 27 | ตาม 28 | ต่อ 29 | ต่าง 30 | ต่างๆ 31 | ต้อง 32 | ถึง 33 | ถูก 34 | ถ้า 35 | ทั้ง 36 | ทั้งนี้ 37 | ทาง 38 | ทำ 39 | ทำให้ 40 | ที่ 41 | ที่สุด 42 | ทุก 43 | นอกจาก 44 | นัก 45 | นั้น 46 | นำ 47 | นี้ 48 | น่า 49 | บาง 50 | ผล 51 | ผ่าน 52 | พบ 53 | พร้อม 54 | มา 55 | มาก 56 | มี 57 | ยัง 58 | รวม 59 | ระหว่าง 60 | รับ 61 | ราย 62 | ร่วม 63 | ลง 64 | วัน 65 | ว่า 66 | สำหรับ 67 | สุด 68 | ส่ง 69 | ส่วน 70 | หนึ่ง 71 | หรือ 72 | หลัง 73 | หลังจาก 74 | หลาย 75 | หาก 76 | อยาก 77 | อยู่ 78 | อย่าง 79 | ออก 80 | อะไร 81 | อาจ 82 | อีก 83 | เขา 
84 | เข้า 85 | เคย 86 | เฉพาะ 87 | เช่น 88 | เดียว 89 | เดียวกัน 90 | เนื่องจาก 91 | เปิด 92 | เปิดเผย 93 | เป็น 94 | เป็นการ 95 | เพราะ 96 | เพื่อ 97 | เมื่อ 98 | เรา 99 | เริ่ม 100 | เลย 101 | เห็น 102 | เอง 103 | แต่ 104 | แบบ 105 | แรก 106 | และ 107 | แล้ว 108 | แห่ง 109 | โดย 110 | ใน 111 | ให้ 112 | ได้ 113 | ไป 114 | ไม่ 115 | ไว้ -------------------------------------------------------------------------------- /src/main/resources/nlp/stopwords/turkish.txt: -------------------------------------------------------------------------------- 1 | acaba 2 | acep 3 | adamakıllı 4 | adeta 5 | ait 6 | altmýþ 7 | altmış 8 | altý 9 | altı 10 | ama 11 | amma 12 | anca 13 | ancak 14 | arada 15 | artýk 16 | aslında 17 | aynen 18 | ayrıca 19 | az 20 | açıkça 21 | açıkçası 22 | bana 23 | bari 24 | bazen 25 | bazý 26 | bazı 27 | başkası 28 | baţka 29 | belki 30 | ben 31 | benden 32 | beni 33 | benim 34 | beri 35 | beriki 36 | beþ 37 | beş 38 | beţ 39 | bilcümle 40 | bile 41 | bin 42 | binaen 43 | binaenaleyh 44 | bir 45 | biraz 46 | birazdan 47 | birbiri 48 | birden 49 | birdenbire 50 | biri 51 | birice 52 | birileri 53 | birisi 54 | birkaç 55 | birkaçı 56 | birkez 57 | birlikte 58 | birçok 59 | birçoğu 60 | birþey 61 | birþeyi 62 | birşey 63 | birşeyi 64 | birţey 65 | bitevi 66 | biteviye 67 | bittabi 68 | biz 69 | bizatihi 70 | bizce 71 | bizcileyin 72 | bizden 73 | bize 74 | bizi 75 | bizim 76 | bizimki 77 | bizzat 78 | boşuna 79 | bu 80 | buna 81 | bunda 82 | bundan 83 | bunlar 84 | bunları 85 | bunların 86 | bunu 87 | bunun 88 | buracıkta 89 | burada 90 | buradan 91 | burası 92 | böyle 93 | böylece 94 | böylecene 95 | böylelikle 96 | böylemesine 97 | böylesine 98 | büsbütün 99 | bütün 100 | cuk 101 | cümlesi 102 | da 103 | daha 104 | dahi 105 | dahil 106 | dahilen 107 | daima 108 | dair 109 | dayanarak 110 | de 111 | defa 112 | dek 113 | demin 114 | demincek 115 | deminden 116 | denli 117 | derakap 118 | derhal 119 | derken 120 | deđil 121 | değil 122 | değin 123 | diye 
124 | diđer 125 | diğer 126 | diğeri 127 | doksan 128 | dokuz 129 | dolayı 130 | dolayısıyla 131 | doğru 132 | dört 133 | edecek 134 | eden 135 | ederek 136 | edilecek 137 | ediliyor 138 | edilmesi 139 | ediyor 140 | elbet 141 | elbette 142 | elli 143 | emme 144 | en 145 | enikonu 146 | epey 147 | epeyce 148 | epeyi 149 | esasen 150 | esnasında 151 | etmesi 152 | etraflı 153 | etraflıca 154 | etti 155 | ettiği 156 | ettiğini 157 | evleviyetle 158 | evvel 159 | evvela 160 | evvelce 161 | evvelden 162 | evvelemirde 163 | evveli 164 | eđer 165 | eğer 166 | fakat 167 | filanca 168 | gah 169 | gayet 170 | gayetle 171 | gayri 172 | gayrı 173 | gelgelelim 174 | gene 175 | gerek 176 | gerçi 177 | geçende 178 | geçenlerde 179 | gibi 180 | gibilerden 181 | gibisinden 182 | gine 183 | göre 184 | gırla 185 | hakeza 186 | halbuki 187 | halen 188 | halihazırda 189 | haliyle 190 | handiyse 191 | hangi 192 | hangisi 193 | hani 194 | hariç 195 | hasebiyle 196 | hasılı 197 | hatta 198 | hele 199 | hem 200 | henüz 201 | hep 202 | hepsi 203 | her 204 | herhangi 205 | herkes 206 | herkesin 207 | hiç 208 | hiçbir 209 | hiçbiri 210 | hoş 211 | hulasaten 212 | iken 213 | iki 214 | ila 215 | ile 216 | ilen 217 | ilgili 218 | ilk 219 | illa 220 | illaki 221 | imdi 222 | indinde 223 | inen 224 | insermi 225 | ise 226 | ister 227 | itibaren 228 | itibariyle 229 | itibarıyla 230 | iyi 231 | iyice 232 | iyicene 233 | için 234 | iş 235 | işte 236 | iţte 237 | kadar 238 | kaffesi 239 | kah 240 | kala 241 | kanýmca 242 | karşın 243 | katrilyon 244 | kaynak 245 | kaçı 246 | kelli 247 | kendi 248 | kendilerine 249 | kendini 250 | kendisi 251 | kendisine 252 | kendisini 253 | kere 254 | kez 255 | keza 256 | kezalik 257 | keşke 258 | keţke 259 | ki 260 | kim 261 | kimden 262 | kime 263 | kimi 264 | kimisi 265 | kimse 266 | kimsecik 267 | kimsecikler 268 | külliyen 269 | kýrk 270 | kýsaca 271 | kırk 272 | kısaca 273 | lakin 274 | leh 275 | lütfen 276 | maada 277 | madem 278 | mademki 279 | mamafih 280 
| mebni 281 | međer 282 | meğer 283 | meğerki 284 | meğerse 285 | milyar 286 | milyon 287 | mu 288 | mü 289 | mý 290 | mı 291 | nasýl 292 | nasıl 293 | nasılsa 294 | nazaran 295 | naşi 296 | ne 297 | neden 298 | nedeniyle 299 | nedenle 300 | nedense 301 | nerde 302 | nerden 303 | nerdeyse 304 | nere 305 | nerede 306 | nereden 307 | neredeyse 308 | neresi 309 | nereye 310 | netekim 311 | neye 312 | neyi 313 | neyse 314 | nice 315 | nihayet 316 | nihayetinde 317 | nitekim 318 | niye 319 | niçin 320 | o 321 | olan 322 | olarak 323 | oldu 324 | olduklarını 325 | oldukça 326 | olduğu 327 | olduğunu 328 | olmadı 329 | olmadığı 330 | olmak 331 | olması 332 | olmayan 333 | olmaz 334 | olsa 335 | olsun 336 | olup 337 | olur 338 | olursa 339 | oluyor 340 | on 341 | ona 342 | onca 343 | onculayın 344 | onda 345 | ondan 346 | onlar 347 | onlardan 348 | onlari 349 | onlarýn 350 | onları 351 | onların 352 | onu 353 | onun 354 | oracık 355 | oracıkta 356 | orada 357 | oradan 358 | oranca 359 | oranla 360 | oraya 361 | otuz 362 | oysa 363 | oysaki 364 | pek 365 | pekala 366 | peki 367 | pekçe 368 | peyderpey 369 | rağmen 370 | sadece 371 | sahi 372 | sahiden 373 | sana 374 | sanki 375 | sekiz 376 | seksen 377 | sen 378 | senden 379 | seni 380 | senin 381 | siz 382 | sizden 383 | sizi 384 | sizin 385 | sonra 386 | sonradan 387 | sonraları 388 | sonunda 389 | tabii 390 | tam 391 | tamam 392 | tamamen 393 | tamamıyla 394 | tarafından 395 | tek 396 | trilyon 397 | tüm 398 | var 399 | vardı 400 | vasıtasıyla 401 | ve 402 | velev 403 | velhasıl 404 | velhasılıkelam 405 | veya 406 | veyahut 407 | ya 408 | yahut 409 | yakinen 410 | yakında 411 | yakından 412 | yakınlarda 413 | yalnız 414 | yalnızca 415 | yani 416 | yapacak 417 | yapmak 418 | yaptı 419 | yaptıkları 420 | yaptığı 421 | yaptığını 422 | yapılan 423 | yapılması 424 | yapıyor 425 | yedi 426 | yeniden 427 | yenilerde 428 | yerine 429 | yetmiþ 430 | yetmiş 431 | yetmiţ 432 | yine 433 | yirmi 434 | yok 435 | yoksa 436 | yoluyla 
437 | yüz 438 | yüzünden 439 | zarfında 440 | zaten 441 | zati 442 | zira 443 | çabuk 444 | çabukça 445 | çeşitli 446 | çok 447 | çokları 448 | çoklarınca 449 | çokluk 450 | çoklukla 451 | çokça 452 | çoğu 453 | çoğun 454 | çoğunca 455 | çoğunlukla 456 | çünkü 457 | öbür 458 | öbürkü 459 | öbürü 460 | önce 461 | önceden 462 | önceleri 463 | öncelikle 464 | öteki 465 | ötekisi 466 | öyle 467 | öylece 468 | öylelikle 469 | öylemesine 470 | öz 471 | üzere 472 | üç 473 | þey 474 | þeyden 475 | þeyi 476 | þeyler 477 | þu 478 | þuna 479 | þunda 480 | þundan 481 | þunu 482 | şayet 483 | şey 484 | şeyden 485 | şeyi 486 | şeyler 487 | şu 488 | şuna 489 | şuncacık 490 | şunda 491 | şundan 492 | şunlar 493 | şunları 494 | şunu 495 | şunun 496 | şura 497 | şuracık 498 | şuracıkta 499 | şurası 500 | şöyle 501 | ţayet 502 | ţimdi 503 | ţu 504 | ţöyle
--------------------------------------------------------------------------------
/src/test/kotlin/be/rlab/nlp/NormalizerTest.kt:
--------------------------------------------------------------------------------
package be.rlab.nlp

import be.rlab.nlp.model.Language
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Test

/** Exercises the [Normalizer] pipeline on a Spanish paragraph. */
class NormalizerTest {
    /**
     * Enables stemming, diacritics removal, punctuation removal, stop word removal and
     * case-insensitive mode, then compares the normalized text against the expected
     * space-separated stems.
     */
    @Test
    fun normalize() {
        val normalizer = Normalizer.new(
            "Era fácil compartir cuando había comida suficiente, o apenas la suficiente, para seguir viviendo. " +
                "¿Pero cuando no la había? Entonces entraba en juego la fuerza; la fuerza se convertía en derecho; en " +
                "poder, y la herramienta del poder era la violencia, y su aliado más devoto, el ojo que no quiere ver",
            language = Language.SPANISH
        ).applyStemming()
            .removeDiacritics()
            .removePunctuation()
            .removeStopWords()
            .caseInsensitive()
        val result = normalizer.normalize()

        // JUnit's assertEquals always runs (Kotlin's assert is skipped unless the JVM is started
        // with -ea) and prints both strings when they differ.
        assertEquals(
            "facil compart com suficient suficient segu viv entrab jueg fuerz " +
                "fuerz converti derech herramient violenci ali devot ojo",
            result
        )
    }
}

--------------------------------------------------------------------------------
/src/test/kotlin/be/rlab/nlp/UpdateStopWordsTest.kt:
--------------------------------------------------------------------------------
package be.rlab.nlp

import be.rlab.nlp.model.Language
import org.junit.jupiter.api.Disabled
import org.junit.jupiter.api.Test
import java.io.File
import java.io.FileNotFoundException
import java.net.URI

/** This test downloads all stop words files from the stopwords-iso repository and updates
 * the files in `nlp/stopwords/`. It will not fail if a language is not supported by the
 * stopwords-iso project.
 *
 * Look at the github repository for further information: https://github.com/stopwords-iso/stopwords-iso
 */
@Disabled
class UpdateStopWordsTest {
    companion object {
        /** URL template; every `{code}` placeholder is replaced by the language code. */
        const val DOWNLOAD_URL: String =
            "https://raw.githubusercontent.com/stopwords-iso/stopwords-{code}/master/stopwords-{code}.txt"
    }

    /**
     * Downloads the stop word list of every [Language] and overwrites the matching
     * `src/main/resources/nlp/stopwords/<language name in lowercase>.txt` file.
     *
     * A [FileNotFoundException] raised for a language is reported on standard output and
     * the loop moves on to the next language.
     */
    @Test
    fun update() {
        Language.entries.forEach { language ->
            try {
                println("updating stop words for language: $language")
                val url = URI.create(DOWNLOAD_URL.replace("{code}", language.code)).toURL()
                // `use` closes the reader (and the underlying connection stream) even if reading fails.
                val data = url.openStream().bufferedReader().use { reader -> reader.readText() }
                val file = File("src/main/resources/nlp/stopwords/${language.name.lowercase()}.txt")
                file.writeText(data)
            } catch (cause: FileNotFoundException) {
                println("stop words not supported for language: $language")
            }
        }
    }
}

--------------------------------------------------------------------------------
/src/test/kotlin/be/rlab/search/Book.kt:
--------------------------------------------------------------------------------
package be.rlab.search

import be.rlab.search.annotation.IndexDocument
import be.rlab.search.annotation.IndexField
import be.rlab.search.annotation.IndexFieldType
import be.rlab.search.model.BoolValue
import be.rlab.search.model.FieldType

/**
 * Annotated document used by the tests; it is indexed in the [IndexManagerTest.NAMESPACE] namespace.
 *
 * Each property carries the [IndexField] / [IndexFieldType] configuration under test:
 * [title] is indexed but not stored, [genre] enables doc values, [genre] and [categories]
 * are forced to [FieldType.TEXT], and [hash] and [rate] are stored.
 */
@IndexDocument(namespace = IndexManagerTest.NAMESPACE)
data class Book(
    @IndexField val id: String,
    @IndexField(store = BoolValue.NO, index = BoolValue.YES) val title: String?,
    @IndexField val description: String,
    @IndexField(docValues = true) @IndexFieldType(FieldType.TEXT) val genre: String,
    @IndexField @IndexFieldType(FieldType.TEXT) val categories: List<String>,
    @IndexField val author: String,
    @IndexField(store = BoolValue.YES) val hash: Int,
    @IndexField(store = BoolValue.YES, index = BoolValue.YES) val rate: Float
)

--------------------------------------------------------------------------------
/src/test/kotlin/be/rlab/search/IndexManagerTest.kt:
--------------------------------------------------------------------------------
package be.rlab.search

import be.rlab.nlp.model.Language
import be.rlab.search.mock.TestBook
import be.rlab.search.model.TypedSearchResult
import be.rlab.search.query.sortBy
import be.rlab.search.query.term
import be.rlab.support.SearchTestUtils.firstWord
import org.apache.lucene.search.similarities.BM25Similarity
import org.junit.jupiter.api.AfterEach
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Assertions.assertTrue
import org.junit.jupiter.api.BeforeEach
import org.junit.jupiter.api.Test
import java.io.File
import java.util.*

/**
 * Tests for [IndexManager] (schema-based indexing and search) and [IndexMapper]
 * (annotation-based indexing and search of [Book] instances).
 *
 * Every test runs against a fresh index stored under `./data/test-index`.
 */
class IndexManagerTest {
    companion object {
        const val NAMESPACE: String = "TestNamespace"
        const val FIELD_ID: String = "uuid"
        const val FIELD_HASH: String = "hash"
        const val FIELD_TITLE: String = "title"
        const val FIELD_DESCRIPTION: String = "description"
        const val FIELD_CATEGORY: String = "category"
        const val FIELD_AUTHOR_NAME: String = "author_name"
    }

    private val indexDir: File = File("./data/test-index")
    private lateinit var indexManager: IndexManager

    /** Deletes any previous index directory and builds a new [IndexManager] for all languages. */
    @BeforeEach
    fun setUp() {
        indexDir.deleteRecursively()
        indexManager = IndexManager.Builder(indexDir.absolutePath)
            .forLanguages(Language.entries)
            .withSimilarity(BM25Similarity())
            .build()
    }

    /** Closes the [IndexManager] created in [setUp]. */
    @AfterEach
    fun tearDown() {
        indexManager.close()
    }

    /**
     * Registers an explicit schema, indexes a single Spanish document and runs three term queries:
     * a term without field, a title term combined with "drama" restricted to description and
     * category, and "drama" restricted to the description (which does not contain that word).
     */
    @Test
    fun terms() {
        indexManager.addSchema(NAMESPACE) {
            string(FIELD_ID) {
                docValues()
            }
            int(FIELD_HASH) {
                index()
                store()
            }
            text(FIELD_TITLE)
            text(FIELD_DESCRIPTION)
            text(FIELD_AUTHOR_NAME)
            text(FIELD_CATEGORY) {
                docValues()
            }
        }

        indexManager.index(NAMESPACE, Language.SPANISH) {
            string(FIELD_ID, UUID.randomUUID().toString())
            int(FIELD_HASH, 1234)
            field(FIELD_TITLE, "Memorias del subsuelo")
            text(FIELD_DESCRIPTION, "Antihéroes de su ingente producción novelística") {
                store()
            }
            text(FIELD_AUTHOR_NAME, "Fiódor Dostoyevski") {
                store()
            }
            listOf("Drama", "Filosófico", "Psicológico").forEach { category ->
                text(FIELD_CATEGORY, category) {
                    store()
                }
            }
        }
        indexManager.sync()

        val results1 = indexManager.search(NAMESPACE, Language.SPANISH) {
            term("Memorias")
            sortBy(FIELD_ID)
        }
        val results2 = indexManager.search(NAMESPACE, Language.SPANISH) {
            term(FIELD_TITLE, "Memorias")
            term("drama") {
                by(FIELD_DESCRIPTION, FIELD_CATEGORY)
            }
        }
        val results3 = indexManager.search(NAMESPACE, Language.SPANISH) {
            term("drama") {
                by(FIELD_DESCRIPTION)
            }
        }

        // JUnit assertions always run (Kotlin's assert needs -ea) and report the actual size on failure.
        assertEquals(1, results1.docs.size)
        assertEquals(1, results2.docs.size)
        assertTrue(results3.docs.isEmpty())
    }

    /**
     * Indexes ten generated [Book]s through [IndexMapper] and checks that searching by a word of
     * a description, by a known title, and by a title plus a known category each return documents.
     */
    @Test
    fun mapper() {
        val books = Array(10) { TestBook().new() }
        val mapper = IndexMapper(indexManager)

        // Pin two books to known values so the queries below have deterministic targets.
        books[4] = books[4].copy(title = "memorias")
        books[8] = books[8].copy(title = "subsuelo", categories = listOf("drama"))
        books.forEach { book -> mapper.index(book, Language.ENGLISH) }
        indexManager.sync()

        val results1: TypedSearchResult<Book> = mapper.search(Language.ENGLISH) {
            term(firstWord(books[2].description))
            sortBy(Book::genre)
        }
        val results2: TypedSearchResult<Book> = mapper.search(Language.ENGLISH) {
            term(Book::title, books[4].title!!)
        }
        val results3: TypedSearchResult<Book> = mapper.search(Language.ENGLISH) {
            term(FIELD_TITLE, books[8].title!!)
            term(books[8].categories.first())
        }

        assertTrue(results1.docs.isNotEmpty())
        assertTrue(results2.docs.isNotEmpty())
        assertTrue(results3.docs.isNotEmpty())
    }
}

--------------------------------------------------------------------------------
/src/test/kotlin/be/rlab/search/mock/TestBook.kt:
--------------------------------------------------------------------------------
package be.rlab.search.mock

import be.rlab.search.Book
import io.github.serpro69.kfaker.faker
import java.util.UUID

/** Shared fake-data generator used for the default values of [TestBook]. */
val faker = faker {}

/**
 * Test data builder for [Book]. Every property defaults to a generated value, so a test only
 * needs to override the properties it cares about before calling [new].
 */
data class TestBook(
    val id: String = UUID.randomUUID().toString(),
    val title: String = faker.book.title(),
    val description: String = faker.bojackHorseman.quotes(),
    val genre: String = faker.book.genre(),
    val categories: List<String> = listOf(faker.adjective.positive(), faker.adjective.positive()),
    val author: String = faker.book.author(),
    val hash: Int = faker.random.nextInt(),
    val rate: Float = faker.random.nextFloat()
) {
    /** Creates a [Book] that copies every property of this builder. */
    fun new(): Book = Book(
        id = id,
        title = title,
        description = description,
        genre = genre,
        categories = categories,
        author = author,
        hash = hash,
        rate = rate
    )
}

--------------------------------------------------------------------------------
/src/test/kotlin/be/rlab/support/SearchTestUtils.kt:
--------------------------------------------------------------------------------
package be.rlab.support

import be.rlab.nlp.Normalizer
import be.rlab.nlp.model.Language

/** Helpers that pick a search term out of free text for the search tests. */
object SearchTestUtils {
    /** A word is only eligible when its length is greater than this value. */
    private const val MIN_WORD_LENGTH: Int = 3

    /**
     * Returns the first word of [text] longer than three characters, after normalizing the text
     * for [language] without stemming and with stop words removed.
     *
     * @throws NoSuchElementException if no word qualifies.
     */
    fun firstWord(text: String, language: Language = Language.ENGLISH): String =
        eligibleWords(text, language).first()

    /**
     * Returns the last word of [text] longer than three characters, after normalizing the text
     * for [language] without stemming and with stop words removed.
     *
     * @throws NoSuchElementException if no word qualifies.
     */
    fun lastWord(text: String, language: Language = Language.ENGLISH): String =
        eligibleWords(text, language).last()

    /** Normalizes [text] and returns, in order, the space-separated words longer than three characters. */
    private fun eligibleWords(text: String, language: Language): List<String> =
        Normalizer.new(text, language)
            .skipStemming()
            .removeStopWords()
            .normalize()
            .split(" ")
            .filter { word -> word.length > MIN_WORD_LENGTH }
}

--------------------------------------------------------------------------------