├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── tokenmill-logo.svg ├── .gitignore ├── .gitlab-ci.yml ├── CHANGELOG ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── charts ├── mt-avg-per-doc.png ├── mt-min-max-per-doc.png ├── mt-throughput-per-sec.png ├── mt-total.png ├── st-avg-per-doc.png ├── st-min-max-per-doc.png └── st-throughput-per-sec.png ├── classes └── lt │ └── tokenmill │ └── beagle │ └── phrases │ ├── Annotation.class │ ├── Annotator.class │ └── DictionaryEntry.class ├── deps.edn ├── pom.xml ├── src └── beagle │ ├── annotation_merger.clj │ ├── dictionary_optimizer.clj │ ├── java │ ├── annotation.clj │ └── java.clj │ ├── lucene_alpha.clj │ ├── monitor.clj │ ├── phrases.clj │ ├── readers.clj │ ├── schema.clj │ ├── text_analysis.clj │ └── validator.clj └── test ├── beagle ├── annotation_merge_test.clj ├── corner_case_phrases_test.clj ├── dictionary_optimization_test.clj ├── java_test.clj ├── lucene_alpha_test.clj ├── optimization_suggestions_test.clj ├── phrases_test.clj ├── readers_test.clj ├── text_analysis_test.clj └── validator_test.clj └── resources ├── dict.csv ├── dict.edn ├── dict.json ├── logback.xml └── phrases.html /.gitattributes: -------------------------------------------------------------------------------- 1 | test/resources/phrases.html linguist-vendored=false 2 | test/resources/phrases.html linguist-detectable=false -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 
17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/tokenmill-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pom.xml.asc 2 | *.jar 3 | *.class 4 | /lib/ 5 | /classes/ 6 | /target/ 7 | /checkouts/ 8 | .lein-deps-sum 9 | .lein-repl-history 10 | .lein-plugins/ 11 | .lein-failures 12 | .nrepl-port 13 | .cpcache/ 14 | target/* 15 | .idea 16 | *.iml 17 | .env 18 | *.json 19 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - test 3 | 4 | variables: 5 | GIT_DEPTH: 3 6 | 7 | cache: 8 | key: one-key-to-rule-them-all 9 | paths: 10 | - ./.m2/repository 11 | - ./.gitlibs 12 | 13 | lint: 14 | stage: test 15 | image: borkdude/clj-kondo 16 | cache: {} 17 | when: always 18 | script: 19 | - clj-kondo --lint src test --config '{:output {:exclude-files ["java"]}}' 20 | 21 | unit-test: 22 | stage: test 23 | when: always 24 | image: clojure:tools-deps-alpine 25 | script: 26 | - export GITLIBS=".gitlibs/" 27 | - clojure -Sdeps '{:mvn/local-repo "./.m2/repository"}' -A:test 28 | 29 | validate-sample-dictionaries: 30 | stage: test 31 | when: always 32 | image: clojure:tools-deps-alpine 33 | script: 34 | - export GITLIBS=".gitlibs/" 35 | - > 36 | clojure -Sdeps '{:mvn/local-repo "./.m2/repository"}' -m beagle.validator 37 | test/resources/dict.csv csv 38 | test/resources/dict.json json 39 | test/resources/dict.edn edn 40 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will 
be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/). 3 | 4 | ## 0.9.0-SNAPSHOT - 2019-10-13 5 | ### Added 6 | - Ensuring ordering for phrases with slop 7 | 8 | ## 0.4.0-SNAPSHOT - 2019-10-12 9 | ### Added 10 | - Tokenizer can be specified for every dictionary entry 11 | - Java Interface accepts tokenizer string 12 | - Highlighter options support for text analysis options 13 | ### Changed 14 | - Use MultiPhraseQuery instead of PhraseQuery internally 15 | 16 | ## 0.3.1 - 2019-10-03 17 | ### Fixed 18 | - Java interface for phrase highlighting 19 | 20 | ## 0.3.0 - 2019-09-24 21 | ### Added 22 | - Performance optimizations 23 | ### Changed 24 | - Refactored code towards batch document highlighting 25 | 26 | ## 0.2.0 - 2019-09-24 27 | ### Added 28 | - Alpha version for Lucene query support 29 | 30 | ## 0.1.7 - 2019-09-20 31 | ### Added 32 | - Deployment to Maven Central 33 | 34 | ## 0.1.6 - 2019-09-19 35 | ### Added 36 | - Added Java interface 37 | ### Fixed 38 | - Concurrent usage 39 | 40 | ## 0.1.5 - 2019-09-16 41 | ### Fixed 42 | - Handling of cases when text or phrases are tokenized to 0 tokens 43 | 44 | ## 0.1.4 - 2019-09-10 45 | ### Added 46 | - Phrase slop support 47 | 48 | ## 0.1.3 - 2019-09-04 49 | ### Added 50 | - Use one Lucene Monitor in total 51 | 52 | ## 0.1.2 - 2019-09-03 53 | ### Added 54 | - Support for stemming for multiple languages 55 | 56 | ## 0.1.1 - 2019-08-26 57 | ### Added 58 | - Initial release 59 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 
body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. 
Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at info@tokenmill.lt. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | First off, thanks for taking the time to contribute! 2 | 3 | The following is a set of guidelines for contributing to Beagle which is hosted at https://github.com/tokenmill/beagle. These are just guidelines, not rules, use your best judgment and feel free to propose changes to this document in a pull request. 
4 | 5 | This project adheres to the Contributor Covenant code of conduct. By participating, you are expected to uphold this code. Please report unacceptable behavior to info@tokenmill.lt. 6 | Issues & Pull requests 7 | 8 | Issues and Pull requests welcome! 9 | 10 | We do ask that before submitting a pull request you open an issue tracking the bug or enhancement you'd like to fix or submit. This makes it easier to discuss changes in the abstract, before focusing on a particular solution. 11 | 12 | Furthermore, please be diligent about submitting pull requests which only make one essential change at a time. While formatting changes and code cleanups are welcome, they should be separate from features and a pull request should only introduce one logical feature at a time. When adding new features, please ensure there are accompanying tests. 13 | 14 | Commit Messages 15 | 16 | Commit messages should be well formed, according to the guidelines outlined by Tim Pope: http://karma-runner.github.io/4.0/dev/git-commit-msg.html 17 | 18 | When fixing an existing issue, add - fixes #xxx somewhere in the commit message: this has the dual purpose of closing the issue when your patch is merged to master as well as automatically providing a link to the related issue. 19 | 20 | Change Log 21 | 22 | Pull requests are required to update the changelog. Changelog entries should mention and link to any issues or tickets involved in the change, and should provide a short summary description of the particular changes of the patch. 23 | 24 | Include the issue number (#xxx) which will link back to the originating issue in Github. Commentary on the change should appear as a nested, unordered list. 25 | 26 | Whitespace & Linting 27 | 28 | Beagle is maintained with fairly strict whitespace and style standards. 29 | 30 | Gitlab CI jobs will fail if the clj-kondo rules are violated, or the source format doesn't match the default cljfmt style guidelines. 
Hence, patches must be formatted and whitespace linted before they will be accepted. 31 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oracle/graalvm-ce:19.2.0.1 as builder 2 | RUN gu install native-image 3 | 4 | ENV GRAALVM_HOME=$JAVA_HOME 5 | 6 | RUN curl -O https://download.clojure.org/install/linux-install-1.10.1.469.sh 7 | RUN chmod +x linux-install-1.10.1.469.sh 8 | RUN ./linux-install-1.10.1.469.sh 9 | 10 | RUN mkdir -p /usr/src/app 11 | WORKDIR /usr/src/app 12 | 13 | COPY deps.edn /usr/src/app/ 14 | RUN clojure -R:native-image 15 | COPY . /usr/src/app 16 | 17 | RUN clojure -A:native-image 18 | 19 | RUN chmod 755 dictionary-validator 20 | 21 | FROM alpine:3.9.4 as validator 22 | 23 | WORKDIR /opt 24 | COPY --from=builder /usr/src/app/dictionary-validator /usr/local/bin/dictionary-validator 25 | 26 | CMD ["dictionary-validator"] 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Tokenmill, UAB 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | lint-code: 2 | clojure -M:clj-kondo --config '{:output {:exclude-files ["java"]}}' 3 | 4 | unit-test: 5 | clojure -M:runner:test -e :noisy 6 | 7 | build-dictionary-validator: 8 | docker build --target builder -f Dockerfile -t tokenmill/beagle-dictionary-validator . 9 | docker rm build || true 10 | docker create --name build tokenmill/beagle-dictionary-validator 11 | docker cp build:/usr/src/app/dictionary-validator dictionary-validator 12 | 13 | build-graal-validator-docker: 14 | docker build --target validator -f Dockerfile -t tokenmill/beagle-dictionary-validator . 15 | 16 | recompile-java-interface: 17 | rm -rf classes 18 | mkdir classes 19 | clojure -e "(require 'beagle.java.annotation) (compile 'beagle.java.annotation) (compile 'beagle.java.java)" 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # Beagle 6 | 7 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 8 | [![pipeline status](https://gitlab.com/tokenmill/oss/beagle/badges/master/pipeline.svg)](https://gitlab.com/tokenmill/oss/beagle/pipelines/master/latest) 9 | [![Maven Central](https://img.shields.io/maven-central/v/lt.tokenmill/beagle.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:%22lt.tokenmill%22%20AND%20a:%22beagle%22) 10 | 11 | Beagle is a detector of interesting things in text. Its intended use is in-stream search applications. 
Suppose you need to monitor a stream of text documents such as web crawl results, chat messages, or corporate documents in order to identify keywords, phrases, regexes, and [complex search queries](https://lucene.apache.org/core/2_9_4/queryparsersyntax.html) of interest. With Beagle you can quickly be up and running with such a system, allowing you to focus on productively monitoring your documents. 12 | 13 | Beagle is based on the [Lucene monitor](https://github.com/apache/lucene-solr/tree/master/lucene/monitor) library which is based on [Luwak](https://github.com/flaxsearch/luwak). 14 | 15 | ## Components 16 | 17 | - [Phrase highlighter with support for](#phrase-annotator-usage): 18 | - case sensitivity, 19 | - ascii folding, 20 | - stemming support for various languages, 21 | - phrase slop, 22 | - synonymous phrases, 23 | - metadata, 24 | - tokenizer, 25 | - ensuring order of terms in a phrase with slop, 26 | - any combination of previously mentioned features. 27 | - [Java interface to the phrase highlighter](#java-interface-to-the-phrase-highlighter) 28 | - (alpha!) [Lucene query string support](#lucene-query-support) (interface is subject to change) 29 | - [Dictionary file readers (csv, json, edn)](#dictionary-readers) 30 | - [Dictionary validator](#dictionary-validator) 31 | - [Dictionary optimizer](#dictionary-optimizer) 32 | - [Annotation merger](#annotation-merger) 33 | 34 | ## Phrase Annotator Usage 35 | 36 | ```clojure 37 | (require '[beagle.phrases :as phrases]) 38 | 39 | (let [dictionary [{:text "to be annotated" :id "1"}] 40 | highlighter-fn (phrases/highlighter dictionary)] 41 | (highlighter-fn "before annotated to be annotated after annotated")) 42 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32}) 43 | 44 | ;; Case sensitivity is controlled per dictionary entry 45 | (let [dictionary [{:text "TO BE ANNOTATED" :id "1" :case-sensitive? 
false}] 46 | highlighter-fn (phrases/highlighter dictionary)] 47 | (highlighter-fn "before annotated to be annotated after annotated")) 48 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32}) 49 | 50 | ;; ASCII folding is controlled per dictionary entry 51 | (let [dictionary [{:text "TÖ BE ÄNNÖTÄTED" :id "1" :case-sensitive? false :ascii-fold? true}] 52 | highlighter-fn (phrases/highlighter dictionary)] 53 | (highlighter-fn "before annotated to be annotated after annotated")) 54 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32}) 55 | 56 | ;; Stemming is supported for multiple languages per dictionary entry 57 | (let [dictionary [{:text "Kaunas" :id "1" :stem? true :stemmer :lithuanian}] 58 | highlighter-fn (phrases/highlighter dictionary)] 59 | (highlighter-fn "Kauno miestas")) 60 | => ({:text "Kauno", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 5}) 61 | 62 | ;; Phrases also support slop (i.e. terms edit distance) per dictionary entry 63 | (let [txt "before start and end after" 64 | dictionary [{:text "start end" :id "1" :slop 1}] 65 | highlighter-fn (phrases/highlighter dictionary)] 66 | (highlighter-fn txt)) 67 | => ({:text "start and end", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 7, :end-offset 20}) 68 | 69 | ;; Every phrase can specify which tokenizer to use 70 | (let [txt "[URGENT!] Do this immediately!" 
71 | dictionary [{:text "[URGENT!]" :id "a" :tokenizer :whitespace} 72 | {:text "[URGENT!]" :id "b" :tokenizer :standard}] 73 | highlighter-fn (phrases/highlighter dictionary)] 74 | (clojure.pprint/pprint (highlighter-fn txt))) 75 | => 76 | ({:text "[URGENT!]", 77 | :type "PHRASE", 78 | :dict-entry-id "a", 79 | :meta {}, 80 | :begin-offset 0, 81 | :end-offset 9} 82 | {:text "URGENT", 83 | :type "PHRASE", 84 | :dict-entry-id "b", 85 | :meta {}, 86 | :begin-offset 1, 87 | :end-offset 7}) 88 | 89 | ;; Ensure that phrase terms are matched in the provided order 90 | ;; e.g. NOT preserving order (default) 91 | (let [txt "Mill Token" 92 | dictionary [{:text "Token Mill" :slop 2 :in-order? false}] 93 | highlighter-fn (phrases/highlighter dictionary)] 94 | (highlighter-fn txt)) 95 | => [{:text "Mill Token" :type "PHRASE" :dict-entry-id "0" :meta {} :begin-offset 0 :end-offset 10}] 96 | ;; e.g. Preserving order 97 | (let [txt "Mill Token" 98 | dictionary [{:text "Token Mill" :slop 2 :in-order? true}] 99 | highlighter-fn (phrases/highlighter dictionary)] 100 | (highlighter-fn txt)) 101 | => () 102 | ``` 103 | 104 | ## Java Interface to the Phrase Highlighter 105 | 106 | Example: 107 | ```java 108 | import lt.tokenmill.beagle.phrases.Annotation; 109 | import lt.tokenmill.beagle.phrases.Annotator; 110 | import lt.tokenmill.beagle.phrases.DictionaryEntry; 111 | 112 | import java.util.Arrays; 113 | import java.util.Collection; 114 | import java.util.HashMap; 115 | 116 | public class Main { 117 | public static void main(String[] args) { 118 | DictionaryEntry dictionaryEntry = new DictionaryEntry("test phrase"); 119 | Annotator annotator = new Annotator(Arrays.asList(dictionaryEntry)); 120 | Collection annotations = annotator.annotate("This is my test phrase"); 121 | annotations.forEach(s -> System.out.println("Annotated: \'" + s.text() + "\' at offset: " + s.beginOffset() + ":" + s.endOffset())); 122 | } 123 | } 124 | 125 | // => Annotated: 'test phrase' at offset: 11:22 126 | ``` 
127 | 128 | The available options for the Java API are explained with examples in the [Java Interface for Phrase Highlighting wiki page](https://github.com/tokenmill/beagle/wiki/Java-Interface-for-Phrase-Highlighting). 129 | 130 | All the options that are present in the Clojure interface are also available for use in Java, just convert Clojure keywords to Java strings, e.g. 131 | ``` 132 | :case-sensitive? => "case-sensitive?" 133 | ``` 134 | 135 | ### Project Setup with Maven 136 | 137 | The library is deployed in the Maven Central Repository and you can just add the beagle dependency to your `pom.xml`: 138 | 139 | ```xml 140 | 141 | lt.tokenmill 142 | beagle 143 | 0.3.1 144 | 145 | ``` 146 | 147 | ## Lucene Query Support 148 | 149 | Examples: 150 | 151 | ```clojure 152 | (require '[beagle.lucene-alpha :as lucene]) 153 | 154 | (let [txt "some text this other that" 155 | dictionary [{:text "this AND that" :id "1" :slop 1}] 156 | annotator-fn (lucene/annotator dictionary)] 157 | (annotator-fn txt {})) 158 | => ({:text "this AND that", :type "QUERY", :dict-entry-id "1", :meta {}}) 159 | ``` 160 | 161 | ## Performance 162 | 163 | The performance was measured on a desktop PC with Ubuntu 19.04 and 8-core Ryzen 1700. 164 | 165 | The test setup was for news articles and dictionary made up of names of city names in USA. 166 | 167 | Code and data for benchmarking and more benchmarks can be found [here](https://github.com/tokenmill/beagle-performance-benchmarks). 168 | 169 | ### Single-thread 170 | 171 | Average time spent per document ranged from 1.58 ms for dictionary of 5k phrases to 4.58 ms per document for 80k phrases. 172 | 173 | ![alt text](charts/st-avg-per-doc.png) 174 | 175 | Throughput of docs analyzed ranged from 626 docs/sec for dictionary of 5k phrases to 210 docs/sec for 80k phrases. 176 | 177 | ![alt text](charts/st-throughput-per-sec.png) 178 | 179 | Max time spent per document has couple of spikes when processing a document takes ~1000ms. 
These spikes should 180 | have been caused either by GC pauses, or JVM deoptimizations. Aside from those spikes, max time grows steadily 181 | from 15 ms to 72 ms as the dictionary size grows. 182 | 183 | Min time spent per document is fairly stable for any dictionary size and is about 0.45 ms. Most likely these are the 184 | cases when [Presearcher](https://lucene.apache.org/core/8_2_0/monitor/index.html) hasn't found any candidate queries to run against the document. 185 | 186 | ![alt text](charts/st-min-max-per-doc.png) 187 | 188 | ### Multi-threaded 189 | 190 | Using `core.async` pipeline time spent per single doc ranged from 3.38 ms for dictionary of 5k phrases to 15.34 ms per document for 80k phrases. 191 | 192 | ![alt text](charts/mt-avg-per-doc.png) 193 | 194 | Total time spent to process all 10k docs ranged from 2412 ms for dictionary of 5k phrases to 12595 ms per document for 80k phrases. 195 | 196 | ![alt text](charts/mt-total.png) 197 | 198 | Throughput of docs analyzed ranged from 4143 docs/sec for dictionary of 5k phrases to 793 docs/sec for 80k phrases. 199 | 200 | ![alt text](charts/mt-throughput-per-sec.png) 201 | 202 | Max time spent per document has risen fairly steadily from 24.15 ms for dictionary of 10k phrases to 113.45 ms per document for 60k phrases. 203 | 204 | Min time spent per document varied from 0.6 ms for dictionary of 10k phrases to 1.1 ms per document for 55k phrases. 205 | 206 | ![alt text](charts/mt-min-max-per-doc.png) 207 | 208 | ### Conclusions about Performance 209 | 210 | Processing of one document on average is faster in the single-thread mode by roughly 3x compared to multi-threaded mode but even 211 | in multi-threaded mode one document rarely takes more than 10 ms. 212 | 213 | In multi-threaded mode throughput grows with the number of CPU cores almost linearly: 4143/8=518 docs per core per sec in multi-threaded mode 214 | while in single-thread mode 626 docs per core per sec. 
215 | 216 | ## Dictionary Readers 217 | 218 | Three file formats are supported: csv, edn, json. 219 | 220 | ### CSV Dictionary Format 221 | 222 | Separator: "," 223 | Escape: "\"" 224 | 225 | The first line *MUST* be a header. 226 | 227 | Supported header keys: `["text" "type" "id" "synonyms" "case-sensitive?" "ascii-fold?" "meta"]` 228 | 229 | Order is not important. 230 | 231 | Under `synonyms`, there should be a list of strings separated by ";" 232 | Under `meta`, there should be a list of strings separated by ";". Even number of strings is expected. In case of odd number, last one is ignored. 233 | 234 | ## Dictionary Validator 235 | 236 | Accepts any number of dictionaries to validate as long as they are provided in pairs as '"/path/to/dictionary/file" "file-type"' 237 | 238 | ### Supported File Types 239 | 240 | - csv 241 | - json 242 | - edn 243 | 244 | ### Output 245 | 246 | - If any dictionary is invalid exception will be thrown with exit status 1 247 | 248 | ### Usage 249 | 250 | #### Clojure 251 | 252 | To use validator directly execute command: `clj -m beagle.validator "/path/to/dictionary/file" "file-type" "/path/to/dictionary/file2" "file-type" & ...` 253 | 254 | ##### Example: 255 | 256 | ``` 257 | clj -m beagle.validator "your-dict.csv" "csv" "your-other-dict.json" "json" 258 | ``` 259 | 260 | #### Docker 261 | 262 | Example in Gitlab CI: 263 | 264 | ``` 265 | validate-dictionaries: 266 | stage: dictionary-validation 267 | when: always 268 | image: tokenmill/beagle-dictionary-validator 269 | script: 270 | - > 271 | dictionary-validator 272 | /path/to/dict.csv csv 273 | /path/to/dict.json json 274 | /path/to/dict.edn edn 275 | ``` 276 | 277 | ## Dictionary Optimizer 278 | 279 | Supported optimizations: 280 | - Remove duplicate dictionary entries 281 | - Merge synonyms 282 | - Synonyms and text equality check 283 | 284 | There are cases when dictionary entries can't be merged: 285 | - Differences in text analysis 286 | 287 | Examples: 288 | ```clojure 
289 | (require '[beagle.dictionary-optimizer :as optimizer]) 290 | 291 | ; Remove duplicates 292 | (let [dictionary [{:text "TO BE ANNOTATED" :id "1"} 293 | {:text "TO BE ANNOTATED"}]] 294 | (optimizer/optimize dictionary)) 295 | => ({:text "TO BE ANNOTATED", :id "1"}) 296 | 297 | ; Merge synonyms 298 | (let [dictionary [{:text "TO BE ANNOTATED" :synonyms ["ONE"]} 299 | {:text "TO BE ANNOTATED" :synonyms ["TWO"]}]] 300 | (optimizer/optimize dictionary)) 301 | => ({:text "TO BE ANNOTATED", :synonyms ("TWO" "ONE")}) 302 | 303 | ; Synonyms and text equality check 304 | (let [dictionary [{:text "TO BE ANNOTATED" :synonyms ["TO BE ANNOTATED"]}]] 305 | (optimizer/optimize dictionary)) 306 | => ({:text "TO BE ANNOTATED", :synonyms ["TO BE ANNOTATED"]}) 307 | 308 | ; Can't be merged because of differences in text analysis 309 | (let [dictionary [{:text "TO BE ANNOTATED" :case-sensitive? true} 310 | {:text "TO BE ANNOTATED" :case-sensitive? false}]] 311 | (optimizer/optimize dictionary)) 312 | => ({:text "TO BE ANNOTATED", :case-sensitive? true} {:text "TO BE ANNOTATED", :case-sensitive? false}) 313 | ``` 314 | 315 | ## Annotation Merger 316 | 317 | Only annotations of the same type are merged. 
318 | 319 | Handled cases: 320 | - Duplicate annotations 321 | - Nested annotations 322 | 323 | Examples: 324 | ```clojure 325 | (require '[beagle.annotation-merger :as merger]) 326 | 327 | (let [dictionary [{:text "TEST"} 328 | {:text "This TEST is"}] 329 | highlighter-fn (phrases/highlighter dictionary) 330 | annotations (highlighter-fn "This TEST is")] 331 | (println "Annotations: " annotations) 332 | (merger/merge-same-type-annotations annotations)) 333 | Annotations: ({:text TEST, :type PHRASE, :dict-entry-id 0, :meta {}, :begin-offset 5, :end-offset 9} {:text This TEST is, :type PHRASE, :dict-entry-id 1, :meta {}, :begin-offset 0, :end-offset 12}) 334 | => ({:text "This TEST is", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 12}) 335 | 336 | ;; You can also inline the need of merging annotations 337 | (let [dictionary [{:text "TEST"} 338 | {:text "This TEST is"}] 339 | highlighter-fn (phrases/highlighter dictionary)] 340 | (highlighter-fn "This TEST is" {:merge-annotations? true})) 341 | => ({:text "This TEST is", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 12}) 342 | ``` 343 | 344 | ## License 345 | 346 | Copyright © 2019 [TokenMill UAB](http://www.tokenmill.lt). 347 | 348 | Distributed under the The Apache License, Version 2.0. 
349 | -------------------------------------------------------------------------------- /charts/mt-avg-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-avg-per-doc.png -------------------------------------------------------------------------------- /charts/mt-min-max-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-min-max-per-doc.png -------------------------------------------------------------------------------- /charts/mt-throughput-per-sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-throughput-per-sec.png -------------------------------------------------------------------------------- /charts/mt-total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-total.png -------------------------------------------------------------------------------- /charts/st-avg-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-avg-per-doc.png -------------------------------------------------------------------------------- /charts/st-min-max-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-min-max-per-doc.png -------------------------------------------------------------------------------- /charts/st-throughput-per-sec.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-throughput-per-sec.png -------------------------------------------------------------------------------- /classes/lt/tokenmill/beagle/phrases/Annotation.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/Annotation.class -------------------------------------------------------------------------------- /classes/lt/tokenmill/beagle/phrases/Annotator.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/Annotator.class -------------------------------------------------------------------------------- /classes/lt/tokenmill/beagle/phrases/DictionaryEntry.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/DictionaryEntry.class -------------------------------------------------------------------------------- /deps.edn: -------------------------------------------------------------------------------- 1 | {:deps {org.clojure/clojure {:mvn/version "1.10.3"} 2 | org.clojure/data.csv {:mvn/version "1.0.0"} 3 | org.clojure/tools.logging {:mvn/version "1.1.0"} 4 | org.apache.lucene/lucene-core {:mvn/version "8.9.0"} 5 | org.apache.lucene/lucene-monitor {:mvn/version "8.9.0"} 6 | metosin/jsonista {:mvn/version "0.3.3"}} 7 | :paths ["src" "classes"] 8 | :mvn/repos {"central" {:url "https://repo1.maven.org/maven2/"} 9 | "clojars" {:url "https://repo.clojars.org/"}} 10 | :aliases {:dev 11 | {:extra-deps {org.jsoup/jsoup 
{:mvn/version "1.13.1"} 12 | org.clojure/test.check {:mvn/version "1.0.0"} 13 | criterium/criterium {:mvn/version "0.4.6"} 14 | ch.qos.logback/logback-classic {:mvn/version "1.2.3"}} 15 | :extra-paths ["test/resources"]} 16 | :clj-kondo 17 | {:main-opts ["-m" "clj-kondo.main --lint src test"] 18 | :extra-deps {clj-kondo/clj-kondo {:mvn/version "2019.07.31-alpha"}} 19 | :jvm-opts ["-Dclojure.main.report=stderr"]} 20 | :test 21 | {:extra-paths ["test"] 22 | :extra-deps {com.cognitect/test-runner {:git/url "https://github.com/cognitect-labs/test-runner.git" 23 | :sha "62ef1de18e076903374306060ac0e8a752e57c86"} 24 | org.jsoup/jsoup {:mvn/version "1.13.1"} 25 | org.clojure/test.check {:mvn/version "1.0.0"}}} 26 | :runner 27 | {:extra-paths ["test"] 28 | :main-opts ["-m" "cognitect.test-runner"]} 29 | :native-image 30 | {:override-deps {org.clojure/clojure {:mvn/version "1.9.0"}} 31 | :main-opts ["-m clj.native-image beagle.validator" 32 | "--initialize-at-build-time" 33 | "--report-unsupported-elements-at-runtime" 34 | "-H:Name=dictionary-validator"] 35 | :jvm-opts ["-Dclojure.compiler.direct-linking=true"] 36 | :extra-deps {clj.native-image/clj.native-image 37 | {:git/url "https://github.com/taylorwood/clj.native-image.git" 38 | :sha "7708e7fd4572459c81f6a6b8e44c96f41cdd92d4"}}}}} 39 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | lt.tokenmill 5 | beagle 6 | 0.9.0-SNAPSHOT 7 | beagle 8 | Stream search library 9 | https://github.com/tokenmill/beagle 10 | 11 | 12 | 13 | The Apache License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | 16 | 17 | 18 | 19 | 20 | Dainius Jocas 21 | dainius.jocas@tokenmill.lt 22 | TokenMill 23 | http://www.tokenmill.lt 24 | 25 | 26 | Žygimantas Medelis 27 | zygimantas.medelis@gmail.com 28 | TokenMill 29 | http://www.tokenmill.lt 30 | 31 | 32 | 33 | 34 | 35 | org.clojure 36 | 
clojure 37 | 1.10.1 38 | 39 | 40 | org.clojure 41 | data.csv 42 | 0.1.4 43 | 44 | 45 | org.clojure 46 | tools.logging 47 | 0.5.0 48 | 49 | 50 | org.apache.lucene 51 | lucene-core 52 | 8.2.0 53 | 54 | 55 | org.apache.lucene 56 | lucene-monitor 57 | 8.2.0 58 | 59 | 60 | metosin 61 | jsonista 62 | 0.2.4 63 | 64 | 65 | 66 | 67 | UTF-8 68 | 69 | 70 | 71 | 72 | release-sign-artifacts 73 | 74 | 75 | performRelease 76 | true 77 | 78 | 79 | 80 | 81 | 82 | org.apache.maven.plugins 83 | maven-javadoc-plugin 84 | 3.1.1 85 | 86 | 87 | attach-javadoc 88 | 89 | jar 90 | 91 | 92 | 93 | 94 | 95 | org.apache.maven.plugins 96 | maven-gpg-plugin 97 | 1.6 98 | 99 | 100 | sign-artifacts 101 | verify 102 | 103 | sign 104 | 105 | 106 | 107 | 108 | 109 | org.apache.maven.plugins 110 | maven-source-plugin 111 | 3.1.0 112 | 113 | 114 | org.apache.maven.plugins 115 | maven-deploy-plugin 116 | 3.0.0-M1 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | src 125 | 126 | 127 | src 128 | 129 | 130 | classes 131 | 132 | 133 | 134 | 135 | org.apache.maven.plugins 136 | maven-jar-plugin 137 | 3.1.2 138 | 139 | 140 | empty-javadoc-jar 141 | package 142 | 143 | jar 144 | 145 | 146 | javadoc 147 | ${basedir}/javadoc 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | https://github.com/tokenmill/beagle 157 | scm:git:git://github.com/tokenmill/beagle.git 158 | scm:git:ssh://git@github.com/tokenmill/beagle.git 159 | HEAD 160 | 161 | 162 | 163 | 164 | clojars 165 | https://repo.clojars.org/ 166 | 167 | 168 | sonatype 169 | https://oss.sonatype.org/content/repositories/snapshots/ 170 | 171 | 172 | 173 | 174 | 175 | ossrh 176 | https://oss.sonatype.org/content/repositories/snapshots 177 | 178 | 179 | ossrh 180 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /src/beagle/annotation_merger.clj: -------------------------------------------------------------------------------- 1 | (ns 
beagle.annotation-merger) 2 | 3 | (defn related-annotations? [anno1 anno2] 4 | (<= (:begin-offset anno1) (:begin-offset anno2) (:end-offset anno1))) 5 | 6 | (defn parent-child-annotations? [parent-anno child-anno] 7 | (and (>= (:begin-offset child-anno) (:begin-offset parent-anno)) 8 | (<= (:end-offset child-anno) (:end-offset parent-anno)))) 9 | 10 | (defn merge-annotations [annotations] 11 | (let [sorted-annotation (sort-by :begin-offset annotations)] 12 | (loop [parent-annotation (first sorted-annotation) 13 | [child-annotation & remaining] (rest sorted-annotation) 14 | result []] 15 | (if child-annotation 16 | (if (related-annotations? parent-annotation child-annotation) 17 | (recur (if (and (parent-child-annotations? parent-annotation child-annotation) 18 | (not (parent-child-annotations? child-annotation parent-annotation))) 19 | parent-annotation 20 | child-annotation) 21 | remaining 22 | result) 23 | (recur child-annotation remaining (conj result parent-annotation))) 24 | (conj result parent-annotation))))) 25 | 26 | (defn merge-same-type-annotations [annotations] 27 | (mapcat (fn [[_ anns]] (merge-annotations anns)) (group-by :type annotations))) 28 | -------------------------------------------------------------------------------- /src/beagle/dictionary_optimizer.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.dictionary-optimizer 2 | (:require [clojure.set :as set] 3 | [clojure.string :as str])) 4 | 5 | (defn merge-synonyms [group-of-entries] 6 | (reduce (fn [synonyms-set {synonyms :synonyms}] 7 | (into synonyms-set synonyms)) 8 | #{} group-of-entries)) 9 | 10 | (defn merge-meta [group-of-entries] 11 | (reduce (fn [acc {meta :meta}] (merge acc meta)) {} group-of-entries)) 12 | 13 | (defn merge-entries [entries] 14 | (let [{:keys [text case-sensitive? ascii-fold? 
id]} (first entries) 15 | synonyms (remove #(= text %) (merge-synonyms entries)) 16 | meta (merge-meta entries)] 17 | (cond-> {:text text} 18 | (not-empty synonyms) (assoc :synonyms synonyms) 19 | (not-empty meta) (assoc :meta meta) 20 | id (assoc :id id) 21 | (not (nil? case-sensitive?)) (assoc :case-sensitive? case-sensitive?) 22 | (not (nil? ascii-fold?)) (assoc :ascii-fold? ascii-fold?)))) 23 | 24 | (defn mergeable-meta? [{meta-a :meta} {meta-b :meta}] 25 | (every? #(= (get meta-a %) (get meta-b %)) (set/intersection (set (keys meta-a)) (set (keys meta-b))))) 26 | 27 | (defn aggregate-entries-by-meta [entries] 28 | (loop [entry-a (first entries) 29 | [entry-b & remaining] (rest entries) 30 | acc [] 31 | exceptions []] 32 | (if entry-b 33 | (if (mergeable-meta? entry-a entry-b) 34 | (recur (merge-entries [entry-a entry-b]) remaining acc exceptions) 35 | (recur entry-a remaining acc (conj exceptions entry-b))) 36 | (if (seq exceptions) 37 | (recur (first exceptions) (rest exceptions) (conj acc entry-a) []) 38 | (conj acc entry-a))))) 39 | 40 | (defn group-dictionary-entries [dictionary] 41 | (group-by (fn [entry] [(:text entry) (:case-sensitive? entry) (:ascii-fold? entry)]) dictionary)) 42 | 43 | (defn optimize [dictionary] 44 | (mapcat (fn [[_ grouped-entries]] (aggregate-entries-by-meta grouped-entries)) 45 | (group-dictionary-entries dictionary))) 46 | 47 | (defn optimization-suggestion [entries] 48 | {:suggestion (-> (format "Dictionary items '%s' have identical `[text case-sensitivity ascii-folding] features." 
49 | (reduce #(conj %1 (or (:id %2) (:text %2))) [] entries)) 50 | (str/replace #"\"" "")) 51 | :dictionary-items entries}) 52 | 53 | (defn dry-run [dictionary] 54 | (reduce (fn [acc [_ grouped-entries]] 55 | (if (< 1 (count grouped-entries)) 56 | (conj acc (optimization-suggestion grouped-entries)) 57 | acc)) 58 | [] (group-dictionary-entries dictionary))) 59 | -------------------------------------------------------------------------------- /src/beagle/java/annotation.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.java.annotation) 2 | 3 | (gen-class 4 | :name lt.tokenmill.beagle.phrases.Annotation 5 | :prefix Annotation- 6 | :state "state" 7 | :init "init" 8 | :constructors {[String String Long Long String java.util.Map] []} 9 | :methods [[text [] String] 10 | [type [] String] 11 | [beginOffset [] Long] 12 | [endOffset [] Long] 13 | [dictionaryEntryId [] String] 14 | [meta [] java.util.Map]] 15 | :prefix Annotation-) 16 | 17 | (defn Annotation-init [text type begin end dictionaryEntryId meta] 18 | [[] (atom {:text text 19 | :type type 20 | :begin begin 21 | :end end 22 | :dict-entry-id dictionaryEntryId 23 | :meta meta})]) 24 | 25 | (defn Annotation-text [this] 26 | (@(.state this) :text)) 27 | (defn Annotation-type [this] 28 | (@(.state this) :type)) 29 | (defn Annotation-beginOffset [this] 30 | (@(.state this) :begin)) 31 | (defn Annotation-endOffset [this] 32 | (@(.state this) :end)) 33 | (defn Annotation-dictionaryEntryId [this] 34 | (@(.state this) :dict-entry-id)) 35 | (defn Annotation-meta [this] 36 | (@(.state this) :meta)) 37 | -------------------------------------------------------------------------------- /src/beagle/java/java.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.java.java 2 | (:gen-class) 3 | (:require [beagle.phrases :as phrases])) 4 | 5 | (gen-class 6 | :name lt.tokenmill.beagle.phrases.DictionaryEntry 7 | :state "state" 8 | :init 
"init" 9 | :constructors {[String] []} 10 | :methods [[text [] String] 11 | [type [] String] 12 | [setType [String] void] 13 | [id [] String] 14 | [setId [String] void] 15 | [synonyms [] java.util.Collection] 16 | [setSynonyms [java.util.Collection] void] 17 | [caseSensitive [] Boolean] 18 | [setCaseSensitive [Boolean] void] 19 | [asciiFold [] Boolean] 20 | [setAsciiFold [Boolean] void] 21 | [stem [] Boolean] 22 | [setStem [Boolean] void] 23 | [stemmer [] String] 24 | [setStemmer [String] void] 25 | [slop [] Integer] 26 | [setSlop [Integer] void] 27 | [tokenizer [] String] 28 | [setTokenizer [String] void] 29 | [meta [] java.util.Map] 30 | [setMeta [java.util.Map] void]] 31 | :prefix DictionaryEntry-) 32 | 33 | (defn DictionaryEntry-init [phrase] 34 | [[] (atom {:text phrase})]) 35 | 36 | (defn DictionaryEntry-text [this] 37 | (@(.state this) :text)) 38 | (defn DictionaryEntry-type [this] 39 | (@(.state this) :type)) 40 | (defn DictionaryEntry-setType [this type] 41 | (swap! (.state this) assoc :type type)) 42 | (defn DictionaryEntry-id [this] 43 | (@(.state this) :id)) 44 | (defn DictionaryEntry-setId [this id] 45 | (swap! (.state this) assoc :id id)) 46 | (defn DictionaryEntry-synonyms [this] 47 | (@(.state this) :synonyms)) 48 | (defn DictionaryEntry-setSynonyms [this synonyms] 49 | (swap! (.state this) assoc :synonyms synonyms)) 50 | (defn DictionaryEntry-caseSensitive [this] 51 | (@(.state this) :case-sensitive?)) 52 | (defn DictionaryEntry-setCaseSensitive [this case-sensitive] 53 | (swap! (.state this) assoc :case-sensitive? case-sensitive)) 54 | (defn DictionaryEntry-asciiFold [this] 55 | (@(.state this) :ascii-fold?)) 56 | (defn DictionaryEntry-setAsciiFold [this ascii-fold] 57 | (swap! (.state this) assoc :ascii-fold? ascii-fold)) 58 | (defn DictionaryEntry-stem [this] 59 | (@(.state this) :stem?)) 60 | (defn DictionaryEntry-setStem [this stem] 61 | (swap! (.state this) assoc :stem? 
stem)) 62 | (defn DictionaryEntry-stemmer [this] 63 | (@(.state this) :stemmer)) 64 | (defn DictionaryEntry-setStemmer [this stemmer] 65 | (swap! (.state this) assoc :stemmer stemmer)) 66 | (defn DictionaryEntry-slop [this] 67 | (@(.state this) :slop)) 68 | (defn DictionaryEntry-setSlop [this slop] 69 | (swap! (.state this) assoc :slop slop)) 70 | (defn DictionaryEntry-meta [this] 71 | (@(.state this) :meta)) 72 | (defn DictionaryEntry-setMeta [this meta] 73 | (swap! (.state this) assoc :meta meta)) 74 | (defn DictionaryEntry-tokenizer [this] 75 | (@(.state this) :tokenizer)) 76 | (defn DictionaryEntry-setTokenizer [this tokenizer] 77 | (swap! (.state this) assoc :tokenizer tokenizer)) 78 | 79 | (gen-class 80 | :name lt.tokenmill.beagle.phrases.Annotator 81 | :state "state" 82 | :init "init" 83 | :constructors {[java.util.Collection] [] 84 | [java.util.Collection java.util.Map] []} 85 | :prefix Phrases- 86 | :methods [[annotate [String] java.util.Collection] 87 | [annotate [String java.util.Map] java.util.Collection]]) 88 | 89 | (defn Phrases-init 90 | ([dictionary] (Phrases-init dictionary {})) 91 | ([dictionary opts] 92 | [[] (atom {:dictionary dictionary 93 | :annotator-fn (phrases/highlighter 94 | (map (fn [dictionary-entry] 95 | {:text (.text dictionary-entry) 96 | :type (.type dictionary-entry) 97 | :id (.id dictionary-entry) 98 | :synonyms (.synonyms dictionary-entry) 99 | :case-sensitive? (.caseSensitive dictionary-entry) 100 | :ascii-fold? (.asciiFold dictionary-entry) 101 | :stem? (.stem dictionary-entry) 102 | :stemmer (keyword (.stemmer dictionary-entry)) 103 | :slop (.slop dictionary-entry) 104 | :tokenizer (keyword (.tokenizer dictionary-entry)) 105 | :meta (.meta dictionary-entry)}) dictionary) 106 | (reduce (fn [m [k v]] 107 | (assoc m (keyword k) v)) {} opts))})])) 108 | 109 | (defn Phrases-annotate 110 | ([this text] (Phrases-annotate this text {})) 111 | ([this text opts] 112 | (map (fn [ann] (lt.tokenmill.beagle.phrases.Annotation. 
113 | (:text ann) 114 | (:type ann) 115 | (long (:begin-offset ann)) 116 | (long (:end-offset ann)) 117 | (:dict-entry-id ann) 118 | (:meta ann))) 119 | ((@(.state this) :annotator-fn) text (reduce (fn [m [k v]] 120 | (assoc m (keyword k) v)) {} opts))))) 121 | -------------------------------------------------------------------------------- /src/beagle/lucene_alpha.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.lucene-alpha 2 | (:require [clojure.string :as s] 3 | [clojure.tools.logging :as log] 4 | [beagle.monitor :as monitor] 5 | [beagle.text-analysis :as text-analysis]) 6 | (:import (org.apache.lucene.monitor MonitorQuery QueryMatch Monitor) 7 | (org.apache.lucene.queryparser.classic QueryParser ParseException) 8 | (org.apache.lucene.document Document Field FieldType) 9 | (org.apache.lucene.index IndexOptions))) 10 | 11 | (def ^FieldType field-type 12 | (doto (FieldType.) 13 | (.setTokenized true) 14 | (.setIndexOptions IndexOptions/DOCS_AND_FREQS) 15 | (.setStoreTermVectors true) 16 | (.setStoreTermVectorOffsets true))) 17 | 18 | (defn match-text [^String text ^Monitor monitor field-names type-name] 19 | (let [doc (Document.)] 20 | (doseq [field-name field-names] 21 | (.add doc (Field. ^String field-name text field-type))) 22 | (map (fn [^QueryMatch query-match] 23 | (let [^MonitorQuery query (.getQuery monitor (.getQueryId query-match)) 24 | meta (.getMetadata query)] 25 | {:text (.getQueryString query) 26 | :type (or (get meta "_type") type-name) 27 | :dict-entry-id (.getQueryId query-match) 28 | :meta (into {} meta)})) (.getMatches (.match monitor doc (QueryMatch/SIMPLE_MATCHER)))))) 29 | 30 | (defn dict-entry->monitor-queries [{:keys [id text meta type] :as dict-entry} default-analysis-conf idx] 31 | (try 32 | (let [query-id (or id (str idx)) 33 | metadata (reduce-kv (fn [m k v] (assoc m (name k) v)) {} (if type (assoc meta :_type type) meta))] 34 | (MonitorQuery. query-id 35 | (.parse (QueryParser. 
36 | (text-analysis/get-field-name dict-entry default-analysis-conf) 37 | (text-analysis/get-string-analyzer dict-entry default-analysis-conf)) 38 | text) 39 | text 40 | metadata)) 41 | (catch ParseException e 42 | (log/errorf "Failed to parse query: '%s' with exception '%s'" dict-entry e)) 43 | (catch Exception e (log/errorf "Failed create query: '%s' with '%s'" dict-entry e)))) 44 | 45 | (defn dictionary->monitor-queries [dictionary default-analysis-conf] 46 | (remove nil? 47 | (map (fn [dict-entry idx] 48 | (dict-entry->monitor-queries dict-entry default-analysis-conf idx)) 49 | dictionary (range)))) 50 | 51 | (defn match-monitor [text monitor field-names type-name opts] 52 | (log/debugf "Match monitor with opts='%s'" opts) 53 | (if (s/blank? text) 54 | [] 55 | (match-text text monitor field-names type-name))) 56 | 57 | (defn annotator 58 | ([dictionary] (annotator dictionary {})) 59 | ([dictionary {:keys [type-name tokenizer]}] 60 | (let [type-name (if (s/blank? type-name) "QUERY" type-name) 61 | {:keys [monitor field-names]} (monitor/setup dictionary 62 | {:tokenizer tokenizer} 63 | dictionary->monitor-queries)] 64 | (fn 65 | ([text] (match-monitor text monitor field-names type-name {})) 66 | ([text opts] (match-monitor text monitor field-names type-name opts)))))) 67 | -------------------------------------------------------------------------------- /src/beagle/monitor.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.monitor 2 | (:require [clojure.java.io :as io] 3 | [clojure.tools.logging :as log] 4 | [jsonista.core :as json] 5 | [beagle.text-analysis :as text-analysis]) 6 | (:import (org.apache.lucene.monitor MonitorConfiguration Monitor MonitorQuerySerializer MonitorQuery) 7 | (org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper) 8 | (org.apache.lucene.util BytesRef) 9 | (org.apache.lucene.search MatchAllDocsQuery) 10 | (java.util ArrayList))) 11 | 12 | (def monitor-query-serializer 13 | 
(reify MonitorQuerySerializer 14 | (serialize [_ query] 15 | (BytesRef. 16 | (json/write-value-as-string 17 | {"query-id" (.getId query) 18 | "query" (.getQueryString query) 19 | "metadata" (.getMetadata query)}))) 20 | (deserialize [_ binary-value] 21 | (let [dq (json/read-value (io/reader (.bytes ^BytesRef binary-value)))] 22 | (MonitorQuery. (get dq "query-id") 23 | (MatchAllDocsQuery.) 24 | (get dq "query") 25 | (get dq "metadata")))))) 26 | 27 | (defn create [field-names-w-analyzers] 28 | (let [^MonitorConfiguration config (MonitorConfiguration.) 29 | per-field-analyzers (PerFieldAnalyzerWrapper. 30 | (text-analysis/get-string-analyzer {} {}) field-names-w-analyzers)] 31 | (.setIndexPath config nil monitor-query-serializer) 32 | (Monitor. per-field-analyzers config))) 33 | 34 | (defn defer-to-one-by-one-registration [^Monitor monitor monitor-queries] 35 | (doseq [mq monitor-queries] 36 | (try 37 | (.register monitor (doto (ArrayList.) (.add mq))) 38 | (catch Exception e 39 | (log/errorf "Failed to register query: '%s'" mq) 40 | (.printStackTrace e))))) 41 | 42 | (defn register-queries [^Monitor monitor monitor-queries] 43 | (try 44 | (.register monitor ^Iterable monitor-queries) 45 | (catch Exception _ 46 | (defer-to-one-by-one-registration monitor monitor-queries)))) 47 | 48 | (defn field-name-analyzer-mappings 49 | "Creates a map with field names as keys and Lucene analyzers as values. 50 | Both field name and analyzer are decided based on the dictionary entry configuration. 51 | First group dictionary entries by field name. Then from every group of dictionary entries 52 | take the first entry and create an analyzer based on analysis configuration." 
53 | [dictionary default-analysis-conf] 54 | (->> dictionary 55 | (group-by (fn [dictionary-entry] 56 | (text-analysis/get-field-name dictionary-entry default-analysis-conf))) 57 | (reduce (fn [acc [field-name dict]] 58 | (assoc acc field-name (text-analysis/get-string-analyzer (first dict) default-analysis-conf))) 59 | {}))) 60 | 61 | (defn prepare [monitor dict-entries default-analysis-conf dictionary->monitor-queries-fn] 62 | (register-queries monitor (dictionary->monitor-queries-fn dict-entries default-analysis-conf))) 63 | 64 | (defn setup 65 | "Setups the monitor with all the dictionary entries." 66 | [dictionary default-analysis-conf dict-entry->monitor-queries-fn] 67 | (let [mappings-from-field-names-to-analyzers (field-name-analyzer-mappings dictionary default-analysis-conf) 68 | monitor (create mappings-from-field-names-to-analyzers)] 69 | (prepare monitor dictionary default-analysis-conf dict-entry->monitor-queries-fn) 70 | {:monitor monitor 71 | :field-names (keys mappings-from-field-names-to-analyzers)})) 72 | -------------------------------------------------------------------------------- /src/beagle/phrases.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.phrases 2 | (:require [clojure.string :as s] 3 | [clojure.tools.logging :as log] 4 | [beagle.validator :as validator] 5 | [beagle.annotation-merger :as merger] 6 | [beagle.dictionary-optimizer :as optimizer] 7 | [beagle.text-analysis :as text-analysis] 8 | [beagle.monitor :as monitor] 9 | [beagle.schema :refer [->Highlight ->DictionaryEntry]]) 10 | (:import (java.util UUID) 11 | (org.apache.lucene.document Document FieldType Field) 12 | (org.apache.lucene.index IndexOptions Term) 13 | (org.apache.lucene.monitor Monitor MonitorQuery HighlightsMatch HighlightsMatch$Hit) 14 | (org.apache.lucene.search MultiPhraseQuery$Builder FuzzyQuery) 15 | (org.apache.lucene.search.spans SpanNearQuery$Builder SpanTermQuery SpanMultiTermQueryWrapper))) 16 | 17 | 
(defn filter-and-sort-ordered-hits [^String text ^String highlight-text ordered-hits] 18 | (->> ordered-hits 19 | (filter (fn [^HighlightsMatch$Hit hit] 20 | (= highlight-text (let [s (.-startOffset hit) 21 | e (.-endOffset hit)] 22 | (subs text s e))))) 23 | (sort-by (fn [^HighlightsMatch$Hit hit] (.-startOffset hit))))) 24 | 25 | (defn group-sequencial-ending 26 | "Groups a sequence taking only the last hit from a consecutive sub-sequence 27 | of terms, e.g. [1 2 3 6 7] => [3 7]" 28 | [spans-end-hits] 29 | (loop [[current-term & terms] spans-end-hits 30 | last-item nil 31 | current-seq [] 32 | filtered-ends []] 33 | (if (nil? current-term) 34 | (conj filtered-ends (last current-seq)) 35 | (if (nil? last-item) 36 | (recur terms current-term [current-term] (if (seq current-seq) 37 | (conj filtered-ends (last current-seq)) 38 | filtered-ends)) 39 | (if (= (inc (.-startPosition last-item)) (.-startPosition current-term)) 40 | (recur terms current-term (conj current-seq current-term) filtered-ends) 41 | (recur terms current-term [current-term] (conj filtered-ends (last current-seq)))))))) 42 | 43 | (defn pair-begins-with-ends [spans-start-hits spans-end-hits] 44 | (let [grouped-ends (group-sequencial-ending spans-end-hits)] 45 | (loop [[start & starts-tail :as starts] spans-start-hits 46 | [end & ends-tail] grouped-ends 47 | pairs []] 48 | (if (or (nil? start) (nil? end)) 49 | pairs 50 | (if (= start end) 51 | (recur starts ends-tail pairs) 52 | (recur (remove #(< (.-startPosition %) (.-startPosition end)) starts-tail) 53 | ends-tail (conj pairs [start end]))))))) 54 | 55 | (defn ordered-hits->highlights 56 | "The default highlighter fails to handle SpanNearQuery: highlights are term highlights not the whole 57 | span highlights. 
58 | The temporary workaround works as follows: 59 | 1) find the very first hit 60 | 2) find the very last hit 61 | 3) assume that all spans begins and ends with the same terms 62 | 4) collect all hits like the beginning 63 | 5) collect all hits like the ending 64 | 6) pair beginnings with endings and make one highlight per pair" 65 | [text type-name query-id metadata ordered-hits] 66 | (let [^HighlightsMatch$Hit first-hit (apply min-key #(.-startOffset ^HighlightsMatch$Hit %) ordered-hits) 67 | first-text (subs text (.-startOffset first-hit) (.-endOffset first-hit)) 68 | ^HighlightsMatch$Hit last-hit (apply max-key #(.-startOffset ^HighlightsMatch$Hit %) ordered-hits) 69 | last-text (subs text (.-startOffset last-hit) (.-endOffset last-hit)) 70 | spans-start-hits (filter-and-sort-ordered-hits text first-text ordered-hits) 71 | spans-end-hits (filter-and-sort-ordered-hits text last-text ordered-hits) 72 | normalized-metadata (dissoc metadata "_in-order")] 73 | (map (fn [[^HighlightsMatch$Hit span-start-hit ^HighlightsMatch$Hit span-end-hit]] 74 | (let [start-offset (.-startOffset span-start-hit) 75 | end-offset (.-endOffset span-end-hit)] 76 | (->Highlight 77 | (subs text start-offset end-offset) 78 | (or (get meta "_type") type-name) 79 | query-id 80 | normalized-metadata 81 | start-offset 82 | end-offset))) (pair-begins-with-ends spans-start-hits spans-end-hits)))) 83 | 84 | (defn match->annotation [text ^Monitor monitor type-name ^HighlightsMatch match] 85 | (mapcat 86 | (fn [[_ hits]] 87 | (let [query-id (.getQueryId match) 88 | metadata (into {} (.getMetadata (.getQuery monitor query-id)))] 89 | (if (get metadata "_in-order") 90 | (ordered-hits->highlights text type-name query-id metadata hits) 91 | (map (fn [^HighlightsMatch$Hit hit] 92 | (let [start-offset (.-startOffset hit) 93 | end-offset (.-endOffset hit)] 94 | (->Highlight 95 | (subs text start-offset end-offset) 96 | (or (get metadata "_type") type-name) 97 | query-id 98 | metadata 99 | start-offset 
100 | end-offset))) hits)))) 101 | (.getHits match))) 102 | 103 | (def ^FieldType field-type 104 | (doto (FieldType.) 105 | (.setTokenized true) 106 | (.setIndexOptions IndexOptions/DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) 107 | (.setStoreTermVectors true) 108 | (.setStoreTermVectorOffsets true))) 109 | 110 | (defn annotate-text [^String text ^Monitor monitor field-names ^String type-name] 111 | (try 112 | (let [doc (Document.)] 113 | (doseq [field-name field-names] 114 | (.add doc (Field. ^String field-name text field-type))) 115 | (mapcat #(match->annotation text monitor type-name %) 116 | (.getMatches 117 | (.match monitor 118 | #^"[Lorg.apache.lucene.document.Document;" (into-array Document [doc]) 119 | (HighlightsMatch/MATCHER)) 120 | 0))) 121 | (catch Exception e 122 | (log/errorf "Failed to match text: '%s'" text) 123 | (.printStackTrace e)))) 124 | 125 | (defn prepare-synonyms [query-id {:keys [synonyms] :as dict-entry}] 126 | (map (fn [synonym] 127 | (->DictionaryEntry 128 | synonym 129 | (:type dict-entry) 130 | (str (UUID/randomUUID)) 131 | nil 132 | (:case-sensitive? dict-entry) 133 | (:ascii-fold? dict-entry) 134 | (:stem? dict-entry) 135 | (:stemmer dict-entry) 136 | (:slop dict-entry) 137 | (:tokenizer dict-entry) 138 | (assoc (:meta dict-entry) 139 | :synonym? "true" :query-id query-id))) 140 | synonyms)) 141 | 142 | (defn dict-entry->terms [dict-entry default-analysis-conf] 143 | (let [analyzer (text-analysis/get-string-analyzer dict-entry default-analysis-conf)] 144 | (into-array String (text-analysis/text->token-strings (:text dict-entry) analyzer)))) 145 | 146 | (defn merge-dict-entry-with-highlighter-opts 147 | "There are dictionary opts that do not contribute to text analysis, but contributes 148 | to querying. This function acts a single point in merging default highlighter opts 149 | to the dictionary entry." 150 | [dict-entry default-analysis-conf] 151 | (cond-> dict-entry 152 | (and (not (contains? dict-entry :slop)) 153 | (contains? 
default-analysis-conf :slop)) 154 | (assoc :slop (:slop default-analysis-conf)) 155 | 156 | (and (not (contains? dict-entry :in-order?)) 157 | (contains? default-analysis-conf :in-order?)) 158 | (assoc :in-order? (:in-order? default-analysis-conf)))) 159 | 160 | (defn dict-entry->monitor-query [dict-entry default-analysis-conf idx] 161 | (let [field-name (text-analysis/get-field-name dict-entry default-analysis-conf) 162 | terms (dict-entry->terms dict-entry default-analysis-conf) 163 | {:keys [id text meta type slop in-order?] 164 | :as dict-entry} (merge-dict-entry-with-highlighter-opts dict-entry default-analysis-conf) 165 | query-id (or id (str idx)) 166 | metadata (reduce (fn [m [k v]] (assoc m (name k) v)) {} (if type (assoc meta :_type type) meta)) 167 | normalized-slop (when slop (max 0 (min slop Integer/MAX_VALUE)))] 168 | (if (seq terms) 169 | (if (or (and (and (number? slop) (< 0 slop)) in-order? (< 1 (count terms))) 170 | (:fuzzy? dict-entry)) 171 | (MonitorQuery. query-id 172 | (try 173 | (let [ordered? (cond 174 | in-order? true 175 | (and (nil? in-order?) (:fuzzy? dict-entry)) true 176 | :else false) 177 | snqb (SpanNearQuery$Builder. ^String field-name ordered?)] 178 | (doseq [term terms] 179 | (if (true? (:fuzzy? dict-entry)) 180 | (.addClause snqb (SpanMultiTermQueryWrapper. 181 | (FuzzyQuery. 182 | (Term. ^String field-name ^String term) 183 | (or (:fuzziness dict-entry) 1)))) 184 | (.addClause snqb (SpanTermQuery. (Term. ^String field-name ^String term))))) 185 | (when-not (= slop normalized-slop) 186 | (log/warnf "Phrase slop '%s' normalized to '%s'" slop normalized-slop)) 187 | (when normalized-slop 188 | (.setSlop snqb normalized-slop)) 189 | (.build snqb)) 190 | (catch Exception e (.printStackTrace e))) 191 | text 192 | (assoc metadata "_in-order" true)) 193 | (MonitorQuery. query-id 194 | (let [mpqb (MultiPhraseQuery$Builder.)] 195 | (doseq [s terms] 196 | (.add mpqb (Term. 
^String field-name ^String s))) 197 | (when slop 198 | (when-not (= slop normalized-slop) 199 | (log/warnf "Phrase slop '%s' normalized to '%s'" slop normalized-slop)) 200 | (.setSlop mpqb normalized-slop)) 201 | (.build mpqb)) 202 | text 203 | metadata)) 204 | (log/warnf "Discarding the dictionary entry because no tokens: '%s'" dict-entry)))) 205 | 206 | (defn dict-entries->monitor-queries [dict-entries default-analysis-conf] 207 | (->> dict-entries 208 | (mapcat (fn [idx dict-entry] 209 | (let [query-id (or (get dict-entry :id) (str idx))] 210 | (cons 211 | (dict-entry->monitor-query dict-entry default-analysis-conf idx) 212 | (map #(dict-entry->monitor-query % default-analysis-conf nil) 213 | (prepare-synonyms query-id dict-entry))))) 214 | (range)) 215 | (remove nil?))) 216 | 217 | (defn synonym-annotation? [annotation] 218 | (= "true" (get-in annotation [:meta "synonym?"]))) 219 | 220 | (defn meta-type? [annotation] 221 | (string? (get-in annotation [:meta "_type"]))) 222 | 223 | (defn post-process [annotation] 224 | (cond-> annotation 225 | (synonym-annotation? annotation) (assoc :dict-entry-id (get-in annotation [:meta "query-id"])) 226 | (meta-type? annotation) (update-in [:meta] dissoc "_type"))) 227 | 228 | (defn match [text monitor field-names type-name opts] 229 | (if (s/blank? text) 230 | [] 231 | (let [annotations (map post-process (annotate-text text monitor field-names type-name))] 232 | (if (:merge-annotations? opts) 233 | (merger/merge-same-type-annotations annotations) 234 | annotations)))) 235 | 236 | (defn highlighter 237 | "Creates a highlighter function with for a given dictionary. 238 | Params: 239 | - dictionary 240 | a list of dictionary entries as described in `beagle.schema/dict-entry` 241 | Opts: 242 | - type-name 243 | a string, defaults to \"PHRASE\" 244 | - validate-dictionary? 245 | if set to true then validates the dictionary, default false 246 | - optimize-dictionary? 
247 | if set to true then optimizes dictionary before creating the monitor, default false 248 | - tokenizer 249 | a keyword one of #{:keyword :letter :standard :classic :strict :unicode-whitespace :whitespace}, default :standard 250 | - case-sensitive? 251 | if set to true text matching is case sensitive, default true 252 | - ascii-fold? 253 | if set to true before matching text is ascii folded, default false 254 | - stem? 255 | if set to true before matching text is stemmed, default false 256 | - stemmer 257 | a keyword one of #{:arabic :armenian :basque :catalan :danish :dutch :english :estonian 258 | :finnish :french :german :german2 :hungarian :irish :italian :kp :lithuanian :lovins 259 | :norwegian :porter :portuguese :romanian :russian :spanish :swedish :turkish} 260 | that specifies the stemmer algorithm, default :english 261 | - slop 262 | the max edit-distance for phrase matching, default 0 263 | - in-order? 264 | if set to true enforces phrase terms ordering in matches, default false" 265 | ([dictionary] (highlighter dictionary {})) 266 | ([dictionary opts] 267 | (when (:validate-dictionary? opts) (validator/validate-dictionary dictionary)) 268 | (let [dictionary (if (:optimize-dictionary? opts) (optimizer/optimize dictionary) dictionary) 269 | type-name (if (s/blank? (:type-name opts)) "PHRASE" (:type-name opts)) 270 | {:keys [monitor field-names]} (monitor/setup dictionary opts dict-entries->monitor-queries)] 271 | (fn 272 | ([text] (match text monitor field-names type-name {})) 273 | ([text opts] (match text monitor field-names type-name opts)))))) 274 | 275 | (defn ^:deprecated annotator 276 | [dictionary & {:keys [type-name validate-dictionary? optimize-dictionary? tokenizer]}] 277 | (when validate-dictionary? (validator/validate-dictionary dictionary)) 278 | (let [dictionary (if optimize-dictionary? (optimizer/optimize dictionary) dictionary) 279 | type-name (if (s/blank? 
type-name) "PHRASE" type-name) 280 | {:keys [monitor field-names]} (monitor/setup dictionary {:tokenizer tokenizer} 281 | dict-entries->monitor-queries)] 282 | (fn 283 | ([text] (match text monitor field-names type-name {})) 284 | ([text & {:as opts}] (match text monitor field-names type-name opts))))) 285 | -------------------------------------------------------------------------------- /src/beagle/readers.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.readers 2 | (:require [clojure.string :as s] 3 | [clojure.edn :as edn] 4 | [clojure.java.io :as io] 5 | [clojure.data.csv :as csv] 6 | [jsonista.core :as json]) 7 | (:import (java.io PushbackReader))) 8 | 9 | (def mapper (json/object-mapper {:decode-key-fn true})) 10 | 11 | (defn read-edn 12 | "Reads a dictionary from the source. 13 | `source` - anything accepted by `clojure.java.io/input-stream` (e.g. a file path, File, URL, or open InputStream)." 14 | [source] 15 | (with-open [rdr (PushbackReader. (io/reader (io/input-stream source)))] 16 | (doall (edn/read rdr)))) 17 | 18 | (defn read-csv "Reads a dictionary from a CSV source. The first row is a header whose cells become keyword keys. For each data row: cells are trimmed, blank cells are dropped, `synonyms` is split on `;` into a list, `case-sensitive?` and `ascii-fold?` are coerced to booleans, and `meta` is parsed as `;`-separated alternating key/value tokens." [source] 19 | (with-open [reader (io/reader source)] 20 | (let [[header & lines] (csv/read-csv reader :separator \, :quote \") 21 | kvs (map keyword header)] 22 | (->> lines 23 | (map (fn [line] (map s/trim line))) 24 | (map #(apply array-map (interleave kvs %))) 25 | (map #(into {} (remove (fn [[_ v]] (s/blank? v)) %))) 26 | (map (fn [{:keys [synonyms] :as dict}] 27 | (if-not (s/blank? synonyms) 28 | (assoc dict :synonyms (map s/trim (s/split synonyms #";"))) 29 | dict))) 30 | (map (fn [{:keys [case-sensitive?] :as dict}] 31 | (if-not (s/blank? case-sensitive?) 32 | (assoc dict :case-sensitive? (Boolean/valueOf ^String case-sensitive?)) 33 | dict))) 34 | (map (fn [{:keys [ascii-fold?] :as dict}] 35 | (if-not (s/blank? ascii-fold?) 36 | (assoc dict :ascii-fold? (Boolean/valueOf ^String ascii-fold?)) 37 | dict))) 38 | (map (fn [{:keys [meta] :as dict}] 39 | (if-not (s/blank? 
meta) 40 | (assoc dict :meta (reduce (fn [acc [k v]] (assoc acc k v)) 41 | {} 42 | (->> (map s/trim (s/split meta #";")) 43 | (partition-all 2) 44 | (remove (fn [[_ v]] (s/blank? (str v))))))) 45 | 46 | dict))) 47 | (doall))))) 48 | 49 | (defn read-json "Reads a dictionary from a JSON source; keys are decoded as keywords (see `mapper`)." [source] 50 | (with-open [rdr (io/reader (io/input-stream source))] 51 | (doall (json/read-value rdr mapper)))) 52 | -------------------------------------------------------------------------------- /src/beagle/schema.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.schema 2 | (:require [clojure.spec.alpha :as s] 3 | [clojure.spec.gen.alpha :as gen] 4 | [clojure.string :as str])) 5 | 6 | (s/def ::non-empty-string 7 | (s/and string? (complement str/blank?))) 8 | 9 | (s/def ::text ::non-empty-string) 10 | (s/def ::type (s/nilable string?)) 11 | (s/def ::id (s/nilable string?)) 12 | (s/def ::synonyms (s/nilable (s/coll-of ::non-empty-string))) 13 | (s/def ::case-sensitive? (s/nilable boolean?)) 14 | (s/def ::ascii-fold? (s/nilable boolean?)) 15 | (s/def ::stem? (s/nilable boolean?)) 16 | (s/def ::stemmer (s/nilable keyword?)) 17 | (s/def ::slop (s/nilable #(and (number? %) (or (pos-int? %) (zero? %))))) 18 | (s/def ::tokenizer (s/nilable keyword?)) 19 | (s/def ::in-order? (s/nilable boolean?)) 20 | (s/def ::meta 21 | (s/with-gen 22 | (s/nilable (s/map-of #(or (string? %) (keyword? %)) string?)) 23 | #(gen/fmap (fn [s] {s s}) (s/gen string?)))) 24 | 25 | (s/def ::dict-entry 26 | (s/keys :req-un [::text] 27 | :opt-un [::type ::id ::synonyms ::meta 28 | ::case-sensitive? ::ascii-fold? ::stem? ::stemmer ::slop 29 | ::tokenizer ::in-order?])) 30 | 31 | (defrecord DictionaryEntry [text type id synonyms case-sensitive? ascii-fold? 32 | stem? stemmer slop tokenizer meta]) 33 | 34 | (s/def ::dictionary (s/coll-of ::dict-entry)) 35 | 36 | (s/def ::begin-offset nat-int?) 37 | (s/def ::end-offset pos-int?) 
38 | (s/def ::dict-entry-id ::non-empty-string) 39 | 40 | (s/def ::dictionary-annotation 41 | (s/keys :req-un [::text ::type ::begin-offset ::end-offset] 42 | :opt-un [::dict-entry-id ::meta])) 43 | 44 | (defrecord Highlight [text type dict-entry-id meta begin-offset end-offset]) 45 | 46 | (s/def ::annotations (s/coll-of ::dictionary-annotation)) 47 | -------------------------------------------------------------------------------- /src/beagle/text_analysis.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.text-analysis 2 | (:require [clojure.string :as string] 3 | [clojure.tools.logging :as log]) 4 | (:import (org.apache.lucene.analysis Analyzer Analyzer$TokenStreamComponents Tokenizer TokenStream) 5 | (org.apache.lucene.analysis.core LowerCaseFilter WhitespaceTokenizer LetterTokenizer KeywordTokenizer UnicodeWhitespaceTokenizer) 6 | (org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter) 7 | (org.apache.lucene.analysis.standard ClassicFilter StandardTokenizer ClassicTokenizer) 8 | (org.apache.lucene.analysis.tokenattributes CharTermAttribute) 9 | (org.apache.lucene.analysis.pattern PatternTokenizer) 10 | (org.apache.lucene.analysis.snowball SnowballFilter) 11 | (org.tartarus.snowball.ext LithuanianStemmer ArabicStemmer ArmenianStemmer BasqueStemmer EnglishStemmer CatalanStemmer DanishStemmer DutchStemmer EstonianStemmer FinnishStemmer FrenchStemmer German2Stemmer GermanStemmer HungarianStemmer IrishStemmer ItalianStemmer KpStemmer LovinsStemmer NorwegianStemmer PorterStemmer PortugueseStemmer RomanianStemmer RussianStemmer SpanishStemmer SwedishStemmer TurkishStemmer) 12 | (org.tartarus.snowball SnowballProgram) 13 | (java.io StringReader))) 14 | 15 | (defn ^SnowballProgram stemmer 16 | "Creates a stemmer object given the stemmer keyword. 17 | Default stemmer is English." 18 | [stemmer-kw] 19 | (case stemmer-kw 20 | :arabic (ArabicStemmer.) 21 | :armenian (ArmenianStemmer.) 22 | :basque (BasqueStemmer.) 
23 | :catalan (CatalanStemmer.) 24 | :danish (DanishStemmer.) 25 | :dutch (DutchStemmer.) 26 | :english (EnglishStemmer.) 27 | :estonian (EstonianStemmer.) 28 | :finnish (FinnishStemmer.) 29 | :french (FrenchStemmer.) 30 | :german2 (German2Stemmer.) 31 | :german (GermanStemmer.) 32 | :hungarian (HungarianStemmer.) 33 | :irish (IrishStemmer.) 34 | :italian (ItalianStemmer.) 35 | :kp (KpStemmer.) 36 | :lithuanian (LithuanianStemmer.) 37 | :lovins (LovinsStemmer.) 38 | :norwegian (NorwegianStemmer.) 39 | :porter (PorterStemmer.) 40 | :portuguese (PortugueseStemmer.) 41 | :romanian (RomanianStemmer.) 42 | :russian (RussianStemmer.) 43 | :spanish (SpanishStemmer.) 44 | :swedish (SwedishStemmer.) 45 | :turkish (TurkishStemmer.) 46 | (do 47 | (when stemmer-kw 48 | (log/debugf "Stemmer '%s' not found! EnglishStemmer is used." stemmer-kw)) 49 | (EnglishStemmer.)))) 50 | 51 | (defn ^Tokenizer tokenizer [tokenizer-kw] 52 | (case tokenizer-kw 53 | :keyword (KeywordTokenizer.) 54 | :letter (LetterTokenizer.) 55 | :classic (ClassicTokenizer.) 56 | :standard (StandardTokenizer.) 57 | :strict (PatternTokenizer. #"[^a-zA-Z0-9{}\[\]()<>#+=@&']+" -1) 58 | :unicode-whitespace (UnicodeWhitespaceTokenizer.) 59 | :whitespace (WhitespaceTokenizer.) 60 | (do 61 | (when tokenizer-kw 62 | (log/debugf "Tokenizer '%s' not found. StandardTokenizer is used." tokenizer-kw)) 63 | (StandardTokenizer.)))) 64 | 65 | (defn analyzer-constructor [{tokenizer-kw :tokenizer 66 | ascii-fold? :ascii-fold? 67 | case-sensitive? :case-sensitive? 68 | stem? :stem? 69 | stemmer-kw :stemmer}] 70 | (proxy [Analyzer] [] 71 | (createComponents [^String field-name] 72 | (let [^Tokenizer tokenizr (tokenizer tokenizer-kw) 73 | ^TokenStream filters-chain (cond-> tokenizr 74 | (not case-sensitive?) (LowerCaseFilter.) 75 | ascii-fold? (ASCIIFoldingFilter.)) 76 | token-stream (if stem? 77 | (SnowballFilter. filters-chain (stemmer stemmer-kw)) 78 | (if (instance? Tokenizer filters-chain) 79 | (ClassicFilter. 
tokenizr) 80 | filters-chain))] 81 | (Analyzer$TokenStreamComponents. 82 | ^Tokenizer tokenizr ^TokenStream token-stream))))) 83 | 84 | (defn field-name-constructor [{tokenizer-kw :tokenizer 85 | ascii-fold? :ascii-fold? 86 | case-sensitive? :case-sensitive? 87 | stem? :stem? 88 | stemmer-kw :stemmer}] 89 | (let [tokenizr (str (name (or tokenizer-kw :standard)) "-tokenizer") 90 | filters (cond-> [] 91 | (not case-sensitive?) (conj "lowercased") 92 | ascii-fold? (conj "ascii-folded") 93 | stem? (conj (str "stemmed-" (name (or stemmer-kw :english)))))] 94 | (if (seq filters) 95 | (str "text" "." tokenizr "." (string/join "-" (sort filters))) 96 | (str "text" "." tokenizr)))) 97 | 98 | (def analyzer (memoize analyzer-constructor)) 99 | (def field-name (memoize field-name-constructor)) 100 | 101 | (def default-conf 102 | {:tokenizer :standard 103 | :case-sensitive? true 104 | :ascii-fold? false 105 | :stem? false 106 | :stemmer :english}) 107 | 108 | (defrecord Conf [tokenizer case-sensitive? ascii-fold? stem? stemmer]) 109 | 110 | (defn three-way-merge 111 | "Given a key and three maps return the value that would appear in the map after merge. 112 | Semantics is of the default Clojure merge." 113 | [k m1 m2 m3] 114 | (if (nil? (k m3)) 115 | (if (nil? (k m2)) 116 | (k m1) 117 | (k m2)) 118 | (k m3))) 119 | 120 | (defn ^Analyzer get-string-analyzer [analysis-conf default-analysis-conf] 121 | (analyzer (->Conf 122 | (three-way-merge :tokenizer default-conf default-analysis-conf analysis-conf) 123 | (three-way-merge :case-sensitive? default-conf default-analysis-conf analysis-conf) 124 | (three-way-merge :ascii-fold? default-conf default-analysis-conf analysis-conf) 125 | (three-way-merge :stem? 
default-conf default-analysis-conf analysis-conf) 126 | (three-way-merge :stemmer default-conf default-analysis-conf analysis-conf)))) 127 | 128 | (defn ^String get-field-name "Returns the index field name that encodes the effective analysis options (tokenizer, case sensitivity, ascii folding, stemming) after merging entry, default, and built-in configs." [analysis-conf default-analysis-conf] 129 | (field-name (->Conf 130 | (three-way-merge :tokenizer default-conf default-analysis-conf analysis-conf) 131 | (three-way-merge :case-sensitive? default-conf default-analysis-conf analysis-conf) 132 | (three-way-merge :ascii-fold? default-conf default-analysis-conf analysis-conf) 133 | (three-way-merge :stem? default-conf default-analysis-conf analysis-conf) 134 | (three-way-merge :stemmer default-conf default-analysis-conf analysis-conf)))) 135 | 136 | (defn text->token-strings 137 | "Given a text and an analyzer returns a vector of tokens as strings. The underlying token stream is always closed, even when tokenization throws." 138 | [^String text ^Analyzer analyzer] 139 | (let [^TokenStream token-stream (.tokenStream analyzer "not-important" (StringReader. text)) 140 | ^CharTermAttribute termAtt (.addAttribute token-stream CharTermAttribute)] 141 | (try 142 | (.reset token-stream) 143 | (loop [acc []] 144 | (if (.incrementToken token-stream) 145 | (recur (conj acc (.toString termAtt))) 146 | (do (.end token-stream) 147 | acc))) 148 | (finally (.close token-stream))))) 149 | -------------------------------------------------------------------------------- /src/beagle/validator.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.validator 2 | (:gen-class) 3 | (:require [clojure.spec.alpha :as s] 4 | [beagle.schema :as sch] 5 | [beagle.readers :as readers])) 6 | 7 | (defn validate-dictionary "Conforms `dictionary` against :beagle.schema/dictionary; returns the conformed value, or :clojure.spec.alpha/invalid on failure." [dictionary] 8 | (s/conform ::sch/dictionary dictionary)) 9 | 10 | (defn valid-dictionary? "Truthy when `dictionary` conforms and is non-empty, nil otherwise. Relies on `seq` throwing when handed the ::s/invalid keyword, which the catch turns into nil." [dictionary] 11 | (try 12 | (seq (validate-dictionary dictionary)) 13 | (catch Exception _))) 14 | 15 | (def supported-dictionary-file-types #{"csv" "json" "edn"}) 16 | 17 | (defn valid-dictionary-file? [dictionary-file dictionary-file-type] 18 | (if (contains? 
supported-dictionary-file-types dictionary-file-type) 19 | (valid-dictionary? (case dictionary-file-type 20 | "csv" (readers/read-csv dictionary-file) 21 | "json" (readers/read-json dictionary-file) 22 | "edn" (readers/read-edn dictionary-file))) 23 | (.printStackTrace (Exception. (format "File type not supported: `%s`" dictionary-file-type))))) 24 | 25 | (defn -main [& args] 26 | (when (odd? (count args)) 27 | (.printStackTrace (Exception. "Even number of arguments must be present - 'dictionary-name dictionary-type ...'")) 28 | (System/exit 1)) 29 | (when (some #(not (apply valid-dictionary-file? %)) (partition-all 2 args)) 30 | (System/exit 1))) 31 | -------------------------------------------------------------------------------- /test/beagle/annotation_merge_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.annotation-merge-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.phrases :as phrases] 4 | [beagle.annotation-merger :as merger])) 5 | 6 | (deftest annotator-with-merge-option-test 7 | (let [dictionary [{:text "1 2"} {:text "2"} {:text "1 2 3 4"} 8 | {:text "4"} {:text "5"} {:text "6 5 3 7"} {:text "6 5"}] 9 | highlighter-fn (phrases/highlighter dictionary {:type-name "TEST"}) 10 | text "A B C 1 2 3 4 D E F G 6 5 3 7"] 11 | (is (= (count (highlighter-fn text {:merge-annotations? false})) (count (highlighter-fn text)))) 12 | (is (< (count (highlighter-fn text {:merge-annotations? true})) (count (highlighter-fn text)))) 13 | (is (= [(set (vals {:begin-offset 6 14 | :dict-entry-id "2" 15 | :end-offset 13 16 | :meta {} 17 | :text "1 2 3 4" 18 | :type "TEST"})) 19 | (set (vals {:begin-offset 22 20 | :dict-entry-id "5" 21 | :end-offset 29 22 | :meta {} 23 | :text "6 5 3 7" 24 | :type "TEST"}))] 25 | (map #(-> % vals set) (highlighter-fn text {:merge-annotations? 
true})))))) 26 | 27 | (deftest annotation-merge-test 28 | (is (= [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5}] 29 | (merger/merge-same-type-annotations 30 | [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5} 31 | {:text "A" :type "TEST" :dict-entry-id "3" :meta {} :begin-offset 0 :end-offset 1} 32 | {:text "AAAA" :type "TEST" :dict-entry-id "2" :meta {} :begin-offset 1 :end-offset 5}]))) 33 | 34 | (is (= [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5} 35 | {:text "AAA" :type "TEST2" :dict-entry-id "10" :meta {} :begin-offset 0 :end-offset 3}] 36 | (merger/merge-same-type-annotations 37 | [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5} 38 | {:text "A" :type "TEST" :dict-entry-id "2" :meta {} :begin-offset 0 :end-offset 1} 39 | {:text "AAAA" :type "TEST" :dict-entry-id "3" :meta {} :begin-offset 1 :end-offset 5} 40 | {:text "AAA" :type "TEST2" :dict-entry-id "10" :meta {} :begin-offset 0 :end-offset 3} 41 | {:text "A" :type "TEST2" :dict-entry-id "11" :meta {} :begin-offset 0 :end-offset 1}])))) 42 | 43 | -------------------------------------------------------------------------------- /test/beagle/corner_case_phrases_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.corner-case-phrases-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.phrases :as phrases]) 4 | (:import (org.jsoup Jsoup))) 5 | 6 | (deftest corner-cases 7 | (let [annotator (phrases/highlighter [{:text "N-Able N-Central" 8 | :case-sensitive? false}]) 9 | text (some-> (Jsoup/parse (slurp "test/resources/phrases.html")) (.body) (.text))] 10 | (is (empty? 
(annotator text))))) 11 | -------------------------------------------------------------------------------- /test/beagle/dictionary_optimization_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.dictionary-optimization-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.dictionary-optimizer :as optimizer] 4 | [beagle.phrases :as phrases])) 5 | 6 | (deftest meta-merge-test 7 | (is (optimizer/mergeable-meta? nil {:meta {:email "123"}})) 8 | (is (optimizer/mergeable-meta? {:meta {}} {:meta {:email "123"}})) 9 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} nil)) 10 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "123"}})) 11 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "123" :total 5646}})) 12 | (is (optimizer/mergeable-meta? {:meta {:email "123" :total 5646}} {:meta {:email "123"}})) 13 | (is (not (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "321"}}))) 14 | (is (not (optimizer/mergeable-meta? {:meta {:email "123" :total 5646}} {:meta {:email "123" :total 9999}}))) 15 | (is (= [{:ascii-fold? true 16 | :case-sensitive? true 17 | :id "test-id" 18 | :meta {:abc "123" :email "test@example.com"} 19 | :synonyms ["abc" "XXXX"] 20 | :text "test text"} 21 | {:ascii-fold? true 22 | :case-sensitive? true 23 | :id "test-id" 24 | :meta {:email "bobby@example.com"} 25 | :synonyms ["def"] 26 | :text "test text"}] 27 | (optimizer/aggregate-entries-by-meta 28 | [{:text "test text" 29 | :id "test-id" 30 | :synonyms ["abc"] 31 | :case-sensitive? true 32 | :ascii-fold? true 33 | :meta {:email "test@example.com"}} 34 | {:text "test text" 35 | :id "test-id" 36 | :synonyms ["def"] 37 | :case-sensitive? true 38 | :ascii-fold? true 39 | :meta {:email "bobby@example.com"}} 40 | {:text "test text" 41 | :id "test-id" 42 | :synonyms ["XXXX"] 43 | :case-sensitive? true 44 | :ascii-fold? 
true 45 | :meta {:email "test@example.com" :abc "123"}}])))) 46 | 47 | (deftest dictionary-optimization-test 48 | (let [dictionary [{:case-sensitive? true 49 | :ascii-fold? true 50 | :synonyms ["AAAA1"] 51 | :text "AAAA"} 52 | {:case-sensitive? true 53 | :ascii-fold? true 54 | :synonyms ["AAAA2"] 55 | :text "AAAA"} 56 | {:case-sensitive? false 57 | :ascii-fold? true 58 | :synonyms ["AAAA3"] 59 | :text "AAAA"} 60 | {:case-sensitive? true 61 | :ascii-fold? true 62 | :synonyms ["AAAA4"] 63 | :text "AAAA"} 64 | {:case-sensitive? true 65 | :ascii-fold? false 66 | :synonyms ["AAAA5"] 67 | :text "AAAA"} 68 | {:case-sensitive? true 69 | :ascii-fold? false 70 | :synonyms ["AAAA"] 71 | :text "AAAA"} 72 | {:case-sensitive? false 73 | :synonyms ["BBBB1"] 74 | :text "BBBB"} 75 | {:case-sensitive? false 76 | :synonyms ["BBBB"] 77 | :text "BBBB"}] 78 | expected-dictionary [{:text "AAAA" 79 | :synonyms ["AAAA4" "AAAA2" "AAAA1"] 80 | :case-sensitive? true 81 | :ascii-fold? true} 82 | {:case-sensitive? false :ascii-fold? true :synonyms ["AAAA3"] :text "AAAA"} 83 | {:text "AAAA" :synonyms ["AAAA5"] :case-sensitive? true :ascii-fold? false} 84 | {:text "BBBB" :synonyms ["BBBB1"] :case-sensitive? 
false}] 85 | optimized-dictionary (optimizer/optimize dictionary)] 86 | (is (< (count optimized-dictionary) (count dictionary))) 87 | (is (= (count expected-dictionary) (count optimized-dictionary))) 88 | (is (= (set (map #(update % :synonyms set) expected-dictionary)) 89 | (set (map #(update % :synonyms set) optimized-dictionary)))))) 90 | 91 | (deftest synonym-optimization 92 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle" "luwak1"]}] 93 | monitor-queries (phrases/dict-entries->monitor-queries dictionary {:tokenizer :standard})] 94 | (is (= 3 (count monitor-queries))) 95 | (let [highlighter-fn (phrases/highlighter dictionary {:type-name "TEST"}) 96 | anns (highlighter-fn "this is a beagle text test luwak1")] 97 | (is (= 3 (count anns)))))) 98 | -------------------------------------------------------------------------------- /test/beagle/java_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.java-test 2 | (:require [clojure.test :refer [deftest is]])) 3 | 4 | (deftest simple-java-interface 5 | (let [de (doto (lt.tokenmill.beagle.phrases.DictionaryEntry. "test") 6 | (.setSlop (Integer. 1))) 7 | annotator (lt.tokenmill.beagle.phrases.Annotator. [de] {})] 8 | (is (= "test" (first (map #(.text %) (.annotate annotator "test txt" {}))))))) 9 | 10 | (deftest case-sensitivity 11 | (let [de (doto (lt.tokenmill.beagle.phrases.DictionaryEntry. "LYNDON BAINES JOHNSON") 12 | (.setCaseSensitive false)) 13 | annotator (lt.tokenmill.beagle.phrases.Annotator. [de] {})] 14 | (is (= 1 (count (filter #(= "Lyndon Baines Johnson" (.text %)) (.annotate annotator "Lyndon Baines Johnson (/ˈlɪndən ˈbeɪnz/; August 27, 1908 – January 22, 1973), often referred to as LBJ, was an American politician who served as the 36th president of the United States from 1963 to 1969." 
{}))))))) 15 | -------------------------------------------------------------------------------- /test/beagle/lucene_alpha_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.lucene-alpha-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.lucene-alpha :as lucene])) 4 | 5 | (deftest smoke 6 | (let [txt "some text this other that" 7 | dictionary [{:text "this AND that" :id "1" :slop 1}] 8 | annotator-fn (lucene/annotator dictionary) 9 | [ann1 :as anns] (annotator-fn txt {}) 10 | anns2 (annotator-fn txt)] 11 | (is (= anns anns2)) 12 | (is (= 1 (count anns))) 13 | (is (= "1" (:dict-entry-id ann1))))) 14 | 15 | (deftest ^:noisy smoke-2 16 | (let [txt "some text this AND" 17 | dictionary [{:text "this AND" :id "1" :slop 1}] 18 | annotator-fn (lucene/annotator dictionary) 19 | [ann1 :as anns] (annotator-fn txt)] 20 | (is (= 0 (count anns))) 21 | (is (nil? (:dict-entry-id ann1))))) 22 | 23 | (deftest smoke-3 24 | (let [txt "some number 1234 test" 25 | dictionary [{:text "/.*\\d*.*/" :id "1" :slop 1}] 26 | annotator-fn (lucene/annotator dictionary) 27 | anns (annotator-fn txt)] 28 | (is (< 0 (count anns))))) 29 | -------------------------------------------------------------------------------- /test/beagle/optimization_suggestions_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.optimization-suggestions-test 2 | (:require [clojure.test :refer [deftest is testing]] 3 | [beagle.dictionary-optimizer :as optimizer])) 4 | 5 | (deftest optimization-suggestions 6 | (testing "Suggestions for similar dictionary items" 7 | (is (= [{:dictionary-items [{:id "1" :synonyms ["beagle"] :text "test"} {:id "2" :synonyms ["luwak1"] :text "test"}] 8 | :suggestion "Dictionary items '[1 2]' have identical `[text case-sensitivity ascii-folding] features."}] 9 | (optimizer/dry-run [{:text "test" :id "1" :synonyms ["beagle"]} 10 | {:text "test" :id "2" :synonyms 
["luwak1"]}])))) 11 | 12 | (testing "Suggestions for two similar dictionary item groups" 13 | (is (= [{:suggestion "Dictionary items '[1 3]' have identical `[text case-sensitivity ascii-folding] features." 14 | :dictionary-items [{:id "1" :synonyms ["beagle"] :text "test"} {:id "3" :synonyms ["beagle"] :text "test"}]} 15 | {:suggestion "Dictionary items '[2 4]' have identical `[text case-sensitivity ascii-folding] features." 16 | :dictionary-items [{:id "2" :synonyms ["luwak2"] :text "test2"} {:id "4" :synonyms ["beagle3"] :text "test2"}]}] 17 | (optimizer/dry-run [{:id "1" :synonyms ["beagle"] :text "test"} 18 | {:id "2" :synonyms ["luwak2"] :text "test2"} 19 | {:id "3" :synonyms ["beagle"] :text "test"} 20 | {:id "4" :synonyms ["beagle3"] :text "test2"}])))) 21 | 22 | (testing "Suggestions for single dictionary item" 23 | (is (= [] (optimizer/dry-run [{:id "1" :synonyms ["beagle"] :text "test"}])))) 24 | 25 | (testing "Suggestions for distinct dictionary items" 26 | (is (= [] (optimizer/dry-run [{:id "1" :case-sensitive? true :synonyms ["beagle"] :text "test"} 27 | {:id "2" :synonyms ["beagle"] :text "test2"} 28 | {:id "3" :ascii-fold? false :synonyms ["beagle"] :text "test3"}])))) 29 | 30 | (testing "Suggestions for two similar dictionary item groups and one distinct dictionary item" 31 | (is (= [{:suggestion "Dictionary items '[test 3 4]' have identical `[text case-sensitivity ascii-folding] features." 32 | :dictionary-items [{:synonyms ["beagle"] :text "test"} 33 | {:id "3" :synonyms ["beagle"] :text "test"} 34 | {:id "4" :synonyms ["luwak222"] :text "test"}]} 35 | {:suggestion "Dictionary items '[2 test2]' have identical `[text case-sensitivity ascii-folding] features." 
36 | :dictionary-items [{:id "2" :synonyms ["luwak2"] :text "test2"} {:synonyms ["beagle3"] :text "test2"}]}] 37 | (optimizer/dry-run [{:synonyms ["beagle"] :text "test"} 38 | {:id "2" :synonyms ["luwak2"] :text "test2"} 39 | {:id "3" :synonyms ["beagle"] :text "test"} 40 | {:id "4" :synonyms ["luwak222"] :text "test"} 41 | {:synonyms ["beagle3"] :text "test2"} 42 | {:synonyms ["beagle"] :text "test" :ascii-fold? true}]))))) 43 | -------------------------------------------------------------------------------- /test/beagle/phrases_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.phrases-test 2 | (:require [clojure.test :refer [deftest is testing]] 3 | [clojure.spec.alpha :as s] 4 | [clojure.spec.test.alpha :as stest] 5 | [beagle.phrases :as phrases] 6 | [beagle.schema :as schema])) 7 | 8 | (s/def ::opts (s/* (s/cat :opt keyword? :val any?))) 9 | 10 | (s/fdef phrases/highlighter 11 | :args (s/alt :unary (s/cat :dictionary ::schema/dictionary) 12 | :binary (s/cat :dictionary ::schema/dictionary :opts any?)) 13 | :ret (s/fspec :args (s/alt :unary (s/cat :text string?) 14 | :binary (s/cat :text string? 
:opts any?)) 15 | :ret ::schema/annotations)) 16 | 17 | (stest/instrument `phrases/highlighter) 18 | 19 | (s/exercise-fn `phrases/highlighter) 20 | 21 | (def label "LABEL") 22 | 23 | (deftest dictionary-entry-record 24 | (let [dictionary [(schema/map->DictionaryEntry {:text "test"})] 25 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 26 | anns (highlighter-fn "before annotated test phrase after annotated")] 27 | (is (= 1 (count anns))))) 28 | 29 | (deftest type-per-dictionary-entry 30 | (let [dictionary [{:text "test phrase" :id "1" :meta {:test "test"} :type "CUSTOM"}] 31 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 32 | anns (highlighter-fn "before annotated test phrase after annotated")] 33 | (is (seq (s/conform ::schema/annotations anns))) 34 | (is (seq anns)) 35 | (is (= "1" (-> anns first :dict-entry-id))) 36 | (is (= "CUSTOM" (-> anns first :type))) 37 | (is (= "test phrase" (-> anns first :text))) 38 | (is (nil? (-> anns first (get-in [:meta "_type"])))))) 39 | 40 | (deftest id 41 | (let [dictionary [{:text "test" :id "1" :meta {:test "test"}}] 42 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 43 | anns (highlighter-fn "before annotated test after annotated")] 44 | (is (seq anns)) 45 | (is (= "1" (-> anns first :dict-entry-id))) 46 | (is (= "LABEL" (-> anns first :type))))) 47 | 48 | (deftest metadata-append 49 | (let [dictionary [{:text "test" :meta {"email" "test@example.com"}}] 50 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 51 | anns (highlighter-fn "before annotated test after annotated")] 52 | (is (seq anns)) 53 | (is (= {"email" "test@example.com"} (-> anns first :meta))))) 54 | 55 | (deftest case-sensitivity 56 | (testing "case sensitive" 57 | (let [dictionary [{:text "test"}] 58 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 59 | anns (highlighter-fn "before annotated test after annotated")] 60 | (is (seq anns))) 61 | (let [dictionary 
[{:text "TeSt" :case-sensitive? true}] 62 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 63 | anns (highlighter-fn "before annotated test after annotated")] 64 | (is (empty? anns))) 65 | (let [label "LABEL" 66 | dictionary [{:text "test" :case-sensitive? true}] 67 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 68 | anns (highlighter-fn "before annotated Test after annotated")] 69 | (is (empty? anns)))) 70 | 71 | (testing "case insensitive" 72 | (let [dictionary [{:text "TeSt" :case-sensitive? false}] 73 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 74 | anns (highlighter-fn "before annotated test after annotated")] 75 | (is (seq anns))) 76 | (let [dictionary [{:text "test" :case-sensitive? false}] 77 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 78 | anns (highlighter-fn "before annotated test after annotated")] 79 | (is (seq anns))))) 80 | 81 | (deftest ascii-folding-dictionary 82 | (let [dictionary [{:text "wörd"}] 83 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 84 | anns (highlighter-fn "before annotated wörd after annotated")] 85 | (is (seq anns))) 86 | (let [dictionary [{:text "wörd"}] 87 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 88 | anns (highlighter-fn "before annotated word after annotated")] 89 | (is (empty? anns))) 90 | (let [label "LABEL" 91 | dictionary [{:text "wörd" :ascii-fold? true}] 92 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 93 | anns (highlighter-fn "before annotated word after annotated")] 94 | (is (seq anns))) 95 | (let [dictionary [{:text "word" :ascii-fold? true}] 96 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 97 | anns (highlighter-fn "before annotated wörd after annotated")] 98 | (is (seq anns))) 99 | (let [label "LABEL" 100 | dictionary [{:text "word" :ascii-fold? 
false}] 101 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 102 | anns (highlighter-fn "before annotated wörd after annotated")] 103 | (is (empty? anns)))) 104 | 105 | (deftest ascii-folding-with-case-sensitivity 106 | (let [label "TYPE"] 107 | (testing "case sensitive" 108 | (let [dictionary [{:text "schön" :ascii-fold? true}] 109 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 110 | anns (highlighter-fn "before annotated Schön after annotated")] 111 | (is (empty? anns))) 112 | (let [dictionary [{:text "Schön" :ascii-fold? true}] 113 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 114 | anns (highlighter-fn "before annotated Schon after annotated")] 115 | (is (seq anns))) 116 | (let [dictionary [{:text "schön" :ascii-fold? true}] 117 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 118 | anns (highlighter-fn "before annotated Schon after annotated")] 119 | (is (empty? anns)))) 120 | 121 | (testing "case insensitive" 122 | (let [dictionary [{:text "schön" :ascii-fold? true :case-sensitive? false}] 123 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 124 | anns (highlighter-fn "before annotated Schon after annotated")] 125 | (is (seq anns)))) 126 | (let [dictionary [{:text "schön" :ascii-fold? true :case-sensitive? false}] 127 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 128 | anns (highlighter-fn "before annotated schon after annotated")] 129 | (is (seq anns))) 130 | (let [dictionary [{:text "schon" :ascii-fold? true :case-sensitive? false}] 131 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 132 | anns (highlighter-fn "before annotated schön after annotated")] 133 | (is (seq anns))) 134 | 135 | (testing "false ascii fold" 136 | (let [dictionary [{:text "schon" :ascii-fold? 
false}] 137 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 138 | anns (highlighter-fn "before annotated schön after annotated")] 139 | (is (empty? anns)))))) 140 | 141 | (deftest synonyms 142 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle"]}] 143 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 144 | anns (highlighter-fn "before annotated beagle after annotated")] 145 | (is (= 1 (count anns))) 146 | (is (= "1" (-> anns first :dict-entry-id))) 147 | (is (= "beagle" (-> anns first :text)))) 148 | 149 | (let [dictionary [{:text "test" :id "1" :synonyms ["Luwak"] :case-sensitive? true}] 150 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 151 | anns (highlighter-fn "before annotated beagle after annotated")] 152 | (is (empty? anns))) 153 | 154 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle"] :case-sensitive? false}] 155 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 156 | anns (highlighter-fn "before annotated beagle after annotated")] 157 | (is (= 1 (count anns))) 158 | (is (= "1" (-> anns first :dict-entry-id))) 159 | (is (= "beagle" (-> anns first :text)))) 160 | 161 | (testing "synonyms with false ascii fold" 162 | (let [dictionary [{:text "test" :synonyms ["schön"] :ascii-fold? false}] 163 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 164 | anns (highlighter-fn "before annotated schon after annotated")] 165 | (is (empty? anns))) 166 | (let [dictionary [{:text "test" :synonyms ["schön"] :ascii-fold? 
true}] 167 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 168 | anns (highlighter-fn "before annotated schon after annotated")] 169 | (is (seq anns)) 170 | (is (= "schon" (-> anns first :text)))))) 171 | 172 | (deftest phrase-end-sentence 173 | (let [dictionary [{:text "test-test"}] 174 | highlighter-fn (phrases/highlighter dictionary) 175 | anns (highlighter-fn "before annotated test-test.")] 176 | (is (seq anns)) 177 | (is (= "test-test" (:text (first anns)))))) 178 | 179 | (deftest phrase-in-quotes 180 | (let [dictionary [{:text "test-test" :case-sensitive? false}] 181 | highlighter-fn (phrases/highlighter dictionary) 182 | anns (highlighter-fn "before annotated \"TEST-test\".")] 183 | (is (seq anns)) 184 | (is (= "TEST-test" (:text (first anns)))))) 185 | 186 | (deftest phrase-in-quotes-should-not-match 187 | (let [dictionary [{:text "test-test" :case-sensitive? false}] 188 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace}) 189 | anns (highlighter-fn "before annotated \"TEST-test\".")] 190 | (is (empty? anns)))) 191 | 192 | (deftest overlapping-phrases 193 | (let [dictionary [{:text "test phrase test" :case-sensitive? false}] 194 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace}) 195 | anns (highlighter-fn "start test phrase test phrase test end")] 196 | (is (= 2 (count anns))))) 197 | 198 | (deftest lt-stemming 199 | (let [dictionary [{:text "Kaunas" :id "1" :stem? true :stemmer :lithuanian}] 200 | highlighter-fn (phrases/highlighter dictionary) 201 | anns (highlighter-fn "Kauno miestas")] 202 | (is (seq anns)) 203 | (is (= "Kauno" (-> anns first :text)))) 204 | (let [dictionary [{:text "Kaunas Vilnius" :id "1" :stem? true}] 205 | highlighter-fn (phrases/highlighter dictionary) 206 | anns (highlighter-fn "Kaunas, Vilnius")] 207 | (is (seq anns)) 208 | (is (= "Kaunas, Vilnius" (-> anns first :text)))) 209 | (let [dictionary [{:text "Kaunas" :id "1" :case-sensitive? false :stem? 
true :stemmer :lithuanian}] 210 | highlighter-fn (phrases/highlighter dictionary) 211 | anns (highlighter-fn "kauno miestas")] 212 | (is (seq anns)) 213 | (is (= "kauno" (-> anns first :text))))) 214 | 215 | (deftest en-stemming 216 | (let [txt "who let the dogs out?"] 217 | (let [dictionary [{:text "dog" :id "1"}] 218 | highlighter-fn (phrases/highlighter dictionary) 219 | anns (highlighter-fn txt)] 220 | (is (empty? anns))) 221 | (let [dictionary [{:text "dog" :id "1" :stem? true}] 222 | highlighter-fn (phrases/highlighter dictionary) 223 | anns (highlighter-fn txt)] 224 | (is (seq anns)) 225 | (is (= "dogs" (-> anns first :text)))) 226 | (let [dictionary [{:text "dog" :id "1" :stem? true :stemmer :english}] 227 | highlighter-fn (phrases/highlighter dictionary) 228 | anns (highlighter-fn txt)] 229 | (is (seq anns)) 230 | (is (= "dogs" (-> anns first :text)))) 231 | (let [dictionary [{:text "dog" :id "1" :stem? true :stemmer :estonian}] 232 | highlighter-fn (phrases/highlighter dictionary) 233 | anns (highlighter-fn txt)] 234 | (is (empty? anns))))) 235 | 236 | (deftest mixed-stemmers 237 | (let [txt "Saboniai plays basketball" 238 | dictionary [{:text "Sabonis" :id "1" :stem? true :stemmer :lithuanian} 239 | {:text "play" :id "2" :stem? true :stemmer :english}] 240 | highlighter-fn (phrases/highlighter dictionary) 241 | anns (highlighter-fn txt)] 242 | (is (= 2 (count anns))))) 243 | 244 | (deftest phrase-slop 245 | (let [txt "before start and end after" 246 | dictionary [{:text "start end" :id "1" :slop 1}] 247 | highlighter-fn (phrases/highlighter dictionary) 248 | anns (highlighter-fn txt)] 249 | (is (= 1 (count anns))) 250 | (is (= "start and end" (:text (first anns))))) 251 | (testing "all terms in the phrase should match" 252 | (let [txt "before start end after" 253 | dictionary [{:text "start NOPE end" :id "1" :slop 10}] 254 | highlighter-fn (phrases/highlighter dictionary) 255 | anns (highlighter-fn txt)] 256 | (is (empty? 
anns)))) 257 | (let [txt "before start phrase and end phrase after" 258 | dictionary [{:text "start phrase end phrase" :id "1" :slop 1}] 259 | highlighter-fn (phrases/highlighter dictionary) 260 | anns (highlighter-fn txt)] 261 | (is (= 1 (count anns))) 262 | (is (= "start phrase and end phrase" (:text (first anns))))) 263 | (testing "phrase edit distance" 264 | (let [txt "before start end after" 265 | dictionary [{:text "end start" :id "1" :slop 0}] 266 | highlighter-fn (phrases/highlighter dictionary) 267 | anns (highlighter-fn txt)] 268 | (is (empty? anns))) 269 | (let [txt "before start end after" 270 | dictionary [{:text "end start" :id "1" :slop 2}] 271 | highlighter-fn (phrases/highlighter dictionary) 272 | anns (highlighter-fn txt)] 273 | (is (= 1 (count anns))) 274 | (is (= "start end" (:text (first anns)))))) 275 | (testing "all terms should match despite the slop" 276 | (let [txt "before start end after" 277 | dictionary [{:text "end start foo" :id "1" :slop 100}] 278 | highlighter-fn (phrases/highlighter dictionary) 279 | anns (highlighter-fn txt)] 280 | (is (empty? anns))))) 281 | 282 | (deftest dictionary-corner-cases 283 | (let [txt "Some text to test ." 284 | dictionary [{:text "."} {:text "text"}] 285 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace}) 286 | anns (highlighter-fn txt)] 287 | (is (= 2 (count anns)))) 288 | (let [txt "Some text to test." 289 | dictionary [{:text ""} {:text "text"}] 290 | highlighter-fn (phrases/highlighter dictionary) 291 | anns (highlighter-fn txt)] 292 | (is (seq anns)))) 293 | 294 | (deftest ^:noisy noisy-tests-for-corner-cases 295 | (let [txt "Some text to test." 
296 | dictionary [{:text "."} {:text "text"}] 297 | highlighter-fn (phrases/highlighter dictionary) 298 | anns (highlighter-fn txt)] 299 | (is (seq anns)) 300 | (is (= 1 (count anns)))) 301 | (let [txt " ` `" 302 | dictionary [{:text "test" :id "1"}] 303 | highlighter-fn (phrases/highlighter dictionary) 304 | anns (highlighter-fn txt)] 305 | (is (coll? anns)) 306 | (is (empty? anns))) 307 | (testing "slop versions" 308 | (stest/unstrument `phrases/highlighter) 309 | (testing "nil slop" 310 | (let [txt "before start end after" 311 | dictionary [{:text "end start foo" :id "1" :slop nil}] 312 | highlighter-fn (phrases/highlighter dictionary) 313 | anns (highlighter-fn txt)] 314 | (is (empty? anns)))) 315 | (testing "very big slop" 316 | (let [txt "before start end after" 317 | dictionary [{:text "end start foo" :id "1" :slop 1000000000000}] 318 | highlighter-fn (phrases/highlighter dictionary) 319 | anns (highlighter-fn txt)] 320 | (is (empty? anns)))) 321 | (testing "slop with negative value" 322 | (let [txt "before start end after" 323 | dictionary [{:text "end start foo" :id "1" :slop -1}] 324 | highlighter-fn (phrases/highlighter dictionary) 325 | anns (highlighter-fn txt)] 326 | (is (empty? anns)))) 327 | (stest/instrument `phrases/highlighter))) 328 | 329 | (deftest tokenizer-conf 330 | (let [txt "URGENT! Do this immediately!" 331 | dictionary [{:text "URGENT" :id "a" :tokenizer :whitespace} 332 | {:text "URGENT" :id "b" :tokenizer :standard}] 333 | highlighter-fn (phrases/highlighter dictionary) 334 | anns (highlighter-fn txt)] 335 | (is (= 1 (count anns))) 336 | (is (= "b" (:dict-entry-id (first anns))))) 337 | (let [txt "[URGENT!] Do this immediately!" 
338 | dictionary [{:text "[URGENT!]" :id "a" :tokenizer :whitespace} 339 | {:text "[URGENT!]" :id "b" :tokenizer :standard}] 340 | highlighter-fn (phrases/highlighter dictionary) 341 | anns (highlighter-fn txt)] 342 | (is (= 2 (count anns))) 343 | (is (= "[URGENT!]" (:text (first (filter #(= "a" (:dict-entry-id %)) anns))))) 344 | (is (= "URGENT" (:text (first (filter #(= "b" (:dict-entry-id %)) anns))))))) 345 | 346 | (deftest phrase-ordering-basic-case 347 | (is (= 1 (count ((phrases/highlighter [{:text "Token Mill" :slop 2 :in-order? false}]) 348 | "Mill Token")))) 349 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill" :slop 2 :in-order? true}]) 350 | "Mill Token"))))) 351 | 352 | (deftest highlighter-opts-for-slop-with-order 353 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill"}] 354 | {}) 355 | "Mill Token")))) 356 | (is (= 1 (count ((phrases/highlighter [{:text "Token Mill"}] 357 | {:slop 2}) 358 | "Mill Token")))) 359 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill"}] 360 | {:slop 2 :in-order? true}) 361 | "Mill Token"))))) 362 | 363 | (deftest ordered-phrase-with-on-term 364 | (is (= 1 (count ((phrases/highlighter [{:text "phrase" :slop 2 :in-order? true}]) 365 | "prefix phrase suffix"))))) 366 | 367 | (deftest ordered-phrase-with-two-equal-terms-in-front-and-end 368 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase" :slop 2 :in-order? true}]) 369 | "prefix phrase phrase suffix")] 370 | (is (= 1 (count anns))) 371 | (is (= "phrase phrase" (:text ann))) 372 | (is (= 7 (:begin-offset ann))) 373 | (is (= 20 (:end-offset ann)))) 374 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase and phrase" :slop 2 :in-order? 
true}]) 375 | "prefix phrase and phrase suffix")] 376 | (is (= 1 (count anns))) 377 | (is (= "phrase and phrase" (:text ann))) 378 | (is (= 7 (:begin-offset ann))) 379 | (is (= 24 (:end-offset ann))))) 380 | 381 | (deftest ordered-ambigous-phrase 382 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? true}]) 383 | "prefix phrase phrase end suffix")] 384 | (is (= 1 (count anns))) 385 | (is (= "phrase phrase end" (:text ann))) 386 | (is (= 7 (:begin-offset ann))) 387 | (is (= 24 (:end-offset ann)))) 388 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? true}]) 389 | "prefix phrase phrase end end suffix")] 390 | (is (= 1 (count anns))) 391 | (is (= "phrase phrase end" (:text ann))) 392 | (is (= 7 (:begin-offset ann))) 393 | (is (= 24 (:end-offset ann)))) 394 | (let [[ann1 & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 1 :in-order? true}]) 395 | "prefix phrase phrase a phrase end suffix")] 396 | (is (= 1 (count anns))) 397 | (is (= "phrase a phrase end" (:text ann1))) 398 | (is (= 14 (:begin-offset ann1))) 399 | (is (= 33 (:end-offset ann1)))) 400 | 401 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase end end" :slop 1 :in-order? true}]) 402 | "prefix phrase phrase end end suffix")] 403 | (is (= 1 (count anns))) 404 | (is (= "phrase phrase end end" (:text ann))) 405 | (is (= 7 (:begin-offset ann))) 406 | (is (= 28 (:end-offset ann)))) 407 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase end end" :slop 1 :in-order? true}]) 408 | "prefix phrase phrase end end X X phrase phrase end end suffix")] 409 | (is (= 2 (count anns))) 410 | (is (= "phrase phrase end end" (:text ann))) 411 | (is (= 7 (:begin-offset ann))) 412 | (is (= 28 (:end-offset ann))))) 413 | 414 | (deftest complicated-ordering 415 | (let [[ann1 ann2 & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? 
true}]) 416 | "prefix phrase phrase end phrase end suffix")] 417 | (is (= 2 (count anns))) 418 | (is (= "phrase phrase end" (:text ann1))) 419 | (is (= 7 (:begin-offset ann1))) 420 | (is (= 24 (:end-offset ann1))) 421 | ;; FIXME: this highlight is not correct 422 | (is (= "phrase end" (:text ann2))) 423 | (is (= 25 (:begin-offset ann2))) 424 | (is (= 35 (:end-offset ann2))))) 425 | 426 | (deftest preserve-order-edge-cases 427 | (testing "multiple match of a phrase" 428 | (is (= 3 (count ((phrases/highlighter 429 | [{:text "Token Mill" :slop 3 :in-order? false}]) 430 | "Prefix Token Mill Infix Token a Mill Suffix")))) 431 | (is (= 2 (count ((phrases/highlighter 432 | [{:text "Token Mill" :slop 1 :in-order? true}]) 433 | "Prefix Token Mill Infix Token a Mill Suffix")))) 434 | (is (= 1 (count ((phrases/highlighter 435 | [{:text "Token Mill" :slop 0 :in-order? true}]) 436 | "Prefix Token Mill Infix Token a Mill Suffix")))) 437 | (let [highlights ((phrases/highlighter 438 | [{:text "Token Mill" :slop 1 :in-order? 
true :meta {:test "test"}}]) 439 | "Prefix Token Mill Infix Token a Mill Suffix")] 440 | (is (= 2 (count highlights))) 441 | (let [first-highlight (apply min-key :begin-offset highlights)] 442 | (is (= "Token Mill" (:text first-highlight))) 443 | (is (= 7 (:begin-offset first-highlight))) 444 | (is (= 17 (:end-offset first-highlight))) 445 | (is (= {"test" "test"} (:meta first-highlight))) 446 | (is (= "PHRASE" (:type first-highlight)))) 447 | (let [second-highlight (apply max-key :begin-offset highlights)] 448 | (is (= "Token a Mill" (:text second-highlight))) 449 | (is (= 24 (:begin-offset second-highlight))) 450 | (is (= 36 (:end-offset second-highlight))) 451 | (is (= {"test" "test"} (:meta second-highlight))) 452 | (is (= "PHRASE" (:type second-highlight))))))) 453 | 454 | (deftest annotator-options 455 | (testing "case sensitivity flag" 456 | (let [txt "prefix PHRASE suffix" 457 | dictionary [{:text "phrase"}] 458 | highlighter-fn (phrases/highlighter dictionary) 459 | anns (highlighter-fn txt)] 460 | (is (empty? anns))) 461 | (let [txt "prefix PHRASE suffix" 462 | dictionary [{:text "phrase"}] 463 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false}) 464 | anns (highlighter-fn txt)] 465 | (is (= 1 (count anns))))) 466 | 467 | (testing "ascii folding flag" 468 | (let [txt "prefix PHRÄSE suffix" 469 | dictionary [{:text "phrase"}] 470 | highlighter-fn (phrases/highlighter dictionary) 471 | anns (highlighter-fn txt)] 472 | (is (empty? anns))) 473 | (let [txt "prefix PHRÄSE suffix" 474 | dictionary [{:text "phrase"}] 475 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false 476 | :ascii-fold? true}) 477 | anns (highlighter-fn txt)] 478 | (is (= 1 (count anns))))) 479 | 480 | (testing "stemming options" 481 | (let [txt "prefix PHRASES suffix" 482 | dictionary [{:text "phrase"}] 483 | highlighter-fn (phrases/highlighter dictionary) 484 | anns (highlighter-fn txt)] 485 | (is (empty? 
anns))) 486 | (let [txt "prefix PHRASES suffix" 487 | dictionary [{:text "phrase"}] 488 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false 489 | :stem? true 490 | :stemmer :english}) 491 | anns (highlighter-fn txt)] 492 | (is (= 1 (count anns)))))) 493 | 494 | (deftest phrases-with-edit-distance 495 | (let [txt "prefix tokne mill suffix" 496 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1}] 497 | highlighter-fn (phrases/highlighter dictionary {}) 498 | [ann1 :as anns] (highlighter-fn txt)] 499 | (is (= 1 (count anns))) 500 | (is (= "tokne mill" (:text ann1)))) 501 | (let [txt "prefix mill tokne suffix" 502 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1}] 503 | highlighter-fn (phrases/highlighter dictionary {}) 504 | anns (highlighter-fn txt)] 505 | (is (empty? anns))) 506 | (let [txt "prefix tokne mill suffix" 507 | dictionary [{:text "mill token" :fuzzy? true :fuzziness 1 :in-order? true}] 508 | highlighter-fn (phrases/highlighter dictionary {}) 509 | anns (highlighter-fn txt)] 510 | (is (empty? anns))) 511 | (let [txt "prefix mill tokne suffix" 512 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1 :in-order? false}] 513 | highlighter-fn (phrases/highlighter dictionary {}) 514 | [ann1 :as anns] (highlighter-fn txt)] 515 | (is (= 1 (count anns))) 516 | (is (= "mill tokne" (:text ann1)))) 517 | (let [txt "prefix tokne uab mill suffix" 518 | dictionary [{:text "mill token" :fuzzy? true :fuzziness 1 :in-order? false}] 519 | highlighter-fn (phrases/highlighter dictionary {}) 520 | anns (highlighter-fn txt)] 521 | (is (empty? 
anns)))) 522 | -------------------------------------------------------------------------------- /test/beagle/readers_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.readers-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [clojure.spec.alpha :as s] 4 | [beagle.schema :as sch] 5 | [beagle.readers :as readers]) 6 | (:import (java.io ByteArrayInputStream))) 7 | 8 | (deftest json-reader 9 | (is (not (nil? (s/conform ::sch/dictionary 10 | (readers/read-json 11 | (ByteArrayInputStream. 12 | (.getBytes "[{\"text\": \"moo\"}]"))))))) 13 | (is (not (nil? (s/conform ::sch/dictionary 14 | (readers/read-json "test/resources/dict.json")))))) 15 | 16 | (deftest csv-file-reader 17 | (is (not (nil? (s/conform ::sch/dictionary (readers/read-csv "test/resources/dict.csv")))))) 18 | 19 | (deftest edn-file-reader 20 | (is (not (nil? (s/conform ::sch/dictionary (readers/read-edn "test/resources/dict.edn")))))) 21 | -------------------------------------------------------------------------------- /test/beagle/text_analysis_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.text-analysis-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.text-analysis :as text-analysis])) 4 | 5 | (deftest field-name-construction 6 | (is (= "text.standard-tokenizer" 7 | (text-analysis/get-field-name {} {}))) 8 | (is (= "text.standard-tokenizer" 9 | (text-analysis/get-field-name {:case-sensitive? true} {}))) 10 | (is (= "text.standard-tokenizer.lowercased" 11 | (text-analysis/get-field-name {:case-sensitive? false} {}))) 12 | (is (= "text.standard-tokenizer.ascii-folded" 13 | (text-analysis/get-field-name {:ascii-fold? true} {}))) 14 | (is (= "text.standard-tokenizer.stemmed-english" 15 | (text-analysis/get-field-name {:stem? true} {}))) 16 | (is (= "text.standard-tokenizer.stemmed-lithuanian" 17 | (text-analysis/get-field-name {:stem? 
true :stemmer :lithuanian} {}))) 18 | (is (= "text.standard-tokenizer.ascii-folded-lowercased-stemmed-lithuanian" 19 | (text-analysis/get-field-name {:ascii-fold? true 20 | :case-sensitive? false 21 | :stem? true 22 | :stemmer :lithuanian} {})))) 23 | 24 | (deftest token-stream 25 | (let [txt "These are tests."] 26 | (is (= ["These" "are" "tests"] 27 | (text-analysis/text->token-strings 28 | txt (text-analysis/get-string-analyzer {:case-sensitive? true} {})))) 29 | (is (= ["these" "are" "tests"] 30 | (text-analysis/text->token-strings 31 | txt (text-analysis/get-string-analyzer {:case-sensitive? false} {})))) 32 | (is (= ["these" "are" "tests"] 33 | (text-analysis/text->token-strings 34 | txt (text-analysis/get-string-analyzer {:case-sensitive? false 35 | :ascii-fold? true} {})))) 36 | (is (= ["these" "are" "test"] 37 | (text-analysis/text->token-strings 38 | txt (text-analysis/get-string-analyzer {:case-sensitive? false 39 | :ascii-fold? true 40 | :stem? true} {})))) 41 | ; this one is surprising but correct 42 | (is (= ["these" "are" "tests."] 43 | (text-analysis/text->token-strings 44 | txt (text-analysis/get-string-analyzer {:case-sensitive? false 45 | :ascii-fold? true 46 | :stem? true} {:tokenizer :whitespace})))))) 47 | -------------------------------------------------------------------------------- /test/beagle/validator_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.validator-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.validator :as validator])) 4 | 5 | (deftest basic-cases 6 | (is (seq (validator/valid-dictionary? [{:text "test" :id "1" :meta {:test "test"} :type "CUSTOM"}]))) 7 | (is (nil? (validator/valid-dictionary? 
[{:id "1" :meta {:test "test"} :type "CUSTOM"}])))) 8 | -------------------------------------------------------------------------------- /test/resources/dict.csv: -------------------------------------------------------------------------------- 1 | text,id,synonyms,meta,case-sensitive?,ascii-fold?,type 2 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,true,TEST 3 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,true 4 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,false 5 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,FALSE 6 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,NOT_BOOL 7 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true 8 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2 9 | test-dictionary-item,id1,syn1;syn2,k;v 10 | test-dictionary-item,id1,syn1;syn2,k 11 | test-dictionary-item,id1,syn1;syn2, 12 | test-dictionary-item,id1,syn1;syn2 13 | test-dictionary-item,id1,syn1; 14 | test-dictionary-item,id1,syn1 15 | test-dictionary-item,id1, 16 | test-dictionary-item,id1 17 | test-dictionary-item 18 | test-dictionary-item,,,,,,TEST 19 | -------------------------------------------------------------------------------- /test/resources/dict.edn: -------------------------------------------------------------------------------- 1 | [{:text "test text" 2 | :id "test-id" 3 | :case-sensitive? true 4 | :ascii-fold? 
true 5 | :meta {:email "test@example.com"}}] 6 | -------------------------------------------------------------------------------- /test/resources/dict.json: -------------------------------------------------------------------------------- 1 | [{"text": "test text", 2 | "id": "test-id", 3 | "case-sensitive?": true, 4 | "ascii-fold?": true, 5 | "meta": {"email": "test@example.com"}}] -------------------------------------------------------------------------------- /test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | --------------------------------------------------------------------------------