├── .gitattributes
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── tokenmill-logo.svg
├── .gitignore
├── .gitlab-ci.yml
├── CHANGELOG
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── charts
│   ├── mt-avg-per-doc.png
│   ├── mt-min-max-per-doc.png
│   ├── mt-throughput-per-sec.png
│   ├── mt-total.png
│   ├── st-avg-per-doc.png
│   ├── st-min-max-per-doc.png
│   └── st-throughput-per-sec.png
├── classes
│   └── lt
│       └── tokenmill
│           └── beagle
│               └── phrases
│                   ├── Annotation.class
│                   ├── Annotator.class
│                   └── DictionaryEntry.class
├── deps.edn
├── pom.xml
├── src
│   └── beagle
│       ├── annotation_merger.clj
│       ├── dictionary_optimizer.clj
│       ├── java
│       │   ├── annotation.clj
│       │   └── java.clj
│       ├── lucene_alpha.clj
│       ├── monitor.clj
│       ├── phrases.clj
│       ├── readers.clj
│       ├── schema.clj
│       ├── text_analysis.clj
│       └── validator.clj
└── test
    ├── beagle
    │   ├── annotation_merge_test.clj
    │   ├── corner_case_phrases_test.clj
    │   ├── dictionary_optimization_test.clj
    │   ├── java_test.clj
    │   ├── lucene_alpha_test.clj
    │   ├── optimization_suggestions_test.clj
    │   ├── phrases_test.clj
    │   ├── readers_test.clj
    │   ├── text_analysis_test.clj
    │   └── validator_test.clj
    └── resources
        ├── dict.csv
        ├── dict.edn
        ├── dict.json
        ├── logback.xml
        └── phrases.html
/.gitattributes:
--------------------------------------------------------------------------------
1 | test/resources/phrases.html linguist-vendored=false
2 | test/resources/phrases.html linguist-detectable=false
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/tokenmill-logo.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | pom.xml.asc
2 | *.jar
3 | *.class
4 | /lib/
5 | /classes/
6 | /target/
7 | /checkouts/
8 | .lein-deps-sum
9 | .lein-repl-history
10 | .lein-plugins/
11 | .lein-failures
12 | .nrepl-port
13 | .cpcache/
14 | target/*
15 | .idea
16 | *.iml
17 | .env
18 | *.json
19 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | stages:
2 | - test
3 |
4 | variables:
5 | GIT_DEPTH: 3
6 |
7 | cache:
8 | key: one-key-to-rule-them-all
9 | paths:
10 | - ./.m2/repository
11 | - ./.gitlibs
12 |
13 | lint:
14 | stage: test
15 | image: borkdude/clj-kondo
16 | cache: {}
17 | when: always
18 | script:
19 | - clj-kondo --lint src test --config '{:output {:exclude-files ["java"]}}'
20 |
21 | unit-test:
22 | stage: test
23 | when: always
24 | image: clojure:tools-deps-alpine
25 | script:
26 | - export GITLIBS=".gitlibs/"
27 | - clojure -Sdeps '{:mvn/local-repo "./.m2/repository"}' -A:test
28 |
29 | validate-sample-dictionaries:
30 | stage: test
31 | when: always
32 | image: clojure:tools-deps-alpine
33 | script:
34 | - export GITLIBS=".gitlibs/"
35 | - >
36 | clojure -Sdeps '{:mvn/local-repo "./.m2/repository"}' -m beagle.validator
37 | test/resources/dict.csv csv
38 | test/resources/dict.json json
39 | test/resources/dict.edn edn
40 |
--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
1 | # Change Log
2 | All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/).
3 |
4 | ## 0.9.0-SNAPSHOT - 2019-10-13
5 | ### Added
6 | - Ensuring ordering for phrases with slop
7 |
8 | ## 0.4.0-SNAPSHOT - 2019-10-12
9 | ### Added
10 | - Tokenizer can be specified for every dictionary entry
11 | - Java Interface accepts tokenizer string
12 | - Highlighter options support for text analysis options
13 | ### Changed
14 | - Use MultiPhraseQuery instead of PhraseQuery internally
15 |
16 | ## 0.3.1 - 2019-10-03
17 | ### Fixed
18 | - Java interface for phrase highlighting
19 |
20 | ## 0.3.0 - 2019-09-24
21 | ### Added
22 | - Performance optimizations
23 | ### Changed
24 | - Refactored code towards batch document highlighting
25 |
26 | ## 0.2.0 - 2019-09-24
27 | ### Added
28 | - Alpha version for Lucene query support
29 |
30 | ## 0.1.7 - 2019-09-20
31 | ### Added
32 | - Deployment to Maven Central
33 |
34 | ## 0.1.6 - 2019-09-19
35 | ### Added
36 | - Added Java interface
37 | ### Fixed
38 | - Concurrent usage
39 |
40 | ## 0.1.5 - 2019-09-16
41 | ### Fixed
42 | - Handling of cases when text or phrases are tokenized to 0 tokens
43 |
44 | ## 0.1.4 - 2019-09-10
45 | ### Added
46 | - Phrase slop support
47 |
48 | ## 0.1.3 - 2019-09-04
49 | ### Added
50 | - Use one Lucene Monitor in total
51 |
52 | ## 0.1.2 - 2019-09-03
53 | ### Added
54 | - Support for stemming for multiple languages
55 |
56 | ## 0.1.1 - 2019-08-26
57 | ### Added
58 | - Initial release
59 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at info@tokenmill.lt. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | First off, thanks for taking the time to contribute!
2 |
3 | The following is a set of guidelines for contributing to Beagle, which is hosted at https://github.com/tokenmill/beagle. These are just guidelines, not rules; use your best judgment and feel free to propose changes to this document in a pull request.
4 |
5 | This project adheres to the Contributor Covenant code of conduct. By participating, you are expected to uphold this code. Please report unacceptable behavior to info@tokenmill.lt.
6 | ## Issues & Pull Requests
7 |
8 | Issues and Pull requests welcome!
9 |
10 | We do ask that before submitting a pull request you open an issue tracking the bug or enhancement you'd like to fix or submit. This makes it easier to discuss changes in the abstract, before focusing on a particular solution.
11 |
12 | Furthermore, please be diligent about submitting pull requests which only make one essential change at a time. While formatting changes and code cleanups are welcome, they should be separate from features and a pull request should only introduce one logical feature at a time. When adding new features, please ensure there are accompanying tests.
13 |
14 | ## Commit Messages
15 |
16 | Commit messages should be well formed, according to the guidelines outlined by Tim Pope: http://karma-runner.github.io/4.0/dev/git-commit-msg.html
17 |
18 | When fixing an existing issue, add `fixes #xxx` somewhere in the commit message: this has the dual purpose of closing the issue when your patch is merged to master and automatically providing a link to the related issue.
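
For instance, a commit message closing a hypothetical issue #42 might look like:

```
Fix handling of empty dictionary files

- fixes #42
```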
19 |
20 | ## Change Log
21 |
22 | Pull requests are required to update the changelog. Changelog entries should mention and link to any issues or tickets involved in the change, and should provide a short summary description of the particular changes of the patch.
23 |
24 | Include the issue number (#xxx) which will link back to the originating issue in Github. Commentary on the change should appear as a nested, unordered list.
25 |
26 | ## Whitespace & Linting
27 |
28 | Beagle is maintained with fairly strict whitespace and style standards.
29 |
30 | Gitlab CI jobs will fail if the clj-kondo rules are violated, or if the source format doesn't match the default cljfmt style guidelines. Hence, patches must be formatted and lint-clean before they will be accepted.
31 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM oracle/graalvm-ce:19.2.0.1 as builder
2 | RUN gu install native-image
3 |
4 | ENV GRAALVM_HOME=$JAVA_HOME
5 |
6 | RUN curl -O https://download.clojure.org/install/linux-install-1.10.1.469.sh
7 | RUN chmod +x linux-install-1.10.1.469.sh
8 | RUN ./linux-install-1.10.1.469.sh
9 |
10 | RUN mkdir -p /usr/src/app
11 | WORKDIR /usr/src/app
12 |
13 | COPY deps.edn /usr/src/app/
14 | RUN clojure -R:native-image
15 | COPY . /usr/src/app
16 |
17 | RUN clojure -A:native-image
18 |
19 | RUN chmod 755 dictionary-validator
20 |
21 | FROM alpine:3.9.4 as validator
22 |
23 | WORKDIR /opt
24 | COPY --from=builder /usr/src/app/dictionary-validator /usr/local/bin/dictionary-validator
25 |
26 | CMD ["dictionary-validator"]
27 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2019 Tokenmill, UAB
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | lint-code:
2 | clojure -M:clj-kondo --config '{:output {:exclude-files ["java"]}}'
3 |
4 | unit-test:
5 | clojure -M:runner:test -e :noisy
6 |
7 | build-dictionary-validator:
8 | docker build --target builder -f Dockerfile -t tokenmill/beagle-dictionary-validator .
9 | docker rm build || true
10 | docker create --name build tokenmill/beagle-dictionary-validator
11 | docker cp build:/usr/src/app/dictionary-validator dictionary-validator
12 |
13 | build-graal-validator-docker:
14 | docker build --target validator -f Dockerfile -t tokenmill/beagle-dictionary-validator .
15 |
16 | recompile-java-interface:
17 | rm -rf classes
18 | mkdir classes
19 | clojure -e "(require 'beagle.java.annotation) (compile 'beagle.java.annotation) (compile 'beagle.java.java)"
20 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
5 | # Beagle
6 |
7 | [License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
8 | [CI pipeline](https://gitlab.com/tokenmill/oss/beagle/pipelines/master/latest)
9 | [Maven Central](https://search.maven.org/search?q=g:%22lt.tokenmill%22%20AND%20a:%22beagle%22)
10 |
11 | Beagle is a detector of interesting things in text. Its intended use is in-stream search applications. Suppose you need to monitor a stream of text documents such as web crawl results, chat messages, or corporate documents in order to identify keywords, phrases, regexes, and [complex search queries](https://lucene.apache.org/core/2_9_4/queryparsersyntax.html) of interest. With Beagle you can quickly be up and running with such a system, allowing you to focus on productively monitoring your documents.
12 |
13 | Beagle is based on the [Lucene monitor](https://github.com/apache/lucene-solr/tree/master/lucene/monitor) library which is based on [Luwak](https://github.com/flaxsearch/luwak).
14 |
15 | ## Components
16 |
17 | - [Phrase highlighter with support for](#phrase-annotator-usage):
18 | - case sensitivity,
19 | - ascii folding,
20 | - stemming support for various languages,
21 | - phrase slop,
22 | - synonymous phrases,
23 | - metadata,
24 | - tokenizer,
25 | - ensuring order of terms in a phrase with slop,
26 | - any combination of previously mentioned features.
27 | - [Java interface to the phrase highlighter](#java-interface-to-the-phrase-highlighter)
28 | - (alpha!) [Lucene query string support](#lucene-query-support) (interface is subject to change)
29 | - [Dictionary file readers (csv, json, edn)](#dictionary-readers)
30 | - [Dictionary validator](#dictionary-validator)
31 | - [Dictionary optimizer](#dictionary-optimizer)
32 | - [Annotation merger](#annotation-merger)
33 |
34 | ## Phrase Annotator Usage
35 |
36 | ```clojure
37 | (require '[beagle.phrases :as phrases])
38 |
39 | (let [dictionary [{:text "to be annotated" :id "1"}]
40 | highlighter-fn (phrases/highlighter dictionary)]
41 | (highlighter-fn "before annotated to be annotated after annotated"))
42 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32})
43 |
44 | ;; Case sensitivity is controlled per dictionary entry
45 | (let [dictionary [{:text "TO BE ANNOTATED" :id "1" :case-sensitive? false}]
46 | highlighter-fn (phrases/highlighter dictionary)]
47 | (highlighter-fn "before annotated to be annotated after annotated"))
48 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32})
49 |
50 | ;; ASCII folding is controlled per dictionary entry
51 | (let [dictionary [{:text "TÖ BE ÄNNÖTÄTED" :id "1" :case-sensitive? false :ascii-fold? true}]
52 | highlighter-fn (phrases/highlighter dictionary)]
53 | (highlighter-fn "before annotated to be annotated after annotated"))
54 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32})
55 |
56 | ;; Stemming is supported for multiple languages per dictionary entry
57 | (let [dictionary [{:text "Kaunas" :id "1" :stem? true :stemmer :lithuanian}]
58 | highlighter-fn (phrases/highlighter dictionary)]
59 | (highlighter-fn "Kauno miestas"))
60 | => ({:text "Kauno", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 5})
61 |
62 | ;; Phrases also support slop (i.e. term edit distance) per dictionary entry
63 | (let [txt "before start and end after"
64 | dictionary [{:text "start end" :id "1" :slop 1}]
65 | highlighter-fn (phrases/highlighter dictionary)]
66 | (highlighter-fn txt))
67 | => ({:text "start and end", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 7, :end-offset 20})
68 |
69 | ;; Every phrase can specify which tokenizer to use
70 | (let [txt "[URGENT!] Do this immediately!"
71 | dictionary [{:text "[URGENT!]" :id "a" :tokenizer :whitespace}
72 | {:text "[URGENT!]" :id "b" :tokenizer :standard}]
73 | highlighter-fn (phrases/highlighter dictionary)]
74 | (clojure.pprint/pprint (highlighter-fn txt)))
75 | =>
76 | ({:text "[URGENT!]",
77 | :type "PHRASE",
78 | :dict-entry-id "a",
79 | :meta {},
80 | :begin-offset 0,
81 | :end-offset 9}
82 | {:text "URGENT",
83 | :type "PHRASE",
84 | :dict-entry-id "b",
85 | :meta {},
86 | :begin-offset 1,
87 | :end-offset 7})
88 |
89 | ;; Ensure that phrase terms are matched in the provided order
90 | ;; e.g. NOT preserving order (default)
91 | (let [txt "Mill Token"
92 | dictionary [{:text "Token Mill" :slop 2 :in-order? false}]
93 | highlighter-fn (phrases/highlighter dictionary)]
94 | (highlighter-fn txt))
95 | => [{:text "Mill Token" :type "PHRASE" :dict-entry-id "0" :meta {} :begin-offset 0 :end-offset 10}]
96 | ;; e.g. Preserving order
97 | (let [txt "Mill Token"
98 | dictionary [{:text "Token Mill" :slop 2 :in-order? true}]
99 | highlighter-fn (phrases/highlighter dictionary)]
100 | (highlighter-fn txt))
101 | => ()
102 | ```
103 |
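Synonymous phrases and metadata (listed under [Components](#components) above) are also configured per dictionary entry. Below is a minimal, hypothetical sketch; the `:synonyms` and `:meta` keys are the same ones used in the dictionary optimizer examples further down.

```clojure
;; Hypothetical entry: "TokenMill" is matched as a synonym of "Token Mill",
;; and the :meta map is carried along to the resulting annotations.
(let [dictionary [{:text "Token Mill" :id "1"
                   :synonyms ["TokenMill"]
                   :meta {:source "crm"}}]
      highlighter-fn (phrases/highlighter dictionary)]
  (highlighter-fn "TokenMill builds NLP tools"))
```
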
104 | ## Java Interface to the Phrase Highlighter
105 |
106 | Example:
107 | ```java
108 | import lt.tokenmill.beagle.phrases.Annotation;
109 | import lt.tokenmill.beagle.phrases.Annotator;
110 | import lt.tokenmill.beagle.phrases.DictionaryEntry;
111 |
112 | import java.util.Arrays;
113 | import java.util.Collection;
114 | import java.util.HashMap;
115 |
116 | public class Main {
117 | public static void main(String[] args) {
118 | DictionaryEntry dictionaryEntry = new DictionaryEntry("test phrase");
119 | Annotator annotator = new Annotator(Arrays.asList(dictionaryEntry));
120 | Collection annotations = annotator.annotate("This is my test phrase");
121 | annotations.forEach(s -> System.out.println("Annotated: \'" + s.text() + "\' at offset: " + s.beginOffset() + ":" + s.endOffset()));
122 | }
123 | }
124 |
125 | // => Annotated: 'test phrase' at offset: 11:22
126 | ```
127 |
128 | The available options for the Java API are explained with examples in the [Java Interface for Phrase Highlighting wiki page](https://github.com/tokenmill/beagle/wiki/Java-Interface-for-Phrase-Highlighting).
129 |
130 | All the options that are present in the Clojure interface are also available in Java; just convert Clojure keywords to Java strings, e.g.
131 | ```
132 | :case-sensitive? => "case-sensitive?"
133 | ```
134 |
135 | ### Project Setup with Maven
136 |
137 | The library is deployed in the Maven Central Repository and you can just add the beagle dependency to your `pom.xml`:
138 |
139 | ```xml
140 | <dependency>
141 |   <groupId>lt.tokenmill</groupId>
142 |   <artifactId>beagle</artifactId>
143 |   <version>0.3.1</version>
144 | </dependency>
145 | ```
146 |
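For Clojure projects using tools.deps, the same artifact can be declared in `deps.edn` (a sketch assuming the Maven coordinates above):

```clojure
{:deps {lt.tokenmill/beagle {:mvn/version "0.3.1"}}}
```
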
147 | ## Lucene Query Support
148 |
149 | Examples:
150 |
151 | ```clojure
152 | (require '[beagle.lucene-alpha :as lucene])
153 |
154 | (let [txt "some text this other that"
155 | dictionary [{:text "this AND that" :id "1" :slop 1}]
156 | annotator-fn (lucene/annotator dictionary)]
157 | (annotator-fn txt {}))
158 | => ({:text "this AND that", :type "QUERY", :dict-entry-id "1", :meta {}})
159 | ```
160 |
161 | ## Performance
162 |
163 | The performance was measured on a desktop PC with Ubuntu 19.04 and an 8-core Ryzen 1700.
164 |
165 | The test setup used news articles and a dictionary made up of the names of US cities.
166 |
167 | Code and data for benchmarking and more benchmarks can be found [here](https://github.com/tokenmill/beagle-performance-benchmarks).
168 |
169 | ### Single-thread
170 |
171 | Average time spent per document ranged from 1.58 ms for a dictionary of 5k phrases to 4.58 ms for a dictionary of 80k phrases.
172 |
173 | 
174 |
175 | Throughput ranged from 626 docs/sec for a dictionary of 5k phrases to 210 docs/sec for a dictionary of 80k phrases.
176 |
177 | 
178 |
179 | Max time spent per document has a couple of spikes where processing a document takes ~1000 ms. These spikes were most likely
180 | caused either by GC pauses or by JVM deoptimizations. Aside from those spikes, max time grows steadily
181 | from 15 ms to 72 ms as the dictionary size grows.
182 |
183 | Min time spent per document is fairly stable for any dictionary size, at about 0.45 ms. Most likely these are the
184 | cases when the [Presearcher](https://lucene.apache.org/core/8_2_0/monitor/index.html) hasn't found any candidate queries to run against the document.
185 |
186 | 
187 |
188 | ### Multi-threaded
189 |
190 | Using a `core.async` pipeline, time spent per document ranged from 3.38 ms for a dictionary of 5k phrases to 15.34 ms for a dictionary of 80k phrases.
191 |
192 | 
193 |
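The benchmark code itself lives in the repository linked above. As a minimal sketch (with a hypothetical one-entry dictionary; not the benchmark code), fanning documents out over CPU cores with a `core.async` pipeline might look like:

```clojure
(require '[clojure.core.async :as async]
         '[beagle.phrases :as phrases])

;; Hypothetical setup: annotate documents in parallel,
;; one pipeline stage per available processor.
(let [highlighter-fn (phrases/highlighter [{:text "Chicago" :id "1"}])
      docs ["Chicago news" "nothing to see here" "back to Chicago"]
      in (async/to-chan docs)
      out (async/chan)]
  (async/pipeline (.availableProcessors (Runtime/getRuntime))
                  out
                  (map highlighter-fn)
                  in)
  ;; Take one result per input document; pipeline preserves input order.
  (doall (repeatedly (count docs) #(async/<!! out))))
```
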
194 | Total time spent to process all 10k docs ranged from 2412 ms for a dictionary of 5k phrases to 12595 ms for a dictionary of 80k phrases.
195 |
196 | 
197 |
198 | Throughput ranged from 4143 docs/sec for a dictionary of 5k phrases to 793 docs/sec for a dictionary of 80k phrases.
199 |
200 | 
201 |
202 | Max time spent per document rose fairly steadily, from 24.15 ms for a dictionary of 10k phrases to 113.45 ms for a dictionary of 60k phrases.
203 |
204 | Min time spent per document varied from 0.6 ms for a dictionary of 10k phrases to 1.1 ms for a dictionary of 55k phrases.
205 |
206 | 
207 |
208 | ### Conclusions about Performance
209 |
210 | On average, processing a single document is roughly 3x faster in single-threaded mode than in multi-threaded mode, but even
211 | in multi-threaded mode one document rarely takes more than 10 ms.
212 |
213 | In multi-threaded mode throughput grows almost linearly with the number of CPU cores: 4143/8 = 518 docs per core per second in multi-threaded mode,
214 | versus 626 docs per core per second in single-threaded mode.
215 |
216 | ## Dictionary Readers
217 |
218 | Three file formats are supported: csv, edn, json.
219 |
220 | ### CSV Dictionary Format
221 |
222 | Separator: ","
223 | Escape: "\""
224 |
225 | The first line *MUST* be a header.
226 |
227 | Supported header keys: `["text" "type" "id" "synonyms" "case-sensitive?" "ascii-fold?" "meta"]`
228 |
229 | Order is not important.
230 |
231 | Under `synonyms`, there should be a list of strings separated by ";".
232 | Under `meta`, there should be a list of strings separated by ";". An even number of strings is expected (they are read as key-value pairs); if the count is odd, the last string is ignored.
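
For illustration, a hypothetical dictionary file in this format might look like:

```csv
text,type,id,synonyms,case-sensitive?,ascii-fold?,meta
Token Mill,ORG,1,TokenMill;Token-Mill,false,true,source;crm
New York,CITY,2,NYC,false,false,
```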
233 |
234 | ## Dictionary Validator
235 |
236 | The validator accepts any number of dictionaries to validate, as long as they are provided in pairs: `"/path/to/dictionary/file" "file-type"`.
237 |
238 | ### Supported File Types
239 |
240 | - csv
241 | - json
242 | - edn
243 |
244 | ### Output
245 |
246 | - If any dictionary is invalid, an exception is thrown and the process exits with status 1
247 |
248 | ### Usage
249 |
250 | #### Clojure
251 |
252 | To use the validator directly, execute: `clj -m beagle.validator "/path/to/dictionary/file" "file-type" "/path/to/dictionary/file2" "file-type" ...`
253 |
254 | ##### Example:
255 |
256 | ```
257 | clj -m beagle.validator "your-dict.csv" "csv" "your-other-dict.json" "json"
258 | ```
259 |
260 | #### Docker
261 |
262 | Example in Gitlab CI:
263 |
264 | ```
265 | validate-dictionaries:
266 | stage: dictionary-validation
267 | when: always
268 | image: tokenmill/beagle-dictionary-validator
269 | script:
270 | - >
271 | dictionary-validator
272 | /path/to/dict.csv csv
273 | /path/to/dict.json json
274 | /path/to/dict.edn edn
275 | ```
276 |
277 | ## Dictionary Optimizer
278 |
279 | Supported optimizations:
280 | - Remove duplicate dictionary entries
281 | - Merge synonyms
282 | - Synonyms and text equality check
283 |
284 | There are cases when dictionary entries can't be merged:
285 | - Differences in text analysis
286 |
287 | Examples:
288 | ```clojure
289 | (require '[beagle.dictionary-optimizer :as optimizer])
290 |
291 | ; Remove duplicates
292 | (let [dictionary [{:text "TO BE ANNOTATED" :id "1"}
293 | {:text "TO BE ANNOTATED"}]]
294 | (optimizer/optimize dictionary))
295 | => ({:text "TO BE ANNOTATED", :id "1"})
296 |
297 | ; Merge synonyms
298 | (let [dictionary [{:text "TO BE ANNOTATED" :synonyms ["ONE"]}
299 | {:text "TO BE ANNOTATED" :synonyms ["TWO"]}]]
300 | (optimizer/optimize dictionary))
301 | => ({:text "TO BE ANNOTATED", :synonyms ("TWO" "ONE")})
302 |
303 | ; Synonyms and text equality check
304 | (let [dictionary [{:text "TO BE ANNOTATED" :synonyms ["TO BE ANNOTATED"]}]]
305 | (optimizer/optimize dictionary))
306 | => ({:text "TO BE ANNOTATED", :synonyms ["TO BE ANNOTATED"]})
307 |
308 | ; Can't be merged because of differences in text analysis
309 | (let [dictionary [{:text "TO BE ANNOTATED" :case-sensitive? true}
310 | {:text "TO BE ANNOTATED" :case-sensitive? false}]]
311 | (optimizer/optimize dictionary))
312 | => ({:text "TO BE ANNOTATED", :case-sensitive? true} {:text "TO BE ANNOTATED", :case-sensitive? false})
313 | ```
314 |
315 | ## Annotation Merger
316 |
317 | Only annotations of the same type are merged.
318 |
319 | Handled cases:
320 | - Duplicate annotations
321 | - Nested annotations
322 |
323 | Examples:
324 | ```clojure
325 | (require '[beagle.annotation-merger :as merger])
326 |
327 | (let [dictionary [{:text "TEST"}
328 | {:text "This TEST is"}]
329 | highlighter-fn (phrases/highlighter dictionary)
330 | annotations (highlighter-fn "This TEST is")]
331 | (println "Annotations: " annotations)
332 | (merger/merge-same-type-annotations annotations))
333 | Annotations: ({:text TEST, :type PHRASE, :dict-entry-id 0, :meta {}, :begin-offset 5, :end-offset 9} {:text This TEST is, :type PHRASE, :dict-entry-id 1, :meta {}, :begin-offset 0, :end-offset 12})
334 | => ({:text "This TEST is", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 12})
335 |
336 | ;; Merging can also be requested inline via the highlighter options
337 | (let [dictionary [{:text "TEST"}
338 | {:text "This TEST is"}]
339 | highlighter-fn (phrases/highlighter dictionary)]
340 | (highlighter-fn "This TEST is" {:merge-annotations? true}))
341 | => ({:text "This TEST is", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 12})
342 | ```
343 |
344 | ## License
345 |
346 | Copyright © 2019 [TokenMill UAB](http://www.tokenmill.lt).
347 |
348 | Distributed under the Apache License, Version 2.0.
349 |
--------------------------------------------------------------------------------
/charts/mt-avg-per-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-avg-per-doc.png
--------------------------------------------------------------------------------
/charts/mt-min-max-per-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-min-max-per-doc.png
--------------------------------------------------------------------------------
/charts/mt-throughput-per-sec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-throughput-per-sec.png
--------------------------------------------------------------------------------
/charts/mt-total.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-total.png
--------------------------------------------------------------------------------
/charts/st-avg-per-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-avg-per-doc.png
--------------------------------------------------------------------------------
/charts/st-min-max-per-doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-min-max-per-doc.png
--------------------------------------------------------------------------------
/charts/st-throughput-per-sec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-throughput-per-sec.png
--------------------------------------------------------------------------------
/classes/lt/tokenmill/beagle/phrases/Annotation.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/Annotation.class
--------------------------------------------------------------------------------
/classes/lt/tokenmill/beagle/phrases/Annotator.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/Annotator.class
--------------------------------------------------------------------------------
/classes/lt/tokenmill/beagle/phrases/DictionaryEntry.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/DictionaryEntry.class
--------------------------------------------------------------------------------
/deps.edn:
--------------------------------------------------------------------------------
1 | {:deps {org.clojure/clojure {:mvn/version "1.10.3"}
2 | org.clojure/data.csv {:mvn/version "1.0.0"}
3 | org.clojure/tools.logging {:mvn/version "1.1.0"}
4 | org.apache.lucene/lucene-core {:mvn/version "8.9.0"}
5 | org.apache.lucene/lucene-monitor {:mvn/version "8.9.0"}
6 | metosin/jsonista {:mvn/version "0.3.3"}}
7 | :paths ["src" "classes"]
8 | :mvn/repos {"central" {:url "https://repo1.maven.org/maven2/"}
9 | "clojars" {:url "https://repo.clojars.org/"}}
10 | :aliases {:dev
11 | {:extra-deps {org.jsoup/jsoup {:mvn/version "1.13.1"}
12 | org.clojure/test.check {:mvn/version "1.0.0"}
13 | criterium/criterium {:mvn/version "0.4.6"}
14 | ch.qos.logback/logback-classic {:mvn/version "1.2.3"}}
15 | :extra-paths ["test/resources"]}
16 | :clj-kondo
17 | {:main-opts ["-m" "clj-kondo.main --lint src test"]
18 | :extra-deps {clj-kondo/clj-kondo {:mvn/version "2019.07.31-alpha"}}
19 | :jvm-opts ["-Dclojure.main.report=stderr"]}
20 | :test
21 | {:extra-paths ["test"]
22 | :extra-deps {com.cognitect/test-runner {:git/url "https://github.com/cognitect-labs/test-runner.git"
23 | :sha "62ef1de18e076903374306060ac0e8a752e57c86"}
24 | org.jsoup/jsoup {:mvn/version "1.13.1"}
25 | org.clojure/test.check {:mvn/version "1.0.0"}}}
26 | :runner
27 | {:extra-paths ["test"]
28 | :main-opts ["-m" "cognitect.test-runner"]}
29 | :native-image
30 | {:override-deps {org.clojure/clojure {:mvn/version "1.9.0"}}
31 | :main-opts ["-m clj.native-image beagle.validator"
32 | "--initialize-at-build-time"
33 | "--report-unsupported-elements-at-runtime"
34 | "-H:Name=dictionary-validator"]
35 | :jvm-opts ["-Dclojure.compiler.direct-linking=true"]
36 | :extra-deps {clj.native-image/clj.native-image
37 | {:git/url "https://github.com/taylorwood/clj.native-image.git"
38 | :sha "7708e7fd4572459c81f6a6b8e44c96f41cdd92d4"}}}}}
39 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>lt.tokenmill</groupId>
  <artifactId>beagle</artifactId>
  <version>0.9.0-SNAPSHOT</version>
  <name>beagle</name>
  <description>Stream search library</description>
  <url>https://github.com/tokenmill/beagle</url>

  <licenses>
    <license>
      <name>The Apache License, Version 2.0</name>
      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
    </license>
  </licenses>

  <developers>
    <developer>
      <name>Dainius Jocas</name>
      <email>dainius.jocas@tokenmill.lt</email>
      <organization>TokenMill</organization>
      <organizationUrl>http://www.tokenmill.lt</organizationUrl>
    </developer>
    <developer>
      <name>Žygimantas Medelis</name>
      <email>zygimantas.medelis@gmail.com</email>
      <organization>TokenMill</organization>
      <organizationUrl>http://www.tokenmill.lt</organizationUrl>
    </developer>
  </developers>

  <dependencies>
    <dependency>
      <groupId>org.clojure</groupId>
      <artifactId>clojure</artifactId>
      <version>1.10.1</version>
    </dependency>
    <dependency>
      <groupId>org.clojure</groupId>
      <artifactId>data.csv</artifactId>
      <version>0.1.4</version>
    </dependency>
    <dependency>
      <groupId>org.clojure</groupId>
      <artifactId>tools.logging</artifactId>
      <version>0.5.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>8.2.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-monitor</artifactId>
      <version>8.2.0</version>
    </dependency>
    <dependency>
      <groupId>metosin</groupId>
      <artifactId>jsonista</artifactId>
      <version>0.2.4</version>
    </dependency>
  </dependencies>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <profiles>
    <profile>
      <id>release-sign-artifacts</id>
      <activation>
        <property>
          <name>performRelease</name>
          <value>true</value>
        </property>
      </activation>
      <build>
        <plugins>
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-javadoc-plugin</artifactId>
            <version>3.1.1</version>
            <executions>
              <execution>
                <id>attach-javadoc</id>
                <goals>
                  <goal>jar</goal>
                </goals>
              </execution>
            </executions>
          </plugin>
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-gpg-plugin</artifactId>
            <version>1.6</version>
            <executions>
              <execution>
                <id>sign-artifacts</id>
                <phase>verify</phase>
                <goals>
                  <goal>sign</goal>
                </goals>
              </execution>
            </executions>
          </plugin>
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-source-plugin</artifactId>
            <version>3.1.0</version>
          </plugin>
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-deploy-plugin</artifactId>
            <version>3.0.0-M1</version>
          </plugin>
        </plugins>
      </build>
    </profile>
  </profiles>

  <build>
    <sourceDirectory>src</sourceDirectory>
    <resources>
      <resource>
        <directory>src</directory>
      </resource>
      <resource>
        <directory>classes</directory>
      </resource>
    </resources>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>3.1.2</version>
        <executions>
          <execution>
            <id>empty-javadoc-jar</id>
            <phase>package</phase>
            <goals>
              <goal>jar</goal>
            </goals>
            <configuration>
              <classifier>javadoc</classifier>
              <classesDirectory>${basedir}/javadoc</classesDirectory>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <scm>
    <url>https://github.com/tokenmill/beagle</url>
    <connection>scm:git:git://github.com/tokenmill/beagle.git</connection>
    <developerConnection>scm:git:ssh://git@github.com/tokenmill/beagle.git</developerConnection>
    <tag>HEAD</tag>
  </scm>

  <repositories>
    <repository>
      <id>clojars</id>
      <url>https://repo.clojars.org/</url>
    </repository>
    <repository>
      <id>sonatype</id>
      <url>https://oss.sonatype.org/content/repositories/snapshots/</url>
    </repository>
  </repositories>

  <distributionManagement>
    <snapshotRepository>
      <id>ossrh</id>
      <url>https://oss.sonatype.org/content/repositories/snapshots</url>
    </snapshotRepository>
    <repository>
      <id>ossrh</id>
      <url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
    </repository>
  </distributionManagement>
</project>
--------------------------------------------------------------------------------
/src/beagle/annotation_merger.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.annotation-merger)
2 |
3 | (defn related-annotations? [anno1 anno2]
4 | (<= (:begin-offset anno1) (:begin-offset anno2) (:end-offset anno1)))
5 |
6 | (defn parent-child-annotations? [parent-anno child-anno]
7 | (and (>= (:begin-offset child-anno) (:begin-offset parent-anno))
8 | (<= (:end-offset child-anno) (:end-offset parent-anno))))
9 |
10 | (defn merge-annotations [annotations]
11 | (let [sorted-annotation (sort-by :begin-offset annotations)]
12 | (loop [parent-annotation (first sorted-annotation)
13 | [child-annotation & remaining] (rest sorted-annotation)
14 | result []]
15 | (if child-annotation
16 | (if (related-annotations? parent-annotation child-annotation)
17 | (recur (if (and (parent-child-annotations? parent-annotation child-annotation)
18 | (not (parent-child-annotations? child-annotation parent-annotation)))
19 | parent-annotation
20 | child-annotation)
21 | remaining
22 | result)
23 | (recur child-annotation remaining (conj result parent-annotation)))
24 | (conj result parent-annotation)))))
25 |
26 | (defn merge-same-type-annotations [annotations]
27 | (mapcat (fn [[_ anns]] (merge-annotations anns)) (group-by :type annotations)))
28 |
--------------------------------------------------------------------------------
/src/beagle/dictionary_optimizer.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.dictionary-optimizer
2 | (:require [clojure.set :as set]
3 | [clojure.string :as str]))
4 |
5 | (defn merge-synonyms [group-of-entries]
6 | (reduce (fn [synonyms-set {synonyms :synonyms}]
7 | (into synonyms-set synonyms))
8 | #{} group-of-entries))
9 |
10 | (defn merge-meta [group-of-entries]
11 | (reduce (fn [acc {meta :meta}] (merge acc meta)) {} group-of-entries))
12 |
13 | (defn merge-entries [entries]
14 | (let [{:keys [text case-sensitive? ascii-fold? id]} (first entries)
15 | synonyms (remove #(= text %) (merge-synonyms entries))
16 | meta (merge-meta entries)]
17 | (cond-> {:text text}
18 | (not-empty synonyms) (assoc :synonyms synonyms)
19 | (not-empty meta) (assoc :meta meta)
20 | id (assoc :id id)
21 | (not (nil? case-sensitive?)) (assoc :case-sensitive? case-sensitive?)
22 | (not (nil? ascii-fold?)) (assoc :ascii-fold? ascii-fold?))))
23 |
24 | (defn mergeable-meta? [{meta-a :meta} {meta-b :meta}]
25 | (every? #(= (get meta-a %) (get meta-b %)) (set/intersection (set (keys meta-a)) (set (keys meta-b)))))
26 |
27 | (defn aggregate-entries-by-meta [entries]
28 | (loop [entry-a (first entries)
29 | [entry-b & remaining] (rest entries)
30 | acc []
31 | exceptions []]
32 | (if entry-b
33 | (if (mergeable-meta? entry-a entry-b)
34 | (recur (merge-entries [entry-a entry-b]) remaining acc exceptions)
35 | (recur entry-a remaining acc (conj exceptions entry-b)))
36 | (if (seq exceptions)
37 | (recur (first exceptions) (rest exceptions) (conj acc entry-a) [])
38 | (conj acc entry-a)))))
39 |
40 | (defn group-dictionary-entries [dictionary]
41 | (group-by (fn [entry] [(:text entry) (:case-sensitive? entry) (:ascii-fold? entry)]) dictionary))
42 |
43 | (defn optimize [dictionary]
44 | (mapcat (fn [[_ grouped-entries]] (aggregate-entries-by-meta grouped-entries))
45 | (group-dictionary-entries dictionary)))
46 |
47 | (defn optimization-suggestion [entries]
48 | {:suggestion (-> (format "Dictionary items '%s' have identical `[text case-sensitivity ascii-folding]` features."
49 | (reduce #(conj %1 (or (:id %2) (:text %2))) [] entries))
50 | (str/replace #"\"" ""))
51 | :dictionary-items entries})
52 |
53 | (defn dry-run [dictionary]
54 | (reduce (fn [acc [_ grouped-entries]]
55 | (if (< 1 (count grouped-entries))
56 | (conj acc (optimization-suggestion grouped-entries))
57 | acc))
58 | [] (group-dictionary-entries dictionary)))
59 |
--------------------------------------------------------------------------------
/src/beagle/java/annotation.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.java.annotation)
2 |
3 | (gen-class
4 | :name lt.tokenmill.beagle.phrases.Annotation
6 | :state "state"
7 | :init "init"
8 | :constructors {[String String Long Long String java.util.Map] []}
9 | :methods [[text [] String]
10 | [type [] String]
11 | [beginOffset [] Long]
12 | [endOffset [] Long]
13 | [dictionaryEntryId [] String]
14 | [meta [] java.util.Map]]
15 | :prefix Annotation-)
16 |
17 | (defn Annotation-init [text type begin end dictionaryEntryId meta]
18 | [[] (atom {:text text
19 | :type type
20 | :begin begin
21 | :end end
22 | :dict-entry-id dictionaryEntryId
23 | :meta meta})])
24 |
25 | (defn Annotation-text [this]
26 | (@(.state this) :text))
27 | (defn Annotation-type [this]
28 | (@(.state this) :type))
29 | (defn Annotation-beginOffset [this]
30 | (@(.state this) :begin))
31 | (defn Annotation-endOffset [this]
32 | (@(.state this) :end))
33 | (defn Annotation-dictionaryEntryId [this]
34 | (@(.state this) :dict-entry-id))
35 | (defn Annotation-meta [this]
36 | (@(.state this) :meta))
37 |
--------------------------------------------------------------------------------
/src/beagle/java/java.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.java.java
2 | (:gen-class)
3 | (:require [beagle.phrases :as phrases]))
4 |
5 | (gen-class
6 | :name lt.tokenmill.beagle.phrases.DictionaryEntry
7 | :state "state"
8 | :init "init"
9 | :constructors {[String] []}
10 | :methods [[text [] String]
11 | [type [] String]
12 | [setType [String] void]
13 | [id [] String]
14 | [setId [String] void]
15 | [synonyms [] java.util.Collection]
16 | [setSynonyms [java.util.Collection] void]
17 | [caseSensitive [] Boolean]
18 | [setCaseSensitive [Boolean] void]
19 | [asciiFold [] Boolean]
20 | [setAsciiFold [Boolean] void]
21 | [stem [] Boolean]
22 | [setStem [Boolean] void]
23 | [stemmer [] String]
24 | [setStemmer [String] void]
25 | [slop [] Integer]
26 | [setSlop [Integer] void]
27 | [tokenizer [] String]
28 | [setTokenizer [String] void]
29 | [meta [] java.util.Map]
30 | [setMeta [java.util.Map] void]]
31 | :prefix DictionaryEntry-)
32 |
33 | (defn DictionaryEntry-init [phrase]
34 | [[] (atom {:text phrase})])
35 |
36 | (defn DictionaryEntry-text [this]
37 | (@(.state this) :text))
38 | (defn DictionaryEntry-type [this]
39 | (@(.state this) :type))
40 | (defn DictionaryEntry-setType [this type]
41 | (swap! (.state this) assoc :type type))
42 | (defn DictionaryEntry-id [this]
43 | (@(.state this) :id))
44 | (defn DictionaryEntry-setId [this id]
45 | (swap! (.state this) assoc :id id))
46 | (defn DictionaryEntry-synonyms [this]
47 | (@(.state this) :synonyms))
48 | (defn DictionaryEntry-setSynonyms [this synonyms]
49 | (swap! (.state this) assoc :synonyms synonyms))
50 | (defn DictionaryEntry-caseSensitive [this]
51 | (@(.state this) :case-sensitive?))
52 | (defn DictionaryEntry-setCaseSensitive [this case-sensitive]
53 | (swap! (.state this) assoc :case-sensitive? case-sensitive))
54 | (defn DictionaryEntry-asciiFold [this]
55 | (@(.state this) :ascii-fold?))
56 | (defn DictionaryEntry-setAsciiFold [this ascii-fold]
57 | (swap! (.state this) assoc :ascii-fold? ascii-fold))
58 | (defn DictionaryEntry-stem [this]
59 | (@(.state this) :stem?))
60 | (defn DictionaryEntry-setStem [this stem]
61 | (swap! (.state this) assoc :stem? stem))
62 | (defn DictionaryEntry-stemmer [this]
63 | (@(.state this) :stemmer))
64 | (defn DictionaryEntry-setStemmer [this stemmer]
65 | (swap! (.state this) assoc :stemmer stemmer))
66 | (defn DictionaryEntry-slop [this]
67 | (@(.state this) :slop))
68 | (defn DictionaryEntry-setSlop [this slop]
69 | (swap! (.state this) assoc :slop slop))
70 | (defn DictionaryEntry-meta [this]
71 | (@(.state this) :meta))
72 | (defn DictionaryEntry-setMeta [this meta]
73 | (swap! (.state this) assoc :meta meta))
74 | (defn DictionaryEntry-tokenizer [this]
75 | (@(.state this) :tokenizer))
76 | (defn DictionaryEntry-setTokenizer [this tokenizer]
77 | (swap! (.state this) assoc :tokenizer tokenizer))
78 |
79 | (gen-class
80 | :name lt.tokenmill.beagle.phrases.Annotator
81 | :state "state"
82 | :init "init"
83 | :constructors {[java.util.Collection] []
84 | [java.util.Collection java.util.Map] []}
85 | :prefix Phrases-
86 | :methods [[annotate [String] java.util.Collection]
87 | [annotate [String java.util.Map] java.util.Collection]])
88 |
89 | (defn Phrases-init
90 | ([dictionary] (Phrases-init dictionary {}))
91 | ([dictionary opts]
92 | [[] (atom {:dictionary dictionary
93 | :annotator-fn (phrases/highlighter
94 | (map (fn [dictionary-entry]
95 | {:text (.text dictionary-entry)
96 | :type (.type dictionary-entry)
97 | :id (.id dictionary-entry)
98 | :synonyms (.synonyms dictionary-entry)
99 | :case-sensitive? (.caseSensitive dictionary-entry)
100 | :ascii-fold? (.asciiFold dictionary-entry)
101 | :stem? (.stem dictionary-entry)
102 | :stemmer (keyword (.stemmer dictionary-entry))
103 | :slop (.slop dictionary-entry)
104 | :tokenizer (keyword (.tokenizer dictionary-entry))
105 | :meta (.meta dictionary-entry)}) dictionary)
106 | (reduce (fn [m [k v]]
107 | (assoc m (keyword k) v)) {} opts))})]))
108 |
109 | (defn Phrases-annotate
110 | ([this text] (Phrases-annotate this text {}))
111 | ([this text opts]
112 | (map (fn [ann] (lt.tokenmill.beagle.phrases.Annotation.
113 | (:text ann)
114 | (:type ann)
115 | (long (:begin-offset ann))
116 | (long (:end-offset ann))
117 | (:dict-entry-id ann)
118 | (:meta ann)))
119 | ((@(.state this) :annotator-fn) text (reduce (fn [m [k v]]
120 | (assoc m (keyword k) v)) {} opts)))))
121 |
--------------------------------------------------------------------------------
/src/beagle/lucene_alpha.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.lucene-alpha
2 | (:require [clojure.string :as s]
3 | [clojure.tools.logging :as log]
4 | [beagle.monitor :as monitor]
5 | [beagle.text-analysis :as text-analysis])
6 | (:import (org.apache.lucene.monitor MonitorQuery QueryMatch Monitor)
7 | (org.apache.lucene.queryparser.classic QueryParser ParseException)
8 | (org.apache.lucene.document Document Field FieldType)
9 | (org.apache.lucene.index IndexOptions)))
10 |
11 | (def ^FieldType field-type
12 | (doto (FieldType.)
13 | (.setTokenized true)
14 | (.setIndexOptions IndexOptions/DOCS_AND_FREQS)
15 | (.setStoreTermVectors true)
16 | (.setStoreTermVectorOffsets true)))
17 |
18 | (defn match-text [^String text ^Monitor monitor field-names type-name]
19 | (let [doc (Document.)]
20 | (doseq [field-name field-names]
21 | (.add doc (Field. ^String field-name text field-type)))
22 | (map (fn [^QueryMatch query-match]
23 | (let [^MonitorQuery query (.getQuery monitor (.getQueryId query-match))
24 | meta (.getMetadata query)]
25 | {:text (.getQueryString query)
26 | :type (or (get meta "_type") type-name)
27 | :dict-entry-id (.getQueryId query-match)
28 | :meta (into {} meta)})) (.getMatches (.match monitor doc (QueryMatch/SIMPLE_MATCHER))))))
29 |
30 | (defn dict-entry->monitor-queries [{:keys [id text meta type] :as dict-entry} default-analysis-conf idx]
31 | (try
32 | (let [query-id (or id (str idx))
33 | metadata (reduce-kv (fn [m k v] (assoc m (name k) v)) {} (if type (assoc meta :_type type) meta))]
34 | (MonitorQuery. query-id
35 | (.parse (QueryParser.
36 | (text-analysis/get-field-name dict-entry default-analysis-conf)
37 | (text-analysis/get-string-analyzer dict-entry default-analysis-conf))
38 | text)
39 | text
40 | metadata))
41 | (catch ParseException e
42 | (log/errorf "Failed to parse query: '%s' with exception '%s'" dict-entry e))
43 | (catch Exception e (log/errorf "Failed to create query: '%s' with '%s'" dict-entry e))))
44 |
45 | (defn dictionary->monitor-queries [dictionary default-analysis-conf]
46 | (remove nil?
47 | (map (fn [dict-entry idx]
48 | (dict-entry->monitor-queries dict-entry default-analysis-conf idx))
49 | dictionary (range))))
50 |
51 | (defn match-monitor [text monitor field-names type-name opts]
52 | (log/debugf "Match monitor with opts='%s'" opts)
53 | (if (s/blank? text)
54 | []
55 | (match-text text monitor field-names type-name)))
56 |
57 | (defn annotator
58 | ([dictionary] (annotator dictionary {}))
59 | ([dictionary {:keys [type-name tokenizer]}]
60 | (let [type-name (if (s/blank? type-name) "QUERY" type-name)
61 | {:keys [monitor field-names]} (monitor/setup dictionary
62 | {:tokenizer tokenizer}
63 | dictionary->monitor-queries)]
64 | (fn
65 | ([text] (match-monitor text monitor field-names type-name {}))
66 | ([text opts] (match-monitor text monitor field-names type-name opts))))))
67 |
--------------------------------------------------------------------------------
/src/beagle/monitor.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.monitor
2 | (:require [clojure.java.io :as io]
3 | [clojure.tools.logging :as log]
4 | [jsonista.core :as json]
5 | [beagle.text-analysis :as text-analysis])
6 | (:import (org.apache.lucene.monitor MonitorConfiguration Monitor MonitorQuerySerializer MonitorQuery)
7 | (org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper)
8 | (org.apache.lucene.util BytesRef)
9 | (org.apache.lucene.search MatchAllDocsQuery)
10 | (java.util ArrayList)))
11 |
12 | (def monitor-query-serializer
13 | (reify MonitorQuerySerializer
14 | (serialize [_ query]
15 | (BytesRef.
16 | (json/write-value-as-string
17 | {"query-id" (.getId query)
18 | "query" (.getQueryString query)
19 | "metadata" (.getMetadata query)})))
20 | (deserialize [_ binary-value]
21 | (let [dq (json/read-value (io/reader (.bytes ^BytesRef binary-value)))]
22 | (MonitorQuery. (get dq "query-id")
23 | (MatchAllDocsQuery.)
24 | (get dq "query")
25 | (get dq "metadata"))))))
26 |
27 | (defn create [field-names-w-analyzers]
28 | (let [^MonitorConfiguration config (MonitorConfiguration.)
29 | per-field-analyzers (PerFieldAnalyzerWrapper.
30 | (text-analysis/get-string-analyzer {} {}) field-names-w-analyzers)]
31 | (.setIndexPath config nil monitor-query-serializer)
32 | (Monitor. per-field-analyzers config)))
33 |
34 | (defn defer-to-one-by-one-registration [^Monitor monitor monitor-queries]
35 | (doseq [mq monitor-queries]
36 | (try
37 | (.register monitor (doto (ArrayList.) (.add mq)))
38 | (catch Exception e
39 | (log/errorf "Failed to register query: '%s'" mq)
40 | (.printStackTrace e)))))
41 |
42 | (defn register-queries [^Monitor monitor monitor-queries]
43 | (try
44 | (.register monitor ^Iterable monitor-queries)
45 | (catch Exception _
46 | (defer-to-one-by-one-registration monitor monitor-queries))))
47 |
48 | (defn field-name-analyzer-mappings
49 | "Creates a map with field names as keys and Lucene analyzers as values.
50 | Both field name and analyzer are decided based on the dictionary entry configuration.
51 | First group dictionary entries by field name. Then from every group of dictionary entries
52 | take the first entry and create an analyzer based on analysis configuration."
53 | [dictionary default-analysis-conf]
54 | (->> dictionary
55 | (group-by (fn [dictionary-entry]
56 | (text-analysis/get-field-name dictionary-entry default-analysis-conf)))
57 | (reduce (fn [acc [field-name dict]]
58 | (assoc acc field-name (text-analysis/get-string-analyzer (first dict) default-analysis-conf)))
59 | {})))
60 |
61 | (defn prepare [monitor dict-entries default-analysis-conf dictionary->monitor-queries-fn]
62 | (register-queries monitor (dictionary->monitor-queries-fn dict-entries default-analysis-conf)))
63 |
64 | (defn setup
65 | "Setups the monitor with all the dictionary entries."
66 | [dictionary default-analysis-conf dict-entry->monitor-queries-fn]
67 | (let [mappings-from-field-names-to-analyzers (field-name-analyzer-mappings dictionary default-analysis-conf)
68 | monitor (create mappings-from-field-names-to-analyzers)]
69 | (prepare monitor dictionary default-analysis-conf dict-entry->monitor-queries-fn)
70 | {:monitor monitor
71 | :field-names (keys mappings-from-field-names-to-analyzers)}))
72 |
--------------------------------------------------------------------------------
/src/beagle/phrases.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.phrases
2 | (:require [clojure.string :as s]
3 | [clojure.tools.logging :as log]
4 | [beagle.validator :as validator]
5 | [beagle.annotation-merger :as merger]
6 | [beagle.dictionary-optimizer :as optimizer]
7 | [beagle.text-analysis :as text-analysis]
8 | [beagle.monitor :as monitor]
9 | [beagle.schema :refer [->Highlight ->DictionaryEntry]])
10 | (:import (java.util UUID)
11 | (org.apache.lucene.document Document FieldType Field)
12 | (org.apache.lucene.index IndexOptions Term)
13 | (org.apache.lucene.monitor Monitor MonitorQuery HighlightsMatch HighlightsMatch$Hit)
14 | (org.apache.lucene.search MultiPhraseQuery$Builder FuzzyQuery)
15 | (org.apache.lucene.search.spans SpanNearQuery$Builder SpanTermQuery SpanMultiTermQueryWrapper)))
16 |
17 | (defn filter-and-sort-ordered-hits [^String text ^String highlight-text ordered-hits]
18 | (->> ordered-hits
19 | (filter (fn [^HighlightsMatch$Hit hit]
20 | (= highlight-text (let [s (.-startOffset hit)
21 | e (.-endOffset hit)]
22 | (subs text s e)))))
23 | (sort-by (fn [^HighlightsMatch$Hit hit] (.-startOffset hit)))))
24 |
25 | (defn group-sequencial-ending
26 | "Groups a sequence taking only the last hit from a consecutive sub-sequence
27 | of terms, e.g. [1 2 3 6 7] => [3 7]"
28 | [spans-end-hits]
29 | (loop [[current-term & terms] spans-end-hits
30 | last-item nil
31 | current-seq []
32 | filtered-ends []]
33 | (if (nil? current-term)
34 | (conj filtered-ends (last current-seq))
35 | (if (nil? last-item)
36 | (recur terms current-term [current-term] (if (seq current-seq)
37 | (conj filtered-ends (last current-seq))
38 | filtered-ends))
39 | (if (= (inc (.-startPosition last-item)) (.-startPosition current-term))
40 | (recur terms current-term (conj current-seq current-term) filtered-ends)
41 | (recur terms current-term [current-term] (conj filtered-ends (last current-seq))))))))
42 |
43 | (defn pair-begins-with-ends [spans-start-hits spans-end-hits]
44 | (let [grouped-ends (group-sequencial-ending spans-end-hits)]
45 | (loop [[start & starts-tail :as starts] spans-start-hits
46 | [end & ends-tail] grouped-ends
47 | pairs []]
48 | (if (or (nil? start) (nil? end))
49 | pairs
50 | (if (= start end)
51 | (recur starts ends-tail pairs)
52 | (recur (remove #(< (.-startPosition %) (.-startPosition end)) starts-tail)
53 | ends-tail (conj pairs [start end])))))))
54 |
55 | (defn ordered-hits->highlights
56 | "The default highlighter fails to handle SpanNearQuery: highlights are term highlights not the whole
57 | span highlights.
58 | The temporary workaround works as follows:
59 | 1) find the very first hit
60 | 2) find the very last hit
61 | 3) assume that all spans begins and ends with the same terms
62 | 4) collect all hits like the beginning
63 | 5) collect all hits like the ending
64 | 6) pair beginnings with endings and make one highlight per pair"
65 | [text type-name query-id metadata ordered-hits]
66 | (let [^HighlightsMatch$Hit first-hit (apply min-key #(.-startOffset ^HighlightsMatch$Hit %) ordered-hits)
67 | first-text (subs text (.-startOffset first-hit) (.-endOffset first-hit))
68 | ^HighlightsMatch$Hit last-hit (apply max-key #(.-startOffset ^HighlightsMatch$Hit %) ordered-hits)
69 | last-text (subs text (.-startOffset last-hit) (.-endOffset last-hit))
70 | spans-start-hits (filter-and-sort-ordered-hits text first-text ordered-hits)
71 | spans-end-hits (filter-and-sort-ordered-hits text last-text ordered-hits)
72 | normalized-metadata (dissoc metadata "_in-order")]
73 | (map (fn [[^HighlightsMatch$Hit span-start-hit ^HighlightsMatch$Hit span-end-hit]]
74 | (let [start-offset (.-startOffset span-start-hit)
75 | end-offset (.-endOffset span-end-hit)]
76 | (->Highlight
77 | (subs text start-offset end-offset)
78 |              (or (get metadata "_type") type-name)
79 | query-id
80 | normalized-metadata
81 | start-offset
82 | end-offset))) (pair-begins-with-ends spans-start-hits spans-end-hits))))
83 |
84 | (defn match->annotation [text ^Monitor monitor type-name ^HighlightsMatch match]
85 | (mapcat
86 | (fn [[_ hits]]
87 | (let [query-id (.getQueryId match)
88 | metadata (into {} (.getMetadata (.getQuery monitor query-id)))]
89 | (if (get metadata "_in-order")
90 | (ordered-hits->highlights text type-name query-id metadata hits)
91 | (map (fn [^HighlightsMatch$Hit hit]
92 | (let [start-offset (.-startOffset hit)
93 | end-offset (.-endOffset hit)]
94 | (->Highlight
95 | (subs text start-offset end-offset)
96 | (or (get metadata "_type") type-name)
97 | query-id
98 | metadata
99 | start-offset
100 | end-offset))) hits))))
101 | (.getHits match)))
102 |
103 | (def ^FieldType field-type
104 | (doto (FieldType.)
105 | (.setTokenized true)
106 | (.setIndexOptions IndexOptions/DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
107 | (.setStoreTermVectors true)
108 | (.setStoreTermVectorOffsets true)))
109 |
110 | (defn annotate-text [^String text ^Monitor monitor field-names ^String type-name]
111 | (try
112 | (let [doc (Document.)]
113 | (doseq [field-name field-names]
114 | (.add doc (Field. ^String field-name text field-type)))
115 | (mapcat #(match->annotation text monitor type-name %)
116 | (.getMatches
117 | (.match monitor
118 | #^"[Lorg.apache.lucene.document.Document;" (into-array Document [doc])
119 | (HighlightsMatch/MATCHER))
120 | 0)))
121 | (catch Exception e
122 | (log/errorf "Failed to match text: '%s'" text)
123 | (.printStackTrace e))))
124 |
125 | (defn prepare-synonyms [query-id {:keys [synonyms] :as dict-entry}]
126 | (map (fn [synonym]
127 | (->DictionaryEntry
128 | synonym
129 | (:type dict-entry)
130 | (str (UUID/randomUUID))
131 | nil
132 | (:case-sensitive? dict-entry)
133 | (:ascii-fold? dict-entry)
134 | (:stem? dict-entry)
135 | (:stemmer dict-entry)
136 | (:slop dict-entry)
137 | (:tokenizer dict-entry)
138 | (assoc (:meta dict-entry)
139 | :synonym? "true" :query-id query-id)))
140 | synonyms))
141 |
142 | (defn dict-entry->terms [dict-entry default-analysis-conf]
143 | (let [analyzer (text-analysis/get-string-analyzer dict-entry default-analysis-conf)]
144 | (into-array String (text-analysis/text->token-strings (:text dict-entry) analyzer))))
145 |
146 | (defn merge-dict-entry-with-highlighter-opts
147 | "There are dictionary opts that do not contribute to text analysis, but contributes
148 | to querying. This function acts a single point in merging default highlighter opts
149 | to the dictionary entry."
150 | [dict-entry default-analysis-conf]
151 | (cond-> dict-entry
152 | (and (not (contains? dict-entry :slop))
153 | (contains? default-analysis-conf :slop))
154 | (assoc :slop (:slop default-analysis-conf))
155 |
156 | (and (not (contains? dict-entry :in-order?))
157 | (contains? default-analysis-conf :in-order?))
158 | (assoc :in-order? (:in-order? default-analysis-conf))))
159 |
160 | (defn dict-entry->monitor-query [dict-entry default-analysis-conf idx]
161 | (let [field-name (text-analysis/get-field-name dict-entry default-analysis-conf)
162 | terms (dict-entry->terms dict-entry default-analysis-conf)
163 | {:keys [id text meta type slop in-order?]
164 | :as dict-entry} (merge-dict-entry-with-highlighter-opts dict-entry default-analysis-conf)
165 | query-id (or id (str idx))
166 | metadata (reduce (fn [m [k v]] (assoc m (name k) v)) {} (if type (assoc meta :_type type) meta))
167 | normalized-slop (when slop (max 0 (min slop Integer/MAX_VALUE)))]
168 | (if (seq terms)
169 |     (if (or (and (number? slop) (pos? slop) in-order? (< 1 (count terms)))
170 | (:fuzzy? dict-entry))
171 | (MonitorQuery. query-id
172 | (try
173 | (let [ordered? (cond
174 | in-order? true
175 | (and (nil? in-order?) (:fuzzy? dict-entry)) true
176 | :else false)
177 | snqb (SpanNearQuery$Builder. ^String field-name ordered?)]
178 | (doseq [term terms]
179 | (if (true? (:fuzzy? dict-entry))
180 | (.addClause snqb (SpanMultiTermQueryWrapper.
181 | (FuzzyQuery.
182 | (Term. ^String field-name ^String term)
183 | (or (:fuzziness dict-entry) 1))))
184 | (.addClause snqb (SpanTermQuery. (Term. ^String field-name ^String term)))))
185 | (when-not (= slop normalized-slop)
186 | (log/warnf "Phrase slop '%s' normalized to '%s'" slop normalized-slop))
187 | (when normalized-slop
188 | (.setSlop snqb normalized-slop))
189 | (.build snqb))
190 | (catch Exception e (.printStackTrace e)))
191 | text
192 | (assoc metadata "_in-order" true))
193 | (MonitorQuery. query-id
194 | (let [mpqb (MultiPhraseQuery$Builder.)]
195 | (doseq [s terms]
196 | (.add mpqb (Term. ^String field-name ^String s)))
197 | (when slop
198 | (when-not (= slop normalized-slop)
199 | (log/warnf "Phrase slop '%s' normalized to '%s'" slop normalized-slop))
200 | (.setSlop mpqb normalized-slop))
201 | (.build mpqb))
202 | text
203 | metadata))
204 | (log/warnf "Discarding the dictionary entry because no tokens: '%s'" dict-entry))))
205 |
206 | (defn dict-entries->monitor-queries [dict-entries default-analysis-conf]
207 | (->> dict-entries
208 | (mapcat (fn [idx dict-entry]
209 | (let [query-id (or (get dict-entry :id) (str idx))]
210 | (cons
211 | (dict-entry->monitor-query dict-entry default-analysis-conf idx)
212 | (map #(dict-entry->monitor-query % default-analysis-conf nil)
213 | (prepare-synonyms query-id dict-entry)))))
214 | (range))
215 | (remove nil?)))
216 |
217 | (defn synonym-annotation? [annotation]
218 | (= "true" (get-in annotation [:meta "synonym?"])))
219 |
220 | (defn meta-type? [annotation]
221 | (string? (get-in annotation [:meta "_type"])))
222 |
223 | (defn post-process [annotation]
224 | (cond-> annotation
225 | (synonym-annotation? annotation) (assoc :dict-entry-id (get-in annotation [:meta "query-id"]))
226 | (meta-type? annotation) (update-in [:meta] dissoc "_type")))
227 |
228 | (defn match [text monitor field-names type-name opts]
229 | (if (s/blank? text)
230 | []
231 | (let [annotations (map post-process (annotate-text text monitor field-names type-name))]
232 | (if (:merge-annotations? opts)
233 | (merger/merge-same-type-annotations annotations)
234 | annotations))))
235 |
236 | (defn highlighter
237 | "Creates a highlighter function with for a given dictionary.
238 | Params:
239 | - dictionary
240 | a list of dictionary entries as described in `beagle.schema/dict-entry`
241 | Opts:
242 | - type-name
243 | a string, defaults to \"PHRASE\"
244 | - validate-dictionary?
245 | if set to true then validates the dictionary, default false
246 | - optimize-dictionary?
247 | if set to true then optimizes dictionary before creating the monitor, default false
248 | - tokenizer
249 | a keyword one of #{:keyword :letter :standard :classic :strict :unicode-whitespace :whitespace}, default :standard
250 | - case-sensitive?
251 | if set to true text matching is case sensitive, default true
252 | - ascii-fold?
253 | if set to true before matching text is ascii folded, default false
254 | - stem?
255 | if set to true before matching text is stemmed, default false
256 | - stemmer
257 | a keyword one of #{:arabic :armenian :basque :catalan :danish :dutch :english :estonian
258 | :finnish :french :german :german2 :hungarian :irish :italian :kp :lithuanian :lovins
259 | :norwegian :porter :portuguese :romanian :russian :spanish :swedish :turkish}
260 | that specifies the stemmer algorithm, default :english
261 | - slop
262 | the max edit-distance for phrase matching, default 0
263 | - in-order?
264 | if set to true enforces phrase terms ordering in matches, default false"
265 | ([dictionary] (highlighter dictionary {}))
266 | ([dictionary opts]
267 | (when (:validate-dictionary? opts) (validator/validate-dictionary dictionary))
268 | (let [dictionary (if (:optimize-dictionary? opts) (optimizer/optimize dictionary) dictionary)
269 | type-name (if (s/blank? (:type-name opts)) "PHRASE" (:type-name opts))
270 | {:keys [monitor field-names]} (monitor/setup dictionary opts dict-entries->monitor-queries)]
271 | (fn
272 | ([text] (match text monitor field-names type-name {}))
273 | ([text opts] (match text monitor field-names type-name opts))))))
274 |
275 | (defn ^:deprecated annotator
276 | [dictionary & {:keys [type-name validate-dictionary? optimize-dictionary? tokenizer]}]
277 | (when validate-dictionary? (validator/validate-dictionary dictionary))
278 | (let [dictionary (if optimize-dictionary? (optimizer/optimize dictionary) dictionary)
279 | type-name (if (s/blank? type-name) "PHRASE" type-name)
280 | {:keys [monitor field-names]} (monitor/setup dictionary {:tokenizer tokenizer}
281 | dict-entries->monitor-queries)]
282 | (fn
283 | ([text] (match text monitor field-names type-name {}))
284 | ([text & {:as opts}] (match text monitor field-names type-name opts)))))
285 |
--------------------------------------------------------------------------------
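
A short usage sketch of the `highlighter` function documented above (entry and text are illustrative; the returned Highlight record is shown as a plain map):

(require '[beagle.phrases :as phrases])

;; Per-entry opts (ascii-fold?) combine with highlighter-level defaults (case-sensitive?).
(def highlighter-fn
  (phrases/highlighter [{:text "wörd" :id "w1" :ascii-fold? true}]
                       {:type-name "TERM" :case-sensitive? false}))

(highlighter-fn "before Word after")
;; => ({:text "Word" :type "TERM" :dict-entry-id "w1" :meta {}
;;      :begin-offset 7 :end-offset 11})
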
/src/beagle/readers.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.readers
2 | (:require [clojure.string :as s]
3 | [clojure.edn :as edn]
4 | [clojure.java.io :as io]
5 | [clojure.data.csv :as csv]
6 | [jsonista.core :as json])
7 | (:import (java.io PushbackReader)))
8 |
9 | (def mapper (json/object-mapper {:decode-key-fn true}))
10 |
11 | (defn read-edn
12 | "Reads dictionary from the source.
13 | `source` - must be something that an input stream can be created."
14 | [source]
15 | (with-open [rdr (PushbackReader. (io/reader (io/input-stream source)))]
16 | (doall (edn/read rdr))))
17 |
18 | (defn read-csv [source]
19 | (with-open [reader (io/reader source)]
20 | (let [[header & lines] (csv/read-csv reader :separator \, :quote \")
21 | kvs (map keyword header)]
22 | (->> lines
23 | (map (fn [line] (map s/trim line)))
24 | (map #(apply array-map (interleave kvs %)))
25 | (map #(into {} (remove (fn [[_ v]] (s/blank? v)) %)))
26 | (map (fn [{:keys [synonyms] :as dict}]
27 | (if-not (s/blank? synonyms)
28 | (assoc dict :synonyms (map s/trim (s/split synonyms #";")))
29 | dict)))
30 | (map (fn [{:keys [case-sensitive?] :as dict}]
31 | (if-not (s/blank? case-sensitive?)
32 | (assoc dict :case-sensitive? (Boolean/valueOf ^String case-sensitive?))
33 | dict)))
34 | (map (fn [{:keys [ascii-fold?] :as dict}]
35 | (if-not (s/blank? ascii-fold?)
36 | (assoc dict :ascii-fold? (Boolean/valueOf ^String ascii-fold?))
37 | dict)))
38 | (map (fn [{:keys [meta] :as dict}]
39 | (if-not (s/blank? meta)
40 | (assoc dict :meta (reduce (fn [acc [k v]] (assoc acc k v))
41 | {}
42 | (->> (map s/trim (s/split meta #";"))
43 | (partition-all 2)
44 | (remove (fn [[_ v]] (s/blank? (str v)))))))
45 |
46 | dict)))
47 | (doall)))))
48 |
49 | (defn read-json [source]
50 | (with-open [rdr (io/reader (io/input-stream source))]
51 | (doall (json/read-value rdr mapper))))
52 |
--------------------------------------------------------------------------------
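
The CSV reader above expects a header row whose column names become entry keys; `synonyms` and `meta` cells are `;`-separated, and blank cells are dropped. A sketch with a hypothetical file:

;; my-dict.csv (illustrative contents):
;;   text,type,synonyms,case-sensitive?,meta
;;   Token Mill,COMPANY,TokenMill;tokenmill,false,source;crm

(require '[beagle.readers :as readers])

(readers/read-csv "my-dict.csv")
;; => ({:text "Token Mill"
;;      :type "COMPANY"
;;      :synonyms ("TokenMill" "tokenmill")
;;      :case-sensitive? false
;;      :meta {"source" "crm"}})
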
/src/beagle/schema.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.schema
2 | (:require [clojure.spec.alpha :as s]
3 | [clojure.spec.gen.alpha :as gen]
4 | [clojure.string :as str]))
5 |
6 | (s/def ::non-empty-string
7 | (s/and string? (complement str/blank?)))
8 |
9 | (s/def ::text ::non-empty-string)
10 | (s/def ::type (s/nilable string?))
11 | (s/def ::id (s/nilable string?))
12 | (s/def ::synonyms (s/nilable (s/coll-of ::non-empty-string)))
13 | (s/def ::case-sensitive? (s/nilable boolean?))
14 | (s/def ::ascii-fold? (s/nilable boolean?))
15 | (s/def ::stem? (s/nilable boolean?))
16 | (s/def ::stemmer (s/nilable keyword?))
17 | (s/def ::slop (s/nilable #(and (number? %) (or (pos-int? %) (zero? %)))))
18 | (s/def ::tokenizer (s/nilable keyword?))
19 | (s/def ::in-order? (s/nilable boolean?))
20 | (s/def ::meta
21 | (s/with-gen
22 | (s/nilable (s/map-of #(or (string? %) (keyword? %)) string?))
23 | #(gen/fmap (fn [s] {s s}) (s/gen string?))))
24 |
25 | (s/def ::dict-entry
26 | (s/keys :req-un [::text]
27 | :opt-un [::type ::id ::synonyms ::meta
28 | ::case-sensitive? ::ascii-fold? ::stem? ::stemmer ::slop
29 | ::tokenizer ::in-order?]))
30 |
31 | (defrecord DictionaryEntry [text type id synonyms case-sensitive? ascii-fold?
32 | stem? stemmer slop tokenizer meta])
33 |
34 | (s/def ::dictionary (s/coll-of ::dict-entry))
35 |
36 | (s/def ::begin-offset nat-int?) ; an annotation can start at offset 0
37 | (s/def ::end-offset pos-int?)
38 | (s/def ::dict-entry-id ::non-empty-string)
39 |
40 | (s/def ::dictionary-annotation
41 | (s/keys :req-un [::text ::type ::begin-offset ::end-offset]
42 | :opt-un [::dict-entry-id ::meta]))
43 |
44 | (defrecord Highlight [text type dict-entry-id meta begin-offset end-offset])
45 |
46 | (s/def ::annotations (s/coll-of ::dictionary-annotation))
47 |
--------------------------------------------------------------------------------
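
The specs above can be used directly to check dictionaries before loading them; a minimal sketch:

(require '[clojure.spec.alpha :as s]
         '[beagle.schema :as schema])

(s/valid? ::schema/dict-entry {:text "Token Mill" :slop 1 :in-order? true}) ;; => true
(s/valid? ::schema/dict-entry {:text ""}) ;; => false, ::text must be a non-empty string
(s/explain-str ::schema/dictionary [{:type "MISSING-TEXT"}]) ;; human-readable failure report
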
/src/beagle/text_analysis.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.text-analysis
2 | (:require [clojure.string :as string]
3 | [clojure.tools.logging :as log])
4 | (:import (org.apache.lucene.analysis Analyzer Analyzer$TokenStreamComponents Tokenizer TokenStream)
5 | (org.apache.lucene.analysis.core LowerCaseFilter WhitespaceTokenizer LetterTokenizer KeywordTokenizer UnicodeWhitespaceTokenizer)
6 | (org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter)
7 | (org.apache.lucene.analysis.standard ClassicFilter StandardTokenizer ClassicTokenizer)
8 | (org.apache.lucene.analysis.tokenattributes CharTermAttribute)
9 | (org.apache.lucene.analysis.pattern PatternTokenizer)
10 | (org.apache.lucene.analysis.snowball SnowballFilter)
11 | (org.tartarus.snowball.ext LithuanianStemmer ArabicStemmer ArmenianStemmer BasqueStemmer EnglishStemmer CatalanStemmer DanishStemmer DutchStemmer EstonianStemmer FinnishStemmer FrenchStemmer German2Stemmer GermanStemmer HungarianStemmer IrishStemmer ItalianStemmer KpStemmer LovinsStemmer NorwegianStemmer PorterStemmer PortugueseStemmer RomanianStemmer RussianStemmer SpanishStemmer SwedishStemmer TurkishStemmer)
12 | (org.tartarus.snowball SnowballProgram)
13 | (java.io StringReader)))
14 |
15 | (defn ^SnowballProgram stemmer
16 | "Creates a stemmer object given the stemmer keyword.
17 | Default stemmer is English."
18 | [stemmer-kw]
19 | (case stemmer-kw
20 | :arabic (ArabicStemmer.)
21 | :armenian (ArmenianStemmer.)
22 | :basque (BasqueStemmer.)
23 | :catalan (CatalanStemmer.)
24 | :danish (DanishStemmer.)
25 | :dutch (DutchStemmer.)
26 | :english (EnglishStemmer.)
27 | :estonian (EstonianStemmer.)
28 | :finnish (FinnishStemmer.)
29 | :french (FrenchStemmer.)
30 | :german2 (German2Stemmer.)
31 | :german (GermanStemmer.)
32 | :hungarian (HungarianStemmer.)
33 | :irish (IrishStemmer.)
34 | :italian (ItalianStemmer.)
35 | :kp (KpStemmer.)
36 | :lithuanian (LithuanianStemmer.)
37 | :lovins (LovinsStemmer.)
38 | :norwegian (NorwegianStemmer.)
39 | :porter (PorterStemmer.)
40 | :portuguese (PortugueseStemmer.)
41 | :romanian (RomanianStemmer.)
42 | :russian (RussianStemmer.)
43 | :spanish (SpanishStemmer.)
44 | :swedish (SwedishStemmer.)
45 | :turkish (TurkishStemmer.)
46 | (do
47 | (when stemmer-kw
48 | (log/debugf "Stemmer '%s' not found! EnglishStemmer is used." stemmer-kw))
49 | (EnglishStemmer.))))
50 |
51 | (defn ^Tokenizer tokenizer [tokenizer-kw]
52 | (case tokenizer-kw
53 | :keyword (KeywordTokenizer.)
54 | :letter (LetterTokenizer.)
55 | :classic (ClassicTokenizer.)
56 | :standard (StandardTokenizer.)
57 | :strict (PatternTokenizer. #"[^a-zA-Z0-9{}\[\]()<>#+=@&']+" -1)
58 | :unicode-whitespace (UnicodeWhitespaceTokenizer.)
59 | :whitespace (WhitespaceTokenizer.)
60 | (do
61 | (when tokenizer-kw
62 | (log/debugf "Tokenizer '%s' not found. StandardTokenizer is used." tokenizer-kw))
63 | (StandardTokenizer.))))
64 |
65 | (defn analyzer-constructor [{tokenizer-kw :tokenizer
66 | ascii-fold? :ascii-fold?
67 | case-sensitive? :case-sensitive?
68 | stem? :stem?
69 | stemmer-kw :stemmer}]
70 | (proxy [Analyzer] []
71 | (createComponents [^String field-name]
72 | (let [^Tokenizer tokenizr (tokenizer tokenizer-kw)
73 | ^TokenStream filters-chain (cond-> tokenizr
74 | (not case-sensitive?) (LowerCaseFilter.)
75 | ascii-fold? (ASCIIFoldingFilter.))
76 | token-stream (if stem?
77 | (SnowballFilter. filters-chain (stemmer stemmer-kw))
78 | (if (instance? Tokenizer filters-chain)
79 | (ClassicFilter. tokenizr)
80 | filters-chain))]
81 | (Analyzer$TokenStreamComponents.
82 | ^Tokenizer tokenizr ^TokenStream token-stream)))))
83 |
84 | (defn field-name-constructor [{tokenizer-kw :tokenizer
85 | ascii-fold? :ascii-fold?
86 | case-sensitive? :case-sensitive?
87 | stem? :stem?
88 | stemmer-kw :stemmer}]
89 | (let [tokenizr (str (name (or tokenizer-kw :standard)) "-tokenizer")
90 | filters (cond-> []
91 | (not case-sensitive?) (conj "lowercased")
92 | ascii-fold? (conj "ascii-folded")
93 | stem? (conj (str "stemmed-" (name (or stemmer-kw :english)))))]
94 | (if (seq filters)
95 | (str "text" "." tokenizr "." (string/join "-" (sort filters)))
96 | (str "text" "." tokenizr))))
97 |
98 | (def analyzer (memoize analyzer-constructor))
99 | (def field-name (memoize field-name-constructor))
100 |
101 | (def default-conf
102 | {:tokenizer :standard
103 | :case-sensitive? true
104 | :ascii-fold? false
105 | :stem? false
106 | :stemmer :english})
107 |
108 | (defrecord Conf [tokenizer case-sensitive? ascii-fold? stem? stemmer])
109 |
110 | (defn three-way-merge
111 | "Given a key and three maps return the value that would appear in the map after merge.
112 | Semantics is of the default Clojure merge."
113 | [k m1 m2 m3]
114 | (if (nil? (k m3))
115 | (if (nil? (k m2))
116 | (k m1)
117 | (k m2))
118 | (k m3)))
119 |
120 | (defn ^Analyzer get-string-analyzer [analysis-conf default-analysis-conf]
121 | (analyzer (->Conf
122 | (three-way-merge :tokenizer default-conf default-analysis-conf analysis-conf)
123 | (three-way-merge :case-sensitive? default-conf default-analysis-conf analysis-conf)
124 | (three-way-merge :ascii-fold? default-conf default-analysis-conf analysis-conf)
125 | (three-way-merge :stem? default-conf default-analysis-conf analysis-conf)
126 | (three-way-merge :stemmer default-conf default-analysis-conf analysis-conf))))
127 |
128 | (defn ^String get-field-name [analysis-conf default-analysis-conf]
129 | (field-name (->Conf
130 | (three-way-merge :tokenizer default-conf default-analysis-conf analysis-conf)
131 | (three-way-merge :case-sensitive? default-conf default-analysis-conf analysis-conf)
132 | (three-way-merge :ascii-fold? default-conf default-analysis-conf analysis-conf)
133 | (three-way-merge :stem? default-conf default-analysis-conf analysis-conf)
134 | (three-way-merge :stemmer default-conf default-analysis-conf analysis-conf))))
135 |
136 | (defn text->token-strings
137 | "Given a text and an analyzer returns a list of tokens as strings."
138 | [^String text ^Analyzer analyzer]
139 | (let [^TokenStream token-stream (.tokenStream analyzer "not-important" (StringReader. text))
140 | ^CharTermAttribute termAtt (.addAttribute token-stream CharTermAttribute)]
141 | (.reset token-stream)
142 | (reduce (fn [acc _]
143 | (if (.incrementToken token-stream)
144 | (conj acc (.toString termAtt))
145 | (do
146 | (.end token-stream)
147 | (.close token-stream)
148 | (reduced acc)))) [] (range))))
149 |
--------------------------------------------------------------------------------
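
Per-entry options override the highlighter-level defaults, which in turn override `default-conf`; this is the `three-way-merge` resolution above. A small sketch:

(require '[beagle.text-analysis :as text-analysis])

;; highlighter-level default: case-insensitive; per-entry opt: stemming on
(text-analysis/get-field-name {:stem? true} {:case-sensitive? false})
;; => "text.standard-tokenizer.lowercased-stemmed-english"

;; the per-entry tokenizer wins over the highlighter-level one
(text-analysis/three-way-merge :tokenizer
                               text-analysis/default-conf
                               {:tokenizer :whitespace}
                               {:tokenizer :keyword})
;; => :keyword
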
/src/beagle/validator.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.validator
2 | (:gen-class)
3 | (:require [clojure.spec.alpha :as s]
4 | [beagle.schema :as sch]
5 | [beagle.readers :as readers]))
6 |
7 | (defn validate-dictionary [dictionary]
8 | (s/conform ::sch/dictionary dictionary))
9 |
10 | (defn valid-dictionary? [dictionary]
11 | (try
12 | (seq (validate-dictionary dictionary))
13 | (catch Exception _)))
14 |
15 | (def supported-dictionary-file-types #{"csv" "json" "edn"})
16 |
17 | (defn valid-dictionary-file? [dictionary-file dictionary-file-type]
18 | (if (contains? supported-dictionary-file-types dictionary-file-type)
19 | (valid-dictionary? (case dictionary-file-type
20 | "csv" (readers/read-csv dictionary-file)
21 | "json" (readers/read-json dictionary-file)
22 | "edn" (readers/read-edn dictionary-file)))
23 | (.printStackTrace (Exception. (format "File type not supported: `%s`" dictionary-file-type)))))
24 |
25 | (defn -main [& args]
26 | (when (odd? (count args))
27 | (.printStackTrace (Exception. "Even number of arguments must be present - 'dictionary-name dictionary-type ...'"))
28 | (System/exit 1))
29 | (when (some #(not (apply valid-dictionary-file? %)) (partition-all 2 args))
30 | (System/exit 1)))
31 |
--------------------------------------------------------------------------------
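
The validator doubles as a command-line entry point: arguments are consumed in pairs of dictionary file and file type, and a non-zero exit signals an invalid dictionary. From the REPL the same check looks like this (the path refers to the test resources in this repository):

(require '[beagle.validator :as validator])

(validator/valid-dictionary-file? "test/resources/dict.csv" "csv")
;; => truthy when the file parses into a valid, non-empty dictionary
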
/test/beagle/annotation_merge_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.annotation-merge-test
2 | (:require [clojure.test :refer [deftest is]]
3 | [beagle.phrases :as phrases]
4 | [beagle.annotation-merger :as merger]))
5 |
6 | (deftest annotator-with-merge-option-test
7 | (let [dictionary [{:text "1 2"} {:text "2"} {:text "1 2 3 4"}
8 | {:text "4"} {:text "5"} {:text "6 5 3 7"} {:text "6 5"}]
9 | highlighter-fn (phrases/highlighter dictionary {:type-name "TEST"})
10 | text "A B C 1 2 3 4 D E F G 6 5 3 7"]
11 | (is (= (count (highlighter-fn text {:merge-annotations? false})) (count (highlighter-fn text))))
12 | (is (< (count (highlighter-fn text {:merge-annotations? true})) (count (highlighter-fn text))))
13 | (is (= [(set (vals {:begin-offset 6
14 | :dict-entry-id "2"
15 | :end-offset 13
16 | :meta {}
17 | :text "1 2 3 4"
18 | :type "TEST"}))
19 | (set (vals {:begin-offset 22
20 | :dict-entry-id "5"
21 | :end-offset 29
22 | :meta {}
23 | :text "6 5 3 7"
24 | :type "TEST"}))]
25 | (map #(-> % vals set) (highlighter-fn text {:merge-annotations? true}))))))
26 |
27 | (deftest annotation-merge-test
28 | (is (= [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5}]
29 | (merger/merge-same-type-annotations
30 | [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5}
31 | {:text "A" :type "TEST" :dict-entry-id "3" :meta {} :begin-offset 0 :end-offset 1}
32 | {:text "AAAA" :type "TEST" :dict-entry-id "2" :meta {} :begin-offset 1 :end-offset 5}])))
33 |
34 | (is (= [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5}
35 | {:text "AAA" :type "TEST2" :dict-entry-id "10" :meta {} :begin-offset 0 :end-offset 3}]
36 | (merger/merge-same-type-annotations
37 | [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5}
38 | {:text "A" :type "TEST" :dict-entry-id "2" :meta {} :begin-offset 0 :end-offset 1}
39 | {:text "AAAA" :type "TEST" :dict-entry-id "3" :meta {} :begin-offset 1 :end-offset 5}
40 | {:text "AAA" :type "TEST2" :dict-entry-id "10" :meta {} :begin-offset 0 :end-offset 3}
41 | {:text "A" :type "TEST2" :dict-entry-id "11" :meta {} :begin-offset 0 :end-offset 1}]))))
42 |
43 |
--------------------------------------------------------------------------------
/test/beagle/corner_case_phrases_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.corner-case-phrases-test
2 | (:require [clojure.test :refer [deftest is]]
3 | [beagle.phrases :as phrases])
4 | (:import (org.jsoup Jsoup)))
5 |
6 | (deftest corner-cases
7 | (let [annotator (phrases/highlighter [{:text "N-Able N-Central"
8 | :case-sensitive? false}])
9 | text (some-> (Jsoup/parse (slurp "test/resources/phrases.html")) (.body) (.text))]
10 | (is (empty? (annotator text)))))
11 |
--------------------------------------------------------------------------------
/test/beagle/dictionary_optimization_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.dictionary-optimization-test
2 | (:require [clojure.test :refer [deftest is]]
3 | [beagle.dictionary-optimizer :as optimizer]
4 | [beagle.phrases :as phrases]))
5 |
6 | (deftest meta-merge-test
7 | (is (optimizer/mergeable-meta? nil {:meta {:email "123"}}))
8 | (is (optimizer/mergeable-meta? {:meta {}} {:meta {:email "123"}}))
9 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} nil))
10 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "123"}}))
11 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "123" :total 5646}}))
12 | (is (optimizer/mergeable-meta? {:meta {:email "123" :total 5646}} {:meta {:email "123"}}))
13 | (is (not (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "321"}})))
14 | (is (not (optimizer/mergeable-meta? {:meta {:email "123" :total 5646}} {:meta {:email "123" :total 9999}})))
15 | (is (= [{:ascii-fold? true
16 | :case-sensitive? true
17 | :id "test-id"
18 | :meta {:abc "123" :email "test@example.com"}
19 | :synonyms ["abc" "XXXX"]
20 | :text "test text"}
21 | {:ascii-fold? true
22 | :case-sensitive? true
23 | :id "test-id"
24 | :meta {:email "bobby@example.com"}
25 | :synonyms ["def"]
26 | :text "test text"}]
27 | (optimizer/aggregate-entries-by-meta
28 | [{:text "test text"
29 | :id "test-id"
30 | :synonyms ["abc"]
31 | :case-sensitive? true
32 | :ascii-fold? true
33 | :meta {:email "test@example.com"}}
34 | {:text "test text"
35 | :id "test-id"
36 | :synonyms ["def"]
37 | :case-sensitive? true
38 | :ascii-fold? true
39 | :meta {:email "bobby@example.com"}}
40 | {:text "test text"
41 | :id "test-id"
42 | :synonyms ["XXXX"]
43 | :case-sensitive? true
44 | :ascii-fold? true
45 | :meta {:email "test@example.com" :abc "123"}}]))))
46 |
47 | (deftest dictionary-optimization-test
48 | (let [dictionary [{:case-sensitive? true
49 | :ascii-fold? true
50 | :synonyms ["AAAA1"]
51 | :text "AAAA"}
52 | {:case-sensitive? true
53 | :ascii-fold? true
54 | :synonyms ["AAAA2"]
55 | :text "AAAA"}
56 | {:case-sensitive? false
57 | :ascii-fold? true
58 | :synonyms ["AAAA3"]
59 | :text "AAAA"}
60 | {:case-sensitive? true
61 | :ascii-fold? true
62 | :synonyms ["AAAA4"]
63 | :text "AAAA"}
64 | {:case-sensitive? true
65 | :ascii-fold? false
66 | :synonyms ["AAAA5"]
67 | :text "AAAA"}
68 | {:case-sensitive? true
69 | :ascii-fold? false
70 | :synonyms ["AAAA"]
71 | :text "AAAA"}
72 | {:case-sensitive? false
73 | :synonyms ["BBBB1"]
74 | :text "BBBB"}
75 | {:case-sensitive? false
76 | :synonyms ["BBBB"]
77 | :text "BBBB"}]
78 | expected-dictionary [{:text "AAAA"
79 | :synonyms ["AAAA4" "AAAA2" "AAAA1"]
80 | :case-sensitive? true
81 | :ascii-fold? true}
82 | {:case-sensitive? false :ascii-fold? true :synonyms ["AAAA3"] :text "AAAA"}
83 | {:text "AAAA" :synonyms ["AAAA5"] :case-sensitive? true :ascii-fold? false}
84 | {:text "BBBB" :synonyms ["BBBB1"] :case-sensitive? false}]
85 | optimized-dictionary (optimizer/optimize dictionary)]
86 | (is (< (count optimized-dictionary) (count dictionary)))
87 | (is (= (count expected-dictionary) (count optimized-dictionary)))
88 | (is (= (set (map #(update % :synonyms set) expected-dictionary))
89 | (set (map #(update % :synonyms set) optimized-dictionary))))))
90 |
91 | (deftest synonym-optimization
92 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle" "luwak1"]}]
93 | monitor-queries (phrases/dict-entries->monitor-queries dictionary {:tokenizer :standard})]
94 | (is (= 3 (count monitor-queries)))
95 | (let [highlighter-fn (phrases/highlighter dictionary {:type-name "TEST"})
96 | anns (highlighter-fn "this is a beagle text test luwak1")]
97 | (is (= 3 (count anns))))))
98 |
--------------------------------------------------------------------------------
/test/beagle/java_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.java-test
2 | (:require [clojure.test :refer [deftest is]]))
3 |
4 | (deftest simple-java-interface
5 | (let [de (doto (lt.tokenmill.beagle.phrases.DictionaryEntry. "test")
6 |              (.setSlop (Integer/valueOf 1)))
7 | annotator (lt.tokenmill.beagle.phrases.Annotator. [de] {})]
8 | (is (= "test" (first (map #(.text %) (.annotate annotator "test txt" {})))))))
9 |
10 | (deftest case-sensitivity
11 | (let [de (doto (lt.tokenmill.beagle.phrases.DictionaryEntry. "LYNDON BAINES JOHNSON")
12 | (.setCaseSensitive false))
13 | annotator (lt.tokenmill.beagle.phrases.Annotator. [de] {})]
14 | (is (= 1 (count (filter #(= "Lyndon Baines Johnson" (.text %)) (.annotate annotator "Lyndon Baines Johnson (/ˈlɪndən ˈbeɪnz/; August 27, 1908 – January 22, 1973), often referred to as LBJ, was an American politician who served as the 36th president of the United States from 1963 to 1969." {})))))))
15 |
--------------------------------------------------------------------------------
/test/beagle/lucene_alpha_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.lucene-alpha-test
2 | (:require [clojure.test :refer [deftest is]]
3 | [beagle.lucene-alpha :as lucene]))
4 |
5 | (deftest smoke
6 | (let [txt "some text this other that"
7 | dictionary [{:text "this AND that" :id "1" :slop 1}]
8 | annotator-fn (lucene/annotator dictionary)
9 | [ann1 :as anns] (annotator-fn txt {})
10 | anns2 (annotator-fn txt)]
11 | (is (= anns anns2))
12 | (is (= 1 (count anns)))
13 | (is (= "1" (:dict-entry-id ann1)))))
14 |
15 | (deftest ^:noisy smoke-2
16 | (let [txt "some text this AND"
17 | dictionary [{:text "this AND" :id "1" :slop 1}]
18 | annotator-fn (lucene/annotator dictionary)
19 | [ann1 :as anns] (annotator-fn txt)]
20 | (is (= 0 (count anns)))
21 | (is (nil? (:dict-entry-id ann1)))))
22 |
23 | (deftest smoke-3
24 | (let [txt "some number 1234 test"
25 | dictionary [{:text "/.*\\d*.*/" :id "1" :slop 1}]
26 | annotator-fn (lucene/annotator dictionary)
27 | anns (annotator-fn txt)]
28 | (is (< 0 (count anns)))))
29 |
--------------------------------------------------------------------------------
/test/beagle/optimization_suggestions_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.optimization-suggestions-test
2 | (:require [clojure.test :refer [deftest is testing]]
3 | [beagle.dictionary-optimizer :as optimizer]))
4 |
5 | (deftest optimization-suggestions
6 | (testing "Suggestions for similar dictionary items"
7 | (is (= [{:dictionary-items [{:id "1" :synonyms ["beagle"] :text "test"} {:id "2" :synonyms ["luwak1"] :text "test"}]
8 | :suggestion "Dictionary items '[1 2]' have identical `[text case-sensitivity ascii-folding] features."}]
9 | (optimizer/dry-run [{:text "test" :id "1" :synonyms ["beagle"]}
10 | {:text "test" :id "2" :synonyms ["luwak1"]}]))))
11 |
12 | (testing "Suggestions for two similar dictionary item groups"
13 | (is (= [{:suggestion "Dictionary items '[1 3]' have identical `[text case-sensitivity ascii-folding] features."
14 | :dictionary-items [{:id "1" :synonyms ["beagle"] :text "test"} {:id "3" :synonyms ["beagle"] :text "test"}]}
15 | {:suggestion "Dictionary items '[2 4]' have identical `[text case-sensitivity ascii-folding] features."
16 | :dictionary-items [{:id "2" :synonyms ["luwak2"] :text "test2"} {:id "4" :synonyms ["beagle3"] :text "test2"}]}]
17 | (optimizer/dry-run [{:id "1" :synonyms ["beagle"] :text "test"}
18 | {:id "2" :synonyms ["luwak2"] :text "test2"}
19 | {:id "3" :synonyms ["beagle"] :text "test"}
20 | {:id "4" :synonyms ["beagle3"] :text "test2"}]))))
21 |
22 | (testing "Suggestions for single dictionary item"
23 | (is (= [] (optimizer/dry-run [{:id "1" :synonyms ["beagle"] :text "test"}]))))
24 |
25 | (testing "Suggestions for distinct dictionary items"
26 | (is (= [] (optimizer/dry-run [{:id "1" :case-sensitive? true :synonyms ["beagle"] :text "test"}
27 | {:id "2" :synonyms ["beagle"] :text "test2"}
28 | {:id "3" :ascii-fold? false :synonyms ["beagle"] :text "test3"}]))))
29 |
30 | (testing "Suggestions for two similar dictionary item groups and one distinct dictionary item"
31 | (is (= [{:suggestion "Dictionary items '[test 3 4]' have identical `[text case-sensitivity ascii-folding] features."
32 | :dictionary-items [{:synonyms ["beagle"] :text "test"}
33 | {:id "3" :synonyms ["beagle"] :text "test"}
34 | {:id "4" :synonyms ["luwak222"] :text "test"}]}
35 | {:suggestion "Dictionary items '[2 test2]' have identical `[text case-sensitivity ascii-folding] features."
36 | :dictionary-items [{:id "2" :synonyms ["luwak2"] :text "test2"} {:synonyms ["beagle3"] :text "test2"}]}]
37 | (optimizer/dry-run [{:synonyms ["beagle"] :text "test"}
38 | {:id "2" :synonyms ["luwak2"] :text "test2"}
39 | {:id "3" :synonyms ["beagle"] :text "test"}
40 | {:id "4" :synonyms ["luwak222"] :text "test"}
41 | {:synonyms ["beagle3"] :text "test2"}
42 | {:synonyms ["beagle"] :text "test" :ascii-fold? true}])))))
43 |
--------------------------------------------------------------------------------
/test/beagle/phrases_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.phrases-test
2 | (:require [clojure.test :refer [deftest is testing]]
3 | [clojure.spec.alpha :as s]
4 | [clojure.spec.test.alpha :as stest]
5 | [beagle.phrases :as phrases]
6 | [beagle.schema :as schema]))
7 |
8 | (s/def ::opts (s/* (s/cat :opt keyword? :val any?)))
9 |
10 | (s/fdef phrases/highlighter
11 | :args (s/alt :unary (s/cat :dictionary ::schema/dictionary)
12 | :binary (s/cat :dictionary ::schema/dictionary :opts any?))
13 | :ret (s/fspec :args (s/alt :unary (s/cat :text string?)
14 | :binary (s/cat :text string? :opts any?))
15 | :ret ::schema/annotations))
16 |
17 | (stest/instrument `phrases/highlighter)
18 |
19 | (s/exercise-fn `phrases/highlighter)
20 |
21 | (def label "LABEL")
22 |
23 | (deftest dictionary-entry-record
24 | (let [dictionary [(schema/map->DictionaryEntry {:text "test"})]
25 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
26 | anns (highlighter-fn "before annotated test phrase after annotated")]
27 | (is (= 1 (count anns)))))
28 |
29 | (deftest type-per-dictionary-entry
30 | (let [dictionary [{:text "test phrase" :id "1" :meta {:test "test"} :type "CUSTOM"}]
31 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
32 | anns (highlighter-fn "before annotated test phrase after annotated")]
33 | (is (seq (s/conform ::schema/annotations anns)))
34 | (is (seq anns))
35 | (is (= "1" (-> anns first :dict-entry-id)))
36 | (is (= "CUSTOM" (-> anns first :type)))
37 | (is (= "test phrase" (-> anns first :text)))
38 | (is (nil? (-> anns first (get-in [:meta "_type"]))))))
39 |
40 | (deftest id
41 | (let [dictionary [{:text "test" :id "1" :meta {:test "test"}}]
42 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
43 | anns (highlighter-fn "before annotated test after annotated")]
44 | (is (seq anns))
45 | (is (= "1" (-> anns first :dict-entry-id)))
46 | (is (= "LABEL" (-> anns first :type)))))
47 |
48 | (deftest metadata-append
49 | (let [dictionary [{:text "test" :meta {"email" "test@example.com"}}]
50 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
51 | anns (highlighter-fn "before annotated test after annotated")]
52 | (is (seq anns))
53 | (is (= {"email" "test@example.com"} (-> anns first :meta)))))
54 |
55 | (deftest case-sensitivity
56 | (testing "case sensitive"
57 | (let [dictionary [{:text "test"}]
58 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
59 | anns (highlighter-fn "before annotated test after annotated")]
60 | (is (seq anns)))
61 | (let [dictionary [{:text "TeSt" :case-sensitive? true}]
62 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
63 | anns (highlighter-fn "before annotated test after annotated")]
64 | (is (empty? anns)))
65 | (let [label "LABEL"
66 | dictionary [{:text "test" :case-sensitive? true}]
67 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
68 | anns (highlighter-fn "before annotated Test after annotated")]
69 | (is (empty? anns))))
70 |
71 | (testing "case insensitive"
72 | (let [dictionary [{:text "TeSt" :case-sensitive? false}]
73 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
74 | anns (highlighter-fn "before annotated test after annotated")]
75 | (is (seq anns)))
76 | (let [dictionary [{:text "test" :case-sensitive? false}]
77 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
78 | anns (highlighter-fn "before annotated test after annotated")]
79 | (is (seq anns)))))
80 |
81 | (deftest ascii-folding-dictionary
82 | (let [dictionary [{:text "wörd"}]
83 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
84 | anns (highlighter-fn "before annotated wörd after annotated")]
85 | (is (seq anns)))
86 | (let [dictionary [{:text "wörd"}]
87 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
88 | anns (highlighter-fn "before annotated word after annotated")]
89 | (is (empty? anns)))
90 | (let [label "LABEL"
91 | dictionary [{:text "wörd" :ascii-fold? true}]
92 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
93 | anns (highlighter-fn "before annotated word after annotated")]
94 | (is (seq anns)))
95 | (let [dictionary [{:text "word" :ascii-fold? true}]
96 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
97 | anns (highlighter-fn "before annotated wörd after annotated")]
98 | (is (seq anns)))
99 | (let [label "LABEL"
100 | dictionary [{:text "word" :ascii-fold? false}]
101 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
102 | anns (highlighter-fn "before annotated wörd after annotated")]
103 | (is (empty? anns))))
104 |
105 | (deftest ascii-folding-with-case-sensitivity
106 | (let [label "TYPE"]
107 | (testing "case sensitive"
108 | (let [dictionary [{:text "schön" :ascii-fold? true}]
109 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
110 | anns (highlighter-fn "before annotated Schön after annotated")]
111 | (is (empty? anns)))
112 | (let [dictionary [{:text "Schön" :ascii-fold? true}]
113 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
114 | anns (highlighter-fn "before annotated Schon after annotated")]
115 | (is (seq anns)))
116 | (let [dictionary [{:text "schön" :ascii-fold? true}]
117 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
118 | anns (highlighter-fn "before annotated Schon after annotated")]
119 | (is (empty? anns))))
120 |
121 | (testing "case insensitive"
122 | (let [dictionary [{:text "schön" :ascii-fold? true :case-sensitive? false}]
123 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
124 | anns (highlighter-fn "before annotated Schon after annotated")]
125 | (is (seq anns))))
126 | (let [dictionary [{:text "schön" :ascii-fold? true :case-sensitive? false}]
127 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
128 | anns (highlighter-fn "before annotated schon after annotated")]
129 | (is (seq anns)))
130 | (let [dictionary [{:text "schon" :ascii-fold? true :case-sensitive? false}]
131 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
132 | anns (highlighter-fn "before annotated schön after annotated")]
133 | (is (seq anns)))
134 |
135 | (testing "false ascii fold"
136 | (let [dictionary [{:text "schon" :ascii-fold? false}]
137 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
138 | anns (highlighter-fn "before annotated schön after annotated")]
139 | (is (empty? anns))))))
140 |
141 | (deftest synonyms
142 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle"]}]
143 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
144 | anns (highlighter-fn "before annotated beagle after annotated")]
145 | (is (= 1 (count anns)))
146 | (is (= "1" (-> anns first :dict-entry-id)))
147 | (is (= "beagle" (-> anns first :text))))
148 |
149 | (let [dictionary [{:text "test" :id "1" :synonyms ["Luwak"] :case-sensitive? true}]
150 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
151 | anns (highlighter-fn "before annotated beagle after annotated")]
152 | (is (empty? anns)))
153 |
154 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle"] :case-sensitive? false}]
155 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
156 | anns (highlighter-fn "before annotated beagle after annotated")]
157 | (is (= 1 (count anns)))
158 | (is (= "1" (-> anns first :dict-entry-id)))
159 | (is (= "beagle" (-> anns first :text))))
160 |
161 | (testing "synonyms with false ascii fold"
162 | (let [dictionary [{:text "test" :synonyms ["schön"] :ascii-fold? false}]
163 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
164 | anns (highlighter-fn "before annotated schon after annotated")]
165 | (is (empty? anns)))
166 | (let [dictionary [{:text "test" :synonyms ["schön"] :ascii-fold? true}]
167 | highlighter-fn (phrases/highlighter dictionary {:type-name label})
168 | anns (highlighter-fn "before annotated schon after annotated")]
169 | (is (seq anns))
170 | (is (= "schon" (-> anns first :text))))))
171 |
172 | (deftest phrase-end-sentence
173 | (let [dictionary [{:text "test-test"}]
174 | highlighter-fn (phrases/highlighter dictionary)
175 | anns (highlighter-fn "before annotated test-test.")]
176 | (is (seq anns))
177 | (is (= "test-test" (:text (first anns))))))
178 |
179 | (deftest phrase-in-quotes
180 | (let [dictionary [{:text "test-test" :case-sensitive? false}]
181 | highlighter-fn (phrases/highlighter dictionary)
182 | anns (highlighter-fn "before annotated \"TEST-test\".")]
183 | (is (seq anns))
184 | (is (= "TEST-test" (:text (first anns))))))
185 |
186 | (deftest phrase-in-quotes-should-not-match
187 | (let [dictionary [{:text "test-test" :case-sensitive? false}]
188 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace})
189 | anns (highlighter-fn "before annotated \"TEST-test\".")]
190 | (is (empty? anns))))
191 |
192 | (deftest overlapping-phrases
193 | (let [dictionary [{:text "test phrase test" :case-sensitive? false}]
194 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace})
195 | anns (highlighter-fn "start test phrase test phrase test end")]
196 | (is (= 2 (count anns)))))
197 |
198 | (deftest lt-stemming
199 | (let [dictionary [{:text "Kaunas" :id "1" :stem? true :stemmer :lithuanian}]
200 | highlighter-fn (phrases/highlighter dictionary)
201 | anns (highlighter-fn "Kauno miestas")]
202 | (is (seq anns))
203 | (is (= "Kauno" (-> anns first :text))))
204 | (let [dictionary [{:text "Kaunas Vilnius" :id "1" :stem? true}]
205 | highlighter-fn (phrases/highlighter dictionary)
206 | anns (highlighter-fn "Kaunas, Vilnius")]
207 | (is (seq anns))
208 | (is (= "Kaunas, Vilnius" (-> anns first :text))))
209 | (let [dictionary [{:text "Kaunas" :id "1" :case-sensitive? false :stem? true :stemmer :lithuanian}]
210 | highlighter-fn (phrases/highlighter dictionary)
211 | anns (highlighter-fn "kauno miestas")]
212 | (is (seq anns))
213 | (is (= "kauno" (-> anns first :text)))))
214 |
215 | (deftest en-stemming
216 | (let [txt "who let the dogs out?"]
217 | (let [dictionary [{:text "dog" :id "1"}]
218 | highlighter-fn (phrases/highlighter dictionary)
219 | anns (highlighter-fn txt)]
220 | (is (empty? anns)))
221 | (let [dictionary [{:text "dog" :id "1" :stem? true}]
222 | highlighter-fn (phrases/highlighter dictionary)
223 | anns (highlighter-fn txt)]
224 | (is (seq anns))
225 | (is (= "dogs" (-> anns first :text))))
226 | (let [dictionary [{:text "dog" :id "1" :stem? true :stemmer :english}]
227 | highlighter-fn (phrases/highlighter dictionary)
228 | anns (highlighter-fn txt)]
229 | (is (seq anns))
230 | (is (= "dogs" (-> anns first :text))))
231 | (let [dictionary [{:text "dog" :id "1" :stem? true :stemmer :estonian}]
232 | highlighter-fn (phrases/highlighter dictionary)
233 | anns (highlighter-fn txt)]
234 | (is (empty? anns)))))
235 |
236 | (deftest mixed-stemmers
237 | (let [txt "Saboniai plays basketball"
238 | dictionary [{:text "Sabonis" :id "1" :stem? true :stemmer :lithuanian}
239 | {:text "play" :id "2" :stem? true :stemmer :english}]
240 | highlighter-fn (phrases/highlighter dictionary)
241 | anns (highlighter-fn txt)]
242 | (is (= 2 (count anns)))))
243 |
244 | (deftest phrase-slop
245 | (let [txt "before start and end after"
246 | dictionary [{:text "start end" :id "1" :slop 1}]
247 | highlighter-fn (phrases/highlighter dictionary)
248 | anns (highlighter-fn txt)]
249 | (is (= 1 (count anns)))
250 | (is (= "start and end" (:text (first anns)))))
251 | (testing "all terms in the phrase should match"
252 | (let [txt "before start end after"
253 | dictionary [{:text "start NOPE end" :id "1" :slop 10}]
254 | highlighter-fn (phrases/highlighter dictionary)
255 | anns (highlighter-fn txt)]
256 | (is (empty? anns))))
257 | (let [txt "before start phrase and end phrase after"
258 | dictionary [{:text "start phrase end phrase" :id "1" :slop 1}]
259 | highlighter-fn (phrases/highlighter dictionary)
260 | anns (highlighter-fn txt)]
261 | (is (= 1 (count anns)))
262 | (is (= "start phrase and end phrase" (:text (first anns)))))
263 | (testing "phrase edit distance"
264 | (let [txt "before start end after"
265 | dictionary [{:text "end start" :id "1" :slop 0}]
266 | highlighter-fn (phrases/highlighter dictionary)
267 | anns (highlighter-fn txt)]
268 | (is (empty? anns)))
269 | (let [txt "before start end after"
270 | dictionary [{:text "end start" :id "1" :slop 2}]
271 | highlighter-fn (phrases/highlighter dictionary)
272 | anns (highlighter-fn txt)]
273 | (is (= 1 (count anns)))
274 | (is (= "start end" (:text (first anns))))))
275 | (testing "all terms should match despite the slop"
276 | (let [txt "before start end after"
277 | dictionary [{:text "end start foo" :id "1" :slop 100}]
278 | highlighter-fn (phrases/highlighter dictionary)
279 | anns (highlighter-fn txt)]
280 | (is (empty? anns)))))
281 |
282 | (deftest dictionary-corner-cases
283 | (let [txt "Some text to test ."
284 | dictionary [{:text "."} {:text "text"}]
285 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace})
286 | anns (highlighter-fn txt)]
287 | (is (= 2 (count anns))))
288 | (let [txt "Some text to test."
289 | dictionary [{:text ""} {:text "text"}]
290 | highlighter-fn (phrases/highlighter dictionary)
291 | anns (highlighter-fn txt)]
292 | (is (seq anns))))
293 |
294 | (deftest ^:noisy noisy-tests-for-corner-cases
295 | (let [txt "Some text to test."
296 | dictionary [{:text "."} {:text "text"}]
297 | highlighter-fn (phrases/highlighter dictionary)
298 | anns (highlighter-fn txt)]
299 | (is (seq anns))
300 | (is (= 1 (count anns))))
301 | (let [txt " ` `"
302 | dictionary [{:text "test" :id "1"}]
303 | highlighter-fn (phrases/highlighter dictionary)
304 | anns (highlighter-fn txt)]
305 | (is (coll? anns))
306 | (is (empty? anns)))
307 | (testing "slop versions"
308 | (stest/unstrument `phrases/highlighter)
309 | (testing "nil slop"
310 | (let [txt "before start end after"
311 | dictionary [{:text "end start foo" :id "1" :slop nil}]
312 | highlighter-fn (phrases/highlighter dictionary)
313 | anns (highlighter-fn txt)]
314 | (is (empty? anns))))
315 | (testing "very big slop"
316 | (let [txt "before start end after"
317 | dictionary [{:text "end start foo" :id "1" :slop 1000000000000}]
318 | highlighter-fn (phrases/highlighter dictionary)
319 | anns (highlighter-fn txt)]
320 | (is (empty? anns))))
321 | (testing "slop with negative value"
322 | (let [txt "before start end after"
323 | dictionary [{:text "end start foo" :id "1" :slop -1}]
324 | highlighter-fn (phrases/highlighter dictionary)
325 | anns (highlighter-fn txt)]
326 | (is (empty? anns))))
327 | (stest/instrument `phrases/highlighter)))
328 |
329 | (deftest tokenizer-conf
330 | (let [txt "URGENT! Do this immediately!"
331 | dictionary [{:text "URGENT" :id "a" :tokenizer :whitespace}
332 | {:text "URGENT" :id "b" :tokenizer :standard}]
333 | highlighter-fn (phrases/highlighter dictionary)
334 | anns (highlighter-fn txt)]
335 | (is (= 1 (count anns)))
336 | (is (= "b" (:dict-entry-id (first anns)))))
337 | (let [txt "[URGENT!] Do this immediately!"
338 | dictionary [{:text "[URGENT!]" :id "a" :tokenizer :whitespace}
339 | {:text "[URGENT!]" :id "b" :tokenizer :standard}]
340 | highlighter-fn (phrases/highlighter dictionary)
341 | anns (highlighter-fn txt)]
342 | (is (= 2 (count anns)))
343 | (is (= "[URGENT!]" (:text (first (filter #(= "a" (:dict-entry-id %)) anns)))))
344 | (is (= "URGENT" (:text (first (filter #(= "b" (:dict-entry-id %)) anns)))))))
345 |
346 | (deftest phrase-ordering-basic-case
347 | (is (= 1 (count ((phrases/highlighter [{:text "Token Mill" :slop 2 :in-order? false}])
348 | "Mill Token"))))
349 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill" :slop 2 :in-order? true}])
350 | "Mill Token")))))
351 |
352 | (deftest highlighter-opts-for-slop-with-order
353 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill"}]
354 | {})
355 | "Mill Token"))))
356 | (is (= 1 (count ((phrases/highlighter [{:text "Token Mill"}]
357 | {:slop 2})
358 | "Mill Token"))))
359 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill"}]
360 | {:slop 2 :in-order? true})
361 | "Mill Token")))))
362 |
363 | (deftest ordered-phrase-with-on-term
364 | (is (= 1 (count ((phrases/highlighter [{:text "phrase" :slop 2 :in-order? true}])
365 | "prefix phrase suffix")))))
366 |
367 | (deftest ordered-phrase-with-two-equal-terms-in-front-and-end
368 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase" :slop 2 :in-order? true}])
369 | "prefix phrase phrase suffix")]
370 | (is (= 1 (count anns)))
371 | (is (= "phrase phrase" (:text ann)))
372 | (is (= 7 (:begin-offset ann)))
373 | (is (= 20 (:end-offset ann))))
374 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase and phrase" :slop 2 :in-order? true}])
375 | "prefix phrase and phrase suffix")]
376 | (is (= 1 (count anns)))
377 | (is (= "phrase and phrase" (:text ann)))
378 | (is (= 7 (:begin-offset ann)))
379 | (is (= 24 (:end-offset ann)))))
380 |
381 | (deftest ordered-ambigous-phrase
382 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? true}])
383 | "prefix phrase phrase end suffix")]
384 | (is (= 1 (count anns)))
385 | (is (= "phrase phrase end" (:text ann)))
386 | (is (= 7 (:begin-offset ann)))
387 | (is (= 24 (:end-offset ann))))
388 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? true}])
389 | "prefix phrase phrase end end suffix")]
390 | (is (= 1 (count anns)))
391 | (is (= "phrase phrase end" (:text ann)))
392 | (is (= 7 (:begin-offset ann)))
393 | (is (= 24 (:end-offset ann))))
394 | (let [[ann1 & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 1 :in-order? true}])
395 | "prefix phrase phrase a phrase end suffix")]
396 | (is (= 1 (count anns)))
397 | (is (= "phrase a phrase end" (:text ann1)))
398 | (is (= 14 (:begin-offset ann1)))
399 | (is (= 33 (:end-offset ann1))))
400 |
401 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase end end" :slop 1 :in-order? true}])
402 | "prefix phrase phrase end end suffix")]
403 | (is (= 1 (count anns)))
404 | (is (= "phrase phrase end end" (:text ann)))
405 | (is (= 7 (:begin-offset ann)))
406 | (is (= 28 (:end-offset ann))))
407 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase end end" :slop 1 :in-order? true}])
408 | "prefix phrase phrase end end X X phrase phrase end end suffix")]
409 | (is (= 2 (count anns)))
410 | (is (= "phrase phrase end end" (:text ann)))
411 | (is (= 7 (:begin-offset ann)))
412 | (is (= 28 (:end-offset ann)))))
413 |
414 | (deftest complicated-ordering
415 | (let [[ann1 ann2 & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? true}])
416 | "prefix phrase phrase end phrase end suffix")]
417 | (is (= 2 (count anns)))
418 | (is (= "phrase phrase end" (:text ann1)))
419 | (is (= 7 (:begin-offset ann1)))
420 | (is (= 24 (:end-offset ann1)))
421 | ;; FIXME: this highlight is not correct
422 | (is (= "phrase end" (:text ann2)))
423 | (is (= 25 (:begin-offset ann2)))
424 | (is (= 35 (:end-offset ann2)))))
425 |
426 | (deftest preserve-order-edge-cases
427 | (testing "multiple match of a phrase"
428 | (is (= 3 (count ((phrases/highlighter
429 | [{:text "Token Mill" :slop 3 :in-order? false}])
430 | "Prefix Token Mill Infix Token a Mill Suffix"))))
431 | (is (= 2 (count ((phrases/highlighter
432 | [{:text "Token Mill" :slop 1 :in-order? true}])
433 | "Prefix Token Mill Infix Token a Mill Suffix"))))
434 | (is (= 1 (count ((phrases/highlighter
435 | [{:text "Token Mill" :slop 0 :in-order? true}])
436 | "Prefix Token Mill Infix Token a Mill Suffix"))))
437 | (let [highlights ((phrases/highlighter
438 | [{:text "Token Mill" :slop 1 :in-order? true :meta {:test "test"}}])
439 | "Prefix Token Mill Infix Token a Mill Suffix")]
440 | (is (= 2 (count highlights)))
441 | (let [first-highlight (apply min-key :begin-offset highlights)]
442 | (is (= "Token Mill" (:text first-highlight)))
443 | (is (= 7 (:begin-offset first-highlight)))
444 | (is (= 17 (:end-offset first-highlight)))
445 | (is (= {"test" "test"} (:meta first-highlight)))
446 | (is (= "PHRASE" (:type first-highlight))))
447 | (let [second-highlight (apply max-key :begin-offset highlights)]
448 | (is (= "Token a Mill" (:text second-highlight)))
449 | (is (= 24 (:begin-offset second-highlight)))
450 | (is (= 36 (:end-offset second-highlight)))
451 | (is (= {"test" "test"} (:meta second-highlight)))
452 | (is (= "PHRASE" (:type second-highlight)))))))
453 |
454 | (deftest annotator-options
455 | (testing "case sensitivity flag"
456 | (let [txt "prefix PHRASE suffix"
457 | dictionary [{:text "phrase"}]
458 | highlighter-fn (phrases/highlighter dictionary)
459 | anns (highlighter-fn txt)]
460 | (is (empty? anns)))
461 | (let [txt "prefix PHRASE suffix"
462 | dictionary [{:text "phrase"}]
463 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false})
464 | anns (highlighter-fn txt)]
465 | (is (= 1 (count anns)))))
466 |
467 | (testing "ascii folding flag"
468 | (let [txt "prefix PHRÄSE suffix"
469 | dictionary [{:text "phrase"}]
470 | highlighter-fn (phrases/highlighter dictionary)
471 | anns (highlighter-fn txt)]
472 | (is (empty? anns)))
473 | (let [txt "prefix PHRÄSE suffix"
474 | dictionary [{:text "phrase"}]
475 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false
476 | :ascii-fold? true})
477 | anns (highlighter-fn txt)]
478 | (is (= 1 (count anns)))))
479 |
480 | (testing "stemming options"
481 | (let [txt "prefix PHRASES suffix"
482 | dictionary [{:text "phrase"}]
483 | highlighter-fn (phrases/highlighter dictionary)
484 | anns (highlighter-fn txt)]
485 | (is (empty? anns)))
486 | (let [txt "prefix PHRASES suffix"
487 | dictionary [{:text "phrase"}]
488 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false
489 | :stem? true
490 | :stemmer :english})
491 | anns (highlighter-fn txt)]
492 | (is (= 1 (count anns))))))
493 |
494 | (deftest phrases-with-edit-distance
495 | (let [txt "prefix tokne mill suffix"
496 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1}]
497 | highlighter-fn (phrases/highlighter dictionary {})
498 | [ann1 :as anns] (highlighter-fn txt)]
499 | (is (= 1 (count anns)))
500 | (is (= "tokne mill" (:text ann1))))
501 | (let [txt "prefix mill tokne suffix"
502 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1}]
503 | highlighter-fn (phrases/highlighter dictionary {})
504 | anns (highlighter-fn txt)]
505 | (is (empty? anns)))
506 | (let [txt "prefix tokne mill suffix"
507 | dictionary [{:text "mill token" :fuzzy? true :fuzziness 1 :in-order? true}]
508 | highlighter-fn (phrases/highlighter dictionary {})
509 | anns (highlighter-fn txt)]
510 | (is (empty? anns)))
511 | (let [txt "prefix mill tokne suffix"
512 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1 :in-order? false}]
513 | highlighter-fn (phrases/highlighter dictionary {})
514 | [ann1 :as anns] (highlighter-fn txt)]
515 | (is (= 1 (count anns)))
516 | (is (= "mill tokne" (:text ann1))))
517 | (let [txt "prefix tokne uab mill suffix"
518 | dictionary [{:text "mill token" :fuzzy? true :fuzziness 1 :in-order? false}]
519 | highlighter-fn (phrases/highlighter dictionary {})
520 | anns (highlighter-fn txt)]
521 | (is (empty? anns))))
522 |
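Taken together, the tests above pin down the highlighter contract: phrases/highlighter
compiles a dictionary into a reusable function, :slop bounds the number of intervening
tokens, :in-order? pins token order, and :fuzzy?/:fuzziness tolerate edit-distance typos.
A minimal REPL sketch of the same calls (names exactly as in the tests; the returned
annotation maps have the :text/:begin-offset/:end-offset/:type/:meta shape asserted on
above):

    (require '[beagle.phrases :as phrases])

    ;; Compile the dictionary once, reuse the function across documents.
    (def highlight
      (phrases/highlighter [{:text "Token Mill" :slop 1 :in-order? true}]))

    ;; Two matches, as in preserve-order-edge-cases:
    ;; ({:text "Token Mill"   :begin-offset 7  :end-offset 17 :type "PHRASE" ...}
    ;;  {:text "Token a Mill" :begin-offset 24 :end-offset 36 :type "PHRASE" ...})
    (highlight "Prefix Token Mill Infix Token a Mill Suffix")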
--------------------------------------------------------------------------------
/test/beagle/readers_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.readers-test
2 | (:require [clojure.test :refer [deftest is]]
3 | [clojure.spec.alpha :as s]
4 | [beagle.schema :as sch]
5 | [beagle.readers :as readers])
6 | (:import (java.io ByteArrayInputStream)))
7 |
8 | (deftest json-reader
9 |   (is (s/valid? ::sch/dictionary
10 |                 (readers/read-json
11 |                   (ByteArrayInputStream.
12 |                     (.getBytes "[{\"text\": \"moo\"}]")))))
13 |   (is (s/valid? ::sch/dictionary
14 |                 (readers/read-json "test/resources/dict.json"))))
15 |
16 | (deftest csv-file-reader
17 |   (is (s/valid? ::sch/dictionary (readers/read-csv "test/resources/dict.csv"))))
18 |
19 | (deftest edn-file-reader
20 |   (is (s/valid? ::sch/dictionary (readers/read-edn "test/resources/dict.edn"))))
21 |
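read-json accepts both an input stream and a file path (both are exercised above), while
read-csv and read-edn take paths. A usage sketch feeding a reader's output straight into
the highlighter, which the shared ::sch/dictionary schema suggests is the intended flow:

    (require '[beagle.readers :as readers]
             '[beagle.phrases :as phrases])

    ;; Load a dictionary fixture and build a highlighter from it.
    (def highlight
      (phrases/highlighter (readers/read-edn "test/resources/dict.edn")))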
--------------------------------------------------------------------------------
/test/beagle/text_analysis_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.text-analysis-test
2 | (:require [clojure.test :refer [deftest is]]
3 | [beagle.text-analysis :as text-analysis]))
4 |
5 | (deftest field-name-construction
6 | (is (= "text.standard-tokenizer"
7 | (text-analysis/get-field-name {} {})))
8 | (is (= "text.standard-tokenizer"
9 | (text-analysis/get-field-name {:case-sensitive? true} {})))
10 | (is (= "text.standard-tokenizer.lowercased"
11 | (text-analysis/get-field-name {:case-sensitive? false} {})))
12 | (is (= "text.standard-tokenizer.ascii-folded"
13 | (text-analysis/get-field-name {:ascii-fold? true} {})))
14 | (is (= "text.standard-tokenizer.stemmed-english"
15 | (text-analysis/get-field-name {:stem? true} {})))
16 | (is (= "text.standard-tokenizer.stemmed-lithuanian"
17 | (text-analysis/get-field-name {:stem? true :stemmer :lithuanian} {})))
18 | (is (= "text.standard-tokenizer.ascii-folded-lowercased-stemmed-lithuanian"
19 | (text-analysis/get-field-name {:ascii-fold? true
20 | :case-sensitive? false
21 | :stem? true
22 | :stemmer :lithuanian} {}))))
23 |
24 | (deftest token-stream
25 | (let [txt "These are tests."]
26 | (is (= ["These" "are" "tests"]
27 | (text-analysis/text->token-strings
28 | txt (text-analysis/get-string-analyzer {:case-sensitive? true} {}))))
29 | (is (= ["these" "are" "tests"]
30 | (text-analysis/text->token-strings
31 | txt (text-analysis/get-string-analyzer {:case-sensitive? false} {}))))
32 | (is (= ["these" "are" "tests"]
33 | (text-analysis/text->token-strings
34 | txt (text-analysis/get-string-analyzer {:case-sensitive? false
35 | :ascii-fold? true} {}))))
36 | (is (= ["these" "are" "test"]
37 | (text-analysis/text->token-strings
38 | txt (text-analysis/get-string-analyzer {:case-sensitive? false
39 | :ascii-fold? true
40 | :stem? true} {}))))
41 |     ; surprising but correct: the whitespace tokenizer keeps the trailing period, so "tests." is never stemmed
42 | (is (= ["these" "are" "tests."]
43 | (text-analysis/text->token-strings
44 | txt (text-analysis/get-string-analyzer {:case-sensitive? false
45 | :ascii-fold? true
46 | :stem? true} {:tokenizer :whitespace}))))))
47 |
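With :case-sensitive? false and :ascii-fold? true the analyzer lowercases tokens and strips
diacritics, which is what lets PHRÄSE match the dictionary entry "phrase" in the
annotator-options test. A small illustration with the same option map (the expected output
follows from that test):

    (require '[beagle.text-analysis :as text-analysis])

    ;; Folding maps the umlaut to a plain A and lowercasing does the rest,
    ;; so the token lines up with the dictionary entry.
    (text-analysis/text->token-strings
      "prefix PHRÄSE suffix"
      (text-analysis/get-string-analyzer {:case-sensitive? false
                                          :ascii-fold? true} {}))
    ;; => ["prefix" "phrase" "suffix"]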
--------------------------------------------------------------------------------
/test/beagle/validator_test.clj:
--------------------------------------------------------------------------------
1 | (ns beagle.validator-test
2 | (:require [clojure.test :refer [deftest is]]
3 | [beagle.validator :as validator]))
4 |
5 | (deftest basic-cases
6 | (is (seq (validator/valid-dictionary? [{:text "test" :id "1" :meta {:test "test"} :type "CUSTOM"}])))
7 | (is (nil? (validator/valid-dictionary? [{:id "1" :meta {:test "test"} :type "CUSTOM"}]))))
8 |
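As exercised above, valid-dictionary? returns a truthy (seq-able) value for a well-formed
dictionary and nil when a required key such as :text is missing, so it can gate dictionary
loading directly. A sketch (that :text alone suffices is an assumption, suggested by the
bare text-only rows in dict.csv below):

    (require '[beagle.validator :as validator])

    ;; Truthy when valid, nil when :text is absent.
    (when (validator/valid-dictionary? [{:text "test"}])
      (println "dictionary accepted"))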
--------------------------------------------------------------------------------
/test/resources/dict.csv:
--------------------------------------------------------------------------------
1 | text,id,synonyms,meta,case-sensitive?,ascii-fold?,type
2 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,true,TEST
3 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,true
4 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,false
5 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,FALSE
6 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,NOT_BOOL
7 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true
8 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2
9 | test-dictionary-item,id1,syn1;syn2,k;v
10 | test-dictionary-item,id1,syn1;syn2,k
11 | test-dictionary-item,id1,syn1;syn2,
12 | test-dictionary-item,id1,syn1;syn2
13 | test-dictionary-item,id1,syn1;
14 | test-dictionary-item,id1,syn1
15 | test-dictionary-item,id1,
16 | test-dictionary-item,id1
17 | test-dictionary-item
18 | test-dictionary-item,,,,,,TEST
19 |
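For orientation, row 2 of this fixture corresponds to a dictionary entry roughly like the
sketch below: synonyms are ;-delimited and meta is read as alternating key;value pairs.
This is a reading of the fixture, not an asserted reader output; the remaining rows
deliberately degrade toward malformed input (odd meta arity, non-boolean flags, missing
columns) to exercise reader robustness.

    {:text            "test-dictionary-item"
     :id              "id1"
     :synonyms        ["syn1" "syn2"]
     :meta            {"k1" "v1", "k2" "v2"}
     :case-sensitive? true
     :ascii-fold?     true
     :type            "TEST"}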
--------------------------------------------------------------------------------
/test/resources/dict.edn:
--------------------------------------------------------------------------------
1 | [{:text "test text"
2 | :id "test-id"
3 | :case-sensitive? true
4 | :ascii-fold? true
5 | :meta {:email "test@example.com"}}]
6 |
--------------------------------------------------------------------------------
/test/resources/dict.json:
--------------------------------------------------------------------------------
1 | [{"text": "test text",
2 | "id": "test-id",
3 | "case-sensitive?": true,
4 | "ascii-fold?": true,
5 | "meta": {"email": "test@example.com"}}]
--------------------------------------------------------------------------------
/test/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <!-- reconstructed: standard console appender around the original pattern -->
2 | <configuration>
3 |     <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
4 |         <encoder>
5 |             <pattern>%d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n</pattern>
6 |         </encoder>
7 |     </appender>
8 |     <root level="info">
9 |         <appender-ref ref="STDOUT"/>
10 |     </root>
11 | </configuration>
--------------------------------------------------------------------------------