├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── tokenmill-logo.svg ├── .gitignore ├── .gitlab-ci.yml ├── CHANGELOG ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── charts ├── mt-avg-per-doc.png ├── mt-min-max-per-doc.png ├── mt-throughput-per-sec.png ├── mt-total.png ├── st-avg-per-doc.png ├── st-min-max-per-doc.png └── st-throughput-per-sec.png ├── classes └── lt │ └── tokenmill │ └── beagle │ └── phrases │ ├── Annotation.class │ ├── Annotator.class │ └── DictionaryEntry.class ├── deps.edn ├── pom.xml ├── src └── beagle │ ├── annotation_merger.clj │ ├── dictionary_optimizer.clj │ ├── java │ ├── annotation.clj │ └── java.clj │ ├── lucene_alpha.clj │ ├── monitor.clj │ ├── phrases.clj │ ├── readers.clj │ ├── schema.clj │ ├── text_analysis.clj │ └── validator.clj └── test ├── beagle ├── annotation_merge_test.clj ├── corner_case_phrases_test.clj ├── dictionary_optimization_test.clj ├── java_test.clj ├── lucene_alpha_test.clj ├── optimization_suggestions_test.clj ├── phrases_test.clj ├── readers_test.clj ├── text_analysis_test.clj └── validator_test.clj └── resources ├── dict.csv ├── dict.edn ├── dict.json ├── logback.xml └── phrases.html /.gitattributes: -------------------------------------------------------------------------------- 1 | test/resources/phrases.html linguist-vendored=false 2 | test/resources/phrases.html linguist-detectable=false -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 
17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.github/tokenmill-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pom.xml.asc 2 | *.jar 3 | *.class 4 | /lib/ 5 | /classes/ 6 | /target/ 7 | /checkouts/ 8 | .lein-deps-sum 9 | .lein-repl-history 10 | .lein-plugins/ 11 | .lein-failures 12 | .nrepl-port 13 | .cpcache/ 14 | target/* 15 | .idea 16 | *.iml 17 | .env 18 | *.json 19 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | stages: 2 | - test 3 | 4 | variables: 5 | GIT_DEPTH: 3 6 | 7 | cache: 8 | key: one-key-to-rule-them-all 9 | paths: 10 | - ./.m2/repository 11 | - ./.gitlibs 12 | 13 | lint: 14 | stage: test 15 | image: borkdude/clj-kondo 16 | cache: {} 17 | when: always 18 | script: 19 | - clj-kondo --lint src test --config '{:output {:exclude-files ["java"]}}' 20 | 21 | unit-test: 22 | stage: test 23 | when: always 24 | image: clojure:tools-deps-alpine 25 | script: 26 | - export GITLIBS=".gitlibs/" 27 | - clojure -Sdeps '{:mvn/local-repo "./.m2/repository"}' -A:test 28 | 29 | validate-sample-dictionaries: 30 | stage: test 31 | when: always 32 | image: clojure:tools-deps-alpine 33 | script: 34 | - export GITLIBS=".gitlibs/" 35 | - > 36 | clojure -Sdeps '{:mvn/local-repo "./.m2/repository"}' -m beagle.validator 37 | test/resources/dict.csv csv 38 | test/resources/dict.json json 39 | test/resources/dict.edn edn 40 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will 
be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/). 3 | 4 | ## 0.9.0-SNAPSHOT - 2019-10-13 5 | ### Added 6 | - Ensuring ordering for phrases with slop 7 | 8 | ## 0.4.0-SNAPSHOT - 2019-10-12 9 | ### Added 10 | - Tokenizer can be specified for every dictionary entry 11 | - Java Interface accepts tokenizer string 12 | - Highlighter options support for text analysis options 13 | ### Changed 14 | - Use MultiPhraseQuery instead of PhraseQuery internally 15 | 16 | ## 0.3.1 - 2019-10-03 17 | ### Fixed 18 | - Java interface for phrase highlighting 19 | 20 | ## 0.3.0 - 2019-09-24 21 | ### Added 22 | - Performance optimizations 23 | ### Changed 24 | - Refactored code towards batch document highlighting 25 | 26 | ## 0.2.0 - 2019-09-24 27 | ### Added 28 | - Alpha version for Lucene query support 29 | 30 | ## 0.1.7 - 2019-09-20 31 | ### Added 32 | - Deployment to Maven Central 33 | 34 | ## 0.1.6 - 2019-09-19 35 | ### Added 36 | - Added Java interface 37 | ### Fixed 38 | - Concurrent usage 39 | 40 | ## 0.1.5 - 2019-09-16 41 | ### Fixed 42 | - Handling of cases when text or phrases are tokenized to 0 tokens 43 | 44 | ## 0.1.4 - 2019-09-10 45 | ### Added 46 | - Phrase slop support 47 | 48 | ## 0.1.3 - 2019-09-04 49 | ### Added 50 | - Use one Lucene Monitor in total 51 | 52 | ## 0.1.2 - 2019-09-03 53 | ### Added 54 | - Support for stemming for multiple languages 55 | 56 | ## 0.1.1 - 2019-08-26 57 | ### Added 58 | - Initial release 59 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 
body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. 
Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at info@tokenmill.lt. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | First off, thanks for taking the time to contribute! 2 | 3 | The following is a set of guidelines for contributing to Beagle which is hosted at https://github.com/tokenmill/beagle. These are just guidelines, not rules, use your best judgment and feel free to propose changes to this document in a pull request. 
4 | 5 | This project adheres to the Contributor Covenant code of conduct. By participating, you are expected to uphold this code. Please report unacceptable behavior to info@tokenmill.lt. 6 | Issues & Pull requests 7 | 8 | Issues and Pull requests welcome! 9 | 10 | We do ask that before submitting a pull request you open an issue tracking the bug or enhancement you'd like to fix or submit. This makes it easier to discuss changes in the abstract, before focusing on a particular solution. 11 | 12 | Furthermore, please be diligent about submitting pull requests which only make one essential change at a time. While formatting changes and code cleanups are welcome, they should be separate from features and a pull request should only introduce one logical feature at a time. When adding new features, please ensure there are accompanying tests. 13 | 14 | Commit Messages 15 | 16 | Commit messages should be well formed, according to the guidelines outlined by Tim Pope: http://karma-runner.github.io/4.0/dev/git-commit-msg.html 17 | 18 | When fixing an existing issue, add - fixes #xxx somewhere in the commit message: this has the dual purpose of closing the issue when your patch is merged to master as well as automatically providing a link to the related issue. 19 | 20 | Change Log 21 | 22 | Pull requests are required to update the changelog. Changelog entries should mention and link to any issues or tickets involved in the change, and should provide a short summary description of the particular changes of the patch. 23 | 24 | Include the issue number (#xxx) which will link back to the originating issue in Github. Commentary on the change should appear as a nested, unordered list. 25 | 26 | Whitespace & Linting 27 | 28 | Beagle is maintained with fairly strict whitespace and style standards. 29 | 30 | Gitlab CI jobs will fail if the clj-kondo rules are violated, or the source format doesn't match the default cljfmt style guidelines. 
Hence, patches must be formatted and whitespace linted before they will be accepted. 31 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM oracle/graalvm-ce:19.2.0.1 as builder 2 | RUN gu install native-image 3 | 4 | ENV GRAALVM_HOME=$JAVA_HOME 5 | 6 | RUN curl -O https://download.clojure.org/install/linux-install-1.10.1.469.sh 7 | RUN chmod +x linux-install-1.10.1.469.sh 8 | RUN ./linux-install-1.10.1.469.sh 9 | 10 | RUN mkdir -p /usr/src/app 11 | WORKDIR /usr/src/app 12 | 13 | COPY deps.edn /usr/src/app/ 14 | RUN clojure -R:native-image 15 | COPY . /usr/src/app 16 | 17 | RUN clojure -A:native-image 18 | 19 | RUN chmod 755 dictionary-validator 20 | 21 | FROM alpine:3.9.4 as validator 22 | 23 | WORKDIR /opt 24 | COPY --from=builder /usr/src/app/dictionary-validator /usr/local/bin/dictionary-validator 25 | 26 | CMD ["dictionary-validator"] 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Tokenmill, UAB 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | lint-code: 2 | clojure -M:clj-kondo --config '{:output {:exclude-files ["java"]}}' 3 | 4 | unit-test: 5 | clojure -M:runner:test -e :noisy 6 | 7 | build-dictionary-validator: 8 | docker build --target builder -f Dockerfile -t tokenmill/beagle-dictionary-validator . 9 | docker rm build || true 10 | docker create --name build tokenmill/beagle-dictionary-validator 11 | docker cp build:/usr/src/app/dictionary-validator dictionary-validator 12 | 13 | build-graal-validator-docker: 14 | docker build --target validator -f Dockerfile -t tokenmill/beagle-dictionary-validator . 15 | 16 | recompile-java-interface: 17 | rm -rf classes 18 | mkdir classes 19 | clojure -e "(require 'beagle.java.annotation) (compile 'beagle.java.annotation) (compile 'beagle.java.java)" 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # Beagle 6 | 7 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 8 | [![pipeline status](https://gitlab.com/tokenmill/oss/beagle/badges/master/pipeline.svg)](https://gitlab.com/tokenmill/oss/beagle/pipelines/master/latest) 9 | [![Maven Central](https://img.shields.io/maven-central/v/lt.tokenmill/beagle.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:%22lt.tokenmill%22%20AND%20a:%22beagle%22) 10 | 11 | Beagle is a detector of interesting things in text. Its intended use is in-stream search applications. 
Suppose you need to monitor a stream of text documents such as web crawl results, chat messages, or corporate documents in order to identify keywords, phrases, regexes, and [complex search queries](https://lucene.apache.org/core/2_9_4/queryparsersyntax.html) of interest. With Beagle you can quickly be up and running with such a system, allowing you to focus on productively monitoring your documents. 12 | 13 | Beagle is based on the [Lucene monitor](https://github.com/apache/lucene-solr/tree/master/lucene/monitor) library which is based on [Luwak](https://github.com/flaxsearch/luwak). 14 | 15 | ## Components 16 | 17 | - [Phrase highlighter with support for](#phrase-annotator-usage): 18 | - case sensitivity, 19 | - ascii folding, 20 | - stemming support for various languages, 21 | - phrase slop, 22 | - synonymous phrases, 23 | - metadata, 24 | - tokenizer, 25 | - ensuring order of terms in a phrase with slop, 26 | - any combination of previously mentioned features. 27 | - [Java interface to the phrase highlighter](#java-interface-to-the-phrase-highlighter) 28 | - (alpha!) [Lucene query string support](#lucene-query-support) (interface is subject to change) 29 | - [Dictionary file readers (csv, json, edn)](#dictionary-readers) 30 | - [Dictionary validator](#dictionary-validator) 31 | - [Dictionary optimizer](#dictionary-optimizer) 32 | - [Annotation merger](#annotation-merger) 33 | 34 | ## Phrase Annotator Usage 35 | 36 | ```clojure 37 | (require '[beagle.phrases :as phrases]) 38 | 39 | (let [dictionary [{:text "to be annotated" :id "1"}] 40 | highlighter-fn (phrases/highlighter dictionary)] 41 | (highlighter-fn "before annotated to be annotated after annotated")) 42 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32}) 43 | 44 | ;; Case sensitivity is controlled per dictionary entry 45 | (let [dictionary [{:text "TO BE ANNOTATED" :id "1" :case-sensitive? 
false}] 46 | highlighter-fn (phrases/highlighter dictionary)] 47 | (highlighter-fn "before annotated to be annotated after annotated")) 48 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32}) 49 | 50 | ;; ASCII folding is controlled per dictionary entry 51 | (let [dictionary [{:text "TÖ BE ÄNNÖTÄTED" :id "1" :case-sensitive? false :ascii-fold? true}] 52 | highlighter-fn (phrases/highlighter dictionary)] 53 | (highlighter-fn "before annotated to be annotated after annotated")) 54 | => ({:text "to be annotated", :type "LABEL", :dict-entry-id "1", :meta {}, :begin-offset 17, :end-offset 32}) 55 | 56 | ;; Stemming is supported for multiple languages per dictionary entry 57 | (let [dictionary [{:text "Kaunas" :id "1" :stem? true :stemmer :lithuanian}] 58 | highlighter-fn (phrases/highlighter dictionary)] 59 | (highlighter-fn "Kauno miestas")) 60 | => ({:text "Kauno", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 5}) 61 | 62 | ;; Phrases also support slop (i.e. terms edit distance) per dictionary entry 63 | (let [txt "before start and end after" 64 | dictionary [{:text "start end" :id "1" :slop 1}] 65 | highlighter-fn (phrases/highlighter dictionary)] 66 | (highlighter-fn txt)) 67 | => ({:text "start and end", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 7, :end-offset 20}) 68 | 69 | ;; Every phrase can specify which tokenizer to use 70 | (let [txt "[URGENT!] Do this immediately!" 
71 | dictionary [{:text "[URGENT!]" :id "a" :tokenizer :whitespace} 72 | {:text "[URGENT!]" :id "b" :tokenizer :standard}] 73 | highlighter-fn (phrases/highlighter dictionary)] 74 | (clojure.pprint/pprint (highlighter-fn txt))) 75 | => 76 | ({:text "[URGENT!]", 77 | :type "PHRASE", 78 | :dict-entry-id "a", 79 | :meta {}, 80 | :begin-offset 0, 81 | :end-offset 9} 82 | {:text "URGENT", 83 | :type "PHRASE", 84 | :dict-entry-id "b", 85 | :meta {}, 86 | :begin-offset 1, 87 | :end-offset 7}) 88 | 89 | ;; Ensure that phrase terms are matched in the provided order 90 | ;; e.g. NOT preserving order (default) 91 | (let [txt "Mill Token" 92 | dictionary [{:text "Token Mill" :slop 2 :in-order? false}] 93 | highlighter-fn (phrases/highlighter dictionary)] 94 | (highlighter-fn txt)) 95 | => [{:text "Mill Token" :type "PHRASE" :dict-entry-id "0" :meta {} :begin-offset 0 :end-offset 10}] 96 | ;; e.g. Preserving order 97 | (let [txt "Mill Token" 98 | dictionary [{:text "Token Mill" :slop 2 :in-order? true}] 99 | highlighter-fn (phrases/highlighter dictionary)] 100 | (highlighter-fn txt)) 101 | => () 102 | ``` 103 | 104 | ## Java Interface to the Phrase Highlighter 105 | 106 | Example: 107 | ```java 108 | import lt.tokenmill.beagle.phrases.Annotation; 109 | import lt.tokenmill.beagle.phrases.Annotator; 110 | import lt.tokenmill.beagle.phrases.DictionaryEntry; 111 | 112 | import java.util.Arrays; 113 | import java.util.Collection; 114 | import java.util.HashMap; 115 | 116 | public class Main { 117 | public static void main(String[] args) { 118 | DictionaryEntry dictionaryEntry = new DictionaryEntry("test phrase"); 119 | Annotator annotator = new Annotator(Arrays.asList(dictionaryEntry)); 120 | Collection annotations = annotator.annotate("This is my test phrase"); 121 | annotations.forEach(s -> System.out.println("Annotated: \'" + s.text() + "\' at offset: " + s.beginOffset() + ":" + s.endOffset())); 122 | } 123 | } 124 | 125 | // => Annotated: 'test phrase' at offset: 11:22 126 | ``` 
127 | 128 | The available options for the Java API are explained with examples in the [Java Interface for Phrase Highlighting wiki page](https://github.com/tokenmill/beagle/wiki/Java-Interface-for-Phrase-Highlighting). 129 | 130 | All the options that are present in the Clojure interface are also available for use in Java, just convert Clojure keywords to Java strings, e.g. 131 | ``` 132 | :case-sensitive? => "case-sensitive?" 133 | ``` 134 | 135 | ### Project Setup with Maven 136 | 137 | The library is deployed in the Maven Central Repository and you can just add the beagle dependency to your `pom.xml`: 138 | 139 | ```xml 140 | 141 | lt.tokenmill 142 | beagle 143 | 0.3.1 144 | 145 | ``` 146 | 147 | ## Lucene Query Support 148 | 149 | Examples: 150 | 151 | ```clojure 152 | (require '[beagle.lucene-alpha :as lucene]) 153 | 154 | (let [txt "some text this other that" 155 | dictionary [{:text "this AND that" :id "1" :slop 1}] 156 | annotator-fn (lucene/annotator dictionary)] 157 | (annotator-fn txt {})) 158 | => ({:text "this AND that", :type "QUERY", :dict-entry-id "1", :meta {}}) 159 | ``` 160 | 161 | ## Performance 162 | 163 | The performance was measured on a desktop PC with Ubuntu 19.04 and 8-core Ryzen 1700. 164 | 165 | The test setup was for news articles and dictionary made up of names of city names in USA. 166 | 167 | Code and data for benchmarking and more benchmarks can be found [here](https://github.com/tokenmill/beagle-performance-benchmarks). 168 | 169 | ### Single-thread 170 | 171 | Average time spent per document ranged from 1.58 ms for dictionary of 5k phrases to 4.58 ms per document for 80k phrases. 172 | 173 | ![alt text](charts/st-avg-per-doc.png) 174 | 175 | Throughput of docs analyzed ranged from 626 docs/sec for dictionary of 5k phrases to 210 docs/sec for 80k phrases. 176 | 177 | ![alt text](charts/st-throughput-per-sec.png) 178 | 179 | Max time spent per document has couple of spikes when processing a document takes ~1000ms. 
These spikes should 180 | have been caused either by GC pauses, or JVM deoptimizations. Aside from those spikes, max time grows steadily 181 | from 15 ms to 72 ms as the dictionary size grows. 182 | 183 | Min time spent per document is fairly stable for any dictionary size and is about 0.45 ms. Most likely these are the 184 | cases when [Presearcher](https://lucene.apache.org/core/8_2_0/monitor/index.html) hasn't found any candidate queries to run against the document. 185 | 186 | ![alt text](charts/st-min-max-per-doc.png) 187 | 188 | ### Multi-threaded 189 | 190 | Using `core.async` pipeline time spent per single doc ranged from 3.38 ms for dictionary of 5k phrases to 15.34 ms per document for 80k phrases. 191 | 192 | ![alt text](charts/mt-avg-per-doc.png) 193 | 194 | Total time spent to process all 10k docs ranged from 2412 ms for dictionary of 5k phrases to 12595 ms per document for 80k phrases. 195 | 196 | ![alt text](charts/mt-total.png) 197 | 198 | Throughput of docs analyzed ranged from 4143 docs/sec for dictionary of 5k phrases to 793 docs/sec for 80k phrases. 199 | 200 | ![alt text](charts/mt-throughput-per-sec.png) 201 | 202 | Max time spent per document has risen fairly steadily from 24.15 ms for dictionary of 10k phrases to 113.45 ms per document for 60k phrases. 203 | 204 | Min time spent per document varied from 0.6 ms for dictionary of 10k phrases to 1.1 ms per document for 55k phrases. 205 | 206 | ![alt text](charts/mt-min-max-per-doc.png) 207 | 208 | ### Conclusions about Performance 209 | 210 | Processing of one document on average is faster in the single-thread mode by roughly 3x compared to multi-threaded mode but even 211 | in multi-threaded mode one document rarely takes more than 10 ms. 212 | 213 | In multi-threaded mode throughput grows with the number of CPU cores almost linearly: 4143/8=518 docs per core per sec in multi-threaded mode 214 | while in single-thread mode 626 docs per core per sec. 
215 | 216 | ## Dictionary Readers 217 | 218 | Three file formats are supported: csv, edn, json. 219 | 220 | ### CSV Dictionary Format 221 | 222 | Separator: "," 223 | Escape: "\"" 224 | 225 | The first line *MUST* be a header. 226 | 227 | Supported header keys: `["text" "type" "id" "synonyms" "case-sensitive?" "ascii-fold?" "meta"]` 228 | 229 | Order is not important. 230 | 231 | Under `synonyms`, there should be a list of strings separated by ";" 232 | Under `meta`, there should be a list of strings separated by ";". Even number of strings is expected. In case of odd number, last one is ignored. 233 | 234 | ## Dictionary Validator 235 | 236 | Accepts any number of dictionaries to validate as long as they are provided in pairs as '"/path/to/dictionary/file" "file-type"' 237 | 238 | ### Supported File Types 239 | 240 | - csv 241 | - json 242 | - edn 243 | 244 | ### Output 245 | 246 | - If any dictionary is invalid exception will be thrown with exit status 1 247 | 248 | ### Usage 249 | 250 | #### Clojure 251 | 252 | To use validator directly execute command: `clj -m beagle.validator "/path/to/dictionary/file" "file-type" "/path/to/dictionary/file2" "file-type" & ...` 253 | 254 | ##### Example: 255 | 256 | ``` 257 | clj -m beagle.validator "your-dict.csv" "csv" "your-other-dict.json" "json" 258 | ``` 259 | 260 | #### Docker 261 | 262 | Example in Gitlab CI: 263 | 264 | ``` 265 | validate-dictionaries: 266 | stage: dictionary-validation 267 | when: always 268 | image: tokenmill/beagle-dictionary-validator 269 | script: 270 | - > 271 | dictionary-validator 272 | /path/to/dict.csv csv 273 | /path/to/dict.json json 274 | /path/to/dict.edn edn 275 | ``` 276 | 277 | ## Dictionary Optimizer 278 | 279 | Supported optimizations: 280 | - Remove duplicate dictionary entries 281 | - Merge synonyms 282 | - Synonyms and text equality check 283 | 284 | There are cases when dictionary entries can't be merged: 285 | - Differences in text analysis 286 | 287 | Examples: 288 | ```clojure 
289 | (require '[beagle.dictionary-optimizer :as optimizer]) 290 | 291 | ; Remove duplicates 292 | (let [dictionary [{:text "TO BE ANNOTATED" :id "1"} 293 | {:text "TO BE ANNOTATED"}]] 294 | (optimizer/optimize dictionary)) 295 | => ({:text "TO BE ANNOTATED", :id "1"}) 296 | 297 | ; Merge synonyms 298 | (let [dictionary [{:text "TO BE ANNOTATED" :synonyms ["ONE"]} 299 | {:text "TO BE ANNOTATED" :synonyms ["TWO"]}]] 300 | (optimizer/optimize dictionary)) 301 | => ({:text "TO BE ANNOTATED", :synonyms ("TWO" "ONE")}) 302 | 303 | ; Synonyms and text equality check 304 | (let [dictionary [{:text "TO BE ANNOTATED" :synonyms ["TO BE ANNOTATED"]}]] 305 | (optimizer/optimize dictionary)) 306 | => ({:text "TO BE ANNOTATED", :synonyms ["TO BE ANNOTATED"]}) 307 | 308 | ; Can't be merged because of differences in text analysis 309 | (let [dictionary [{:text "TO BE ANNOTATED" :case-sensitive? true} 310 | {:text "TO BE ANNOTATED" :case-sensitive? false}]] 311 | (optimizer/optimize dictionary)) 312 | => ({:text "TO BE ANNOTATED", :case-sensitive? true} {:text "TO BE ANNOTATED", :case-sensitive? false}) 313 | ``` 314 | 315 | ## Annotation Merger 316 | 317 | Only annotations of the same type are merged. 
318 | 319 | Handled cases: 320 | - Duplicate annotations 321 | - Nested annotations 322 | 323 | Examples: 324 | ```clojure 325 | (require '[beagle.annotation-merger :as merger]) 326 | 327 | (let [dictionary [{:text "TEST"} 328 | {:text "This TEST is"}] 329 | highlighter-fn (phrases/highlighter dictionary) 330 | annotations (highlighter-fn "This TEST is")] 331 | (println "Annotations: " annotations) 332 | (merger/merge-same-type-annotations annotations)) 333 | Annotations: ({:text TEST, :type PHRASE, :dict-entry-id 0, :meta {}, :begin-offset 5, :end-offset 9} {:text This TEST is, :type PHRASE, :dict-entry-id 1, :meta {}, :begin-offset 0, :end-offset 12}) 334 | => ({:text "This TEST is", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 12}) 335 | 336 | ;; You can also inline the need of merging annotations 337 | (let [dictionary [{:text "TEST"} 338 | {:text "This TEST is"}] 339 | highlighter-fn (phrases/highlighter dictionary)] 340 | (highlighter-fn "This TEST is" {:merge-annotations? true})) 341 | => ({:text "This TEST is", :type "PHRASE", :dict-entry-id "1", :meta {}, :begin-offset 0, :end-offset 12}) 342 | ``` 343 | 344 | ## License 345 | 346 | Copyright © 2019 [TokenMill UAB](http://www.tokenmill.lt). 347 | 348 | Distributed under the The Apache License, Version 2.0. 
349 | -------------------------------------------------------------------------------- /charts/mt-avg-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-avg-per-doc.png -------------------------------------------------------------------------------- /charts/mt-min-max-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-min-max-per-doc.png -------------------------------------------------------------------------------- /charts/mt-throughput-per-sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-throughput-per-sec.png -------------------------------------------------------------------------------- /charts/mt-total.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/mt-total.png -------------------------------------------------------------------------------- /charts/st-avg-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-avg-per-doc.png -------------------------------------------------------------------------------- /charts/st-min-max-per-doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-min-max-per-doc.png -------------------------------------------------------------------------------- /charts/st-throughput-per-sec.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/charts/st-throughput-per-sec.png -------------------------------------------------------------------------------- /classes/lt/tokenmill/beagle/phrases/Annotation.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/Annotation.class -------------------------------------------------------------------------------- /classes/lt/tokenmill/beagle/phrases/Annotator.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/Annotator.class -------------------------------------------------------------------------------- /classes/lt/tokenmill/beagle/phrases/DictionaryEntry.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tokenmill/beagle/2b37ca640db1deea7887246bde3abc45d0ab1ce3/classes/lt/tokenmill/beagle/phrases/DictionaryEntry.class -------------------------------------------------------------------------------- /deps.edn: -------------------------------------------------------------------------------- 1 | {:deps {org.clojure/clojure {:mvn/version "1.10.3"} 2 | org.clojure/data.csv {:mvn/version "1.0.0"} 3 | org.clojure/tools.logging {:mvn/version "1.1.0"} 4 | org.apache.lucene/lucene-core {:mvn/version "8.9.0"} 5 | org.apache.lucene/lucene-monitor {:mvn/version "8.9.0"} 6 | metosin/jsonista {:mvn/version "0.3.3"}} 7 | :paths ["src" "classes"] 8 | :mvn/repos {"central" {:url "https://repo1.maven.org/maven2/"} 9 | "clojars" {:url "https://repo.clojars.org/"}} 10 | :aliases {:dev 11 | {:extra-deps {org.jsoup/jsoup 
{:mvn/version "1.13.1"} 12 | org.clojure/test.check {:mvn/version "1.0.0"} 13 | criterium/criterium {:mvn/version "0.4.6"} 14 | ch.qos.logback/logback-classic {:mvn/version "1.2.3"}} 15 | :extra-paths ["test/resources"]} 16 | :clj-kondo 17 | {:main-opts ["-m" "clj-kondo.main --lint src test"] 18 | :extra-deps {clj-kondo/clj-kondo {:mvn/version "2019.07.31-alpha"}} 19 | :jvm-opts ["-Dclojure.main.report=stderr"]} 20 | :test 21 | {:extra-paths ["test"] 22 | :extra-deps {com.cognitect/test-runner {:git/url "https://github.com/cognitect-labs/test-runner.git" 23 | :sha "62ef1de18e076903374306060ac0e8a752e57c86"} 24 | org.jsoup/jsoup {:mvn/version "1.13.1"} 25 | org.clojure/test.check {:mvn/version "1.0.0"}}} 26 | :runner 27 | {:extra-paths ["test"] 28 | :main-opts ["-m" "cognitect.test-runner"]} 29 | :native-image 30 | {:override-deps {org.clojure/clojure {:mvn/version "1.9.0"}} 31 | :main-opts ["-m clj.native-image beagle.validator" 32 | "--initialize-at-build-time" 33 | "--report-unsupported-elements-at-runtime" 34 | "-H:Name=dictionary-validator"] 35 | :jvm-opts ["-Dclojure.compiler.direct-linking=true"] 36 | :extra-deps {clj.native-image/clj.native-image 37 | {:git/url "https://github.com/taylorwood/clj.native-image.git" 38 | :sha "7708e7fd4572459c81f6a6b8e44c96f41cdd92d4"}}}}} 39 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | lt.tokenmill 5 | beagle 6 | 0.9.0-SNAPSHOT 7 | beagle 8 | Stream search library 9 | https://github.com/tokenmill/beagle 10 | 11 | 12 | 13 | The Apache License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | 16 | 17 | 18 | 19 | 20 | Dainius Jocas 21 | dainius.jocas@tokenmill.lt 22 | TokenMill 23 | http://www.tokenmill.lt 24 | 25 | 26 | Žygimantas Medelis 27 | zygimantas.medelis@gmail.com 28 | TokenMill 29 | http://www.tokenmill.lt 30 | 31 | 32 | 33 | 34 | 35 | org.clojure 36 | 
clojure 37 | 1.10.1 38 | 39 | 40 | org.clojure 41 | data.csv 42 | 0.1.4 43 | 44 | 45 | org.clojure 46 | tools.logging 47 | 0.5.0 48 | 49 | 50 | org.apache.lucene 51 | lucene-core 52 | 8.2.0 53 | 54 | 55 | org.apache.lucene 56 | lucene-monitor 57 | 8.2.0 58 | 59 | 60 | metosin 61 | jsonista 62 | 0.2.4 63 | 64 | 65 | 66 | 67 | UTF-8 68 | 69 | 70 | 71 | 72 | release-sign-artifacts 73 | 74 | 75 | performRelease 76 | true 77 | 78 | 79 | 80 | 81 | 82 | org.apache.maven.plugins 83 | maven-javadoc-plugin 84 | 3.1.1 85 | 86 | 87 | attach-javadoc 88 | 89 | jar 90 | 91 | 92 | 93 | 94 | 95 | org.apache.maven.plugins 96 | maven-gpg-plugin 97 | 1.6 98 | 99 | 100 | sign-artifacts 101 | verify 102 | 103 | sign 104 | 105 | 106 | 107 | 108 | 109 | org.apache.maven.plugins 110 | maven-source-plugin 111 | 3.1.0 112 | 113 | 114 | org.apache.maven.plugins 115 | maven-deploy-plugin 116 | 3.0.0-M1 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | src 125 | 126 | 127 | src 128 | 129 | 130 | classes 131 | 132 | 133 | 134 | 135 | org.apache.maven.plugins 136 | maven-jar-plugin 137 | 3.1.2 138 | 139 | 140 | empty-javadoc-jar 141 | package 142 | 143 | jar 144 | 145 | 146 | javadoc 147 | ${basedir}/javadoc 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | https://github.com/tokenmill/beagle 157 | scm:git:git://github.com/tokenmill/beagle.git 158 | scm:git:ssh://git@github.com/tokenmill/beagle.git 159 | HEAD 160 | 161 | 162 | 163 | 164 | clojars 165 | https://repo.clojars.org/ 166 | 167 | 168 | sonatype 169 | https://oss.sonatype.org/content/repositories/snapshots/ 170 | 171 | 172 | 173 | 174 | 175 | ossrh 176 | https://oss.sonatype.org/content/repositories/snapshots 177 | 178 | 179 | ossrh 180 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /src/beagle/annotation_merger.clj: -------------------------------------------------------------------------------- 1 | (ns 
beagle.annotation-merger) 2 | 3 | (defn related-annotations? [anno1 anno2] 4 | (<= (:begin-offset anno1) (:begin-offset anno2) (:end-offset anno1))) 5 | 6 | (defn parent-child-annotations? [parent-anno child-anno] 7 | (and (>= (:begin-offset child-anno) (:begin-offset parent-anno)) 8 | (<= (:end-offset child-anno) (:end-offset parent-anno)))) 9 | 10 | (defn merge-annotations [annotations] 11 | (let [sorted-annotation (sort-by :begin-offset annotations)] 12 | (loop [parent-annotation (first sorted-annotation) 13 | [child-annotation & remaining] (rest sorted-annotation) 14 | result []] 15 | (if child-annotation 16 | (if (related-annotations? parent-annotation child-annotation) 17 | (recur (if (and (parent-child-annotations? parent-annotation child-annotation) 18 | (not (parent-child-annotations? child-annotation parent-annotation))) 19 | parent-annotation 20 | child-annotation) 21 | remaining 22 | result) 23 | (recur child-annotation remaining (conj result parent-annotation))) 24 | (conj result parent-annotation))))) 25 | 26 | (defn merge-same-type-annotations [annotations] 27 | (mapcat (fn [[_ anns]] (merge-annotations anns)) (group-by :type annotations))) 28 | -------------------------------------------------------------------------------- /src/beagle/dictionary_optimizer.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.dictionary-optimizer 2 | (:require [clojure.set :as set] 3 | [clojure.string :as str])) 4 | 5 | (defn merge-synonyms [group-of-entries] 6 | (reduce (fn [synonyms-set {synonyms :synonyms}] 7 | (into synonyms-set synonyms)) 8 | #{} group-of-entries)) 9 | 10 | (defn merge-meta [group-of-entries] 11 | (reduce (fn [acc {meta :meta}] (merge acc meta)) {} group-of-entries)) 12 | 13 | (defn merge-entries [entries] 14 | (let [{:keys [text case-sensitive? ascii-fold? 
id]} (first entries) 15 | synonyms (remove #(= text %) (merge-synonyms entries)) 16 | meta (merge-meta entries)] 17 | (cond-> {:text text} 18 | (not-empty synonyms) (assoc :synonyms synonyms) 19 | (not-empty meta) (assoc :meta meta) 20 | id (assoc :id id) 21 | (not (nil? case-sensitive?)) (assoc :case-sensitive? case-sensitive?) 22 | (not (nil? ascii-fold?)) (assoc :ascii-fold? ascii-fold?)))) 23 | 24 | (defn mergeable-meta? [{meta-a :meta} {meta-b :meta}] 25 | (every? #(= (get meta-a %) (get meta-b %)) (set/intersection (set (keys meta-a)) (set (keys meta-b))))) 26 | 27 | (defn aggregate-entries-by-meta [entries] 28 | (loop [entry-a (first entries) 29 | [entry-b & remaining] (rest entries) 30 | acc [] 31 | exceptions []] 32 | (if entry-b 33 | (if (mergeable-meta? entry-a entry-b) 34 | (recur (merge-entries [entry-a entry-b]) remaining acc exceptions) 35 | (recur entry-a remaining acc (conj exceptions entry-b))) 36 | (if (seq exceptions) 37 | (recur (first exceptions) (rest exceptions) (conj acc entry-a) []) 38 | (conj acc entry-a))))) 39 | 40 | (defn group-dictionary-entries [dictionary] 41 | (group-by (fn [entry] [(:text entry) (:case-sensitive? entry) (:ascii-fold? entry)]) dictionary)) 42 | 43 | (defn optimize [dictionary] 44 | (mapcat (fn [[_ grouped-entries]] (aggregate-entries-by-meta grouped-entries)) 45 | (group-dictionary-entries dictionary))) 46 | 47 | (defn optimization-suggestion [entries] 48 | {:suggestion (-> (format "Dictionary items '%s' have identical `[text case-sensitivity ascii-folding] features." 
49 | (reduce #(conj %1 (or (:id %2) (:text %2))) [] entries)) 50 | (str/replace #"\"" "")) 51 | :dictionary-items entries}) 52 | 53 | (defn dry-run [dictionary] 54 | (reduce (fn [acc [_ grouped-entries]] 55 | (if (< 1 (count grouped-entries)) 56 | (conj acc (optimization-suggestion grouped-entries)) 57 | acc)) 58 | [] (group-dictionary-entries dictionary))) 59 | -------------------------------------------------------------------------------- /src/beagle/java/annotation.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.java.annotation) 2 | 3 | (gen-class 4 | :name lt.tokenmill.beagle.phrases.Annotation 5 | :prefix Annotation- 6 | :state "state" 7 | :init "init" 8 | :constructors {[String String Long Long String java.util.Map] []} 9 | :methods [[text [] String] 10 | [type [] String] 11 | [beginOffset [] Long] 12 | [endOffset [] Long] 13 | [dictionaryEntryId [] String] 14 | [meta [] java.util.Map]] 15 | :prefix Annotation-) 16 | 17 | (defn Annotation-init [text type begin end dictionaryEntryId meta] 18 | [[] (atom {:text text 19 | :type type 20 | :begin begin 21 | :end end 22 | :dict-entry-id dictionaryEntryId 23 | :meta meta})]) 24 | 25 | (defn Annotation-text [this] 26 | (@(.state this) :text)) 27 | (defn Annotation-type [this] 28 | (@(.state this) :type)) 29 | (defn Annotation-beginOffset [this] 30 | (@(.state this) :begin)) 31 | (defn Annotation-endOffset [this] 32 | (@(.state this) :end)) 33 | (defn Annotation-dictionaryEntryId [this] 34 | (@(.state this) :dict-entry-id)) 35 | (defn Annotation-meta [this] 36 | (@(.state this) :meta)) 37 | -------------------------------------------------------------------------------- /src/beagle/java/java.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.java.java 2 | (:gen-class) 3 | (:require [beagle.phrases :as phrases])) 4 | 5 | (gen-class 6 | :name lt.tokenmill.beagle.phrases.DictionaryEntry 7 | :state "state" 8 | :init 
"init" 9 | :constructors {[String] []} 10 | :methods [[text [] String] 11 | [type [] String] 12 | [setType [String] void] 13 | [id [] String] 14 | [setId [String] void] 15 | [synonyms [] java.util.Collection] 16 | [setSynonyms [java.util.Collection] void] 17 | [caseSensitive [] Boolean] 18 | [setCaseSensitive [Boolean] void] 19 | [asciiFold [] Boolean] 20 | [setAsciiFold [Boolean] void] 21 | [stem [] Boolean] 22 | [setStem [Boolean] void] 23 | [stemmer [] String] 24 | [setStemmer [String] void] 25 | [slop [] Integer] 26 | [setSlop [Integer] void] 27 | [tokenizer [] String] 28 | [setTokenizer [String] void] 29 | [meta [] java.util.Map] 30 | [setMeta [java.util.Map] void]] 31 | :prefix DictionaryEntry-) 32 | 33 | (defn DictionaryEntry-init [phrase] 34 | [[] (atom {:text phrase})]) 35 | 36 | (defn DictionaryEntry-text [this] 37 | (@(.state this) :text)) 38 | (defn DictionaryEntry-type [this] 39 | (@(.state this) :type)) 40 | (defn DictionaryEntry-setType [this type] 41 | (swap! (.state this) assoc :type type)) 42 | (defn DictionaryEntry-id [this] 43 | (@(.state this) :id)) 44 | (defn DictionaryEntry-setId [this id] 45 | (swap! (.state this) assoc :id id)) 46 | (defn DictionaryEntry-synonyms [this] 47 | (@(.state this) :synonyms)) 48 | (defn DictionaryEntry-setSynonyms [this synonyms] 49 | (swap! (.state this) assoc :synonyms synonyms)) 50 | (defn DictionaryEntry-caseSensitive [this] 51 | (@(.state this) :case-sensitive?)) 52 | (defn DictionaryEntry-setCaseSensitive [this case-sensitive] 53 | (swap! (.state this) assoc :case-sensitive? case-sensitive)) 54 | (defn DictionaryEntry-asciiFold [this] 55 | (@(.state this) :ascii-fold?)) 56 | (defn DictionaryEntry-setAsciiFold [this ascii-fold] 57 | (swap! (.state this) assoc :ascii-fold? ascii-fold)) 58 | (defn DictionaryEntry-stem [this] 59 | (@(.state this) :stem?)) 60 | (defn DictionaryEntry-setStem [this stem] 61 | (swap! (.state this) assoc :stem? 
stem)) 62 | (defn DictionaryEntry-stemmer [this] 63 | (@(.state this) :stemmer)) 64 | (defn DictionaryEntry-setStemmer [this stemmer] 65 | (swap! (.state this) assoc :stemmer stemmer)) 66 | (defn DictionaryEntry-slop [this] 67 | (@(.state this) :slop)) 68 | (defn DictionaryEntry-setSlop [this slop] 69 | (swap! (.state this) assoc :slop slop)) 70 | (defn DictionaryEntry-meta [this] 71 | (@(.state this) :meta)) 72 | (defn DictionaryEntry-setMeta [this meta] 73 | (swap! (.state this) assoc :meta meta)) 74 | (defn DictionaryEntry-tokenizer [this] 75 | (@(.state this) :tokenizer)) 76 | (defn DictionaryEntry-setTokenizer [this tokenizer] 77 | (swap! (.state this) assoc :tokenizer tokenizer)) 78 | 79 | (gen-class 80 | :name lt.tokenmill.beagle.phrases.Annotator 81 | :state "state" 82 | :init "init" 83 | :constructors {[java.util.Collection] [] 84 | [java.util.Collection java.util.Map] []} 85 | :prefix Phrases- 86 | :methods [[annotate [String] java.util.Collection] 87 | [annotate [String java.util.Map] java.util.Collection]]) 88 | 89 | (defn Phrases-init 90 | ([dictionary] (Phrases-init dictionary {})) 91 | ([dictionary opts] 92 | [[] (atom {:dictionary dictionary 93 | :annotator-fn (phrases/highlighter 94 | (map (fn [dictionary-entry] 95 | {:text (.text dictionary-entry) 96 | :type (.type dictionary-entry) 97 | :id (.id dictionary-entry) 98 | :synonyms (.synonyms dictionary-entry) 99 | :case-sensitive? (.caseSensitive dictionary-entry) 100 | :ascii-fold? (.asciiFold dictionary-entry) 101 | :stem? (.stem dictionary-entry) 102 | :stemmer (keyword (.stemmer dictionary-entry)) 103 | :slop (.slop dictionary-entry) 104 | :tokenizer (keyword (.tokenizer dictionary-entry)) 105 | :meta (.meta dictionary-entry)}) dictionary) 106 | (reduce (fn [m [k v]] 107 | (assoc m (keyword k) v)) {} opts))})])) 108 | 109 | (defn Phrases-annotate 110 | ([this text] (Phrases-annotate this text {})) 111 | ([this text opts] 112 | (map (fn [ann] (lt.tokenmill.beagle.phrases.Annotation. 
113 | (:text ann) 114 | (:type ann) 115 | (long (:begin-offset ann)) 116 | (long (:end-offset ann)) 117 | (:dict-entry-id ann) 118 | (:meta ann))) 119 | ((@(.state this) :annotator-fn) text (reduce (fn [m [k v]] 120 | (assoc m (keyword k) v)) {} opts))))) 121 | -------------------------------------------------------------------------------- /src/beagle/lucene_alpha.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.lucene-alpha 2 | (:require [clojure.string :as s] 3 | [clojure.tools.logging :as log] 4 | [beagle.monitor :as monitor] 5 | [beagle.text-analysis :as text-analysis]) 6 | (:import (org.apache.lucene.monitor MonitorQuery QueryMatch Monitor) 7 | (org.apache.lucene.queryparser.classic QueryParser ParseException) 8 | (org.apache.lucene.document Document Field FieldType) 9 | (org.apache.lucene.index IndexOptions))) 10 | 11 | (def ^FieldType field-type 12 | (doto (FieldType.) 13 | (.setTokenized true) 14 | (.setIndexOptions IndexOptions/DOCS_AND_FREQS) 15 | (.setStoreTermVectors true) 16 | (.setStoreTermVectorOffsets true))) 17 | 18 | (defn match-text [^String text ^Monitor monitor field-names type-name] 19 | (let [doc (Document.)] 20 | (doseq [field-name field-names] 21 | (.add doc (Field. ^String field-name text field-type))) 22 | (map (fn [^QueryMatch query-match] 23 | (let [^MonitorQuery query (.getQuery monitor (.getQueryId query-match)) 24 | meta (.getMetadata query)] 25 | {:text (.getQueryString query) 26 | :type (or (get meta "_type") type-name) 27 | :dict-entry-id (.getQueryId query-match) 28 | :meta (into {} meta)})) (.getMatches (.match monitor doc (QueryMatch/SIMPLE_MATCHER)))))) 29 | 30 | (defn dict-entry->monitor-queries [{:keys [id text meta type] :as dict-entry} default-analysis-conf idx] 31 | (try 32 | (let [query-id (or id (str idx)) 33 | metadata (reduce-kv (fn [m k v] (assoc m (name k) v)) {} (if type (assoc meta :_type type) meta))] 34 | (MonitorQuery. query-id 35 | (.parse (QueryParser. 
36 | (text-analysis/get-field-name dict-entry default-analysis-conf) 37 | (text-analysis/get-string-analyzer dict-entry default-analysis-conf)) 38 | text) 39 | text 40 | metadata)) 41 | (catch ParseException e 42 | (log/errorf "Failed to parse query: '%s' with exception '%s'" dict-entry e)) 43 | (catch Exception e (log/errorf "Failed create query: '%s' with '%s'" dict-entry e)))) 44 | 45 | (defn dictionary->monitor-queries [dictionary default-analysis-conf] 46 | (remove nil? 47 | (map (fn [dict-entry idx] 48 | (dict-entry->monitor-queries dict-entry default-analysis-conf idx)) 49 | dictionary (range)))) 50 | 51 | (defn match-monitor [text monitor field-names type-name opts] 52 | (log/debugf "Match monitor with opts='%s'" opts) 53 | (if (s/blank? text) 54 | [] 55 | (match-text text monitor field-names type-name))) 56 | 57 | (defn annotator 58 | ([dictionary] (annotator dictionary {})) 59 | ([dictionary {:keys [type-name tokenizer]}] 60 | (let [type-name (if (s/blank? type-name) "QUERY" type-name) 61 | {:keys [monitor field-names]} (monitor/setup dictionary 62 | {:tokenizer tokenizer} 63 | dictionary->monitor-queries)] 64 | (fn 65 | ([text] (match-monitor text monitor field-names type-name {})) 66 | ([text opts] (match-monitor text monitor field-names type-name opts)))))) 67 | -------------------------------------------------------------------------------- /src/beagle/monitor.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.monitor 2 | (:require [clojure.java.io :as io] 3 | [clojure.tools.logging :as log] 4 | [jsonista.core :as json] 5 | [beagle.text-analysis :as text-analysis]) 6 | (:import (org.apache.lucene.monitor MonitorConfiguration Monitor MonitorQuerySerializer MonitorQuery) 7 | (org.apache.lucene.analysis.miscellaneous PerFieldAnalyzerWrapper) 8 | (org.apache.lucene.util BytesRef) 9 | (org.apache.lucene.search MatchAllDocsQuery) 10 | (java.util ArrayList))) 11 | 12 | (def monitor-query-serializer 13 | 
(reify MonitorQuerySerializer 14 | (serialize [_ query] 15 | (BytesRef. 16 | (json/write-value-as-string 17 | {"query-id" (.getId query) 18 | "query" (.getQueryString query) 19 | "metadata" (.getMetadata query)}))) 20 | (deserialize [_ binary-value] 21 | (let [dq (json/read-value (io/reader (.bytes ^BytesRef binary-value)))] 22 | (MonitorQuery. (get dq "query-id") 23 | (MatchAllDocsQuery.) 24 | (get dq "query") 25 | (get dq "metadata")))))) 26 | 27 | (defn create [field-names-w-analyzers] 28 | (let [^MonitorConfiguration config (MonitorConfiguration.) 29 | per-field-analyzers (PerFieldAnalyzerWrapper. 30 | (text-analysis/get-string-analyzer {} {}) field-names-w-analyzers)] 31 | (.setIndexPath config nil monitor-query-serializer) 32 | (Monitor. per-field-analyzers config))) 33 | 34 | (defn defer-to-one-by-one-registration [^Monitor monitor monitor-queries] 35 | (doseq [mq monitor-queries] 36 | (try 37 | (.register monitor (doto (ArrayList.) (.add mq))) 38 | (catch Exception e 39 | (log/errorf "Failed to register query: '%s'" mq) 40 | (.printStackTrace e))))) 41 | 42 | (defn register-queries [^Monitor monitor monitor-queries] 43 | (try 44 | (.register monitor ^Iterable monitor-queries) 45 | (catch Exception _ 46 | (defer-to-one-by-one-registration monitor monitor-queries)))) 47 | 48 | (defn field-name-analyzer-mappings 49 | "Creates a map with field names as keys and Lucene analyzers as values. 50 | Both field name and analyzer are decided based on the dictionary entry configuration. 51 | First group dictionary entries by field name. Then from every group of dictionary entries 52 | take the first entry and create an analyzer based on analysis configuration." 
53 | [dictionary default-analysis-conf] 54 | (->> dictionary 55 | (group-by (fn [dictionary-entry] 56 | (text-analysis/get-field-name dictionary-entry default-analysis-conf))) 57 | (reduce (fn [acc [field-name dict]] 58 | (assoc acc field-name (text-analysis/get-string-analyzer (first dict) default-analysis-conf))) 59 | {}))) 60 | 61 | (defn prepare [monitor dict-entries default-analysis-conf dictionary->monitor-queries-fn] 62 | (register-queries monitor (dictionary->monitor-queries-fn dict-entries default-analysis-conf))) 63 | 64 | (defn setup 65 | "Setups the monitor with all the dictionary entries." 66 | [dictionary default-analysis-conf dict-entry->monitor-queries-fn] 67 | (let [mappings-from-field-names-to-analyzers (field-name-analyzer-mappings dictionary default-analysis-conf) 68 | monitor (create mappings-from-field-names-to-analyzers)] 69 | (prepare monitor dictionary default-analysis-conf dict-entry->monitor-queries-fn) 70 | {:monitor monitor 71 | :field-names (keys mappings-from-field-names-to-analyzers)})) 72 | -------------------------------------------------------------------------------- /src/beagle/phrases.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.phrases 2 | (:require [clojure.string :as s] 3 | [clojure.tools.logging :as log] 4 | [beagle.validator :as validator] 5 | [beagle.annotation-merger :as merger] 6 | [beagle.dictionary-optimizer :as optimizer] 7 | [beagle.text-analysis :as text-analysis] 8 | [beagle.monitor :as monitor] 9 | [beagle.schema :refer [->Highlight ->DictionaryEntry]]) 10 | (:import (java.util UUID) 11 | (org.apache.lucene.document Document FieldType Field) 12 | (org.apache.lucene.index IndexOptions Term) 13 | (org.apache.lucene.monitor Monitor MonitorQuery HighlightsMatch HighlightsMatch$Hit) 14 | (org.apache.lucene.search MultiPhraseQuery$Builder FuzzyQuery) 15 | (org.apache.lucene.search.spans SpanNearQuery$Builder SpanTermQuery SpanMultiTermQueryWrapper))) 16 | 17 | 
(defn filter-and-sort-ordered-hits [^String text ^String highlight-text ordered-hits] 18 | (->> ordered-hits 19 | (filter (fn [^HighlightsMatch$Hit hit] 20 | (= highlight-text (let [s (.-startOffset hit) 21 | e (.-endOffset hit)] 22 | (subs text s e))))) 23 | (sort-by (fn [^HighlightsMatch$Hit hit] (.-startOffset hit))))) 24 | 25 | (defn group-sequencial-ending 26 | "Groups a sequence taking only the last hit from a consecutive sub-sequence 27 | of terms, e.g. [1 2 3 6 7] => [3 7]" 28 | [spans-end-hits] 29 | (loop [[current-term & terms] spans-end-hits 30 | last-item nil 31 | current-seq [] 32 | filtered-ends []] 33 | (if (nil? current-term) 34 | (conj filtered-ends (last current-seq)) 35 | (if (nil? last-item) 36 | (recur terms current-term [current-term] (if (seq current-seq) 37 | (conj filtered-ends (last current-seq)) 38 | filtered-ends)) 39 | (if (= (inc (.-startPosition last-item)) (.-startPosition current-term)) 40 | (recur terms current-term (conj current-seq current-term) filtered-ends) 41 | (recur terms current-term [current-term] (conj filtered-ends (last current-seq)))))))) 42 | 43 | (defn pair-begins-with-ends [spans-start-hits spans-end-hits] 44 | (let [grouped-ends (group-sequencial-ending spans-end-hits)] 45 | (loop [[start & starts-tail :as starts] spans-start-hits 46 | [end & ends-tail] grouped-ends 47 | pairs []] 48 | (if (or (nil? start) (nil? end)) 49 | pairs 50 | (if (= start end) 51 | (recur starts ends-tail pairs) 52 | (recur (remove #(< (.-startPosition %) (.-startPosition end)) starts-tail) 53 | ends-tail (conj pairs [start end]))))))) 54 | 55 | (defn ordered-hits->highlights 56 | "The default highlighter fails to handle SpanNearQuery: highlights are term highlights not the whole 57 | span highlights. 
58 | The temporary workaround works as follows: 59 | 1) find the very first hit 60 | 2) find the very last hit 61 | 3) assume that all spans begins and ends with the same terms 62 | 4) collect all hits like the beginning 63 | 5) collect all hits like the ending 64 | 6) pair beginnings with endings and make one highlight per pair" 65 | [text type-name query-id metadata ordered-hits] 66 | (let [^HighlightsMatch$Hit first-hit (apply min-key #(.-startOffset ^HighlightsMatch$Hit %) ordered-hits) 67 | first-text (subs text (.-startOffset first-hit) (.-endOffset first-hit)) 68 | ^HighlightsMatch$Hit last-hit (apply max-key #(.-startOffset ^HighlightsMatch$Hit %) ordered-hits) 69 | last-text (subs text (.-startOffset last-hit) (.-endOffset last-hit)) 70 | spans-start-hits (filter-and-sort-ordered-hits text first-text ordered-hits) 71 | spans-end-hits (filter-and-sort-ordered-hits text last-text ordered-hits) 72 | normalized-metadata (dissoc metadata "_in-order")] 73 | (map (fn [[^HighlightsMatch$Hit span-start-hit ^HighlightsMatch$Hit span-end-hit]] 74 | (let [start-offset (.-startOffset span-start-hit) 75 | end-offset (.-endOffset span-end-hit)] 76 | (->Highlight 77 | (subs text start-offset end-offset) 78 | (or (get meta "_type") type-name) 79 | query-id 80 | normalized-metadata 81 | start-offset 82 | end-offset))) (pair-begins-with-ends spans-start-hits spans-end-hits)))) 83 | 84 | (defn match->annotation [text ^Monitor monitor type-name ^HighlightsMatch match] 85 | (mapcat 86 | (fn [[_ hits]] 87 | (let [query-id (.getQueryId match) 88 | metadata (into {} (.getMetadata (.getQuery monitor query-id)))] 89 | (if (get metadata "_in-order") 90 | (ordered-hits->highlights text type-name query-id metadata hits) 91 | (map (fn [^HighlightsMatch$Hit hit] 92 | (let [start-offset (.-startOffset hit) 93 | end-offset (.-endOffset hit)] 94 | (->Highlight 95 | (subs text start-offset end-offset) 96 | (or (get metadata "_type") type-name) 97 | query-id 98 | metadata 99 | start-offset 
100 | end-offset))) hits)))) 101 | (.getHits match))) 102 | 103 | (def ^FieldType field-type 104 | (doto (FieldType.) 105 | (.setTokenized true) 106 | (.setIndexOptions IndexOptions/DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) 107 | (.setStoreTermVectors true) 108 | (.setStoreTermVectorOffsets true))) 109 | 110 | (defn annotate-text [^String text ^Monitor monitor field-names ^String type-name] 111 | (try 112 | (let [doc (Document.)] 113 | (doseq [field-name field-names] 114 | (.add doc (Field. ^String field-name text field-type))) 115 | (mapcat #(match->annotation text monitor type-name %) 116 | (.getMatches 117 | (.match monitor 118 | #^"[Lorg.apache.lucene.document.Document;" (into-array Document [doc]) 119 | (HighlightsMatch/MATCHER)) 120 | 0))) 121 | (catch Exception e 122 | (log/errorf "Failed to match text: '%s'" text) 123 | (.printStackTrace e)))) 124 | 125 | (defn prepare-synonyms [query-id {:keys [synonyms] :as dict-entry}] 126 | (map (fn [synonym] 127 | (->DictionaryEntry 128 | synonym 129 | (:type dict-entry) 130 | (str (UUID/randomUUID)) 131 | nil 132 | (:case-sensitive? dict-entry) 133 | (:ascii-fold? dict-entry) 134 | (:stem? dict-entry) 135 | (:stemmer dict-entry) 136 | (:slop dict-entry) 137 | (:tokenizer dict-entry) 138 | (assoc (:meta dict-entry) 139 | :synonym? "true" :query-id query-id))) 140 | synonyms)) 141 | 142 | (defn dict-entry->terms [dict-entry default-analysis-conf] 143 | (let [analyzer (text-analysis/get-string-analyzer dict-entry default-analysis-conf)] 144 | (into-array String (text-analysis/text->token-strings (:text dict-entry) analyzer)))) 145 | 146 | (defn merge-dict-entry-with-highlighter-opts 147 | "There are dictionary opts that do not contribute to text analysis, but contributes 148 | to querying. This function acts a single point in merging default highlighter opts 149 | to the dictionary entry." 150 | [dict-entry default-analysis-conf] 151 | (cond-> dict-entry 152 | (and (not (contains? dict-entry :slop)) 153 | (contains? 
default-analysis-conf :slop)) 154 | (assoc :slop (:slop default-analysis-conf)) 155 | 156 | (and (not (contains? dict-entry :in-order?)) 157 | (contains? default-analysis-conf :in-order?)) 158 | (assoc :in-order? (:in-order? default-analysis-conf)))) 159 | 160 | (defn dict-entry->monitor-query [dict-entry default-analysis-conf idx] 161 | (let [field-name (text-analysis/get-field-name dict-entry default-analysis-conf) 162 | terms (dict-entry->terms dict-entry default-analysis-conf) 163 | {:keys [id text meta type slop in-order?] 164 | :as dict-entry} (merge-dict-entry-with-highlighter-opts dict-entry default-analysis-conf) 165 | query-id (or id (str idx)) 166 | metadata (reduce (fn [m [k v]] (assoc m (name k) v)) {} (if type (assoc meta :_type type) meta)) 167 | normalized-slop (when slop (max 0 (min slop Integer/MAX_VALUE)))] 168 | (if (seq terms) 169 | (if (or (and (and (number? slop) (< 0 slop)) in-order? (< 1 (count terms))) 170 | (:fuzzy? dict-entry)) 171 | (MonitorQuery. query-id 172 | (try 173 | (let [ordered? (cond 174 | in-order? true 175 | (and (nil? in-order?) (:fuzzy? dict-entry)) true 176 | :else false) 177 | snqb (SpanNearQuery$Builder. ^String field-name ordered?)] 178 | (doseq [term terms] 179 | (if (true? (:fuzzy? dict-entry)) 180 | (.addClause snqb (SpanMultiTermQueryWrapper. 181 | (FuzzyQuery. 182 | (Term. ^String field-name ^String term) 183 | (or (:fuzziness dict-entry) 1)))) 184 | (.addClause snqb (SpanTermQuery. (Term. ^String field-name ^String term))))) 185 | (when-not (= slop normalized-slop) 186 | (log/warnf "Phrase slop '%s' normalized to '%s'" slop normalized-slop)) 187 | (when normalized-slop 188 | (.setSlop snqb normalized-slop)) 189 | (.build snqb)) 190 | (catch Exception e (.printStackTrace e))) 191 | text 192 | (assoc metadata "_in-order" true)) 193 | (MonitorQuery. query-id 194 | (let [mpqb (MultiPhraseQuery$Builder.)] 195 | (doseq [s terms] 196 | (.add mpqb (Term. 
^String field-name ^String s))) 197 | (when slop 198 | (when-not (= slop normalized-slop) 199 | (log/warnf "Phrase slop '%s' normalized to '%s'" slop normalized-slop)) 200 | (.setSlop mpqb normalized-slop)) 201 | (.build mpqb)) 202 | text 203 | metadata)) 204 | (log/warnf "Discarding the dictionary entry because no tokens: '%s'" dict-entry)))) 205 | 206 | (defn dict-entries->monitor-queries [dict-entries default-analysis-conf] 207 | (->> dict-entries 208 | (mapcat (fn [idx dict-entry] 209 | (let [query-id (or (get dict-entry :id) (str idx))] 210 | (cons 211 | (dict-entry->monitor-query dict-entry default-analysis-conf idx) 212 | (map #(dict-entry->monitor-query % default-analysis-conf nil) 213 | (prepare-synonyms query-id dict-entry))))) 214 | (range)) 215 | (remove nil?))) 216 | 217 | (defn synonym-annotation? [annotation] 218 | (= "true" (get-in annotation [:meta "synonym?"]))) 219 | 220 | (defn meta-type? [annotation] 221 | (string? (get-in annotation [:meta "_type"]))) 222 | 223 | (defn post-process [annotation] 224 | (cond-> annotation 225 | (synonym-annotation? annotation) (assoc :dict-entry-id (get-in annotation [:meta "query-id"])) 226 | (meta-type? annotation) (update-in [:meta] dissoc "_type"))) 227 | 228 | (defn match [text monitor field-names type-name opts] 229 | (if (s/blank? text) 230 | [] 231 | (let [annotations (map post-process (annotate-text text monitor field-names type-name))] 232 | (if (:merge-annotations? opts) 233 | (merger/merge-same-type-annotations annotations) 234 | annotations)))) 235 | 236 | (defn highlighter 237 | "Creates a highlighter function with for a given dictionary. 238 | Params: 239 | - dictionary 240 | a list of dictionary entries as described in `beagle.schema/dict-entry` 241 | Opts: 242 | - type-name 243 | a string, defaults to \"PHRASE\" 244 | - validate-dictionary? 245 | if set to true then validates the dictionary, default false 246 | - optimize-dictionary? 
247 | if set to true then optimizes dictionary before creating the monitor, default false 248 | - tokenizer 249 | a keyword one of #{:keyword :letter :standard :classic :strict :unicode-whitespace :whitespace}, default :standard 250 | - case-sensitive? 251 | if set to true text matching is case sensitive, default true 252 | - ascii-fold? 253 | if set to true before matching text is ascii folded, default false 254 | - stem? 255 | if set to true before matching text is stemmed, default false 256 | - stemmer 257 | a keyword one of #{:arabic :armenian :basque :catalan :danish :dutch :english :estonian 258 | :finnish :french :german :german2 :hungarian :irish :italian :kp :lithuanian :lovins 259 | :norwegian :porter :portuguese :romanian :russian :spanish :swedish :turkish} 260 | that specifies the stemmer algorithm, default :english 261 | - slop 262 | the max edit-distance for phrase matching, default 0 263 | - in-order? 264 | if set to true enforces phrase terms ordering in matches, default false" 265 | ([dictionary] (highlighter dictionary {})) 266 | ([dictionary opts] 267 | (when (:validate-dictionary? opts) (validator/validate-dictionary dictionary)) 268 | (let [dictionary (if (:optimize-dictionary? opts) (optimizer/optimize dictionary) dictionary) 269 | type-name (if (s/blank? (:type-name opts)) "PHRASE" (:type-name opts)) 270 | {:keys [monitor field-names]} (monitor/setup dictionary opts dict-entries->monitor-queries)] 271 | (fn 272 | ([text] (match text monitor field-names type-name {})) 273 | ([text opts] (match text monitor field-names type-name opts)))))) 274 | 275 | (defn ^:deprecated annotator 276 | [dictionary & {:keys [type-name validate-dictionary? optimize-dictionary? tokenizer]}] 277 | (when validate-dictionary? (validator/validate-dictionary dictionary)) 278 | (let [dictionary (if optimize-dictionary? (optimizer/optimize dictionary) dictionary) 279 | type-name (if (s/blank? 
type-name) "PHRASE" type-name) 280 | {:keys [monitor field-names]} (monitor/setup dictionary {:tokenizer tokenizer} 281 | dict-entries->monitor-queries)] 282 | (fn 283 | ([text] (match text monitor field-names type-name {})) 284 | ([text & {:as opts}] (match text monitor field-names type-name opts))))) 285 | -------------------------------------------------------------------------------- /src/beagle/readers.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.readers 2 | (:require [clojure.string :as s] 3 | [clojure.edn :as edn] 4 | [clojure.java.io :as io] 5 | [clojure.data.csv :as csv] 6 | [jsonista.core :as json]) 7 | (:import (java.io PushbackReader))) 8 | 9 | (def mapper (json/object-mapper {:decode-key-fn true})) 10 | 11 | (defn read-edn 12 | "Reads a dictionary from the source. 13 | `source` - anything accepted by `clojure.java.io/input-stream` (e.g. a file path, File, URL, or open InputStream)." 14 | [source] 15 | (with-open [rdr (PushbackReader. (io/reader (io/input-stream source)))] 16 | (doall (edn/read rdr)))) 17 | 18 | (defn read-csv "Reads a dictionary from a CSV source. The first row is a header whose cells become keyword keys. For each data row: cells are trimmed, blank cells are dropped, `synonyms` is split on `;` into a list, `case-sensitive?` and `ascii-fold?` are coerced to booleans, and `meta` is parsed as `;`-separated alternating key/value tokens." [source] 19 | (with-open [reader (io/reader source)] 20 | (let [[header & lines] (csv/read-csv reader :separator \, :quote \") 21 | kvs (map keyword header)] 22 | (->> lines 23 | (map (fn [line] (map s/trim line))) 24 | (map #(apply array-map (interleave kvs %))) 25 | (map #(into {} (remove (fn [[_ v]] (s/blank? v)) %))) 26 | (map (fn [{:keys [synonyms] :as dict}] 27 | (if-not (s/blank? synonyms) 28 | (assoc dict :synonyms (map s/trim (s/split synonyms #";"))) 29 | dict))) 30 | (map (fn [{:keys [case-sensitive?] :as dict}] 31 | (if-not (s/blank? case-sensitive?) 32 | (assoc dict :case-sensitive? (Boolean/valueOf ^String case-sensitive?)) 33 | dict))) 34 | (map (fn [{:keys [ascii-fold?] :as dict}] 35 | (if-not (s/blank? ascii-fold?) 36 | (assoc dict :ascii-fold? (Boolean/valueOf ^String ascii-fold?)) 37 | dict))) 38 | (map (fn [{:keys [meta] :as dict}] 39 | (if-not (s/blank? 
meta) 40 | (assoc dict :meta (reduce (fn [acc [k v]] (assoc acc k v)) 41 | {} 42 | (->> (map s/trim (s/split meta #";")) 43 | (partition-all 2) 44 | (remove (fn [[_ v]] (s/blank? (str v))))))) 45 | 46 | dict))) 47 | (doall))))) 48 | 49 | (defn read-json "Reads a dictionary from a JSON source; keys are decoded as keywords (see `mapper`)." [source] 50 | (with-open [rdr (io/reader (io/input-stream source))] 51 | (doall (json/read-value rdr mapper)))) 52 | -------------------------------------------------------------------------------- /src/beagle/schema.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.schema 2 | (:require [clojure.spec.alpha :as s] 3 | [clojure.spec.gen.alpha :as gen] 4 | [clojure.string :as str])) 5 | 6 | (s/def ::non-empty-string 7 | (s/and string? (complement str/blank?))) 8 | 9 | (s/def ::text ::non-empty-string) 10 | (s/def ::type (s/nilable string?)) 11 | (s/def ::id (s/nilable string?)) 12 | (s/def ::synonyms (s/nilable (s/coll-of ::non-empty-string))) 13 | (s/def ::case-sensitive? (s/nilable boolean?)) 14 | (s/def ::ascii-fold? (s/nilable boolean?)) 15 | (s/def ::stem? (s/nilable boolean?)) 16 | (s/def ::stemmer (s/nilable keyword?)) 17 | (s/def ::slop (s/nilable #(and (number? %) (or (pos-int? %) (zero? %))))) 18 | (s/def ::tokenizer (s/nilable keyword?)) 19 | (s/def ::in-order? (s/nilable boolean?)) 20 | (s/def ::meta 21 | (s/with-gen 22 | (s/nilable (s/map-of #(or (string? %) (keyword? %)) string?)) 23 | #(gen/fmap (fn [s] {s s}) (s/gen string?)))) 24 | 25 | (s/def ::dict-entry 26 | (s/keys :req-un [::text] 27 | :opt-un [::type ::id ::synonyms ::meta 28 | ::case-sensitive? ::ascii-fold? ::stem? ::stemmer ::slop 29 | ::tokenizer ::in-order?])) 30 | 31 | (defrecord DictionaryEntry [text type id synonyms case-sensitive? ascii-fold? 32 | stem? stemmer slop tokenizer meta]) 33 | 34 | (s/def ::dictionary (s/coll-of ::dict-entry)) 35 | 36 | (s/def ::begin-offset nat-int?) 37 | (s/def ::end-offset pos-int?) 
38 | (s/def ::dict-entry-id ::non-empty-string) 39 | 40 | (s/def ::dictionary-annotation 41 | (s/keys :req-un [::text ::type ::begin-offset ::end-offset] 42 | :opt-un [::dict-entry-id ::meta])) 43 | 44 | (defrecord Highlight [text type dict-entry-id meta begin-offset end-offset]) 45 | 46 | (s/def ::annotations (s/coll-of ::dictionary-annotation)) 47 | -------------------------------------------------------------------------------- /src/beagle/text_analysis.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.text-analysis 2 | (:require [clojure.string :as string] 3 | [clojure.tools.logging :as log]) 4 | (:import (org.apache.lucene.analysis Analyzer Analyzer$TokenStreamComponents Tokenizer TokenStream) 5 | (org.apache.lucene.analysis.core LowerCaseFilter WhitespaceTokenizer LetterTokenizer KeywordTokenizer UnicodeWhitespaceTokenizer) 6 | (org.apache.lucene.analysis.miscellaneous ASCIIFoldingFilter) 7 | (org.apache.lucene.analysis.standard ClassicFilter StandardTokenizer ClassicTokenizer) 8 | (org.apache.lucene.analysis.tokenattributes CharTermAttribute) 9 | (org.apache.lucene.analysis.pattern PatternTokenizer) 10 | (org.apache.lucene.analysis.snowball SnowballFilter) 11 | (org.tartarus.snowball.ext LithuanianStemmer ArabicStemmer ArmenianStemmer BasqueStemmer EnglishStemmer CatalanStemmer DanishStemmer DutchStemmer EstonianStemmer FinnishStemmer FrenchStemmer German2Stemmer GermanStemmer HungarianStemmer IrishStemmer ItalianStemmer KpStemmer LovinsStemmer NorwegianStemmer PorterStemmer PortugueseStemmer RomanianStemmer RussianStemmer SpanishStemmer SwedishStemmer TurkishStemmer) 12 | (org.tartarus.snowball SnowballProgram) 13 | (java.io StringReader))) 14 | 15 | (defn ^SnowballProgram stemmer 16 | "Creates a stemmer object given the stemmer keyword. 17 | Default stemmer is English." 18 | [stemmer-kw] 19 | (case stemmer-kw 20 | :arabic (ArabicStemmer.) 21 | :armenian (ArmenianStemmer.) 22 | :basque (BasqueStemmer.) 
23 | :catalan (CatalanStemmer.) 24 | :danish (DanishStemmer.) 25 | :dutch (DutchStemmer.) 26 | :english (EnglishStemmer.) 27 | :estonian (EstonianStemmer.) 28 | :finnish (FinnishStemmer.) 29 | :french (FrenchStemmer.) 30 | :german2 (German2Stemmer.) 31 | :german (GermanStemmer.) 32 | :hungarian (HungarianStemmer.) 33 | :irish (IrishStemmer.) 34 | :italian (ItalianStemmer.) 35 | :kp (KpStemmer.) 36 | :lithuanian (LithuanianStemmer.) 37 | :lovins (LovinsStemmer.) 38 | :norwegian (NorwegianStemmer.) 39 | :porter (PorterStemmer.) 40 | :portuguese (PortugueseStemmer.) 41 | :romanian (RomanianStemmer.) 42 | :russian (RussianStemmer.) 43 | :spanish (SpanishStemmer.) 44 | :swedish (SwedishStemmer.) 45 | :turkish (TurkishStemmer.) 46 | (do 47 | (when stemmer-kw 48 | (log/debugf "Stemmer '%s' not found! EnglishStemmer is used." stemmer-kw)) 49 | (EnglishStemmer.)))) 50 | 51 | (defn ^Tokenizer tokenizer [tokenizer-kw] 52 | (case tokenizer-kw 53 | :keyword (KeywordTokenizer.) 54 | :letter (LetterTokenizer.) 55 | :classic (ClassicTokenizer.) 56 | :standard (StandardTokenizer.) 57 | :strict (PatternTokenizer. #"[^a-zA-Z0-9{}\[\]()<>#+=@&']+" -1) 58 | :unicode-whitespace (UnicodeWhitespaceTokenizer.) 59 | :whitespace (WhitespaceTokenizer.) 60 | (do 61 | (when tokenizer-kw 62 | (log/debugf "Tokenizer '%s' not found. StandardTokenizer is used." tokenizer-kw)) 63 | (StandardTokenizer.)))) 64 | 65 | (defn analyzer-constructor [{tokenizer-kw :tokenizer 66 | ascii-fold? :ascii-fold? 67 | case-sensitive? :case-sensitive? 68 | stem? :stem? 69 | stemmer-kw :stemmer}] 70 | (proxy [Analyzer] [] 71 | (createComponents [^String field-name] 72 | (let [^Tokenizer tokenizr (tokenizer tokenizer-kw) 73 | ^TokenStream filters-chain (cond-> tokenizr 74 | (not case-sensitive?) (LowerCaseFilter.) 75 | ascii-fold? (ASCIIFoldingFilter.)) 76 | token-stream (if stem? 77 | (SnowballFilter. filters-chain (stemmer stemmer-kw)) 78 | (if (instance? Tokenizer filters-chain) 79 | (ClassicFilter. 
tokenizr) 80 | filters-chain))] 81 | (Analyzer$TokenStreamComponents. 82 | ^Tokenizer tokenizr ^TokenStream token-stream))))) 83 | 84 | (defn field-name-constructor [{tokenizer-kw :tokenizer 85 | ascii-fold? :ascii-fold? 86 | case-sensitive? :case-sensitive? 87 | stem? :stem? 88 | stemmer-kw :stemmer}] 89 | (let [tokenizr (str (name (or tokenizer-kw :standard)) "-tokenizer") 90 | filters (cond-> [] 91 | (not case-sensitive?) (conj "lowercased") 92 | ascii-fold? (conj "ascii-folded") 93 | stem? (conj (str "stemmed-" (name (or stemmer-kw :english)))))] 94 | (if (seq filters) 95 | (str "text" "." tokenizr "." (string/join "-" (sort filters))) 96 | (str "text" "." tokenizr)))) 97 | 98 | (def analyzer (memoize analyzer-constructor)) 99 | (def field-name (memoize field-name-constructor)) 100 | 101 | (def default-conf 102 | {:tokenizer :standard 103 | :case-sensitive? true 104 | :ascii-fold? false 105 | :stem? false 106 | :stemmer :english}) 107 | 108 | (defrecord Conf [tokenizer case-sensitive? ascii-fold? stem? stemmer]) 109 | 110 | (defn three-way-merge 111 | "Given a key and three maps return the value that would appear in the map after merge. 112 | Semantics is of the default Clojure merge." 113 | [k m1 m2 m3] 114 | (if (nil? (k m3)) 115 | (if (nil? (k m2)) 116 | (k m1) 117 | (k m2)) 118 | (k m3))) 119 | 120 | (defn ^Analyzer get-string-analyzer [analysis-conf default-analysis-conf] 121 | (analyzer (->Conf 122 | (three-way-merge :tokenizer default-conf default-analysis-conf analysis-conf) 123 | (three-way-merge :case-sensitive? default-conf default-analysis-conf analysis-conf) 124 | (three-way-merge :ascii-fold? default-conf default-analysis-conf analysis-conf) 125 | (three-way-merge :stem? 
default-conf default-analysis-conf analysis-conf) 126 | (three-way-merge :stemmer default-conf default-analysis-conf analysis-conf)))) 127 | 128 | (defn ^String get-field-name "Returns the index field name that encodes the effective analysis options (tokenizer, case sensitivity, ascii folding, stemming) after merging entry, default, and built-in configs." [analysis-conf default-analysis-conf] 129 | (field-name (->Conf 130 | (three-way-merge :tokenizer default-conf default-analysis-conf analysis-conf) 131 | (three-way-merge :case-sensitive? default-conf default-analysis-conf analysis-conf) 132 | (three-way-merge :ascii-fold? default-conf default-analysis-conf analysis-conf) 133 | (three-way-merge :stem? default-conf default-analysis-conf analysis-conf) 134 | (three-way-merge :stemmer default-conf default-analysis-conf analysis-conf)))) 135 | 136 | (defn text->token-strings 137 | "Given a text and an analyzer returns a vector of tokens as strings. The underlying token stream is always closed, even when tokenization throws." 138 | [^String text ^Analyzer analyzer] 139 | (let [^TokenStream token-stream (.tokenStream analyzer "not-important" (StringReader. text)) 140 | ^CharTermAttribute termAtt (.addAttribute token-stream CharTermAttribute)] 141 | (try 142 | (.reset token-stream) 143 | (loop [acc []] 144 | (if (.incrementToken token-stream) 145 | (recur (conj acc (.toString termAtt))) 146 | (do (.end token-stream) 147 | acc))) 148 | (finally (.close token-stream))))) 149 | -------------------------------------------------------------------------------- /src/beagle/validator.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.validator 2 | (:gen-class) 3 | (:require [clojure.spec.alpha :as s] 4 | [beagle.schema :as sch] 5 | [beagle.readers :as readers])) 6 | 7 | (defn validate-dictionary "Conforms `dictionary` against :beagle.schema/dictionary; returns the conformed value, or :clojure.spec.alpha/invalid on failure." [dictionary] 8 | (s/conform ::sch/dictionary dictionary)) 9 | 10 | (defn valid-dictionary? "Truthy when `dictionary` conforms and is non-empty, nil otherwise. Relies on `seq` throwing when handed the ::s/invalid keyword, which the catch turns into nil." [dictionary] 11 | (try 12 | (seq (validate-dictionary dictionary)) 13 | (catch Exception _))) 14 | 15 | (def supported-dictionary-file-types #{"csv" "json" "edn"}) 16 | 17 | (defn valid-dictionary-file? [dictionary-file dictionary-file-type] 18 | (if (contains? 
supported-dictionary-file-types dictionary-file-type) 19 | (valid-dictionary? (case dictionary-file-type 20 | "csv" (readers/read-csv dictionary-file) 21 | "json" (readers/read-json dictionary-file) 22 | "edn" (readers/read-edn dictionary-file))) 23 | (.printStackTrace (Exception. (format "File type not supported: `%s`" dictionary-file-type))))) 24 | 25 | (defn -main [& args] 26 | (when (odd? (count args)) 27 | (.printStackTrace (Exception. "Even number of arguments must be present - 'dictionary-name dictionary-type ...'")) 28 | (System/exit 1)) 29 | (when (some #(not (apply valid-dictionary-file? %)) (partition-all 2 args)) 30 | (System/exit 1))) 31 | -------------------------------------------------------------------------------- /test/beagle/annotation_merge_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.annotation-merge-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.phrases :as phrases] 4 | [beagle.annotation-merger :as merger])) 5 | 6 | (deftest annotator-with-merge-option-test 7 | (let [dictionary [{:text "1 2"} {:text "2"} {:text "1 2 3 4"} 8 | {:text "4"} {:text "5"} {:text "6 5 3 7"} {:text "6 5"}] 9 | highlighter-fn (phrases/highlighter dictionary {:type-name "TEST"}) 10 | text "A B C 1 2 3 4 D E F G 6 5 3 7"] 11 | (is (= (count (highlighter-fn text {:merge-annotations? false})) (count (highlighter-fn text)))) 12 | (is (< (count (highlighter-fn text {:merge-annotations? true})) (count (highlighter-fn text)))) 13 | (is (= [(set (vals {:begin-offset 6 14 | :dict-entry-id "2" 15 | :end-offset 13 16 | :meta {} 17 | :text "1 2 3 4" 18 | :type "TEST"})) 19 | (set (vals {:begin-offset 22 20 | :dict-entry-id "5" 21 | :end-offset 29 22 | :meta {} 23 | :text "6 5 3 7" 24 | :type "TEST"}))] 25 | (map #(-> % vals set) (highlighter-fn text {:merge-annotations? 
true})))))) 26 | 27 | (deftest annotation-merge-test 28 | (is (= [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5}] 29 | (merger/merge-same-type-annotations 30 | [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5} 31 | {:text "A" :type "TEST" :dict-entry-id "3" :meta {} :begin-offset 0 :end-offset 1} 32 | {:text "AAAA" :type "TEST" :dict-entry-id "2" :meta {} :begin-offset 1 :end-offset 5}]))) 33 | 34 | (is (= [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5} 35 | {:text "AAA" :type "TEST2" :dict-entry-id "10" :meta {} :begin-offset 0 :end-offset 3}] 36 | (merger/merge-same-type-annotations 37 | [{:text "AAAAA" :type "TEST" :dict-entry-id "1" :meta {} :begin-offset 0 :end-offset 5} 38 | {:text "A" :type "TEST" :dict-entry-id "2" :meta {} :begin-offset 0 :end-offset 1} 39 | {:text "AAAA" :type "TEST" :dict-entry-id "3" :meta {} :begin-offset 1 :end-offset 5} 40 | {:text "AAA" :type "TEST2" :dict-entry-id "10" :meta {} :begin-offset 0 :end-offset 3} 41 | {:text "A" :type "TEST2" :dict-entry-id "11" :meta {} :begin-offset 0 :end-offset 1}])))) 42 | 43 | -------------------------------------------------------------------------------- /test/beagle/corner_case_phrases_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.corner-case-phrases-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.phrases :as phrases]) 4 | (:import (org.jsoup Jsoup))) 5 | 6 | (deftest corner-cases 7 | (let [annotator (phrases/highlighter [{:text "N-Able N-Central" 8 | :case-sensitive? false}]) 9 | text (some-> (Jsoup/parse (slurp "test/resources/phrases.html")) (.body) (.text))] 10 | (is (empty? 
(annotator text))))) 11 | -------------------------------------------------------------------------------- /test/beagle/dictionary_optimization_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.dictionary-optimization-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.dictionary-optimizer :as optimizer] 4 | [beagle.phrases :as phrases])) 5 | 6 | (deftest meta-merge-test 7 | (is (optimizer/mergeable-meta? nil {:meta {:email "123"}})) 8 | (is (optimizer/mergeable-meta? {:meta {}} {:meta {:email "123"}})) 9 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} nil)) 10 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "123"}})) 11 | (is (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "123" :total 5646}})) 12 | (is (optimizer/mergeable-meta? {:meta {:email "123" :total 5646}} {:meta {:email "123"}})) 13 | (is (not (optimizer/mergeable-meta? {:meta {:email "123"}} {:meta {:email "321"}}))) 14 | (is (not (optimizer/mergeable-meta? {:meta {:email "123" :total 5646}} {:meta {:email "123" :total 9999}}))) 15 | (is (= [{:ascii-fold? true 16 | :case-sensitive? true 17 | :id "test-id" 18 | :meta {:abc "123" :email "test@example.com"} 19 | :synonyms ["abc" "XXXX"] 20 | :text "test text"} 21 | {:ascii-fold? true 22 | :case-sensitive? true 23 | :id "test-id" 24 | :meta {:email "bobby@example.com"} 25 | :synonyms ["def"] 26 | :text "test text"}] 27 | (optimizer/aggregate-entries-by-meta 28 | [{:text "test text" 29 | :id "test-id" 30 | :synonyms ["abc"] 31 | :case-sensitive? true 32 | :ascii-fold? true 33 | :meta {:email "test@example.com"}} 34 | {:text "test text" 35 | :id "test-id" 36 | :synonyms ["def"] 37 | :case-sensitive? true 38 | :ascii-fold? true 39 | :meta {:email "bobby@example.com"}} 40 | {:text "test text" 41 | :id "test-id" 42 | :synonyms ["XXXX"] 43 | :case-sensitive? true 44 | :ascii-fold? 
true 45 | :meta {:email "test@example.com" :abc "123"}}])))) 46 | 47 | (deftest dictionary-optimization-test 48 | (let [dictionary [{:case-sensitive? true 49 | :ascii-fold? true 50 | :synonyms ["AAAA1"] 51 | :text "AAAA"} 52 | {:case-sensitive? true 53 | :ascii-fold? true 54 | :synonyms ["AAAA2"] 55 | :text "AAAA"} 56 | {:case-sensitive? false 57 | :ascii-fold? true 58 | :synonyms ["AAAA3"] 59 | :text "AAAA"} 60 | {:case-sensitive? true 61 | :ascii-fold? true 62 | :synonyms ["AAAA4"] 63 | :text "AAAA"} 64 | {:case-sensitive? true 65 | :ascii-fold? false 66 | :synonyms ["AAAA5"] 67 | :text "AAAA"} 68 | {:case-sensitive? true 69 | :ascii-fold? false 70 | :synonyms ["AAAA"] 71 | :text "AAAA"} 72 | {:case-sensitive? false 73 | :synonyms ["BBBB1"] 74 | :text "BBBB"} 75 | {:case-sensitive? false 76 | :synonyms ["BBBB"] 77 | :text "BBBB"}] 78 | expected-dictionary [{:text "AAAA" 79 | :synonyms ["AAAA4" "AAAA2" "AAAA1"] 80 | :case-sensitive? true 81 | :ascii-fold? true} 82 | {:case-sensitive? false :ascii-fold? true :synonyms ["AAAA3"] :text "AAAA"} 83 | {:text "AAAA" :synonyms ["AAAA5"] :case-sensitive? true :ascii-fold? false} 84 | {:text "BBBB" :synonyms ["BBBB1"] :case-sensitive? 
false}] 85 | optimized-dictionary (optimizer/optimize dictionary)] 86 | (is (< (count optimized-dictionary) (count dictionary))) 87 | (is (= (count expected-dictionary) (count optimized-dictionary))) 88 | (is (= (set (map #(update % :synonyms set) expected-dictionary)) 89 | (set (map #(update % :synonyms set) optimized-dictionary)))))) 90 | 91 | (deftest synonym-optimization 92 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle" "luwak1"]}] 93 | monitor-queries (phrases/dict-entries->monitor-queries dictionary {:tokenizer :standard})] 94 | (is (= 3 (count monitor-queries))) 95 | (let [highlighter-fn (phrases/highlighter dictionary {:type-name "TEST"}) 96 | anns (highlighter-fn "this is a beagle text test luwak1")] 97 | (is (= 3 (count anns)))))) 98 | -------------------------------------------------------------------------------- /test/beagle/java_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.java-test 2 | (:require [clojure.test :refer [deftest is]])) 3 | 4 | (deftest simple-java-interface 5 | (let [de (doto (lt.tokenmill.beagle.phrases.DictionaryEntry. "test") 6 | (.setSlop (Integer. 1))) 7 | annotator (lt.tokenmill.beagle.phrases.Annotator. [de] {})] 8 | (is (= "test" (first (map #(.text %) (.annotate annotator "test txt" {}))))))) 9 | 10 | (deftest case-sensitivity 11 | (let [de (doto (lt.tokenmill.beagle.phrases.DictionaryEntry. "LYNDON BAINES JOHNSON") 12 | (.setCaseSensitive false)) 13 | annotator (lt.tokenmill.beagle.phrases.Annotator. [de] {})] 14 | (is (= 1 (count (filter #(= "Lyndon Baines Johnson" (.text %)) (.annotate annotator "Lyndon Baines Johnson (/ˈlɪndən ˈbeɪnz/; August 27, 1908 – January 22, 1973), often referred to as LBJ, was an American politician who served as the 36th president of the United States from 1963 to 1969." 
{}))))))) 15 | -------------------------------------------------------------------------------- /test/beagle/lucene_alpha_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.lucene-alpha-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.lucene-alpha :as lucene])) 4 | 5 | (deftest smoke 6 | (let [txt "some text this other that" 7 | dictionary [{:text "this AND that" :id "1" :slop 1}] 8 | annotator-fn (lucene/annotator dictionary) 9 | [ann1 :as anns] (annotator-fn txt {}) 10 | anns2 (annotator-fn txt)] 11 | (is (= anns anns2)) 12 | (is (= 1 (count anns))) 13 | (is (= "1" (:dict-entry-id ann1))))) 14 | 15 | (deftest ^:noisy smoke-2 16 | (let [txt "some text this AND" 17 | dictionary [{:text "this AND" :id "1" :slop 1}] 18 | annotator-fn (lucene/annotator dictionary) 19 | [ann1 :as anns] (annotator-fn txt)] 20 | (is (= 0 (count anns))) 21 | (is (nil? (:dict-entry-id ann1))))) 22 | 23 | (deftest smoke-3 24 | (let [txt "some number 1234 test" 25 | dictionary [{:text "/.*\\d*.*/" :id "1" :slop 1}] 26 | annotator-fn (lucene/annotator dictionary) 27 | anns (annotator-fn txt)] 28 | (is (< 0 (count anns))))) 29 | -------------------------------------------------------------------------------- /test/beagle/optimization_suggestions_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.optimization-suggestions-test 2 | (:require [clojure.test :refer [deftest is testing]] 3 | [beagle.dictionary-optimizer :as optimizer])) 4 | 5 | (deftest optimization-suggestions 6 | (testing "Suggestions for similar dictionary items" 7 | (is (= [{:dictionary-items [{:id "1" :synonyms ["beagle"] :text "test"} {:id "2" :synonyms ["luwak1"] :text "test"}] 8 | :suggestion "Dictionary items '[1 2]' have identical `[text case-sensitivity ascii-folding] features."}] 9 | (optimizer/dry-run [{:text "test" :id "1" :synonyms ["beagle"]} 10 | {:text "test" :id "2" :synonyms 
["luwak1"]}])))) 11 | 12 | (testing "Suggestions for two similar dictionary item groups" 13 | (is (= [{:suggestion "Dictionary items '[1 3]' have identical `[text case-sensitivity ascii-folding] features." 14 | :dictionary-items [{:id "1" :synonyms ["beagle"] :text "test"} {:id "3" :synonyms ["beagle"] :text "test"}]} 15 | {:suggestion "Dictionary items '[2 4]' have identical `[text case-sensitivity ascii-folding] features." 16 | :dictionary-items [{:id "2" :synonyms ["luwak2"] :text "test2"} {:id "4" :synonyms ["beagle3"] :text "test2"}]}] 17 | (optimizer/dry-run [{:id "1" :synonyms ["beagle"] :text "test"} 18 | {:id "2" :synonyms ["luwak2"] :text "test2"} 19 | {:id "3" :synonyms ["beagle"] :text "test"} 20 | {:id "4" :synonyms ["beagle3"] :text "test2"}])))) 21 | 22 | (testing "Suggestions for single dictionary item" 23 | (is (= [] (optimizer/dry-run [{:id "1" :synonyms ["beagle"] :text "test"}])))) 24 | 25 | (testing "Suggestions for distinct dictionary items" 26 | (is (= [] (optimizer/dry-run [{:id "1" :case-sensitive? true :synonyms ["beagle"] :text "test"} 27 | {:id "2" :synonyms ["beagle"] :text "test2"} 28 | {:id "3" :ascii-fold? false :synonyms ["beagle"] :text "test3"}])))) 29 | 30 | (testing "Suggestions for two similar dictionary item groups and one distinct dictionary item" 31 | (is (= [{:suggestion "Dictionary items '[test 3 4]' have identical `[text case-sensitivity ascii-folding] features." 32 | :dictionary-items [{:synonyms ["beagle"] :text "test"} 33 | {:id "3" :synonyms ["beagle"] :text "test"} 34 | {:id "4" :synonyms ["luwak222"] :text "test"}]} 35 | {:suggestion "Dictionary items '[2 test2]' have identical `[text case-sensitivity ascii-folding] features." 
36 | :dictionary-items [{:id "2" :synonyms ["luwak2"] :text "test2"} {:synonyms ["beagle3"] :text "test2"}]}] 37 | (optimizer/dry-run [{:synonyms ["beagle"] :text "test"} 38 | {:id "2" :synonyms ["luwak2"] :text "test2"} 39 | {:id "3" :synonyms ["beagle"] :text "test"} 40 | {:id "4" :synonyms ["luwak222"] :text "test"} 41 | {:synonyms ["beagle3"] :text "test2"} 42 | {:synonyms ["beagle"] :text "test" :ascii-fold? true}]))))) 43 | -------------------------------------------------------------------------------- /test/beagle/phrases_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.phrases-test 2 | (:require [clojure.test :refer [deftest is testing]] 3 | [clojure.spec.alpha :as s] 4 | [clojure.spec.test.alpha :as stest] 5 | [beagle.phrases :as phrases] 6 | [beagle.schema :as schema])) 7 | 8 | (s/def ::opts (s/* (s/cat :opt keyword? :val any?))) 9 | 10 | (s/fdef phrases/highlighter 11 | :args (s/alt :unary (s/cat :dictionary ::schema/dictionary) 12 | :binary (s/cat :dictionary ::schema/dictionary :opts any?)) 13 | :ret (s/fspec :args (s/alt :unary (s/cat :text string?) 14 | :binary (s/cat :text string? 
:opts any?)) 15 | :ret ::schema/annotations)) 16 | 17 | (stest/instrument `phrases/highlighter) 18 | 19 | (s/exercise-fn `phrases/highlighter) 20 | 21 | (def label "LABEL") 22 | 23 | (deftest dictionary-entry-record 24 | (let [dictionary [(schema/map->DictionaryEntry {:text "test"})] 25 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 26 | anns (highlighter-fn "before annotated test phrase after annotated")] 27 | (is (= 1 (count anns))))) 28 | 29 | (deftest type-per-dictionary-entry 30 | (let [dictionary [{:text "test phrase" :id "1" :meta {:test "test"} :type "CUSTOM"}] 31 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 32 | anns (highlighter-fn "before annotated test phrase after annotated")] 33 | (is (seq (s/conform ::schema/annotations anns))) 34 | (is (seq anns)) 35 | (is (= "1" (-> anns first :dict-entry-id))) 36 | (is (= "CUSTOM" (-> anns first :type))) 37 | (is (= "test phrase" (-> anns first :text))) 38 | (is (nil? (-> anns first (get-in [:meta "_type"])))))) 39 | 40 | (deftest id 41 | (let [dictionary [{:text "test" :id "1" :meta {:test "test"}}] 42 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 43 | anns (highlighter-fn "before annotated test after annotated")] 44 | (is (seq anns)) 45 | (is (= "1" (-> anns first :dict-entry-id))) 46 | (is (= "LABEL" (-> anns first :type))))) 47 | 48 | (deftest metadata-append 49 | (let [dictionary [{:text "test" :meta {"email" "test@example.com"}}] 50 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 51 | anns (highlighter-fn "before annotated test after annotated")] 52 | (is (seq anns)) 53 | (is (= {"email" "test@example.com"} (-> anns first :meta))))) 54 | 55 | (deftest case-sensitivity 56 | (testing "case sensitive" 57 | (let [dictionary [{:text "test"}] 58 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 59 | anns (highlighter-fn "before annotated test after annotated")] 60 | (is (seq anns))) 61 | (let [dictionary 
[{:text "TeSt" :case-sensitive? true}] 62 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 63 | anns (highlighter-fn "before annotated test after annotated")] 64 | (is (empty? anns))) 65 | (let [label "LABEL" 66 | dictionary [{:text "test" :case-sensitive? true}] 67 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 68 | anns (highlighter-fn "before annotated Test after annotated")] 69 | (is (empty? anns)))) 70 | 71 | (testing "case insensitive" 72 | (let [dictionary [{:text "TeSt" :case-sensitive? false}] 73 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 74 | anns (highlighter-fn "before annotated test after annotated")] 75 | (is (seq anns))) 76 | (let [dictionary [{:text "test" :case-sensitive? false}] 77 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 78 | anns (highlighter-fn "before annotated test after annotated")] 79 | (is (seq anns))))) 80 | 81 | (deftest ascii-folding-dictionary 82 | (let [dictionary [{:text "wörd"}] 83 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 84 | anns (highlighter-fn "before annotated wörd after annotated")] 85 | (is (seq anns))) 86 | (let [dictionary [{:text "wörd"}] 87 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 88 | anns (highlighter-fn "before annotated word after annotated")] 89 | (is (empty? anns))) 90 | (let [label "LABEL" 91 | dictionary [{:text "wörd" :ascii-fold? true}] 92 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 93 | anns (highlighter-fn "before annotated word after annotated")] 94 | (is (seq anns))) 95 | (let [dictionary [{:text "word" :ascii-fold? true}] 96 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 97 | anns (highlighter-fn "before annotated wörd after annotated")] 98 | (is (seq anns))) 99 | (let [label "LABEL" 100 | dictionary [{:text "word" :ascii-fold? 
false}] 101 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 102 | anns (highlighter-fn "before annotated wörd after annotated")] 103 | (is (empty? anns)))) 104 | 105 | (deftest ascii-folding-with-case-sensitivity 106 | (let [label "TYPE"] 107 | (testing "case sensitive" 108 | (let [dictionary [{:text "schön" :ascii-fold? true}] 109 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 110 | anns (highlighter-fn "before annotated Schön after annotated")] 111 | (is (empty? anns))) 112 | (let [dictionary [{:text "Schön" :ascii-fold? true}] 113 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 114 | anns (highlighter-fn "before annotated Schon after annotated")] 115 | (is (seq anns))) 116 | (let [dictionary [{:text "schön" :ascii-fold? true}] 117 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 118 | anns (highlighter-fn "before annotated Schon after annotated")] 119 | (is (empty? anns)))) 120 | 121 | (testing "case insensitive" 122 | (let [dictionary [{:text "schön" :ascii-fold? true :case-sensitive? false}] 123 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 124 | anns (highlighter-fn "before annotated Schon after annotated")] 125 | (is (seq anns)))) 126 | (let [dictionary [{:text "schön" :ascii-fold? true :case-sensitive? false}] 127 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 128 | anns (highlighter-fn "before annotated schon after annotated")] 129 | (is (seq anns))) 130 | (let [dictionary [{:text "schon" :ascii-fold? true :case-sensitive? false}] 131 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 132 | anns (highlighter-fn "before annotated schön after annotated")] 133 | (is (seq anns))) 134 | 135 | (testing "false ascii fold" 136 | (let [dictionary [{:text "schon" :ascii-fold? 
false}] 137 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 138 | anns (highlighter-fn "before annotated schön after annotated")] 139 | (is (empty? anns)))))) 140 | 141 | (deftest synonyms 142 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle"]}] 143 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 144 | anns (highlighter-fn "before annotated beagle after annotated")] 145 | (is (= 1 (count anns))) 146 | (is (= "1" (-> anns first :dict-entry-id))) 147 | (is (= "beagle" (-> anns first :text)))) 148 | 149 | (let [dictionary [{:text "test" :id "1" :synonyms ["Luwak"] :case-sensitive? true}] 150 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 151 | anns (highlighter-fn "before annotated beagle after annotated")] 152 | (is (empty? anns))) 153 | 154 | (let [dictionary [{:text "test" :id "1" :synonyms ["beagle"] :case-sensitive? false}] 155 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 156 | anns (highlighter-fn "before annotated beagle after annotated")] 157 | (is (= 1 (count anns))) 158 | (is (= "1" (-> anns first :dict-entry-id))) 159 | (is (= "beagle" (-> anns first :text)))) 160 | 161 | (testing "synonyms with false ascii fold" 162 | (let [dictionary [{:text "test" :synonyms ["schön"] :ascii-fold? false}] 163 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 164 | anns (highlighter-fn "before annotated schon after annotated")] 165 | (is (empty? anns))) 166 | (let [dictionary [{:text "test" :synonyms ["schön"] :ascii-fold? 
true}] 167 | highlighter-fn (phrases/highlighter dictionary {:type-name label}) 168 | anns (highlighter-fn "before annotated schon after annotated")] 169 | (is (seq anns)) 170 | (is (= "schon" (-> anns first :text)))))) 171 | 172 | (deftest phrase-end-sentence 173 | (let [dictionary [{:text "test-test"}] 174 | highlighter-fn (phrases/highlighter dictionary) 175 | anns (highlighter-fn "before annotated test-test.")] 176 | (is (seq anns)) 177 | (is (= "test-test" (:text (first anns)))))) 178 | 179 | (deftest phrase-in-quotes 180 | (let [dictionary [{:text "test-test" :case-sensitive? false}] 181 | highlighter-fn (phrases/highlighter dictionary) 182 | anns (highlighter-fn "before annotated \"TEST-test\".")] 183 | (is (seq anns)) 184 | (is (= "TEST-test" (:text (first anns)))))) 185 | 186 | (deftest phrase-in-quotes-should-not-match 187 | (let [dictionary [{:text "test-test" :case-sensitive? false}] 188 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace}) 189 | anns (highlighter-fn "before annotated \"TEST-test\".")] 190 | (is (empty? anns)))) 191 | 192 | (deftest overlapping-phrases 193 | (let [dictionary [{:text "test phrase test" :case-sensitive? false}] 194 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace}) 195 | anns (highlighter-fn "start test phrase test phrase test end")] 196 | (is (= 2 (count anns))))) 197 | 198 | (deftest lt-stemming 199 | (let [dictionary [{:text "Kaunas" :id "1" :stem? true :stemmer :lithuanian}] 200 | highlighter-fn (phrases/highlighter dictionary) 201 | anns (highlighter-fn "Kauno miestas")] 202 | (is (seq anns)) 203 | (is (= "Kauno" (-> anns first :text)))) 204 | (let [dictionary [{:text "Kaunas Vilnius" :id "1" :stem? true}] 205 | highlighter-fn (phrases/highlighter dictionary) 206 | anns (highlighter-fn "Kaunas, Vilnius")] 207 | (is (seq anns)) 208 | (is (= "Kaunas, Vilnius" (-> anns first :text)))) 209 | (let [dictionary [{:text "Kaunas" :id "1" :case-sensitive? false :stem? 
true :stemmer :lithuanian}] 210 | highlighter-fn (phrases/highlighter dictionary) 211 | anns (highlighter-fn "kauno miestas")] 212 | (is (seq anns)) 213 | (is (= "kauno" (-> anns first :text))))) 214 | 215 | (deftest en-stemming 216 | (let [txt "who let the dogs out?"] 217 | (let [dictionary [{:text "dog" :id "1"}] 218 | highlighter-fn (phrases/highlighter dictionary) 219 | anns (highlighter-fn txt)] 220 | (is (empty? anns))) 221 | (let [dictionary [{:text "dog" :id "1" :stem? true}] 222 | highlighter-fn (phrases/highlighter dictionary) 223 | anns (highlighter-fn txt)] 224 | (is (seq anns)) 225 | (is (= "dogs" (-> anns first :text)))) 226 | (let [dictionary [{:text "dog" :id "1" :stem? true :stemmer :english}] 227 | highlighter-fn (phrases/highlighter dictionary) 228 | anns (highlighter-fn txt)] 229 | (is (seq anns)) 230 | (is (= "dogs" (-> anns first :text)))) 231 | (let [dictionary [{:text "dog" :id "1" :stem? true :stemmer :estonian}] 232 | highlighter-fn (phrases/highlighter dictionary) 233 | anns (highlighter-fn txt)] 234 | (is (empty? anns))))) 235 | 236 | (deftest mixed-stemmers 237 | (let [txt "Saboniai plays basketball" 238 | dictionary [{:text "Sabonis" :id "1" :stem? true :stemmer :lithuanian} 239 | {:text "play" :id "2" :stem? true :stemmer :english}] 240 | highlighter-fn (phrases/highlighter dictionary) 241 | anns (highlighter-fn txt)] 242 | (is (= 2 (count anns))))) 243 | 244 | (deftest phrase-slop 245 | (let [txt "before start and end after" 246 | dictionary [{:text "start end" :id "1" :slop 1}] 247 | highlighter-fn (phrases/highlighter dictionary) 248 | anns (highlighter-fn txt)] 249 | (is (= 1 (count anns))) 250 | (is (= "start and end" (:text (first anns))))) 251 | (testing "all terms in the phrase should match" 252 | (let [txt "before start end after" 253 | dictionary [{:text "start NOPE end" :id "1" :slop 10}] 254 | highlighter-fn (phrases/highlighter dictionary) 255 | anns (highlighter-fn txt)] 256 | (is (empty? 
anns)))) 257 | (let [txt "before start phrase and end phrase after" 258 | dictionary [{:text "start phrase end phrase" :id "1" :slop 1}] 259 | highlighter-fn (phrases/highlighter dictionary) 260 | anns (highlighter-fn txt)] 261 | (is (= 1 (count anns))) 262 | (is (= "start phrase and end phrase" (:text (first anns))))) 263 | (testing "phrase edit distance" 264 | (let [txt "before start end after" 265 | dictionary [{:text "end start" :id "1" :slop 0}] 266 | highlighter-fn (phrases/highlighter dictionary) 267 | anns (highlighter-fn txt)] 268 | (is (empty? anns))) 269 | (let [txt "before start end after" 270 | dictionary [{:text "end start" :id "1" :slop 2}] 271 | highlighter-fn (phrases/highlighter dictionary) 272 | anns (highlighter-fn txt)] 273 | (is (= 1 (count anns))) 274 | (is (= "start end" (:text (first anns)))))) 275 | (testing "all terms should match despite the slop" 276 | (let [txt "before start end after" 277 | dictionary [{:text "end start foo" :id "1" :slop 100}] 278 | highlighter-fn (phrases/highlighter dictionary) 279 | anns (highlighter-fn txt)] 280 | (is (empty? anns))))) 281 | 282 | (deftest dictionary-corner-cases 283 | (let [txt "Some text to test ." 284 | dictionary [{:text "."} {:text "text"}] 285 | highlighter-fn (phrases/highlighter dictionary {:tokenizer :whitespace}) 286 | anns (highlighter-fn txt)] 287 | (is (= 2 (count anns)))) 288 | (let [txt "Some text to test." 289 | dictionary [{:text ""} {:text "text"}] 290 | highlighter-fn (phrases/highlighter dictionary) 291 | anns (highlighter-fn txt)] 292 | (is (seq anns)))) 293 | 294 | (deftest ^:noisy noisy-tests-for-corner-cases 295 | (let [txt "Some text to test." 
296 | dictionary [{:text "."} {:text "text"}] 297 | highlighter-fn (phrases/highlighter dictionary) 298 | anns (highlighter-fn txt)] 299 | (is (seq anns)) 300 | (is (= 1 (count anns)))) 301 | (let [txt " ` `" 302 | dictionary [{:text "test" :id "1"}] 303 | highlighter-fn (phrases/highlighter dictionary) 304 | anns (highlighter-fn txt)] 305 | (is (coll? anns)) 306 | (is (empty? anns))) 307 | (testing "slop versions" 308 | (stest/unstrument `phrases/highlighter) 309 | (testing "nil slop" 310 | (let [txt "before start end after" 311 | dictionary [{:text "end start foo" :id "1" :slop nil}] 312 | highlighter-fn (phrases/highlighter dictionary) 313 | anns (highlighter-fn txt)] 314 | (is (empty? anns)))) 315 | (testing "very big slop" 316 | (let [txt "before start end after" 317 | dictionary [{:text "end start foo" :id "1" :slop 1000000000000}] 318 | highlighter-fn (phrases/highlighter dictionary) 319 | anns (highlighter-fn txt)] 320 | (is (empty? anns)))) 321 | (testing "slop with negative value" 322 | (let [txt "before start end after" 323 | dictionary [{:text "end start foo" :id "1" :slop -1}] 324 | highlighter-fn (phrases/highlighter dictionary) 325 | anns (highlighter-fn txt)] 326 | (is (empty? anns)))) 327 | (stest/instrument `phrases/highlighter))) 328 | 329 | (deftest tokenizer-conf 330 | (let [txt "URGENT! Do this immediately!" 331 | dictionary [{:text "URGENT" :id "a" :tokenizer :whitespace} 332 | {:text "URGENT" :id "b" :tokenizer :standard}] 333 | highlighter-fn (phrases/highlighter dictionary) 334 | anns (highlighter-fn txt)] 335 | (is (= 1 (count anns))) 336 | (is (= "b" (:dict-entry-id (first anns))))) 337 | (let [txt "[URGENT!] Do this immediately!" 
338 | dictionary [{:text "[URGENT!]" :id "a" :tokenizer :whitespace} 339 | {:text "[URGENT!]" :id "b" :tokenizer :standard}] 340 | highlighter-fn (phrases/highlighter dictionary) 341 | anns (highlighter-fn txt)] 342 | (is (= 2 (count anns))) 343 | (is (= "[URGENT!]" (:text (first (filter #(= "a" (:dict-entry-id %)) anns))))) 344 | (is (= "URGENT" (:text (first (filter #(= "b" (:dict-entry-id %)) anns))))))) 345 | 346 | (deftest phrase-ordering-basic-case 347 | (is (= 1 (count ((phrases/highlighter [{:text "Token Mill" :slop 2 :in-order? false}]) 348 | "Mill Token")))) 349 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill" :slop 2 :in-order? true}]) 350 | "Mill Token"))))) 351 | 352 | (deftest highlighter-opts-for-slop-with-order 353 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill"}] 354 | {}) 355 | "Mill Token")))) 356 | (is (= 1 (count ((phrases/highlighter [{:text "Token Mill"}] 357 | {:slop 2}) 358 | "Mill Token")))) 359 | (is (= 0 (count ((phrases/highlighter [{:text "Token Mill"}] 360 | {:slop 2 :in-order? true}) 361 | "Mill Token"))))) 362 | 363 | (deftest ordered-phrase-with-on-term 364 | (is (= 1 (count ((phrases/highlighter [{:text "phrase" :slop 2 :in-order? true}]) 365 | "prefix phrase suffix"))))) 366 | 367 | (deftest ordered-phrase-with-two-equal-terms-in-front-and-end 368 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase" :slop 2 :in-order? true}]) 369 | "prefix phrase phrase suffix")] 370 | (is (= 1 (count anns))) 371 | (is (= "phrase phrase" (:text ann))) 372 | (is (= 7 (:begin-offset ann))) 373 | (is (= 20 (:end-offset ann)))) 374 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase and phrase" :slop 2 :in-order? 
true}]) 375 | "prefix phrase and phrase suffix")] 376 | (is (= 1 (count anns))) 377 | (is (= "phrase and phrase" (:text ann))) 378 | (is (= 7 (:begin-offset ann))) 379 | (is (= 24 (:end-offset ann))))) 380 | 381 | (deftest ordered-ambigous-phrase 382 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? true}]) 383 | "prefix phrase phrase end suffix")] 384 | (is (= 1 (count anns))) 385 | (is (= "phrase phrase end" (:text ann))) 386 | (is (= 7 (:begin-offset ann))) 387 | (is (= 24 (:end-offset ann)))) 388 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? true}]) 389 | "prefix phrase phrase end end suffix")] 390 | (is (= 1 (count anns))) 391 | (is (= "phrase phrase end" (:text ann))) 392 | (is (= 7 (:begin-offset ann))) 393 | (is (= 24 (:end-offset ann)))) 394 | (let [[ann1 & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 1 :in-order? true}]) 395 | "prefix phrase phrase a phrase end suffix")] 396 | (is (= 1 (count anns))) 397 | (is (= "phrase a phrase end" (:text ann1))) 398 | (is (= 14 (:begin-offset ann1))) 399 | (is (= 33 (:end-offset ann1)))) 400 | 401 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase end end" :slop 1 :in-order? true}]) 402 | "prefix phrase phrase end end suffix")] 403 | (is (= 1 (count anns))) 404 | (is (= "phrase phrase end end" (:text ann))) 405 | (is (= 7 (:begin-offset ann))) 406 | (is (= 28 (:end-offset ann)))) 407 | (let [[ann & _ :as anns] ((phrases/highlighter [{:text "phrase end end" :slop 1 :in-order? true}]) 408 | "prefix phrase phrase end end X X phrase phrase end end suffix")] 409 | (is (= 2 (count anns))) 410 | (is (= "phrase phrase end end" (:text ann))) 411 | (is (= 7 (:begin-offset ann))) 412 | (is (= 28 (:end-offset ann))))) 413 | 414 | (deftest complicated-ordering 415 | (let [[ann1 ann2 & _ :as anns] ((phrases/highlighter [{:text "phrase phrase end" :slop 10 :in-order? 
true}]) 416 | "prefix phrase phrase end phrase end suffix")] 417 | (is (= 2 (count anns))) 418 | (is (= "phrase phrase end" (:text ann1))) 419 | (is (= 7 (:begin-offset ann1))) 420 | (is (= 24 (:end-offset ann1))) 421 | ;; FIXME: this highlight is not correct 422 | (is (= "phrase end" (:text ann2))) 423 | (is (= 25 (:begin-offset ann2))) 424 | (is (= 35 (:end-offset ann2))))) 425 | 426 | (deftest preserve-order-edge-cases 427 | (testing "multiple match of a phrase" 428 | (is (= 3 (count ((phrases/highlighter 429 | [{:text "Token Mill" :slop 3 :in-order? false}]) 430 | "Prefix Token Mill Infix Token a Mill Suffix")))) 431 | (is (= 2 (count ((phrases/highlighter 432 | [{:text "Token Mill" :slop 1 :in-order? true}]) 433 | "Prefix Token Mill Infix Token a Mill Suffix")))) 434 | (is (= 1 (count ((phrases/highlighter 435 | [{:text "Token Mill" :slop 0 :in-order? true}]) 436 | "Prefix Token Mill Infix Token a Mill Suffix")))) 437 | (let [highlights ((phrases/highlighter 438 | [{:text "Token Mill" :slop 1 :in-order? 
true :meta {:test "test"}}]) 439 | "Prefix Token Mill Infix Token a Mill Suffix")] 440 | (is (= 2 (count highlights))) 441 | (let [first-highlight (apply min-key :begin-offset highlights)] 442 | (is (= "Token Mill" (:text first-highlight))) 443 | (is (= 7 (:begin-offset first-highlight))) 444 | (is (= 17 (:end-offset first-highlight))) 445 | (is (= {"test" "test"} (:meta first-highlight))) 446 | (is (= "PHRASE" (:type first-highlight)))) 447 | (let [second-highlight (apply max-key :begin-offset highlights)] 448 | (is (= "Token a Mill" (:text second-highlight))) 449 | (is (= 24 (:begin-offset second-highlight))) 450 | (is (= 36 (:end-offset second-highlight))) 451 | (is (= {"test" "test"} (:meta second-highlight))) 452 | (is (= "PHRASE" (:type second-highlight))))))) 453 | 454 | (deftest annotator-options 455 | (testing "case sensitivity flag" 456 | (let [txt "prefix PHRASE suffix" 457 | dictionary [{:text "phrase"}] 458 | highlighter-fn (phrases/highlighter dictionary) 459 | anns (highlighter-fn txt)] 460 | (is (empty? anns))) 461 | (let [txt "prefix PHRASE suffix" 462 | dictionary [{:text "phrase"}] 463 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false}) 464 | anns (highlighter-fn txt)] 465 | (is (= 1 (count anns))))) 466 | 467 | (testing "ascii folding flag" 468 | (let [txt "prefix PHRÄSE suffix" 469 | dictionary [{:text "phrase"}] 470 | highlighter-fn (phrases/highlighter dictionary) 471 | anns (highlighter-fn txt)] 472 | (is (empty? anns))) 473 | (let [txt "prefix PHRÄSE suffix" 474 | dictionary [{:text "phrase"}] 475 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false 476 | :ascii-fold? true}) 477 | anns (highlighter-fn txt)] 478 | (is (= 1 (count anns))))) 479 | 480 | (testing "stemming options" 481 | (let [txt "prefix PHRASES suffix" 482 | dictionary [{:text "phrase"}] 483 | highlighter-fn (phrases/highlighter dictionary) 484 | anns (highlighter-fn txt)] 485 | (is (empty? 
anns))) 486 | (let [txt "prefix PHRASES suffix" 487 | dictionary [{:text "phrase"}] 488 | highlighter-fn (phrases/highlighter dictionary {:case-sensitive? false 489 | :stem? true 490 | :stemmer :english}) 491 | anns (highlighter-fn txt)] 492 | (is (= 1 (count anns)))))) 493 | 494 | (deftest phrases-with-edit-distance 495 | (let [txt "prefix tokne mill suffix" 496 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1}] 497 | highlighter-fn (phrases/highlighter dictionary {}) 498 | [ann1 :as anns] (highlighter-fn txt)] 499 | (is (= 1 (count anns))) 500 | (is (= "tokne mill" (:text ann1)))) 501 | (let [txt "prefix mill tokne suffix" 502 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1}] 503 | highlighter-fn (phrases/highlighter dictionary {}) 504 | anns (highlighter-fn txt)] 505 | (is (empty? anns))) 506 | (let [txt "prefix tokne mill suffix" 507 | dictionary [{:text "mill token" :fuzzy? true :fuzziness 1 :in-order? true}] 508 | highlighter-fn (phrases/highlighter dictionary {}) 509 | anns (highlighter-fn txt)] 510 | (is (empty? anns))) 511 | (let [txt "prefix mill tokne suffix" 512 | dictionary [{:text "token mill" :fuzzy? true :fuzziness 1 :in-order? false}] 513 | highlighter-fn (phrases/highlighter dictionary {}) 514 | [ann1 :as anns] (highlighter-fn txt)] 515 | (is (= 1 (count anns))) 516 | (is (= "mill tokne" (:text ann1)))) 517 | (let [txt "prefix tokne uab mill suffix" 518 | dictionary [{:text "mill token" :fuzzy? true :fuzziness 1 :in-order? false}] 519 | highlighter-fn (phrases/highlighter dictionary {}) 520 | anns (highlighter-fn txt)] 521 | (is (empty? 
anns)))) 522 | -------------------------------------------------------------------------------- /test/beagle/readers_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.readers-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [clojure.spec.alpha :as s] 4 | [beagle.schema :as sch] 5 | [beagle.readers :as readers]) 6 | (:import (java.io ByteArrayInputStream))) 7 | 8 | (deftest json-reader 9 | (is (not (nil? (s/conform ::sch/dictionary 10 | (readers/read-json 11 | (ByteArrayInputStream. 12 | (.getBytes "[{\"text\": \"moo\"}]"))))))) 13 | (is (not (nil? (s/conform ::sch/dictionary 14 | (readers/read-json "test/resources/dict.json")))))) 15 | 16 | (deftest csv-file-reader 17 | (is (not (nil? (s/conform ::sch/dictionary (readers/read-csv "test/resources/dict.csv")))))) 18 | 19 | (deftest edn-file-reader 20 | (is (not (nil? (s/conform ::sch/dictionary (readers/read-edn "test/resources/dict.edn")))))) 21 | -------------------------------------------------------------------------------- /test/beagle/text_analysis_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.text-analysis-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.text-analysis :as text-analysis])) 4 | 5 | (deftest field-name-construction 6 | (is (= "text.standard-tokenizer" 7 | (text-analysis/get-field-name {} {}))) 8 | (is (= "text.standard-tokenizer" 9 | (text-analysis/get-field-name {:case-sensitive? true} {}))) 10 | (is (= "text.standard-tokenizer.lowercased" 11 | (text-analysis/get-field-name {:case-sensitive? false} {}))) 12 | (is (= "text.standard-tokenizer.ascii-folded" 13 | (text-analysis/get-field-name {:ascii-fold? true} {}))) 14 | (is (= "text.standard-tokenizer.stemmed-english" 15 | (text-analysis/get-field-name {:stem? true} {}))) 16 | (is (= "text.standard-tokenizer.stemmed-lithuanian" 17 | (text-analysis/get-field-name {:stem? 
true :stemmer :lithuanian} {}))) 18 | (is (= "text.standard-tokenizer.ascii-folded-lowercased-stemmed-lithuanian" 19 | (text-analysis/get-field-name {:ascii-fold? true 20 | :case-sensitive? false 21 | :stem? true 22 | :stemmer :lithuanian} {})))) 23 | 24 | (deftest token-stream 25 | (let [txt "These are tests."] 26 | (is (= ["These" "are" "tests"] 27 | (text-analysis/text->token-strings 28 | txt (text-analysis/get-string-analyzer {:case-sensitive? true} {})))) 29 | (is (= ["these" "are" "tests"] 30 | (text-analysis/text->token-strings 31 | txt (text-analysis/get-string-analyzer {:case-sensitive? false} {})))) 32 | (is (= ["these" "are" "tests"] 33 | (text-analysis/text->token-strings 34 | txt (text-analysis/get-string-analyzer {:case-sensitive? false 35 | :ascii-fold? true} {})))) 36 | (is (= ["these" "are" "test"] 37 | (text-analysis/text->token-strings 38 | txt (text-analysis/get-string-analyzer {:case-sensitive? false 39 | :ascii-fold? true 40 | :stem? true} {})))) 41 | ; this one is surprising but correct 42 | (is (= ["these" "are" "tests."] 43 | (text-analysis/text->token-strings 44 | txt (text-analysis/get-string-analyzer {:case-sensitive? false 45 | :ascii-fold? true 46 | :stem? true} {:tokenizer :whitespace})))))) 47 | -------------------------------------------------------------------------------- /test/beagle/validator_test.clj: -------------------------------------------------------------------------------- 1 | (ns beagle.validator-test 2 | (:require [clojure.test :refer [deftest is]] 3 | [beagle.validator :as validator])) 4 | 5 | (deftest basic-cases 6 | (is (seq (validator/valid-dictionary? [{:text "test" :id "1" :meta {:test "test"} :type "CUSTOM"}]))) 7 | (is (nil? (validator/valid-dictionary? 
[{:id "1" :meta {:test "test"} :type "CUSTOM"}])))) 8 | -------------------------------------------------------------------------------- /test/resources/dict.csv: -------------------------------------------------------------------------------- 1 | text,id,synonyms,meta,case-sensitive?,ascii-fold?,type 2 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,true,TEST 3 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,true 4 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,false 5 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,FALSE 6 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true,NOT_BOOL 7 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2,true 8 | test-dictionary-item,id1,syn1;syn2,k1;v1;k2;v2 9 | test-dictionary-item,id1,syn1;syn2,k;v 10 | test-dictionary-item,id1,syn1;syn2,k 11 | test-dictionary-item,id1,syn1;syn2, 12 | test-dictionary-item,id1,syn1;syn2 13 | test-dictionary-item,id1,syn1; 14 | test-dictionary-item,id1,syn1 15 | test-dictionary-item,id1, 16 | test-dictionary-item,id1 17 | test-dictionary-item 18 | test-dictionary-item,,,,,,TEST 19 | -------------------------------------------------------------------------------- /test/resources/dict.edn: -------------------------------------------------------------------------------- 1 | [{:text "test text" 2 | :id "test-id" 3 | :case-sensitive? true 4 | :ascii-fold? 
true 5 | :meta {:email "test@example.com"}}] 6 | -------------------------------------------------------------------------------- /test/resources/dict.json: -------------------------------------------------------------------------------- 1 | [{"text": "test text", 2 | "id": "test-id", 3 | "case-sensitive?": true, 4 | "ascii-fold?": true, 5 | "meta": {"email": "test@example.com"}}] -------------------------------------------------------------------------------- /test/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | %d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | --------------------------------------------------------------------------------