├── .coveralls.yml
├── .github
└── FUNDING.yml
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.MD
├── pom.xml
└── src
└── main
├── cpp
├── BUILD.gn
├── Cld3LangDetector.cc
├── Cld3LangDetector.cpp
└── Cld3LangDetector.h
├── kotlin
└── com
│ └── b8
│ ├── LangDetect.kt
│ ├── NativeLangDetector.kt
│ └── OperationSystemCoordinator.kt
├── lib
├── osx
│ ├── libc++.dylib
│ ├── libnative.dylib
│ └── libprotobuf_lite.dylib
└── unix
│ ├── libc++.so
│ ├── libnative.so
│ └── libprotobuf_lite.so
└── test
└── kotlin
└── com
└── b8
├── LangDetectTest.kt
└── OperationSystemCoordinatorTest.kt
/.coveralls.yml:
--------------------------------------------------------------------------------
1 | service_name: travis-pro
2 | repo_token: hadEbsfN3Hm0dspJLYYJkIj838UiWEg43
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [ntedgi]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | pom.xml.tag
3 | pom.xml.releaseBackup
4 | pom.xml.versionsBackup
5 | pom.xml.next
6 | release.properties
7 | dependency-reduced-pom.xml
8 | buildNumber.properties
9 | .mvn/timing.properties
10 | .mvn/wrapper/maven-wrapper.jar
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 | os:
3 | - osx
4 | - linux
5 |
6 | osx_image: xcode8
7 |
8 | after_success:
9 | - mvn clean test jacoco:report
10 | - bash <(curl -s https://codecov.io/bash) -t 187da4be-aeee-4681-ae78-e199dce98794
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Naor Shlomo Tedgi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
1 | [](https://travis-ci.com/ntedgi/cld3-kotlin) [](https://codecov.io/gh/ntedgi/cld3-kotlin)
2 | [](https://codebeat.co/projects/github-com-ntedgi-cld3-kotlin-master) [](https://jitpack.io/#ntedgi/cld3-kotlin)
3 |
4 | # cld3-kotlin
5 | WIP - Kotlin [CLD3](https://github.com/google/cld3#model) - Google's Compact Language Detector 3
6 |
7 | Bridge from c++ to Kotlin using [Java Abstracted Foreign Function Layer](https://github.com/jnr/jnr-ffi)
8 |
9 | ## Operating Systems Support:
10 |
11 | |Job | OS | State| Shared Objects|
12 | |:------------- | :-------------:| :-------------:| :-------------:|
13 | |[47.1](https://travis-ci.com/ntedgi/cld3-kotlin/builds/114315426) | macOS | passed| [dylib](https://github.com/ntedgi/cld3-kotlin/tree/master/src/main/lib/osx)
14 | |[47.2](https://travis-ci.com/ntedgi/cld3-kotlin/builds/114315426) | Linux | passed|[so](https://github.com/ntedgi/cld3-kotlin/tree/master/src/main/lib/unix)
15 | | - | windows |not supported |-|
16 |
17 |
18 |
19 | ## Usage Examples:
20 |
21 |
22 | ### add maven dependencies
23 |
24 | ```maven
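25 | <repositories>
26 |   <repository>
27 |     <id>jitpack.io</id>
28 |     <url>https://jitpack.io</url>
29 |   </repository>
30 | </repositories>
31 | ...
32 | <dependency>
33 |   <groupId>com.github.ntedgi</groupId>
34 |   <artifactId>cld3-kotlin</artifactId>
35 |   <version>1.0.2</version>
36 | </dependency>
37 |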
38 | ```
39 |
40 | ### download the [OS shared objects](https://github.com/ntedgi/cld3-kotlin/tree/master/src/main/lib) and add them under src/main/lib/(os-name), where os-name is `osx` or `unix`
41 |
42 |
43 | ```kotlin
44 | val ld = LangDetect()
45 | val englishText = "This piece of text is in English";
46 | var result = ld.detect(englishText)
47 | assert(result.language == "en") // detect() returns the language code; use changeLanguageCodeToFullName("en") for "English"
48 | assert(result.isReliable)
49 | assert(result.proportion == 1f)
50 | ```
51 |
52 |
53 | ```kotlin
54 | val ld = LangDetect()
55 | val englishBulgarianText = "This piece of text is in English Този текст е на Български";
56 | val results = ld.findTopNMostFreqLangs(englishBulgarianText, 3)
57 | val languages = results.map { it.language }
58 | assert(languages.size == 3)
59 | assert(languages.contains("en"))
60 | assert(languages.contains("bg"))
61 | assert(languages.contains("und"))
62 | ```
63 |
64 |
65 |
66 |
67 |
68 | ## The Bridge Interface Implementation:
69 |
70 | ## from (C++)
71 | ```cpp
72 | std::vector<Result> FindTopNMostFreqLangs(const string &text, int num_langs);
73 | ```
74 | ```cpp
75 | Result FindLanguage(const string &text);
76 | ```
77 |
78 | ## to (Kotlin)
79 | ```kotlin
80 | fun findTopNMostFreqLangs(text: String, n: Int): List<LangDetectResponse>
81 | ```
82 |
83 | ```kotlin
84 | fun detect(text: String): LangDetectResponse
85 | ```
86 |
87 | ```kotlin
88 | data class LangDetectResponse(
89 | val probability: Float,
90 | val proportion: Float,
91 | val isReliable: Boolean,
92 | val language: String
93 | )
94 | ```
95 | ---
96 | ```ts
97 | if (this.repo.isAwesome || this.repo.isHelpful) {
98 | Star(this.repo);
99 | }
100 | ```
101 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 | 4.0.0
6 |
7 | com.b8
8 | cld3-kotlin
9 | 1.0
10 | jar
11 | cld3-kotlin
12 |
13 |
14 |
15 |
16 | UTF-8
17 | 1.6.0
18 | official
19 | 4.13.1
20 |
21 |
22 |
23 |
24 | com.github.jnr
25 | jnr-ffi
26 | 2.1.9
27 |
28 |
29 |
30 | org.jetbrains.kotlin
31 | kotlin-stdlib
32 | ${kotlin.version}
33 |
34 |
35 | org.jetbrains.kotlin
36 | kotlin-test-junit
37 | ${kotlin.version}
38 | test
39 |
40 |
41 | junit
42 | junit
43 | ${junit.version}
44 | test
45 |
46 |
47 |
48 |
49 | src/main/kotlin
50 | src/main/test/kotlin
51 |
52 |
53 |
54 |
55 |
56 | org.jetbrains.kotlin
57 | kotlin-maven-plugin
58 | ${kotlin.version}
59 |
60 |
61 | compile
62 | compile
63 |
64 | compile
65 |
66 |
67 |
68 | test-compile
69 | test-compile
70 |
71 | test-compile
72 |
73 |
74 |
75 |
76 |
77 | org.jacoco
78 | jacoco-maven-plugin
79 | 0.8.4
80 |
81 |
82 | prepare-agent
83 |
84 | prepare-agent
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
--------------------------------------------------------------------------------
/src/main/cpp/BUILD.gn:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #==============================================================================
15 |
16 | import("//third_party/protobuf/proto_library.gni")
17 |
18 | proto_library("protos") {
19 | sources = [
20 | "feature_extractor.proto",
21 | "sentence.proto",
22 | "task_spec.proto",
23 | ]
24 | proto_out_dir = "cld_3/protos"
25 | }
26 |
27 | static_library("cld_3") {
28 | sources = [
29 | "base.cc",
30 | "base.h",
31 | "casts.h",
32 | "embedding_feature_extractor.cc",
33 | "embedding_feature_extractor.h",
34 | "embedding_network.cc",
35 | "embedding_network.h",
36 | "embedding_network_params.h",
37 | "feature_extractor.cc",
38 | "feature_extractor.h",
39 | "feature_types.cc",
40 | "feature_types.h",
41 | "float16.h",
42 | "fml_parser.cc",
43 | "fml_parser.h",
44 | "language_identifier_features.cc",
45 | "language_identifier_features.h",
46 | "lang_id_nn_params.cc",
47 | "lang_id_nn_params.h",
48 | "nnet_language_identifier.cc",
49 | "nnet_language_identifier.h",
50 | "registry.cc",
51 | "registry.h",
52 | "relevant_script_feature.cc",
53 | "relevant_script_feature.h",
54 | "script_detector.h",
55 | "sentence_features.cc",
56 | "sentence_features.h",
57 | "simple_adder.h",
58 | "script_span/fixunicodevalue.cc",
59 | "script_span/fixunicodevalue.h",
60 | "script_span/generated_entities.cc",
61 | "script_span/generated_ulscript.cc",
62 | "script_span/generated_ulscript.h",
63 | "script_span/getonescriptspan.cc",
64 | "script_span/getonescriptspan.h",
65 | "script_span/integral_types.h",
66 | "script_span/offsetmap.cc",
67 | "script_span/offsetmap.h",
68 | "script_span/port.h",
69 | "script_span/stringpiece.h",
70 | "script_span/text_processing.cc",
71 | "script_span/text_processing.h",
72 | "script_span/utf8acceptinterchange.h",
73 | "script_span/utf8prop_lettermarkscriptnum.h",
74 | "script_span/utf8repl_lettermarklower.h",
75 | "script_span/utf8scannot_lettermarkspecial.h",
76 | "script_span/utf8statetable.cc",
77 | "script_span/utf8statetable.h",
78 | "task_context.cc",
79 | "task_context.h",
80 | "task_context_params.cc",
81 | "task_context_params.h",
82 | "unicodetext.cc",
83 | "unicodetext.h",
84 | "utils.cc",
85 | "utils.h",
86 | "workspace.cc",
87 | "workspace.h"
88 | ]
89 | public_deps = [
90 | "//third_party/protobuf:protobuf_lite",
91 | ":protos",
92 | ]
93 | }
94 |
95 | # The two executables directly below are enabled. The test executables at the end of this file are commented out; uncomment them to use.
96 |
97 | executable("language_identifier_main") {
98 | sources = [
99 | "language_identifier_main.cc",
100 | ]
101 | deps = [
102 | ":cld_3",
103 | ]
104 | }
105 |
106 |
107 | executable("main") {
108 | sources = [
109 | "main.cc",
110 | "Cld3LangDetector.cc"
111 | ]
112 | deps = [
113 | ":cld_3",
114 | ]
115 | }
116 |
117 |
118 | shared_library("native2") {
119 | sources = [
120 | "Cld3LangDetector.cc"
121 | ]
122 | deps = [
123 | ":cld_3",
124 | ]
125 | }
126 |
127 |
128 | shared_library("native") {
129 | sources = [
130 | "base.cc",
131 | "base.h",
132 | "casts.h",
133 | "embedding_feature_extractor.cc",
134 | "embedding_feature_extractor.h",
135 | "embedding_network.cc",
136 | "embedding_network.h",
137 | "embedding_network_params.h",
138 | "feature_extractor.cc",
139 | "feature_extractor.h",
140 | "feature_types.cc",
141 | "feature_types.h",
142 | "float16.h",
143 | "fml_parser.cc",
144 | "fml_parser.h",
145 | "language_identifier_features.cc",
146 | "language_identifier_features.h",
147 | "lang_id_nn_params.cc",
148 | "lang_id_nn_params.h",
149 | "nnet_language_identifier.cc",
150 | "nnet_language_identifier.h",
151 | "registry.cc",
152 | "registry.h",
153 | "relevant_script_feature.cc",
154 | "relevant_script_feature.h",
155 | "script_detector.h",
156 | "sentence_features.cc",
157 | "sentence_features.h",
158 | "simple_adder.h",
159 | "script_span/fixunicodevalue.cc",
160 | "script_span/fixunicodevalue.h",
161 | "script_span/generated_entities.cc",
162 | "script_span/generated_ulscript.cc",
163 | "script_span/generated_ulscript.h",
164 | "script_span/getonescriptspan.cc",
165 | "script_span/getonescriptspan.h",
166 | "script_span/integral_types.h",
167 | "script_span/offsetmap.cc",
168 | "script_span/offsetmap.h",
169 | "script_span/port.h",
170 | "script_span/stringpiece.h",
171 | "script_span/text_processing.cc",
172 | "script_span/text_processing.h",
173 | "script_span/utf8acceptinterchange.h",
174 | "script_span/utf8prop_lettermarkscriptnum.h",
175 | "script_span/utf8repl_lettermarklower.h",
176 | "script_span/utf8scannot_lettermarkspecial.h",
177 | "script_span/utf8statetable.cc",
178 | "script_span/utf8statetable.h",
179 | "task_context.cc",
180 | "task_context.h",
181 | "task_context_params.cc",
182 | "task_context_params.h",
183 | "unicodetext.cc",
184 | "unicodetext.h",
185 | "utils.cc",
186 | "utils.h",
187 | "workspace.cc",
188 | "workspace.h",
189 | "Cld3LangDetector.cc",
190 | "Cld3LangDetector.h"
191 | ]
192 | public_deps = [
193 | "//third_party/protobuf:protobuf_lite",
194 | ":protos",
195 | ]
196 | }
197 |
198 | #executable("getonescriptspan_test") {
199 | # sources = [
200 | # "script_span/getonescriptspan_test.cc",
201 | # ]
202 | # deps = [
203 | # ":cld_3",
204 | # ]
205 | #}
206 |
207 | #executable("language_identifier_features_test") {
208 | # sources = [
209 | # "language_identifier_features_test.cc",
210 | # ]
211 | # deps = [
212 | # ":cld_3",
213 | # ]
214 | #}
215 |
216 | #executable("nnet_lang_id_test") {
217 | # sources = [
218 | # "nnet_lang_id_test.cc",
219 | # "nnet_lang_id_test_data.cc",
220 | # "nnet_lang_id_test_data.h",
221 | # ]
222 | # deps = [
223 | # ":cld_3",
224 | # ]
225 | #}
226 |
--------------------------------------------------------------------------------
/src/main/cpp/Cld3LangDetector.cc:
--------------------------------------------------------------------------------
1 | //
2 | // Created by naor on 5/29/19.
3 | //
4 |
5 | #include "Cld3LangDetector.h"
6 | #include "../nnet_language_identifier.h"
7 |
8 | using namespace chrome_lang_id;
9 |
10 | long create() {
11 | return (long) new NNetLanguageIdentifier();
12 | }
13 |
14 | long create1(int min_num_bytes, int max_num_bytes) {
15 | return (long) new NNetLanguageIdentifier(min_num_bytes, max_num_bytes);
16 | }
17 |
18 | void deallocate(long ptr) {
19 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr;
20 | delete nptr;
21 | }
22 |
23 | void detect(long ptr, const char *text, const char *into) {
24 |
25 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr;
26 | NNetLanguageIdentifier::Result res = nptr->FindLanguage(text);
27 |
28 | long current = (long) into;
29 |
30 | *((float *) current) = res.probability;
31 | current += sizeof(float);
32 |
33 | *((float *) current) = res.proportion;
34 | current += sizeof(float);
35 | *((short *) current) = res.is_reliable;
36 | current += sizeof(short);
37 | *((int *) current) = res.language.size();
38 | current += sizeof(int);
39 |
40 | memccpy(reinterpret_cast(current), res.language.c_str(), res.language.size() + 1, sizeof(res.language));
41 |
42 |
43 | }
44 |
45 |
46 | void findTopNMostFreqLangs(long ptr, const char *text, const char *into, int num_langs) {
47 |
48 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr;
49 |
50 | const std::vector results =
51 | nptr->FindTopNMostFreqLangs(text, num_langs);
52 |
53 |
54 | long current = (long) into;
55 |
56 |
57 | int size = (int )results.size();
58 | *((int *) current) = size;
59 | current += sizeof(int);
60 |
61 |
62 | for (const NNetLanguageIdentifier::Result &res : results) {
63 |
64 | *((float *) current) = res.probability;
65 | current += sizeof(float);
66 |
67 | *((float *) current) = res.proportion;
68 | current += sizeof(float);
69 |
70 | *((short *) current) = res.is_reliable;
71 | current += sizeof(short);
72 |
73 | *((int *) current) = res.language.size();
74 | current += sizeof(int);
75 |
76 | memccpy(reinterpret_cast(current), res.language.c_str(), res.language.size() + 1, sizeof(res.language));
77 | current += sizeof(char) * res.language.size();
78 | }
79 |
80 |
81 | }
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/src/main/cpp/Cld3LangDetector.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by naor on 5/29/19.
3 | //
4 |
5 | #include "Cld3LangDetector.h"
6 | #include "../nnet_language_identifier.h"
7 |
8 | using namespace chrome_lang_id;
9 |
10 | long create() {
11 | return (long) new NNetLanguageIdentifier();
12 | }
13 |
14 | long create1(int min_num_bytes, int max_num_bytes) {
15 | return (long) new NNetLanguageIdentifier(min_num_bytes, max_num_bytes);
16 | }
17 |
18 | void deallocate(long ptr) {
19 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr;
20 | delete nptr;
21 | }
22 |
23 | void detect(long ptr, const char *text, const char *into) {
24 |
25 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr;
26 | NNetLanguageIdentifier::Result res = nptr->FindLanguage(text);
27 |
28 | long current = (long) into;
29 |
30 | *((float *) current) = res.probability;
31 | current += sizeof(float);
32 |
33 | *((float *) current) = res.proportion;
34 | current += sizeof(float);
35 | *((short *) current) = res.is_reliable;
36 | current += sizeof(short);
37 | *((int *) current) = res.language.size();
38 | current += sizeof(int);
39 |
40 | memccpy(reinterpret_cast(current), res.language.c_str(), res.language.size() + 1, sizeof(res.language));
41 |
42 |
43 | }
44 |
45 |
46 | void findTopNMostFreqLangs(long ptr, const char *text, const char *into, int num_langs) {
47 |
48 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr;
49 |
50 | const std::vector results =
51 | nptr->FindTopNMostFreqLangs(text, num_langs);
52 |
53 |
54 | long current = (long) into;
55 |
56 |
57 | int size = (int )results.size();
58 | *((int *) current) = size;
59 | current += sizeof(int);
60 |
61 |
62 | for (const NNetLanguageIdentifier::Result &res : results) {
63 |
64 | *((float *) current) = res.probability;
65 | current += sizeof(float);
66 |
67 | *((float *) current) = res.proportion;
68 | current += sizeof(float);
69 |
70 | *((short *) current) = res.is_reliable;
71 | current += sizeof(short);
72 |
73 | *((int *) current) = res.language.size();
74 | current += sizeof(int);
75 |
76 | memccpy(reinterpret_cast(current), res.language.c_str(), res.language.size() + 1, sizeof(res.language));
77 | current += sizeof(char) * res.language.size();
78 | }
79 |
80 |
81 | }
82 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/src/main/cpp/Cld3LangDetector.h:
--------------------------------------------------------------------------------
1 | //
2 | // Created by naor on 5/29/19.
3 | //
4 |
// C-linkage entry points of the native language-detector library. Detector
// instances are passed across the boundary as integer handles.
//
// The include guard deliberately avoids leading/double underscores, which are
// reserved for the C++ implementation.
#ifndef CLD3_LANG_DETECTOR_H_
#define CLD3_LANG_DETECTOR_H_

// Exports a symbol from the shared library regardless of the default
// visibility the library is compiled with.
#define CLD3_EXPORT __attribute__((visibility("default")))

extern "C" {

// Creates a detector and returns its handle.
CLD3_EXPORT long create();

// Creates a detector with the given byte limits and returns its handle.
CLD3_EXPORT long create1(int min_num_bytes, int max_num_bytes);

// Destroys the detector behind `ptr`.
CLD3_EXPORT void deallocate(long ptr);

// Detects the language of `text` and writes the serialised result to `into`.
CLD3_EXPORT void detect(long ptr, const char *text, const char *into);

// Detects up to `num_langs` languages in `text` and writes the serialised
// results to `into`.
CLD3_EXPORT void findTopNMostFreqLangs(long ptr, const char *text, const char *into, int num_langs);

}

#endif  // CLD3_LANG_DETECTOR_H_
22 |
--------------------------------------------------------------------------------
/src/main/kotlin/com/b8/LangDetect.kt:
--------------------------------------------------------------------------------
1 | package com.b8
2 |
3 | import jnr.ffi.LibraryLoader
4 | import java.nio.ByteBuffer
5 | import java.nio.ByteOrder
6 | import java.nio.charset.StandardCharsets
7 |
/** Lower byte limit handed to the native `create1` call as `min_num_bytes`. */
const val MIN_NUM_OF_BYTES: Int = 0

/** Upper byte limit handed to the native `create1` call as `max_num_bytes`. */
const val MAX_NUM_OF_BYTES: Int = 10000

/** Size in bytes of the buffer that receives a single serialised detection result. */
const val SINGLE_LANGUAGE_DETECTION_BUFFER_SIZE: Int = 100

/** Size in bytes of the buffer that receives the serialised list of detection results. */
const val MULTI_LANGUAGE_DETECTION_BUFFER_SIZE: Int = 250

/** Name reported for language codes that have no entry in the full-name table. */
const val UNKNOWN_LANGUAGE: String = "UNKNOWN"
13 |
/**
 * One language-detection result as decoded from the native buffer.
 *
 * @property probability first float of the serialised record.
 * @property proportion second float of the serialised record.
 * @property isReliable `true` when the native reliability flag equals 1.
 * @property language language string exactly as written by the native side.
 */
data class LangDetectResponse(
    val probability: Float,
    val proportion: Float,
    val isReliable: Boolean,
    val language: String,
)
20 |
/**
 * Kotlin facade over the native language detector.
 *
 * Creating an instance loads the platform shared library and allocates a native
 * detector; [close] releases it. After [close] every detection call fails with
 * [IllegalStateException] instead of touching freed native memory.
 *
 * The class performs no synchronisation of its own.
 */
class LangDetect : AutoCloseable {

    private val operatingSystemCoordinator = OperationSystemCoordinator()
    private var ptr: Long = -1
    private var closed = false
    private val detector: NativeLangDetector

    init {
        val os = operatingSystemCoordinator.getRunningOperationSystem()
        System.load(operatingSystemCoordinator.getOperationSystemSharedObjects(os).libPath())
        detector = LibraryLoader.create(NativeLangDetector::class.java)
            .load("native")
        ptr = detector.create1(MIN_NUM_OF_BYTES, MAX_NUM_OF_BYTES)
    }

    /**
     * Detects the language of [text].
     *
     * @return the decoded result; its `language` is the string reported by the
     *   native side (pass it to [changeLanguageCodeToFullName] for a display name).
     * @throws IllegalStateException if this detector has been closed or the
     *   native output cannot be decoded.
     */
    fun detect(text: String): LangDetectResponse {
        ensureOpen()
        val buffer = ByteArray(SINGLE_LANGUAGE_DETECTION_BUFFER_SIZE)
        detector.detect(ptr, text, buffer)
        return extractResponseFromByteBuffer(wrapNativeOutput(buffer))
    }

    /**
     * Asks the native detector for the [n] most frequent languages in [text].
     *
     * The native call is best-effort: if it throws, the stack trace is printed
     * and an empty list is returned.
     *
     * @throws IllegalArgumentException if [n] results cannot fit in one buffer.
     * @throws IllegalStateException if this detector has been closed or the
     *   native output cannot be decoded.
     */
    fun findTopNMostFreqLangs(text: String, n: Int): List<LangDetectResponse> {
        ensureOpen()
        val buffer = ByteArray(multiLanguageBufferSize(n))
        try {
            detector.findTopNMostFreqLangs(ptr, text, buffer, n)
        } catch (e: Exception) {
            e.printStackTrace()
            return emptyList()
        }
        val byteBuffer = wrapNativeOutput(buffer)
        // The buffer starts with the number of serialised results that follow.
        val size = byteBuffer.int
        return List(size.coerceAtLeast(0)) { extractResponseFromByteBuffer(byteBuffer) }
    }

    /** Maps a language code to its English name, or [UNKNOWN_LANGUAGE] if the code is not known. */
    fun changeLanguageCodeToFullName(languageCode: String): String {
        return FULL_LANGUAGE_NAMES[languageCode] ?: UNKNOWN_LANGUAGE
    }

    /** Releases the native detector. Calling it more than once has no further effect. */
    override fun close() {
        if (closed) return
        closed = true
        detector.deallocate(ptr)
        ptr = -1
    }

    private fun ensureOpen() {
        check(!closed) { "LangDetect has already been closed" }
    }

    /** The native side writes raw machine values, so they are decoded in the platform's byte order. */
    private fun wrapNativeOutput(buffer: ByteArray): ByteBuffer =
        ByteBuffer.wrap(buffer).order(ByteOrder.nativeOrder())

    /**
     * Buffer size for [n] results: never smaller than
     * [MULTI_LANGUAGE_DETECTION_BUFFER_SIZE], and growing with [n] so that the
     * native writer cannot run past the end of the buffer.
     */
    private fun multiLanguageBufferSize(n: Int): Int {
        val needed = COUNT_HEADER_BYTES + n.toLong() * MAX_BYTES_PER_RESULT
        require(needed <= Int.MAX_VALUE) { "n=$n results cannot fit in a single buffer" }
        return maxOf(MULTI_LANGUAGE_DETECTION_BUFFER_SIZE.toLong(), needed).toInt()
    }

    /**
     * Decodes one record at the buffer's current position:
     * float, float, short, int length, then `length` bytes of UTF-8 text.
     */
    private fun extractResponseFromByteBuffer(byteBuffer: ByteBuffer): LangDetectResponse {
        val probability = byteBuffer.float
        val proportion = byteBuffer.float
        val isReliable = byteBuffer.short.toInt() == 1
        val sizeOfLanguage = byteBuffer.int
        check(sizeOfLanguage in 0..byteBuffer.remaining()) {
            "Corrupt native result: language length $sizeOfLanguage exceeds the remaining ${byteBuffer.remaining()} bytes"
        }
        val languageBytes = ByteArray(sizeOfLanguage)
        byteBuffer.get(languageBytes)
        val language = String(languageBytes, StandardCharsets.UTF_8)
        return LangDetectResponse(probability, proportion, isReliable, language)
    }

    private companion object {
        /** Bytes taken by the leading result count of a multi-result buffer. */
        private const val COUNT_HEADER_BYTES = 4

        /**
         * Bytes reserved per serialised result: 14 bytes of fixed fields plus the
         * language text, with generous slack because the native copy of the
         * language text may touch more bytes than the text's length.
         */
        private const val MAX_BYTES_PER_RESULT = 64

        /** Language code to English name; shared by all instances. */
        private val FULL_LANGUAGE_NAMES = mapOf(
            "eo" to "Esperanto",
            "co" to "Corsican",
            "eu" to "Basque",
            "ta" to "Tamil",
            "de" to "German",
            "mt" to "Maltese",
            "ps" to "Pushto",
            "te" to "Telugu",
            "su" to "Sundanese",
            "uz" to "Uzbek",
            "ne" to "Nepali",
            "nl" to "Dutch",
            "sw" to "Swahili",
            "sq" to "Albanian",
            "ja" to "Japanese",
            "no" to "Norwegian",
            "mn" to "Mongolian",
            "so" to "Somali",
            "ko" to "Korean",
            "kk" to "Kazakh",
            "sl" to "Slovenian",
            "mr" to "Marathi",
            "th" to "Thai",
            "zu" to "Zulu",
            "ml" to "Malayalam",
            "hr" to "Croatian",
            "bs" to "Bosnian",
            "lo" to "Lao",
            "sd" to "Sindhi",
            "cy" to "Welsh",
            "hy" to "Armenian",
            "uk" to "Ukrainian",
            "pt" to "Portuguese",
            "lv" to "Latvian",
            "cs" to "Czech",
            "vi" to "Vietnamese",
            "jv" to "Javanese",
            "be" to "Belarusian",
            "km" to "Khmer",
            "mk" to "Macedonian",
            "tr" to "Turkish",
            "fy" to "Western Frisian",
            "am" to "Amharic",
            "zh" to "Chinese",
            "da" to "Danish",
            "sv" to "Swedish",
            "fi" to "Finnish",
            "ht" to "Haitian; Haitian Creole",
            "af" to "Afrikaans",
            "la" to "Latin",
            "id" to "Indonesian",
            "sm" to "Samoan",
            "ca" to "Catalan",
            "ka" to "Georgian",
            "sr" to "Serbian",
            "it" to "Italian",
            "sk" to "Slovak",
            "ru" to "Russian",
            "bg" to "Bulgarian",
            "ny" to "Nyanja; Chichewa; Chewa",
            "fa" to "Persian",
            "gl" to "Galician",
            "et" to "Estonian",
            "ms" to "Malay",
            "gd" to "Gaelic; Scottish Gaelic",
            "ha" to "Hausa",
            "is" to "Icelandic",
            "ur" to "Urdu",
            "mi" to "Maori",
            "hi" to "Hindi",
            "bn" to "Bengali",
            "fr" to "French",
            "yi" to "Yiddish",
            "hu" to "Hungarian",
            "xh" to "Xhosa",
            "my" to "Burmese",
            "tg" to "Tajik",
            "ro" to "Romanian",
            "ar" to "Arabic",
            "lb" to "Luxembourgish",
            "kn" to "Kannada",
            "az" to "Azerbaijani",
            "si" to "Sinhala; Sinhalese",
            "ky" to "Kirghiz",
            "mg" to "Malagasy",
            "en" to "English",
            "gu" to "Gujarati",
            "es" to "Spanish",
            "pl" to "Polish",
            "ga" to "Irish",
            "lt" to "Lithuanian",
            "sn" to "Shona",
            "yo" to "Yoruba",
            "pa" to "Panjabi",
            "ku" to "Kurdish",
            "zh-Latn" to "Chinese-Latin",
            "bg-Latn" to "Bulgarian-Latin",
            "ru-Latn" to "Russian-Latin",
            "hi-Latn" to "Hindi-Latin",
            "ja-Latn" to "Japanese-Latin",
            "st" to "Sotho,Southern",
            "el" to "Greek",
            "el-Latn" to "Greek-Latin",
            "hmn" to "Hmong, Mong",
            "ig" to "Igbo",
            "iw" to "Hebrew",
            "fil" to "Filipino",
            "haw" to "Hawaiian",
            "ceb" to "Cebuano"
        )
    }
}
--------------------------------------------------------------------------------
/src/main/kotlin/com/b8/NativeLangDetector.kt:
--------------------------------------------------------------------------------
1 | package com.b8
2 |
3 | import jnr.ffi.annotations.In
4 | import jnr.ffi.annotations.Out
5 |
/**
 * Binding interface for the functions exported by the native library.
 * Method names and parameter order mirror the native declarations; detector
 * instances are referred to by the [Long] handle returned from the create calls.
 */
interface NativeLangDetector {

    /** Creates a native detector and returns its handle. */
    fun create(): Long

    /** Creates a native detector with the given byte limits and returns its handle. */
    fun create1(min_num_bytes: Int, max_num_bytes: Int): Long

    /** Releases the native detector behind [ptr]. */
    fun deallocate(ptr: Long)

    /** Runs detection on [text]; the serialised result is written into [into]. */
    fun detect(
        ptr: Long,
        @In text: String,
        @Out into: ByteArray
    )

    /** Runs detection of up to [num_langs] languages on [text]; the serialised results are written into [into]. */
    fun findTopNMostFreqLangs(
        ptr: Long,
        @In text: String,
        @Out into: ByteArray,
        num_langs: Int
    )
}
--------------------------------------------------------------------------------
/src/main/kotlin/com/b8/OperationSystemCoordinator.kt:
--------------------------------------------------------------------------------
1 | package com.b8
2 |
3 |
/** File name (without extension) of the native library shipped per platform. */
const val sharedLibrary = "libnative"

/**
 * Platform flavours of the native library.
 *
 * @property postfix file extension of a shared library on that platform.
 */
enum class SharedObject(private val postfix: String) {
    OSX("dylib"),
    UNIX("so"),
    WINDOWS("dll");

    /**
     * Absolute path of the library for this platform:
     * `<user.dir>/src/main/lib/<lower-cased constant name>/libnative.<postfix>`.
     */
    fun libPath(): String {
        // lowercase() is locale-invariant. The locale-sensitive toLowerCase()
        // turns "UNIX" into "unıx" under a Turkish default locale, which
        // produced a path that does not exist.
        val sharedObjectsFolder = "${System.getProperty("user.dir")}/src/main/lib/${name.lowercase()}/"
        return "$sharedObjectsFolder$sharedLibrary.$postfix"
    }
}
15 |
16 |
/**
 * Maps the JVM's `os.name` value to the flavour of native library to load.
 */
class OperationSystemCoordinator {

    private fun isWindows(OS: String): Boolean = "win" in OS

    // "darwin" is matched here as well; it also contains "win", so the macOS
    // check must run before the Windows check.
    private fun isMac(OS: String): Boolean = "mac" in OS || "darwin" in OS

    /**
     * Returns the `os.name` system property in lower case.
     *
     * Uses the locale-invariant `lowercase()`: the locale-sensitive variant
     * turns "Windows" into "wındows" under a Turkish default locale, which then
     * no longer matches "win".
     */
    fun getRunningOperationSystem(): String = System.getProperty("os.name").lowercase()

    /**
     * Chooses the library flavour for the operating-system name [OS].
     *
     * Matching is case-insensitive. Names containing "mac" or "darwin" map to
     * [SharedObject.OSX], names containing "win" map to [SharedObject.WINDOWS],
     * and everything else maps to [SharedObject.UNIX].
     */
    fun getOperationSystemSharedObjects(OS: String): SharedObject {
        val normalized = OS.lowercase()
        return when {
            isMac(normalized) -> SharedObject.OSX
            isWindows(normalized) -> SharedObject.WINDOWS
            else -> SharedObject.UNIX
        }
    }
}
32 |
33 |
34 |
--------------------------------------------------------------------------------
/src/main/lib/osx/libc++.dylib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ntedgi/cld3-kotlin/9ccd7c228d480f21e1ba5cb561cad8c9773d04b3/src/main/lib/osx/libc++.dylib
--------------------------------------------------------------------------------
/src/main/lib/osx/libnative.dylib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ntedgi/cld3-kotlin/9ccd7c228d480f21e1ba5cb561cad8c9773d04b3/src/main/lib/osx/libnative.dylib
--------------------------------------------------------------------------------
/src/main/lib/osx/libprotobuf_lite.dylib:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ntedgi/cld3-kotlin/9ccd7c228d480f21e1ba5cb561cad8c9773d04b3/src/main/lib/osx/libprotobuf_lite.dylib
--------------------------------------------------------------------------------
/src/main/lib/unix/libc++.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ntedgi/cld3-kotlin/9ccd7c228d480f21e1ba5cb561cad8c9773d04b3/src/main/lib/unix/libc++.so
--------------------------------------------------------------------------------
/src/main/lib/unix/libnative.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ntedgi/cld3-kotlin/9ccd7c228d480f21e1ba5cb561cad8c9773d04b3/src/main/lib/unix/libnative.so
--------------------------------------------------------------------------------
/src/main/lib/unix/libprotobuf_lite.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ntedgi/cld3-kotlin/9ccd7c228d480f21e1ba5cb561cad8c9773d04b3/src/main/lib/unix/libprotobuf_lite.so
--------------------------------------------------------------------------------
/src/main/test/kotlin/com/b8/LangDetectTest.kt:
--------------------------------------------------------------------------------
1 | package com.b8
2 |
3 | import org.junit.Test
4 |
5 |
/**
 * End-to-end tests that run the real native detector.
 *
 * Every test goes through [withDetector] so the native handle owned by
 * [LangDetect] is released even when an assertion fails.
 */
internal class LangDetectTest {

    /** Runs [block] with a fresh detector and always closes it afterwards. */
    private fun <T> withDetector(block: (LangDetect) -> T): T {
        val detector = LangDetect()
        try {
            return block(detector)
        } finally {
            detector.close()
        }
    }

    @Test
    fun `Simple Build And Predict On English Text`() {
        withDetector { ld ->
            val englishText = "This piece of text is in English"
            val result = ld.detect(englishText)
            assert(result.language == "en")
            assert(result.isReliable)
            assert(result.proportion == 1f)
        }
    }

    // NOTE: despite the test name, the sample sentence is Bulgarian and the
    // expected language code is "bg".
    @Test
    fun `Simple Build And Predict On Russian Text`() {
        withDetector { ld ->
            val bulgarianText = "Този текст е на Български."
            val result = ld.detect(bulgarianText)
            assert(result.language == "bg")
            assert(result.isReliable)
            assert(result.proportion == 1f)
        }
    }

    // NOTE: despite the test name, the non-English half of the sample is Bulgarian.
    @Test
    fun `Simple Build And Predict On English and Russian Text and n = 3`() {
        withDetector { ld ->
            val mixedText = "This piece of text is in English Този текст е на Български"
            val result = ld.findTopNMostFreqLangs(mixedText, 3)
            val languages = result.map { it.language }
            assert(languages.size == 3)
            assert(languages.contains("en"))
            assert(languages.contains("bg"))
            assert(languages.contains("und"))
        }
    }

    // NOTE: despite the test name, the non-English half of the sample is Bulgarian.
    @Test
    fun `Simple Build And Predict On English and Russian Text and n = 2`() {
        withDetector { ld ->
            val mixedText = "Този текст е на Български This piece of text is in English "
            val result = ld.findTopNMostFreqLangs(mixedText, 2)
            val languages = result.map { it.language }
            assert(languages.size == 2)
            assert(languages.contains("en"))
            assert(languages.contains("bg"))
        }
    }

    /** Detects the language of [text] and asserts that its full name equals [expectedLanguage]. */
    fun assertLanguageResult(text: String, expectedLanguage: String) {
        withDetector { ld ->
            val result = ld.detect(text).language
            val fullName = ld.changeLanguageCodeToFullName(result)
            assert(fullName == expectedLanguage)
        }
    }

    @Test
    fun `simple language detection hebrew`() {
        assertLanguageResult("מה המצב איתך ? מה המצב יא גבר", "Hebrew")
    }

    @Test
    fun `simple language detection english`() {
        assertLanguageResult("what does the fox says?", "English")
    }

    @Test
    fun `simple language detection spanish`() {
        assertLanguageResult("Hola cómo estás hoy?", "Spanish")
    }
}
--------------------------------------------------------------------------------
/src/main/test/kotlin/com/b8/OperationSystemCoordinatorTest.kt:
--------------------------------------------------------------------------------
1 | package com.b8
2 |
3 |
4 | import org.junit.Test
5 |
/** Unit tests for the mapping from operating-system names to [SharedObject] flavours. */
internal class OperationSystemCoordinatorTest {

    private val coordinator = OperationSystemCoordinator()

    /** The coordinator reports the lower-cased `os.name` system property. */
    @Test
    fun `Get Running Operation System`() {
        val reported = coordinator.getRunningOperationSystem()
        val expected = System.getProperty("os.name").toLowerCase()
        assert(reported == expected)
    }

    /** Each known operating-system name resolves to the expected library flavour. */
    @Test
    fun `Get Operation System Shared Objects On All Operation Systems`() {
        val expectedByOsName = mapOf(
            "windows 95" to SharedObject.WINDOWS,
            "windows vista" to SharedObject.WINDOWS,
            "windows xp" to SharedObject.WINDOWS,
            "windows 10" to SharedObject.WINDOWS,
            "linux" to SharedObject.UNIX,
            "mpe/ix" to SharedObject.UNIX,
            "freebsd" to SharedObject.UNIX,
            "irix" to SharedObject.UNIX,
            "digital unix" to SharedObject.UNIX,
            "unix" to SharedObject.UNIX,
            "mac os" to SharedObject.OSX,
            "sun os" to SharedObject.UNIX,
            "sunos" to SharedObject.UNIX,
            "solaris" to SharedObject.UNIX,
            "hp-ux" to SharedObject.UNIX,
            "aix" to SharedObject.UNIX
        )
        for ((osName, expected) in expectedByOsName) {
            val actual = coordinator.getOperationSystemSharedObjects(osName)
            assert(actual == expected) { "$osName not as Expected :Retrieve $actual" }
        }
    }
}
--------------------------------------------------------------------------------