├── .coveralls.yml ├── .github └── FUNDING.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.MD ├── pom.xml └── src └── main ├── cpp ├── BUILD.gn ├── Cld3LangDetector.cc ├── Cld3LangDetector.cpp └── Cld3LangDetector.h ├── kotlin └── com │ └── b8 │ ├── LangDetect.kt │ ├── NativeLangDetector.kt │ └── OperationSystemCoordinator.kt ├── lib ├── osx │ ├── libc++.dylib │ ├── libnative.dylib │ └── libprotobuf_lite.dylib └── unix │ ├── libc++.so │ ├── libnative.so │ └── libprotobuf_lite.so └── test └── kotlin └── com └── b8 ├── LangDetectTest.kt └── OperationSystemCoordinatorTest.kt /.coveralls.yml: -------------------------------------------------------------------------------- 1 | service_name: travis-pro 2 | repo_token: hadEbsfN3Hm0dspJLYYJkIj838UiWEg43 -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [ntedgi] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | pom.xml.tag 3 | pom.xml.releaseBackup 4 | pom.xml.versionsBackup 5 | pom.xml.next 6 | release.properties 7 | dependency-reduced-pom.xml 8 | 
buildNumber.properties 9 | .mvn/timing.properties 10 | .mvn/wrapper/maven-wrapper.jar -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | os: 3 | - osx 4 | - linux 5 | 6 | osx_image: xcode8 7 | 8 | after_success: 9 | - mvn clean test jacoco:report 10 | - bash <(curl -s https://codecov.io/bash) -t 187da4be-aeee-4681-ae78-e199dce98794 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Naor Shlomo Tedgi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.com/ntedgi/cld3-kotlin.svg?branch=master)](https://travis-ci.com/ntedgi/cld3-kotlin) [![codecov](https://codecov.io/gh/ntedgi/cld3-kotlin/branch/master/graph/badge.svg)](https://codecov.io/gh/ntedgi/cld3-kotlin) 2 | [![codebeat badge](https://codebeat.co/badges/a92dd040-a71c-4644-96a9-daad0aeb9ac4)](https://codebeat.co/projects/github-com-ntedgi-cld3-kotlin-master) [![](https://jitpack.io/v/ntedgi/cld3-kotlin.svg)](https://jitpack.io/#ntedgi/cld3-kotlin) 3 | 4 | # cld3-kotlin 5 | WIP - Kotlin [CLD3](https://github.com/google/cld3#model) - Google's Compact Language Detector 3 6 | 7 | Bridge from C++ to Kotlin using [Java Abstracted Foreign Function Layer](https://github.com/jnr/jnr-ffi) 8 | 9 | ## Operating System Support: 10 | 11 | |Job | OS | State| Shared Objects| 12 | |:------------- | :-------------:| :-------------:| :-------------:| 13 | |[47.1](https://travis-ci.com/ntedgi/cld3-kotlin/builds/114315426) | macOS | passed| [dylib](https://github.com/ntedgi/cld3-kotlin/tree/master/src/main/lib/osx) 14 | |[47.2](https://travis-ci.com/ntedgi/cld3-kotlin/builds/114315426) | Linux | passed|[so](https://github.com/ntedgi/cld3-kotlin/tree/master/src/main/lib/unix) 15 | | - | Windows |not supported |-| 16 | 17 | 18 | 19 | ## Usage Examples: 20 | 21 | 22 | ### Add the Maven dependencies 23 | 24 | ```xml 25 | <repositories> 26 | <repository> 27 | <id>jitpack.io</id> 28 | <url>https://jitpack.io</url> 29 | </repository> 30 | </repositories> 31 | ... 
32 | <dependency> 33 | <groupId>com.github.ntedgi</groupId> 34 | <artifactId>cld3-kotlin</artifactId> 35 | <version>1.0.2</version> 36 | </dependency> 37 | 38 | ``` 39 | 40 | ### Download the [OS shared objects](https://github.com/ntedgi/cld3-kotlin/tree/master/src/main/lib) and add them under src/main/lib/(os-name), where (os-name) is `osx` or `unix` 41 | 42 | 43 | ```kotlin 44 | val ld = LangDetect() 45 | val englishText = "This piece of text is in English"; 46 | var result = ld.detect(englishText) 47 | assert(result.language == "en") 48 | assert(result.isReliable) 49 | assert(result.proportion == 1f) 50 | ``` 51 | 52 | 53 | ```kotlin 54 | val ld = LangDetect() 55 | val englishBulgarianText = "This piece of text is in English Този текст е на Български"; 56 | val results = ld.findTopNMostFreqLangs(englishBulgarianText, 3) 57 | val languages = results.map { it.language } 58 | assert(languages.size == 3) 59 | assert(languages.contains("en")) 60 | assert(languages.contains("bg")) 61 | assert(languages.contains("und")) 62 | ``` 63 | 64 | 65 | 66 | 67 | 68 | ## The Bridge Interface Implementation: 69 | 70 | ## from (C++) 71 | ```cpp 72 | std::vector<Result> FindTopNMostFreqLangs(const string &text, int num_langs); 73 | ``` 74 | ```cpp 75 | Result FindLanguage(const string &text); 76 | ``` 77 | 78 | ## to (Kotlin) 79 | ```kotlin 80 | fun findTopNMostFreqLangs(text: String, n: Int): List<LangDetectResponse> 81 | ``` 82 | 83 | ```kotlin 84 | fun detect(text: String): LangDetectResponse 85 | ``` 86 | 87 | ```kotlin 88 | data class LangDetectResponse( 89 | val probability: Float, 90 | val proportion: Float, 91 | val isReliable: Boolean, 92 | val language: String 93 | ) 94 | ``` 95 | --- 96 | ```ts 97 | if (this.repo.isAwesome || this.repo.isHelpful) { 98 | Star(this.repo); 99 | } 100 | ``` 101 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 4.0.0 6 | 7 | com.b8 8 | cld3-kotlin 9 | 1.0 10 | jar 11 | cld3-kotlin 12 | 13 | 14 | 15 | 16 | UTF-8 17 | 1.6.0 18 | official 19 | 
4.13.1 20 | 21 | 22 | 23 | 24 | com.github.jnr 25 | jnr-ffi 26 | 2.1.9 27 | 28 | 29 | 30 | org.jetbrains.kotlin 31 | kotlin-stdlib 32 | ${kotlin.version} 33 | 34 | 35 | org.jetbrains.kotlin 36 | kotlin-test-junit 37 | ${kotlin.version} 38 | test 39 | 40 | 41 | junit 42 | junit 43 | ${junit.version} 44 | test 45 | 46 | 47 | 48 | 49 | src/main/kotlin 50 | src/main/test/kotlin 51 | 52 | 53 | 54 | 55 | 56 | org.jetbrains.kotlin 57 | kotlin-maven-plugin 58 | ${kotlin.version} 59 | 60 | 61 | compile 62 | compile 63 | 64 | compile 65 | 66 | 67 | 68 | test-compile 69 | test-compile 70 | 71 | test-compile 72 | 73 | 74 | 75 | 76 | 77 | org.jacoco 78 | jacoco-maven-plugin 79 | 0.8.4 80 | 81 | 82 | prepare-agent 83 | 84 | prepare-agent 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /src/main/cpp/BUILD.gn: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | #============================================================================== 15 | 16 | import("//third_party/protobuf/proto_library.gni") 17 | 18 | proto_library("protos") { 19 | sources = [ 20 | "feature_extractor.proto", 21 | "sentence.proto", 22 | "task_spec.proto", 23 | ] 24 | proto_out_dir = "cld_3/protos" 25 | } 26 | 27 | static_library("cld_3") { 28 | sources = [ 29 | "base.cc", 30 | "base.h", 31 | "casts.h", 32 | "embedding_feature_extractor.cc", 33 | "embedding_feature_extractor.h", 34 | "embedding_network.cc", 35 | "embedding_network.h", 36 | "embedding_network_params.h", 37 | "feature_extractor.cc", 38 | "feature_extractor.h", 39 | "feature_types.cc", 40 | "feature_types.h", 41 | "float16.h", 42 | "fml_parser.cc", 43 | "fml_parser.h", 44 | "language_identifier_features.cc", 45 | "language_identifier_features.h", 46 | "lang_id_nn_params.cc", 47 | "lang_id_nn_params.h", 48 | "nnet_language_identifier.cc", 49 | "nnet_language_identifier.h", 50 | "registry.cc", 51 | "registry.h", 52 | "relevant_script_feature.cc", 53 | "relevant_script_feature.h", 54 | "script_detector.h", 55 | "sentence_features.cc", 56 | "sentence_features.h", 57 | "simple_adder.h", 58 | "script_span/fixunicodevalue.cc", 59 | "script_span/fixunicodevalue.h", 60 | "script_span/generated_entities.cc", 61 | "script_span/generated_ulscript.cc", 62 | "script_span/generated_ulscript.h", 63 | "script_span/getonescriptspan.cc", 64 | "script_span/getonescriptspan.h", 65 | "script_span/integral_types.h", 66 | "script_span/offsetmap.cc", 67 | "script_span/offsetmap.h", 68 | "script_span/port.h", 69 | "script_span/stringpiece.h", 70 | "script_span/text_processing.cc", 71 | "script_span/text_processing.h", 72 | "script_span/utf8acceptinterchange.h", 73 | "script_span/utf8prop_lettermarkscriptnum.h", 74 | "script_span/utf8repl_lettermarklower.h", 75 | "script_span/utf8scannot_lettermarkspecial.h", 76 | "script_span/utf8statetable.cc", 77 | "script_span/utf8statetable.h", 78 | "task_context.cc", 
79 | "task_context.h", 80 | "task_context_params.cc", 81 | "task_context_params.h", 82 | "unicodetext.cc", 83 | "unicodetext.h", 84 | "utils.cc", 85 | "utils.h", 86 | "workspace.cc", 87 | "workspace.h" 88 | ] 89 | public_deps = [ 90 | "//third_party/protobuf:protobuf_lite", 91 | ":protos", 92 | ] 93 | } 94 | 95 | # The executables below are functional. Uncomment to use. 96 | 97 | executable("language_identifier_main") { 98 | sources = [ 99 | "language_identifier_main.cc", 100 | ] 101 | deps = [ 102 | ":cld_3", 103 | ] 104 | } 105 | 106 | 107 | executable("main") { 108 | sources = [ 109 | "main.cc", 110 | "Cld3LangDetector.cc" 111 | ] 112 | deps = [ 113 | ":cld_3", 114 | ] 115 | } 116 | 117 | 118 | shared_library("native2") { 119 | sources = [ 120 | "Cld3LangDetector.cc" 121 | ] 122 | deps = [ 123 | ":cld_3", 124 | ] 125 | } 126 | 127 | 128 | shared_library("native") { 129 | sources = [ 130 | "base.cc", 131 | "base.h", 132 | "casts.h", 133 | "embedding_feature_extractor.cc", 134 | "embedding_feature_extractor.h", 135 | "embedding_network.cc", 136 | "embedding_network.h", 137 | "embedding_network_params.h", 138 | "feature_extractor.cc", 139 | "feature_extractor.h", 140 | "feature_types.cc", 141 | "feature_types.h", 142 | "float16.h", 143 | "fml_parser.cc", 144 | "fml_parser.h", 145 | "language_identifier_features.cc", 146 | "language_identifier_features.h", 147 | "lang_id_nn_params.cc", 148 | "lang_id_nn_params.h", 149 | "nnet_language_identifier.cc", 150 | "nnet_language_identifier.h", 151 | "registry.cc", 152 | "registry.h", 153 | "relevant_script_feature.cc", 154 | "relevant_script_feature.h", 155 | "script_detector.h", 156 | "sentence_features.cc", 157 | "sentence_features.h", 158 | "simple_adder.h", 159 | "script_span/fixunicodevalue.cc", 160 | "script_span/fixunicodevalue.h", 161 | "script_span/generated_entities.cc", 162 | "script_span/generated_ulscript.cc", 163 | "script_span/generated_ulscript.h", 164 | "script_span/getonescriptspan.cc", 165 | 
"script_span/getonescriptspan.h", 166 | "script_span/integral_types.h", 167 | "script_span/offsetmap.cc", 168 | "script_span/offsetmap.h", 169 | "script_span/port.h", 170 | "script_span/stringpiece.h", 171 | "script_span/text_processing.cc", 172 | "script_span/text_processing.h", 173 | "script_span/utf8acceptinterchange.h", 174 | "script_span/utf8prop_lettermarkscriptnum.h", 175 | "script_span/utf8repl_lettermarklower.h", 176 | "script_span/utf8scannot_lettermarkspecial.h", 177 | "script_span/utf8statetable.cc", 178 | "script_span/utf8statetable.h", 179 | "task_context.cc", 180 | "task_context.h", 181 | "task_context_params.cc", 182 | "task_context_params.h", 183 | "unicodetext.cc", 184 | "unicodetext.h", 185 | "utils.cc", 186 | "utils.h", 187 | "workspace.cc", 188 | "workspace.h", 189 | "Cld3LangDetector.cc", 190 | "Cld3LangDetector.h" 191 | ] 192 | public_deps = [ 193 | "//third_party/protobuf:protobuf_lite", 194 | ":protos", 195 | ] 196 | } 197 | 198 | #executable("getonescriptspan_test") { 199 | # sources = [ 200 | # "script_span/getonescriptspan_test.cc", 201 | # ] 202 | # deps = [ 203 | # ":cld_3", 204 | # ] 205 | #} 206 | 207 | #executable("language_identifier_features_test") { 208 | # sources = [ 209 | # "language_identifier_features_test.cc", 210 | # ] 211 | # deps = [ 212 | # ":cld_3", 213 | # ] 214 | #} 215 | 216 | #executable("nnet_lang_id_test") { 217 | # sources = [ 218 | # "nnet_lang_id_test.cc", 219 | # "nnet_lang_id_test_data.cc", 220 | # "nnet_lang_id_test_data.h", 221 | # ] 222 | # deps = [ 223 | # ":cld_3", 224 | # ] 225 | #} 226 | -------------------------------------------------------------------------------- /src/main/cpp/Cld3LangDetector.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by naor on 5/29/19. 
3 | // 4 | 5 | #include "Cld3LangDetector.h" 6 | #include "../nnet_language_identifier.h" 7 | 8 | using namespace chrome_lang_id; 9 | 10 | long create() { 11 | return (long) new NNetLanguageIdentifier(); 12 | } 13 | 14 | long create1(int min_num_bytes, int max_num_bytes) { 15 | return (long) new NNetLanguageIdentifier(min_num_bytes, max_num_bytes); 16 | } 17 | 18 | void deallocate(long ptr) { 19 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr; 20 | delete nptr; 21 | } 22 | 23 | void detect(long ptr, const char *text, const char *into) { 24 | 25 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr; 26 | NNetLanguageIdentifier::Result res = nptr->FindLanguage(text); 27 | 28 | long current = (long) into; 29 | 30 | *((float *) current) = res.probability; 31 | current += sizeof(float); 32 | 33 | *((float *) current) = res.proportion; 34 | current += sizeof(float); 35 | *((short *) current) = res.is_reliable; 36 | current += sizeof(short); 37 | *((int *) current) = res.language.size(); 38 | current += sizeof(int); 39 | 40 | memccpy(reinterpret_cast(current), res.language.c_str(), res.language.size() + 1, sizeof(res.language)); 41 | 42 | 43 | } 44 | 45 | 46 | void findTopNMostFreqLangs(long ptr, const char *text, const char *into, int num_langs) { 47 | 48 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr; 49 | 50 | const std::vector results = 51 | nptr->FindTopNMostFreqLangs(text, num_langs); 52 | 53 | 54 | long current = (long) into; 55 | 56 | 57 | int size = (int )results.size(); 58 | *((int *) current) = size; 59 | current += sizeof(int); 60 | 61 | 62 | for (const NNetLanguageIdentifier::Result &res : results) { 63 | 64 | *((float *) current) = res.probability; 65 | current += sizeof(float); 66 | 67 | *((float *) current) = res.proportion; 68 | current += sizeof(float); 69 | 70 | *((short *) current) = res.is_reliable; 71 | current += sizeof(short); 72 | 73 | *((int *) current) = res.language.size(); 74 | current += sizeof(int); 
75 | 76 | memccpy(reinterpret_cast(current), res.language.c_str(), res.language.size() + 1, sizeof(res.language)); 77 | current += sizeof(char) * res.language.size(); 78 | } 79 | 80 | 81 | } 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /src/main/cpp/Cld3LangDetector.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by naor on 5/29/19. 3 | // 4 | 5 | #include "Cld3LangDetector.h" 6 | #include "../nnet_language_identifier.h" 7 | 8 | using namespace chrome_lang_id; 9 | 10 | long create() { 11 | return (long) new NNetLanguageIdentifier(); 12 | } 13 | 14 | long create1(int min_num_bytes, int max_num_bytes) { 15 | return (long) new NNetLanguageIdentifier(min_num_bytes, max_num_bytes); 16 | } 17 | 18 | void deallocate(long ptr) { 19 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr; 20 | delete nptr; 21 | } 22 | 23 | void detect(long ptr, const char *text, const char *into) { 24 | 25 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr; 26 | NNetLanguageIdentifier::Result res = nptr->FindLanguage(text); 27 | 28 | long current = (long) into; 29 | 30 | *((float *) current) = res.probability; 31 | current += sizeof(float); 32 | 33 | *((float *) current) = res.proportion; 34 | current += sizeof(float); 35 | *((short *) current) = res.is_reliable; 36 | current += sizeof(short); 37 | *((int *) current) = res.language.size(); 38 | current += sizeof(int); 39 | 40 | memccpy(reinterpret_cast(current), res.language.c_str(), res.language.size() + 1, sizeof(res.language)); 41 | 42 | 43 | } 44 | 45 | 46 | void findTopNMostFreqLangs(long ptr, const char *text, const char *into, int num_langs) { 47 | 48 | NNetLanguageIdentifier *nptr = (NNetLanguageIdentifier *) ptr; 49 | 50 | const std::vector results = 51 | nptr->FindTopNMostFreqLangs(text, num_langs); 52 | 53 | 54 | long current = (long) into; 55 | 56 | 57 | int size = (int )results.size(); 58 | 
package com.b8

import jnr.ffi.LibraryLoader
import java.nio.ByteBuffer
import java.nio.ByteOrder
import java.nio.charset.StandardCharsets

const val MIN_NUM_OF_BYTES = 0
const val MAX_NUM_OF_BYTES = 10000
const val SINGLE_LANGUAGE_DETECTION_BUFFER_SIZE = 100
const val MULTI_LANGUAGE_DETECTION_BUFFER_SIZE = 250
const val UNKNOWN_LANGUAGE = "UNKNOWN"

/**
 * One language prediction decoded from the native result buffer.
 *
 * @property probability value of the native result's `probability` field
 * @property proportion value of the native result's `proportion` field
 * @property isReliable `true` when the native `is_reliable` flag was 1
 * @property language language code exactly as written by the native library (e.g. `"en"`)
 */
data class LangDetectResponse(
    val probability: Float,
    val proportion: Float,
    val isReliable: Boolean,
    val language: String
)

/**
 * Kotlin facade over the native language identifier loaded through jnr-ffi.
 *
 * Each instance owns one native identifier created in `init`; call [close]
 * to release it. Using an instance after [close] throws [IllegalStateException].
 */
class LangDetect : AutoCloseable {

    private companion object {
        /** Bytes occupied by the leading result count written by `findTopNMostFreqLangs`. */
        const val RESULT_COUNT_BYTES = 4

        /** Fixed part of one serialized result: float + float + short + int. */
        const val RESULT_HEADER_BYTES = 4 + 4 + 2 + 4

        /**
         * Space reserved per result for the language code. The longest code in the
         * table below is 7 bytes; extra headroom is kept because the native side
         * may copy more bytes than the code itself.
         */
        const val MAX_LANGUAGE_CODE_BYTES = 32

        /** Language code to display name; shared by all instances. */
        val fullLanguageName = mapOf(
            "eo" to "Esperanto",
            "co" to "Corsican",
            "eu" to "Basque",
            "ta" to "Tamil",
            "de" to "German",
            "mt" to "Maltese",
            "ps" to "Pushto",
            "te" to "Telugu",
            "su" to "Sundanese",
            "uz" to "Uzbek",
            "ne" to "Nepali",
            "nl" to "Dutch",
            "sw" to "Swahili",
            "sq" to "Albanian",
            "ja" to "Japanese",
            "no" to "Norwegian",
            "mn" to "Mongolian",
            "so" to "Somali",
            "ko" to "Korean",
            "kk" to "Kazakh",
            "sl" to "Slovenian",
            "mr" to "Marathi",
            "th" to "Thai",
            "zu" to "Zulu",
            "ml" to "Malayalam",
            "hr" to "Croatian",
            "bs" to "Bosnian",
            "lo" to "Lao",
            "sd" to "Sindhi",
            "cy" to "Welsh",
            "hy" to "Armenian",
            "uk" to "Ukrainian",
            "pt" to "Portuguese",
            "lv" to "Latvian",
            "cs" to "Czech",
            "vi" to "Vietnamese",
            "jv" to "Javanese",
            "be" to "Belarusian",
            "km" to "Khmer",
            "mk" to "Macedonian",
            "tr" to "Turkish",
            "fy" to "Western Frisian",
            "am" to "Amharic",
            "zh" to "Chinese",
            "da" to "Danish",
            "sv" to "Swedish",
            "fi" to "Finnish",
            "ht" to "Haitian; Haitian Creole",
            "af" to "Afrikaans",
            "la" to "Latin",
            "id" to "Indonesian",
            "sm" to "Samoan",
            "ca" to "Catalan",
            "ka" to "Georgian",
            "sr" to "Serbian",
            "it" to "Italian",
            "sk" to "Slovak",
            "ru" to "Russian",
            "bg" to "Bulgarian",
            "ny" to "Nyanja; Chichewa; Chewa",
            "fa" to "Persian",
            "gl" to "Galician",
            "et" to "Estonian",
            "ms" to "Malay",
            "gd" to "Gaelic; Scottish Gaelic",
            "ha" to "Hausa",
            "is" to "Icelandic",
            "ur" to "Urdu",
            "mi" to "Maori",
            "hi" to "Hindi",
            "bn" to "Bengali",
            "fr" to "French",
            "yi" to "Yiddish",
            "hu" to "Hungarian",
            "xh" to "Xhosa",
            "my" to "Burmese",
            "tg" to "Tajik",
            "ro" to "Romanian",
            "ar" to "Arabic",
            "lb" to "Luxembourgish",
            "kn" to "Kannada",
            "az" to "Azerbaijani",
            "si" to "Sinhala; Sinhalese",
            "ky" to "Kirghiz",
            "mg" to "Malagasy",
            "en" to "English",
            "gu" to "Gujarati",
            "es" to "Spanish",
            "pl" to "Polish",
            "ga" to "Irish",
            "lt" to "Lithuanian",
            "sn" to "Shona",
            "yo" to "Yoruba",
            "pa" to "Panjabi",
            "ku" to "Kurdish",
            "zh-Latn" to "Chinese-Latin",
            "bg-Latn" to "Bulgarian-Latin",
            "ru-Latn" to "Russian-Latin",
            "hi-Latn" to "Hindi-Latin",
            "ja-Latn" to "Japanese-Latin",
            "st" to "Sotho,Southern",
            "el" to "Greek",
            "el-Latn" to "Greek-Latin",
            "hmn" to "Hmong, Mong",
            "ig" to "Igbo",
            "iw" to "Hebrew",
            "fil" to "Filipino",
            "haw" to "Hawaiian",
            "ceb" to "Cebuano"
        )
    }

    private val operatingSystemCoordinator = OperationSystemCoordinator()
    private val os = operatingSystemCoordinator.getRunningOperationSystem()
    private var ptr: Long = -1
    private var closed = false
    private val detector: NativeLangDetector

    init {
        System.load(operatingSystemCoordinator.getOperationSystemSharedObjects(os).libPath())
        detector = LibraryLoader.create(NativeLangDetector::class.java)
            .load("native")
        ptr = detector.create1(MIN_NUM_OF_BYTES, MAX_NUM_OF_BYTES)
    }

    /**
     * Detects the most likely language of [text].
     *
     * @throws IllegalStateException if this detector has been closed or the
     *         native buffer content is malformed
     */
    fun detect(text: String): LangDetectResponse {
        checkOpen()
        val buffer = ByteArray(SINGLE_LANGUAGE_DETECTION_BUFFER_SIZE)
        detector.detect(ptr, text, buffer)
        return extractResponseFromByteBuffer(wrapNative(buffer))
    }

    /**
     * Returns the predictions the native library writes for [text] when asked
     * for [n] languages.
     *
     * The output buffer grows with [n], so large values no longer let the native
     * code write past a fixed 250-byte array.
     *
     * @throws IllegalStateException if this detector has been closed or the
     *         native buffer content is malformed
     * @throws IllegalArgumentException if [n] is so large the buffer size overflows
     */
    fun findTopNMostFreqLangs(text: String, n: Int): List<LangDetectResponse> {
        checkOpen()
        val buffer = ByteArray(multiResultBufferSize(n))
        try {
            detector.findTopNMostFreqLangs(ptr, text, buffer, n)
        } catch (e: Exception) {
            // Best effort, as before: report the failure and decode whatever is
            // in the buffer (all zeros, i.e. an empty list, if nothing was written).
            e.printStackTrace()
        }
        val byteBuffer = wrapNative(buffer)
        val size = byteBuffer.int
        val result = ArrayList<LangDetectResponse>()
        repeat(size) {
            result.add(extractResponseFromByteBuffer(byteBuffer))
        }
        return result.toList()
    }

    /** Maps a language code to its display name, or [UNKNOWN_LANGUAGE] when the code is not in the table. */
    fun changeLanguageCodeToFullName(languageCode: String): String {
        return fullLanguageName[languageCode] ?: UNKNOWN_LANGUAGE
    }

    /** Releases the native identifier. Safe to call more than once. */
    override fun close() {
        if (closed) return
        closed = true
        detector.deallocate(ptr)
    }

    private fun checkOpen() {
        check(!closed) { "LangDetect has been closed" }
    }

    /** The native code stores multi-byte values in host byte order, so read them the same way. */
    private fun wrapNative(buffer: ByteArray): ByteBuffer =
        ByteBuffer.wrap(buffer).order(ByteOrder.nativeOrder())

    /** Buffer size for [n] results, never smaller than the historical fixed size. */
    private fun multiResultBufferSize(n: Int): Int {
        val perResult = (RESULT_HEADER_BYTES + MAX_LANGUAGE_CODE_BYTES).toLong()
        val needed = RESULT_COUNT_BYTES + n.coerceAtLeast(0).toLong() * perResult
        require(needed <= Int.MAX_VALUE) { "n is too large: $n" }
        return maxOf(MULTI_LANGUAGE_DETECTION_BUFFER_SIZE, needed.toInt())
    }

    /**
     * Decodes one record at the buffer's current position:
     * float probability, float proportion, short reliability flag,
     * int language length, then that many UTF-8 bytes.
     */
    private fun extractResponseFromByteBuffer(byteBuffer: ByteBuffer): LangDetectResponse {
        val probability = byteBuffer.float
        val proportion = byteBuffer.float
        val isReliable = byteBuffer.short
        val sizeOfLanguage = byteBuffer.int
        check(sizeOfLanguage in 0..byteBuffer.remaining()) {
            "Malformed native result: language length $sizeOfLanguage, ${byteBuffer.remaining()} bytes left"
        }
        val languageBuffer = ByteArray(sizeOfLanguage)
        byteBuffer.get(languageBuffer, 0, sizeOfLanguage)
        val language = String(languageBuffer, StandardCharsets.UTF_8)
        return LangDetectResponse(probability, proportion, isReliable.toInt() == 1, language)
    }
}
package com.b8


const val sharedLibrary = "libnative"

/**
 * Platform-specific flavour of the bundled native library.
 *
 * @param postfix file extension of shared libraries on that platform
 */
enum class SharedObject(private val postfix: String) {
    OSX("dylib"),
    UNIX("so"),
    WINDOWS("dll");

    /**
     * Absolute path of the native library for this platform, resolved against
     * the JVM working directory: `<user.dir>/src/main/lib/<platform>/libnative.<postfix>`.
     */
    fun libPath(): String {
        // Locale.ROOT keeps the folder name stable. With the default locale a
        // Turkish JVM lower-cases "WINDOWS"/"UNIX" to "wındows"/"unıx".
        val platformFolder = name.toLowerCase(java.util.Locale.ROOT)
        val sharedObjectsFolder = "${System.getProperty("user.dir")}/src/main/lib/$platformFolder/"
        return "$sharedObjectsFolder$sharedLibrary.$postfix"
    }
}


/** Maps the JVM's `os.name` to the matching [SharedObject]. */
class OperationSystemCoordinator {

    private fun isWindows(OS: String): Boolean = OS.contains("win")

    // "darwin" is accepted as well so that it is never mistaken for Windows.
    private fun isMac(OS: String): Boolean = OS.contains("mac") || OS.contains("darwin")

    /** Returns the `os.name` system property lower-cased independently of the default locale. */
    fun getRunningOperationSystem(): String =
        System.getProperty("os.name").toLowerCase(java.util.Locale.ROOT)

    /**
     * Classifies an operating-system name (matched case-insensitively).
     *
     * @param OS operating-system name, e.g. the value of [getRunningOperationSystem]
     * @return [SharedObject.OSX] for macOS names, [SharedObject.WINDOWS] for
     *         Windows names, [SharedObject.UNIX] for everything else
     */
    fun getOperationSystemSharedObjects(OS: String): SharedObject {
        val normalized = OS.toLowerCase(java.util.Locale.ROOT)
        return when {
            // macOS is tested first because "darwin" contains the substring "win".
            isMac(normalized) -> SharedObject.OSX
            isWindows(normalized) -> SharedObject.WINDOWS
            else -> SharedObject.UNIX
        }
    }

}
package com.b8

import org.junit.Assert.assertEquals
import org.junit.Assert.assertTrue
import org.junit.Test


/**
 * Integration tests for [LangDetect]; they need the native library for the
 * current platform to be present under `src/main/lib`.
 */
internal class LangDetectTest {

    /**
     * Runs [block] with a fresh detector and always releases its native handle.
     * A plain try/finally is used because `AutoCloseable.use` is not part of the
     * core kotlin-stdlib artifact this project depends on.
     */
    private inline fun <T> withDetector(block: (LangDetect) -> T): T {
        val detector = LangDetect()
        try {
            return block(detector)
        } finally {
            detector.close()
        }
    }

    @Test
    fun `Simple Build And Predict On English Text`() {
        val englishText = "This piece of text is in English";
        val result = withDetector { it.detect(englishText) }
        assertEquals("en", result.language)
        assertTrue(result.isReliable)
        assertEquals(1f, result.proportion, 0f)
    }

    // NOTE: despite the test name, the sample text is Bulgarian and "bg" is expected.
    @Test
    fun `Simple Build And Predict On Russian Text`() {
        val bulgarianText = "Този текст е на Български.";
        val result = withDetector { it.detect(bulgarianText) }
        assertEquals("bg", result.language)
        assertTrue(result.isReliable)
        assertEquals(1f, result.proportion, 0f)
    }

    @Test
    fun `Simple Build And Predict On English and Russian Text and n = 3`() {
        val mixedText = "This piece of text is in English Този текст е на Български";
        val languages = withDetector { ld -> ld.findTopNMostFreqLangs(mixedText, 3).map { it.language } }
        assertEquals(3, languages.size)
        assertTrue(languages.contains("en"))
        assertTrue(languages.contains("bg"))
        // The third slot carries the "und" code.
        assertTrue(languages.contains("und"))
    }

    @Test
    fun `Simple Build And Predict On English and Russian Text and n = 2`() {
        val mixedText = "Този текст е на Български This piece of text is in English ";
        val languages = withDetector { ld -> ld.findTopNMostFreqLangs(mixedText, 2).map { it.language } }
        assertEquals(2, languages.size)
        assertTrue(languages.contains("en"))
        assertTrue(languages.contains("bg"))
    }

    /** Detects the language of [text] and checks its display name equals [expectedLanguage]. */
    fun assertLanguageResult(text: String, expectedLanguage: String) {
        val fullName = withDetector { ld -> ld.changeLanguageCodeToFullName(ld.detect(text).language) }
        assertEquals(expectedLanguage, fullName)
    }

    @Test
    fun `simple language detection hebrew`() {
        assertLanguageResult("מה המצב איתך ? מה המצב יא גבר", "Hebrew")
    }


    @Test
    fun `simple language detection english`() {
        assertLanguageResult("what does the fox says?", "English")
    }

    @Test
    fun `simple language detection spanish`() {
        assertLanguageResult("Hola cómo estás hoy?", "Spanish")
    }
}

/** Unit tests for [OperationSystemCoordinator]; no native library required. */
internal class OperationSystemCoordinatorTest {
    @Test
    fun `Get Running Operation System`() {
        val operatingSystemCoordinator = OperationSystemCoordinator()
        val expected = System.getProperty("os.name")
        val actual = operatingSystemCoordinator.getRunningOperationSystem()
        // Case-insensitive comparison keeps the check independent of the JVM's default locale.
        assertTrue("expected lower-cased '$expected' but got '$actual'", expected.equals(actual, ignoreCase = true))
    }

    @Test
    fun `Get Operation System Shared Objects On All Operation Systems`() {
        val operatingSystemCoordinator = OperationSystemCoordinator()
        val mapOs2SharedObject = mapOf(
            "windows 95" to SharedObject.WINDOWS,
            "windows vista" to SharedObject.WINDOWS,
            "windows xp" to SharedObject.WINDOWS,
            "windows 10" to SharedObject.WINDOWS,
            "linux" to SharedObject.UNIX,
            "mpe/ix" to SharedObject.UNIX,
            "freebsd" to SharedObject.UNIX,
            "irix" to SharedObject.UNIX,
            "digital unix" to SharedObject.UNIX,
            "unix" to SharedObject.UNIX,
            "mac os" to SharedObject.OSX,
            "sun os" to SharedObject.UNIX,
            "sunos" to SharedObject.UNIX,
            "solaris" to SharedObject.UNIX,
            "hp-ux" to SharedObject.UNIX,
            "aix" to SharedObject.UNIX
        )
        for ((osName, expected) in mapOs2SharedObject) {
            val actual = operatingSystemCoordinator.getOperationSystemSharedObjects(osName)
            assertEquals("$osName not as Expected :Retrieve $actual", expected, actual)
        }
    }
}
--------------------------------------------------------------------------------