("maven") {
43 | from(components["javaPlatform"])
44 | pom.addRequiredMetadataForPublicationOnMavenCentral()
45 | }
46 | }
47 | }
48 | }
49 |
// Signature configuration: artifacts are signed through the local `gpg` command.
// Signing is only mandatory when the task graph contains a task whose name
// starts with "publish".
signing {
    setRequired {
        gradle.taskGraph.allTasks.any { scheduled -> scheduled.name.startsWith("publish") }
    }
    // Register a signature for every publication, including ones added later.
    publishing.publications.configureEach { sign(this) }
    useGpgCmd()
}
61 |
// Fills in the POM metadata required for publication on Maven Central, see
// https://central.sonatype.org/publish/requirements/
// (the requirements are expressed in terms of POM elements).
//
// The name is derived lazily from the project's group and name, and the
// description from the project's description; everything else is static.
fun MavenPom.addRequiredMetadataForPublicationOnMavenCentral() {
    name.set(providers.provider { "${project.group}:${project.name}" })
    description.set(providers.provider { project.description })
    url.set("https://github.com/melix/jlangdetect")

    licenses {
        license {
            name.set("The Apache Software License, Version 2.0")
            // Canonical HTTPS location of the license text.
            url.set("https://www.apache.org/licenses/LICENSE-2.0.txt")
        }
    }

    developers {
        developer {
            name.set("Cédric Champeau")
            email.set("cedric.champeau@gmail.com")
            organization.set("Personal")
            organizationUrl.set("https://github.com/melix")
        }
    }

    scm {
        // GitHub no longer serves the unauthenticated git:// protocol, so the
        // read-only connection is published over HTTPS instead.
        connection.set("scm:git:https://github.com/melix/jlangdetect.git")
        developerConnection.set("scm:git:ssh://github.com/melix/jlangdetect.git")
        url.set("https://github.com/melix/jlangdetect/tree/master")
    }
}
90 |
// Returns true when the project version ends with "-SNAPSHOT".
// The version is converted with toString() rather than cast, because a Gradle
// project version is an arbitrary object and is not guaranteed to be a String.
fun isSnapshot(): Boolean = version.toString().endsWith("-SNAPSHOT")
--------------------------------------------------------------------------------
/changelog/CHANGELOG.txt:
--------------------------------------------------------------------------------
1 | ##############################################
2 | # Project JLangDetect #
3 | # See http://code.google.com/p/jlangdetect/ #
4 | ##############################################
5 |
6 |
7 | Changes in version 0.4
8 | ----------------------
9 |
10 | Issue #1 switch from log4j to SLF4J
11 | Issue #4 Returns null if all scores are 0
12 | Issue #5 Make Score fields accessible
--------------------------------------------------------------------------------
/gradle.properties:
--------------------------------------------------------------------------------
1 | org.gradle.parallel=true
2 |
--------------------------------------------------------------------------------
/gradle/libs.versions.toml:
--------------------------------------------------------------------------------
1 | [versions]
2 | nexus = "1.0.0"
3 |
4 | [libraries]
5 | slf4j="org.slf4j:slf4j-api:1.7.30"
6 | testng="org.testng:testng:7.4.0"
7 | logbackClassic="ch.qos.logback:logback-classic:1.0.13"
8 | janino="org.codehaus.janino:janino:2.6.1"
9 |
10 | plugins-nexus = { module = "io.github.gradle-nexus:publish-plugin", version.ref = "nexus"}
11 |
12 |
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.0.2-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | #
4 | # Copyright 2015 the original author or authors.
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # https://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | ##############################################################################
20 | ##
21 | ## Gradle start up script for UN*X
22 | ##
23 | ##############################################################################
24 |
25 | # Attempt to set APP_HOME
26 | # Resolve links: $0 may be a link
27 | PRG="$0"
28 | # Need this for relative symlinks.
29 | while [ -h "$PRG" ] ; do
30 | ls=`ls -ld "$PRG"`
31 | link=`expr "$ls" : '.*-> \(.*\)$'`
32 | if expr "$link" : '/.*' > /dev/null; then
33 | PRG="$link"
34 | else
35 | PRG=`dirname "$PRG"`"/$link"
36 | fi
37 | done
38 | SAVED="`pwd`"
39 | cd "`dirname \"$PRG\"`/" >/dev/null
40 | APP_HOME="`pwd -P`"
41 | cd "$SAVED" >/dev/null
42 |
43 | APP_NAME="Gradle"
44 | APP_BASE_NAME=`basename "$0"`
45 |
46 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
47 | DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
48 |
49 | # Use the maximum available, or set MAX_FD != -1 to use that value.
50 | MAX_FD="maximum"
51 |
# Print a warning built from all arguments.
# The message goes to standard error so that it never mixes with output a
# caller may be capturing from standard output.
warn () {
    echo "$*"
} >&2
55 |
# Print an error message (all arguments, surrounded by blank lines) and
# terminate the script with exit status 1.
# The message goes to standard error, as expected for a fatal diagnostic.
die () {
    echo
    echo "$*"
    echo
    exit 1
} >&2
62 |
63 | # OS specific support (must be 'true' or 'false').
64 | cygwin=false
65 | msys=false
66 | darwin=false
67 | nonstop=false
68 | case "`uname`" in
69 | CYGWIN* )
70 | cygwin=true
71 | ;;
72 | Darwin* )
73 | darwin=true
74 | ;;
75 | MINGW* )
76 | msys=true
77 | ;;
78 | NONSTOP* )
79 | nonstop=true
80 | ;;
81 | esac
82 |
83 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
84 |
85 |
# Determine the Java command to use to start the JVM.
# When JAVA_HOME is set, the executable is taken from it and must exist;
# otherwise a 'java' command has to be resolvable through PATH.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD="$JAVA_HOME/jre/sh/java"
    else
        JAVACMD="$JAVA_HOME/bin/java"
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    JAVACMD="java"
    # 'command -v' is the POSIX way to probe for a command; 'which' is an
    # external tool that is not guaranteed to be installed.
    command -v java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
107 |
108 | # Increase the maximum file descriptors if we can.
109 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
110 | MAX_FD_LIMIT=`ulimit -H -n`
111 | if [ $? -eq 0 ] ; then
112 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
113 | MAX_FD="$MAX_FD_LIMIT"
114 | fi
115 | ulimit -n $MAX_FD
116 | if [ $? -ne 0 ] ; then
117 | warn "Could not set maximum file descriptor limit: $MAX_FD"
118 | fi
119 | else
120 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
121 | fi
122 | fi
123 |
124 | # For Darwin, add options to specify how the application appears in the dock
125 | if $darwin; then
126 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
127 | fi
128 |
129 | # For Cygwin or MSYS, switch paths to Windows format before running java
130 | if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
131 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
132 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
133 |
134 | JAVACMD=`cygpath --unix "$JAVACMD"`
135 |
136 | # We build the pattern for arguments to be converted via cygpath
137 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
138 | SEP=""
139 | for dir in $ROOTDIRSRAW ; do
140 | ROOTDIRS="$ROOTDIRS$SEP$dir"
141 | SEP="|"
142 | done
143 | OURCYGPATTERN="(^($ROOTDIRS))"
144 | # Add a user-defined pattern to the cygpath arguments
145 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
146 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
147 | fi
148 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
149 | i=0
150 | for arg in "$@" ; do
151 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
152 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
153 |
154 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
155 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
156 | else
157 | eval `echo args$i`="\"$arg\""
158 | fi
159 | i=`expr $i + 1`
160 | done
161 | case $i in
162 | 0) set -- ;;
163 | 1) set -- "$args0" ;;
164 | 2) set -- "$args0" "$args1" ;;
165 | 3) set -- "$args0" "$args1" "$args2" ;;
166 | 4) set -- "$args0" "$args1" "$args2" "$args3" ;;
167 | 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
168 | 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
169 | 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
170 | 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
171 | 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
172 | esac
173 | fi
174 |
175 | # Escape application args
176 | save () {
177 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
178 | echo " "
179 | }
180 | APP_ARGS=`save "$@"`
181 |
182 | # Collect all arguments for the java command, following the shell quoting and substitution rules
183 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
184 |
185 | exec "$JAVACMD" "$@"
186 |
--------------------------------------------------------------------------------
/gradlew.bat:
--------------------------------------------------------------------------------
1 | @rem
2 | @rem Copyright 2015 the original author or authors.
3 | @rem
4 | @rem Licensed under the Apache License, Version 2.0 (the "License");
5 | @rem you may not use this file except in compliance with the License.
6 | @rem You may obtain a copy of the License at
7 | @rem
8 | @rem https://www.apache.org/licenses/LICENSE-2.0
9 | @rem
10 | @rem Unless required by applicable law or agreed to in writing, software
11 | @rem distributed under the License is distributed on an "AS IS" BASIS,
12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | @rem See the License for the specific language governing permissions and
14 | @rem limitations under the License.
15 | @rem
16 |
17 | @if "%DEBUG%" == "" @echo off
18 | @rem ##########################################################################
19 | @rem
20 | @rem Gradle startup script for Windows
21 | @rem
22 | @rem ##########################################################################
23 |
24 | @rem Set local scope for the variables with windows NT shell
25 | if "%OS%"=="Windows_NT" setlocal
26 |
27 | set DIRNAME=%~dp0
28 | if "%DIRNAME%" == "" set DIRNAME=.
29 | set APP_BASE_NAME=%~n0
30 | set APP_HOME=%DIRNAME%
31 |
32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter.
33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
34 |
35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
37 |
38 | @rem Find java.exe
39 | if defined JAVA_HOME goto findJavaFromJavaHome
40 |
41 | set JAVA_EXE=java.exe
42 | %JAVA_EXE% -version >NUL 2>&1
43 | if "%ERRORLEVEL%" == "0" goto execute
44 |
45 | echo.
46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
47 | echo.
48 | echo Please set the JAVA_HOME variable in your environment to match the
49 | echo location of your Java installation.
50 |
51 | goto fail
52 |
53 | :findJavaFromJavaHome
54 | set JAVA_HOME=%JAVA_HOME:"=%
55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
56 |
57 | if exist "%JAVA_EXE%" goto execute
58 |
59 | echo.
60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
61 | echo.
62 | echo Please set the JAVA_HOME variable in your environment to match the
63 | echo location of your Java installation.
64 |
65 | goto fail
66 |
67 | :execute
68 | @rem Setup the command line
69 |
70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
71 |
72 |
73 | @rem Execute Gradle
74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
75 |
76 | :end
77 | @rem End local scope for the variables with windows NT shell
78 | if "%ERRORLEVEL%"=="0" goto mainEnd
79 |
80 | :fail
81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
82 | rem the _cmd.exe /c_ return code!
83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
84 | exit /b 1
85 |
86 | :mainEnd
87 | if "%OS%"=="Windows_NT" endlocal
88 |
89 | :omega
90 |
--------------------------------------------------------------------------------
/jlangdetect-europarl/build.gradle.kts:
--------------------------------------------------------------------------------
1 | plugins {
2 | id("me.champeau.jlangdetect.java-conventions")
3 | }
4 |
// Dependencies of the Europarl module, grouped by configuration.
dependencies {
    // Main code: the core detector is declared as `api`, logging as `implementation`.
    api(project(":jlangdetect"))
    implementation(libs.slf4j)

    // Test code only.
    testImplementation(libs.testng)
    testImplementation(libs.janino)
    testRuntimeOnly(libs.logbackClassic)
}
12 |
13 | description = "JLangDetect Europarl"
14 |
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/java/me/champeau/ld/EuroparlDetector.java:
--------------------------------------------------------------------------------
1 | package me.champeau.ld;
2 | /**
3 | * Created by IntelliJ IDEA.
4 | * User: cedric
5 | * Date: 26/06/11
6 | * Time: 22:35
7 | */
8 |
9 |
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import java.io.BufferedInputStream;
14 | import java.io.IOException;
15 | import java.io.ObjectInputStream;
16 |
17 | /**
18 | * An extension of the language detector which automatically loads europarl resources from classpath, thus supports
19 | * detecting the language of documents in the following languages :
20 | *
21 | * - Bulgarian
22 | * - Czech
23 | * - Danish
24 | * - German
25 | * - Greek
26 | * - English
27 | * - Spanish
28 | * - Estonian
29 | * - Finnish
30 | * - French
31 | * - Hungarian
32 | * - Italian
33 | * - Lithuanian
34 | * - Latvian
35 | * - Dutch
36 | * - Polish
37 | * - Portuguese
38 | * - Romanian
39 | * - Slovak
40 | * - Slovene
41 | * - Swedish
42 | *
43 | *
44 | * The detector has been trained thanks to the resources available from http://www.statmt.org/europarl/.
45 | *
46 | * See Europarl: A Parallel Corpus for Statistical Machine Translation, Philipp Koehn, MT Summit 2005
47 | *
48 | * @author Cedric Champeau
49 | */
50 | public class EuroparlDetector extends LangDetector {
51 | private final static Logger theLogger = LoggerFactory.getLogger(EuroparlDetector.class);
52 |
53 | private final static String[] EUROPARL_LANGUAGES = {
54 | "bg",
55 | "cs",
56 | "da",
57 | "de",
58 | "el",
59 | "en",
60 | "es",
61 | "et",
62 | "fi",
63 | "fr",
64 | "hu",
65 | "it",
66 | "lt",
67 | "lv",
68 | "nl",
69 | "pl",
70 | "pt",
71 | "ro",
72 | "sk",
73 | "sl",
74 | "sv"
75 | };
76 |
77 | private final static EuroparlDetector INSTANCE = new EuroparlDetector();
78 |
79 | protected EuroparlDetector() {
80 | super();
81 | ClassLoader loader = EuroparlDetector.class.getClassLoader();
82 | for (String lang : EUROPARL_LANGUAGES) {
83 | try {
84 | register(lang, new ObjectInputStream(new BufferedInputStream(loader.getResourceAsStream("europarl-ld/" +lang+"_tree.bin"))));
85 | } catch (IOException e) {
86 | theLogger.warn("Unable to read Europarl resources for language "+lang);
87 | }
88 | }
89 | }
90 |
91 | public static EuroparlDetector getInstance() {
92 | return INSTANCE;
93 | }
94 |
95 | @Override
96 | public void register(final String lang, final AbstractGramTree tree) {
97 | if (INSTANCE!=null) throw new IllegalStateException("Cannot add languages to Europarl detector once loaded");
98 | super.register(lang, tree);
99 | }
100 |
101 | @Override
102 | public void register(final String lang, final ObjectInputStream in) {
103 | if (INSTANCE!=null) throw new IllegalStateException("Cannot add languages to Europarl detector once loaded");
104 | super.register(lang, in);
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/java/me/champeau/ld/learn/util/EuroparlLoader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | *
18 | */
19 |
20 | package me.champeau.ld.learn.util;
21 |
22 | import me.champeau.ld.AbstractGramTree;
23 | import me.champeau.ld.GramTreeBuilder;
24 | import me.champeau.ld.LangDetector;
25 | import org.slf4j.Logger;
26 | import org.slf4j.LoggerFactory;
27 |
28 | import java.io.*;
29 | import java.util.concurrent.*;
30 | import java.util.List;
31 | import java.util.LinkedList;
32 | import java.util.Map;
33 |
34 | /**
35 | * Parses the Europarl corpus (http://www.statmt.org/europarl/). This corpus consists of (parallel) translations
36 | * of European Parliament proceedings for the 1996-2006 period. It is a perfect candidate for our learning
37 | * algorithm, with up to 44 million words per language.
38 | *
39 | * Training takes less than 1 minute/language on my computer, with a quad core processor. The loader has been optimized
40 | * for multi-core systems.
41 | *
42 | */
43 | public class EuroparlLoader {
44 | private final static Logger theLogger = LoggerFactory.getLogger(EuroparlLoader.class);
45 |
46 | /**
47 | * Reads a single EPPPC file, strips XML lines and returns a single string containing raw text.
48 | * @param aFile a EPPPC text file, encoded in UTF-8.
49 | * @return raw text
50 | */
51 | private static String readSingleFile(File aFile) {
52 | StringBuilder sb = new StringBuilder();
53 | try {
54 | BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(aFile), "UTF-8"));
55 | String line;
56 | while ((line = reader.readLine()) != null) {
57 | if (!line.startsWith("<")) {
58 | sb.append(line).append('\n');
59 | }
60 | }
61 | reader.close();
62 | } catch (IOException e) {
63 | theLogger.error("Unable to read file : " + aFile);
64 | return null;
65 | }
66 | return sb.toString();
67 | }
68 |
69 | /**
70 | * Returns a map (lang code -> gram tree) of gram trees
71 | * @param srcDir source directory where to find language specific directories
72 | * @param dstDir output directory for compiled n-grams trees
73 | * @param langs list of languages to be compiled
74 | * @return the map of trees
75 | */
76 | private static Map readCorpus(final File srcDir, final File dstDir, String[] langs) {
77 | int threads = Runtime.getRuntime().availableProcessors();
78 | ExecutorService service = Executors.newFixedThreadPool(threads);
79 | theLogger.info("Parallel processing of "+threads+" languages over "+langs.length+"...");
80 | List> tasks = new LinkedList>();
81 | final Map trees = new ConcurrentHashMap();
82 | for (final String lang : langs) {
83 | tasks.add(service.submit(new Runnable() {
84 | public void run() {
85 | theLogger.info("Processing directory " + lang);
86 | GramTreeBuilder tree = new GramTreeBuilder(1, 3);
87 | tree.setTruncationThreshold(0.1d);
88 | File sourceFiles = new File(srcDir, lang);
89 | File[] files = sourceFiles.listFiles();
90 | int cpt = 0;
91 | for (File file : files) {
92 | tree.learn(readSingleFile(file));
93 | cpt++;
94 | if (cpt % 20 == 0) {
95 | theLogger.info("Processed " + (100 * cpt / files.length) + "% of " + lang);
96 | }
97 | }
98 | final AbstractGramTree build = tree.build();
99 | trees.put(lang, build);
100 | theLogger.info("Saving tree : "+lang);
101 | File dst = new File(dstDir, lang+"_tree.bin");
102 | try {
103 | ObjectOutputStream out = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(dst)));
104 | out.writeObject(build);
105 | out.close();
106 | } catch (IOException e) {
107 | theLogger.error("Unable to write lang tree "+lang,e);
108 | }
109 | theLogger.info("Lang "+ lang+" complete !");
110 | }
111 | }));
112 | }
113 | // passive wait
114 | for (Future> task : tasks) {
115 | try {
116 | task.get();
117 | } catch (InterruptedException e) {
118 | e.printStackTrace();
119 | } catch (ExecutionException e) {
120 | e.printStackTrace();
121 | }
122 | }
123 | service.shutdown();
124 | return trees;
125 | }
126 |
127 | public static void main(String[] args) {
128 | if (args.length != 2) {
129 | System.out.println("Usage : java " + EuroparlLoader.class.getCanonicalName() + " ");
130 | System.exit(-1);
131 | }
132 | File srcDir = new File(args[0]);
133 | File dstDir = new File(args[1]);
134 | dstDir.mkdirs();
135 | final String[] langs = srcDir.list();
136 | LangDetector detector = new LangDetector();
137 | Map trees = readCorpus(srcDir, dstDir, langs);
138 | for (Map.Entry entry : trees.entrySet()) {
139 | detector.register(entry.getKey(), entry.getValue());
140 | }
141 | }
142 |
143 | /**
144 | public static void main(String[] args) throws IOException, ClassNotFoundException {
145 | File in = new File(args[0]);
146 | LangDetector detector = new LangDetector();
147 | for (File file : in.listFiles()) {
148 | ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(file)));
149 | detector.register(file.getName().substring(0,2), (AbstractGramTree) ois.readObject());
150 | ois.close();
151 | }
152 | }**/
153 |
154 | }
155 |
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/bg_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/bg_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/cs_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/cs_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/da_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/da_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/de_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/de_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/el_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/el_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/en_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/en_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/es_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/es_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/et_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/et_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/fi_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/fi_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/fr_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/fr_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/hu_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/hu_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/it_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/it_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/lt_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/lt_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/lv_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/lv_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/nl_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/nl_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/pl_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/pl_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/pt_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/pt_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/ro_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/ro_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/sk_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/sk_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/sl_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/sl_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/main/resources/europarl-ld/sv_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-europarl/src/main/resources/europarl-ld/sv_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/test/java/com/champeau/ld/TestLangDetection.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | *
18 | */
19 |
20 | package com.champeau.ld;
21 |
22 | import me.champeau.ld.AbstractGramTree;
23 | import me.champeau.ld.EuroparlDetector;
24 | import me.champeau.ld.LangDetector;
25 | import org.testng.annotations.Test;
26 |
27 | import java.io.IOException;
28 | import java.util.Collection;
29 |
30 | import static org.testng.Assert.assertEquals;
31 | import static org.testng.Assert.assertNull;
32 | /**
33 | * User: cedric
34 | * Date: 21 sept. 2008
35 | * Time: 16:19:48
36 | */
37 |
38 | /**
39 | * A very simple test class aimed at testing that simple Europarl learning should be sufficient for regular
40 | * european languages.
41 | */
42 | public class TestLangDetection {
43 |
44 | @Test
45 | public void shouldDetectLanguages() {
46 | String[][] texts = new String[][] {
47 | new String[] {"un texte en français","fr"},
48 | new String[] {"a text in english","en"},
49 | new String[] {"un texto en español","es"},
50 | new String[] {"un texte un peu plus long en français","fr"},
51 | new String[] {"a text a little longer in english","en"},
52 | new String[] {"a little longer text in english","en"},
53 | new String[] {"un texto un poco más largo en español","es"},
54 | new String[] {"J'aime les bisounours !","fr"},
55 | new String[] {"Bienvenue à Montmartre !", "fr"},
56 | new String[] {"Welcome to London !", "en"},
57 | new String[] {"un piccolo testo in italiano", "it"},
58 | new String[] {"Du kan blive medlem ved at melde dig ind her.", "da"},
59 | new String[] {"Kaasotsustamismenetlusel vastu võetud aktide allkirjastamine", "et"},
60 | new String[] {"μια μικρή ελληνική γλώσσα", "el"},
61 | new String[] {"На 16 юни в 11.00 ч. сутринта местно време в щата Аляска, САЩ, е усетено", "bg"},
62 | new String[] {"Směrnice navíc zakáže nadstandardní zpoplatnění tzv. zákaznických linek.", "cs"},
63 | new String[] {"Emellett igény fogalmazódott meg az iparági önszabályozási és belső konfliktuskezelési, valamint a magyar vasút külföldi vasutak szervezeteivel fennálló kapcsolat-tartási feladatok ellátására is.", "hu"},
64 | new String[] {"een kleine Nederlandse tekst", "nl"},
65 | new String[] {"Matching sur des lexiques", "fr"},
66 | new String[] {"Tarybos perduoti susitarimų tekstai", "lt"},
67 | new String[] {"Koplēmuma procedūrā pieņemto tiesību aktu", "lv"},
68 | new String[] {"Utworzenie komisji śledczej i komisji tymczasowej", "pl"},
69 | new String[] {"Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal", "ro"},
70 | new String[] {"Skončenie rokovania", "sk"},
71 | new String[] {"Fru talman! Rörande en ordningsfråga.", "sv"},
72 | new String[] {"Homofobija v Evropi", "sl"},
73 | new String[] {"Matching on lexicons", "en"},
74 | new String[] {"Une première optimisation consiste à ne tester que les sous-chaînes de taille compatibles avec le lexique.", "fr"},
75 | new String[] {"A otimização é a primeira prova de que não sub-canais compatível com o tamanho do léxico.", "pt"},
76 | new String[] {"Ensimmäinen optimointi ei pidä testata, että osa-kanavien kanssa koko sanakirja.", "fi"},
77 | };
78 |
79 | EuroparlDetector detector = EuroparlDetector.getInstance();
80 |
81 | for (String[] text : texts) {
82 | String det = detector.detectLang(text[0]);
83 | System.out.println("langof(\""+text[0]+"\") = " + det + " : " + (det.equals(text[1])?"OK":"Error"));
84 | assertEquals(det,text[1]);
85 | }
86 |
87 | }
88 |
89 | @Test
90 | public void shouldReturnNullIfNoScore() {
91 | EuroparlDetector detector = EuroparlDetector.getInstance();
92 | String det = detector.detectLang("");
93 | assertNull(det);
94 | }
95 |
96 | @Test
97 | public void testScores() {
98 | final Collection scores = EuroparlDetector.getInstance().scoreLanguages("马兜铃猪笼草是苏门答腊特有的热带食虫植物,其种加词“类似于马兜铃”,指该猪笼草捕虫笼的形状和颜色都非常近似于马兜铃的花朵。其生长于海拔1800至2500米的地区。1956年8月5日,威廉·梅哲在占碑省的土朱山上首次采集到了马兜铃猪笼草。但直到1988年约阿希姆·那兹访问莱顿大学植物标本馆后,该标本才被注意到。1994年,其最终被命名为马兜铃猪笼草。马兜铃猪笼草的叶片革质,无柄,呈线形、披针形或匙形-披针形,可长达20厘米,宽至5厘米。叶片末端为急尖或钝尖,中脉的两侧各有2条纵脉。羽状脉呈不规则的网状,笼蔓长达15厘米。马兜铃猪笼草的花序为总状花序,可长达30厘米。总花梗和花序轴都可长达15厘米,通常雌性花序较短。花梗具小苞片,带一朵花,可长达12毫米。马兜铃猪笼草已被列入《2006年世界自然保护联盟濒危物种红色名录》中,保护状况为极危。");
99 | for (LangDetector.Score score : scores) {
100 | System.out.println("Score ["+score.getLanguage()+"]"+" = "+score.getScore());
101 | }
102 | }
103 |
104 | @Test(expectedExceptions = IllegalStateException.class)
105 | public void shouldFailRegisteringLanguage() {
106 | EuroparlDetector.getInstance().register("lang", new AbstractGramTree(0,0,0) {});
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/jlangdetect-europarl/src/test/resources/logback.groovy:
--------------------------------------------------------------------------------
1 | root(INFO, ["CONSOLE"]) // root logger at INFO, attached to an appender named "CONSOLE"; NOTE(review): this file declares no appender with that name - confirm where it is defined
2 |
--------------------------------------------------------------------------------
/jlangdetect-extra/build.gradle.kts:
--------------------------------------------------------------------------------
1 | plugins { // Build script of the jlangdetect-extra module
2 | id("me.champeau.jlangdetect.java-conventions") // project convention plugin, also applied by the jlangdetect module's build script
3 | }
4 |
5 | dependencies {
6 | api(project(":jlangdetect")) // api scope: UberLanguageDetector's public register method takes an AbstractGramTree from that project
7 | implementation(project(":jlangdetect-europarl")) // presumably provides EuroparlDetector, which UberLanguageDetector only uses internally - confirm
8 | implementation(libs.slf4j) // logging facade used by UberLanguageDetector
9 | testImplementation(libs.testng) // test framework used by UberLanguageDetectionTest
10 | testImplementation(libs.janino) // NOTE(review): not referenced by the visible test sources; presumably needed by the logging configuration - confirm
11 | testRuntimeOnly(libs.logbackClassic) // logging backend, only on the test runtime class path
12 | }
13 |
14 | description = "JLangDetect extras"
15 |
--------------------------------------------------------------------------------
/jlangdetect-extra/src/main/java/me/champeau/ld/UberLanguageDetector.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package me.champeau.ld;
19 | /**
20 | * Created by IntelliJ IDEA.
21 | * User: cedric
22 | * Date: 27/06/11
23 | * Time: 09:51
24 | */
25 |
26 |
27 | import org.slf4j.Logger;
28 | import org.slf4j.LoggerFactory;
29 |
30 | import java.io.BufferedInputStream;
31 | import java.io.IOException;
32 | import java.io.ObjectInputStream;
33 |
34 | /**
35 | * A language detector which includes the {@link EuroparlDetector Europarl detection resources} plus extra languages :
36 | *
37 | * - Russian
38 | * - Chinese
39 | * - Japanese
40 | * - Korean
41 | *
42 | *
43 | * The extra languages have been learnt thanks to the Project Gutenberg (www.gutenberg.org) resources which are not
44 | * sufficient for excellent accuracy but should be enough for most needs.
45 | *
46 | * Note that due to the lack of royalty-free corpora, these language profiles should be used with caution. For example,
47 | * Russian and Bulgarian can look very similar, and the detector is likely to confuse them.
48 | *
49 | */
50 | public class UberLanguageDetector extends LangDetector {
51 |     private final static Logger theLogger = LoggerFactory.getLogger(UberLanguageDetector.class);
52 |
53 |     /** Codes of the languages registered on top of the Europarl profiles. */
54 |     public final static String[] EXTRA_LANGUAGES = {"ru","zh","ja","ko"};
55 |
56 |     /**
57 |      * The shared instance. The register overrides treat a non-null INSTANCE as "loading is over";
58 |      * the constructor can register languages only because it runs while this field is still null.
59 |      */
60 |     private final static UberLanguageDetector INSTANCE = new UberLanguageDetector();
61 |
62 |     /**
63 |      * Copies the language profiles of the Europarl detector, then registers one serialized profile
64 |      * per entry of {@link #EXTRA_LANGUAGES}. A profile that cannot be found or opened is logged and skipped.
65 |      */
66 |     protected UberLanguageDetector() {
67 |         super(EuroparlDetector.getInstance());
68 |         // The extra profiles are resources of this module: look them up through this class' loader.
69 |         ClassLoader loader = UberLanguageDetector.class.getClassLoader();
70 |         for (String lang : EXTRA_LANGUAGES) {
71 |             String resource = "jlangdetect-extra/" + lang + "_tree.bin";
72 |             java.io.InputStream raw = loader.getResourceAsStream(resource);
73 |             if (raw == null) {
74 |                 theLogger.warn("Unable to find extra resources for language " + lang + " (" + resource + ")");
75 |                 continue;
76 |             }
77 |             try {
78 |                 register(lang, new ObjectInputStream(new BufferedInputStream(raw)));
79 |             } catch (IOException e) {
80 |                 theLogger.warn("Unable to read extra resources for language " + lang, e);
81 |                 try {
82 |                     raw.close();
83 |                 } catch (IOException ignored) {
84 |                     // nothing more to do: this profile is skipped anyway
85 |                 }
86 |             }
87 |         }
88 |     }
89 |
90 |     /** Returns the shared detector instance. */
91 |     public static UberLanguageDetector getInstance() {
92 |         return INSTANCE;
93 |     }
94 |
95 |     /**
96 |      * Registers a profile; only allowed while the shared instance is being constructed.
97 |      * @throws IllegalStateException once the shared instance exists
98 |      */
99 |     @Override
100 |     public void register(final String lang, final AbstractGramTree tree) {
101 |         if (INSTANCE!=null) throw new IllegalStateException("Cannot add languages to Uber detector once loaded");
102 |         super.register(lang, tree);
103 |     }
104 |
105 |     /**
106 |      * Registers a serialized profile; only allowed while the shared instance is being constructed.
107 |      * @throws IllegalStateException once the shared instance exists
108 |      */
109 |     @Override
110 |     public void register(final String lang, final ObjectInputStream in) {
111 |         if (INSTANCE!=null) throw new IllegalStateException("Cannot add languages to Uber detector once loaded");
112 |         super.register(lang, in);
113 |     }
114 |
115 | }
116 |
--------------------------------------------------------------------------------
/jlangdetect-extra/src/main/resources/jlangdetect-extra/ja_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-extra/src/main/resources/jlangdetect-extra/ja_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-extra/src/main/resources/jlangdetect-extra/ko_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-extra/src/main/resources/jlangdetect-extra/ko_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-extra/src/main/resources/jlangdetect-extra/ru_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-extra/src/main/resources/jlangdetect-extra/ru_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-extra/src/main/resources/jlangdetect-extra/zh_tree.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/melix/jlangdetect/179fe8655cccee1586d128f98e7ef37bd69bb3c9/jlangdetect-extra/src/main/resources/jlangdetect-extra/zh_tree.bin
--------------------------------------------------------------------------------
/jlangdetect-extra/src/test/java/com/champeau/ld/UberLanguageDetectionTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.champeau.ld;
19 |
20 | import me.champeau.ld.AbstractGramTree;
21 | import me.champeau.ld.LangDetector;
22 | import me.champeau.ld.UberLanguageDetector;
23 | import org.testng.annotations.Test;
24 |
25 | import java.util.Collection;
26 |
27 | import static org.testng.Assert.assertEquals;
28 | /**
29 | * User: cedric
30 | * Date: 21 sept. 2008
31 | * Time: 16:19:48
32 | */
33 |
34 | /**
35 | * A very simple test class aimed at testing the UberLangDetector.
36 | */
37 | public class UberLanguageDetectionTest {
38 |
39 | @Test
40 | public void shouldDetectLanguages() {
41 | String[][] texts = new String[][] {
42 | new String[] {"un texte en français","fr"},
43 | new String[] {"a text in english","en"},
44 | new String[] {"un texto en español","es"},
45 | new String[] {"un texte un peu plus long en français","fr"},
46 | new String[] {"a text a little longer in english","en"},
47 | new String[] {"a little longer text in english","en"},
48 | new String[] {"un texto un poco más largo en español","es"},
49 | new String[] {"J'aime les bisounours !","fr"},
50 | new String[] {"Bienvenue à Montmartre !", "fr"},
51 | new String[] {"Welcome to London !", "en"},
52 | new String[] {"un piccolo testo in italiano", "it"},
53 | new String[] {"Du kan blive medlem ved at melde dig ind her.", "da"},
54 | new String[] {"Kaasotsustamismenetlusel vastu võetud aktide allkirjastamine", "et"},
55 | new String[] {"μια μικρή ελληνική γλώσσα", "el"},
56 | new String[] {"На 16 юни в 11.00 ч. сутринта местно време в щата Аляска, САЩ, е усетено", "bg"},
57 | new String[] {"Směrnice navíc zakáže nadstandardní zpoplatnění tzv. zákaznických linek.", "cs"},
58 | new String[] {"Emellett igény fogalmazódott meg az iparági önszabályozási és belső konfliktuskezelési, valamint a magyar vasút külföldi vasutak szervezeteivel fennálló kapcsolat-tartási feladatok ellátására is.", "hu"},
59 | new String[] {"een kleine Nederlandse tekst", "nl"},
60 | new String[] {"Matching sur des lexiques", "fr"},
61 | new String[] {"Tarybos perduoti susitarimų tekstai", "lt"},
62 | new String[] {"Koplēmuma procedūrā pieņemto tiesību aktu", "lv"},
63 | new String[] {"Utworzenie komisji śledczej i komisji tymczasowej", "pl"},
64 | new String[] {"Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal", "ro"},
65 | new String[] {"Skončenie rokovania", "sk"},
66 | new String[] {"Fru talman! Rörande en ordningsfråga.", "sv"},
67 | new String[] {"Homofobija v Evropi", "sl"},
68 | new String[] {"Matching on lexicons", "en"},
69 | new String[] {"Une première optimisation consiste à ne tester que les sous-chaînes de taille compatibles avec le lexique.", "fr"},
70 | new String[] {"A otimização é a primeira prova de que não sub-canais compatível com o tamanho do léxico.", "pt"},
71 | new String[] {"Ensimmäinen optimointi ei pidä testata, että osa-kanavien kanssa koko sanakirja.", "fi"},
72 | new String[] {"您好", "zh"},
73 | new String[] {"端午节", "zh"},
74 | new String[] {"6月24日,欧盟成员国领导人任命意大利中央銀行行长馬里奧·德拉吉(图)為下一任歐洲中央銀行行長,以接替10月底离任的让-克洛德·特里谢。", "zh"},
75 | new String[] {"ウィキペディアはオープンコンテントの百科事典です。方針に賛同していただけるなら、誰でも記事を編集したり新しく作成したりできます。ガイドブックを読んでから、サンドボックスで練習してみましょう。質問は利用案内でどうぞ", "ja"},
76 | new String[] {"松本サリン事件(1994年)", "ja"},
77 | new String[] {"В Госдуму внесён законопроект о службе в органах внутренних дел", "ru"},
78 | new String[] {"Виктор Христенко назначен специальным представителем Президента по вопросу внесения изменений в Договор о Комиссии Таможенного союза", "ru"},
79 | new String[] {"Виктор", "ru"},
80 | new String[] {"여기로 연결됩니다. 다른 뜻에 대해서는", "ko"},
81 | };
82 |
83 | UberLanguageDetector detector = UberLanguageDetector.getInstance();
84 |
85 | for (String[] text : texts) {
86 | String det = detector.detectLang(text[0]);
87 | System.out.println("langof(\""+text[0]+"\") = " + det + " : " + (det.equals(text[1])?"OK":"Error"));
88 | assertEquals(det,text[1]);
89 | }
90 |
91 | }
92 |
93 | @Test
94 | public void testScores() {
95 | final Collection scores = UberLanguageDetector.getInstance().scoreLanguages(
96 | "Виктор"
97 | );
98 | System.out.println("scores = " + scores);
99 | }
100 |
101 | @Test(expectedExceptions = IllegalStateException.class)
102 | public void shouldFailRegisteringLanguage() {
103 | UberLanguageDetector.getInstance().register("lang", new AbstractGramTree(0,0,0) {});
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/jlangdetect-extra/src/test/resources/logback.groovy:
--------------------------------------------------------------------------------
1 | root(INFO, ["CONSOLE"]) // root logger at INFO, attached to an appender named "CONSOLE"; NOTE(review): this file declares no appender with that name - confirm where it is defined
2 |
--------------------------------------------------------------------------------
/jlangdetect/build.gradle.kts:
--------------------------------------------------------------------------------
1 | plugins { // Build script of the core jlangdetect module
2 | id("me.champeau.jlangdetect.java-conventions") // project convention plugin, also applied by the jlangdetect-extra build script
3 | }
4 |
5 | description = "Java Language Detector"
6 |
7 | dependencies {
8 | implementation(libs.slf4j) // logging facade used by the main sources (e.g. AbstractGramTree)
9 | }
--------------------------------------------------------------------------------
/jlangdetect/src/main/java/me/champeau/ld/AbstractGramTree.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | *
18 | */
19 |
20 | package me.champeau.ld;
21 |
22 | import org.slf4j.Logger;
23 | import org.slf4j.LoggerFactory;
24 |
25 | import java.io.Serializable;
26 |
27 | /**
28 | * A gram tree is used to learn n-grams from texts, and is able to score a text.
29 | * The n-gram data is represented as a lexical tree. The representation is rather compact, but one
30 | * could do better with annotated-DFAs.
31 | *
32 | * Not thread-safe.
33 | *
34 | */
35 | public abstract class AbstractGramTree implements Serializable {
36 | private final static Logger theLogger = LoggerFactory.getLogger(AbstractGramTree.class);
37 |
38 | private static final long serialVersionUID = 3284917449023378874L;
39 | protected AbstractNode root;
40 | protected long gramcount;
41 | protected int min;
42 | protected int max;
43 |
44 |     /** Creates a tree for n-grams of {@code min} to {@code max} characters with a gram count of zero. */
45 |     protected AbstractGramTree(int min, int max) {
46 |         this(min, max, 0L);
47 |     }
48 |
49 |     /** Creates a tree for n-grams of {@code min} to {@code max} characters with the given gram count. */
50 |     protected AbstractGramTree(int min, int max, long gramcount) {
51 |         this.min = min;
52 |         this.max = max;
53 |         this.gramcount = gramcount;
54 |     }
55 |
56 |     /**
57 |      * Scores a text against this tree: the scores of its n-grams are summed and the sum is divided
58 |      * by the natural logarithm of {@code gramcount}.
59 |      * @param text the text to be checked
60 |      * @return the normalized score
61 |      */
62 |     public double scoreText(CharSequence text) {
63 |         double sum = 0;
64 |         for (CharSequence gram : new NGramTokenizer(text, min, max)) {
65 |             final double gramScore = scoreGram(gram);
66 |             if (theLogger.isDebugEnabled()) {
67 |                 theLogger.debug(gram + " scores " + gramScore);
68 |             }
69 |             sum += gramScore;
70 |         }
71 |         final double normalizer = Math.log(gramcount);
72 |         final double score = sum / normalizer;
73 |         if (theLogger.isDebugEnabled()) {
74 |             theLogger.debug(text + ", total " + sum + "/" + normalizer + "=" + score);
75 |         }
76 |         return score;
77 |     }
78 |
79 | private double scoreGram(CharSequence gram) {
80 | AbstractNode cur = root;
81 | for (int i=0; ic) return null;
104 | }
105 | return null;
106 | }
107 |
108 | }
109 | }
110 |
--------------------------------------------------------------------------------
/jlangdetect/src/main/java/me/champeau/ld/GramTreeBuilder.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | *
18 | */
19 |
20 | package me.champeau.ld;
21 |
22 | import me.champeau.ld.learn.util.LearningException;
23 |
24 | import java.util.*;
25 |
26 | /**
27 | * A gram tree is used to learn n-grams from texts, and is able to score a text. The n-gram data is represented as a
28 | * lexical tree. The representation is rather compact, but one could do better with annotated-DFAs.
29 | *
30 | * Not thread-safe.
31 | *
32 | */
33 | public class GramTreeBuilder extends AbstractGramTree {
34 |
35 | private static final long serialVersionUID = 4421643808498040212L;
36 | private boolean built = false;
37 | private double truncationThreshold = 1.0;
38 |
39 |     /**
40 |      * Creates an empty builder whose root node carries the NUL character.
41 |      *
42 |      * @param min minimal n-gram size
43 |      * @param max maximum n-gram size
44 |      */
45 |     public GramTreeBuilder(int min, int max) {
46 |         super(min, max);
47 |         this.root = new NodeBuilder((char) 0);
48 |     }
49 |
50 |     /** Sets the truncation threshold later read by build(); values below 0 or above 1.0 are rejected. */
51 |     public void setTruncationThreshold(final double truncationThreshold) {
52 |         final boolean outOfRange = truncationThreshold > 1.0d || truncationThreshold < 0;
53 |         if (outOfRange) throw new IllegalArgumentException("Truncation threshold must be comprised between 0.0 and 1.0");
54 |         this.truncationThreshold = truncationThreshold;
55 |     }
56 |
57 |     /**
58 |      * Learns from a text: every n-gram the tokenizer produces for it is added to the tree.
59 |      *
60 |      * @param text character sequence to learn n-grams from.
61 |      */
62 |     public void learn(CharSequence text) {
63 |         if (built) throw new IllegalStateException("N-Gram tree has already been built");
64 |         final Iterator<CharSequence> grams = new NGramTokenizer(text, min, max).iterator();
65 |         while (grams.hasNext()) {
66 |             addGram(grams.next());
67 |         }
68 |     }
69 |
70 | /**
71 | * Adds a single n-gram to the n-gram tree.
72 | *
73 | * @param gram n-gram to be added to the tree.
74 | */
75 | private void addGram(CharSequence gram) {
76 | if (built) throw new IllegalStateException("N-Gram tree has already been built");
77 | NodeBuilder cur = (NodeBuilder) root;
78 | for (int i = 0; i < gram.length(); i++) {
79 | char c = gram.charAt(i);
80 | NodeBuilder next = (NodeBuilder) cur.getChild(c);
81 | if (next == null) next = cur.addTransition(c);
82 | cur = next;
83 | if (i == gram.length() - 1) cur.inc();
84 | }
85 | gramcount++;
86 | }
87 |
88 | /**
89 | * Optimizes the n-gram tree memory consumption.
90 | *
91 | * @return an immutable gram tree
92 | */
93 | public AbstractGramTree build() {
94 | built = true;
95 | final NodeBuilder nodeBuilder = (NodeBuilder) root;
96 | ArrayList freqs = new ArrayList();
97 | nodeBuilder.collectFreqs(freqs);
98 | Collections.sort(freqs);
99 | root = nodeBuilder.build(freqs.get((int) (freqs.size()*(1.0-truncationThreshold))));
100 | return new GramTreeImpl(root, min, max, gramcount);
101 | }
102 |
103 |
104 | /**
105 | * A node of the n-gram tree. Consists of a character, its frequency, and the list of followers.
106 | */
107 | private static class NodeBuilder extends AbstractNode implements Comparable {
108 | private final static int DEFAULT_ALLOC = 64;
109 |
110 | int childcount;
111 |
112 | private NodeBuilder(char c) { // creates a node for character c
113 | this.c = c;
114 | this.freq = 0; // no occurrence counted yet
115 | this.childcount = 0; // no child stored yet
116 | }
117 |
118 | public AbstractNode getChild(char c) { // returns the child holding character c, or null when there is none
119 | if (children == null) return null; // no children array yet
120 | for (int i = 0; i < childcount; i++) { // only the first childcount slots are in use
121 | if (children[i].c == c) return children[i];
122 | if (children[i].c > c) return null; // addTransition keeps the used slots sorted by character, so c cannot appear further on
123 | }
124 | return null;
125 | }
126 |
127 | public NodeBuilder addTransition(char c) { // adds a new child for character c and returns it; does not check for an existing child with the same character
128 | NodeBuilder child = new NodeBuilder(c);
129 | if (children == null) {
130 | children = new NodeBuilder[DEFAULT_ALLOC]; // first child: allocate the initial DEFAULT_ALLOC slots
131 | }
132 | if (childcount == children.length - 1) { // NOTE(review): grows one insertion before the array is actually full - confirm this is intended
133 | // reallocate
134 | NodeBuilder[] realloc = new NodeBuilder[children.length + DEFAULT_ALLOC]; // grow by a fixed DEFAULT_ALLOC step
135 | System.arraycopy(children, 0, realloc, 0, children.length);
136 | children = realloc;
137 | }
138 | children[childcount] = child;
139 | childcount++;
140 | Arrays.sort(children, 0, childcount); // re-sort the used slots by character (compareTo) so getChild can stop early
141 | return child;
142 | }
143 |
144 | private void inc() { // counts one more occurrence of the n-gram ending at this node
145 | if (freq==Integer.MAX_VALUE) { // fail instead of letting the counter overflow
146 | throw new LearningException("Maximum frequency is reached. N-Gram is too frequent in the corpus. Try to use a smaller corpus.");
147 | }
148 | freq++;
149 | }
150 |
151 | public int compareTo(NodeBuilder o) { // orders nodes by ascending character
152 | return c - o.c; // both operands are chars promoted to int, so the subtraction cannot overflow
153 | }
154 |
155 | /**
156 | * Builds an immutable n-gram tree from this builder data. The minimal frequency value is used to discard
157 | * n-grams which are supposed to be irrelevant for a language, reducing the total amount of memory required
158 | * to model a language.
159 | * @param minFreq the minimum number of occurrences an n-gram must have been seen to be considered relevant
160 | * @return an immutable n-gram tree
161 | */
162 | private AbstractNode build(final int minFreq) {
163 | if (childcount == 0) return new MinimalNode(c, freq, null); // leaf: built without a children array
164 | List children2 = new LinkedList(); // immutable versions of the children that are kept
165 | for (int i = 0; i < childcount; i++) {
166 | if (children[i].freq>=minFreq) children2.add(((NodeBuilder) children[i]).build(minFreq)); // a dropped child takes its whole subtree with it
167 | }
168 | return new MinimalNode(c, freq, children2.isEmpty()?null:children2.toArray(new AbstractNode[children2.size()])); // null rather than an empty array when every child was dropped
169 | }
170 |
171 | public void collectFreqs(ArrayList freqs) {
172 | freqs.add(freq);
173 | for (int i=0; i
33 | * Such an algorithm requires that the corpora used for training be as similar as possible. Parallel corpora are good
34 | * candidates.
35 | *
36 | */
37 | public class LangDetector {
38 | private final static Logger theLogger = LoggerFactory.getLogger(LangDetector.class);
39 |
40 | private Map statsMap = new HashMap();
41 |
42 | public LangDetector() {
43 | }
44 |
45 | /**
46 | * Creates a language detector using the same language profiles as the provided detector.
47 | * @param other the detector from which copy resources from.
48 | */
49 | protected LangDetector(LangDetector other) {
50 | for (Map.Entry entry : other.statsMap.entrySet()) {
51 | statsMap.put(entry.getKey(), entry.getValue());
52 | }
53 | }
54 |
55 | public void register(String lang, ObjectInputStream in) {
56 | try {
57 | statsMap.put(lang, (AbstractGramTree) in.readObject());
58 | in.close();
59 | } catch (IOException e) {
60 | e.printStackTrace();
61 | } catch (ClassNotFoundException e) {
62 | e.printStackTrace();
63 | }
64 | }
65 |
66 | public void register(String lang, AbstractGramTree tree) {
67 | statsMap.put(lang, tree);
68 | }
69 |
70 | /**
71 | * Performs a language detection, using the whole set of possible languages.
72 | *
73 | * @param aText the text for which to detect the language
74 | * @return the detected language
75 | */
76 | public String detectLang(CharSequence aText) {
77 | return detectLang(aText, statsMap.keySet());
78 | }
79 |
80 | /**
81 | * Performs a language detection, but limits the detection to the set of provided languages. This is useful when the
82 | * detector has been trained with many languages, but you wish to discriminate between a smaller set of possible
83 | * languages (or, you know that the document is either in english or french).
84 | *
85 | *
86 | * @param aText the text for which to detect the language
87 | * @param languageRestrictions the set of languages the detector should be limited to
88 | * @return the detected language or null if all scores are 0
89 | */
90 | public String detectLang(CharSequence aText, Set languageRestrictions) {
91 | double best = 0;
92 | String bestLang = null;
93 | for (Map.Entry entry : statsMap.entrySet()) {
94 | final String currentLanguage = entry.getKey();
95 | if (languageRestrictions.contains(currentLanguage)) {
96 | if (theLogger.isDebugEnabled()) {
97 | theLogger.debug("---------- testing : " + currentLanguage + " -------------");
98 | }
99 | double score = entry.getValue().scoreText(aText);
100 | if (theLogger.isDebugEnabled()) {
101 | theLogger.debug("---------- result : " + currentLanguage + " : " + score + " -------------");
102 | }
103 | if (score > best) {
104 | best = score;
105 | bestLang = currentLanguage;
106 | }
107 | }
108 | }
109 | return bestLang;
110 | }
111 |
112 | /**
113 | * Returns the scores of each language profile for the given input text. The language detection is limited
114 | * to the languages specified by the languageRestrictions parameter, and the resulting list is sorted by
115 | * descending score.
116 | * @param aText the text for which to detect score
117 | * @param languageRestrictions the list of languages to be tested
118 | * @return the scores for each language, sorted by descending score
119 | */
120 | public Collection scoreLanguages(CharSequence aText, Set languageRestrictions) {
121 | List scores = new LinkedList();
122 | for (Map.Entry entry : statsMap.entrySet()) {
123 | final String currentLanguage = entry.getKey();
124 | if (languageRestrictions.contains(currentLanguage)) {
125 | scores.add(new Score(currentLanguage,entry.getValue().scoreText(aText)));
126 | }
127 | }
128 | Collections.sort(scores);
129 | return scores;
130 | }
131 |
132 | /**
133 | * Returns the scores of each language profile for the given input text. The resulting list is sorted by
134 | * descending score.
135 | * @param aText the text for which to detect score
136 | * @return the scores for each language, sorted by descending score
137 | */
138 | public Collection scoreLanguages(CharSequence aText) {
139 | return scoreLanguages(aText, statsMap.keySet());
140 | }
141 |
142 | public static class Score implements Comparable {
143 | private final String language;
144 | private final double score;
145 |
146 | public Score(final String language, final double score) {
147 | this.language = language;
148 | this.score = score;
149 | }
150 |
151 | public int compareTo(final Score o) {
152 | return Double.compare(o.score, score);
153 | }
154 |
155 | public String getLanguage() {
156 | return language;
157 | }
158 |
159 | public double getScore() {
160 | return score;
161 | }
162 |
163 | @Override
164 | public String toString() {
165 | final StringBuilder sb = new StringBuilder();
166 | sb.append("Score");
167 | sb.append("{language='").append(language).append('\'');
168 | sb.append(", score=").append(score);
169 | sb.append('}');
170 | return sb.toString();
171 | }
172 | }
173 | }
174 |
175 |
--------------------------------------------------------------------------------
/jlangdetect/src/main/java/me/champeau/ld/NGramIterator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | *
18 | */
19 |
20 | package me.champeau.ld;
21 |
22 | import java.util.Iterator;
23 |
/**
 * Iterates over a char sequence to produce n-grams. Requires both minimal and maximal gram length.
 * <p>
 * Grams are produced by increasing length: first every gram of {@code min} characters, from the
 * start of the buffer to its end, then every gram of {@code min + 1} characters, and so on up to
 * {@code max} characters.
 */
public class NGramIterator implements Iterator<CharSequence> {
    private final CharSequence buffer;
    private final int max;

    // Start offset of the gram returned by the last call to next(); -1 before the first call.
    private int pos;
    // Length of the grams currently being produced.
    private int window;

    /**
     * @param buffer the text to cut into n-grams
     * @param min length of the shortest grams to produce
     * @param max length of the longest grams to produce
     */
    public NGramIterator(CharSequence buffer, int min, int max) {
        this.buffer = buffer;
        this.max = max;
        pos = -1;
        window = min;
    }

    /**
     * Tells whether {@link #next()} would return a gram. The state of the iterator is not modified.
     *
     * @return {@code true} if at least one more gram is available
     */
    @Override
    public boolean hasNext() {
        if (window > max) {
            return false;
        }
        final int length = buffer.length();
        if (pos + 1 + window <= length) {
            // one more gram of the current length fits after the current position
            return true;
        }
        // The current length is exhausted: next() restarts at offset 0 with a gram one character
        // longer, which only exists if that length is allowed and fits in the buffer.
        final int nextWindow = window + 1;
        return nextWindow <= max && nextWindow <= length;
    }

    /**
     * Advances to the next gram.
     *
     * @return the next gram, or {@code null} when the iteration is exhausted
     */
    @Override
    public CharSequence next() {
        pos++;
        if (pos + window > buffer.length()) {
            // end of buffer reached for this gram length: restart with longer grams
            pos = 0;
            window++;
        }
        if ((window > max) || (pos + window > buffer.length())) {
            return null;
        }
        return buffer.subSequence(pos, pos + window);
    }

    /**
     * Removal is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }

}
64 |
--------------------------------------------------------------------------------
/jlangdetect/src/main/java/me/champeau/ld/NGramTokenizer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | *
17 | *
18 | */
19 |
20 | package me.champeau.ld;
21 |
22 | import java.util.Iterator;
23 |
24 | /**
25 | * Tokenizes char sequences. Wrapper for an @link NGramIterator.
26 | */
27 | public class NGramTokenizer implements Iterable {
28 | final private CharSequence buffer;
29 | final private int min,max;
30 |
31 | public NGramTokenizer(CharSequence buffer, int min, int max) {
32 | this.buffer = buffer;
33 | this.min = min;
34 | this.max = max;
35 | }
36 |
37 | public Iterator iterator() {
38 | return new NGramIterator(buffer, min, max);
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/jlangdetect/src/main/java/me/champeau/ld/learn/util/DirectoryLearning.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package me.champeau.ld.learn.util;
19 |
20 | import me.champeau.ld.AbstractGramTree;
21 | import me.champeau.ld.GramTreeBuilder;
22 | import me.champeau.ld.LangDetector;
23 | import org.slf4j.Logger;
24 | import org.slf4j.LoggerFactory;
25 |
26 | import java.io.*;
27 | import java.util.LinkedList;
28 | import java.util.List;
29 | import java.util.Map;
30 | import java.util.concurrent.*;
31 |
32 | /**
33 | * A simple learning tool which takes a directory as input, and another directory as output. The input directory
34 | * must consist of subdirectories which name correspond to a language to learn. Each language directory is supposed
35 | * to contain a list of plain text files encoded in UTF-8.
36 | *
37 | * The resulting output directory will
38 | * consist of files named : [lang]_tree.bin
39 | *
40 | * Training takes less than 1 minute/language on my computer, with a quad core processor. The loader has been optimized
41 | * for multi-core systems.
42 | *
43 | */
44 | public class DirectoryLearning {
45 | private final static Logger theLogger = LoggerFactory.getLogger(DirectoryLearning.class);
46 |
47 | /**
48 | * Reads a single EPPPC file, strips XML lines and returns a single string containing raw text.
49 | * @param aFile a EPPPC text file, encoded in UTF-8.
50 | * @return raw text
51 | */
52 | private static String readSingleFile(File aFile) {
53 | StringBuilder sb = new StringBuilder();
54 | try {
55 | BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(aFile), "UTF-8"));
56 | String line;
57 | while ((line = reader.readLine()) != null) {
58 | sb.append(line).append('\n');
59 | }
60 | reader.close();
61 | } catch (IOException e) {
62 | theLogger.error("Unable to read file : " + aFile);
63 | return null;
64 | }
65 | return sb.toString();
66 | }
67 |
68 | /**
69 | * Returns a map (lang code -> gram tree) of gram trees
70 | * @param srcDir source directory where to find language specific directories
71 | * @param dstDir output directory for compiled n-grams trees
72 | * @param langs list of languages to be compiled
73 | * @return the map of trees
74 | */
75 | private static Map readCorpus(final File srcDir, final File dstDir, String[] langs) {
76 | int threads = Runtime.getRuntime().availableProcessors();
77 | ExecutorService service = Executors.newFixedThreadPool(threads);
78 | theLogger.info("Parallel processing of "+threads+" languages over "+langs.length+"...");
79 | List> tasks = new LinkedList>();
80 | final Map trees = new ConcurrentHashMap();
81 | for (final String lang : langs) {
82 | tasks.add(service.submit(new Runnable() {
83 | public void run() {
84 | theLogger.info("Processing directory " + lang);
85 | GramTreeBuilder tree = new GramTreeBuilder(1, 3);
86 | tree.setTruncationThreshold(0.1d);
87 | if (lang.equals("ru")) {
88 | tree.setTruncationThreshold(0.2d);
89 | }
90 | File sourceFiles = new File(srcDir, lang);
91 | File[] files = sourceFiles.listFiles();
92 | int cpt = 0;
93 | for (File file : files) {
94 | final String text = readSingleFile(file);
95 | final String[] lines = text.split("\n");
96 | for (String line : lines) {
97 | tree.learn(line);
98 | }
99 | cpt++;
100 | if (cpt % 20 == 0) {
101 | theLogger.info("Processed " + (100 * cpt / files.length) + "% of " + lang);
102 | }
103 | }
104 | final AbstractGramTree build = tree.build();
105 | trees.put(lang, build);
106 | theLogger.info("Saving tree : "+lang);
107 | File dst = new File(dstDir, lang+"_tree.bin");
108 | try {
109 | ObjectOutputStream out = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(dst)));
110 | out.writeObject(build);
111 | out.close();
112 | } catch (IOException e) {
113 | theLogger.error("Unable to write lang tree "+lang,e);
114 | }
115 | theLogger.info("Lang "+ lang+" complete !");
116 | }
117 | }));
118 | }
119 | // passive wait
120 | for (Future> task : tasks) {
121 | try {
122 | task.get();
123 | } catch (InterruptedException e) {
124 | e.printStackTrace();
125 | } catch (ExecutionException e) {
126 | e.printStackTrace();
127 | }
128 | }
129 | service.shutdown();
130 | return trees;
131 | }
132 |
133 | public static void main(String[] args) {
134 | if (args.length != 2) {
135 | System.out.println("Usage : java " + DirectoryLearning.class.getCanonicalName() + " ");
136 | System.exit(-1);
137 | }
138 | File srcDir = new File(args[0]);
139 | File dstDir = new File(args[1]);
140 | dstDir.mkdirs();
141 | final String[] langs = srcDir.list();
142 | LangDetector detector = new LangDetector();
143 | Map trees = readCorpus(srcDir, dstDir, langs);
144 | for (Map.Entry entry : trees.entrySet()) {
145 | detector.register(entry.getKey(), entry.getValue());
146 | }
147 | }
148 |
149 | }
150 |
--------------------------------------------------------------------------------
/jlangdetect/src/main/java/me/champeau/ld/learn/util/LearningException.java:
--------------------------------------------------------------------------------
1 | package me.champeau.ld.learn.util;
2 |
/**
 * Unchecked exception signalling that an error was encountered during corpus learning.
 */
public class LearningException extends RuntimeException {

    /**
     * Creates an exception carrying only a detail message.
     *
     * @param message the detail message
     */
    public LearningException(final String message) {
        super(message);
    }

    /**
     * Creates an exception carrying a detail message and the error that caused it.
     *
     * @param message the detail message
     * @param cause the underlying error
     */
    public LearningException(final String message, final Throwable cause) {
        super(message, cause);
    }

    /**
     * Creates an exception wrapping the error that caused it.
     *
     * @param cause the underlying error
     */
    public LearningException(final Throwable cause) {
        super(cause);
    }

    /**
     * Creates an exception with neither detail message nor cause.
     */
    public LearningException() {
        super();
    }

    /**
     * Creates an exception with full control over suppression and stack trace capture;
     * all arguments are forwarded unchanged to the matching {@link RuntimeException} constructor.
     *
     * @param message the detail message
     * @param cause the underlying error
     * @param enableSuppression whether suppression is enabled
     * @param writableStackTrace whether the stack trace is writable
     */
    protected LearningException(
            final String message,
            final Throwable cause,
            final boolean enableSuppression,
            final boolean writableStackTrace) {
        super(message, cause, enableSuppression, writableStackTrace);
    }
}
27 |
--------------------------------------------------------------------------------
/settings.gradle.kts:
--------------------------------------------------------------------------------
// Name of the root project of this multi-project build.
rootProject.name = "jlangdetect-parent"

// Turns on the VERSION_CATALOGS feature preview.
enableFeaturePreview("VERSION_CATALOGS")

// Sub-projects taking part in the build.
include(
    ":jlangdetect-europarl",
    ":jlangdetect",
    ":jlangdetect-extra"
)

// Repositories are declared once here, in the settings script.
dependencyResolutionManagement {
    repositories {
        mavenCentral()
    }
}
--------------------------------------------------------------------------------