├── .gitignore
├── src
    ├── main
    │   ├── java
    │   │   ├── com
    │   │   │   └── optimaize
    │   │   │   │   └── langdetect
    │   │   │   │       ├── cybozu
    │   │   │   │           ├── package.html
    │   │   │   │           ├── util
    │   │   │   │           │   ├── package.html
    │   │   │   │           │   ├── Messages.java
    │   │   │   │           │   ├── TagExtractor.java
    │   │   │   │           │   ├── NGram.java
    │   │   │   │           │   ├── Util.java
    │   │   │   │           │   └── LangProfile.java
    │   │   │   │           └── GenProfile.java
    │   │   │   │       ├── ngram
    │   │   │   │           ├── package-info.java
    │   │   │   │           ├── NgramFilter.java
    │   │   │   │           ├── NgramExtractors.java
    │   │   │   │           ├── StandardNgramFilter.java
    │   │   │   │           ├── BackwardsCompatibleNgramFilter.java
    │   │   │   │           ├── OldNgramExtractor.java
    │   │   │   │           └── NgramExtractor.java
    │   │   │   │       ├── profiles
    │   │   │   │           ├── package-info.java
    │   │   │   │           ├── OldLangProfileConverter.java
    │   │   │   │           ├── util
    │   │   │   │           │   └── LanguageLister.java
    │   │   │   │           ├── LanguageProfileWriter.java
    │   │   │   │           ├── LanguageProfile.java
    │   │   │   │           ├── LanguageProfileBuilder.java
    │   │   │   │           ├── BuiltInLanguages.java
    │   │   │   │           ├── LanguageProfileImpl.java
    │   │   │   │           └── LanguageProfileReader.java
    │   │   │   │       ├── text
    │   │   │   │           ├── package-info.java
    │   │   │   │           ├── TextFilter.java
    │   │   │   │           ├── TextObjectFactory.java
    │   │   │   │           ├── CharNormalizerTextFilterImpl.java
    │   │   │   │           ├── UrlTextFilter.java
    │   │   │   │           ├── MultiTextFilter.java
    │   │   │   │           ├── CommonTextObjectFactories.java
    │   │   │   │           ├── TextObjectFactoryBuilder.java
    │   │   │   │           ├── RemoveMinorityScriptsTextFilter.java
    │   │   │   │           └── TextObject.java
    │   │   │   │       ├── frma
    │   │   │   │           ├── IOUtils.java
    │   │   │   │           ├── LangProfileWriter.java
    │   │   │   │           ├── GenProfile.java
    │   │   │   │           └── LangProfileReader.java
    │   │   │   │       ├── DetectedLanguage.java
    │   │   │   │       ├── LanguageDetector.java
    │   │   │   │       ├── NgramFrequencyData.java
    │   │   │   │       └── i18n
    │   │   │   │           └── LdLocale.java
    │   │   └── overview.html
    │   └── resources
    │   │   └── README.md
    └── test
    │   ├── resources
    │       ├── texts
    │       │   └── README.txt
    │       └── logback-test.xml
    │   └── java
    │       └── com
    │           └── optimaize
    │               └── langdetect
    │                   ├── frma
    │                       ├── IOUtilsTest.java
    │                       ├── LangProfileReaderTest.java
    │                       ├── LangProfileWriterTest.java
    │                       └── GenProfileTest.java
    │                   ├── text
    │                       ├── TextObjectTest.java
    │                       ├── MultiTextFilterTest.java
    │                       └── RemoveMinorityScriptsTextFilterTest.java
    │                   ├── ngram
    │                       ├── StandardNgramFilterTest.java
    │                       ├── BackwardsCompatibleNgramFilterTest.java
    │                       ├── OldNgramExtractorTest.java
    │                       └── NgramExtractorTest.java
    │                   ├── cybozu
    │                       ├── DetectedLanguageTest.java
    │                       └── util
    │                       │   ├── NGramTest.java
    │                       │   ├── LangProfileTest.java
    │                       │   └── TagExtractorTest.java
    │                   ├── profiles
    │                       ├── LanguageProfileWriterTest.java
    │                       ├── LanguageProfileBuilderTest.java
    │                       └── LanguageProfileReaderTest.java
    │                   ├── NgramFrequencyDataTest.java
    │                   ├── LanguageDetectorImplTest.java
    │                   ├── TechnicalLanguageDetectorImplTest.java
    │                   ├── i18n
    │                       └── LdLocaleTest.java
    │                   └── DataLanguageDetectorImplTest.java
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | /language-detector.iml
3 | .idea/


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/package.html:
--------------------------------------------------------------------------------
1 | 
2 | <body>
3 | Original language detection classes from https://code.google.com/p/language-detection/
4 | </body>
5 | 
6 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/util/package.html:
--------------------------------------------------------------------------------
1 | 
2 | <body>
3 | Provides the utility classes for language detection.
4 | Users don't use this package's classes directly.
5 | </body>
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/src/test/resources/texts/README.txt:
--------------------------------------------------------------------------------
1 | I created these by copying text from the Wikipedia articles.
2 | Example: https://de.wikipedia.org/wiki/Deutschland
3 | 
4 | The files are stored in UTF-8! (Save as UTF-8 in Windows Notepad)
5 | 


--------------------------------------------------------------------------------
/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | 
 3 | 
 4 | <configuration>
 5 | 
 6 |     <appender name="stderr" class="ch.qos.logback.core.ConsoleAppender">
 7 |         <Target>System.out</Target>
 8 |         <encoder>
 9 |             <pattern>%d{yyyy-MM-dd/HH:mm:ss.SSS/zzz} [%t] %-5p %m%n</pattern>
10 |         </encoder>
11 |         <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
12 |             <level>INFO</level>
13 |         </filter>
14 |     </appender>
15 | 
16 | 
17 |     <root level="ERROR">
18 |         <appender-ref ref="stderr"/>
19 |     </root>
20 | 
21 | </configuration>


--------------------------------------------------------------------------------
/src/main/resources/README.md:
--------------------------------------------------------------------------------
 1 | ## About the "languages" folder and files
 2 | 
 3 | Most of these files are from the original software from Nakatani Shuyo.
 4 | Unfortunately, the data sources from which they were generated are not available.
 5 | It looks like the text comes from Wikipedia pages.
 6 | 
 7 | To generate your own language profile, see the main readme at https://github.com/optimaize/language-detector
 8 | 
 9 | km Khmer:
10 | sources available, see https://github.com/optimaize/language-detector/issues/19
11 | 
12 | ## About the "languages.shorttext" folder and files
13 | 
14 | These files are from the original software from Nakatani Shuyo.
15 | 
16 | Either they are meant for detecting the language of short messages, or they were built from
17 | short message text, or both; I don't know.
18 | 
19 | 
20 | ## About the "messages.properties" file
21 | 
22 | It is used by the CharNormalizer.
23 | 
24 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/ngram/package-info.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /**
18 |  * Provides functionality for handling n-grams.
19 |  *
20 |  * <p>See http://en.wikipedia.org/wiki/N-gram</p>
21 |  *
22 |  * @author Fabian Kessler
23 |  */
24 | package com.optimaize.langdetect.ngram;
25 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/package-info.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /**
18 |  * Provides functionality for loading, storing and creating {@link com.optimaize.langdetect.profiles.LanguageProfile}s.
19 |  *
20 |  * @author Fabian Kessler
21 |  */
22 | package com.optimaize.langdetect.profiles;
23 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/ngram/NgramFilter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | /**
20 |  * Filters out some undesired n-grams.
21 |  *
22 |  * <p>Implementations must be immutable.</p>
23 |  *
24 |  * @author Fabian Kessler
25 |  */
26 | public interface NgramFilter {
27 | 
28 |     /**
29 |      * Decides whether a single n-gram passes this filter.
30 |      *
31 |      * @param ngram the n-gram to check
32 |      * @return {@code true} to use the n-gram, {@code false} to filter it out
33 |      */
34 |     boolean use(String ngram);
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/package-info.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | /**
18 |  * Provides functionality for concatenating and cleaning text that is used as
19 |  * a) learning text to produce {@link com.optimaize.langdetect.profiles.LanguageProfile}s, or
20 |  * b) the text for which the language is to be guessed.
21 |  *
22 |  * @author Fabian Kessler
23 |  */
24 | package com.optimaize.langdetect.text;
25 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/TextFilter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | /**
20 |  * Filters content out of a text so that it is ignored by the n-gram analysis.
21 |  *
22 |  * <p>Implementations must be immutable and stateless.</p>
23 |  *
24 |  * @author Fabian Kessler
25 |  */
26 | public interface TextFilter {
27 | 
28 |     /**
29 |      * Applies this filter to the given text.
30 |      *
31 |      * @param text the text to filter
32 |      * @return the filtered text
33 |      */
34 |     String filter(CharSequence text);
35 | 
36 | }
37 | 


--------------------------------------------------------------------------------
/src/main/java/overview.html:
--------------------------------------------------------------------------------
 1 | 
 2 | <body>
 3 | <p>
 4 | Language-Detection is a language detection library for Java. (aliases: language identification, language guessing)
 5 | </p>
 6 | 
 7 | <ul>
 8 | <li>Generate language profiles from Wikipedia abstract xml</li>
 9 | <li>Detect language of a text using naive Bayesian filter</li>
10 | </ul>
11 | 
12 | 
13 | <h2>Copyrights and License</h2>
14 | 
15 | <p>
16 | (c)2010 All rights reserved by Cybozu Labs, Inc.
17 | </p>
18 | 
19 | <blockquote>
20 | <p>
21 |    Licensed under the Apache License, Version 2.0 (the "License");
22 |    you may not use this file except in compliance with the License.
23 |    You may obtain a copy of the License at
24 | </p>
25 | <ul><li>
26 |        <a href="http://www.apache.org/licenses/LICENSE-2.0">http://www.apache.org/licenses/LICENSE-2.0</a>
27 | </li></ul>
28 | <p>
29 |    Unless required by applicable law or agreed to in writing, software
30 |    distributed under the License is distributed on an "AS IS" BASIS,
31 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
32 |    See the License for the specific language governing permissions and
33 |    limitations under the License.
34 | </p>
35 | </blockquote>
36 | 
37 | </body>
38 | 
39 | 
40 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/TextObjectFactory.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | /**
20 |  * Factory for {@link TextObject}s.
21 |  *
22 |  * @author Fabian Kessler
23 |  */
24 | public class TextObjectFactory {
25 | 
26 |     private final TextFilter textFilter;
27 |     private final int maxTextLength;
28 | 
29 |     /**
30 |      * @param maxTextLength 0 for none
31 |      */
32 |     public TextObjectFactory(TextFilter textFilter, int maxTextLength) {
33 |         this.textFilter = textFilter;
34 |         this.maxTextLength = maxTextLength;
35 |     }
36 | 
37 |     public TextObject create() {
38 |         return new TextObject(textFilter, maxTextLength);
39 |     }
40 | 
41 |     public TextObject forText(CharSequence text) {
42 |         return create().append(text);
43 |     }
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/frma/IOUtils.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import java.io.Closeable;
20 | import java.io.IOException;
21 | 
22 | /**
23 |  * Utils to manage IO streams.
24 |  *
25 |  * @author François ROLAND
26 |  * @deprecated use Java 7 try-with-resources instead.
27 |  */
28 | @Deprecated
29 | public class IOUtils {
30 | 	/**
31 | 	 * Private constructor to prevent instantiation.
32 | 	 */
33 | 	private IOUtils() {}
34 | 
35 | 	/**
36 | 	 * Closes a stream without returning any exception.
37 | 	 *
38 | 	 * <p>An {@link IOException} thrown by {@code close()} is deliberately discarded;
39 | 	 * any other exception still propagates.</p>
40 | 	 *
41 | 	 * @param stream the stream to close. Can be <code>null</code>.
42 | 	 * @deprecated use java7 closeable
43 | 	 */
44 | 	@Deprecated
45 | 	public static void closeQuietly(Closeable stream) {
46 | 		if (stream == null) {
47 | 			return;
48 | 		}
49 | 		try {
50 | 			stream.close();
51 | 		} catch (IOException ignored) {
52 | 			// Best-effort close: the caller asked for a quiet close, so the failure is dropped.
53 | 		}
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/frma/IOUtilsTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import static org.mockito.Mockito.*;
20 | 
21 | import java.io.Closeable;
22 | import java.io.IOException;
23 | 
24 | import org.junit.Test;
25 | 
26 | /**
27 |  * Tests for {@link IOUtils#closeQuietly(Closeable)}.
28 |  */
29 | public class IOUtilsTest {
30 | 
31 | 	/** Closing a mock stream must call {@code close()} exactly once. */
32 | 	@Test
33 | 	public void closeQuietly() throws IOException {
34 | 		Closeable closeable = mock(Closeable.class);
35 | 
36 | 		IOUtils.closeQuietly(closeable);
37 | 
38 | 		verify(closeable, times(1)).close();
39 | 	}
40 | 
41 | 	/** An {@link IOException} raised by {@code close()} must not reach the caller. */
42 | 	@Test
43 | 	public void closeQuietlyWhenExceptionThrown() throws IOException {
44 | 		Closeable closeable = mock(Closeable.class);
45 | 		IOException failure = new IOException();
46 | 		doThrow(failure).when(closeable).close();
47 | 
48 | 		IOUtils.closeQuietly(closeable);
49 | 	}
50 | 
51 | 	/** Passing {@code null} must be accepted without any exception. */
52 | 	@Test
53 | 	public void closeQuietlyNullStream() {
54 | 		IOUtils.closeQuietly(null);
55 | 	}
56 | 
57 | }
58 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/CharNormalizerTextFilterImpl.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.CharNormalizer;
20 | 
21 | /**
22 |  * Runs through the {@link CharNormalizer}.
23 |  *
24 |  * <p>Every character is normalized on its own; a space that directly follows another
25 |  * space (after normalization) is dropped, so runs of spaces collapse into one.</p>
26 |  *
27 |  * @author Fabian Kessler
28 |  * @deprecated can't be used because it would be a big loss to not inline this code.
29 |  */
30 | @Deprecated
31 | public class CharNormalizerTextFilterImpl implements TextFilter {
32 | 
33 |     /**
34 |      * @param text the text to normalize
35 |      * @return the normalized text with consecutive spaces collapsed
36 |      */
37 |     @Override
38 |     public String filter(CharSequence text) {
39 |         final int length = text.length();
40 |         // The output never has more characters than the input, so size the buffer once.
41 |         StringBuilder ret = new StringBuilder(length);
42 |         char pre = 0;
43 |         for (int i = 0; i < length; i++) {
44 |             char c = CharNormalizer.normalize(text.charAt(i));
45 |             // Skip a space only when the previously emitted candidate was a space too.
46 |             if (c != ' ' || pre != ' ') {
47 |                 ret.append(c);
48 |             }
49 |             pre = c;
50 |         }
51 |         return ret.toString();
52 |     }
53 | 
54 | }
55 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/util/Messages.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.cybozu.util;
18 | 
19 | import java.util.MissingResourceException;
20 | import java.util.ResourceBundle;
21 | 
22 | /**
23 |  * This is {@link Messages} class generated by Eclipse automatically.
24 |  * Users don't use this class directly.
25 |  *
26 |  * @author Nakatani Shuyo
27 |  */
28 | public class Messages {
29 |     private static final String BUNDLE_NAME = "com.optimaize.langdetect.cybozu.util.messages"; //$NON-NLS-1$
30 | 
31 |     private static final ResourceBundle RESOURCE_BUNDLE = ResourceBundle.getBundle(BUNDLE_NAME);
32 | 
33 |     private Messages() {
34 |     }
35 | 
36 |     public static String getString(String key) {
37 |         try {
38 |             return RESOURCE_BUNDLE.getString(key);
39 |         } catch (MissingResourceException e) {
40 |             return '!' + key + '!';
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/UrlTextFilter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | import java.util.regex.Pattern;
20 | 
21 | /**
22 |  * Removes URLs and email addresses from the text.
23 |  *
24 |  * <p>Every match is replaced by a single space. The class has no instance state and
25 |  * exposes one shared instance through {@link #getInstance()}.</p>
26 |  *
27 |  * @author Fabian Kessler
28 |  */
29 | public class UrlTextFilter implements TextFilter {
30 | 
31 |     private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
32 |     private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
33 | 
34 |     private static final UrlTextFilter INSTANCE = new UrlTextFilter();
35 | 
36 |     private UrlTextFilter() {
37 |     }
38 | 
39 |     /**
40 |      * @return the shared instance
41 |      */
42 |     public static UrlTextFilter getInstance() {
43 |         return INSTANCE;
44 |     }
45 | 
46 |     /**
47 |      * Strips URLs first, then email addresses from what remains.
48 |      *
49 |      * @param text the text to clean
50 |      * @return the text with each URL and each email address replaced by one space
51 |      */
52 |     @Override
53 |     public String filter(CharSequence text) {
54 |         String withoutUrls = URL_REGEX.matcher(text).replaceAll(" ");
55 |         String withoutMails = MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
56 |         return withoutMails;
57 |     }
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/ngram/NgramExtractors.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | /**
20 |  * Static access point for the commonly used {@link NgramExtractor} configurations.
21 |  *
22 |  * <p>Both configurations extract 1-, 2- and 3-grams and pad the text with a space;
23 |  * they differ only in the {@link NgramFilter} they apply.</p>
24 |  *
25 |  * @author Fabian Kessler
26 |  */
27 | public class NgramExtractors {
28 | 
29 |     private static final NgramExtractor STANDARD = NgramExtractor
30 |             .gramLengths(1, 2, 3)
31 |             .filter(StandardNgramFilter.getInstance())
32 |             .textPadding(' ');
33 | 
34 |     private static final NgramExtractor BACKWARDS = NgramExtractor
35 |             .gramLengths(1, 2, 3)
36 |             .filter(BackwardsCompatibleNgramFilter.getInstance())
37 |             .textPadding(' ');
38 | 
39 | 
40 |     /**
41 |      * The old way of doing n-grams.
42 |      *
43 |      * @return the shared extractor that uses the {@link BackwardsCompatibleNgramFilter}
44 |      */
45 |     public static NgramExtractor backwards() {
46 |         return BACKWARDS;
47 |     }
48 | 
49 |     /**
50 |      * The new standard n-gram algorithm.
51 |      *
52 |      * @return the shared extractor that uses the {@link StandardNgramFilter}
53 |      */
54 |     public static NgramExtractor standard() {
55 |         return STANDARD;
56 |     }
57 | 
58 | }
59 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/text/TextObjectTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | import org.junit.Test;
20 | import static org.junit.Assert.*;
21 | 
22 | /**
23 |  * Tests {@link TextObject}s created through a {@link TextObjectFactory} that uses the {@link UrlTextFilter}.
24 |  *
25 |  * @author Fabian Kessler
26 |  */
27 | public class TextObjectTest {
28 | 
29 |     /** Appended fragments are concatenated into one text. */
30 |     @Test
31 |     public void simpleText() throws Exception {
32 |         TextObjectFactory textObjectFactory = new TextObjectFactoryBuilder().withTextFilter(UrlTextFilter.getInstance()).build();
33 |         TextObject inputText = textObjectFactory.create().append("Dies ist").append(" ").append("deutscher Text.");
34 |         // JUnit's signature is assertEquals(expected, actual); the wrong order garbles failure messages.
35 |         assertEquals("Dies ist deutscher Text ", inputText.toString());
36 |     }
37 | 
38 |     /** An appended URL is removed by the configured filter. */
39 |     @Test
40 |     public void filteredContent() throws Exception {
41 |         TextObjectFactory textObjectFactory = new TextObjectFactoryBuilder().withTextFilter(UrlTextFilter.getInstance()).build();
42 |         TextObject inputText = textObjectFactory.create().append("deutscher Text").append(" ").append("http://www.github.com/");
43 |         assertEquals("deutscher Text ", inputText.toString());
44 |     }
45 | }
46 | 

--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/OldLangProfileConverter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.profiles;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.LangProfile;
20 | import com.optimaize.langdetect.i18n.LdLocale;
21 | 
22 | import java.util.Map;
23 | 
24 | /**
25 |  * Converts an old {@link LangProfile} to a new {@link LanguageProfile}.
26 |  *
27 |  * @author Fabian Kessler
28 |  */
29 | public class OldLangProfileConverter {
30 | 
31 |     /**
32 |      * Builds a {@link LanguageProfile} from the name and the gram frequencies of the old profile.
33 |      *
34 |      * @param langProfile the old profile to convert
35 |      * @return the profile produced by the {@link LanguageProfileBuilder}
36 |      * @throws RuntimeException if no {@link LdLocale} can be obtained from the profile's name
37 |      */
38 |     public static LanguageProfile convert(LangProfile langProfile) {
39 |         LanguageProfileBuilder builder = new LanguageProfileBuilder(localeOf(langProfile));
40 |         for (Map.Entry<String, Integer> gram : langProfile.getFreq().entrySet()) {
41 |             builder.addGram(gram.getKey(), gram.getValue());
42 |         }
43 |         return builder.build();
44 |     }
45 | 
46 |     /**
47 |      * Reads the profile's name as an {@link LdLocale}, wrapping any failure in a {@link RuntimeException}.
48 |      */
49 |     private static LdLocale localeOf(LangProfile langProfile) {
50 |         try {
51 |             return LdLocale.fromString(langProfile.getName());
52 |         } catch (Exception e) {
53 |             throw new RuntimeException("Profile file name logic was changed in v0.5, please update your custom profiles!", e);
54 |         }
55 |     }
56 | 
57 | }
58 | 

--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/ngram/StandardNgramFilterTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | import org.junit.Test;
20 | import static org.junit.Assert.*;
21 | 
22 | /**
23 |  * @author Fabian Kessler
24 |  */
25 | public class StandardNgramFilterTest {
26 | 
27 |     private static final NgramFilter filter = StandardNgramFilter.getInstance();
28 | 
29 |     /** "a" and "A" must be accepted, a lone space must be rejected. */
30 |     @Test
31 |     public void oneGram() throws Exception {
32 |         assertAllUsed("a", "A");
33 |         assertNoneUsed(" ");
34 |     }
35 | 
36 |     /** All listed inputs must be accepted, including all upper case and a trailing space. */
37 |     @Test
38 |     public void twoGram() throws Exception {
39 |         assertAllUsed("ab", "Ab", "AB", "a ", "a");
40 |     }
41 | 
42 |     /** A space at the border is accepted, a space in the middle is rejected. */
43 |     @Test
44 |     public void threeGram() throws Exception {
45 |         assertAllUsed("abc", "Abc", "ABC", "ab ", " ab");
46 |         assertNoneUsed("a c");
47 |     }
48 | 
49 |     /** Asserts, in the given order, that the filter accepts each n-gram. */
50 |     private static void assertAllUsed(String... ngrams) {
51 |         for (String ngram : ngrams) {
52 |             assertTrue(filter.use(ngram));
53 |         }
54 |     }
55 | 
56 |     /** Asserts, in the given order, that the filter rejects each n-gram. */
57 |     private static void assertNoneUsed(String... ngrams) {
58 |         for (String ngram : ngrams) {
59 |             assertFalse(filter.use(ngram));
60 |         }
61 |     }
62 | 
63 | }
58 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/text/MultiTextFilterTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | import com.google.common.collect.ImmutableList;
20 | import org.junit.Test;
21 | 
22 | import java.util.Collections;
23 | 
24 | import static org.junit.Assert.assertEquals;
25 | 
26 | /**
27 |  * @author Fabian Kessler
28 |  */
29 | public class MultiTextFilterTest {
30 | 
31 |     /** A MultiTextFilter built from an empty filter list must return the input text unchanged. */
32 |     @Test
33 |     public void empty() throws Exception {
34 |         MultiTextFilter noFilters = new MultiTextFilter(Collections.<TextFilter>emptyList());
35 |         //JUnit's argument order is (expected, actual); respecting it keeps failure messages correct.
36 |         assertEquals("foo", noFilters.filter("foo"));
37 |     }
38 | 
39 |     /**
40 |      * Two filters must run in the order given: the first turns "a" into "A", the second turns
41 |      * "A" into "B", so every "a" of the input ends up as "B".
42 |      */
43 |     @Test
44 |     public void doubleFilter() throws Exception {
45 |         MultiTextFilter twoFilters = new MultiTextFilter(ImmutableList.of(
46 |                 new TextFilter() {
47 |                     @Override
48 |                     public String filter(CharSequence text) {
49 |                         return text.toString().replace("a", "A");
50 |                     }
51 |                 }, new TextFilter() {
52 |                     @Override
53 |                     public String filter(CharSequence text) {
54 |                         return text.toString().replace("A", "B");
55 |                     }
56 |                 }
57 |         ));
58 |         assertEquals("nBnBnBB", twoFilters.filter("nananaa"));
59 |     }
60 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | import com.google.common.collect.ImmutableList;
20 | import org.jetbrains.annotations.NotNull;
21 | import org.jetbrains.annotations.Nullable;
22 | 
23 | import java.util.List;
24 | 
25 | /**
26 |  * Groups multiple {@link com.optimaize.langdetect.text.TextFilter}s as one and runs them in the given order.
27 |  *
28 |  * @author Fabian Kessler
29 |  */
30 | public class MultiTextFilter implements TextFilter {
31 | 
32 |     /** The filters in execution order, or {@code null} when there are none. */
33 |     @Nullable
34 |     private final List<TextFilter> filters;
35 | 
36 |     /**
37 |      * @param filters may be empty by definition; a non-empty list is copied
38 |      */
39 |     public MultiTextFilter(@NotNull List<TextFilter> filters) {
40 |         this.filters = filters.isEmpty() ? null : ImmutableList.copyOf(filters);
41 |     }
42 | 
43 |     /**
44 |      * Converts the text to a String and feeds it through each filter in turn, the output of
45 |      * one filter being the input of the next.
46 |      *
47 |      * @return the text after the last filter, or the plain String form when there are no filters
48 |      */
49 |     @Override
50 |     public String filter(CharSequence text) {
51 |         String current = text.toString();
52 |         if (filters != null) {
53 |             for (TextFilter each : filters) {
54 |                 current = each.filter(current);
55 |             }
56 |         }
57 |         return current;
58 |     }
59 | }
59 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/ngram/BackwardsCompatibleNgramFilterTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | import org.junit.Test;
20 | 
21 | import static org.junit.Assert.assertFalse;
22 | import static org.junit.Assert.assertTrue;
23 | 
24 | /**
25 |  * @author Fabian Kessler
26 |  */
27 | public class BackwardsCompatibleNgramFilterTest {
28 | 
29 |     public static final NgramFilter filter = BackwardsCompatibleNgramFilter.getInstance();
30 | 
31 |     /** "a" and "A" must be accepted, a lone space must be rejected. */
32 |     @Test
33 |     public void oneGram() throws Exception {
34 |         assertAllUsed("a", "A");
35 |         assertNoneUsed(" ");
36 |     }
37 | 
38 |     /** Mixed or lower case inputs must be accepted, the all upper case "AB" must be rejected. */
39 |     @Test
40 |     public void twoGram() throws Exception {
41 |         assertAllUsed("ab", "Ab", "a ", "a");
42 |         assertNoneUsed("AB");
43 |     }
44 | 
45 |     /** A space in the middle and the all upper case "ABC" must be rejected, the rest accepted. */
46 |     @Test
47 |     public void threeGram() throws Exception {
48 |         assertAllUsed("abc", "Abc", "ab ", " ab");
49 |         assertNoneUsed("a c", "ABC");
50 |     }
51 | 
52 |     /** Asserts, in the given order, that the filter accepts each n-gram. */
53 |     private static void assertAllUsed(String... ngrams) {
54 |         for (String ngram : ngrams) {
55 |             assertTrue(filter.use(ngram));
56 |         }
57 |     }
58 | 
59 |     /** Asserts, in the given order, that the filter rejects each n-gram. */
60 |     private static void assertNoneUsed(String... ngrams) {
61 |         for (String ngram : ngrams) {
62 |             assertFalse(filter.use(ngram));
63 |         }
64 |     }
65 | 
66 | }
61 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/CommonTextObjectFactories.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | /**
20 |  * Contains some standard {@link com.optimaize.langdetect.text.TextObjectFactory}s ready to use for
21 |  * common use cases.
22 |  *
23 |  * @author Fabian Kessler
24 |  */
25 | public class CommonTextObjectFactories {
26 | 
27 |     /**
28 |      * Factory with a maximum text length of 10000 that first applies the URL filter and then
29 |      * the minority-scripts filter with a threshold of 0.3.
30 |      */
31 |     public static TextObjectFactory forDetectingOnLargeText() {
32 |         TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
33 |         builder.maxTextLength(10000);
34 |         builder.withTextFilter(UrlTextFilter.getInstance());
35 |         builder.withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
36 |         return builder.build();
37 |     }
38 | 
39 |     /**
40 |      * Factory built from the builder's defaults: no text filters and no explicit length setting.
41 |      */
42 |     public static TextObjectFactory forDetectingShortCleanText() {
43 |         TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
44 |         return builder.build();
45 |     }
46 | 
47 |     /**
48 |      * Factory that first applies the URL filter and then the minority-scripts filter with a
49 |      * threshold of 0.3; the text length setting is left at the builder's default.
50 |      */
51 |     public static TextObjectFactory forIndexing() {
52 |         TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
53 |         builder.withTextFilter(UrlTextFilter.getInstance());
54 |         builder.withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3));
55 |         return builder.build();
56 |     }
57 | 
58 |     /**
59 |      * Factory built from the builder's defaults: no text filters and no explicit length setting.
60 |      */
61 |     public static TextObjectFactory forIndexingCleanText() {
62 |         TextObjectFactoryBuilder builder = new TextObjectFactoryBuilder();
63 |         return builder.build();
64 |     }
65 | 
66 | }
53 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/TextObjectFactoryBuilder.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | import java.util.ArrayList;
20 | import java.util.List;
21 | 
22 | /**
23 |  * Builder for {@link com.optimaize.langdetect.text.TextObjectFactory}.
24 |  *
25 |  * @author Fabian Kessler
26 |  */
27 | public class TextObjectFactoryBuilder {
28 | 
29 |     private int maxTextLength = 0;
30 |     private final List<TextFilter> textFilters = new ArrayList<>();
31 | 
32 |     /**
33 |      * @param maxTextLength 0 for no limit (that's the default).
34 |      */
35 |     public TextObjectFactoryBuilder maxTextLength(int maxTextLength) {
36 |         this.maxTextLength = maxTextLength;
37 |         return this;
38 |     }
39 | 
40 | 
41 |     /**
42 |      * Adds the given TextFilter to be run on {@link TextObject#append} methods.
43 |      *
44 |      * <p>Note that the order of filters. may be important. They are executed in the same order as they
45 |      * are passed in here.</p>
46 |      */
47 |     public TextObjectFactoryBuilder withTextFilter(TextFilter textFilter) {
48 |         textFilters.add(textFilter);
49 |         return this;
50 |     }
51 | 
52 |     public TextObjectFactory build() {
53 |         return new TextObjectFactory(
54 |                 new MultiTextFilter(textFilters),
55 |                 maxTextLength
56 |         );
57 |     }
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/ngram/OldNgramExtractorTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | import com.google.common.base.Stopwatch;
20 | import org.junit.Test;
21 | 
22 | import java.util.*;
23 | 
24 | import static org.junit.Assert.*;
25 | 
26 | /**
27 |  * @author Fabian Kessler
28 |  */
29 | public class OldNgramExtractorTest {
30 | 
31 |     /**
32 |      * Checks which n-grams are extracted from "Foo bar" when no filter is passed, and how many.
33 |      */
34 |     @Test
35 |     public void testExtractNGrams() {
36 |         List<String> ngrams = OldNgramExtractor.extractNGrams("Foo bar", null);
37 |         assertTrue(ngrams.contains("Foo"));
38 |         assertTrue(ngrams.contains("F"));
39 |         assertTrue(ngrams.contains(" Fo"));  //algorithm makes prefix-grams
40 |         assertFalse(ngrams.contains("ar ")); //algorithm does not make suffix-grams
41 |         //JUnit's argument order is (expected, actual); respecting it keeps failure messages correct.
42 |         assertEquals(18, ngrams.size()); //adapt when making changes to the extractor...
43 |     }
44 | 
45 |     /**
46 |      * Prints the extracted n-grams for manual inspection; nothing is asserted.
47 |      */
48 |     @Test
49 |     public void testExtractNGrams2() {
50 |         List<String> ngrams = OldNgramExtractor.extractNGrams("Hallo DAA.", null);
51 |         System.out.println(ngrams);
52 |     }
53 | 
54 | 
55 | 
56 |     /**
57 |      * Runs the extractor 100000 times on the same text and prints the elapsed time;
58 |      * nothing is asserted.
59 |      */
60 |     @Test
61 |     public void stressTestAlgo1() {
62 |         String text = "Foo bar hello world and so on nana nunu dada dudu asdf asdf akewf köjvnawer aisdfj awejfr iajdsöfj ewi adjsköfjwei ajsdökfj ief asd";
63 |         Stopwatch stopwatch = Stopwatch.createStarted();
64 |         for (int i=0; i<100000; i++) {
65 |             OldNgramExtractor.extractNGrams(text, null); //2.745s
66 |         }
67 |         System.out.println(stopwatch);
68 |     }
69 | 
70 | }
60 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | /**
20 |  * Filters what is generally not desired.
21 |  *
22 |  * Impl is immutable.
23 |  *
24 |  * @author Fabian Kessler
25 |  */
26 | public class StandardNgramFilter implements NgramFilter {
27 | 
28 |     private static final StandardNgramFilter INSTANCE = new StandardNgramFilter();
29 | 
30 |     /** Returns the single shared instance. */
31 |     public static NgramFilter getInstance() {
32 |         return INSTANCE;
33 |     }
34 | 
35 |     private StandardNgramFilter() {
36 |     }
37 | 
38 |     /**
39 |      * Decides whether an n-gram of length 1 to 4 is used.
40 |      *
41 |      * <p>A single space is rejected, and so is any 3- or 4-gram with a space in a middle
42 |      * position; a space at the border is fine. All 2-grams are accepted.</p>
43 |      *
44 |      * @throws UnsupportedOperationException for any other n-gram length
45 |      */
46 |     @Override
47 |     public boolean use(String ngram) {
48 |         final int length = ngram.length();
49 |         if (length == 1) {
50 |             return ngram.charAt(0) != ' ';
51 |         }
52 |         if (length == 2) {
53 |             return true;
54 |         }
55 |         if (length == 3) {
56 |             //the middle char must not be a space
57 |             return ngram.charAt(1) != ' ';
58 |         }
59 |         if (length == 4) {
60 |             //neither of the two middle chars may be a space
61 |             return ngram.charAt(1) != ' ' && ngram.charAt(2) != ' ';
62 |         }
63 |         //would need the same check: no space in the middle, border is fine.
64 |         throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length());
65 |     }
66 | 
67 | }
66 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/frma/LangProfileReaderTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.LangProfile;
20 | import org.junit.Test;
21 | 
22 | import java.io.File;
23 | import java.io.IOException;
24 | 
25 | import static org.hamcrest.Matchers.*;
26 | import static org.junit.Assert.assertThat;
27 | 
28 | /**
29 |  * Reads some of the language profile files found under src/main/resources/languages and
30 |  * checks their name, n-word array length and frequency map size.
31 |  */
32 | public class LangProfileReaderTest {
33 | 	private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages");
34 | 
35 | 	/** Checks the "en" profile. */
36 | 	@Test
37 | 	public void readEnFile() throws IOException {
38 | 		checkProfileFile("en", 3, 2301);
39 | 	}
40 | 
41 | 	/** Checks the "bn" profile. */
42 | 	@Test
43 | 	public void readBnFile() throws IOException {
44 | 		checkProfileFile("bn", 3, 2846);
45 | 	}
46 | 
47 | 	/** Checks the "fr" profile. */
48 | 	@Test
49 | 	public void readFrFile() throws IOException {
50 | 		checkProfileFile("fr", 3, 2232);
51 | 	}
52 | 
53 | 	/** Checks the "nl" profile. */
54 | 	@Test
55 | 	public void readNlFile() throws IOException {
56 | 		checkProfileFile("nl", 3, 2163);
57 | 	}
58 | 
59 | 
60 | 	/**
61 | 	 * Reads the profile file named after the language and verifies what was read.
62 | 	 *
63 | 	 * @param language  file name inside the profile directory, also the expected profile name
64 | 	 * @param nWordSize expected length of the profile's n-word array
65 | 	 * @param freqSize  expected number of entries in the profile's frequency map
66 | 	 */
67 | 	private static void checkProfileFile(String language, int nWordSize, int freqSize) throws IOException {
68 | 		LangProfileReader reader = new LangProfileReader();
69 | 		final LangProfile loaded = reader.read(new File(PROFILE_DIR, language));
70 | 		assertThat(loaded, is(notNullValue()));
71 | 		assertThat(loaded.getName(), is(equalTo(language)));
72 | 		assertThat(loaded.getNWords(), is(notNullValue()));
73 | 		assertThat(loaded.getNWords().length, is(equalTo(nWordSize)));
74 | 		assertThat(loaded.getFreq(), is(notNullValue()));
75 | 		assertThat(loaded.getFreq().size(), is(equalTo(freqSize)));
76 | 	}
77 | 
78 | }
64 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/util/LanguageLister.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nicole Torres
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.profiles.util;
18 | 
19 | import java.io.BufferedReader;
20 | import java.io.IOException;
21 | import java.io.InputStream;
22 | import java.io.InputStreamReader;
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | 
26 | /**
27 |  * This is just a utility to update the code with the existing languages.
28 |  *
29 |  * @author Nicole Torres
30 |  */
31 | class LanguageLister {
32 | 
33 |     /**
34 |      * Prints one {@code names.add("...");} line per entry of the "languages" classpath folder,
35 |      * a separator line, and then one {@code texts.add("...");} line per entry of the
36 |      * "languages.shorttext" classpath folder.
37 |      *
38 |      * @param args not used
39 |      * @throws IOException if a folder is not found on the classpath or cannot be read
40 |      */
41 |     public static void main(String[] args) throws IOException {
42 |         List<String> languages = readFilesFromClassPathFolder("languages/.");
43 |         for (String lang : languages) {
44 |             System.out.println("names.add(\""+lang+"\");");
45 |         }
46 |         System.out.println("--------------------------------");
47 |         List<String> shortText = readFilesFromClassPathFolder("languages.shorttext/.");
48 |         for (String text : shortText) {
49 |             System.out.println("texts.add(\""+text+"\");");
50 |         }
51 |     }
52 | 
53 |     /**
54 |      * Opens the given classpath resource as a stream and returns its lines.
55 |      *
56 |      * @param resourceNameFolder resource name handed to the class loader
57 |      * @return the lines read from the resource, in order
58 |      * @throws IOException if the resource does not exist or reading fails
59 |      */
60 |     private static List<String> readFilesFromClassPathFolder(String resourceNameFolder) throws IOException {
61 |         ClassLoader loader = LanguageLister.class.getClassLoader();
62 |         InputStream in = loader.getResourceAsStream(resourceNameFolder);
63 |         if (in == null) {
64 |             //getResourceAsStream signals "not found" with null; fail with a message naming the resource
65 |             throw new IOException("Classpath resource not found: " + resourceNameFolder);
66 |         }
67 |         List<String> files = new ArrayList<>();
68 |         //closing the reader also closes the wrapped stream
69 |         try (BufferedReader rdr = new BufferedReader(new InputStreamReader(in))) {
70 |             String line;
71 |             while ((line = rdr.readLine()) != null) {
72 |                 files.add(line);
73 |             }
74 |         }
75 |         return files;
76 |     }
77 | 
78 | }
60 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/cybozu/DetectedLanguageTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.cybozu;
18 | 
19 | import com.optimaize.langdetect.DetectedLanguage;
20 | import com.optimaize.langdetect.i18n.LdLocale;
21 | import org.junit.Test;
22 | 
23 | import java.util.ArrayList;
24 | import java.util.Collections;
25 | import java.util.List;
26 | 
27 | import static org.junit.Assert.assertEquals;
28 | 
29 | /**
30 |  * @author Nakatani Shuyo
31 |  * @author Fabian Kessler
32 |  */
33 | public class DetectedLanguageTest {
34 | 
35 |     /** Checks the language, the probability and the toString() form of a freshly built instance. */
36 |     @Test
37 |     public final void basic() {
38 |         DetectedLanguage lang = new DetectedLanguage(LdLocale.fromString("en"), 1.0);
39 |         //JUnit's argument order is (expected, actual); respecting it keeps failure messages correct.
40 |         assertEquals("en", lang.getLocale().getLanguage());
41 |         assertEquals(1.0, lang.getProbability(), 0.0001);
42 |         assertEquals("DetectedLanguage[en:1.0]", lang.toString());
43 |     }
44 | 
45 |     /** A probability of 1.1 must be refused with an IllegalArgumentException. */
46 |     @Test(expected = IllegalArgumentException.class)
47 |     public final void invalidProbability() {
48 |         new DetectedLanguage(LdLocale.fromString("en"), 1.1);
49 |     }
50 | 
51 |     /** Checks the order produced by sorting: equal probabilities alphabetically, lower probability last. */
52 |     @Test
53 |     public final void comparable() {
54 |         List<DetectedLanguage> list = new ArrayList<>();
55 |         list.add(new DetectedLanguage(LdLocale.fromString("en"), 1.0));
56 |         list.add(new DetectedLanguage(LdLocale.fromString("de"), 1.0));
57 |         list.add(new DetectedLanguage(LdLocale.fromString("fr"), 0.9));
58 |         Collections.sort(list);
59 |         assertEquals("de", list.get(0).getLocale().getLanguage()); //alphabetical de before en
60 |         assertEquals("en", list.get(1).getLocale().getLanguage());
61 |         assertEquals("fr", list.get(2).getLocale().getLanguage()); //points 0.9 the last
62 |     }
63 | 
64 | }
61 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/frma/LangProfileWriterTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.LangProfile;
20 | import org.junit.Test;
21 | 
22 | import java.io.*;
23 | 
24 | import static org.hamcrest.Matchers.*;
25 | import static org.junit.Assert.assertThat;
26 | 
27 | /**
28 |  * Writes profiles read from src/main/resources/languages to a temporary file and checks
29 |  * that reading the copy back yields the same content.
30 |  */
31 | public class LangProfileWriterTest {
32 | 	private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages");
33 | 
34 | 	/** Round-trips the "en" profile. */
35 | 	@Test
36 | 	public void writeEnProfile() throws IOException {
37 | 		checkProfileCopy("en");
38 | 	}
39 | 
40 | 	/** Round-trips the "fr" profile. */
41 | 	@Test
42 | 	public void writeFrProfile() throws IOException {
43 | 		checkProfileCopy("fr");
44 | 	}
45 | 
46 | 	/** Round-trips the "nl" profile. */
47 | 	@Test
48 | 	public void writeNlProfile() throws IOException {
49 | 		checkProfileCopy("nl");
50 | 	}
51 | 
52 | 	/**
53 | 	 * Reads the profile of the given language, writes it to a temporary file, reads that file
54 | 	 * back and compares frequency map, n-word array and name with the original. The temporary
55 | 	 * file is deleted afterwards, whether the checks pass or not.
56 | 	 *
57 | 	 * @param language file name of the profile inside the profile directory
58 | 	 */
59 | 	protected void checkProfileCopy(String language) throws IOException {
60 | 		File originalFile = new File(PROFILE_DIR, language);
61 | 		final LangProfile originalProfile = new LangProfileReader().read(originalFile);
62 | 		File newFile = File.createTempFile("profile-copy-", null);
63 | 		try {
64 | 			// Close the output stream before reading the file back, so that the copy is
65 | 			// complete on disk and no handle is still open on it while it is being read.
66 | 			try (FileOutputStream output = new FileOutputStream(newFile)) {
67 | 				new LangProfileWriter().write(originalProfile, output);
68 | 			}
69 | 			LangProfile newProfile = new LangProfileReader().read(newFile);
70 | 			assertThat(newProfile.getFreq().size(), is(equalTo(originalProfile.getFreq().size())));
71 | 			assertThat(newProfile.getFreq(), is(equalTo(originalProfile.getFreq())));
72 | 			assertThat(newProfile.getNWords(), is(equalTo(originalProfile.getNWords())));
73 | 			assertThat(newProfile.getName(), is(equalTo(originalProfile.getName())));
74 | 		} finally {
75 | 			//noinspection ResultOfMethodCallIgnored
76 | 			newFile.delete();
77 | 		}
78 | 	}
79 | 
80 | }
63 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/ngram/BackwardsCompatibleNgramFilter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | /**
20 |  * Filters those that were not generated by the old n-gram generator.
21 |  *
22 |  * Impl is immutable.
23 |  *
24 |  * @author Fabian Kessler
25 |  */
26 | public class BackwardsCompatibleNgramFilter implements NgramFilter {
27 | 
28 |     private static final BackwardsCompatibleNgramFilter INSTANCE = new BackwardsCompatibleNgramFilter();
29 | 
30 |     /** Returns the single shared instance. */
31 |     public static NgramFilter getInstance() {
32 |         return INSTANCE;
33 |     }
34 | 
35 |     private BackwardsCompatibleNgramFilter() {
36 |     }
37 | 
38 | 
39 |     /**
40 |      * Decides whether an n-gram of length 1 to 3 is used.
41 |      *
42 |      * <p>A single space is rejected. A 2- or 3-gram is rejected when all of its characters are
43 |      * upper case, and a 3-gram also when its middle character is a space.</p>
44 |      *
45 |      * @throws UnsupportedOperationException for any other n-gram length
46 |      */
47 |     @Override
48 |     public boolean use(String ngram) {
49 |         final int length = ngram.length();
50 |         if (length < 1 || length > 3) {
51 |             throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length());
52 |         }
53 |         if (length == 1) {
54 |             return ngram.charAt(0) != ' ';
55 |         }
56 |         if (isAllUpperCase(ngram)) {
57 |             return false;
58 |         }
59 |         //only a 3-gram has a middle char, and that must not be a space
60 |         return length == 2 || ngram.charAt(1) != ' ';
61 |     }
62 | 
63 |     /** Tells whether every character of the given n-gram is upper case. */
64 |     private static boolean isAllUpperCase(String ngram) {
65 |         for (int i = 0; i < ngram.length(); i++) {
66 |             if (!Character.isUpperCase(ngram.charAt(i))) {
67 |                 return false;
68 |             }
69 |         }
70 |         return true;
71 |     }
72 | 
73 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/util/TagExtractor.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.cybozu.util;
18 | 
19 | import com.optimaize.langdetect.text.CommonTextObjectFactories;
20 | import com.optimaize.langdetect.text.TextObjectFactory;
21 | 
22 | /**
23 |  * {@link TagExtractor} is a class which extracts inner texts of specified tag.
24 |  * Users don't use this class directly.
25 |  * @author Nakatani Shuyo
26 |  */
27 | public class TagExtractor {
28 | 
29 |     private static final TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexing();
30 | 
31 |     /* package scope */ String target_;
32 |     /* package scope */ int threshold_;
33 |     /* package scope */ StringBuilder buf_;
34 |     /* package scope */ String tag_;
35 |     private int count_;
36 | 
37 |     public TagExtractor(String tag, int threshold) {
38 |         target_ = tag;
39 |         threshold_ = threshold;
40 |         count_ = 0;
41 |         clear();
42 |     }
43 |     public int count() {
44 |         return count_;
45 |     }
46 |     public void clear() {
47 |         buf_ = new StringBuilder(" ");
48 |         tag_ = null;
49 |     }
50 |     public void setTag(String tag){
51 |         tag_ = tag;
52 |     }
53 |     public void add(String line) {
54 |         if (tag_ != null && tag_.equals(target_) && line != null) {
55 |             buf_.append(line);
56 |         }
57 |     }
58 |     public void closeTag(LangProfile profile) {
59 |         if ((profile != null) && tag_.equals(target_) && (buf_.length() > threshold_) && !isSpace()) {
60 |             Util.addCharSequence(profile, textObjectFactory.forText(buf_));
61 |             ++count_;
62 |         }
63 |         clear();
64 |     }
65 | 
66 |     private boolean isSpace() {
67 |         return (buf_.length()==1 && buf_.toString().equals(" "));
68 |     }
69 | 
70 | }
71 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/frma/LangProfileWriter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.LangProfile;
20 | 
21 | import java.io.*;
22 | import java.nio.charset.Charset;
23 | import java.util.Map;
24 | 
25 | /**
26 |  * Writes a {@link LangProfile} to an output stream (file).
27 |  *
28 |  * @author François ROLAND
29 |  * @author Fabian Kessler
30 |  */
31 | public class LangProfileWriter {
32 | 
33 |     /**
34 |      * Writes a {@link LangProfile} to an OutputStream in UTF-8.
35 |      *
36 |      * @throws IOException
37 |      */
38 | 	public void write(LangProfile langProfile, OutputStream outputStream) throws IOException {
39 | 		try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName("utf-8")))) {
40 |             writer.write("{\"freq\":{");
41 |             boolean first = true;
42 |             for (Map.Entry<String, Integer> entry : langProfile.getFreq().entrySet()) {
43 |                 if (!first) {
44 |                     writer.write(',');
45 |                 }
46 |                 writer.write('"');
47 |                 writer.write(entry.getKey());
48 |                 writer.write("\":");
49 |                 writer.write(entry.getValue().toString());
50 |                 first = false;
51 |             }
52 |             writer.write("},\"n_words\":[");
53 |             first = true;
54 |             for (int nWord : langProfile.getNWords()) {
55 |                 if (!first) {
56 |                     writer.write(',');
57 |                 }
58 |                 writer.write(Integer.toString(nWord));
59 |                 first = false;
60 |             }
61 |             writer.write("],\"name\":\"");
62 |             writer.write(langProfile.getName());
63 |             writer.write("\"}");
64 |             writer.flush();
65 |         }
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileWriterTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.profiles;
18 | 
19 | import org.junit.Test;
20 | 
21 | import java.io.File;
22 | import java.io.FileOutputStream;
23 | import java.io.IOException;
24 | 
25 | import static org.junit.Assert.assertEquals;
26 | 
27 | /**
28 |  * @author François ROLAND
29 |  * @author Fabian Kessler
30 |  */
31 | public class LanguageProfileWriterTest {
32 | 
33 |     private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages");
34 | 
35 |     @Test
36 |     public void writeEnProfile() throws IOException {
37 |         checkProfileCopy("en");
38 |     }
39 | 
40 |     @Test
41 |     public void writeFrProfile() throws IOException {
42 |         checkProfileCopy("fr");
43 |     }
44 | 
45 |     @Test
46 |     public void writeNlProfile() throws IOException {
47 |         checkProfileCopy("nl");
48 |     }
49 | 
50 |     protected void checkProfileCopy(String language) throws IOException {
51 |         File originalFile = new File(PROFILE_DIR, language);
52 |         final LanguageProfile originalProfile = new LanguageProfileReader().read(originalFile);
53 |         File newFile = File.createTempFile("profile-copy-", null);
54 |         try (FileOutputStream output = new FileOutputStream(newFile)) {
55 |             new LanguageProfileWriter().write(originalProfile, output);
56 |             LanguageProfile newProfile = new LanguageProfileReader().read(newFile);
57 |             assertEquals(newProfile.getLocale(), originalProfile.getLocale());
58 |             assertEquals(newProfile.getNumGrams(), originalProfile.getNumGrams());
59 |             assertEquals(newProfile.getGramLengths(), originalProfile.getGramLengths());
60 |             assertEquals(newProfile, originalProfile);
61 |         } finally {
62 |             //noinspection ResultOfMethodCallIgnored
63 |             newFile.delete();
64 |         }
65 |     }
66 | 
67 | }
68 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/DetectedLanguage.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect;
18 | 
19 | import com.optimaize.langdetect.i18n.LdLocale;
20 | import org.jetbrains.annotations.NotNull;
21 | 
22 | /**
23 |  * Holds information about a detected language: the locale (language) and the probability.
24 |  *
25 |  * <p>Comparable: the "better" one comes before the worse.
26 |  * First order by probability descending (1 to 0).
27 |  * Then order by language ascending (a to z).</p>
28 |  *
29 |  * <p>This class is immutable.</p>
30 |  *
31 |  * @author Nakatani Shuyo
32 |  * @author Fabian Kessler
33 |  */
34 | public class DetectedLanguage implements Comparable<DetectedLanguage> {
35 | 
36 |     @NotNull
37 |     private final LdLocale locale;
38 |     private final double probability;
39 | 
40 |     /**
41 |      * @param locale
42 |      * @param probability 0-1
43 |      */
44 |     public DetectedLanguage(@NotNull LdLocale locale, double probability) {
45 |         if (probability<0d) throw new IllegalArgumentException("Probability must be >= 0 but was "+probability);
46 |         if (probability>1d) throw new IllegalArgumentException("Probability must be <= 1 but was "+probability);
47 |         this.locale = locale;
48 |         this.probability = probability;
49 |     }
50 | 
51 |     @NotNull
52 |     public LdLocale getLocale() {
53 |         return locale;
54 |     }
55 | 
56 |     /**
57 |      * @return 0-1, the higher the better.
58 |      */
59 |     public double getProbability() {
60 |         return probability;
61 |     }
62 | 
63 |     public String toString() {
64 |         return "DetectedLanguage["+ locale + ":" + probability+"]";
65 |     }
66 | 
67 |     /**
68 |      * See class header.
69 |      */
70 |     @Override
71 |     public int compareTo(DetectedLanguage o) {
72 |         int compare = Double.compare(o.probability, this.probability);
73 |         if (compare!=0) return compare;
74 |         return this.locale.toString().compareTo(o.locale.toString());
75 |     }
76 | }
77 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilterTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.text;
18 | 
19 | import org.junit.Test;
20 | import static org.junit.Assert.*;
21 | 
22 | /**
23 |  * @author Fabian Kessler
24 |  */
25 | public class RemoveMinorityScriptsTextFilterTest {
26 | 
27 |     @Test
28 |     public void testWithCyrillicAndHani() throws Exception {
29 |         RemoveMinorityScriptsTextFilter filter = RemoveMinorityScriptsTextFilter.forThreshold(0.35);
30 |         String result = filter.filter("Hu Jintao (in Chinese 胡錦濤) and Leo Tolstoy (in Russian Лев Николаевич Толстой) are two well known people.");
31 |         assertEquals("Hu Jintao (in Chinese ) and Leo Tolstoy (in Russian   ) are two well known people.", result);
32 |     }
33 | 
34 |     @Test
35 |     public void testWithChineseAndSomeEnglish() throws Exception {
36 |         String input = "设为首页收藏本站 开启辅助访问 为首页收藏本站 开启辅助访为首页收藏本站 开启辅助访切换到窄版 请 登录 后使用快捷导航 没有帐号 注册 用户名 Email 自动登录  找回密码 密码 登录  注册 快捷导航 论坛BBS 导读Guide 排行榜Ranklist 淘帖Collection 日志Blog 相册Album 分享Share 搜索 搜索 帖子 用户 公告";
37 | 
38 |         //expect no change, the ratio 0.35 is too low
39 |         RemoveMinorityScriptsTextFilter filter = RemoveMinorityScriptsTextFilter.forThreshold(0.42);
40 |         assertEquals(filter.filter(input), input);
41 | 
42 |         //expect the English to be removed
43 |         filter = RemoveMinorityScriptsTextFilter.forThreshold(0.43);
44 |         String result = filter.filter(input);
45 |         assertEquals("设为首页收藏本站 开启辅助访问 为首页收藏本站 开启辅助访为首页收藏本站 开启辅助访切换到窄版 请 登录 后使用快捷导航 没有帐号 注册 用户名  自动登录  找回密码 密码 登录  注册 快捷导航 论坛 导读 排行榜 淘帖 日志 相册 分享 搜索 搜索 帖子 用户 公告", result);
46 |     }
47 | 
48 |     /**
49 |      * Seems obvious, but better test: plain latin text may not be modified.
50 |      */
51 |     @Test
52 |     public void testJustLatin() throws Exception {
53 |         RemoveMinorityScriptsTextFilter filter = RemoveMinorityScriptsTextFilter.forThreshold(0.01);
54 |         String text = "Hu Jintao is a well known person.";
55 |         String result = filter.filter(text);
56 |         assertEquals(text, result);
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/frma/GenProfileTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.LangProfile;
20 | import org.junit.Test;
21 | 
22 | import java.io.*;
23 | import java.nio.charset.Charset;
24 | import java.util.Map;
25 | 
26 | import static org.hamcrest.Matchers.*;
27 | import static org.junit.Assert.assertThat;
28 | 
29 | public class GenProfileTest extends GenProfile {
30 | 
31 | 	@Test
32 | 	public void generateProfile() throws IOException {
33 | 		File inputFile = File.createTempFile("profileInput", ".txt");
34 | 		try {
35 | 			try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(inputFile), Charset.forName("utf-8")))) {
36 | 				writer.println("Salut tout le monde.");
37 | 				writer.println("Bonjour toi tout seul.");
38 | 				writer.println("Ca va ?");
39 | 				writer.println("Oui ça va. Et toi ?");
40 | 			}
41 | 			
42 | 			LangProfile trucProfile = generate("truc", inputFile);
43 | 			Map<String, Integer> freqs = trucProfile.getFreq();
44 | 			assertThat(freqs, is(notNullValue()));
45 | 			assertThat(freqs.get("t"), is(equalTo(8)));
46 | 			assertThat(freqs.get("to"), is(equalTo(4)));
47 | 			assertThat(freqs.get("out"), is(equalTo(2)));
48 | 			assertThat(freqs.get("o"), is(equalTo(7)));
49 | 			assertThat(freqs.get("ou"), is(equalTo(3)));
50 | 			assertThat(freqs.get("toi"), is(equalTo(2)));
51 | 			assertThat(freqs.get("u"), is(equalTo(6)));
52 | 			assertThat(freqs.get("ut"), is(equalTo(3)));
53 | 			assertThat(freqs.get("tou"), is(equalTo(2)));
54 | 			assertThat(freqs.get("a"), is(equalTo(5)));
55 | 			assertThat(freqs.get("oi"), is(equalTo(2)));
56 | 			assertThat(freqs.get("alu"), is(equalTo(1)));
57 | 			assertThat(freqs.get("on"), is(equalTo(2)));
58 | 			assertThat(freqs.get("Bon"), is(equalTo(1)));
59 | 			assertThat(freqs.get("e"), is(equalTo(3)));
60 | 			assertThat(freqs.get("va"), is(equalTo(2)));
61 | 			assertThat(freqs.get("i"), is(equalTo(3)));
62 | 			assertThat(freqs.get("jou"), is(equalTo(1)));
63 | 		} finally {
64 |             //noinspection ResultOfMethodCallIgnored
65 |             inputFile.delete();
66 | 		}
67 | 	}
68 | 
69 | }
70 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileBuilderTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.profiles;
18 | 
19 | import com.optimaize.langdetect.ngram.NgramExtractors;
20 | import com.optimaize.langdetect.text.*;
21 | import org.junit.Test;
22 | import static org.junit.Assert.*;
23 | 
24 | /**
25 |  * @author Fabian Kessler
26 |  */
27 | public class LanguageProfileBuilderTest {
28 | 
29 |     @Test
30 |     public void german() throws Exception {
31 |         TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexing();
32 | 
33 |         TextObject inputText = textObjectFactory.create()
34 |                 .append("deutsche Text")
35 |                 .append(" ")
36 |                 .append("http://www.github.com/");
37 | 
38 |         LanguageProfile languageProfile = new LanguageProfileBuilder("de")
39 |                 .ngramExtractor(NgramExtractors.standard())
40 |                 .addText(inputText)
41 |                 .build();
42 | 
43 |         assertEquals(1, languageProfile.getFrequency("sch"));
44 |         assertEquals(0, languageProfile.getFrequency("www"));
45 |     }
46 | 
47 |     @Test
48 |     public void profile_equals() throws Exception {
49 |         LanguageProfile languageProfile1 = new LanguageProfileBuilder("de")
50 |                 .addGram("foo", 1)
51 |                 .build();
52 | 
53 |         LanguageProfile languageProfile2 = new LanguageProfileBuilder("de")
54 |                 .addGram("foo", 1)
55 |                 .build();
56 | 
57 |         LanguageProfile languageProfile3 = new LanguageProfileBuilder("de")
58 |                 .addGram("bar", 1)
59 |                 .build();
60 | 
61 |         assertEquals(languageProfile1, languageProfile2);
62 |         assertNotEquals(languageProfile1, languageProfile3);
63 |     }
64 | 
65 |     @Test
66 |     public void profile_toString() throws Exception {
67 |         LanguageProfile languageProfile = new LanguageProfileBuilder("de")
68 |                 .addGram("foo", 1)
69 |                 .build();
70 |         assertTrue(languageProfile.toString().contains("de"));
71 |         assertTrue(languageProfile.toString().contains("1"));
72 |     }
73 | 
74 | }
75 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/frma/GenProfile.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.LangProfile;
20 | import com.optimaize.langdetect.cybozu.util.Util;
21 | import com.optimaize.langdetect.text.CommonTextObjectFactories;
22 | import com.optimaize.langdetect.text.TextObject;
23 | import com.optimaize.langdetect.text.TextObjectFactory;
24 | 
25 | import java.io.*;
26 | import java.nio.charset.Charset;
27 | import java.util.zip.GZIPInputStream;
28 | 
29 | /**
30 |  * Generate a language profile from any given text file.
31 |  *
32 |  * TODO this is copy/paste from the other class with the same name. Check if code can be re-used. Rename to something meaningful.
33 |  * 
34 |  * @author François ROLAND
35 |  */
36 | public class GenProfile {
37 | 
38 |     private static final TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexing();
39 | 
40 | 
41 |     /**
42 |      * Loads a text file and generate a language profile from its content. The input text file is supposed to be encoded in UTF-8.
43 |      * @param lang target language name.
44 |      * @param textFile input text file.
45 |      * @return Language profile instance
46 |      */
47 |     public static LangProfile generate(String lang, File textFile) {
48 |         LangProfile profile = new LangProfile(lang);
49 | 
50 |         InputStream is = null;
51 |         try {
52 |             is = new BufferedInputStream(new FileInputStream(textFile));
53 |             if (textFile.getName().endsWith(".gz")) is = new GZIPInputStream(is);
54 | 
55 |             BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
56 |             String line;
57 |             while ((line = reader.readLine()) != null) {
58 |                 TextObject textObject = textObjectFactory.forText(" "+line+" ");
59 |                 Util.addCharSequence(profile, textObject);
60 |             }
61 |         } catch (IOException e) {
62 |             throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", e);
63 |         } finally {
64 |             IOUtils.closeQuietly(is);
65 |         }
66 |         return profile;
67 |     }
68 | }
69 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/util/NGram.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.cybozu.util;
18 | 
19 | import org.jetbrains.annotations.Nullable;
20 | 
21 | /**
22 |  * TODO document.
23 |  *
24 |  * Users don't use this class directly.
25 |  *
26 |  * TODO this class treats a word as "upper case" if the first 2 characters are upper case. That seems like a simplification,
27 |  * would need documentation.
28 |  *
29 |  * @author Nakatani Shuyo
30 |  */
31 | public class NGram {
32 | 
33 |     /**
34 |      * ngrams are created from 1gram to this amount, currently 2grams and 3grams.
35 |      */
36 |     public static final int N_GRAM = 3;
37 | 
38 |     private StringBuilder grams_;
39 |     private boolean capitalword_;
40 | 
41 |     public NGram() {
42 |         grams_ = new StringBuilder(" ");
43 |         capitalword_ = false;
44 |     }
45 | 
46 |     public void addChar(char ch) {
47 |         ch = CharNormalizer.normalize(ch);
48 |         char lastChar = grams_.charAt(grams_.length() - 1);
49 |         if (lastChar == ' ') {
50 |             grams_ = new StringBuilder(" ");
51 |             capitalword_ = false;
52 |             if (ch==' ') return;
53 |         } else if (grams_.length() >= N_GRAM) {
54 |             grams_.deleteCharAt(0);
55 |         }
56 |         grams_.append(ch);
57 | 
58 |         if (Character.isUpperCase(ch)){
59 |             if (Character.isUpperCase(lastChar)) capitalword_ = true;
60 |         } else {
61 |             capitalword_ = false;
62 |         }
63 |     }
64 | 
65 |     /**
66 |      * TODO this method has some weird, undocumented behavior to ignore ngrams with upper case.
67 |      *
68 |      * Get n-Gram
69 |      * @param n length of n-gram
70 |      * @return n-Gram String (null if it is invalid)
71 |      */
72 |     @Nullable
73 |     public String get(int n) {
74 |         if (capitalword_) return null;
75 |         int len = grams_.length(); 
76 |         if (n < 1 || n > N_GRAM || len < n) return null;
77 |         if (n == 1) {
78 |             char ch = grams_.charAt(len - 1);
79 |             if (ch == ' ') return null;
80 |             return Character.toString(ch);
81 |         } else {
82 |             return grams_.substring(len - n, len);
83 |         }
84 |     }
85 | 
86 | }
87 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/ngram/OldNgramExtractor.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.ngram;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.NGram;
20 | import org.jetbrains.annotations.NotNull;
21 | import org.jetbrains.annotations.Nullable;
22 | 
23 | import java.util.ArrayList;
24 | import java.util.List;
25 | 
26 | /**
27 |  * @author Nakatani Shuyo
28 |  */
29 | @Deprecated
30 | public class OldNgramExtractor {
31 | 
32 | 
33 |     public interface Filter {
34 |         /**
35 |          * Allows to skip some n-grams.
36 |          *
37 |          * This is currently used to filter n-grams in to-analyze text when the n-gram is unknown to the loaded
38 |          * language profiles.
39 |          *
40 |          * @return true to use this n-gram, false to skip it.
41 |          */
42 |         boolean use(String gram);
43 |     }
44 | 
45 |     /**
46 |      * This was the method found in the <i>com.cybozu.labs.langdetect.Detector</i> class, it was used to extract
47 |      * grams from the to-analyze text.
48 |      *
49 |      * NOTE: although it adds the first ngram with space, it does not add the last n-gram with space. example: "foo" gives " fo" but not "oo "!.
50 |      * It is not clear yet whether this is desired (and why) or a bug.
51 |      *
52 |      * TODO replace this algorithm with a simpler, faster one that uses less memory: only by position shifting. also, the returned list size
53 |      * can be computed before making it (based on text length and number of n-grams).
54 |      *
55 |      */
56 |     @NotNull
57 |     @Deprecated
58 |     public static List<String> extractNGrams(@NotNull CharSequence text, @Nullable Filter filter) {
59 |         List<String> list = new ArrayList<>();
60 |         NGram ngram = new NGram();
61 |         for(int i=0;i<text.length();++i) {
62 |             ngram.addChar(text.charAt(i));
63 |             for(int n=1;n<=NGram.N_GRAM;++n){
64 |                 String w = ngram.get(n);
65 |                 if (w!=null) { //TODO this null check is ugly
66 |                     if (filter==null || filter.use(w)) {
67 |                         list.add(w);
68 |                     }
69 |                 }
70 |             }
71 |         }
72 |         return list;
73 |     }
74 | 
75 | 
76 | }
77 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/LanguageDetector.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect;
18 | 
19 | import com.google.common.base.Optional;
20 | import com.optimaize.langdetect.i18n.LdLocale;
21 | 
22 | import java.util.List;
23 | 
24 | /**
25 |  * Guesses the language of an input string or text.
26 |  *
27 |  * <p>See website for details.</p>
28 |  *
29 |  * <p>This detector cannot handle well:
30 |  * Short input text, can work or give wrong results.
31 |  * Text written in multiple languages. It likely returns the language for the most prominent text. It's not made for that.
32 |  * Text written in languages for which the detector has no profile loaded. It may just return other similar languages.
33 |  * </p>
34 |  *
35 |  * @author Fabian Kessler
36 |  */
37 | public interface LanguageDetector {
38 | 
39 |     /**
40 |      * Returns the best detected language if the algorithm is very confident.
41 |      *
42 |      * <p>Note: you may want to use {@link #getProbabilities(CharSequence)} instead. This method is very strict,
43 |      * and sometimes returns absent even though the first choice of {@code getProbabilities()} is correct.</p>
44 |      *
45 |      * @param text The text to analyze. You probably want a {@link com.optimaize.langdetect.text.TextObject}.
46 |      * @return The language if confident, absent if unknown or not confident enough.
47 |      */
48 |     Optional<LdLocale> detect(CharSequence text);
49 | 
50 |     /**
51 |      * Returns all languages with at least some likelihood.
52 |      *
53 |      * <p>A configurable cutoff is applied to languages with very low probability.</p>
54 |      *
55 |      * <p>The way the algorithm currently works, it can happen that, for example, this method returns 0.99 for
56 |      * Danish and less than 0.01 for Norwegian even though both have almost the same chance. It would be nice if
57 |      * this could be improved in future versions.</p>
58 |      *
59 |      * @param text The text to analyze. You probably want a {@link com.optimaize.langdetect.text.TextObject}.
60 |      * @return The detected languages, sorted from better to worse. May be empty:
61 |      *         it's empty if the program failed to detect any language, or if the input text did not
62 |      *         contain any usable text (just noise).
63 |      */
64 |     List<DetectedLanguage> getProbabilities(CharSequence text);
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/NgramFrequencyDataTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect;
18 | 
19 | import com.google.common.collect.ImmutableSet;
20 | import com.optimaize.langdetect.i18n.LdLocale;
21 | import com.optimaize.langdetect.profiles.LanguageProfile;
22 | import com.optimaize.langdetect.profiles.LanguageProfileReader;
23 | import org.junit.BeforeClass;
24 | import org.junit.Test;
25 | 
26 | import java.io.IOException;
27 | import java.util.List;
28 | 
29 | import static org.junit.Assert.assertEquals;
30 | import static org.junit.Assert.assertTrue;
31 | 
32 | /**
33 |  * Some rudimentary tests for NgramFrequencyData.
34 |  *
35 |  * @author Fabian Kessler
36 |  */
37 | public class NgramFrequencyDataTest {
38 |     /** Shared fixture: frequency data created from all built-in profiles for gram size 3. */
39 |     private static NgramFrequencyData allThreeGrams;
40 | 
41 |     @BeforeClass
42 |     public static void init() throws IOException {
43 |         allThreeGrams = forAll(3);
44 |     }
45 |     private static NgramFrequencyData forAll(int gramSize) throws IOException {
46 |         List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
47 |         return NgramFrequencyData.create(languageProfiles, ImmutableSet.of(gramSize));
48 |     }
49 | 
50 |     //note: JUnit's assertEquals takes (expected, actual), in that order.
51 |     @Test
52 |     public void size() throws Exception {
53 |         //update the number when adding built-in languages
54 |         assertEquals(71, allThreeGrams.getLanguageList().size());
55 |     }
56 | 
57 |     @Test
58 |     public void constantOrder() throws Exception {
59 |         //getLanguage(pos) must agree with the iteration order of getLanguageList():
60 |         int pos=0;
61 |         for (LdLocale locale : allThreeGrams.getLanguageList()) {
62 |             assertEquals(locale, allThreeGrams.getLanguage(pos));
63 |             pos++;
64 |         }
65 |     }
66 | 
67 |     @Test
68 |     public void expectGram() throws Exception {
69 |         //this must exist in many languages
70 |         double[] probabilities = allThreeGrams.getProbabilities("dam");
71 |         assertTrue("no probabilities for a common 3-gram", probabilities != null); //the assert keyword is off without -ea
72 |         assertTrue(probabilities.length >= 5 && probabilities.length <= allThreeGrams.getLanguageList().size());
73 |     }
74 | 
75 |     @Test
76 |     public void forbidGramOfWrongSize() throws Exception {
77 |         //we said 3-grams, not 2 grams
78 |         assertEquals(null, allThreeGrams.getProbabilities("da"));
79 |     }
80 | 
81 | }
82 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Fabian Kessler
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect;
18 | 
19 | import com.optimaize.langdetect.frma.LangProfileReader;
20 | import com.optimaize.langdetect.cybozu.util.LangProfile;
21 | import com.google.common.collect.ImmutableList;
22 | import com.optimaize.langdetect.ngram.NgramExtractors;
23 | import com.optimaize.langdetect.profiles.LanguageProfile;
24 | import com.optimaize.langdetect.profiles.OldLangProfileConverter;
25 | import com.optimaize.langdetect.text.*;
26 | import org.testng.annotations.DataProvider;
27 | import org.testng.annotations.Test;
28 | 
29 | import java.io.IOException;
30 | import java.util.List;
31 | import static org.testng.Assert.*;
32 | 
33 | 
34 | /**
35 |  * Basic tests for the LanguageDetectorImpl.
36 |  *
37 |  * @author Fabian Kessler
38 |  */
39 | public class LanguageDetectorImplTest {
40 | 
41 |     /** The top-ranked language must be the expected one, with a probability of at least 0.9999. */
42 |     @Test(dataProvider = "confident")
43 |     public void confident(String expectedLanguage, CharSequence text) throws Exception {
44 |         List<DetectedLanguage> ranked = makeNewDetector().getProbabilities(text);
45 |         DetectedLanguage top = ranked.get(0);
46 |         assertEquals(top.getLocale().getLanguage(), expectedLanguage);
47 |         assertTrue(top.getProbability() >= 0.9999d);
48 |     }
49 | 
50 |     /** Rows of {expected language code, text to detect}. */
51 |     @DataProvider
52 |     protected Object[][] confident() {
53 |         return new Object[][] {
54 |                 {"de", "Dies ist eine deutsche Text"},
55 |                 {"de", "deutsche Text"},
56 |                 {"de", CommonTextObjectFactories.forDetectingOnLargeText().create().append("deutsche Text").append(" ").append("http://www.github.com/")},
57 |         };
58 |     }
59 | 
60 |     /** Builds a fresh detector holding the en, fr, nl and de profiles read from the /languages/ resources. */
61 |     private LanguageDetector makeNewDetector() throws IOException {
62 |         LangProfileReader profileReader = new LangProfileReader();
63 |         LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard())
64 |             .shortTextAlgorithm(50)
65 |             .prefixFactor(1.5)
66 |             .suffixFactor(2.0);
67 |         for (String code : ImmutableList.of("en", "fr", "nl", "de")) {
68 |             LangProfile oldProfile = profileReader.read(LanguageDetectorImplTest.class.getResourceAsStream("/languages/" + code));
69 |             detectorBuilder.withProfile(OldLangProfileConverter.convert(oldProfile));
70 |         }
71 | 
72 |         return detectorBuilder.build();
73 |     }
74 | 
75 | }
76 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/frma/LangProfileReader.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.frma;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.LangProfile;
20 | 
21 | import java.io.*;
22 | import java.nio.charset.Charset;
23 | import java.util.regex.Matcher;
24 | import java.util.regex.Pattern;
25 | 
26 | /**
27 |  * Reads {@link LangProfile}s.
28 |  *
29 |  * @author François ROLAND
30 |  * @author Fabian Kessler
31 |  */
32 | public class LangProfileReader {
33 | 
34 |     private static final Pattern FREQ_PATTERN = Pattern.compile("\"freq\" ?: ?\\{(.+?)\\}");
35 |     private static final Pattern N_WORDS_PATTERN = Pattern.compile("\"n_words\" ?: ?\\[(.+?)\\]");
36 |     private static final Pattern NAME_PATTERN = Pattern.compile("\"name\" ?: ?\"(.+?)\"");
37 | 
38 |     /**
39 |      * Reads a {@link LangProfile} from a File in UTF-8.
40 |      */
41 |     public LangProfile read(File profileFile) throws IOException {
42 |         if (!profileFile.exists()) {
43 |             throw new IOException("No such file: "+profileFile);
44 |         } else if (!profileFile.canRead()) {
45 |             throw new IOException("Cannot read file: "+profileFile);
46 |         }
47 |         try (FileInputStream input = new FileInputStream(profileFile)) {
48 |             return read(input);
49 |         }
50 |     }
51 | 
52 |     /**
53 |      * Reads a {@link LangProfile} from an InputStream in UTF-8 and closes the stream; sections not found stay unset.
54 |      */
55 |     public LangProfile read(InputStream inputStream) throws IOException {
56 |         StringBuilder storedProfile = new StringBuilder();
57 |         try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, Charset.forName("utf-8")))) {
58 |             String line;
59 |             while((line = reader.readLine()) != null) {
60 |                 if (storedProfile.length() > 0) {
61 |                     storedProfile.append(' ');
62 |                 }
63 |                 storedProfile.append(line);
64 |             }
65 |         }
66 | 
67 |         LangProfile langProfile = new LangProfile();
68 |         Matcher m = FREQ_PATTERN.matcher(storedProfile);
69 |         if (m.find()) {
70 |             //split at the last ':' (the gram may contain one); trim numbers, whitespace around them is legal JSON.
71 |             for (String entry : m.group(1).split(",")) {
72 |                 int separator = entry.lastIndexOf(':');
73 |                 if (separator < 0) {
74 |                     throw new IOException("Malformed freq entry: " + entry);
75 |                 }
76 |                 String label = entry.substring(0, separator).trim().replace("\"", "");
77 |                 langProfile.getFreq().put(label, Integer.valueOf(entry.substring(separator + 1).trim()));
78 |             }
79 |         }
80 | 
81 |         m = N_WORDS_PATTERN.matcher(storedProfile);
82 |         if (m.find()) {
83 |             String[] nWords = m.group(1).split(",");
84 |             langProfile.setNWords(new int[nWords.length]);
85 |             for (int i = 0; i < nWords.length; i++) {
86 |                 langProfile.getNWords()[i] = Integer.parseInt(nWords[i].trim());
87 |             }
88 |         }
89 | 
90 |         m = NAME_PATTERN.matcher(storedProfile);
91 |         if (m.find()) {
92 |             langProfile.setName(m.group(1));
93 |         }
94 |         return langProfile;
95 |     }
96 | 
97 | }
98 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/TechnicalLanguageDetectorImplTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 François ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect;
18 | 
19 | import com.optimaize.langdetect.i18n.LdLocale;
20 | import com.optimaize.langdetect.ngram.NgramExtractor;
21 | import com.optimaize.langdetect.profiles.LanguageProfileBuilder;
22 | import org.junit.Test;
23 | 
24 | import static org.junit.Assert.assertEquals;
25 | 
26 | /**
27 |  * These are the tests of the old detector from Shuyo, run against the new detector from Fabian.
28 |  *
29 |  * @author Nakatani Shuyo
30 |  * @author Fabian Kessler
31 |  */
32 | public class TechnicalLanguageDetectorImplTest {
33 | 
34 |     private static final String TRAINING_EN = "a a a b b c c d e";
35 |     private static final String TRAINING_FR = "a b b c c c d d d";
36 |     private static final String TRAINING_JA = "\u3042 \u3042 \u3042 \u3044 \u3046 \u3048 \u3048";
37 | 
38 | 
39 |     private LanguageDetector makeDetector() {
40 |         //building exactly like the old detector behaved.
41 |         LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractor.gramLengths(1))
42 |                 .affixFactor(1.0)
43 |                 .shortTextAlgorithm(0);
44 | 
45 |         add(detectorBuilder, new LanguageProfileBuilder(LdLocale.fromString("en")), TRAINING_EN);
46 |         add(detectorBuilder, new LanguageProfileBuilder(LdLocale.fromString("fr")), TRAINING_FR);
47 |         add(detectorBuilder, new LanguageProfileBuilder(LdLocale.fromString("ja")), TRAINING_JA);
48 | 
49 |         return detectorBuilder.build();
50 |     }
51 | 
52 |     /**
53 |      * Feeds every space-separated gram of trainingText to the profile builder
54 |      * and registers the resulting profile with the detector builder.
55 |      */
56 |     private void add(LanguageDetectorBuilder detectorBuilder, LanguageProfileBuilder profileBuilder, String trainingText) {
57 |         for (String w : trainingText.split(" ")) {
58 |             profileBuilder.addGram(w);
59 |         }
60 |         detectorBuilder.withProfile(profileBuilder.build());
61 |     }
62 | 
63 |     //note: JUnit's assertEquals takes (expected, actual), in that order.
64 |     @Test
65 |     public final void testDetector1() {
66 |         LanguageDetector languageDetector = makeDetector();
67 |         assertEquals("en", languageDetector.detect("a").get().getLanguage());
68 |     }
69 | 
70 |     @Test
71 |     public final void testDetector2() {
72 |         LanguageDetector languageDetector = makeDetector();
73 |         assertEquals("fr", languageDetector.detect("b d").get().getLanguage());
74 |     }
75 | 
76 |     @Test
77 |     public final void testDetector3() {
78 |         LanguageDetector languageDetector = makeDetector();
79 |         assertEquals("en", languageDetector.detect("d e").get().getLanguage());
80 |     }
81 | 
82 |     @Test
83 |     public final void testDetector4() {
84 |         LanguageDetector languageDetector = makeDetector();
85 |         assertEquals("ja", languageDetector.detect("\u3042\u3042\u3042\u3042a").get().getLanguage());
86 |     }
87 | }
88 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/GenProfile.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.cybozu;
18 | 
19 | import com.optimaize.langdetect.cybozu.util.TagExtractor;
20 | import com.optimaize.langdetect.cybozu.util.LangProfile;
21 | import org.slf4j.Logger;
22 | import org.slf4j.LoggerFactory;
23 | 
24 | import javax.xml.stream.XMLInputFactory;
25 | import javax.xml.stream.XMLStreamException;
26 | import javax.xml.stream.XMLStreamReader;
27 | import java.io.*;
28 | import java.util.zip.GZIPInputStream;
29 | 
30 | /**
31 |  * Load Wikipedia's abstract XML as corpus and generate its language profile in JSON format.
32 |  * 
33 |  * @author Nakatani Shuyo
34 |  */
35 | public class GenProfile {
36 | 
37 |     private static final Logger logger = LoggerFactory.getLogger(GenProfile.class);
38 | 
39 |     /**
40 |      * Load Wikipedia abstract database file and generate its language profile
41 |      * @param lang target language name
42 |      * @param file target database file path
43 |      * @return Language profile instance
44 |      */
45 |     public static LangProfile load(String lang, File file) {
46 | 
47 |         LangProfile profile = new LangProfile(lang);
48 | 
49 |         try (InputStream is = file.getName().endsWith(".gz") ?
50 |             new GZIPInputStream(new BufferedInputStream(new FileInputStream(file))) :
51 |             new BufferedInputStream(new FileInputStream(file))) {
52 | 
53 |             TagExtractor tagextractor = new TagExtractor("abstract", 100);
54 | 
55 |             XMLStreamReader reader = null;
56 |             try {
57 |                 XMLInputFactory factory = XMLInputFactory.newInstance();
58 |                 reader = factory.createXMLStreamReader(is);
59 |                 while (reader.hasNext()) {
60 |                     switch (reader.next()) {
61 |                     case XMLStreamReader.START_ELEMENT:
62 |                         tagextractor.setTag(reader.getName().toString());
63 |                         break;
64 |                     case XMLStreamReader.CHARACTERS:
65 |                         tagextractor.add(reader.getText());
66 |                         break;
67 |                     case XMLStreamReader.END_ELEMENT:
68 |                         tagextractor.closeTag(profile);
69 |                         break;
70 |                     }
71 |                 }
72 |             } catch (XMLStreamException e) {
73 |                 throw new RuntimeException("Training database file '" + file.getName() + "' is an invalid XML.", e);
74 |             } finally {
75 |                 try {
76 |                     if (reader != null) reader.close();
77 |                 } catch (XMLStreamException e) { /* ignore exception */ }
78 |             }
79 |             logger.info(lang + ":" + tagextractor.count());
80 | 
81 |         } catch (IOException e) {
82 |             throw new RuntimeException("Can't open training database file '" + file.getName() + "'", e);
83 |         }
84 |         return profile;
85 |     }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/cybozu/util/NGramTest.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Nakatani Shuyo
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.cybozu.util;
18 | 
19 | import org.junit.Test;
20 | 
21 | import static org.hamcrest.CoreMatchers.is;
22 | import static org.junit.Assert.assertEquals;
23 | import static org.junit.Assert.assertThat;
24 | 
25 | /**
26 |  * @author Nakatani Shuyo
27 |  */
28 | public class NGramTest {
29 | 
30 |     /**
31 |      * Test method for constants
32 |      */
33 |     @Test
34 |     public final void testConstants() {
35 |         assertThat(NGram.N_GRAM, is(3));
36 |         assertEquals(3, NGram.N_GRAM);
37 |     }
38 | 
39 |     /**
40 |      * Test method for {@link NGram#get(int)} and {@link NGram#addChar(char)}: adds characters one by one
41 |      * and checks the 1-, 2- and 3-gram returned after each step. JUnit order is (expected, actual).
42 |      */
43 |     @Test
44 |     public final void testNGram() {
45 |         NGram ngram = new NGram();
46 |         assertEquals(null, ngram.get(0));
47 |         assertEquals(null, ngram.get(1));
48 |         assertEquals(null, ngram.get(2));
49 |         assertEquals(null, ngram.get(3));
50 |         assertEquals(null, ngram.get(4));
51 |         ngram.addChar(' ');
52 |         assertEquals(null, ngram.get(1));
53 |         assertEquals(null, ngram.get(2));
54 |         assertEquals(null, ngram.get(3));
55 |         ngram.addChar('A');
56 |         assertEquals("A", ngram.get(1));
57 |         assertEquals(" A", ngram.get(2));
58 |         assertEquals(null, ngram.get(3));
59 |         ngram.addChar('\u06cc');
60 |         assertEquals("\u064a", ngram.get(1));
61 |         assertEquals("A\u064a", ngram.get(2));
62 |         assertEquals(" A\u064a", ngram.get(3));
63 |         ngram.addChar('\u1ea0');
64 |         assertEquals("\u1ec3", ngram.get(1));
65 |         assertEquals("\u064a\u1ec3", ngram.get(2));
66 |         assertEquals("A\u064a\u1ec3", ngram.get(3));
67 |         ngram.addChar('\u3044');
68 |         assertEquals("\u3042", ngram.get(1));
69 |         assertEquals("\u1ec3\u3042", ngram.get(2));
70 |         assertEquals("\u064a\u1ec3\u3042", ngram.get(3));
71 | 
72 |         ngram.addChar('\u30a4');
73 |         assertEquals("\u30a2", ngram.get(1));
74 |         assertEquals("\u3042\u30a2", ngram.get(2));
75 |         assertEquals("\u1ec3\u3042\u30a2", ngram.get(3));
76 |         ngram.addChar('\u3106');
77 |         assertEquals("\u3105", ngram.get(1));
78 |         assertEquals("\u30a2\u3105", ngram.get(2));
79 |         assertEquals("\u3042\u30a2\u3105", ngram.get(3));
80 |         ngram.addChar('\uac01');
81 |         assertEquals("\uac00", ngram.get(1));
82 |         assertEquals("\u3105\uac00", ngram.get(2));
83 |         assertEquals("\u30a2\u3105\uac00", ngram.get(3));
84 |         ngram.addChar('\u2010');
85 |         assertEquals(null, ngram.get(1));
86 |         assertEquals("\uac00 ", ngram.get(2));
87 |         assertEquals("\u3105\uac00 ", ngram.get(3));
88 | 
89 |         ngram.addChar('a');
90 |         assertEquals("a", ngram.get(1));
91 |         assertEquals(" a", ngram.get(2));
92 |         assertEquals(null, ngram.get(3));
93 |     }
94 |    
95 | }


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 2011 Francois ROLAND
 3 |  *
 4 |  * Licensed under the Apache License, Version 2.0 (the "License");
 5 |  * you may not use this file except in compliance with the License.
 6 |  * You may obtain a copy of the License at
 7 |  *
 8 |  *     http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 | 
17 | package com.optimaize.langdetect.profiles;
18 | 
19 | import org.jetbrains.annotations.NotNull;
20 | 
21 | import java.io.*;
22 | import java.nio.charset.Charset;
23 | import java.util.Map;
24 | 
25 | /**
26 |  * Writes a {@link LanguageProfile} to an output stream or file.
27 |  *
28 |  * <p>All file operations are done with UTF-8.</p>
29 |  *
30 |  * @author François ROLAND
31 |  * @author Fabian Kessler
32 |  */
33 | public class LanguageProfileWriter {
34 | 
35 |     /**
36 |      * Serializes a {@link LanguageProfile} to an OutputStream in UTF-8, as a JSON object with the keys
37 |      * "freq", "n_words" and "name". The given stream is closed when this method returns.
38 |      *
39 |      * @throws java.io.IOException if writing to the stream fails.
40 |      */
41 |     public void write(@NotNull LanguageProfile languageProfile, @NotNull OutputStream outputStream) throws IOException {
42 |         try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName("utf-8")))) {
43 |             out.write("{\"freq\":{");
44 |             String separator = "";
45 |             for (Map.Entry<String, Integer> gram : languageProfile.iterateGrams()) {
46 |                 out.write(separator);
47 |                 out.write('"');
48 |                 out.write(gram.getKey());
49 |                 out.write("\":");
50 |                 out.write(gram.getValue().toString());
51 |                 separator = ",";
52 |             }
53 |             out.write("},\"n_words\":[");
54 |             separator = "";
55 |             //counts are written for gram lengths 1..10, stopping at the first length without occurrences.
56 |             for (int gramLength = 1; gramLength <= 10; gramLength++) {
57 |                 long occurrences = languageProfile.getNumGramOccurrences(gramLength);
58 |                 if (occurrences == 0) {
59 |                     break;
60 |                 }
61 |                 out.write(separator);
62 |                 out.write(Long.toString(occurrences));
63 |                 separator = ",";
64 |             }
65 |             out.write("],\"name\":\"");
66 |             out.write(languageProfile.getLocale().toString());
67 |             out.write("\"}");
68 |             out.flush();
69 |         }
70 |     }
71 | 
72 |     /**
73 |      * Writes a {@link LanguageProfile} into a folder; the file is named after the profile's locale.
74 |      *
75 |      * @param fullPath Must be an existing writable directory path.
76 |      * @throws java.io.IOException if the path is missing or not writable, or if such a file name exists already.
77 |      */
78 |     public void writeToDirectory(@NotNull LanguageProfile languageProfile, @NotNull File fullPath) throws IOException {
79 |         if (!fullPath.exists()) {
80 |             throw new IOException("Path does not exist: "+fullPath);
81 |         }
82 |         if (!fullPath.canWrite()) {
83 |             throw new IOException("Path not writable: "+fullPath);
84 |         }
85 |         File target = new File(fullPath.getAbsolutePath()+"/"+languageProfile.getLocale());
86 |         if (target.exists()) {
87 |             throw new IOException("File exists already, refusing to overwrite: "+target);
88 |         }
89 |         try (FileOutputStream fileStream = new FileOutputStream(target)) {
90 |             write(languageProfile, fileStream);
91 |         }
92 |     }
93 | 
94 | }
95 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/cybozu/util/LangProfileTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Nakatani Shuyo
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.cybozu.util;
 18 | 
 19 | import static org.junit.Assert.*;
 20 | 
 21 | import org.junit.Test;
 22 | 
 23 | /**
 24 |  * @author Nakatani Shuyo
 25 |  *
 26 |  */
 27 | public class LangProfileTest {
 28 | 
 29 |     /**
 30 |      * Test method for {@link LangProfile#LangProfile()}.
 31 |      */
 32 |     @Test
 33 |     public final void testLangProfile() {
 34 |         LangProfile profile = new LangProfile();
 35 |         assertEquals(profile.getName(), null);
 36 |     }
 37 | 
 38 |     /**
 39 |      * Test method for {@link LangProfile#LangProfile(java.lang.String)}.
 40 |      */
 41 |     @Test
 42 |     public final void testLangProfileStringInt() {
 43 |         LangProfile profile = new LangProfile("en");
 44 |         assertEquals(profile.getName(), "en");
 45 |     }
 46 | 
 47 |     /**
 48 |      * Test method for {@link LangProfile#add(java.lang.String)}.
 49 |      */
 50 |     @Test
 51 |     public final void testAdd() {
 52 |         LangProfile profile = new LangProfile("en");
 53 |         profile.add("a");
 54 |         assertEquals((int)profile.getFreq().get("a"), 1);
 55 |         profile.add("a");
 56 |         assertEquals((int)profile.getFreq().get("a"), 2);
 57 |         profile.omitLessFreq();
 58 |     }
 59 | 
 60 |     
 61 |     @Test(expected = IllegalStateException.class)
 62 |     public final void testAddIllegally1() {
 63 |         LangProfile profile = new LangProfile(); // Illegal ( available for only JSONIC ) but ignore  
 64 |         profile.add("a");
 65 |     }
 66 | 
 67 |     @Test(expected = IllegalArgumentException.class)
 68 |     public final void testAddIllegally2() {
 69 |         LangProfile profile = new LangProfile("en");
 70 |         profile.add("");  // Illegal (string's length of parameter must be between 1 and 3)
 71 |     }
 72 | 
 73 |     @Test(expected = IllegalArgumentException.class)
 74 |     public final void testAddIllegally3() {
 75 |         LangProfile profile = new LangProfile("en");
 76 |         profile.add("abcd");  // Illegal (string's length of parameter must be between 1 and 3)
 77 |     }
 78 | 
 79 |     /**
 80 |      * Test method for {@link LangProfile#omitLessFreq()}.
 81 |      */
 82 |     @Test
 83 |     public final void testOmitLessFreq() {
 84 |         LangProfile profile = new LangProfile("en");
 85 |         String[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".split(" ");
 86 |         for (int i=0;i<5;++i) {
 87 |             for (String g : grams) {
 88 |                 profile.add(g);
 89 |             }
 90 |         }
 91 |         profile.add("\u3050");
 92 | 
 93 |         assertEquals((int)profile.getFreq().get("a"), 5);
 94 |         assertEquals((int)profile.getFreq().get("\u3042"), 5);
 95 |         assertEquals((int)profile.getFreq().get("\u3050"), 1);
 96 |         profile.omitLessFreq();
 97 |         assertEquals(profile.getFreq().get("a"), null); // omitted
 98 |         assertEquals((int)profile.getFreq().get("\u3042"), 5);
 99 |         assertEquals(profile.getFreq().get("\u3050"), null); // omitted
100 |     }
101 | 
102 |     @Test(expected = IllegalStateException.class)
103 |     public final void testOmitLessFreqIllegally() {
104 |         LangProfile profile = new LangProfile();
105 |         profile.omitLessFreq();
106 |     }
107 | 
108 | }
109 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/cybozu/util/TagExtractorTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Nakatani Shuyo
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.cybozu.util;
 18 | 
 19 | import static org.junit.Assert.*;
 20 | 
 21 | import org.junit.Test;
 22 | 
 23 | /**
 24 |  * @author Nakatani Shuyo
 25 |  *
 26 |  */
 27 | public class TagExtractorTest {
 28 | 
 29 |     /**
 30 |      * Test method for {@link TagExtractor#TagExtractor(java.lang.String, int)}: the arguments are stored as given.
 31 |      */
 32 |     @Test
 33 |     public final void testTagExtractor() {
 34 |         TagExtractor extractor = new TagExtractor(null, 0);
 35 |         assertNull(extractor.target_);
 36 |         assertEquals(0, extractor.threshold_);
 37 | 
 38 |         TagExtractor extractor2 = new TagExtractor("abstract", 10);
 39 |         assertEquals("abstract", extractor2.target_);
 40 |         assertEquals(10, extractor2.threshold_);
 41 |     }
 42 | 
 43 |     /**
 44 |      * Test method for {@link TagExtractor#setTag(java.lang.String)}: empty and null tags are taken over as they are.
 45 |      */
 46 |     @Test
 47 |     public final void testSetTag() {
 48 |         TagExtractor extractor = new TagExtractor(null, 0);
 49 |         extractor.setTag("");
 50 |         assertEquals("", extractor.tag_);
 51 |         extractor.setTag(null);
 52 |         assertNull(extractor.tag_);
 53 |     }
 54 | 
 55 |     /**
 56 |      * Test method for {@link TagExtractor#add(java.lang.String)}: passes if no exception is thrown.
 57 |      */
 58 |     @Test
 59 |     public final void testAdd() {
 60 |         TagExtractor extractor = new TagExtractor(null, 0);
 61 |         extractor.add("");
 62 |         extractor.add(null);    // must be ignored without throwing
 63 |     }
 64 | 
 65 |     /**
 66 |      * Test method for {@link TagExtractor#closeTag(LangProfile)}: passes if no exception is thrown.
 67 |      */
 68 |     @Test
 69 |     public final void testCloseTag() {
 70 |         TagExtractor extractor = new TagExtractor(null, 0);
 71 |         LangProfile profile = null;
 72 |         extractor.closeTag(profile);    // a null profile must be ignored without throwing
 73 |     }
 74 | 
 75 | 
 76 |     /**
 77 |      * Scenario Test of extracting &lt;abstract&gt; tag from Wikipedia database.
 78 |      */
 79 |     @Test
 80 |     public final void testNormalScenario() {
 81 |         TagExtractor extractor = new TagExtractor("abstract", 10);
 82 |         assertEquals(0, extractor.count());
 83 | 
 84 |         LangProfile profile = new LangProfile("en");
 85 | 
 86 |         // normal: text of the target tag gets counted and its n-grams end up in the profile
 87 |         extractor.setTag("abstract");
 88 |         extractor.add("This is a sample text.");
 89 |         extractor.closeTag(profile);
 90 |         assertEquals(1, extractor.count());
 91 |         assertEquals(17, profile.getNWords()[0]);  // Thisisasampletext
 92 |         assertEquals(22, profile.getNWords()[1]);  // _T, Th, hi, ...
 93 |         assertEquals(17, profile.getNWords()[2]);  // _Th, Thi, his, ...
 94 | 
 95 |         // too short: the count stays the same
 96 |         extractor.setTag("abstract");
 97 |         extractor.add("sample");
 98 |         extractor.closeTag(profile);
 99 |         assertEquals(1, extractor.count());
100 | 
101 |         // other tags: not counted, even though the text is long enough
102 |         extractor.setTag("div");
103 |         extractor.add("This is a sample text which is enough long.");
104 |         extractor.closeTag(profile);
105 |         assertEquals(1, extractor.count());
106 |     }
107 | 
108 |     /**
109 |      * Test method for {@link TagExtractor#clear()}: afterwards the buffer is empty and the tag is null.
110 |      */
111 |     @Test
112 |     public final void testClear() {
113 |         TagExtractor extractor = new TagExtractor("abstract", 10);
114 |         extractor.setTag("abstract");
115 |         extractor.add("This is a sample text.");
116 |         assertEquals("This is a sample text.", extractor.buf_.toString().trim());
117 |         assertEquals("abstract", extractor.tag_);
118 |         extractor.clear();
119 |         assertEquals("", extractor.buf_.toString().trim());
120 |         assertNull(extractor.tag_);
121 |     }
122 | 
123 | 
124 | }
125 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/LanguageProfile.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.profiles;
 18 | 
 19 | import com.optimaize.langdetect.i18n.LdLocale;
 20 | import org.jetbrains.annotations.NotNull;
 21 | 
 22 | import java.util.List;
 23 | import java.util.Map;
 24 | 
 25 | /**
 26 |  * A language profile knows the locale (language), and contains the n-grams and some statistics.
 27 |  *
 28 |  * <p>It is built from a training text that should be fairly large and clean.</p>
 29 |  *
 30 |  * <p>It contains the n-grams from the training text in the desired gram sizes (eg 2 and 3-grams),
 31 |  * with possible text filters applied for cleaning. Also, rarely occurring n-grams may have been cut to
 32 |  * reduce the noise and index size. Use a {@link LanguageProfileBuilder}.</p>
 33 |  *
 34 |  * <p>The profile may be created at runtime on-the-fly, or it may be loaded from a previously generated
 35 |  * text file (see OldLangProfileConverter).</p>
 36 |  *
 37 |  * @author Fabian Kessler
 38 |  */
 39 | public interface LanguageProfile {
 40 |     /** The locale (language) that this profile stands for. */
 41 |     @NotNull
 42 |     LdLocale getLocale();
 43 | 
 44 |     /**
 45 |      * Tells which n-gram lengths (the n in n-gram) are used in this profile.
 46 |      * Example: [1,2,3]
 47 |      * @return Sorted from smaller to larger.
 48 |      */
 49 |     @NotNull
 50 |     List<Integer> getGramLengths();
 51 | 
 52 |     /**
 53 |      * @param gram for example "a" or "foo".
 54 |      * @return 0-n, also zero if this profile does not use n-grams of that length (for example if no 4-grams are made).
 55 |      */
 56 |     int getFrequency(String gram);
 57 | 
 58 |     /**
 59 |      * Tells how many different n-grams there are for a certain n-gram size.
 60 |      * For example the English language has about 57 different 1-grams, whereas Chinese in Hani has thousands.
 61 |      * @param gramLength 1-n
 62 |      * @return 0-n, returns zero if no such n-grams were made (for example if no 4-grams were made),
 63 |      *              or if the training text did not contain any words that long.
 64 |      */
 65 |     int getNumGrams(int gramLength);
 66 | 
 67 |     /**
 68 |      * Tells how many n-grams there are for all n-gram sizes combined.
 69 |      * @return 0-n (0 only on an empty profile...)
 70 |      */
 71 |     int getNumGrams();
 72 | 
 73 |     /**
 74 |      * Tells how often all n-grams of a certain length occurred, combined.
 75 |      * This returns a much larger number than {@link #getNumGrams}.
 76 |      * @param gramLength 1-n
 77 |      * @return 0-n, returns zero if no such n-grams were made (for example if no 4-grams were made),
 78 |      *              or if the training text did not contain any words that long.
 79 |      */
 80 |     long getNumGramOccurrences(int gramLength);
 81 | 
 82 |     /**
 83 |      * Tells how often the n-gram with the lowest number of occurrences used in this profile occurred.
 84 |      *
 85 |      * Most likely there were n-grams with fewer (unless the returned number is 1), but they were eliminated
 86 |      * in order to keep the profile reasonably small.
 87 |      *
 88 |      * This is the opposite of {@link #getMaxGramCount}.
 89 |      *
 90 |      * @param gramLength 1-n
 91 |      * @return 0-n, returns zero if no such n-grams were made or existed.
 92 |      */
 93 |     long getMinGramCount(int gramLength);
 94 |     /**
 95 |      * Tells how often the n-gram with the highest number of occurrences used in this profile occurred.
 96 |      *
 97 |      * This is the opposite of {@link #getMinGramCount}.
 98 |      *
 99 |      * @param gramLength 1-n
100 |      * @return 0-n, returns zero if no such n-grams were made or existed.
101 |      */
102 |     long getMaxGramCount(int gramLength);
103 | 
104 |     /**
105 |      * Iterates all n-gram strings of this profile, each together with its frequency.
106 |      */
107 |     @NotNull
108 |     Iterable<Map.Entry<String,Integer>> iterateGrams();
109 | 
110 |     /**
111 |      * Iterates the n-gram strings of the given length only, each together with its frequency.
112 |      */
113 |     @NotNull
114 |     Iterable<Map.Entry<String,Integer>> iterateGrams(int gramLength);
115 | 
116 | }
117 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/util/Util.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Nakatani Shuyo
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.cybozu.util;
 18 | 
 19 | import com.optimaize.langdetect.i18n.LdLocale;
 20 | import com.optimaize.langdetect.ngram.NgramExtractor;
 21 | import com.optimaize.langdetect.ngram.NgramExtractors;
 22 | import com.optimaize.langdetect.ngram.OldNgramExtractor;
 23 | import org.jetbrains.annotations.NotNull;
 24 | 
 25 | import java.util.Formatter;
 26 | import java.util.List;
 27 | import java.util.Map;
 28 | 
 29 | /**
 30 |  * A place for sharing code.
 31 |  *
 32 |  * @author Nakatani Shuyo
 33 |  */
 34 | public class Util {
 35 | 
 36 |     private static final NgramExtractor ngramExtractor = NgramExtractors.standard();
 37 | 
 38 |     /**
 39 |      * Extracts the n-grams of the text and adds every one of them to the profile.
 40 |      *
 41 |      * @param langProfile receives the n-grams through {@link LangProfile#add}.
 42 |      * @param text the text to take the n-grams from.
 43 |      */
 44 |     public static void addCharSequence(LangProfile langProfile, CharSequence text) {
 45 |         //TODO replace with new code, that is ngramExtractor.extractGrams(text) instead of the OldNgramExtractor.
 46 |         for (String s : OldNgramExtractor.extractNGrams(text, null)) {
 47 |             langProfile.add(s);
 48 |         }
 49 |     }
 50 | 
 51 | 
 52 |     /**
 53 |      * Unicode encoding (for verbose mode).
 54 |      *
 55 |      * <p>Chars below U+0080 are kept as they are. All other chars are replaced by an escape sequence made of
 56 |      * a backslash, the letter u and 4 lowercase hex digits. This works per char, a surrogate pair thus
 57 |      * results in 2 escape sequences.</p>
 58 |      *
 59 |      * @param s the text to encode, not null.
 60 |      * @return the encoded text.
 61 |      */
 62 |     public static String unicodeEncode(String s) {
 63 |         StringBuilder buf = new StringBuilder();
 64 |         for (int i = 0; i < s.length(); ++i) {
 65 |             char ch = s.charAt(i);
 66 |             if (ch >= '\u0080') {
 67 |                 //0x10000 + ch always has exactly 5 hex digits; cutting off the leading 1 leaves the zero-padded 4.
 68 |                 String st = Integer.toHexString(0x10000 + (int) ch);
 69 |                 buf.append("\\u").append(st, 1, 5);
 70 |             } else {
 71 |                 buf.append(ch);
 72 |             }
 73 |         }
 74 |         return buf.toString();
 75 |     }
 76 | 
 77 | 
 78 |     /**
 79 |      * normalize probabilities and check convergence by the maximum probability
 80 |      *
 81 |      * @param prob is modified in place: every entry is divided by the sum of all entries.
 82 |      *             The sum must not be zero, else the entries become NaN.
 83 |      * @return maximum of probabilities
 84 |      */
 85 |     public static double normalizeProb(double[] prob) {
 86 |         double maxp = 0, sump = 0;
 87 |         for(int i=0;i<prob.length;++i) sump += prob[i];
 88 |         for(int i=0;i<prob.length;++i) {
 89 |             double p = prob[i] / sump;
 90 |             if (maxp < p) maxp = p;
 91 |             prob[i] = p;
 92 |         }
 93 |         return maxp;
 94 |     }
 95 | 
 96 | 
 97 |     /**
 98 |      * Formats every probability of at least 0.00001 as " locale:probability", with 5 fraction digits.
 99 |      *
100 |      * @param prob one probability per language, in the order of langlist.
101 |      * @param langlist the languages that the entries in prob belong to.
102 |      * @return the formatted entries one after the other, empty if no probability is large enough.
103 |      */
104 |     public static String wordProbToString(double[] prob, List<LdLocale> langlist) {
105 |         Formatter formatter = new Formatter();
106 |         for(int j=0;j<prob.length;++j) {
107 |             double p = prob[j];
108 |             if (p>=0.00001) {
109 |                 formatter.format(" %s:%.5f", langlist.get(j), p);
110 |             }
111 |         }
112 |         return formatter.toString();
113 |     }
114 | 
115 | 
116 |     /**
117 |      * Converts the weights per language to an array in the order of langlist, scaled so that it sums up to 1.
118 |      *
119 |      * <p>The requirements on the arguments are only checked with assert statements.</p>
120 |      *
121 |      * @param langWeightingMap not empty. The weights must not be negative, and the weights of the languages
122 |      *                         that are in langlist must sum up to more than zero.
123 |      * @param langlist languages that are not in the langWeightingMap get the value 0.
124 |      * @return one entry per language of langlist, in the same order.
125 |      */
126 |     public static double[] makeInternalPrioMap(@NotNull Map<LdLocale, Double> langWeightingMap,
127 |                                                 @NotNull List<LdLocale> langlist) {
128 |         assert !langWeightingMap.isEmpty();
129 |         double[] priorMap = new double[langlist.size()];
130 |         double sump = 0;
131 |         for (int i=0;i<priorMap.length;++i) {
132 |             LdLocale lang = langlist.get(i);
133 |             if (langWeightingMap.containsKey(lang)) {
134 |                 double p = langWeightingMap.get(lang);
135 |                 assert p>=0 : "Prior probability must be non-negative!";
136 |                 priorMap[i] = p;
137 |                 sump += p;
138 |             }
139 |         }
140 |         assert sump > 0 : "Sum must be greater than zero!";
141 |         for (int i=0;i<priorMap.length;++i) priorMap[i] /= sump;
142 |         return priorMap;
143 |     }
144 | 
145 | }
146 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/i18n/LdLocaleTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.i18n;
 18 | 
 19 | import org.junit.Test;
 20 | 
 21 | import static org.junit.Assert.*;
 22 | 
 23 | public class LdLocaleTest {
 24 | 
 25 |     @Test
 26 |     public void justLanguage() throws Exception {
 27 |         expectJustLanguage("en");
 28 |         expectJustLanguage("gsw");
 29 |     }
 30 |     private void expectJustLanguage(String lang) throws Exception {
 31 |         LdLocale locale = LdLocale.fromString(lang);
 32 |         assertEquals(lang, locale.toString());
 33 |         assertEquals(lang, locale.getLanguage());
 34 |         assertFalse(locale.getScript().isPresent());
 35 |         assertFalse(locale.getRegion().isPresent());
 36 |     }
 37 | 
 38 |     @Test
 39 |     public void languageAndScript() throws Exception {
 40 |         expectLanguageAndScript("en", "Latn");
 41 |         expectLanguageAndScript("gsw", "Latn");
 42 |         expectLanguageAndScript("zh", "Hans");
 43 |     }
 44 |     private void expectLanguageAndScript(String lang, String script) throws Exception {
 45 |         LdLocale locale = LdLocale.fromString(lang+'-'+script);
 46 |         assertEquals(lang+'-'+script, locale.toString());
 47 |         assertEquals(lang, locale.getLanguage());
 48 |         assertEquals(script, locale.getScript().get());
 49 |         assertFalse(locale.getRegion().isPresent());
 50 |     }
 51 | 
 52 |     @Test
 53 |     public void languageAndRegion() throws Exception {
 54 |         expectLanguageAndRegion("en", "UK");
 55 |         expectLanguageAndRegion("zh", "CN");
 56 |     }
 57 |     private void expectLanguageAndRegion(String lang, String region) throws Exception {
 58 |         LdLocale locale = LdLocale.fromString(lang+'-'+region);
 59 |         assertEquals(lang+'-'+region, locale.toString());
 60 |         assertEquals(lang, locale.getLanguage());
 61 |         assertFalse(locale.getScript().isPresent());
 62 |         assertEquals(region, locale.getRegion().get());
 63 |     }
 64 | 
 65 |     @Test
 66 |     public void all() throws Exception {
 67 |         expectAll("en", "Latn", "UK");
 68 |         expectAll("zh", "Hant", "CN");
 69 |     }
 70 |     private void expectAll(String lang, String script, String region) throws Exception {
 71 |         LdLocale locale = LdLocale.fromString(lang+'-'+script+'-'+region);
 72 |         assertEquals(lang+'-'+script+'-'+region, locale.toString());
 73 |         assertEquals(lang, locale.getLanguage());
 74 |         assertEquals(script, locale.getScript().get());
 75 |         assertEquals(region, locale.getRegion().get());
 76 |     }
 77 | 
 78 | 
 79 |     /** Parsing the string form of a locale again must give an equal locale. */
 80 |     @Test
 81 |     public void equalsYes() throws Exception {
 82 |         expectEqualsYes("en");
 83 |         expectEqualsYes("en-Latn-UK");
 84 |     }
 85 |     private void expectEqualsYes(String s) throws Exception {
 86 |         LdLocale locale1 = LdLocale.fromString(s);
 87 |         LdLocale locale2 = LdLocale.fromString(locale1.toString());
 88 |         assertEquals(locale1, locale2);
 89 |     }
 90 | 
 91 | 
 92 | 
 93 |     /** Every one of these strings must be rejected with an IllegalArgumentException. */
 94 |     @Test
 95 |     public void invalid() throws Exception {
 96 |         //language required
 97 |         expectInvalid("");
 98 |         expectInvalid(null);
 99 | 
100 |         //invalid syntax
101 |         expectInvalid("-");
102 |         expectInvalid("--");
103 |         expectInvalid("xx-");
104 |         expectInvalid("-xx");
105 |         expectInvalid("-xx-");
106 |         expectInvalid("de--CH");
107 |         expectInvalid("de--Latn");
108 | 
109 |         //invalid language: too short or too long
110 |         expectInvalid("x");
111 |         expectInvalid("xxxx");
112 | 
113 |         //wrong order
114 |         expectInvalid("de-CH-Latn");
115 | 
116 |         //missing language
117 |         expectInvalid("Latn");
118 |         expectInvalid("CH");
119 |         expectInvalid("CH-Latn");
120 | 
121 |         //incorrect case
122 |         expectInvalid("JA");
123 |         expectInvalid("ja-jp");
124 |         expectInvalid("ja-jpan");
125 |         expectInvalid("ja-JPAN");
126 | 
127 |         //incorrect separator
128 |         expectInvalid("de_CH");
129 |         expectInvalid("de CH");
130 |     }
131 |     public void expectInvalid(String s) throws Exception {
132 |         try {
133 |             LdLocale.fromString(s);
134 |             fail("Expected failure for: "+s);
135 |         } catch (IllegalArgumentException e) {
136 |             //ok, expected that
137 |         }
138 |     }
139 | 
140 | }
141 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileBuilder.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.profiles;
 18 | 
 19 | import com.optimaize.langdetect.i18n.LdLocale;
 20 | import com.optimaize.langdetect.ngram.NgramExtractor;
 21 | import org.jetbrains.annotations.NotNull;
 22 | 
 23 | import java.util.HashMap;
 24 | import java.util.Iterator;
 25 | import java.util.Map;
 26 | 
 27 | /**
 28 |  * Builder for {@link LanguageProfile}.
 29 |  *
 30 |  * <p>This class does no internal synchronization.</p>
 31 |  *
 32 |  * @author Fabian Kessler
 33 |  */
 34 | public class LanguageProfileBuilder {
 35 | 
 36 |     @NotNull
 37 |     private final LdLocale locale;
 38 |     private int minimalFrequency = 1;
 39 |     private NgramExtractor ngramExtractor;
 40 |     /** Key = gram length, value = the n-grams of that length with their summed up frequency. */
 41 |     private final Map<Integer, Map<String,Integer>> ngrams = new HashMap<>();
 42 | 
 43 | 
 44 |     public LanguageProfileBuilder(@NotNull LdLocale locale) {
 45 |         this.locale = locale;
 46 |     }
 47 |     /**
 48 |      * @param locale is parsed with {@link LdLocale#fromString}.
 49 |      * @deprecated use {@link #LanguageProfileBuilder(LdLocale)} instead.
 50 |      */
 51 |     @Deprecated
 52 |     public LanguageProfileBuilder(@NotNull String locale) {
 53 |         this.locale = LdLocale.fromString(locale);
 54 |     }
 55 | 
 56 |     /**
 57 |      * Copy constructor.
 58 |      *
 59 |      * <p>The n-gram counts are copied per gram length. Adding grams to one of the two builders, or building
 60 |      * it (which may remove rare grams), therefore does not alter the other builder.</p>
 61 |      */
 62 |     public LanguageProfileBuilder(@NotNull LanguageProfileBuilder languageProfileBuilder) {
 63 |         this.locale = languageProfileBuilder.locale;
 64 |         this.minimalFrequency = languageProfileBuilder.minimalFrequency;
 65 |         this.ngramExtractor = languageProfileBuilder.ngramExtractor;
 66 |         for (Map.Entry<Integer, Map<String,Integer>> entry : languageProfileBuilder.ngrams.entrySet()) {
 67 |             //a plain putAll() would share the inner maps between both builders.
 68 |             this.ngrams.put(entry.getKey(), new HashMap<String,Integer>(entry.getValue()));
 69 |         }
 70 |     }
 71 | 
 72 |     public LanguageProfileBuilder ngramExtractor(@NotNull NgramExtractor ngramExtractor) {
 73 |         this.ngramExtractor = ngramExtractor;
 74 |         return this;
 75 |     }
 76 | 
 77 |     /**
 78 |      * @param minimalFrequency 1-n, the default is 1. n-grams that occurred less often in the text are removed.
 79 |      *                         This really should be set to something higher.
 80 |      *                         Try to play with the number until you get a profile file of satisfying size,
 81 |      *                         that produces good language detection results.
 82 |      * @throws IllegalArgumentException if minimalFrequency is smaller than 1.
 83 |      */
 84 |     public LanguageProfileBuilder minimalFrequency(int minimalFrequency) {
 85 |         if (minimalFrequency < 1) throw new IllegalArgumentException("minimalFrequency must be >= 1, but was: "+minimalFrequency);
 86 |         this.minimalFrequency = minimalFrequency;
 87 |         return this;
 88 |     }
 89 | 
 90 |     /**
 91 |      * In order to use this you must set the {@link #ngramExtractor} first.
 92 |      *
 93 |      * @throws IllegalStateException if no NgramExtractor has been set.
 94 |      */
 95 |     public LanguageProfileBuilder addText(CharSequence text) {
 96 |         if (ngramExtractor==null) {
 97 |             throw new IllegalStateException("NgramExtractor has not been set yet!");
 98 |         }
 99 |         for (Map.Entry<String, Integer> entry : ngramExtractor.extractCountedGrams(text).entrySet()) {
100 |             addGram(entry.getKey(), entry.getValue());
101 |         }
102 |         return this;
103 |     }
104 | 
105 |     /**
106 |      * Shortcut for addGram(ngram, 1).
107 |      */
108 |     public LanguageProfileBuilder addGram(String ngram) {
109 |         return addGram(ngram, 1);
110 |     }
111 |     /**
112 |      * If the builder already has this ngram, the given frequency is added to the current count.
113 |      */
114 |     public LanguageProfileBuilder addGram(String ngram, int frequency) {
115 |         Map<String, Integer> map = ngrams.get(ngram.length());
116 |         if (map==null) {
117 |             map = new HashMap<>();
118 |             ngrams.put(ngram.length(), map);
119 |         }
120 |         Integer total = map.get(ngram);
121 |         if (total==null) total = 0;
122 |         total += frequency;
123 |         map.put(ngram, total);
124 |         return this;
125 |     }
126 | 
127 | 
128 |     /**
129 |      * Creates the profile from the grams that were added so far.
130 |      *
131 |      * <p>If a minimal frequency larger than 1 is set, then the grams below it are removed from this
132 |      * builder first.</p>
133 |      */
134 |     public LanguageProfile build() {
135 |         if (minimalFrequency >1) {
136 |             removeNgramsWithLessFrequency();
137 |         }
138 |         return new LanguageProfileImpl(locale, ngrams);
139 |     }
140 | 
141 | 
142 |     private void removeNgramsWithLessFrequency() {
143 |         for (Map<String, Integer> map : ngrams.values()) {
144 |             Iterator<Map.Entry<String, Integer>> iterator = map.entrySet().iterator();
145 |             while (iterator.hasNext()) {
146 |                 Map.Entry<String, Integer> next = iterator.next();
147 |                 if (next.getValue() < minimalFrequency) {
148 |                     iterator.remove();
149 |                 }
150 |             }
151 |         }
152 |     }
153 | 
154 | }
155 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/NgramFrequencyData.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect;
 18 | 
 19 | import com.optimaize.langdetect.i18n.LdLocale;
 20 | import com.optimaize.langdetect.profiles.LanguageProfile;
 21 | import org.jetbrains.annotations.NotNull;
 22 | import org.jetbrains.annotations.Nullable;
 23 | 
 24 | import java.util.*;
 25 | 
 26 | /**
 27 |  * Contains frequency information for n-grams coming from multiple {@link LanguageProfile}s.
 28 |  *
 29 |  * <p>For each n-gram string it knows the locales (languages) in which it occurs, and how frequent it
 30 |  * occurs in those languages in relation to other n-grams of the same length in those same languages.</p>
 31 |  *
 32 |  * <p>Immutable by definition (can't make Arrays unmodifiable).</p>
 33 |  *
 34 |  * @author Fabian Kessler
 35 |  */
 36 | public final class NgramFrequencyData {
 37 | 
 38 |     /**
 39 |      * Maps an n-gram to its probability per loaded language; the array positions match {@code langlist}.
 40 |      */
 41 |     @NotNull
 42 |     private final Map<String, double[]> wordLangProbMap;
 43 | 
 44 |     /**
 45 |      * The loaded languages. Position i in this list belongs to position i of every double[] in wordLangProbMap.
 46 |      * A language that does not know an n-gram has the value 0d at its position.
 47 |      */
 48 |     @NotNull
 49 |     private final List<LdLocale> langlist;
 50 | 
 51 | 
 52 |     /**
 53 |      * Collects the relative n-gram frequencies of all the given profiles.
 54 |      *
 55 |      * @param languageProfiles their iteration order defines the order of the language list.
 56 |      * @param gramLengths for example [1,2,3]
 57 |      * @throws java.lang.IllegalArgumentException if languageProfiles or gramLengths is empty, or if one of the
 58 |      *         languageProfiles does not have the grams of the required sizes.
 59 |      */
 60 |     @NotNull
 61 |     public static NgramFrequencyData create(@NotNull Collection<LanguageProfile> languageProfiles, @NotNull Collection<Integer> gramLengths) throws IllegalArgumentException {
 62 |         if (languageProfiles.isEmpty()) throw new IllegalArgumentException("No languageProfiles provided!");
 63 |         if (gramLengths.isEmpty()) throw new IllegalArgumentException("No gramLengths provided!");
 64 | 
 65 |         final int numLanguages = languageProfiles.size();
 66 |         final Map<String, double[]> probsByGram = new HashMap<>();
 67 |         final List<LdLocale> locales = new ArrayList<>();
 68 | 
 69 |         int langPos = 0;
 70 |         for (LanguageProfile profile : languageProfiles) {
 71 |             locales.add( profile.getLocale() );
 72 |             for (Integer gramLength : gramLengths) {
 73 |                 if (!profile.getGramLengths().contains(gramLength)) {
 74 |                     throw new IllegalArgumentException("The language profile for "+profile.getLocale()+" does not contain "+gramLength+"-grams!");
 75 |                 }
 76 |                 for (Map.Entry<String, Integer> ngramEntry : profile.iterateGrams(gramLength)) {
 77 |                     String ngram = ngramEntry.getKey();
 78 |                     double[] probs = probsByGram.get(ngram);
 79 |                     if (probs == null) {
 80 |                         //first language to bring this n-gram; the other positions stay 0d unless set later.
 81 |                         probs = new double[numLanguages];
 82 |                         probsByGram.put(ngram, probs);
 83 |                     }
 84 |                     //relative to all occurrences of n-grams of the same length in the same profile.
 85 |                     probs[langPos] = ngramEntry.getValue().doubleValue() / profile.getNumGramOccurrences(ngram.length());
 86 |                 }
 87 |             }
 88 |             langPos++;
 89 |         }
 90 | 
 91 |         return new NgramFrequencyData(probsByGram, locales);
 92 |     }
 93 | 
 94 |     private NgramFrequencyData(@NotNull Map<String, double[]> wordLangProbMap,
 95 |                                @NotNull List<LdLocale> langlist) {
 96 |         //only called from create() with freshly made collections, thus wrapping without copying (optimization).
 97 |         this.langlist = Collections.unmodifiableList(langlist);
 98 |         this.wordLangProbMap = Collections.unmodifiableMap(wordLangProbMap);
 99 |     }
100 | 
101 | 
102 |     /**
103 |      * @return the loaded languages as unmodifiable list.
104 |      */
105 |     @NotNull
106 |     public List<LdLocale> getLanguageList() {
107 |         return langlist;
108 |     }
109 |     /**
110 |      * @param pos position in the {@link #getLanguageList() language list}.
111 |      */
112 |     @NotNull
113 |     public LdLocale getLanguage(int pos) {
114 |         return langlist.get(pos);
115 |     }
116 | 
117 |     /**
118 |      * Don't modify this data structure! (Can't make array immutable...)
119 |      * @return null if no language profile knows that ngram.
120 |      *         entries are 0 for languages that don't know that ngram at all.
121 |      *         The array is in the order of the {@link #getLanguageList()} language list, and has exactly that size.
122 |      *         impl note: this way the caller can handle it more efficiently than if an empty array were returned.
123 |      */
124 |     @Nullable
125 |     public double[] getProbabilities(String ngram) {
126 |         return wordLangProbMap.get(ngram);
127 |     }
128 | }
129 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.text;
 18 | 
 19 | import java.util.HashMap;
 20 | import java.util.HashSet;
 21 | import java.util.Map;
 22 | import java.util.Set;
 23 | 
 24 | /**
 25 |  * Removes text written in scripts that are not the dominant script of the text.
 26 |  *
 27 |  * TODO this does not do special handling for Japanese (3 scripts) and Korean (2 scripts), they should be
 28 |  * counted together and kept.
 29 |  *
 30 |  * @author Fabian Kessler
 31 |  */
 32 | public class RemoveMinorityScriptsTextFilter implements TextFilter {
 33 | 
 34 |     private final double threshold;
 35 | 
 36 |     /**
 37 |      * If a script has less than this fraction of content compared to the most used one, its text is removed.
 38 |      *
 39 |      * Example: Latin 10%, Cyrillic 80%, Common 10% (punctuation n'stuff). Now 10 is put in relation to 80.
 40 |      *
 41 |      * @param threshold 0-1, suggested value is 0.3. If smaller then removed, equal remains.
 42 |      */
 43 |     public static RemoveMinorityScriptsTextFilter forThreshold(double threshold) {
 44 |         return new RemoveMinorityScriptsTextFilter(threshold);
 45 |     }
 46 | 
 47 |     private RemoveMinorityScriptsTextFilter(double threshold) {
 48 |         this.threshold = threshold;
 49 |     }
 50 | 
 51 |     @Override
 52 |     public String filter(CharSequence text) {
 53 |         Map<Character.UnicodeScript, Long> counts = countByScript(text);
 54 |         if (counts.size()<=1) {
 55 |             //nothing to do
 56 |             return text.toString();
 57 |         } else {
 58 |             long most = findMost(counts);
 59 |             Set<Character.UnicodeScript> toRemove = new HashSet<>();
 60 |             for (Map.Entry<Character.UnicodeScript, Long> entry : counts.entrySet()) {
 61 |                 if (entry.getValue()==most) continue;
 62 |                 double ratio = entry.getValue().doubleValue() / most;
 63 |                 if (ratio <= threshold) {
 64 |                     toRemove.add(entry.getKey());
 65 |                 }
 66 |             }
 67 |             if (toRemove.isEmpty()) {
 68 |                 return text.toString();
 69 |             } else {
 70 |                 return remove(text, toRemove);
 71 |             }
 72 |         }
 73 |     }
 74 | 
 75 |     private String remove(CharSequence text, Set<Character.UnicodeScript> toRemove) {
 76 |         StringBuilder remaining = new StringBuilder();
 77 |         Character.UnicodeScript last = null;
 78 |         for (int i=0; i<text.length(); i++) {
 79 |             char c = text.charAt(i);
 80 |             Character.UnicodeScript unicodeScript = Character.UnicodeScript.of(c);
 81 |             if (unicodeScript == Character.UnicodeScript.INHERITED) {
 82 |                 if (toRemove.contains(last)) {
 83 |                     //remove, don't update 'last'
 84 |                     continue;
 85 |                 }
 86 |             }
 87 |             last = unicodeScript;
 88 |             if (toRemove.contains(unicodeScript)) {
 89 |                 continue; //remove it
 90 |             }
 91 |             //if we get here then we keep it.
 92 |             remaining.append(c);
 93 |         }
 94 |         return remaining.toString();
 95 |     }
 96 | 
 97 |     private long findMost(Map<Character.UnicodeScript, Long> counts) {
 98 |         long max = 0L;
 99 |         for (Long aLong : counts.values()) {
100 |             if (aLong > max) max = aLong;
101 |         }
102 |         return max;
103 |     }
104 | 
105 |     private Map<Character.UnicodeScript, Long> countByScript(CharSequence text) {
106 |         Map<Character.UnicodeScript, Long> counter = new HashMap<>();
107 |         Character.UnicodeScript last = null;
108 |         for (int i=0; i<text.length(); i++) {
109 |             char c = text.charAt(i);
110 |             Character.UnicodeScript unicodeScript = Character.UnicodeScript.of(c);
111 |             switch (unicodeScript) {
112 |                 case INHERITED:
113 |                     //counts as what the last was.
114 |                     if (last!=null) { //really shouldn't be null
115 |                         increment(counter, last);
116 |                     }
117 |                     break;
118 |                 case COMMON:
119 |                 case UNKNOWN:
120 |                     //don't count it
121 |                     break;
122 |                 default:
123 |                     increment(counter, unicodeScript);
124 |                     last = unicodeScript;
125 |             }
126 |         }
127 |         return counter;
128 |     }
129 |     private void increment(Map<Character.UnicodeScript, Long> counter, Character.UnicodeScript unicodeScript) {
130 |         Long number = counter.get(unicodeScript);
131 |         if (number==null) {
132 |             counter.put(unicodeScript, 1L);
133 |         } else {
134 |             counter.put(unicodeScript, number+1);
135 |         }
136 |     }
137 | 
138 | }
139 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/text/TextObject.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.text;
 18 | 
 19 | import com.optimaize.langdetect.cybozu.util.CharNormalizer;
 20 | import com.google.common.annotations.Beta;
 21 | import org.jetbrains.annotations.NotNull;
 22 | 
 23 | import java.io.IOException;
 24 | import java.io.Reader;
 25 | 
 26 | /**
 27 |  * A convenient text object implementing CharSequence and Appendable.
 28 |  *
 29 |  * This is an ideal object to use for learning text to create {@link com.optimaize.langdetect.profiles.LanguageProfile}s,
 30 |  * as well as to pass it in to {@link com.optimaize.langdetect.LanguageDetector#detect}.
 31 |  *
 32 |  * To get one, use a TextObjectFactory (through a TextObjectFactoryBuilder).
 33 |  *
 34 |  * Example use:
 35 |  * //create the factory once:
 36 |  * TextObjectFactory textObjectFactory = new TextObjectFactoryBuilder()
 37 |  *     .withTextFilter(UrlTextFilter.getInstance())
 38 |  *     .build();
 39 |  * //then create as many text objects as you like:
 40 |  * TextObject inputText = textObjectFactory.create().append("deutsche Text").append(" ").append("blah blah");
 41 |  *
 42 |  * All append() methods go through the {@code textFilter}.
 43 |  *
 44 |  * Equals/hashCode are not implemented as of now on purpose. You may want to call toString() and compare that.
 45 |  *
 46 |  * @author Fabian Kessler
 47 |  */
 48 | @Beta
 49 | public class TextObject implements CharSequence, Appendable {
 50 | 
 51 |     @NotNull
 52 |     private final TextFilter textFilter;
 53 | 
 54 |     @NotNull
 55 |     private final StringBuilder stringBuilder;
 56 | 
 57 |     private final int maxTextLength;
 58 | 
 59 | 
 60 |     /**
 61 |      * @param maxTextLength 0 for no limit
 62 |      */
 63 |     public TextObject(@NotNull TextFilter textFilter, int maxTextLength) {
 64 |         this.textFilter = textFilter;
 65 |         this.maxTextLength = maxTextLength;
 66 |         this.stringBuilder = new StringBuilder();
 67 |     }
 68 | 
 69 | 
 70 |     /**
 71 |      * Append the target text for language detection.
 72 |      * This method read the text from specified input reader.
 73 |      * If the total size of target text exceeds the limit size,
 74 |      * the rest is ignored.
 75 |      *
 76 |      * @param reader the input reader (BufferedReader as usual)
 77 |      * @throws java.io.IOException Can't read the reader.
 78 |      */
 79 |     public TextObject append(Reader reader) throws IOException {
 80 |         char[] buf = new char[1024];
 81 |         while (reader.ready() && (maxTextLength==0 || stringBuilder.length()<maxTextLength)) {
 82 |             int length = reader.read(buf);
 83 |             append(String.valueOf(buf, 0, length));
 84 |         }
 85 |         return this;
 86 |     }
 87 | 
 88 |     /**
 89 |      * Append the target text for language detection.
 90 |      * If the total size of target text exceeds the limit size ,
 91 |      * the rest is cut down.
 92 |      *
 93 |      * @param text the target text to append
 94 |      */
 95 |     @Override
 96 |     public TextObject append(CharSequence text) {
 97 |         if (maxTextLength>0 && stringBuilder.length()>=maxTextLength) return this;
 98 | 
 99 |         text = textFilter.filter(text);
100 | 
101 |         //unfortunately this code can't be put into a TextFilter because:
102 |         //1) the limit could not be detected early, a lot of work would be done to waste time and memory
103 |         //2) the last character of the existing string builder could not be seen. if it is a space, we don't want
104 |         //   to add yet another space.
105 |         char pre = stringBuilder.length()==0 ? 0 : stringBuilder.charAt(stringBuilder.length()-1);
106 |         for (int i=0; i<text.length() && (maxTextLength==0 || stringBuilder.length()<maxTextLength); i++) {
107 |             char c = CharNormalizer.normalize(text.charAt(i));
108 |             if (c != ' ' || pre != ' ') {
109 |                 stringBuilder.append(c);
110 |             }
111 |             pre = c;
112 |         }
113 | 
114 |         return this;
115 |     }
116 | 
117 |     @Override
118 |     public Appendable append(CharSequence csq, int start, int end) throws IOException {
119 |         return append(csq.subSequence(start, end));
120 |     }
121 | 
122 |     @Override
123 |     public Appendable append(char c) throws IOException {
124 |         return append(Character.toString(c));
125 |     }
126 | 
127 | 
128 | 
129 |     @Override
130 |     public int length() {
131 |         return stringBuilder.length();
132 |     }
133 | 
134 |     @Override
135 |     public char charAt(int index) {
136 |         return stringBuilder.charAt(index);
137 |     }
138 | 
139 |     @Override
140 |     public CharSequence subSequence(int start, int end) {
141 |         return stringBuilder.subSequence(start, end);
142 |     }
143 | 
144 |     @Override @NotNull
145 |     public String toString() {
146 |         return stringBuilder.toString(); //only correct impl, see interface CharSequence!
147 |     }
148 | 
149 | }
150 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/DataLanguageDetectorImplTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect;
 18 | 
 19 | import com.optimaize.langdetect.ngram.NgramExtractors;
 20 | import com.optimaize.langdetect.profiles.LanguageProfile;
 21 | import com.optimaize.langdetect.profiles.LanguageProfileReader;
 22 | import com.optimaize.langdetect.text.CommonTextObjectFactories;
 23 | import org.testng.annotations.DataProvider;
 24 | import org.testng.annotations.Test;
 25 | 
 26 | import java.io.BufferedReader;
 27 | import java.io.IOException;
 28 | import java.io.InputStream;
 29 | import java.io.InputStreamReader;
 30 | import java.nio.charset.StandardCharsets;
 31 | import java.util.List;
 32 | 
 33 | import static org.junit.Assert.assertEquals;
 34 | 
 35 | /**
 36 |  * Uses all built-in language profiles and tests some simple clean phrases as well as longer texts  against them
 37 |  * with expected outcome.
 38 |  *
 39 |  * @author Fabian Kessler
 40 |  */
 41 | public class DataLanguageDetectorImplTest {
 42 | 
 43 |     private final LanguageDetector shortDetector;
 44 |     private final LanguageDetector longDetector;
 45 | 
 46 |     public DataLanguageDetectorImplTest() throws IOException {
 47 |         List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
 48 | 
 49 |         shortDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
 50 |                 .shortTextAlgorithm(100)
 51 |                 .withProfiles(languageProfiles)
 52 |                 .build();
 53 | 
 54 |         longDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
 55 |                 .shortTextAlgorithm(0)
 56 |                 .withProfiles(new LanguageProfileReader().readAllBuiltIn())
 57 |                 .build();
 58 |     }
 59 | 
 60 |     @Test(dataProvider = "shortCleanTexts")
 61 |     public void shortTextAlgo(String expectedLanguage, CharSequence text) throws IOException {
 62 |         assertEquals(shortDetector.getProbabilities(text).get(0).getLocale().getLanguage(), expectedLanguage);
 63 |         //the detect() method doesn't have enough confidence for all these short texts.
 64 |     }
 65 | 
 66 |     @Test(dataProvider = "shortCleanTexts")
 67 |     public void longTextAlgoWorkingOnShortText(String expectedLanguage, CharSequence text) throws IOException {
 68 |         assertEquals(longDetector.getProbabilities(text).get(0).getLocale().getLanguage(), expectedLanguage);
 69 |         //the detect() method doesn't have enough confidence for all these short texts.
 70 |     }
 71 | 
 72 |     @Test(dataProvider = "longerWikipediaTexts")
 73 |     public void longTextAlgoWorkingOnLongText(String expectedLanguage, CharSequence text) throws IOException {
 74 |         assertEquals(longDetector.getProbabilities(text).get(0).getLocale().getLanguage(), expectedLanguage);
 75 |         assertEquals(longDetector.detect(text).get().getLanguage(), expectedLanguage);
 76 |     }
 77 | 
 78 |     @DataProvider
 79 |     protected Object[][] shortCleanTexts() {
 80 |         return new Object[][] {
 81 |                 {"en", shortCleanText("This is some English text.")},
 82 |                 {"fr", shortCleanText("Ceci est un texte français.")},
 83 |                 {"nl", shortCleanText("Dit is een Nederlandse tekst.")},
 84 |                 {"de", shortCleanText("Dies ist eine deutsche Text")},
 85 |                 {"km", shortCleanText("សព្វវចនាធិប្បាយសេរីសម្រាប់អ្នកទាំងអស់គ្នា។" + "នៅក្នុងវិគីភីឌាភាសាខ្មែរឥឡូវនេះមាន ១១៩៨រូបភាព សមាជិក១៥៣៣៣នាក់ និងមាន៤៥៨៣អត្ថបទ។")},
 86 |                 {"bg", shortCleanText("Европа не трябва да стартира нов конкурентен маратон и изход с приватизация")},
 87 |                 {"wa", shortCleanText("Çouchal c' est on tecse pår e walon.")},
 88 |         };
 89 |     }
 90 |     private CharSequence shortCleanText(CharSequence text) {
 91 |         return CommonTextObjectFactories.forDetectingShortCleanText().forText( text );
 92 |     }
 93 | 
 94 |     @DataProvider
 95 |     protected Object[][] longerWikipediaTexts() {
 96 |         return new Object[][] {
 97 |                 {"de", largeText(readText("/texts/de-wikipedia-Deutschland.txt"))},
 98 |                 {"fr", largeText(readText("/texts/fr-wikipedia-France.txt"))},
 99 |                 {"it", largeText(readText("/texts/it-wikipedia-Italia.txt"))},
100 |         };
101 |     }
102 | 
103 |     private CharSequence readText(String path) {
104 |         try (InputStream inputStream = DataLanguageDetectorImplTest.class.getResourceAsStream(path)) {
105 |             try (BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
106 |                 StringBuilder sb = new StringBuilder();
107 |                 String str;
108 |                 while ((str = in.readLine()) != null) {
109 |                     sb.append(str);
110 |                 }
111 |                 return sb.toString();
112 |             }
113 |         } catch (IOException e) {
114 |             throw new RuntimeException(e);
115 |         }
116 |     }
117 | 
118 |     private CharSequence largeText(CharSequence text) {
119 |         return CommonTextObjectFactories.forDetectingOnLargeText().forText( text );
120 |     }
121 | 
122 | 
123 | 
124 | 
125 | }
126 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileReaderTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.profiles;
 18 | 
 19 | import com.google.common.collect.ImmutableList;
 20 | import com.optimaize.langdetect.i18n.LdLocale;
 21 | import org.junit.Test;
 22 | 
 23 | import java.io.File;
 24 | import java.io.IOException;
 25 | import java.util.HashSet;
 26 | import java.util.List;
 27 | import java.util.Set;
 28 | 
 29 | import static org.hamcrest.Matchers.*;
 30 | import static org.junit.Assert.*;
 31 | 
 32 | /**
 33 |  * @author Fabian Kessler
 34 |  * @author François ROLAND
 35 |  */
 36 | public class LanguageProfileReaderTest {
 37 | 
 38 |     private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages");
 39 | 
 40 | 
 41 |     /*
 42 |     * In case someone creates new language profiles then these numbers need to be adjusted.
 43 |     */
 44 | 
 45 |     @Test
 46 |     public void readEnFile() throws IOException {
 47 |         checkProfileFile("en", 3, 2301, 26164, 3774627);
 48 |     }
 49 | 
 50 |     @Test
 51 |     public void readBnFile() throws IOException {
 52 |         checkProfileFile("bn", 3, 2846, 198, 22964);
 53 |     }
 54 | 
 55 |     @Test
 56 |     public void readFrFile() throws IOException {
 57 |         checkProfileFile("fr", 3, 2232, 6653, 1120211);
 58 |     }
 59 | 
 60 |     @Test
 61 |     public void readNlFile() throws IOException {
 62 |         checkProfileFile("nl", 3, 2163, 5640, 1373884);
 63 |     }
 64 | 
 65 |     private static void checkProfileFile(String language, int nWordSize, int freqSize, long minFreq, long maxFreq) throws IOException {
 66 |         File profileFile = new File(PROFILE_DIR, language);
 67 |         final LanguageProfile languageProfile = new LanguageProfileReader().read(profileFile);
 68 |         assertThat(languageProfile, is(notNullValue()));
 69 |         assertThat(languageProfile.getLocale().getLanguage(), is(equalTo(language)));
 70 |         assertEquals(languageProfile.getGramLengths().size(), nWordSize);
 71 |         assertEquals(languageProfile.getGramLengths(), ImmutableList.of(1, 2, 3));
 72 |         assertEquals(languageProfile.getNumGrams(), freqSize);
 73 | 
 74 |         assertTrue(languageProfile.getMinGramCount(nWordSize) < languageProfile.getMaxGramCount(nWordSize));
 75 |         assertEquals(languageProfile.getMinGramCount(nWordSize), minFreq);
 76 |         assertEquals(languageProfile.getMaxGramCount(nWordSize), maxFreq);
 77 |     }
 78 | 
 79 | 
 80 |     @Test
 81 |     public void readFromDir() throws IOException {
 82 |         List<LanguageProfile> read = new LanguageProfileReader().read(ImmutableList.of("de", "fr"));
 83 |         assertEquals(read.size(), 2);
 84 |     }
 85 | 
 86 |     @Test
 87 |     public void readFromDirWithClassloader() throws IOException {
 88 |         List<LanguageProfile> read = new LanguageProfileReader().read(
 89 |                 LanguageProfileReaderTest.class.getClassLoader(),
 90 |                 "languages",
 91 |                 ImmutableList.of("de", "fr")
 92 |         );
 93 |         assertEquals(read.size(), 2);
 94 |     }
 95 | 
 96 | 
 97 |     @Test
 98 |     public void read() throws IOException {
 99 |         List<LanguageProfile> read = new LanguageProfileReader().read(ImmutableList.of("de", "fr"));
100 |         assertEquals(read.size(), 2);
101 |     }
102 | 
103 |     @Test
104 |     public void read_folder() throws IOException {
105 |         List<LanguageProfile> read = new LanguageProfileReader().read("languages", ImmutableList.of("de", "fr"));
106 |         assertEquals(read.size(), 2);
107 |     }
108 | 
109 |     @Test
110 |     public void read_classpathAndFolder() throws IOException {
111 |         List<LanguageProfile> read = new LanguageProfileReader().read(LanguageProfileReaderTest.class.getClassLoader(), "languages", ImmutableList.of("de", "fr"));
112 |         assertEquals(read.size(), 2);
113 |     }
114 | 
115 |     @Test
116 |     public void readAllBuiltIn() throws IOException {
117 |         verify_readAllBuiltIn(new LanguageProfileReader().readAllBuiltIn());
118 |     }
119 |     private void verify_readAllBuiltIn(List<LanguageProfile> profiles) {
120 |         assertEquals(profiles.size(), 71); //adjust this number when adding more languages
121 |         Set<LdLocale> allLangs = new HashSet<>();
122 |         for (LanguageProfile profile : profiles) {
123 |             assertFalse("Duplicate language: "+profile.getLocale(), allLangs.contains(profile.getLocale()));
124 |             allLangs.add(profile.getLocale());
125 |         }
126 |         assertTrue(allLangs.contains(LdLocale.fromString("de")));
127 |         assertTrue(allLangs.contains(LdLocale.fromString("zh-CN")));
128 |         assertTrue(allLangs.contains(LdLocale.fromString("zh-TW")));
129 |     }
130 | 
131 | 
132 |     @Test
133 |     public void loadProfilesFromClasspath() throws IOException {
134 |         List<LanguageProfile> result = new LanguageProfileReader().read(this.getClass().getClassLoader(), "languages", ImmutableList.of("en", "fr", "nl", "de"));
135 |         assertEquals(result.size(), 4);
136 |     }
137 | 
138 |     @Test
139 |     public void loadProfilesFromFile() throws IOException {
140 |         List<LanguageProfile> result = new LanguageProfileReader().readAll(new File(new File(new File(new File("src"), "main"), "resources"), "languages"));
141 |         assertEquals(result.size(), 71); //adjust this number when adding more languages
142 |     }
143 | 
144 | }
145 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/BuiltInLanguages.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Nicole Torres
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.profiles;
 18 | 
 19 | import com.google.common.collect.ImmutableList;
 20 | import com.optimaize.langdetect.i18n.LdLocale;
 21 | 
 22 | import java.util.ArrayList;
 23 | import java.util.List;
 24 | 
 25 | /**
 26 |  * @author Nicole Torres
 27 |  */
 28 | public class BuiltInLanguages {
 29 | 
 30 |     private static final List<LdLocale> languages;
 31 |     private static final List<String> shortTextLanguages;
 32 | 
 33 |     static {
 34 |         List<LdLocale> names = new ArrayList<>();
 35 | 
 36 |         //sorted alphabetically
 37 |         names.add(LdLocale.fromString("af"));
 38 |         names.add(LdLocale.fromString("an"));
 39 |         names.add(LdLocale.fromString("ar"));
 40 |         names.add(LdLocale.fromString("ast"));
 41 |         names.add(LdLocale.fromString("be"));
 42 |         names.add(LdLocale.fromString("bg"));
 43 |         names.add(LdLocale.fromString("bn"));
 44 |         names.add(LdLocale.fromString("br"));
 45 |         names.add(LdLocale.fromString("ca"));
 46 |         names.add(LdLocale.fromString("cs"));
 47 |         names.add(LdLocale.fromString("cy"));
 48 |         names.add(LdLocale.fromString("da"));
 49 |         names.add(LdLocale.fromString("de"));
 50 |         names.add(LdLocale.fromString("el"));
 51 |         names.add(LdLocale.fromString("en"));
 52 |         names.add(LdLocale.fromString("es"));
 53 |         names.add(LdLocale.fromString("et"));
 54 |         names.add(LdLocale.fromString("eu"));
 55 |         names.add(LdLocale.fromString("fa"));
 56 |         names.add(LdLocale.fromString("fi"));
 57 |         names.add(LdLocale.fromString("fr"));
 58 |         names.add(LdLocale.fromString("ga"));
 59 |         names.add(LdLocale.fromString("gl"));
 60 |         names.add(LdLocale.fromString("gu"));
 61 |         names.add(LdLocale.fromString("he"));
 62 |         names.add(LdLocale.fromString("hi"));
 63 |         names.add(LdLocale.fromString("hr"));
 64 |         names.add(LdLocale.fromString("ht"));
 65 |         names.add(LdLocale.fromString("hu"));
 66 |         names.add(LdLocale.fromString("id"));
 67 |         names.add(LdLocale.fromString("is"));
 68 |         names.add(LdLocale.fromString("it"));
 69 |         names.add(LdLocale.fromString("ja"));
 70 |         names.add(LdLocale.fromString("km"));
 71 |         names.add(LdLocale.fromString("kn"));
 72 |         names.add(LdLocale.fromString("ko"));
 73 |         names.add(LdLocale.fromString("lt"));
 74 |         names.add(LdLocale.fromString("lv"));
 75 |         names.add(LdLocale.fromString("mk"));
 76 |         names.add(LdLocale.fromString("ml"));
 77 |         names.add(LdLocale.fromString("mr"));
 78 |         names.add(LdLocale.fromString("ms"));
 79 |         names.add(LdLocale.fromString("mt"));
 80 |         names.add(LdLocale.fromString("ne"));
 81 |         names.add(LdLocale.fromString("nl"));
 82 |         names.add(LdLocale.fromString("no"));
 83 |         names.add(LdLocale.fromString("oc"));
 84 |         names.add(LdLocale.fromString("pa"));
 85 |         names.add(LdLocale.fromString("pl"));
 86 |         names.add(LdLocale.fromString("pt"));
 87 |         names.add(LdLocale.fromString("ro"));
 88 |         names.add(LdLocale.fromString("ru"));
 89 |         names.add(LdLocale.fromString("sk"));
 90 |         names.add(LdLocale.fromString("sl"));
 91 |         names.add(LdLocale.fromString("so"));
 92 |         names.add(LdLocale.fromString("sq"));
 93 |         names.add(LdLocale.fromString("sr"));
 94 |         names.add(LdLocale.fromString("sv"));
 95 |         names.add(LdLocale.fromString("sw"));
 96 |         names.add(LdLocale.fromString("ta"));
 97 |         names.add(LdLocale.fromString("te"));
 98 |         names.add(LdLocale.fromString("th"));
 99 |         names.add(LdLocale.fromString("tl"));
100 |         names.add(LdLocale.fromString("tr"));
101 |         names.add(LdLocale.fromString("uk"));
102 |         names.add(LdLocale.fromString("ur"));
103 |         names.add(LdLocale.fromString("vi"));
104 |         names.add(LdLocale.fromString("wa"));
105 |         names.add(LdLocale.fromString("yi"));
106 |         names.add(LdLocale.fromString("zh-CN"));
107 |         names.add(LdLocale.fromString("zh-TW"));
108 | 
109 |         languages = ImmutableList.copyOf(names);
110 |     }
111 | 
112 |     static {
113 |         List<String> texts = new ArrayList<>();
114 |         texts.add("cs");
115 |         texts.add("da");
116 |         texts.add("de");
117 |         texts.add("en");
118 |         texts.add("es");
119 |         texts.add("fi");
120 |         texts.add("fr");
121 |         texts.add("id");
122 |         texts.add("it");
123 |         texts.add("nl");
124 |         texts.add("no");
125 |         texts.add("pl");
126 |         texts.add("pt");
127 |         texts.add("ro");
128 |         texts.add("sv");
129 |         texts.add("tr");
130 |         texts.add("vi");
131 |         shortTextLanguages = ImmutableList.copyOf(texts);
132 |     }
133 | 
134 |     /**
135 |      * Returns the languages for which the library provides full profiles.
136 |      * Full provides are generated from regular text, usually Wikipedia abstracts.
137 |      * @return immutable
138 |      */
139 |     public static List<LdLocale> getLanguages() {
140 |         return languages;
141 |     }
142 | 
143 |     /**
144 |      * Returns the languages for which the library provides profiles created from short text.
145 |      * Twitter was used as source by @shuyo.
146 |      * Much less languages have short text profiles as of now.
147 |      * @return immutable
148 |      */
149 |     public static List<String> getShortTextLanguages() {
150 |         return shortTextLanguages;
151 |     }
152 | }
153 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/cybozu/util/LangProfile.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Nakatani Shuyo
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | /*
 18 |  * This file has been modified by François ROLAND.
 19 |  */
 20 | 
 21 | package com.optimaize.langdetect.cybozu.util;
 22 | 
 23 | import org.jetbrains.annotations.NotNull;
 24 | 
 25 | import java.io.Serializable;
 26 | import java.util.HashMap;
 27 | import java.util.Iterator;
 28 | import java.util.Map;
 29 | import java.util.Set;
 30 | 
 31 | /**
 32 |  * {@link LangProfile} is a Language Profile Class.
 33 |  * Users don't use this class directly.
 34 |  *
 35 |  * TODO split into builder and immutable class.
 36 |  *
 37 |  * TODO currently this only makes n-grams with the space before a word included. no n-gram with the space after the word.
 38 |  * Example: "foo" creates " fo" as 3gram, but not "oo ". Either this is a bug, or if intended then needs documentation.
 39 |  * 
 40 |  * @author Nakatani Shuyo
 41 |  * @deprecated replaced by LanguageProfile
 42 |  */
 43 | @Deprecated
 44 | public class LangProfile implements Serializable {
 45 | 
 46 | 	private static final long serialVersionUID = 1L;
 47 | 
 48 |     /**
 49 |      * n-grams that occur less than this often can be removed using omitLessFreq().
 50 |      * This number can change, see LESS_FREQ_RATIO.
 51 |      */
 52 | 	private static final int MINIMUM_FREQ = 2;
 53 | 
 54 |     /**
 55 |      * Explanation by example:
 56 |      *
 57 |      * If the most frequent n-gram occurs 1 mio times, then
 58 |      * 1'000'000 / this (100'000) = 10.
 59 |      * 10 is larger than MINIMUM_FREQ (2), thus MINIMUM_FREQ remains at 2.
 60 |      * All n-grams that occur less than 2 times can be removed as noise using omitLessFreq().
 61 |      *
 62 |      * If the most frequent n-gram occurs 5000 times, then
 63 |      * 5'000 / this (100'000) = 0.05.
 64 |      * 0.05 is smaller than MINIMUM_FREQ (2), thus MINIMUM_FREQ becomes 0.
 65 |      * No n-grams are removed because of insignificance when calling omitLessFreq().
 66 |      */
 67 |     private static final int LESS_FREQ_RATIO = 100000;
 68 | 
 69 |     /**
 70 |      * The language name (identifier).
 71 |      */
 72 |     private String name = null;
 73 | 
 74 |     /**
 75 |      * Key = ngram, value = count.
 76 |      * All n-grams are in here (1-gram, 2-gram, 3-gram).
 77 |      */
 78 |     private Map<String, Integer> freq = new HashMap<>();
 79 | 
 80 |     /**
 81 |      * Tells how many occurrences of n-grams exist per gram length.
 82 |      * When making 1grams, 2grams and 3grams (currently) then this contains 3 entries where
 83 |      * element 0 = number occurrences of 1-grams
 84 |      * element 1 = number occurrences of 2-grams
 85 |      * element 2 = number occurrences of 3-grams
 86 |      * Example: if there are 57 1-grams (English language has about that many) and the training text is
 87 |      * fairly long, then this number is in the millions.
 88 |      */
 89 |     private int[] nWords = new int[NGram.N_GRAM];
 90 | 
 91 |     /**
 92 |      * Constructor for JSONIC 
 93 |      */
 94 |     public LangProfile() {}
 95 | 
 96 |     /**
 97 |      * Normal Constructor
 98 |      * @param name language name
 99 |      */
100 |     public LangProfile(String name) {
101 |         this.setName(name);
102 |     }
103 |     
104 |     /**
105 |      * Add n-gram to profile
106 |      * @param gram
107 |      */
108 |     public void add(@NotNull String gram) {
109 |         if (name == null) throw new IllegalStateException();
110 |         int len = gram.length();
111 |         if (len < 1 || len > NGram.N_GRAM) {
112 |             throw new IllegalArgumentException("ngram length must be 1-3 but was "+len+": >>>"+gram+"<<<!");
113 |         }
114 |         nWords[len - 1]++;
115 |         if (freq.containsKey(gram)) {
116 |             freq.put(gram, freq.get(gram) + 1);
117 |         } else {
118 |             freq.put(gram, 1);
119 |         }
120 |     }
121 | 
122 |     /**
123 |      * Removes ngrams that occur fewer times than MINIMUM_FREQ to get rid of rare ngrams.
124 |      *
125 |      * Also removes ascii ngrams if the total number of ascii ngrams is less than one third of the total.
126 |      * This is done because non-latin text (such as Chinese) often has some latin noise in between.
127 |      *
128 |      * TODO split the 2 cleaning to separate methods.
129 |      * TODO distinguish ascii/latin, currently it looks for latin only, should include characters with diacritics, eg Vietnamese.
130 |      * TODO current code counts ascii, but removes any latin. is that desired? if so then this needs documentation.
131 |      */
132 |     public void omitLessFreq() {
133 |         if (name == null) throw new IllegalStateException();
134 | 
135 |         int threshold = nWords[0] / LESS_FREQ_RATIO;
136 |         if (threshold < MINIMUM_FREQ) threshold = MINIMUM_FREQ;
137 |         
138 |         Set<String> keys = freq.keySet();
139 |         int roman = 0;
140 |         for(Iterator<String> i = keys.iterator(); i.hasNext(); ){
141 |             String key = i.next();
142 |             int count = freq.get(key);
143 |             if (count <= threshold) {
144 |                 nWords[key.length()-1] -= count;
145 |                 i.remove();
146 |             } else {
147 |                 if (key.matches("^[A-Za-z]$")) {
148 |                     roman += count;
149 |                 }
150 |             }
151 |         }
152 | 
153 |         // roman check
154 |         if (roman < nWords[0] / 3) {
155 |             Set<String> keys2 = freq.keySet();
156 |             for(Iterator<String> i = keys2.iterator(); i.hasNext(); ){
157 |                 String key = i.next();
158 |                 if (key.matches(".*[A-Za-z].*")) {
159 |                     nWords[key.length()-1] -= freq.get(key);
160 |                     i.remove();
161 |                 }
162 |             }
163 |         }
164 |     }
165 | 
166 | 	public String getName() {
167 | 		return name;
168 | 	}
169 | 
170 | 	public void setName(String name) {
171 | 		this.name = name;
172 | 	}
173 | 
174 | 	public Map<String, Integer> getFreq() {
175 | 		return freq;
176 | 	}
177 | 
178 | 	public void setFreq(Map<String, Integer> freq) {
179 | 		this.freq = freq;
180 | 	}
181 | 
182 | 	public int[] getNWords() {
183 | 		return nWords;
184 | 	}
185 | 
186 | 	public void setNWords(int[] nWords) {
187 | 		this.nWords = nWords;
188 | 	}
189 | }
190 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # language-detector
  2 | 
  3 | Language Detection Library for Java
  4 | 
  5 |     <dependency>
  6 |         <groupId>com.optimaize.languagedetector</groupId>
  7 |         <artifactId>language-detector</artifactId>
  8 |         <version>0.6</version>
  9 |     </dependency>
 10 | 
 11 | 
 12 | ## Language Support
 13 | 
 14 | ### 71 Built-in Language Profiles
 15 | 
 16 | 1. af Afrikaans
 17 | 1. an Aragonese
 18 | 1. ar Arabic
 19 | 1. ast Asturian
 20 | 1. be Belarusian
 21 | 1. br Breton
 22 | 1. ca Catalan
 23 | 1. bg Bulgarian
 24 | 1. bn Bengali
 25 | 1. cs Czech
 26 | 1. cy Welsh
 27 | 1. da Danish
 28 | 1. de German
 29 | 1. el Greek
 30 | 1. en English
 31 | 1. es Spanish
 32 | 1. et Estonian
 33 | 1. eu Basque
 34 | 1. fa Persian
 35 | 1. fi Finnish
 36 | 1. fr French
 37 | 1. ga Irish
 38 | 1. gl Galician
 39 | 1. gu Gujarati
 40 | 1. he Hebrew
 41 | 1. hi Hindi
 42 | 1. hr Croatian
 43 | 1. ht Haitian
 44 | 1. hu Hungarian
 45 | 1. id Indonesian
 46 | 1. is Icelandic
 47 | 1. it Italian
 48 | 1. ja Japanese
 49 | 1. km Khmer
 50 | 1. kn Kannada
 51 | 1. ko Korean
 52 | 1. lt Lithuanian
 53 | 1. lv Latvian
 54 | 1. mk Macedonian
 55 | 1. ml Malayalam
 56 | 1. mr Marathi
 57 | 1. ms Malay
 58 | 1. mt Maltese
 59 | 1. ne Nepali
 60 | 1. nl Dutch
 61 | 1. no Norwegian
 62 | 1. oc Occitan
 63 | 1. pa Punjabi
 64 | 1. pl Polish
 65 | 1. pt Portuguese
 66 | 1. ro Romanian
 67 | 1. ru Russian
 68 | 1. sk Slovak
 69 | 1. sl Slovene
 70 | 1. so Somali
 71 | 1. sq Albanian
 72 | 1. sr Serbian
 73 | 1. sv Swedish
 74 | 1. sw Swahili
 75 | 1. ta Tamil
 76 | 1. te Telugu
 77 | 1. th Thai
 78 | 1. tl Tagalog
 79 | 1. tr Turkish
 80 | 1. uk Ukrainian
 81 | 1. ur Urdu
 82 | 1. vi Vietnamese
 83 | 1. wa Walloon
 84 | 1. yi Yiddish
 85 | 1. zh-cn Simplified Chinese
 86 | 1. zh-tw Traditional Chinese
 87 | 
 88 | User danielnaber has made available a profile for Esperanto on his website, see open tasks.
 89 | 
 90 | There are two kinds of profiles: the standard ones, created from Wikipedia articles and similar text,
 91 | and the "short text" profiles, created from Twitter tweets. Fewer language profiles exist for
 92 | short text; more would be available, see https://github.com/optimaize/language-detector/issues/57
 93 | 
 94 | ### Other Languages
 95 | 
 96 | You can create a language profile for your own language easily.
 97 | See https://github.com/optimaize/language-detector/blob/master/src/main/resources/README.md
 98 | 
 99 | 
100 | ## How it Works
101 | 
102 | The software uses language profiles which were created based on common text for each language.
103 | N-grams http://en.wikipedia.org/wiki/N-gram were then extracted from that text, and that's what is stored in the profiles.
104 | 
105 | When trying to figure out in what language a certain text is written, the program goes through the same process:
106 | It creates the same kind of n-grams of the input text. Then it compares the relative frequency of them, and finds the
107 | language that matches best.
108 | 
109 | 
110 | ### Challenges
111 | 
112 | This software does not work as well when the input text to analyze is short or unclean, for example tweets.
113 | 
114 | When a text is written in multiple languages, the default algorithm of this software is not appropriate.
115 | You can try to split the text (by sentence or paragraph) and detect the individual parts. Running the language guesser
116 | on the whole text will just tell you the language that is most dominant, in the best case.
117 | 
118 | This software cannot handle it well when the input text is in none of the expected (and supported) languages.
119 | For example if you only load the language profiles from English and German, but the text is written in French,
120 | the program may pick the more likely one, or say it doesn't know. (An improvement would be to clearly detect that
121 | it's unlikely one of the supported languages.)
122 | 
123 | If you are looking for a language detector / language guesser library in Java, this seems to be the best open source
124 | library you can get at this time. If it doesn't need to be Java, you may want to take a look at https://code.google.com/p/cld2/
125 | 
126 | 
127 | ## How to Use
128 | 
129 | #### Language Detection for your Text
130 | 
131 |     //load all languages:
132 |     List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
133 | 
134 |     //build language detector:
135 |     LanguageDetector languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
136 |             .withProfiles(languageProfiles)
137 |             .build();
138 | 
139 |     //create a text object factory
140 |     TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
141 | 
142 |     //query:
143 |     TextObject textObject = textObjectFactory.forText("my text");
144 |     Optional<LdLocale> lang = languageDetector.detect(textObject);
145 | 
146 | 
147 | #### Creating Language Profiles for your Training Text
148 | 
149 | See https://github.com/optimaize/language-detector/wiki/Creating-Language-Profiles
150 | 
151 | 
152 | ## How You Can Help
153 | 
154 | If your language is not supported yet, then you can provide clean "training text", that is, common text written in your
155 | language. The text should be fairly long (a couple of pages at the very least). If you can provide that, please open
156 | a ticket.
157 | 
158 | If your language is supported already, but not identified clearly all the time, you can still provide such training
159 | text. We might then be able to improve detection for your language.
160 | 
161 | If you're a programmer, dig in the source and see what you can improve. Check the open tasks.
162 | 
163 | 
164 | ## Memory Consumption
165 | 
166 | Loading all 71 language profiles uses 74MB of RAM to store the data in memory.
167 | For memory considerations see https://github.com/optimaize/language-detector/wiki/Memory-Consumption
168 | 
169 | 
170 | ## History and Changes
171 | 
172 | This project is a fork of a fork, the original author is Nakatani Shuyo.
173 | For detail see https://github.com/optimaize/language-detector/wiki/History-and-Changes
174 | 
175 | 
176 | ## Where it's used
177 | 
178 | An adapted version of this is used by the http://www.NameAPI.org server.
179 | 
180 | https://www.languagetool.org/ is a proof-reading software for LibreOffice/OpenOffice, for the Desktop and for Firefox.
181 | 
182 | 
183 | 
184 | ## License
185 | 
186 | Apache 2 (business friendly)
187 | 
188 | 
189 | 
190 | ## Authors
191 | 
192 | Nakatani Shuyo, Fabian Kessler, Francois ROLAND, Robert Theis
193 | 
194 | For detail see https://github.com/optimaize/language-detector/wiki/Authors
195 | 
196 | 
197 | ## For Maven Users
198 | 
199 | The project is in Maven central http://search.maven.org/#artifactdetails%7Ccom.optimaize.languagedetector%7Clanguage-detector%7C0.4%7Cjar this is the latest version:
200 | 
201 |     <dependency>
202 |         <groupId>com.optimaize.languagedetector</groupId>
203 |         <artifactId>language-detector</artifactId>
204 |         <version>0.6</version>
205 |     </dependency>
206 | 


--------------------------------------------------------------------------------
/src/test/java/com/optimaize/langdetect/ngram/NgramExtractorTest.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.ngram;
 18 | 
 19 | import com.google.common.base.Stopwatch;
 20 | import org.junit.Test;
 21 | 
 22 | import java.util.*;
 23 | 
 24 | import static org.junit.Assert.assertEquals;
 25 | import static org.junit.Assert.assertTrue;
 26 | 
 27 | /**
 28 |  * @author Fabian Kessler
 29 |  */
 30 | public class NgramExtractorTest {
 31 | 
 32 |     @Test
 33 |     public void extractGrams_1() {
 34 |         String text = "Foo bar";
 35 |         List<String> ngrams = NgramExtractor.gramLength(1).extractGrams(text);
 36 |         assertEquals(ngrams.size(), text.length());
 37 |         assertEquals(ngrams, Arrays.asList("F","o","o"," ","b","a","r"));
 38 |     }
 39 | 
 40 |     @Test
 41 |     public void extractGrams_2() {
 42 |         String text = "Foo bar";
 43 |         List<String> ngrams = NgramExtractor.gramLength(2).extractGrams(text);
 44 |         assertEquals(ngrams.size(), text.length() -1);
 45 |         assertEquals(ngrams, Arrays.asList("Fo","oo","o "," b","ba","ar"));
 46 |     }
 47 | 
 48 |     @Test
 49 |     public void extractGrams_3() {
 50 |         String text = "Foo bar";
 51 |         List<String> ngrams = NgramExtractor.gramLength(3).extractGrams(text);
 52 |         assertEquals(ngrams.size(), text.length()-2);
 53 |     }
 54 | 
 55 |     @Test
 56 |     public void extractGrams_6() {
 57 |         String text = "Foo bar";
 58 |         List<String> ngrams = NgramExtractor.gramLength(6).extractGrams(text);
 59 |         assertEquals(ngrams.size(), text.length()-5);
 60 |     }
 61 | 
 62 |     @Test
 63 |     public void extractGrams_7() {
 64 |         String text = "Foo bar";
 65 |         List<String> ngrams = NgramExtractor.gramLength(7).extractGrams(text);
 66 |         assertEquals(ngrams.size(), text.length()-6);
 67 |     }
 68 | 
 69 |     @Test
 70 |     public void extractGrams_8() {
 71 |         String text = "Foo bar";
 72 |         List<String> ngrams = NgramExtractor.gramLength(8).extractGrams(text);
 73 |         assertTrue(ngrams.isEmpty());
 74 |     }
 75 | 
 76 | 
 77 | 
 78 |     @Test
 79 |     public void stressTestAlgo2() {
 80 |         NgramExtractor ngramExtractor = NgramExtractor.gramLengths(1, 2, 3);
 81 |         String text = "Foo bar hello world and so on nana nunu dada dudu asdf asdf akewf köjvnawer aisdfj awejfr iajdsöfj ewi adjsköfjwei ajsdökfj ief asd";
 82 |         Stopwatch stopwatch = Stopwatch.createStarted();
 83 |         for (int i=0; i<100000; i++) {
 84 |             ngramExtractor.extractGrams(text);
 85 |         }
 86 |         System.out.println(stopwatch); //876.6ms
 87 |     }
 88 | 
 89 | 
 90 |     @Test
 91 |     public void extractGrams_threeSizesAtOnce() {
 92 |         String text = "Foo bar";
 93 | 
 94 |         List<String> expected = NgramExtractor.gramLengths(1, 2, 3).extractGrams(text);
 95 |         Collections.sort(expected);
 96 | 
 97 |         List<String> separate = new ArrayList<>();
 98 |         separate.addAll(NgramExtractor.gramLength(1).extractGrams(text));
 99 |         separate.addAll(NgramExtractor.gramLength(2).extractGrams(text));
100 |         separate.addAll(NgramExtractor.gramLength(3).extractGrams(text));
101 |         Collections.sort(separate);
102 | 
103 |         assertEquals(expected, separate);
104 |     }
105 | 
106 |     @Test
107 |     public void extractGrams_threeSizesAtOnce_short() {
108 |         List<String> ngrams = NgramExtractor.gramLengths(1, 2, 3).extractGrams("a");
109 |         assertEquals(ngrams.size(), 1);
110 | 
111 |         ngrams = NgramExtractor.gramLengths(1, 2, 3).extractGrams("");
112 |         assertEquals(ngrams.size(), 0);
113 |     }
114 | 
115 | 
116 | 
117 |     @Test
118 |     public void extractCountedGrams_single_1() {
119 |         Map<String,Integer> grams = NgramExtractor.gramLength(1).extractCountedGrams("Foo");
120 |         assertEquals(grams.size(), 2);
121 |     }
122 | 
123 |     @Test
124 |     public void extractCountedGrams_single_2() {
125 |         Map<String,Integer> grams = NgramExtractor.gramLengths(2).extractCountedGrams("Foo bar");
126 |         assertEquals(grams.size(), 6);
127 | 
128 |         grams = NgramExtractor.gramLengths(2).extractCountedGrams("aaaa");
129 |         assertEquals(grams, Collections.singletonMap("aa",3));
130 |     }
131 | 
132 |     @Test
133 |     public void extractCountedGrams_list_1() {
134 |         String text = "Foo bar dies ist ein längerer deutscher Text, und Texte sind üblicherweise auch gerne gross geschrieben und so nämlich.";
135 | 
136 |         Map<String,Integer> one = NgramExtractor.gramLength(1).extractCountedGrams(text);
137 |         Map<String,Integer> two = NgramExtractor.gramLengths(2).extractCountedGrams(text);
138 |         Map<String,Integer> three = NgramExtractor.gramLengths(3).extractCountedGrams(text);
139 |         Map<String,Integer> combined = new HashMap<>();
140 |         combined.putAll(one);
141 |         combined.putAll(two);
142 |         combined.putAll(three);
143 | 
144 |         Map<String,Integer> combined2 = NgramExtractor.gramLengths(1, 2, 3).extractCountedGrams(text);
145 |         assertEquals(combined, combined2);
146 |     }
147 | 
148 | 
149 |     @Test
150 |     public void extractGramsWithPadding_1() {
151 |         String text = "Foo bar";
152 |         List<String> ngrams = NgramExtractor.gramLength(1).textPadding(' ').extractGrams(text);
153 |         assertEquals(ngrams.size(), text.length()+2);
154 |         assertEquals(ngrams, Arrays.asList(" ","F","o","o"," ","b","a","r"," "));
155 |     }
156 | 
157 |     @Test
158 |     public void extractGramsWithPaddingAndFilter_1() {
159 |         String text = "Foo bar";
160 |         List<String> ngrams = NgramExtractor
161 |                 .gramLength(1)
162 |                 .filter(StandardNgramFilter.getInstance())
163 |                 .textPadding(' ')
164 |                 .extractGrams(text);
165 |         assertEquals(ngrams, Arrays.asList("F","o","o","b","a","r"));
166 |     }
167 | 
168 |     @Test
169 |     public void extractGramsWithPadding_2() {
170 |         String text = "Foo bar";
171 |         List<String> ngrams = NgramExtractor.gramLength(2).textPadding(' ').extractGrams(text);
172 |         assertEquals(ngrams.size(), text.length() +1);
173 |         assertEquals(ngrams, Arrays.asList(" F","Fo","oo","o "," b","ba","ar","r "));
174 |     }
175 | 
176 | }
177 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.profiles;
 18 | 
 19 | import com.google.common.collect.ImmutableMap;
 20 | import com.google.common.collect.Iterables;
 21 | import com.optimaize.langdetect.i18n.LdLocale;
 22 | import org.jetbrains.annotations.NotNull;
 23 | 
 24 | import java.util.*;
 25 | 
 26 | /**
 27 |  * <p>This class is immutable.</p>
 28 |  *
 29 |  * @author Fabian Kessler
 30 |  */
 31 | public final class LanguageProfileImpl implements LanguageProfile {
 32 | 
 33 |     @NotNull
 34 |     private final LdLocale locale;
 35 |     @NotNull
 36 |     private final Map<Integer, Map<String,Integer>> ngrams;
 37 |     @NotNull
 38 |     private final Stats stats;
 39 | 
 40 |     private static class Stats {
 41 |         /**
 42 |          * Key = gram length (1-3 or so).
 43 |          * Value = number of all occurrences of these grams combined.
 44 |          */
 45 |         @NotNull
 46 |         private final Map<Integer, Long> numOccurrences;
 47 | 
 48 |         /**
 49 |          * Key = gram length (1-3 or so).
 50 |          * Value = number of occurrences of the n-gram that occurs the least often.
 51 |          * this can be 1, or larger if a cutoff was applied to remove infrequent grams.
 52 |          */
 53 |         @NotNull
 54 |         private final Map<Integer, Long> minGramCounts;
 55 | 
 56 |         /**
 57 |          * Key = gram length (1-3 or so).
 58 |          * Value = number of occurrences of the n-gram that occurs the most often.
 59 |          */
 60 |         @NotNull
 61 |         private final Map<Integer, Long> maxGramCounts;
 62 | 
 63 |         public Stats(@NotNull Map<Integer, Long> numOccurrences,
 64 |                      @NotNull Map<Integer, Long> minGramCounts,
 65 |                      @NotNull Map<Integer, Long> maxGramCounts) {
 66 |             this.numOccurrences = ImmutableMap.copyOf(numOccurrences);
 67 |             this.minGramCounts  = ImmutableMap.copyOf(minGramCounts);
 68 |             this.maxGramCounts  = ImmutableMap.copyOf(maxGramCounts);
 69 |         }
 70 |     }
 71 | 
 72 | 
 73 |     /**
 74 |      * Use the builder.
 75 |      */
 76 |     LanguageProfileImpl(@NotNull LdLocale locale,
 77 |                         @NotNull Map<Integer, Map<String, Integer>> ngrams) {
 78 |         this.locale = locale;
 79 |         this.ngrams = ImmutableMap.copyOf(ngrams);
 80 |         this.stats  = makeStats(ngrams);
 81 |     }
 82 | 
 83 |     private static Stats makeStats(Map<Integer, Map<String, Integer>> ngrams) {
 84 |         Map<Integer, Long> numOccurrences = new HashMap<>(6);
 85 |         Map<Integer, Long> minGramCounts = new HashMap<>(6);
 86 |         Map<Integer, Long> maxGramCounts = new HashMap<>(6);
 87 |         for (Map.Entry<Integer, Map<String, Integer>> entry : ngrams.entrySet()) {
 88 |             long count = 0;
 89 |             Long min = null;
 90 |             Long max = null;
 91 |             for (Integer integer : entry.getValue().values()) {
 92 |                 count += integer;
 93 |                 if (min==null || min > integer) {
 94 |                     min = (long)integer;
 95 |                 }
 96 |                 if (max==null || max < integer) {
 97 |                     max = (long)integer;
 98 |                 }
 99 |             }
100 |             numOccurrences.put(entry.getKey(), count);
101 |             minGramCounts.put(entry.getKey(), min);
102 |             maxGramCounts.put(entry.getKey(), max);
103 |         }
104 |         return new Stats(numOccurrences, minGramCounts, maxGramCounts);
105 |     }
106 | 
107 | 
108 |     @NotNull
109 |     @Override
110 |     public LdLocale getLocale() {
111 |         return locale;
112 |     }
113 | 
114 |     @NotNull @Override
115 |     public List<Integer> getGramLengths() {
116 |         List<Integer> lengths = new ArrayList<>(ngrams.keySet());
117 |         Collections.sort(lengths);
118 |         return lengths;
119 |     }
120 | 
121 |     @Override
122 |     public int getFrequency(String gram) {
123 |         Map<String, Integer> map = ngrams.get(gram.length());
124 |         if (map==null) return 0;
125 |         Integer freq = map.get(gram);
126 |         if (freq==null) return 0;
127 |         return freq;
128 |     }
129 | 
130 |     @Override
131 |     public int getNumGrams(int gramLength) {
132 |         if (gramLength<1) throw new IllegalArgumentException(""+gramLength);
133 |         Map<String, Integer> map = ngrams.get(gramLength);
134 |         if (map==null) return 0;
135 |         return map.size();
136 |     }
137 | 
138 |     @Override
139 |     public int getNumGrams() {
140 |         int ret = 0;
141 |         for (Map<String, Integer> stringIntegerMap : ngrams.values()) {
142 |             ret += stringIntegerMap.size();
143 |         }
144 |         return ret;
145 |     }
146 | 
147 |     @Override
148 |     public long getNumGramOccurrences(int gramLength) {
149 |         Long aLong = stats.numOccurrences.get(gramLength);
150 |         if (aLong==null) return 0;
151 |         return aLong;
152 |     }
153 | 
154 |     @Override
155 |     public long getMinGramCount(int gramLength) {
156 |         Long aLong = stats.minGramCounts.get(gramLength);
157 |         if (aLong==null) return 0;
158 |         return aLong;
159 |     }
160 | 
161 |     @Override
162 |     public long getMaxGramCount(int gramLength) {
163 |         Long aLong = stats.maxGramCounts.get(gramLength);
164 |         if (aLong==null) return 0;
165 |         return aLong;
166 |     }
167 | 
168 | 
169 |     @NotNull @Override
170 |     public Iterable<Map.Entry<String,Integer>> iterateGrams() {
171 |         Iterable[] arr = new Iterable[ngrams.size()];
172 |         int i=0;
173 |         for (Map<String, Integer> stringIntegerMap : ngrams.values()) {
174 |             arr[i] = stringIntegerMap.entrySet();
175 |             i++;
176 |         }
177 |         //noinspection unchecked
178 |         return Iterables.concat(arr);
179 |     }
180 | 
181 |     @NotNull @Override
182 |     public Iterable<Map.Entry<String, Integer>> iterateGrams(int gramLength) {
183 |         return ngrams.get(gramLength).entrySet();
184 |     }
185 | 
186 |     @Override
187 |     public String toString() {
188 |         StringBuilder sb = new StringBuilder();
189 |         sb.append("LanguageProfile{locale=");
190 |         sb.append(locale);
191 |         for (Integer integer : getGramLengths()) {
192 |             sb.append(",");
193 |             sb.append(integer);
194 |             sb.append("-grams=");
195 |             sb.append(getNumGrams(integer));
196 |         }
197 |         sb.append("}");
198 |         return sb.toString();
199 |     }
200 | 
201 |     @Override
202 |     public boolean equals(Object o) {
203 |         if (this == o) return true;
204 |         if (o == null || getClass() != o.getClass()) return false;
205 | 
206 |         LanguageProfileImpl that = (LanguageProfileImpl) o;
207 | 
208 |         if (!locale.equals(that.locale)) return false;
209 |         if (!ngrams.equals(that.ngrams)) return false;
210 | 
211 |         return true;
212 |     }
213 |     @Override
214 |     public int hashCode() {
215 |         int result = locale.hashCode();
216 |         result = 31 * result + ngrams.hashCode();
217 |         return result;
218 |     }
219 | }
220 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.ngram;
 18 | 
 19 | import com.google.common.collect.ImmutableList;
 20 | import org.jetbrains.annotations.NotNull;
 21 | import org.jetbrains.annotations.Nullable;
 22 | 
 23 | import java.util.*;
 24 | 
 25 | /**
 26 |  * Class for extracting n-grams out of a text.
 27 |  *
 28 |  * This class is immutable.
 29 |  *
 30 |  * @author Fabian Kessler
 31 |  */
 32 | public class NgramExtractor {
 33 | 
 34 |     @NotNull
 35 |     private final List<Integer> gramLengths;
 36 |     @Nullable
 37 |     private final NgramFilter filter;
 38 |     @Nullable
 39 |     private final Character textPadding;
 40 | 
 41 |     public static NgramExtractor gramLength(int gramLength) {
 42 |         return new NgramExtractor(ImmutableList.of(gramLength), null, null);
 43 |     }
 44 |     public static NgramExtractor gramLengths(Integer... gramLength) {
 45 |         return new NgramExtractor(Arrays.asList(gramLength), null, null);
 46 |     }
 47 | 
 48 |     public NgramExtractor filter(NgramFilter filter) {
 49 |         return new NgramExtractor(this.gramLengths, filter, this.textPadding);
 50 |     }
 51 | 
 52 |     /**
 53 |      * To ensure having border grams, this character is added to the left and right of the text.
 54 |      *
 55 |      * <p>Example: when textPadding is a space ' ' then a text input "foo" becomes " foo ", ensuring that n-grams like " f"
 56 |      * are created.</p>
 57 |      *
 58 |      * <p>If the text already has such a character in that position (eg starts with), it is not added there.</p>
 59 |      *
 60 |      * @param textPadding for example a space ' '.
 61 |      */
 62 |     public NgramExtractor textPadding(char textPadding) {
 63 |         return new NgramExtractor(this.gramLengths, this.filter, textPadding);
 64 |     }
 65 | 
 66 |     private NgramExtractor(@NotNull List<Integer> gramLengths, @Nullable NgramFilter filter, @Nullable Character textPadding) {
 67 |         if (gramLengths.isEmpty()) throw new IllegalArgumentException();
 68 |         this.gramLengths = ImmutableList.copyOf(gramLengths);
 69 |         this.filter = filter;
 70 |         this.textPadding = textPadding;
 71 |     }
 72 | 
 73 |     public List<Integer> getGramLengths() {
 74 |         return gramLengths;
 75 |     }
 76 | 
 77 |     /**
 78 |      * Cuts the text (padded first, if a padding is configured) into n-grams of all configured lengths.
 79 |      *
 80 |      * <p>The grams are listed gram length by gram length in the order of {@link #getGramLengths()},
 81 |      * and for each length in the order they occur in the text.</p>
 82 |      *
 83 |      * <p>Example for gram length 2 without padding: "Foo bar" => [Fo,oo,o , b,ba,ar]</p>
 84 |      *
 85 |      * @param  text the text to take the grams from.
 86 |      * @return The grams that passed the filter, empty if the input was empty or if none for that gramLength fits.
 87 |      */
 88 |     @NotNull
 89 |     public List<String> extractGrams(@NotNull CharSequence text) {
 90 |         CharSequence padded = applyPadding(text);
 91 |         int textLength = padded.length();
 92 | 
 93 |         //upper bound for the size of the result, the filter may reject some grams
 94 |         int maxNumGrams = 0;
 95 |         for (int n : gramLengths) {
 96 |             maxNumGrams += Math.max(0, textLength - (n - 1)); //none if the text is shorter than the gram
 97 |         }
 98 |         if (maxNumGrams <= 0) {
 99 |             return Collections.emptyList();
100 |         }
101 | 
102 |         List<String> result = new ArrayList<>(maxNumGrams);
103 |         //collect gram length by gram length, each from left to right
104 |         for (int n : gramLengths) {
105 |             int lastStart = textLength - n; //negative if the text is shorter than the gram
106 |             for (int start = 0; start <= lastStart; start++) {
107 |                 String gram = padded.subSequence(start, start + n).toString();
108 |                 if (filter != null && !filter.use(gram)) {
109 |                     continue;
110 |                 }
111 |                 result.add(gram);
112 |             }
113 |         }
114 |         return result;
115 |     }
116 | 
117 |     /**
118 |      * Counts the n-grams of all configured lengths in the text (padded first, if a padding is configured).
119 |      *
120 |      * @return Key = ngram, value = count. Only grams that passed the filter are contained.
121 |      *         The order is as the n-grams appeared first in the string, gram length by gram length.
122 |      */
123 |     @NotNull
124 |     public Map<String,Integer> extractCountedGrams(@NotNull CharSequence text) {
125 |         CharSequence padded = applyPadding(text);
126 |         int textLength = padded.length();
127 |         //pre-size the map with a guess of the number of distinct grams
128 |         int expectedDistinct = 0;
129 |         for (int n : gramLengths) {
130 |             expectedDistinct += guessNumDistinctiveGrams(textLength, n);
131 |         }
132 | 
133 |         Map<String,Integer> counts = new LinkedHashMap<>(expectedDistinct);
134 |         for (int n : gramLengths) {
135 |             _extractCounted(padded, n, textLength, counts);
136 |         }
137 |         return counts;
138 |     }
138 | 
139 | 
140 |     private void _extractCounted(CharSequence text, int gramLength, int len, Map<String, Integer> grams) {
141 |         //Adds the occurrences of all grams of one length to the counters collected so far in the map.
142 |         int lastStart = len - gramLength; //negative if the text is shorter than the gram: nothing to count
143 |         for (int start = 0; start <= lastStart; start++) {
144 |             String gram = text.subSequence(start, start + gramLength).toString();
145 |             if (filter != null && !filter.use(gram)) {
146 |                 continue; //rejected by the filter
147 |             }
148 |             //the first sighting starts at 1, any later one increments
149 |             Integer seenBefore = grams.get(gram);
150 |             int occurrences = (seenBefore == null) ? 1 : seenBefore + 1;
151 |             grams.put(gram, occurrences);
152 |         }
153 |     }
154 | 
155 |     /**
156 |      * Guesses how many distinct grams of the given length a text of the given length contains.
157 |      *
158 |      * <p>The estimate shrinks relative to the text length as the text gets longer, and grows with the gram length.
159 |      * It also depends on script (alphabet less than ideographic), which is not taken into account here.
160 |      * So it is not certain how good it really is. It only tries to prevent array copies, and for Latin it
161 |      * seems to work fine.</p>
162 |      */
163 |     private static int guessNumDistinctiveGrams(int textLength, int gramLength) {
164 |         if (gramLength == 1) {
165 |             return Math.min(80, textLength);
166 |         }
167 |         //for short texts every position counts as a distinct gram
168 |         int shortTextLimit = (gramLength == 2 || gramLength == 3) ? 40 : 100;
169 |         if (textLength < shortTextLimit) {
170 |             return textLength;
171 |         }
172 |         double ratio;
173 |         if (gramLength == 2) {
174 |             ratio = textLength < 100 ? 0.8 : (textLength < 1000 ? 0.6 : 0.5);
175 |         } else if (gramLength == 3) {
176 |             ratio = textLength < 100 ? 0.9 : (textLength < 1000 ? 0.8 : 0.6);
177 |         } else {
178 |             ratio = textLength < 1000 ? 0.95 : 0.9; //any other gram length, like the 4 and 5 grams
179 |         }
180 |         return (int) (textLength * ratio);
181 |     }
182 | 
183 |     private CharSequence applyPadding(CharSequence text) {
184 |         if (textPadding == null || text.length() == 0) {
185 |             return text; //no padding configured, or nothing to pad
186 |         }
187 |         char pad = textPadding;
188 |         boolean padLeft = text.charAt(0) != pad;
189 |         boolean padRight = text.charAt(text.length() - 1) != pad;
190 |         if (!padLeft && !padRight) {
191 |             return text; //both ends carry the padding already
192 |         }
193 |         StringBuilder padded = new StringBuilder();
194 |         if (padLeft) padded.append(pad);
195 |         padded.append(text);
196 |         if (padRight) padded.append(pad);
197 |         return padded;
198 |     }
199 | 
200 | }
201 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/profiles/LanguageProfileReader.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.profiles;
 18 | 
 19 | import com.optimaize.langdetect.frma.LangProfileReader;
 20 | import com.optimaize.langdetect.i18n.LdLocale;
 21 | import org.jetbrains.annotations.NotNull;
 22 | 
 23 | import java.io.*;
 24 | import java.util.ArrayList;
 25 | import java.util.Collection;
 26 | import java.util.List;
 27 | 
 28 | /**
 29 |  * Reads {@link LanguageProfile}s from single files, streams, file system folders or the classpath.
 30 |  *
 31 |  * @author Fabian Kessler
 32 |  */
 33 | public class LanguageProfileReader {
 34 | 
 35 |     private static final LangProfileReader internalReader = new LangProfileReader();
 36 |     private static final String PROFILES_DIR = "languages";
 37 | 
 38 |     /**
 39 |      * Reads a {@link LanguageProfile} from a File in UTF-8.
 40 |      */
 41 |     public LanguageProfile read(File profileFile) throws IOException {
 42 |         return OldLangProfileConverter.convert(internalReader.read(profileFile));
 43 |     }
 44 | 
 45 |     /**
 46 |      * Reads a {@link LanguageProfile} from an InputStream in UTF-8.
 47 |      */
 48 |     public LanguageProfile read(InputStream inputStream) throws IOException {
 49 |         return OldLangProfileConverter.convert(internalReader.read(inputStream));
 50 |     }
 51 | 
 52 |     /**
 53 |      * Load profiles from the classpath in a specific directory.
 54 |      *
 55 |      * <p>This is usually used to load built-in profiles, shipped with the jar.</p>
 56 |      *
 57 |      * @param classLoader the ClassLoader to load the profiles from. Use {@code MyClass.class.getClassLoader()}
 58 |      * @param profileDirectory profile directory path inside the classpath. The default profiles are in "languages".
 59 |      * @param profileFileNames for example ["en", "fr", "de"].
 60 |      * @throws IOException if one of the files is not found on the classpath, or fails to be read.
 61 |      */
 62 |     public List<LanguageProfile> read(ClassLoader classLoader, String profileDirectory, Collection<String> profileFileNames) throws IOException {
 63 |         List<LanguageProfile> loaded = new ArrayList<>(profileFileNames.size());
 64 |         for (String profileFileName : profileFileNames) {
 65 |             loaded.add(readFromClasspath(classLoader, profileDirectory, profileFileName));
 66 |         }
 67 |         return loaded;
 68 |     }
 69 | 
 70 |     //Reads a single profile from the classpath, closing the resource stream afterwards.
 71 |     private LanguageProfile readFromClasspath(ClassLoader classLoader, String profileDirectory, String fileName) throws IOException {
 72 |         String path = makePathForClassLoader(profileDirectory, fileName);
 73 |         try (InputStream in = classLoader.getResourceAsStream(path)) {
 74 |             if (in == null) {
 75 |                 throw new IOException("No language file available named "+fileName+" at " + path + "!");
 76 |             }
 77 |             return read(in);
 78 |         }
 79 |     }
 80 | 
 81 |     private String makePathForClassLoader(String profileDirectory, String fileName) {
 82 |         //WITHOUT slash before the profileDirectory when using the classloader!
 83 |         return profileDirectory + '/' + fileName;
 84 |     }
 85 | 
 86 |     /**
 87 |      * Same as {@link #read(ClassLoader, String, java.util.Collection)} using the class loader of this class.
 88 |      */
 89 |     public List<LanguageProfile> read(String profileDirectory, Collection<String> profileFileNames) throws IOException {
 90 |         return read(LanguageProfileReader.class.getClassLoader(), profileDirectory, profileFileNames);
 91 |     }
 92 | 
 93 |     /**
 94 |      * Same as {@link #read(ClassLoader, String, java.util.Collection)} using the class loader of this class,
 95 |      * and the default profiles directory of this library.
 96 |      */
 97 |     public List<LanguageProfile> read(Collection<String> profileFileNames) throws IOException {
 98 |         return read(LanguageProfileReader.class.getClassLoader(), PROFILES_DIR, profileFileNames);
 99 |     }
100 | 
101 |     /**
102 |      * Reads the built-in profile (shipped with the jar) of a single language; fails with an IOException if there is none.
103 |      */
104 |     @NotNull
105 |     public LanguageProfile readBuiltIn(@NotNull LdLocale locale) throws IOException {
106 |         return readFromClasspath(LanguageProfileReader.class.getClassLoader(), PROFILES_DIR, makeProfileFileName(locale));
107 |     }
108 | 
109 |     @NotNull
110 |     private String makeProfileFileName(@NotNull LdLocale locale) {
111 |         return locale.toString(); //built-in profile files are named like the string form of their locale
112 |     }
113 | 
114 |     /**
115 |      * Reads the built-in profiles (shipped with the jar) of the given languages.
116 |      */
117 |     @NotNull
118 |     public List<LanguageProfile> readBuiltIn(@NotNull Collection<LdLocale> languages) throws IOException {
119 |         List<String> profileNames = new ArrayList<>(languages.size());
120 |         for (LdLocale locale : languages) {
121 |             profileNames.add(makeProfileFileName(locale));
122 |         }
123 |         return read(LanguageProfileReader.class.getClassLoader(), PROFILES_DIR, profileNames);
124 |     }
125 | 
126 |     /**
127 |      * @deprecated renamed to {@link #readAllBuiltIn()}
128 |      */
129 |     @Deprecated
130 |     public List<LanguageProfile> readAll() throws IOException {
131 |         return readAllBuiltIn();
132 |     }
133 | 
134 |     /**
135 |      * Reads all built-in language profiles from the "languages" folder (shipped with the jar).
136 |      */
137 |     public List<LanguageProfile> readAllBuiltIn() throws IOException {
138 |         List<LanguageProfile> loaded = new ArrayList<>();
139 |         for (LdLocale locale : BuiltInLanguages.getLanguages()) {
140 |             loaded.add(readBuiltIn(locale));
141 |         }
142 |         return loaded;
143 |     }
144 | 
145 |     /**
146 |      * Loads all profiles from the specified directory.
147 |      *
148 |      * <p>Only regular files named like a locale (eg "en", no file extension) are read, the rest is skipped.</p>
149 |      *
150 |      * <p>Do not use this method for files distributed within a jar.</p>
151 |      *
152 |      * @param path profile directory path
153 |      * @return empty if there is no language file in it.
154 |      * @throws IOException if the folder is missing, not readable or not listable, or if a profile fails to be read.
155 |      */
156 |     public List<LanguageProfile> readAll(File path) throws IOException {
157 |         if (!path.exists()) {
158 |             throw new IOException("No such folder: "+path);
159 |         }
160 |         if (!path.canRead()) {
161 |             throw new IOException("Folder not readable: "+path);
162 |         }
163 |         File[] listFiles = path.listFiles(new FileFilter() {
164 |             @Override
165 |             public boolean accept(File pathname) {
166 |                 return looksLikeLanguageProfileFile(pathname);
167 |             }
168 |         });
169 |         if (listFiles == null) {
170 |             throw new IOException("Failed reading from folder: " + path);
171 |         }
172 | 
173 |         List<LanguageProfile> profiles = new ArrayList<>(listFiles.length);
174 |         for (File file : listFiles) {
175 |             profiles.add(read(file)); //the FileFilter above has already vetted every entry
176 |         }
177 |         return profiles;
178 |     }
179 | 
180 |     private boolean looksLikeLanguageProfileFile(File file) {
181 |         return file.isFile() && looksLikeLanguageProfileName(file.getName());
182 |     }
183 | 
184 |     private boolean looksLikeLanguageProfileName(String fileName) {
185 |         if (fileName.contains(".")) return false;
186 |         try {
187 |             LdLocale.fromString(fileName);
188 |             return true;
189 |         } catch (IllegalArgumentException ignored) { //not a locale, hence not a profile file
190 |             return false;
191 |         }
192 |     }
193 | }
194 | 


--------------------------------------------------------------------------------
/src/main/java/com/optimaize/langdetect/i18n/LdLocale.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 2011 Fabian Kessler
  3 |  *
  4 |  * Licensed under the Apache License, Version 2.0 (the "License");
  5 |  * you may not use this file except in compliance with the License.
  6 |  * You may obtain a copy of the License at
  7 |  *
  8 |  *     http://www.apache.org/licenses/LICENSE-2.0
  9 |  *
 10 |  * Unless required by applicable law or agreed to in writing, software
 11 |  * distributed under the License is distributed on an "AS IS" BASIS,
 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 |  * See the License for the specific language governing permissions and
 14 |  * limitations under the License.
 15 |  */
 16 | 
 17 | package com.optimaize.langdetect.i18n;
 18 | 
 19 | import com.google.common.base.Optional;
 20 | import com.google.common.base.Splitter;
 21 | import org.jetbrains.annotations.NotNull;
 22 | 
 23 | import java.util.List;
 24 | 
 25 | /**
 26 |  * A language-detector implementation of a Locale, similar to the java.util.Locale.
 27 |  *
 28 |  * <p>It represents an IETF BCP 47 tag, but does not implement all the features. Features can be added as needed.</p>
 29 |  *
 30 |  * <p>It is constructed through the {@link #fromString} factory method. The {@link #toString()} method
 31 |  * produces a parseable and persistable string.</p>
 32 |  *
 33 |  * <p>The class is immutable.</p>
 34 |  *
 35 |  * <p>The java.util.Locale cannot be used because it has issues for historical reasons, notably the
 36 |  * script code conversion for Hebrew, Yiddish and Indonesian, and more. If one needs a Locale,
 37 |  * it is simple to create one based on this object.<br/>
 38 |  * The ICU ULocale cannot be used because a) it has issues too (for our use case) and b) we're not
 39 |  * using ICU in here [yet].</p>
 40 |  *
 41 |  * <p>This class does not perform any modifications on the input. The input is used as is, and the getters
 42 |  * return it in exactly the same way. No standardization, canonicalization, cleaning.</p>
 43 |  *
 44 |  * <p>The input is validated syntactically, but not for code existence. For example the script code must
 45 |  * be a valid ISO 15924 like "Latn" or "Cyrl", in correct case. But whether the code exists or not is not checked.
 46 |  * These code standards are not fixed, simply because regional entities like Countries can change for political
 47 |  * reasons, and languages are living entities. Therefore certain codes may exist at some point in time only
 48 |  * (be introduced late, or be deprecated or removed, or even be re-assigned another meaning).
 49 |  * It is not up to us to decide whether Kosovo is a country in 2015 or not.
 50 |  * If one needs to only work with a certain range of acceptable codes, one can validate the codes through other
 51 |  * classes that have knowledge about the codes.
 52 |  * </p>
 53 |  *
 54 |  * <p>Language: as for BCP 47, the iso 639-1 code must be used if there is one. For example "fr" for French.
 55 |  * If not, the ISO 639-3 should be used. It is highly discouraged to use 639-2.
 56 |  * Right now this class enforces a 2 or 3 char code, but this may be relaxed in the future.</p>
 57 |  *
 58 |  * <p>Script: Only ISO 15924, no discussion.</p>
 59 |  *
 60 |  * <p>Region: same as for BCP 47. That means ISO 3166-1 alpha-2 and "UN M.49".
 61 |  * I can imagine relaxing it in the future to also allow 3166-2 codes.
 62 |  * In most cases the "region" is a "country".</p>
 63 |  *
 64 |  * @author fabian kessler
 65 |  */
 66 | public final class LdLocale {
 67 | 
 68 |     //Syntax checks, compiled once instead of on every String.matches() call.
 69 |     private static final java.util.regex.Pattern LANGUAGE_SYNTAX = java.util.regex.Pattern.compile("[a-z]{2,3}");
 70 |     private static final java.util.regex.Pattern SCRIPT_SYNTAX = java.util.regex.Pattern.compile("[A-Z][a-z]{3}");
 71 |     private static final java.util.regex.Pattern REGION_ALPHA_SYNTAX = java.util.regex.Pattern.compile("[A-Z]{2}");
 72 |     private static final java.util.regex.Pattern REGION_NUMERIC_SYNTAX = java.util.regex.Pattern.compile("[0-9]{3}");
 73 | 
 74 |     @NotNull
 75 |     private final String language;
 76 |     @NotNull
 77 |     private final Optional<String> script;
 78 |     @NotNull
 79 |     private final Optional<String> region;
 80 | 
 81 |     //Only fromString() creates instances, after having validated the parts.
 82 |     private LdLocale(@NotNull String language, @NotNull Optional<String> script, @NotNull Optional<String> region) {
 83 |         this.language = language;
 84 |         this.script = script;
 85 |         this.region = region;
 86 |     }
 87 | 
 88 |     /**
 89 |      * Parses a locale of the form language[-script][-region], for example "de", "de-Latn", "de-CH" or "de-Latn-CH".
 90 |      *
 91 |      * <p>The parts are checked for syntax only and are stored exactly as given.</p>
 92 |      *
 93 |      * @param string The output of the toString() method.
 94 |      * @return either a new or possibly a cached (immutable) instance.
 95 |      * @throws IllegalArgumentException if the string is empty, or any part of it is not of the expected syntax
 96 |      *         or not in the expected position.
 97 |      */
 98 |     @NotNull
 99 |     public static LdLocale fromString(@NotNull String string) {
100 |         if (string==null || string.isEmpty()) throw new IllegalArgumentException("At least a language is required!");
101 | 
102 |         List<String> chunks = Splitter.on('-').splitToList(string);
103 |         String language = assignLang(chunks.get(0));
104 |         Optional<String> script = Optional.absent();
105 |         Optional<String> region = Optional.absent();
106 |         for (String chunk : chunks.subList(1, chunks.size())) {
107 |             //a script is only accepted directly after the language, a region only once
108 |             if (!script.isPresent() && !region.isPresent() && looksLikeScriptCode(chunk)) {
109 |                 script = Optional.of(chunk);
110 |             } else if (!region.isPresent() && (looksLikeGeoCode3166_1(chunk) || looksLikeGeoCodeNumeric(chunk))) {
111 |                 region = Optional.of(chunk);
112 |             } else {
113 |                 throw new IllegalArgumentException("Unknown part: >>>"+chunk+"<<<!");
114 |             }
115 |         }
116 |         return new LdLocale(language, script, region);
117 |     }
118 | 
119 |     //ISO 15924 syntax: an upper case letter followed by three lower case letters, eg "Latn".
120 |     private static boolean looksLikeScriptCode(String string) {
121 |         return SCRIPT_SYNTAX.matcher(string).matches();
122 |     }
123 | 
124 |     //ISO 3166-1 alpha-2 syntax: two upper case letters, eg "CH".
125 |     private static boolean looksLikeGeoCode3166_1(String string) {
126 |         return REGION_ALPHA_SYNTAX.matcher(string).matches();
127 |     }
128 | 
129 |     //UN M.49 syntax: three digits, eg "150".
130 |     private static boolean looksLikeGeoCodeNumeric(String string) {
131 |         return REGION_NUMERIC_SYNTAX.matcher(string).matches();
132 |     }
133 | 
134 |     //Hands the language code back unchanged if it consists of 2 or 3 lower case letters, refuses it otherwise.
135 |     private static String assignLang(String s) {
136 |         if (!LANGUAGE_SYNTAX.matcher(s).matches()) throw new IllegalArgumentException("Invalid language code syntax: >>>"+s+"<<<!");
137 |         return s;
138 |     }
139 | 
140 |     /**
141 |      * The output of this can be fed to the fromString() method.
142 |      * @return for example "de" or "de-Latn" or "de-CH" or "de-Latn-CH", see class header.
143 |      */
144 |     @Override
145 |     public String toString() {
146 |         StringBuilder sb = new StringBuilder(language);
147 |         if (script.isPresent()) {
148 |             sb.append('-').append(script.get());
149 |         }
150 |         if (region.isPresent()) {
151 |             sb.append('-').append(region.get());
152 |         }
153 |         return sb.toString();
154 |     }
155 | 
156 |     /**
157 |      * @return ISO 639-1 or 639-3 language code, eg "fr" or "gsw", see class header.
158 |      */
159 |     @NotNull
160 |     public String getLanguage() {
161 |         return language;
162 |     }
163 | 
164 |     /**
165 |      * @return ISO 15924 script code, eg "Latn", see class header.
166 |      */
167 |     @NotNull
168 |     public Optional<String> getScript() {
169 |         return script;
170 |     }
171 | 
172 |     /**
173 |      * @return ISO 3166-1 or UN M.49 code, eg "DE" or 150, see class header.
174 |      */
175 |     @NotNull
176 |     public Optional<String> getRegion() {
177 |         return region;
178 |     }
179 | 
180 |     //Two locales are equal when their language, script and region are equal.
181 |     @Override
182 |     public boolean equals(Object o) {
183 |         if (this == o) return true;
184 |         if (o == null || getClass() != o.getClass()) return false;
185 | 
186 |         LdLocale other = (LdLocale) o;
187 |         return language.equals(other.language)
188 |                 && script.equals(other.script)
189 |                 && region.equals(other.region);
190 |     }
191 | 
192 |     //Built from the same three parts that equals() compares.
193 |     @Override
194 |     public int hashCode() {
195 |         int result = language.hashCode();
196 |         result = 31 * result + script.hashCode();
197 |         result = 31 * result + region.hashCode();
198 |         return result;
199 |     }
200 | }
201 | 


--------------------------------------------------------------------------------