├── .gitignore ├── src ├── main │ ├── java │ │ ├── com │ │ │ └── optimaize │ │ │ │ └── langdetect │ │ │ │ ├── cybozu │ │ │ │ ├── package.html │ │ │ │ ├── util │ │ │ │ │ ├── package.html │ │ │ │ │ ├── Messages.java │ │ │ │ │ ├── TagExtractor.java │ │ │ │ │ ├── NGram.java │ │ │ │ │ ├── Util.java │ │ │ │ │ └── LangProfile.java │ │ │ │ └── GenProfile.java │ │ │ │ ├── ngram │ │ │ │ ├── package-info.java │ │ │ │ ├── NgramFilter.java │ │ │ │ ├── NgramExtractors.java │ │ │ │ ├── StandardNgramFilter.java │ │ │ │ ├── BackwardsCompatibleNgramFilter.java │ │ │ │ ├── OldNgramExtractor.java │ │ │ │ └── NgramExtractor.java │ │ │ │ ├── profiles │ │ │ │ ├── package-info.java │ │ │ │ ├── OldLangProfileConverter.java │ │ │ │ ├── util │ │ │ │ │ └── LanguageLister.java │ │ │ │ ├── LanguageProfileWriter.java │ │ │ │ ├── LanguageProfile.java │ │ │ │ ├── LanguageProfileBuilder.java │ │ │ │ ├── BuiltInLanguages.java │ │ │ │ ├── LanguageProfileImpl.java │ │ │ │ └── LanguageProfileReader.java │ │ │ │ ├── text │ │ │ │ ├── package-info.java │ │ │ │ ├── TextFilter.java │ │ │ │ ├── TextObjectFactory.java │ │ │ │ ├── CharNormalizerTextFilterImpl.java │ │ │ │ ├── UrlTextFilter.java │ │ │ │ ├── MultiTextFilter.java │ │ │ │ ├── CommonTextObjectFactories.java │ │ │ │ ├── TextObjectFactoryBuilder.java │ │ │ │ ├── RemoveMinorityScriptsTextFilter.java │ │ │ │ └── TextObject.java │ │ │ │ ├── frma │ │ │ │ ├── IOUtils.java │ │ │ │ ├── LangProfileWriter.java │ │ │ │ ├── GenProfile.java │ │ │ │ └── LangProfileReader.java │ │ │ │ ├── DetectedLanguage.java │ │ │ │ ├── LanguageDetector.java │ │ │ │ ├── NgramFrequencyData.java │ │ │ │ └── i18n │ │ │ │ └── LdLocale.java │ │ └── overview.html │ └── resources │ │ └── README.md └── test │ ├── resources │ ├── texts │ │ └── README.txt │ └── logback-test.xml │ └── java │ └── com │ └── optimaize │ └── langdetect │ ├── frma │ ├── IOUtilsTest.java │ ├── LangProfileReaderTest.java │ ├── LangProfileWriterTest.java │ └── GenProfileTest.java │ ├── text │ ├── 
TextObjectTest.java │ ├── MultiTextFilterTest.java │ └── RemoveMinorityScriptsTextFilterTest.java │ ├── ngram │ ├── StandardNgramFilterTest.java │ ├── BackwardsCompatibleNgramFilterTest.java │ ├── OldNgramExtractorTest.java │ └── NgramExtractorTest.java │ ├── cybozu │ ├── DetectedLanguageTest.java │ └── util │ │ ├── NGramTest.java │ │ ├── LangProfileTest.java │ │ └── TagExtractorTest.java │ ├── profiles │ ├── LanguageProfileWriterTest.java │ ├── LanguageProfileBuilderTest.java │ └── LanguageProfileReaderTest.java │ ├── NgramFrequencyDataTest.java │ ├── LanguageDetectorImplTest.java │ ├── TechnicalLanguageDetectorImplTest.java │ ├── i18n │ └── LdLocaleTest.java │ └── DataLanguageDetectorImplTest.java └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /language-detector.iml 3 | .idea/ -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/cybozu/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Original language detection classes from https://code.google.com/p/language-detection/ 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/cybozu/util/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Provides the utility classes for language detection. 4 | Users don't use this package's classes directly. 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/test/resources/texts/README.txt: -------------------------------------------------------------------------------- 1 | I created these by copying text from the Wikipedia articles. 2 | Example: https://de.wikipedia.org/wiki/Deutschland 3 | 4 | The files are stored in UTF-8! 
(Save as UTF-8 in Windows Notepad) 5 | -------------------------------------------------------------------------------- /src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | System.out 8 | 9 | %d{yyyy-MM-dd/HH:mm:ss.SSS/zzz} [%t] %-5p %m%n 10 | 11 | 12 | INFO 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/main/resources/README.md: -------------------------------------------------------------------------------- 1 | ## About the "languages" folder and files 2 | 3 | Most of these files are from the original software from Nakatani Shuyo. 4 | Unfortunately, the data sources from which they were generated are not available. 5 | It looks like the text comes from Wikipedia pages. 6 | 7 | To generate your own language profile, see the main readme at https://github.com/optimaize/language-detector 8 | 9 | km Khmer: 10 | sources available, see https://github.com/optimaize/language-detector/issues/19 11 | 12 | ## About the "languages.shorttext" folder and files 13 | 14 | These files are from the original software from Nakatani Shuyo. 15 | 16 | Either they are for detecting language on short messages, or they are built from short message text, or 17 | both, I don't know. 18 | 19 | 20 | ## About the "messages.properties" file 21 | 22 | It is used in the CharNormalizer. 23 | 24 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/ngram/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * Provides functionality for handling n-grams. 19 | * 20 | *

See http://en.wikipedia.org/wiki/N-gram

21 | * 22 | * @author Fabian Kessler 23 | */ 24 | package com.optimaize.langdetect.ngram; 25 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/profiles/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * Provides functionality for loading, storing and creating {@link com.optimaize.langdetect.profiles.LanguageProfile}s. 19 | * 20 | * @author Fabian Kessler 21 | */ 22 | package com.optimaize.langdetect.profiles; 23 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/ngram/NgramFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | /** 20 | * Filters out some undesired n-grams. 21 | * 22 | * Implementations must be immutable. 23 | * 24 | * @author Fabian Kessler 25 | */ 26 | public interface NgramFilter { 27 | 28 | boolean use(String ngram); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /** 18 | * Provides functionality for concatenating and cleaning text that is used as 19 | * a) learning text to produce {@link com.optimaize.langdetect.profiles.LanguageProfile}s 20 | * b) for the text for which the language is to be guessed. 
21 | * 22 | * @author Fabian Kessler 23 | */ 24 | package com.optimaize.langdetect.text; 25 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/TextFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | /** 20 | * Allows to filter content from a text to be ignored for the n-gram analysis. 21 | * 22 | *

Implementations must be immutable and stateless.

23 | * 24 | * @author Fabian Kessler 25 | */ 26 | public interface TextFilter { 27 | 28 | String filter(CharSequence text); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | Language-Detection is a language detection library for Java. (aliases: language identification, language guessing) 5 |

6 | 7 | 11 | 12 | 13 |

Copyrights and License

14 | 15 |

16 | (c)2010 All rights reserved by Cybozu Labs, Inc. 17 |

18 | 19 |
20 |

21 | Licensed under the Apache License, Version 2.0 (the "License"); 22 | you may not use this file except in compliance with the License. 23 | You may obtain a copy of the License at 24 |

25 | 28 |

29 | Unless required by applicable law or agreed to in writing, software 30 | distributed under the License is distributed on an "AS IS" BASIS, 31 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 32 | See the License for the specific language governing permissions and 33 | limitations under the License. 34 |

35 |
36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/TextObjectFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | /** 20 | * Factory for {@link TextObject}s. 
21 | * 22 | * @author Fabian Kessler 23 | */ 24 | public class TextObjectFactory { 25 | 26 | private final TextFilter textFilter; 27 | private final int maxTextLength; 28 | 29 | /** 30 | * @param maxTextLength 0 for none 31 | */ 32 | public TextObjectFactory(TextFilter textFilter, int maxTextLength) { 33 | this.textFilter = textFilter; 34 | this.maxTextLength = maxTextLength; 35 | } 36 | 37 | public TextObject create() { 38 | return new TextObject(textFilter, maxTextLength); 39 | } 40 | 41 | public TextObject forText(CharSequence text) { 42 | return create().append(text); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/frma/IOUtils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import java.io.Closeable; 20 | import java.io.IOException; 21 | 22 | /** 23 | * Utils to manage IO streams. 24 | * @author François ROLAND 25 | */ 26 | @Deprecated 27 | public class IOUtils { 28 | /** 29 | * Private constructor to prevent instantiation. 30 | */ 31 | private IOUtils() {} 32 | 33 | /** 34 | * Closes a stream without returning any exception. 35 | * 36 | * @param stream the stream to close. Can be null. 
37 | * @deprecated use java7 closeable 38 | */ 39 | public static void closeQuietly(Closeable stream) { 40 | if (stream != null) { 41 | try { 42 | stream.close(); 43 | } catch (IOException ioe) { 44 | // ignore exception at this point. 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/frma/IOUtilsTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import static org.mockito.Mockito.*; 20 | 21 | import java.io.Closeable; 22 | import java.io.IOException; 23 | 24 | import org.junit.Test; 25 | 26 | public class IOUtilsTest { 27 | 28 | @Test 29 | public void closeQuietlyNullStream() { 30 | IOUtils.closeQuietly(null); 31 | } 32 | 33 | @Test 34 | public void closeQuietlyWhenExceptionThrown() throws IOException { 35 | Closeable stream = mock(Closeable.class); 36 | doThrow(new IOException()).when(stream).close(); 37 | IOUtils.closeQuietly(stream); 38 | } 39 | 40 | @Test 41 | public void closeQuietly() throws IOException { 42 | Closeable stream = mock(Closeable.class); 43 | IOUtils.closeQuietly(stream); 44 | verify(stream, times(1)).close(); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/CharNormalizerTextFilterImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | import com.optimaize.langdetect.cybozu.util.CharNormalizer; 20 | 21 | /** 22 | * Runs through the {@link CharNormalizer}. 23 | * 24 | * @author Fabian Kessler 25 | * @deprecated can't be used because it would be a big loss to not inline this code. 
26 | */ 27 | public class CharNormalizerTextFilterImpl implements TextFilter { 28 | 29 | @Override 30 | public String filter(CharSequence text) { 31 | StringBuilder ret = new StringBuilder(); 32 | char pre = 0; 33 | for (int i=0; i entry : langProfile.getFreq().entrySet()) { 40 | builder.addGram(entry.getKey(), entry.getValue()); 41 | } 42 | return builder.build(); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/ngram/StandardNgramFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | import org.junit.Test; 20 | import static org.junit.Assert.*; 21 | 22 | /** 23 | * @author Fabian Kessler 24 | */ 25 | public class StandardNgramFilterTest { 26 | 27 | private static final NgramFilter filter = StandardNgramFilter.getInstance(); 28 | 29 | @Test 30 | public void oneGram() throws Exception { 31 | assertTrue(filter.use("a")); 32 | assertTrue(filter.use("A")); 33 | 34 | assertFalse(filter.use(" ")); 35 | } 36 | 37 | @Test 38 | public void twoGram() throws Exception { 39 | assertTrue(filter.use("ab")); 40 | assertTrue(filter.use("Ab")); 41 | assertTrue(filter.use("AB")); 42 | assertTrue(filter.use("a ")); 43 | assertTrue(filter.use("a")); 44 | } 45 | 46 | @Test 47 | public void threeGram() throws Exception { 48 | assertTrue(filter.use("abc")); 49 | assertTrue(filter.use("Abc")); 50 | assertTrue(filter.use("ABC")); 51 | assertTrue(filter.use("ab ")); 52 | assertTrue(filter.use(" ab")); 53 | 54 | assertFalse(filter.use("a c")); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/text/MultiTextFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | import org.junit.Test; 21 | 22 | import java.util.Collections; 23 | 24 | import static org.junit.Assert.assertEquals; 25 | 26 | /** 27 | * @author Fabian Kessler 28 | */ 29 | public class MultiTextFilterTest { 30 | 31 | @Test 32 | public void empty() throws Exception { 33 | assertEquals(new MultiTextFilter(Collections.emptyList()).filter("foo"), "foo"); 34 | } 35 | 36 | @Test 37 | public void doubleFilter() throws Exception { 38 | assertEquals(new MultiTextFilter(ImmutableList.of( 39 | new TextFilter() { 40 | @Override 41 | public String filter(CharSequence text) { 42 | return text.toString().replace("a", "A"); 43 | } 44 | }, new TextFilter() { 45 | @Override 46 | public String filter(CharSequence text) { 47 | return text.toString().replace("A", "B"); 48 | } 49 | } 50 | )).filter("nananaa"), "nBnBnBB"); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/MultiTextFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | import org.jetbrains.annotations.NotNull; 21 | import org.jetbrains.annotations.Nullable; 22 | 23 | import java.util.List; 24 | 25 | /** 26 | * Groups multiple {@link com.optimaize.langdetect.text.TextFilter}s as one and runs them in the given order. 27 | * 28 | * @author Fabian Kessler 29 | */ 30 | public class MultiTextFilter implements TextFilter { 31 | 32 | @Nullable 33 | private final List filters; 34 | 35 | /** 36 | * @param filters may be empty by definition 37 | */ 38 | public MultiTextFilter(@NotNull List filters) { 39 | if (filters.isEmpty()) { 40 | this.filters = null; 41 | } else { 42 | this.filters = ImmutableList.copyOf(filters); 43 | } 44 | } 45 | 46 | @Override 47 | public String filter(CharSequence text) { 48 | if (filters==null) { 49 | return text.toString(); 50 | } else { 51 | String modified = text.toString(); 52 | for (TextFilter filter : filters) { 53 | modified = filter.filter(modified); 54 | } 55 | return modified; 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/ngram/BackwardsCompatibleNgramFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | import org.junit.Test; 20 | 21 | import static org.junit.Assert.assertFalse; 22 | import static org.junit.Assert.assertTrue; 23 | 24 | /** 25 | * @author Fabian Kessler 26 | */ 27 | public class BackwardsCompatibleNgramFilterTest { 28 | 29 | public static final NgramFilter filter = BackwardsCompatibleNgramFilter.getInstance(); 30 | 31 | @Test 32 | public void oneGram() throws Exception { 33 | assertTrue(filter.use("a")); 34 | assertTrue(filter.use("A")); 35 | 36 | assertFalse(filter.use(" ")); 37 | } 38 | 39 | @Test 40 | public void twoGram() throws Exception { 41 | assertTrue(filter.use("ab")); 42 | assertTrue(filter.use("Ab")); 43 | assertTrue(filter.use("a ")); 44 | assertTrue(filter.use("a")); 45 | 46 | assertFalse(filter.use("AB")); 47 | } 48 | 49 | @Test 50 | public void threeGram() throws Exception { 51 | assertTrue(filter.use("abc")); 52 | assertTrue(filter.use("Abc")); 53 | assertTrue(filter.use("ab ")); 54 | assertTrue(filter.use(" ab")); 55 | 56 | assertFalse(filter.use("a c")); 57 | assertFalse(filter.use("ABC")); 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/CommonTextObjectFactories.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | /** 20 | * Contains some standard {@link com.optimaize.langdetect.text.TextObjectFactory}s ready to use for 21 | * common use cases. 22 | * 23 | * @author Fabian Kessler 24 | */ 25 | public class CommonTextObjectFactories { 26 | 27 | public static TextObjectFactory forDetectingOnLargeText() { 28 | return new TextObjectFactoryBuilder() 29 | .maxTextLength(10000) 30 | .withTextFilter(UrlTextFilter.getInstance()) 31 | .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3)) 32 | .build(); 33 | } 34 | 35 | public static TextObjectFactory forDetectingShortCleanText() { 36 | return new TextObjectFactoryBuilder() 37 | .build(); 38 | } 39 | 40 | public static TextObjectFactory forIndexing() { 41 | return new TextObjectFactoryBuilder() 42 | .withTextFilter(UrlTextFilter.getInstance()) 43 | .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3)) 44 | .build(); 45 | } 46 | 47 | public static TextObjectFactory forIndexingCleanText() { 48 | return new TextObjectFactoryBuilder() 49 | .build(); 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/TextObjectFactoryBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | /** 23 | * Builder for {@link com.optimaize.langdetect.text.TextObjectFactory}. 24 | * 25 | * @author Fabian Kessler 26 | */ 27 | public class TextObjectFactoryBuilder { 28 | 29 | private int maxTextLength = 0; 30 | private final List textFilters = new ArrayList<>(); 31 | 32 | /** 33 | * @param maxTextLength 0 for no limit (that's the default). 34 | */ 35 | public TextObjectFactoryBuilder maxTextLength(int maxTextLength) { 36 | this.maxTextLength = maxTextLength; 37 | return this; 38 | } 39 | 40 | 41 | /** 42 | * Adds the given TextFilter to be run on {@link TextObject#append} methods. 43 | * 44 | *

Note that the order of filters may be important. They are executed in the same order as they 45 | * are passed in here.

46 | */ 47 | public TextObjectFactoryBuilder withTextFilter(TextFilter textFilter) { 48 | textFilters.add(textFilter); 49 | return this; 50 | } 51 | 52 | public TextObjectFactory build() { 53 | return new TextObjectFactory( 54 | new MultiTextFilter(textFilters), 55 | maxTextLength 56 | ); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/ngram/OldNgramExtractorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | import com.google.common.base.Stopwatch; 20 | import org.junit.Test; 21 | 22 | import java.util.*; 23 | 24 | import static org.junit.Assert.*; 25 | 26 | /** 27 | * @author Fabian Kessler 28 | */ 29 | public class OldNgramExtractorTest { 30 | 31 | @Test 32 | public void testExtractNGrams() { 33 | List ngrams = OldNgramExtractor.extractNGrams("Foo bar", null); 34 | assertTrue(ngrams.contains("Foo")); 35 | assertTrue(ngrams.contains("F")); 36 | assertTrue(ngrams.contains(" Fo")); //algorithm makes prefix-grams 37 | assertFalse(ngrams.contains("ar ")); //algorithm does not make suffix-grams 38 | assertEquals(ngrams.size(), 18); //adapt when making changes to the extractor... 
39 | } 40 | 41 | @Test 42 | public void testExtractNGrams2() { 43 | List ngrams = OldNgramExtractor.extractNGrams("Hallo DAA.", null); 44 | System.out.println(ngrams); 45 | } 46 | 47 | 48 | 49 | @Test 50 | public void stressTestAlgo1() { 51 | String text = "Foo bar hello world and so on nana nunu dada dudu asdf asdf akewf köjvnawer aisdfj awejfr iajdsöfj ewi adjsköfjwei ajsdökfj ief asd"; 52 | Stopwatch stopwatch = Stopwatch.createStarted(); 53 | for (int i=0; i<100000; i++) { 54 | OldNgramExtractor.extractNGrams(text, null); //2.745s 55 | } 56 | System.out.println(stopwatch); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/ngram/StandardNgramFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | /** 20 | * Filters what is generally not desired. 21 | * 22 | * Impl is immutable. 
23 | * 24 | * @author Fabian Kessler 25 | */ 26 | public class StandardNgramFilter implements NgramFilter { 27 | 28 | private static final StandardNgramFilter INSTANCE = new StandardNgramFilter(); 29 | 30 | public static NgramFilter getInstance() { 31 | return INSTANCE; 32 | } 33 | 34 | private StandardNgramFilter() { 35 | } 36 | 37 | @Override 38 | public boolean use(String ngram) { 39 | switch (ngram.length()) { 40 | case 1: 41 | if (ngram.charAt(0)==' ') { 42 | return false; 43 | } 44 | return true; 45 | case 2: 46 | return true; 47 | case 3: 48 | if (ngram.charAt(1)==' ') { 49 | //middle char is a space 50 | return false; 51 | } 52 | return true; 53 | case 4: 54 | if (ngram.charAt(1)==' ' || ngram.charAt(2)==' ') { 55 | //one of the middle chars is a space 56 | return false; 57 | } 58 | return true; 59 | default: 60 | //would need the same check: no space in the middle, border is fine. 61 | throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length()); 62 | } 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/frma/LangProfileReaderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import com.optimaize.langdetect.cybozu.util.LangProfile; 20 | import org.junit.Test; 21 | 22 | import java.io.File; 23 | import java.io.IOException; 24 | 25 | import static org.hamcrest.Matchers.*; 26 | import static org.junit.Assert.assertThat; 27 | 28 | public class LangProfileReaderTest { 29 | private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages"); 30 | 31 | @Test 32 | public void readEnFile() throws IOException { 33 | checkProfileFile("en", 3, 2301); 34 | } 35 | 36 | @Test 37 | public void readBnFile() throws IOException { 38 | checkProfileFile("bn", 3, 2846); 39 | } 40 | 41 | @Test 42 | public void readFrFile() throws IOException { 43 | checkProfileFile("fr", 3, 2232); 44 | } 45 | 46 | @Test 47 | public void readNlFile() throws IOException { 48 | checkProfileFile("nl", 3, 2163); 49 | } 50 | 51 | 52 | private static void checkProfileFile(String language, int nWordSize, int freqSize) throws IOException { 53 | File profileFile = new File(PROFILE_DIR, language); 54 | final LangProfile langProfile = new LangProfileReader().read(profileFile); 55 | assertThat(langProfile, is(notNullValue())); 56 | assertThat(langProfile.getName(), is(equalTo(language))); 57 | assertThat(langProfile.getNWords(), is(notNullValue())); 58 | assertThat(langProfile.getNWords().length, is(equalTo(nWordSize))); 59 | assertThat(langProfile.getFreq(), is(notNullValue())); 60 | assertThat(langProfile.getFreq().size(), is(equalTo(freqSize))); 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/profiles/util/LanguageLister.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nicole Torres 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance 
/**
 * This is just a utility to update the code with the existing languages.
 *
 * It lists the entries of the class path folders {@code languages} and
 * {@code languages.shorttext} and prints one {@code add(...)} statement per entry,
 * ready to be pasted into the source code.
 *
 * @author Nicole Torres
 */
class LanguageLister {

    /**
     * Prints the {@code names.add(...)} statements, a separator line, and then the
     * {@code texts.add(...)} statements to standard out.
     *
     * @throws IOException if one of the folders is not on the class path or cannot be read.
     */
    public static void main(String[] args) throws IOException {
        List<String> languages = readFilesFromClassPathFolder("languages/.");
        for (String lang : languages) {
            System.out.println("names.add(\""+lang+"\");");
        }
        System.out.println("--------------------------------");
        List<String> shortText = readFilesFromClassPathFolder("languages.shorttext/.");
        for (String text : shortText) {
            System.out.println("texts.add(\""+text+"\");");
        }
    }

    /**
     * Reads the given class path resource line by line; each line is taken as one entry name.
     *
     * @param resourceNameFolder resource name of the folder, as passed to the class loader.
     * @return the lines read, in the order they were delivered.
     * @throws IOException if the resource does not exist or reading fails.
     */
    private static List<String> readFilesFromClassPathFolder(String resourceNameFolder) throws IOException {
        ClassLoader loader = LanguageLister.class.getClassLoader();
        InputStream in = loader.getResourceAsStream(resourceNameFolder);
        if (in == null) {
            //getResourceAsStream() signals "not found" with null; fail with a clear message instead of an NPE.
            throw new IOException("Class path folder not found: "+resourceNameFolder);
        }
        List<String> files = new ArrayList<>();
        //closing the reader also closes the underlying stream, on success and on failure.
        try (BufferedReader rdr = new BufferedReader(new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while ((line = rdr.readLine()) != null) {
                files.add(line);
            }
        }
        return files;
    }

}
/src/test/java/com/optimaize/langdetect/cybozu/DetectedLanguageTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu; 18 | 19 | import com.optimaize.langdetect.DetectedLanguage; 20 | import com.optimaize.langdetect.i18n.LdLocale; 21 | import org.junit.Test; 22 | 23 | import java.util.ArrayList; 24 | import java.util.Collections; 25 | import java.util.List; 26 | 27 | import static org.junit.Assert.assertEquals; 28 | 29 | /** 30 | * @author Nakatani Shuyo 31 | * @author Fabian Kessler 32 | */ 33 | public class DetectedLanguageTest { 34 | 35 | @Test 36 | public final void basic() { 37 | DetectedLanguage lang = new DetectedLanguage(LdLocale.fromString("en"), 1.0); 38 | assertEquals(lang.getLocale().getLanguage(), "en"); 39 | assertEquals(lang.getProbability(), 1.0, 0.0001); 40 | assertEquals(lang.toString(), "DetectedLanguage[en:1.0]"); 41 | } 42 | 43 | @Test(expected = IllegalArgumentException.class) 44 | public final void invalidProbability() { 45 | new DetectedLanguage(LdLocale.fromString("en"), 1.1); 46 | } 47 | 48 | @Test 49 | public final void comparable() { 50 | List list = new ArrayList<>(); 51 | list.add(new DetectedLanguage(LdLocale.fromString("en"), 1.0)); 52 | list.add(new DetectedLanguage(LdLocale.fromString("de"), 1.0)); 53 | 
list.add(new DetectedLanguage(LdLocale.fromString("fr"), 0.9)); 54 | Collections.sort(list); 55 | assertEquals(list.get(0).getLocale().getLanguage(), "de"); //alphabetical de before en 56 | assertEquals(list.get(1).getLocale().getLanguage(), "en"); 57 | assertEquals(list.get(2).getLocale().getLanguage(), "fr"); //points 0.9 the last 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/frma/LangProfileWriterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import com.optimaize.langdetect.cybozu.util.LangProfile; 20 | import org.junit.Test; 21 | 22 | import java.io.*; 23 | 24 | import static org.hamcrest.Matchers.*; 25 | import static org.junit.Assert.assertThat; 26 | 27 | public class LangProfileWriterTest { 28 | private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages"); 29 | 30 | @Test 31 | public void writeEnProfile() throws IOException { 32 | checkProfileCopy("en"); 33 | } 34 | 35 | @Test 36 | public void writeFrProfile() throws IOException { 37 | checkProfileCopy("fr"); 38 | } 39 | 40 | @Test 41 | public void writeNlProfile() throws IOException { 42 | checkProfileCopy("nl"); 43 | } 44 | 45 | protected void checkProfileCopy(String language) throws IOException { 46 | File originalFile = new File(PROFILE_DIR, language); 47 | final LangProfile originalProfile = new LangProfileReader().read(originalFile); 48 | File newFile = File.createTempFile("profile-copy-", null); 49 | try (FileOutputStream output = new FileOutputStream(newFile)) { 50 | new LangProfileWriter().write(originalProfile, output); 51 | LangProfile newProfile = new LangProfileReader().read(newFile); 52 | assertThat(newProfile.getFreq().size(), is(equalTo(originalProfile.getFreq().size()))); 53 | assertThat(newProfile.getFreq(), is(equalTo(originalProfile.getFreq()))); 54 | assertThat(newProfile.getNWords(), is(equalTo(originalProfile.getNWords()))); 55 | assertThat(newProfile.getName(), is(equalTo(originalProfile.getName()))); 56 | } finally { 57 | //noinspection ResultOfMethodCallIgnored 58 | newFile.delete(); 59 | } 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/ngram/BackwardsCompatibleNgramFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | 
* 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | /** 20 | * Filters those that were not generated by the old n-gram generator. 21 | * 22 | * Impl is immutable. 23 | * 24 | * @author Fabian Kessler 25 | */ 26 | public class BackwardsCompatibleNgramFilter implements NgramFilter { 27 | 28 | private static final BackwardsCompatibleNgramFilter INSTANCE = new BackwardsCompatibleNgramFilter(); 29 | 30 | public static NgramFilter getInstance() { 31 | return INSTANCE; 32 | } 33 | 34 | private BackwardsCompatibleNgramFilter() { 35 | } 36 | 37 | 38 | @Override 39 | public boolean use(String ngram) { 40 | switch (ngram.length()) { 41 | case 1: 42 | if (ngram.charAt(0)==' ') { 43 | return false; 44 | } 45 | return true; 46 | case 2: 47 | if (Character.isUpperCase(ngram.charAt(0)) && Character.isUpperCase(ngram.charAt(1))) { 48 | //all upper case 49 | return false; 50 | } 51 | return true; 52 | case 3: 53 | if (Character.isUpperCase(ngram.charAt(0)) && Character.isUpperCase(ngram.charAt(1)) && Character.isUpperCase(ngram.charAt(2))) { 54 | //all upper case 55 | return false; 56 | } 57 | if (ngram.charAt(1)==' ') { 58 | //middle char is a space 59 | return false; 60 | } 61 | return true; 62 | default: 63 | throw new UnsupportedOperationException("Unsupported n-gram length: "+ngram.length()); 64 | } 65 | } 66 | 67 | } 68 | 
-------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/cybozu/util/TagExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu.util; 18 | 19 | import com.optimaize.langdetect.text.CommonTextObjectFactories; 20 | import com.optimaize.langdetect.text.TextObjectFactory; 21 | 22 | /** 23 | * {@link TagExtractor} is a class which extracts inner texts of specified tag. 24 | * Users don't use this class directly. 
25 | * @author Nakatani Shuyo 26 | */ 27 | public class TagExtractor { 28 | 29 | private static final TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexing(); 30 | 31 | /* package scope */ String target_; 32 | /* package scope */ int threshold_; 33 | /* package scope */ StringBuilder buf_; 34 | /* package scope */ String tag_; 35 | private int count_; 36 | 37 | public TagExtractor(String tag, int threshold) { 38 | target_ = tag; 39 | threshold_ = threshold; 40 | count_ = 0; 41 | clear(); 42 | } 43 | public int count() { 44 | return count_; 45 | } 46 | public void clear() { 47 | buf_ = new StringBuilder(" "); 48 | tag_ = null; 49 | } 50 | public void setTag(String tag){ 51 | tag_ = tag; 52 | } 53 | public void add(String line) { 54 | if (tag_ != null && tag_.equals(target_) && line != null) { 55 | buf_.append(line); 56 | } 57 | } 58 | public void closeTag(LangProfile profile) { 59 | if ((profile != null) && tag_.equals(target_) && (buf_.length() > threshold_) && !isSpace()) { 60 | Util.addCharSequence(profile, textObjectFactory.forText(buf_)); 61 | ++count_; 62 | } 63 | clear(); 64 | } 65 | 66 | private boolean isSpace() { 67 | return (buf_.length()==1 && buf_.toString().equals(" ")); 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/frma/LangProfileWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import com.optimaize.langdetect.cybozu.util.LangProfile; 20 | 21 | import java.io.*; 22 | import java.nio.charset.Charset; 23 | import java.util.Map; 24 | 25 | /** 26 | * Writes a {@link LangProfile} to an output stream (file). 27 | * 28 | * @author François ROLAND 29 | * @author Fabian Kessler 30 | */ 31 | public class LangProfileWriter { 32 | 33 | /** 34 | * Writes a {@link LangProfile} to an OutputStream in UTF-8. 35 | * 36 | * @throws IOException 37 | */ 38 | public void write(LangProfile langProfile, OutputStream outputStream) throws IOException { 39 | try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName("utf-8")))) { 40 | writer.write("{\"freq\":{"); 41 | boolean first = true; 42 | for (Map.Entry entry : langProfile.getFreq().entrySet()) { 43 | if (!first) { 44 | writer.write(','); 45 | } 46 | writer.write('"'); 47 | writer.write(entry.getKey()); 48 | writer.write("\":"); 49 | writer.write(entry.getValue().toString()); 50 | first = false; 51 | } 52 | writer.write("},\"n_words\":["); 53 | first = true; 54 | for (int nWord : langProfile.getNWords()) { 55 | if (!first) { 56 | writer.write(','); 57 | } 58 | writer.write(Integer.toString(nWord)); 59 | first = false; 60 | } 61 | writer.write("],\"name\":\""); 62 | writer.write(langProfile.getName()); 63 | writer.write("\"}"); 64 | writer.flush(); 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- 
/src/test/java/com/optimaize/langdetect/profiles/LanguageProfileWriterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.profiles; 18 | 19 | import org.junit.Test; 20 | 21 | import java.io.File; 22 | import java.io.FileOutputStream; 23 | import java.io.IOException; 24 | 25 | import static org.junit.Assert.assertEquals; 26 | 27 | /** 28 | * @author François ROLAND 29 | * @author Fabian Kessler 30 | */ 31 | public class LanguageProfileWriterTest { 32 | 33 | private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages"); 34 | 35 | @Test 36 | public void writeEnProfile() throws IOException { 37 | checkProfileCopy("en"); 38 | } 39 | 40 | @Test 41 | public void writeFrProfile() throws IOException { 42 | checkProfileCopy("fr"); 43 | } 44 | 45 | @Test 46 | public void writeNlProfile() throws IOException { 47 | checkProfileCopy("nl"); 48 | } 49 | 50 | protected void checkProfileCopy(String language) throws IOException { 51 | File originalFile = new File(PROFILE_DIR, language); 52 | final LanguageProfile originalProfile = new LanguageProfileReader().read(originalFile); 53 | File newFile = File.createTempFile("profile-copy-", null); 54 | try (FileOutputStream output = new FileOutputStream(newFile)) { 55 
| new LanguageProfileWriter().write(originalProfile, output); 56 | LanguageProfile newProfile = new LanguageProfileReader().read(newFile); 57 | assertEquals(newProfile.getLocale(), originalProfile.getLocale()); 58 | assertEquals(newProfile.getNumGrams(), originalProfile.getNumGrams()); 59 | assertEquals(newProfile.getGramLengths(), originalProfile.getGramLengths()); 60 | assertEquals(newProfile, originalProfile); 61 | } finally { 62 | //noinspection ResultOfMethodCallIgnored 63 | newFile.delete(); 64 | } 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/DetectedLanguage.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect; 18 | 19 | import com.optimaize.langdetect.i18n.LdLocale; 20 | import org.jetbrains.annotations.NotNull; 21 | 22 | /** 23 | * Holds information about a detected language: the locale (language) and the probability. 24 | * 25 | *

Comparable: the "better" one comes before the worse. 26 | * First order by probability descending (1 to 0). 27 | * Then order by language ascending (a to z).

28 | * 29 | *

This class is immutable.

30 | * 31 | * @author Nakatani Shuyo 32 | * @author Fabian Kessler 33 | */ 34 | public class DetectedLanguage implements Comparable { 35 | 36 | @NotNull 37 | private final LdLocale locale; 38 | private final double probability; 39 | 40 | /** 41 | * @param locale 42 | * @param probability 0-1 43 | */ 44 | public DetectedLanguage(@NotNull LdLocale locale, double probability) { 45 | if (probability<0d) throw new IllegalArgumentException("Probability must be >= 0 but was "+probability); 46 | if (probability>1d) throw new IllegalArgumentException("Probability must be <= 1 but was "+probability); 47 | this.locale = locale; 48 | this.probability = probability; 49 | } 50 | 51 | @NotNull 52 | public LdLocale getLocale() { 53 | return locale; 54 | } 55 | 56 | /** 57 | * @return 0-1, the higher the better. 58 | */ 59 | public double getProbability() { 60 | return probability; 61 | } 62 | 63 | public String toString() { 64 | return "DetectedLanguage["+ locale + ":" + probability+"]"; 65 | } 66 | 67 | /** 68 | * See class header. 69 | */ 70 | @Override 71 | public int compareTo(DetectedLanguage o) { 72 | int compare = Double.compare(o.probability, this.probability); 73 | if (compare!=0) return compare; 74 | return this.locale.toString().compareTo(o.locale.toString()); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | import org.junit.Test; 20 | import static org.junit.Assert.*; 21 | 22 | /** 23 | * @author Fabian Kessler 24 | */ 25 | public class RemoveMinorityScriptsTextFilterTest { 26 | 27 | @Test 28 | public void testWithCyrillicAndHani() throws Exception { 29 | RemoveMinorityScriptsTextFilter filter = RemoveMinorityScriptsTextFilter.forThreshold(0.35); 30 | String result = filter.filter("Hu Jintao (in Chinese 胡錦濤) and Leo Tolstoy (in Russian Лев Николаевич Толстой) are two well known people."); 31 | assertEquals("Hu Jintao (in Chinese ) and Leo Tolstoy (in Russian ) are two well known people.", result); 32 | } 33 | 34 | @Test 35 | public void testWithChineseAndSomeEnglish() throws Exception { 36 | String input = "设为首页收藏本站 开启辅助访问 为首页收藏本站 开启辅助访为首页收藏本站 开启辅助访切换到窄版 请 登录 后使用快捷导航 没有帐号 注册 用户名 Email 自动登录 找回密码 密码 登录 注册 快捷导航 论坛BBS 导读Guide 排行榜Ranklist 淘帖Collection 日志Blog 相册Album 分享Share 搜索 搜索 帖子 用户 公告"; 37 | 38 | //expect no change, the ratio 0.35 is too low 39 | RemoveMinorityScriptsTextFilter filter = RemoveMinorityScriptsTextFilter.forThreshold(0.42); 40 | assertEquals(filter.filter(input), input); 41 | 42 | //expect the English to be removed 43 | filter = RemoveMinorityScriptsTextFilter.forThreshold(0.43); 44 | String result = filter.filter(input); 45 | assertEquals("设为首页收藏本站 开启辅助访问 为首页收藏本站 开启辅助访为首页收藏本站 开启辅助访切换到窄版 请 登录 后使用快捷导航 没有帐号 注册 用户名 自动登录 找回密码 密码 登录 注册 快捷导航 论坛 导读 排行榜 淘帖 日志 相册 分享 搜索 搜索 帖子 用户 公告", result); 46 | } 47 | 48 | /** 49 | * Seems obvious, but better test: plain 
latin text may not be modified. 50 | */ 51 | @Test 52 | public void testJustLatin() throws Exception { 53 | RemoveMinorityScriptsTextFilter filter = RemoveMinorityScriptsTextFilter.forThreshold(0.01); 54 | String text = "Hu Jintao is a well known person."; 55 | String result = filter.filter(text); 56 | assertEquals(text, result); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/frma/GenProfileTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import com.optimaize.langdetect.cybozu.util.LangProfile; 20 | import org.junit.Test; 21 | 22 | import java.io.*; 23 | import java.nio.charset.Charset; 24 | import java.util.Map; 25 | 26 | import static org.hamcrest.Matchers.*; 27 | import static org.junit.Assert.assertThat; 28 | 29 | public class GenProfileTest extends GenProfile { 30 | 31 | @Test 32 | public void generateProfile() throws IOException { 33 | File inputFile = File.createTempFile("profileInput", ".txt"); 34 | try { 35 | try (PrintWriter writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(inputFile), Charset.forName("utf-8")))) { 36 | writer.println("Salut tout le monde."); 37 | writer.println("Bonjour toi tout seul."); 38 | writer.println("Ca va ?"); 39 | writer.println("Oui ça va. Et toi ?"); 40 | } 41 | 42 | LangProfile trucProfile = generate("truc", inputFile); 43 | Map freqs = trucProfile.getFreq(); 44 | assertThat(freqs, is(notNullValue())); 45 | assertThat(freqs.get("t"), is(equalTo(8))); 46 | assertThat(freqs.get("to"), is(equalTo(4))); 47 | assertThat(freqs.get("out"), is(equalTo(2))); 48 | assertThat(freqs.get("o"), is(equalTo(7))); 49 | assertThat(freqs.get("ou"), is(equalTo(3))); 50 | assertThat(freqs.get("toi"), is(equalTo(2))); 51 | assertThat(freqs.get("u"), is(equalTo(6))); 52 | assertThat(freqs.get("ut"), is(equalTo(3))); 53 | assertThat(freqs.get("tou"), is(equalTo(2))); 54 | assertThat(freqs.get("a"), is(equalTo(5))); 55 | assertThat(freqs.get("oi"), is(equalTo(2))); 56 | assertThat(freqs.get("alu"), is(equalTo(1))); 57 | assertThat(freqs.get("on"), is(equalTo(2))); 58 | assertThat(freqs.get("Bon"), is(equalTo(1))); 59 | assertThat(freqs.get("e"), is(equalTo(3))); 60 | assertThat(freqs.get("va"), is(equalTo(2))); 61 | assertThat(freqs.get("i"), is(equalTo(3))); 62 | assertThat(freqs.get("jou"), is(equalTo(1))); 63 | } finally { 64 | //noinspection ResultOfMethodCallIgnored 65 | inputFile.delete(); 66 
| } 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/profiles/LanguageProfileBuilderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.profiles; 18 | 19 | import com.optimaize.langdetect.ngram.NgramExtractors; 20 | import com.optimaize.langdetect.text.*; 21 | import org.junit.Test; 22 | import static org.junit.Assert.*; 23 | 24 | /** 25 | * @author Fabian Kessler 26 | */ 27 | public class LanguageProfileBuilderTest { 28 | 29 | @Test 30 | public void german() throws Exception { 31 | TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexing(); 32 | 33 | TextObject inputText = textObjectFactory.create() 34 | .append("deutsche Text") 35 | .append(" ") 36 | .append("http://www.github.com/"); 37 | 38 | LanguageProfile languageProfile = new LanguageProfileBuilder("de") 39 | .ngramExtractor(NgramExtractors.standard()) 40 | .addText(inputText) 41 | .build(); 42 | 43 | assertEquals(1, languageProfile.getFrequency("sch")); 44 | assertEquals(0, languageProfile.getFrequency("www")); 45 | } 46 | 47 | @Test 48 | public void profile_equals() throws Exception { 49 | LanguageProfile languageProfile1 = new LanguageProfileBuilder("de") 50 | .addGram("foo", 
1) 51 | .build(); 52 | 53 | LanguageProfile languageProfile2 = new LanguageProfileBuilder("de") 54 | .addGram("foo", 1) 55 | .build(); 56 | 57 | LanguageProfile languageProfile3 = new LanguageProfileBuilder("de") 58 | .addGram("bar", 1) 59 | .build(); 60 | 61 | assertEquals(languageProfile1, languageProfile2); 62 | assertNotEquals(languageProfile1, languageProfile3); 63 | } 64 | 65 | @Test 66 | public void profile_toString() throws Exception { 67 | LanguageProfile languageProfile = new LanguageProfileBuilder("de") 68 | .addGram("foo", 1) 69 | .build(); 70 | assertTrue(languageProfile.toString().contains("de")); 71 | assertTrue(languageProfile.toString().contains("1")); 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/frma/GenProfile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import com.optimaize.langdetect.cybozu.util.LangProfile; 20 | import com.optimaize.langdetect.cybozu.util.Util; 21 | import com.optimaize.langdetect.text.CommonTextObjectFactories; 22 | import com.optimaize.langdetect.text.TextObject; 23 | import com.optimaize.langdetect.text.TextObjectFactory; 24 | 25 | import java.io.*; 26 | import java.nio.charset.Charset; 27 | import java.util.zip.GZIPInputStream; 28 | 29 | /** 30 | * Generate a language profile from any given text file. 31 | * 32 | * TODO this is copy/paste from the other class with the same name. Check if code can be re-used. Rename to something meaningful. 33 | * 34 | * @author François ROLAND 35 | */ 36 | public class GenProfile { 37 | 38 | private static final TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexing(); 39 | 40 | 41 | /** 42 | * Loads a text file and generate a language profile from its content. The input text file is supposed to be encoded in UTF-8. 43 | * @param lang target language name. 44 | * @param textFile input text file. 
45 | * @return Language profile instance 46 | */ 47 | public static LangProfile generate(String lang, File textFile) { 48 | LangProfile profile = new LangProfile(lang); 49 | 50 | InputStream is = null; 51 | try { 52 | is = new BufferedInputStream(new FileInputStream(textFile)); 53 | if (textFile.getName().endsWith(".gz")) is = new GZIPInputStream(is); 54 | 55 | BufferedReader reader = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8"))); 56 | String line; 57 | while ((line = reader.readLine()) != null) { 58 | TextObject textObject = textObjectFactory.forText(" "+line+" "); 59 | Util.addCharSequence(profile, textObject); 60 | } 61 | } catch (IOException e) { 62 | throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", e); 63 | } finally { 64 | IOUtils.closeQuietly(is); 65 | } 66 | return profile; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/cybozu/util/NGram.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu.util; 18 | 19 | import org.jetbrains.annotations.Nullable; 20 | 21 | /** 22 | * TODO document. 23 | * 24 | * Users don't use this class directly. 
25 | * 26 | * TODO this class treats a word as "upper case" if the first 2 characters are upper case. That seems like a simplification, 27 | * would need documentation. 28 | * 29 | * @author Nakatani Shuyo 30 | */ 31 | public class NGram { 32 | 33 | /** 34 | * Maximum n-gram length. N-grams are created from 1-grams up to this length, i.e. currently 1-grams, 2-grams and 3-grams. 35 | */ 36 | public static final int N_GRAM = 3; 37 | 38 | private StringBuilder grams_; 39 | private boolean capitalword_; 40 | 41 | public NGram() { 42 | grams_ = new StringBuilder(" "); 43 | capitalword_ = false; 44 | } 45 | 46 | public void addChar(char ch) { 47 | ch = CharNormalizer.normalize(ch); /* all further work uses the normalized character */ 48 | char lastChar = grams_.charAt(grams_.length() - 1); 49 | if (lastChar == ' ') { /* window ends with a space (initial state or word boundary): restart it with a single leading space */ 50 | grams_ = new StringBuilder(" "); 51 | capitalword_ = false; 52 | if (ch==' ') return; /* consecutive spaces are collapsed */ 53 | } else if (grams_.length() >= N_GRAM) { /* window is full: drop the oldest character before appending */ 54 | grams_.deleteCharAt(0); 55 | } 56 | grams_.append(ch); 57 | 58 | if (Character.isUpperCase(ch)){ 59 | if (Character.isUpperCase(lastChar)) capitalword_ = true; /* set as soon as two consecutive characters are upper case, not only the first two of a word */ 60 | } else { 61 | capitalword_ = false; /* any non-upper-case character clears the flag again */ 62 | } 63 | } 64 | 65 | /** 66 | * TODO this method has some weird, undocumented behavior to ignore ngrams with upper case.
67 | * 68 | * Get n-Gram 69 | * @param n length of n-gram 70 | * @return n-Gram String (null if it is invalid) 71 | */ 72 | @Nullable 73 | public String get(int n) { 74 | if (capitalword_) return null; 75 | int len = grams_.length(); 76 | if (n < 1 || n > N_GRAM || len < n) return null; 77 | if (n == 1) { 78 | char ch = grams_.charAt(len - 1); 79 | if (ch == ' ') return null; 80 | return Character.toString(ch); 81 | } else { 82 | return grams_.substring(len - n, len); 83 | } 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/ngram/OldNgramExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | import com.optimaize.langdetect.cybozu.util.NGram; 20 | import org.jetbrains.annotations.NotNull; 21 | import org.jetbrains.annotations.Nullable; 22 | 23 | import java.util.ArrayList; 24 | import java.util.List; 25 | 26 | /** 27 | * @author Nakatani Shuyo 28 | */ 29 | @Deprecated 30 | public class OldNgramExtractor { 31 | 32 | 33 | public interface Filter { 34 | /** 35 | * Allows to skip some n-grams. 36 | * 37 | * This is currently used to filter n-grams in to-analyze text when the n-gram is unknown to the loaded 38 | * language profiles. 
39 | * 40 | * @return true to use this n-gram, false to skip it. 41 | */ 42 | boolean use(String gram); 43 | } 44 | 45 | /** 46 | * This was the method found in the com.cybozu.labs.langdetect.Detector class, it was used to extract 47 | * grams from the to-analyze text. 48 | * 49 | * NOTE: although it adds the first ngram with space, it does not add the last n-gram with space. example: "foo" gives " fo" but not "oo "!. 50 | * It is not clear yet whether this is desired (and why) or a bug. 51 | * 52 | * TODO replace this algorithm with a simpler, faster one that uses less memory: only by position shifting. also, the returned list size 53 | * can be computed before making it (based on text length and number of n-grams). 54 | * 55 | */ 56 | @NotNull 57 | @Deprecated 58 | public static List extractNGrams(@NotNull CharSequence text, @Nullable Filter filter) { 59 | List list = new ArrayList<>(); 60 | NGram ngram = new NGram(); 61 | for(int i=0;iSee website for details.

28 | * 29 | *

<p>This detector cannot handle well: 30 | * Short input text, can work or give wrong results. 31 | * Text written in multiple languages. It likely returns the language for the most prominent text. It's not made for that. 32 | * Text written in languages for which the detector has no profile loaded. It may just return other similar languages. 33 | * </p>

34 | * 35 | * @author Fabian Kessler 36 | */ 37 | public interface LanguageDetector { 38 | 39 | /** 40 | * Returns the best detected language if the algorithm is very confident. 41 | * 42 | *

<p>Note: you may want to use getProbabilities() instead. This method is very strict, and sometimes returns 43 | * absent even though the first choice in getProbabilities() is correct.</p>

44 | * 45 | * @param text You probably want a {@link com.optimaize.langdetect.text.TextObject}. 46 | * @return The language if confident, absent if unknown or not confident enough. 47 | */ 48 | Optional detect(CharSequence text); 49 | 50 | /** 51 | * Returns all languages with at least some likeliness. 52 | * 53 | *

<p>There is a configurable cutoff applied for languages with very low probability.</p>

54 | * 55 | *

<p>The way the algorithm currently works, it can be that, for example, this method returns a 0.99 for 56 | * Danish and less than 0.01 for Norwegian, and still they have almost the same chance. It would be nice if 57 | * this could be improved in future versions.</p>

58 | * 59 | * @param text You probably want a {@link com.optimaize.langdetect.text.TextObject}. 60 | * @return Sorted from better to worse. May be empty. 61 | * It's empty if the program failed to detect any language, or if the input text did not 62 | * contain any usable text (just noise). 63 | */ 64 | List getProbabilities(CharSequence text); 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/NgramFrequencyDataTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect; 18 | 19 | import com.google.common.collect.ImmutableSet; 20 | import com.optimaize.langdetect.i18n.LdLocale; 21 | import com.optimaize.langdetect.profiles.LanguageProfile; 22 | import com.optimaize.langdetect.profiles.LanguageProfileReader; 23 | import org.junit.BeforeClass; 24 | import org.junit.Test; 25 | 26 | import java.io.IOException; 27 | import java.util.List; 28 | 29 | import static org.junit.Assert.assertEquals; 30 | import static org.junit.Assert.assertTrue; 31 | 32 | /** 33 | * Some rudimentary tests for NgramFrequencyData. 
34 | * 35 | * @author Fabian Kessler 36 | */ 37 | public class NgramFrequencyDataTest { 38 | 39 | private static NgramFrequencyData allThreeGrams; 40 | 41 | @BeforeClass 42 | public static void init() throws IOException { 43 | allThreeGrams = forAll(3); 44 | } 45 | private static NgramFrequencyData forAll(int gramSize) throws IOException { 46 | List languageProfiles = new LanguageProfileReader().readAllBuiltIn(); 47 | return NgramFrequencyData.create(languageProfiles, ImmutableSet.of(gramSize)); 48 | } 49 | 50 | 51 | @Test 52 | public void size() throws Exception { 53 | //update the number when adding built-in languages 54 | assertEquals(allThreeGrams.getLanguageList().size(), 71); 55 | } 56 | 57 | @Test 58 | public void constantOrder() throws Exception { 59 | //expect constant order: 60 | int pos=0; 61 | for (LdLocale locale : allThreeGrams.getLanguageList()) { 62 | assertEquals(allThreeGrams.getLanguage(pos), locale); 63 | pos++; 64 | } 65 | } 66 | 67 | @Test 68 | public void expectGram() throws Exception { 69 | //this must exist in many languages 70 | double[] probabilities = allThreeGrams.getProbabilities("dam"); 71 | assert probabilities != null; 72 | assertTrue(probabilities.length >= 5 && probabilities.length <= allThreeGrams.getLanguageList().size()); 73 | } 74 | 75 | @Test 76 | public void forbidGramOfWrongSize() throws Exception { 77 | //we said 3-grams, not 2 grams 78 | assertEquals(allThreeGrams.getProbabilities("da"), null); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/LanguageDetectorImplTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect; 18 | 19 | import com.optimaize.langdetect.frma.LangProfileReader; 20 | import com.optimaize.langdetect.cybozu.util.LangProfile; 21 | import com.google.common.collect.ImmutableList; 22 | import com.optimaize.langdetect.ngram.NgramExtractors; 23 | import com.optimaize.langdetect.profiles.LanguageProfile; 24 | import com.optimaize.langdetect.profiles.OldLangProfileConverter; 25 | import com.optimaize.langdetect.text.*; 26 | import org.testng.annotations.DataProvider; 27 | import org.testng.annotations.Test; 28 | 29 | import java.io.IOException; 30 | import java.util.List; 31 | import static org.testng.Assert.*; 32 | 33 | 34 | /** 35 | * Basic tests for the LanguageDetectorImpl. 
36 | * 37 | * @author Fabian Kessler 38 | */ 39 | public class LanguageDetectorImplTest { 40 | 41 | @Test(dataProvider = "confident") 42 | public void confident(String expectedLanguage, CharSequence text) throws Exception { 43 | LanguageDetector languageDetector = makeNewDetector(); 44 | List result = languageDetector.getProbabilities(text); 45 | DetectedLanguage best = result.get(0); 46 | assertEquals(best.getLocale().getLanguage(), expectedLanguage); 47 | assertTrue(best.getProbability() >= 0.9999d); 48 | } 49 | @DataProvider 50 | protected Object[][] confident() { 51 | return new Object[][] { 52 | {"de", "Dies ist eine deutsche Text"}, 53 | {"de", "deutsche Text"}, 54 | {"de", CommonTextObjectFactories.forDetectingOnLargeText().create().append("deutsche Text").append(" ").append("http://www.github.com/")}, 55 | }; 56 | } 57 | 58 | 59 | private LanguageDetector makeNewDetector() throws IOException { 60 | LanguageDetectorBuilder builder = LanguageDetectorBuilder.create(NgramExtractors.standard()) 61 | .shortTextAlgorithm(50) 62 | .prefixFactor(1.5) 63 | .suffixFactor(2.0); 64 | 65 | LangProfileReader langProfileReader = new LangProfileReader(); 66 | for (String language : ImmutableList.of("en", "fr", "nl", "de")) { 67 | LangProfile langProfile = langProfileReader.read(LanguageDetectorImplTest.class.getResourceAsStream("/languages/" + language)); 68 | LanguageProfile languageProfile = OldLangProfileConverter.convert(langProfile); 69 | builder.withProfile(languageProfile); 70 | } 71 | 72 | return builder.build(); 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/frma/LangProfileReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.frma; 18 | 19 | import com.optimaize.langdetect.cybozu.util.LangProfile; 20 | 21 | import java.io.*; 22 | import java.nio.charset.Charset; 23 | import java.util.regex.Matcher; 24 | import java.util.regex.Pattern; 25 | 26 | /** 27 | * Reads {@link LangProfile}s. 28 | * 29 | * @author François ROLAND 30 | * @author Fabian Kessler 31 | */ 32 | public class LangProfileReader { 33 | 34 | private static final Pattern FREQ_PATTERN = Pattern.compile("\"freq\" ?: ?\\{(.+?)\\}"); 35 | private static final Pattern N_WORDS_PATTERN = Pattern.compile("\"n_words\" ?: ?\\[(.+?)\\]"); 36 | private static final Pattern NAME_PATTERN = Pattern.compile("\"name\" ?: ?\"(.+?)\""); 37 | 38 | /** 39 | * Reads a {@link LangProfile} from a File in UTF-8. 40 | */ 41 | public LangProfile read(File profileFile) throws IOException { 42 | if (!profileFile.exists()) { 43 | throw new IOException("No such file: "+profileFile); 44 | } else if (!profileFile.canRead()) { 45 | throw new IOException("Cannot read file: "+profileFile); 46 | } 47 | try (FileInputStream input = new FileInputStream(profileFile)) { 48 | return read(input); 49 | } 50 | } 51 | 52 | /** 53 | * Reads a {@link LangProfile} from an InputStream in UTF-8. 
54 | */ 55 | public LangProfile read(InputStream inputStream) throws IOException { 56 | StringBuilder buffer = new StringBuilder(); 57 | try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, Charset.forName("utf-8")))) { 58 | String line; 59 | while((line = reader.readLine()) != null) { /* collapse the input into a single line, original lines separated by one space */ 60 | if (buffer.length() > 0) { 61 | buffer.append(' '); 62 | } 63 | buffer.append(line); 64 | } 65 | } 66 | 67 | String storedProfile = buffer.toString(); 68 | LangProfile langProfile = new LangProfile(); 69 | 70 | Matcher m = FREQ_PATTERN.matcher(storedProfile); 71 | if (m.find()) { 72 | String[] entries = m.group(1).split(","); 73 | for (String entry : entries) { 74 | String[] keyValue = entry.split(":"); /* NOTE(review): an n-gram containing ',' or ':' would be split incorrectly here -- confirm such grams cannot occur in stored profiles */ 75 | String label = keyValue[0].trim().replace("\"", ""); 76 | langProfile.getFreq().put(label, Integer.valueOf(keyValue[1].trim())); /* trim the count like the key: surrounding whitespace (pretty-printed or multi-line input) must not fail the parse */ 77 | } 78 | } 79 | 80 | m = N_WORDS_PATTERN.matcher(storedProfile); 81 | if (m.find()) { 82 | String[] nWords = m.group(1).split(","); 83 | langProfile.setNWords(new int[nWords.length]); 84 | for (int i = 0; i < nWords.length; i++) { 85 | langProfile.getNWords()[i] = Integer.parseInt(nWords[i].trim()); /* Integer.parseInt rejects surrounding whitespace, so strip it first */ 86 | } 87 | } 88 | 89 | m = NAME_PATTERN.matcher(storedProfile); 90 | if (m.find()) { 91 | langProfile.setName(m.group(1)); 92 | } 93 | 94 | return langProfile; 95 | } 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/TechnicalLanguageDetectorImplTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 François ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect; 18 | 19 | import com.optimaize.langdetect.i18n.LdLocale; 20 | import com.optimaize.langdetect.ngram.NgramExtractor; 21 | import com.optimaize.langdetect.profiles.LanguageProfileBuilder; 22 | import org.junit.Test; 23 | 24 | import static org.junit.Assert.assertEquals; 25 | 26 | /** 27 | * These are the tests of the old detector from Shuyo. Running them against the new detector from Fabian. 28 | * 29 | * @author Nakatani Shuyo 30 | * @author Fabian Kessler 31 | */ 32 | public class TechnicalLanguageDetectorImplTest { 33 | 34 | private static final String TRAINING_EN = "a a a b b c c d e"; 35 | private static final String TRAINING_FR = "a b b c c c d d d"; 36 | private static final String TRAINING_JA = "\u3042 \u3042 \u3042 \u3044 \u3046 \u3048 \u3048"; 37 | 38 | 39 | private LanguageDetector makeDetector() { 40 | //building exactly like the old detector behaved.
41 | LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractor.gramLengths(1)) 42 | .affixFactor(1.0) 43 | .shortTextAlgorithm(0); 44 | 45 | LanguageProfileBuilder profileBuilder = new LanguageProfileBuilder(LdLocale.fromString("en")); 46 | add(detectorBuilder, profileBuilder, TRAINING_EN); 47 | 48 | profileBuilder = new LanguageProfileBuilder(LdLocale.fromString("fr")); 49 | add(detectorBuilder, profileBuilder, TRAINING_FR); 50 | 51 | profileBuilder = new LanguageProfileBuilder(LdLocale.fromString("ja")); 52 | add(detectorBuilder, profileBuilder, TRAINING_JA); 53 | 54 | return detectorBuilder.build(); 55 | } 56 | private void add(LanguageDetectorBuilder detectorBuilder, LanguageProfileBuilder profileBuilder, String trainingEn) { 57 | for (String w : trainingEn.split(" ")) { 58 | profileBuilder.addGram(w); 59 | } 60 | detectorBuilder.withProfile(profileBuilder.build()); 61 | } 62 | 63 | 64 | @Test 65 | public final void testDetector1() { 66 | LanguageDetector languageDetector = makeDetector(); 67 | assertEquals(languageDetector.detect("a").get().getLanguage(), "en"); 68 | } 69 | 70 | @Test 71 | public final void testDetector2() { 72 | LanguageDetector languageDetector = makeDetector(); 73 | assertEquals(languageDetector.detect("b d").get().getLanguage(), "fr"); 74 | } 75 | 76 | @Test 77 | public final void testDetector3() { 78 | LanguageDetector languageDetector = makeDetector(); 79 | assertEquals(languageDetector.detect("d e").get().getLanguage(), "en"); 80 | } 81 | 82 | @Test 83 | public final void testDetector4() { 84 | LanguageDetector languageDetector = makeDetector(); 85 | assertEquals(languageDetector.detect("\u3042\u3042\u3042\u3042a").get().getLanguage(), "ja"); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/cybozu/GenProfile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 
2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu; 18 | 19 | import com.optimaize.langdetect.cybozu.util.TagExtractor; 20 | import com.optimaize.langdetect.cybozu.util.LangProfile; 21 | import org.slf4j.Logger; 22 | import org.slf4j.LoggerFactory; 23 | 24 | import javax.xml.stream.XMLInputFactory; 25 | import javax.xml.stream.XMLStreamException; 26 | import javax.xml.stream.XMLStreamReader; 27 | import java.io.*; 28 | import java.util.zip.GZIPInputStream; 29 | 30 | /** 31 | * Load Wikipedia's abstract XML as corpus and generate its language profile in JSON format. 32 | * 33 | * @author Nakatani Shuyo 34 | */ 35 | public class GenProfile { 36 | 37 | private static final Logger logger = LoggerFactory.getLogger(GenProfile.class); 38 | 39 | /** 40 | * Load Wikipedia abstract database file and generate its language profile 41 | * @param lang target language name 42 | * @param file target database file path 43 | * @return Language profile instance 44 | */ 45 | public static LangProfile load(String lang, File file) { 46 | 47 | LangProfile profile = new LangProfile(lang); 48 | 49 | try (InputStream is = file.getName().endsWith(".gz") ? 
50 | new GZIPInputStream(new BufferedInputStream(new FileInputStream(file))) : 51 | new BufferedInputStream(new FileInputStream(file))) { 52 | 53 | TagExtractor tagextractor = new TagExtractor("abstract", 100); 54 | 55 | XMLStreamReader reader = null; 56 | try { 57 | XMLInputFactory factory = XMLInputFactory.newInstance(); 58 | reader = factory.createXMLStreamReader(is); 59 | while (reader.hasNext()) { 60 | switch (reader.next()) { 61 | case XMLStreamReader.START_ELEMENT: 62 | tagextractor.setTag(reader.getName().toString()); 63 | break; 64 | case XMLStreamReader.CHARACTERS: 65 | tagextractor.add(reader.getText()); 66 | break; 67 | case XMLStreamReader.END_ELEMENT: 68 | tagextractor.closeTag(profile); 69 | break; 70 | } 71 | } 72 | } catch (XMLStreamException e) { 73 | throw new RuntimeException("Training database file '" + file.getName() + "' is an invalid XML.", e); 74 | } finally { 75 | try { 76 | if (reader != null) reader.close(); 77 | } catch (XMLStreamException e) { /* ignore exception */ } 78 | } 79 | logger.info(lang + ":" + tagextractor.count()); 80 | 81 | } catch (IOException e) { 82 | throw new RuntimeException("Can't open training database file '" + file.getName() + "'", e); 83 | } 84 | return profile; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/cybozu/util/NGramTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu.util; 18 | 19 | import org.junit.Test; 20 | 21 | import static org.hamcrest.CoreMatchers.is; 22 | import static org.junit.Assert.assertEquals; 23 | import static org.junit.Assert.assertThat; 24 | 25 | /** 26 | * @author Nakatani Shuyo 27 | */ 28 | public class NGramTest { 29 | 30 | /** 31 | * Test method for constants 32 | */ 33 | @Test 34 | public final void testConstants() { 35 | assertThat(NGram.N_GRAM, is(3)); 36 | assertEquals(NGram.N_GRAM, 3); 37 | } 38 | 39 | 40 | /** 41 | * Test method for {@link NGram#get(int)} and {@link NGram#addChar(char)} 42 | */ 43 | @Test 44 | public final void testNGram() { 45 | NGram ngram = new NGram(); 46 | assertEquals(ngram.get(0), null); 47 | assertEquals(ngram.get(1), null); 48 | assertEquals(ngram.get(2), null); 49 | assertEquals(ngram.get(3), null); 50 | assertEquals(ngram.get(4), null); 51 | ngram.addChar(' '); 52 | assertEquals(ngram.get(1), null); 53 | assertEquals(ngram.get(2), null); 54 | assertEquals(ngram.get(3), null); 55 | ngram.addChar('A'); 56 | assertEquals(ngram.get(1), "A"); 57 | assertEquals(ngram.get(2), " A"); 58 | assertEquals(ngram.get(3), null); 59 | ngram.addChar('\u06cc'); 60 | assertEquals(ngram.get(1), "\u064a"); 61 | assertEquals(ngram.get(2), "A\u064a"); 62 | assertEquals(ngram.get(3), " A\u064a"); 63 | ngram.addChar('\u1ea0'); 64 | assertEquals(ngram.get(1), "\u1ec3"); 65 | assertEquals(ngram.get(2), "\u064a\u1ec3"); 66 | assertEquals(ngram.get(3), "A\u064a\u1ec3"); 67 | ngram.addChar('\u3044'); 68 | 
assertEquals(ngram.get(1), "\u3042"); 69 | assertEquals(ngram.get(2), "\u1ec3\u3042"); 70 | assertEquals(ngram.get(3), "\u064a\u1ec3\u3042"); 71 | 72 | ngram.addChar('\u30a4'); 73 | assertEquals(ngram.get(1), "\u30a2"); 74 | assertEquals(ngram.get(2), "\u3042\u30a2"); 75 | assertEquals(ngram.get(3), "\u1ec3\u3042\u30a2"); 76 | ngram.addChar('\u3106'); 77 | assertEquals(ngram.get(1), "\u3105"); 78 | assertEquals(ngram.get(2), "\u30a2\u3105"); 79 | assertEquals(ngram.get(3), "\u3042\u30a2\u3105"); 80 | ngram.addChar('\uac01'); 81 | assertEquals(ngram.get(1), "\uac00"); 82 | assertEquals(ngram.get(2), "\u3105\uac00"); 83 | assertEquals(ngram.get(3), "\u30a2\u3105\uac00"); 84 | ngram.addChar('\u2010'); 85 | assertEquals(ngram.get(1), null); 86 | assertEquals(ngram.get(2), "\uac00 "); 87 | assertEquals(ngram.get(3), "\u3105\uac00 "); 88 | 89 | ngram.addChar('a'); 90 | assertEquals(ngram.get(1), "a"); 91 | assertEquals(ngram.get(2), " a"); 92 | assertEquals(ngram.get(3), null); 93 | } 94 | 95 | } -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/profiles/LanguageProfileWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Francois ROLAND 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.profiles; 18 | 19 | import org.jetbrains.annotations.NotNull; 20 | 21 | import java.io.*; 22 | import java.nio.charset.Charset; 23 | import java.util.Map; 24 | 25 | /** 26 | * Writes a {@link LanguageProfile} to an output stream or file. 27 | * 28 | *

<p>All file operations are done with UTF-8.</p>

29 | * 30 | * @author François ROLAND 31 | * @author Fabian Kessler 32 | */ 33 | public class LanguageProfileWriter { 34 | 35 | /** 36 | * Writes a {@link LanguageProfile} to an OutputStream in UTF-8. 37 | * 38 | * @throws java.io.IOException 39 | */ 40 | public void write(@NotNull LanguageProfile languageProfile, @NotNull OutputStream outputStream) throws IOException { 41 | try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream, Charset.forName("utf-8")))) { 42 | writer.write("{\"freq\":{"); 43 | boolean first = true; 44 | for (Map.Entry entry : languageProfile.iterateGrams()) { 45 | if (!first) { 46 | writer.write(','); 47 | } 48 | writer.write('"'); 49 | writer.write(entry.getKey()); 50 | writer.write("\":"); 51 | writer.write(entry.getValue().toString()); 52 | first = false; 53 | } 54 | writer.write("},\"n_words\":["); 55 | first = true; 56 | for (int i=1; i<=10; i++) { 57 | long nWord = languageProfile.getNumGramOccurrences(i); 58 | if (nWord ==0) break; 59 | if (!first) { 60 | writer.write(','); 61 | } 62 | writer.write(Long.toString(nWord)); 63 | first = false; 64 | } 65 | writer.write("],\"name\":\""); 66 | writer.write(languageProfile.getLocale().toString()); 67 | writer.write("\"}"); 68 | writer.flush(); 69 | } 70 | } 71 | 72 | /** 73 | * Writes a {@link LanguageProfile} to a folder using the language name as the file name. 74 | * 75 | * @param fullPath Must be an existing writable directory path. 76 | * @throws java.io.IOException if such a file name exists already. 
77 | */ 78 | public void writeToDirectory(@NotNull LanguageProfile languageProfile, @NotNull File fullPath) throws IOException { 79 | if (!fullPath.exists()) { 80 | throw new IOException("Path does not exist: "+fullPath); 81 | } 82 | if (!fullPath.canWrite()) { 83 | throw new IOException("Path not writable: "+fullPath); 84 | } 85 | File file = new File(fullPath.getAbsolutePath()+"/"+languageProfile.getLocale()); 86 | if (file.exists()) { 87 | throw new IOException("File exists already, refusing to overwrite: "+file); 88 | } 89 | try (FileOutputStream output = new FileOutputStream(file)) { 90 | write(languageProfile, output); 91 | } 92 | } 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/cybozu/util/LangProfileTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu.util; 18 | 19 | import static org.junit.Assert.*; 20 | 21 | import org.junit.Test; 22 | 23 | /** 24 | * @author Nakatani Shuyo 25 | * 26 | */ 27 | public class LangProfileTest { 28 | 29 | /** 30 | * Test method for {@link LangProfile#LangProfile()}. 
31 | */ 32 | @Test 33 | public final void testLangProfile() { 34 | LangProfile profile = new LangProfile(); 35 | assertEquals(profile.getName(), null); 36 | } 37 | 38 | /** 39 | * Test method for {@link LangProfile#LangProfile(java.lang.String)}. 40 | */ 41 | @Test 42 | public final void testLangProfileStringInt() { 43 | LangProfile profile = new LangProfile("en"); 44 | assertEquals(profile.getName(), "en"); 45 | } 46 | 47 | /** 48 | * Test method for {@link LangProfile#add(java.lang.String)}. 49 | */ 50 | @Test 51 | public final void testAdd() { 52 | LangProfile profile = new LangProfile("en"); 53 | profile.add("a"); 54 | assertEquals((int)profile.getFreq().get("a"), 1); 55 | profile.add("a"); 56 | assertEquals((int)profile.getFreq().get("a"), 2); 57 | profile.omitLessFreq(); 58 | } 59 | 60 | 61 | @Test(expected = IllegalStateException.class) 62 | public final void testAddIllegally1() { 63 | LangProfile profile = new LangProfile(); // Illegal ( available for only JSONIC ) but ignore 64 | profile.add("a"); 65 | } 66 | 67 | @Test(expected = IllegalArgumentException.class) 68 | public final void testAddIllegally2() { 69 | LangProfile profile = new LangProfile("en"); 70 | profile.add(""); // Illegal (string's length of parameter must be between 1 and 3) 71 | } 72 | 73 | @Test(expected = IllegalArgumentException.class) 74 | public final void testAddIllegally3() { 75 | LangProfile profile = new LangProfile("en"); 76 | profile.add("abcd"); // Illegal (string's length of parameter must be between 1 and 3) 77 | } 78 | 79 | /** 80 | * Test method for {@link LangProfile#omitLessFreq()}. 
81 | */ 82 | @Test 83 | public final void testOmitLessFreq() { 84 | LangProfile profile = new LangProfile("en"); 85 | String[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".split(" "); 86 | for (int i=0;i<5;++i) { 87 | for (String g : grams) { 88 | profile.add(g); 89 | } 90 | } 91 | profile.add("\u3050"); 92 | 93 | assertEquals((int)profile.getFreq().get("a"), 5); 94 | assertEquals((int)profile.getFreq().get("\u3042"), 5); 95 | assertEquals((int)profile.getFreq().get("\u3050"), 1); 96 | profile.omitLessFreq(); 97 | assertEquals(profile.getFreq().get("a"), null); // omitted 98 | assertEquals((int)profile.getFreq().get("\u3042"), 5); 99 | assertEquals(profile.getFreq().get("\u3050"), null); // omitted 100 | } 101 | 102 | @Test(expected = IllegalStateException.class) 103 | public final void testOmitLessFreqIllegally() { 104 | LangProfile profile = new LangProfile(); 105 | profile.omitLessFreq(); 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/cybozu/util/TagExtractorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu.util; 18 | 19 | import static org.junit.Assert.*; 20 | 21 | import org.junit.Test; 22 | 23 | /** 24 | * @author Nakatani Shuyo 25 | * 26 | */ 27 | public class TagExtractorTest { 28 | 29 | /** 30 | * Test method for {@link TagExtractor#TagExtractor(java.lang.String, int)}. 31 | */ 32 | @Test 33 | public final void testTagExtractor() { 34 | TagExtractor extractor = new TagExtractor(null, 0); 35 | assertEquals(extractor.target_, null); 36 | assertEquals(extractor.threshold_, 0); 37 | 38 | TagExtractor extractor2 = new TagExtractor("abstract", 10); 39 | assertEquals(extractor2.target_, "abstract"); 40 | assertEquals(extractor2.threshold_, 10); 41 | } 42 | 43 | /** 44 | * Test method for {@link TagExtractor#setTag(java.lang.String)}. 45 | */ 46 | @Test 47 | public final void testSetTag() { 48 | TagExtractor extractor = new TagExtractor(null, 0); 49 | extractor.setTag(""); 50 | assertEquals(extractor.tag_, ""); 51 | extractor.setTag(null); 52 | assertEquals(extractor.tag_, null); 53 | } 54 | 55 | /** 56 | * Test method for {@link TagExtractor#add(java.lang.String)}. 57 | */ 58 | @Test 59 | public final void testAdd() { 60 | TagExtractor extractor = new TagExtractor(null, 0); 61 | extractor.add(""); 62 | extractor.add(null); // ignore 63 | } 64 | 65 | /** 66 | * Test method for {@link TagExtractor#closeTag(LangProfile)}. 67 | */ 68 | @Test 69 | public final void testCloseTag() { 70 | TagExtractor extractor = new TagExtractor(null, 0); 71 | LangProfile profile = null; 72 | extractor.closeTag(profile); // ignore 73 | } 74 | 75 | 76 | /** 77 | * Scenario Test of extracting <abstract> tag from Wikipedia database. 
78 | */ 79 | @Test 80 | public final void testNormalScenario() { 81 | TagExtractor extractor = new TagExtractor("abstract", 10); 82 | assertEquals(extractor.count(), 0); 83 | 84 | LangProfile profile = new LangProfile("en"); 85 | 86 | // normal 87 | extractor.setTag("abstract"); 88 | extractor.add("This is a sample text."); 89 | extractor.closeTag(profile); 90 | assertEquals(extractor.count(), 1); 91 | assertEquals(profile.getNWords()[0], 17); // Thisisasampletext 92 | assertEquals(profile.getNWords()[1], 22); // _T, Th, hi, ... 93 | assertEquals(profile.getNWords()[2], 17); // _Th, Thi, his, ... 94 | 95 | // too short 96 | extractor.setTag("abstract"); 97 | extractor.add("sample"); 98 | extractor.closeTag(profile); 99 | assertEquals(extractor.count(), 1); 100 | 101 | // other tags 102 | extractor.setTag("div"); 103 | extractor.add("This is a sample text which is enough long."); 104 | extractor.closeTag(profile); 105 | assertEquals(extractor.count(), 1); 106 | } 107 | 108 | /** 109 | * Test method for {@link TagExtractor#clear()}. 110 | */ 111 | @Test 112 | public final void testClear() { 113 | TagExtractor extractor = new TagExtractor("abstract", 10); 114 | extractor.setTag("abstract"); 115 | extractor.add("This is a sample text."); 116 | assertEquals(extractor.buf_.toString().trim(), "This is a sample text."); 117 | assertEquals(extractor.tag_, "abstract"); 118 | extractor.clear(); 119 | assertEquals(extractor.buf_.toString().trim(), ""); 120 | assertEquals(extractor.tag_, null); 121 | } 122 | 123 | 124 | } 125 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/profiles/LanguageProfile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.profiles; 18 | 19 | import com.optimaize.langdetect.i18n.LdLocale; 20 | import org.jetbrains.annotations.NotNull; 21 | 22 | import java.util.List; 23 | import java.util.Map; 24 | 25 | /** 26 | * A language profile knows the locale (language), and contains the n-grams and some statistics. 27 | * 28 | *

It is built from a training text that should be fairly large and clean.

29 | * 30 | *

It contains the n-grams from the training text in the desired gram sizes (e.g. 2- and 3-grams), 31 | * with possible text filters applied for cleaning. Also, rarely occurring n-grams may have been cut to 32 | * reduce the noise and index size. Use a {@link LanguageProfileBuilder} to create one.

33 | * 34 | *

The profile may be created at runtime on-the-fly, or it may be loaded from a previously generated 35 | * text file (see OldLangProfileConverter).

36 | * 37 | * @author Fabian Kessler 38 | */ 39 | public interface LanguageProfile { 40 | 41 | @NotNull 42 | LdLocale getLocale(); 43 | 44 | /** 45 | * Tells what the n in n-grams are used here. 46 | * Example: [1,2,3] 47 | * @return Sorted from smaller to larger. 48 | */ 49 | @NotNull 50 | List getGramLengths(); 51 | 52 | /** 53 | * @param gram for example "a" or "foo". 54 | * @return 0-n, also zero if this profile does not use n-grams of that length (for example if no 4-grams are made). 55 | */ 56 | int getFrequency(String gram); 57 | 58 | /** 59 | * Tells how many different n-grams there are for a certain n-gram size. 60 | * For example the English language has about 57 different 1-grams, whereas Chinese in Hani has thousands. 61 | * @param gramLength 1-n 62 | * @return 0-n, returns zero if no such n-grams were made (for example if no 4-grams were made), 63 | * or if all the training text did not contain such long words. 64 | */ 65 | int getNumGrams(int gramLength); 66 | 67 | /** 68 | * Tells how many n-grams there are for all n-gram sizes combined. 69 | * @return 0-n (0 only on an empty profile...) 70 | */ 71 | int getNumGrams(); 72 | 73 | /** 74 | * Tells how often all n-grams of a certain length occurred, combined. 75 | * This returns a much larger number than {@link #getNumGrams}. 76 | * @param gramLength 1-n 77 | * @return 0-n, returns zero if no such n-grams were made (for example if no 4-grams were made), 78 | * or if all the training text did not contain such long words. 79 | */ 80 | long getNumGramOccurrences(int gramLength); 81 | 82 | /** 83 | * Tells how often the n-gram with the lowest amount of occurrences used in this profile occurred. 84 | * 85 | * Most likely there were n-grams with less (unless the returned number is 1), but they were eliminated 86 | * in order to keep the profile reasonably small. 87 | * 88 | * This is the opposite of getMaxGramCount(). 
89 | * 90 | * @param gramLength 1-n 91 | * @return 0-n, returns zero if no such n-grams were made or existed. 92 | */ 93 | long getMinGramCount(int gramLength); 94 | /** 95 | * Tells how often the n-gram with the highest amount of occurrences used in this profile occurred. 96 | * 97 | * This is the opposite of getMinGramCount(). 98 | * 99 | * @param gramLength 1-n 100 | * @return 0-n, returns zero if no such n-grams were made or existed. 101 | */ 102 | long getMaxGramCount(int gramLength); 103 | 104 | /** 105 | * Iterates all ngram strings with frequency. 106 | */ 107 | @NotNull 108 | Iterable> iterateGrams(); 109 | 110 | /** 111 | * Iterates all gramLength-gram strings with frequency. 112 | */ 113 | @NotNull 114 | Iterable> iterateGrams(int gramLength); 115 | 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/cybozu/util/Util.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.cybozu.util; 18 | 19 | import com.optimaize.langdetect.i18n.LdLocale; 20 | import com.optimaize.langdetect.ngram.NgramExtractor; 21 | import com.optimaize.langdetect.ngram.NgramExtractors; 22 | import com.optimaize.langdetect.ngram.OldNgramExtractor; 23 | import org.jetbrains.annotations.NotNull; 24 | 25 | import java.util.Formatter; 26 | import java.util.List; 27 | import java.util.Map; 28 | 29 | /** 30 | * A place for sharing code. 31 | * 32 | * @author Nakatani Shuyo 33 | */ 34 | public class Util { 35 | 36 | private static final NgramExtractor ngramExtractor = NgramExtractors.standard(); 37 | 38 | public static void addCharSequence(LangProfile langProfile, CharSequence text) { 39 | //TODO replace with new code. 40 | 41 | // List old = OldNgramExtractor.extractNGrams(text, null); 42 | // List nuu = ngramExtractor.extractGrams(text); 43 | // 44 | // Set oldSet = new HashSet<>(old); 45 | // Set nuuSet = new HashSet<>(nuu); 46 | // 47 | // ArrayList justNuu = new ArrayList<>(nuu); 48 | // justNuu.removeAll(old); 49 | // 50 | // ArrayList justOld = new ArrayList<>(old); 51 | // justOld.removeAll(nuu); 52 | // 53 | // System.out.println(text); 54 | 55 | // for (String s : ngramExtractor.extractGrams(text)) { 56 | // langProfile.add(s); 57 | // } 58 | for (String s : OldNgramExtractor.extractNGrams(text, null)) { 59 | langProfile.add(s); 60 | } 61 | } 62 | 63 | 64 | 65 | /** 66 | * unicode encoding (for verbose mode) 67 | */ 68 | public static String unicodeEncode(String s) { 69 | StringBuilder buf = new StringBuilder(); 70 | for (int i = 0; i < s.length(); ++i) { 71 | char ch = s.charAt(i); 72 | if (ch >= '\u0080') { 73 | String st = Integer.toHexString(0x10000 + (int) ch); 74 | while (st.length() < 4) st = "0" + st; 75 | buf.append("\\u").append(st.subSequence(1, 5)); 76 | } else { 77 | buf.append(ch); 78 | } 79 | } 80 | return buf.toString(); 81 | } 82 | 83 | 84 | /** 85 | * normalize probabilities and check 
convergence by the maximum probability 86 | * @return maximum of probabilities 87 | */ 88 | public static double normalizeProb(double[] prob) { 89 | double maxp = 0, sump = 0; 90 | for(int i=0;i langlist) { 101 | Formatter formatter = new Formatter(); 102 | for(int j=0;j=0.00001) { 105 | formatter.format(" %s:%.5f", langlist.get(j), p); 106 | } 107 | } 108 | return formatter.toString(); 109 | } 110 | 111 | 112 | /** 113 | */ 114 | public static double[] makeInternalPrioMap(@NotNull Map langWeightingMap, 115 | @NotNull List langlist) { 116 | assert !langWeightingMap.isEmpty(); 117 | double[] priorMap = new double[langlist.size()]; 118 | double sump = 0; 119 | for (int i=0;i=0 : "Prior probability must be non-negative!"; 124 | priorMap[i] = p; 125 | sump += p; 126 | } 127 | } 128 | assert sump > 0 : "Sum must be greater than zero!"; 129 | for (int i=0;iThis class does no internal synchronization.

31 | * 32 | * @author Fabian Kessler 33 | */ 34 | public class LanguageProfileBuilder { 35 | 36 | @NotNull 37 | private final LdLocale locale; 38 | private int minimalFrequency = 1; 39 | private NgramExtractor ngramExtractor; 40 | private final Map> ngrams = new HashMap<>(); 41 | 42 | 43 | public LanguageProfileBuilder(@NotNull LdLocale locale) { 44 | this.locale = locale; 45 | } 46 | @Deprecated 47 | public LanguageProfileBuilder(@NotNull String locale) { 48 | this.locale = LdLocale.fromString(locale); 49 | } 50 | 51 | /** 52 | * Copy constructor. 53 | */ 54 | public LanguageProfileBuilder(@NotNull LanguageProfileBuilder languageProfileBuilder) { 55 | this.locale = languageProfileBuilder.locale; 56 | this.minimalFrequency = languageProfileBuilder.minimalFrequency; 57 | this.ngramExtractor = languageProfileBuilder.ngramExtractor; 58 | this.ngrams.putAll(languageProfileBuilder.ngrams); 59 | } 60 | 61 | public LanguageProfileBuilder ngramExtractor(@NotNull NgramExtractor ngramExtractor) { 62 | this.ngramExtractor = ngramExtractor; 63 | return this; 64 | } 65 | 66 | /** 67 | * @param minimalFrequency 1-n, the default is 1. n-grams that occurred less often in the text are removed. 68 | * This really should be set to something higher. 69 | * Try to play with the number until you get a profile file of satisfying size, 70 | * that produces good language detection results. 71 | */ 72 | public LanguageProfileBuilder minimalFrequency(int minimalFrequency) { 73 | if (minimalFrequency < 1) throw new IllegalArgumentException("minimalFrequency must be >= 1, but was: "+minimalFrequency); 74 | this.minimalFrequency = minimalFrequency; 75 | return this; 76 | } 77 | 78 | /** 79 | * In order to use this you must set the {@link #ngramExtractor} first. 
80 | */ 81 | public LanguageProfileBuilder addText(CharSequence text) { 82 | if (ngramExtractor==null) { 83 | throw new IllegalStateException("NgramExtractor has not been set yet!"); 84 | } 85 | for (Map.Entry entry : ngramExtractor.extractCountedGrams(text).entrySet()) { 86 | addGram(entry.getKey(), entry.getValue()); 87 | } 88 | return this; 89 | } 90 | 91 | /** 92 | * Shortcut for addGram(ngram, 1). 93 | */ 94 | public LanguageProfileBuilder addGram(String ngram) { 95 | return addGram(ngram, 1); 96 | } 97 | /** 98 | * If the builder already has this ngram, the given frequency is added to the current count. 99 | */ 100 | public LanguageProfileBuilder addGram(String ngram, int frequency) { 101 | Map map = ngrams.get(ngram.length()); 102 | if (map==null) { 103 | map = new HashMap<>(); 104 | ngrams.put(ngram.length(), map); 105 | } 106 | Integer total = map.get(ngram); 107 | if (total==null) total = 0; 108 | total += frequency; 109 | map.put(ngram, total); 110 | return this; 111 | } 112 | 113 | 114 | public LanguageProfile build() { 115 | if (minimalFrequency >1) { 116 | removeNgramsWithLessFrequency(); 117 | } 118 | return new LanguageProfileImpl(locale, ngrams); 119 | } 120 | 121 | 122 | private void removeNgramsWithLessFrequency() { 123 | for (Map map : ngrams.values()) { 124 | Iterator> iterator = map.entrySet().iterator(); 125 | while (iterator.hasNext()) { 126 | Map.Entry next = iterator.next(); 127 | if (next.getValue() < minimalFrequency) { 128 | iterator.remove(); 129 | } 130 | } 131 | } 132 | } 133 | 134 | } 135 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/NgramFrequencyData.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect; 18 | 19 | import com.optimaize.langdetect.i18n.LdLocale; 20 | import com.optimaize.langdetect.profiles.LanguageProfile; 21 | import org.jetbrains.annotations.NotNull; 22 | import org.jetbrains.annotations.Nullable; 23 | 24 | import java.util.*; 25 | 26 | /** 27 | * Contains frequency information for n-grams coming from multiple {@link LanguageProfile}s. 28 | * 29 | *

For each n-gram string it knows the locales (languages) in which it occurs, and how frequently it 30 | * occurs in those languages in relation to other n-grams of the same length in those same languages.

31 | * 32 | *

Immutable by convention only: the probability arrays it hands out cannot be made unmodifiable, so callers must not modify them.

33 | * 34 | * @author Fabian Kessler 35 | */ 36 | public final class NgramFrequencyData { 37 | 38 | /** 39 | * Key = ngram 40 | * Value = array with probabilities per loaded language, in the same order as {@code langlist}. 41 | */ 42 | @NotNull 43 | private final Map wordLangProbMap; 44 | 45 | /** 46 | * All the loaded languages, in exactly the same order as the data is in the double[] in wordLangProbMap. 47 | * Example: if wordLangProbMap has an entry for the n-gram "foo" then for each locale in this langlist here 48 | * it has a value there. Languages that don't know the n-gram have the value 0d. 49 | */ 50 | @NotNull 51 | private final List langlist; 52 | 53 | 54 | /** 55 | * @param gramLengths for example [1,2,3] 56 | * @throws java.lang.IllegalArgumentException if languageProfiles or gramLengths is empty, or if one of the 57 | * languageProfiles does not have the grams of the required sizes. 58 | */ 59 | @NotNull 60 | public static NgramFrequencyData create(@NotNull Collection languageProfiles, @NotNull Collection gramLengths) throws IllegalArgumentException { 61 | if (languageProfiles.isEmpty()) throw new IllegalArgumentException("No languageProfiles provided!"); 62 | if (gramLengths.isEmpty()) throw new IllegalArgumentException("No gramLengths provided!"); 63 | 64 | Map wordLangProbMap = new HashMap<>(); 65 | List langlist = new ArrayList<>(); 66 | int langsize = languageProfiles.size(); 67 | 68 | int index = -1; 69 | for (LanguageProfile profile : languageProfiles) { 70 | index++; 71 | 72 | langlist.add( profile.getLocale() ); 73 | 74 | for (Integer gramLength : gramLengths) { 75 | if (!profile.getGramLengths().contains(gramLength)) { 76 | throw new IllegalArgumentException("The language profile for "+profile.getLocale()+" does not contain "+gramLength+"-grams!"); 77 | } 78 | for (Map.Entry ngramEntry : profile.iterateGrams(gramLength)) { 79 | String ngram = ngramEntry.getKey(); 80 | Integer frequency = ngramEntry.getValue(); 81 | if 
(!wordLangProbMap.containsKey(ngram)) { 82 | wordLangProbMap.put(ngram, new double[langsize]); 83 | } 84 | double prob = frequency.doubleValue() / profile.getNumGramOccurrences(ngram.length()); 85 | wordLangProbMap.get(ngram)[index] = prob; 86 | } 87 | } 88 | } 89 | 90 | return new NgramFrequencyData(wordLangProbMap, langlist); 91 | } 92 | 93 | private NgramFrequencyData(@NotNull Map wordLangProbMap, 94 | @NotNull List langlist) { 95 | //not making immutable copies because I create them here (optimization). 96 | this.wordLangProbMap = Collections.unmodifiableMap(wordLangProbMap); 97 | this.langlist = Collections.unmodifiableList(langlist); 98 | } 99 | 100 | 101 | @NotNull 102 | public List getLanguageList() { 103 | return langlist; 104 | } 105 | @NotNull 106 | public LdLocale getLanguage(int pos) { 107 | return langlist.get(pos); 108 | } 109 | 110 | /** 111 | * Don't modify this data structure! (Can't make array immutable...) 112 | * @return null if no language profile knows that ngram. 113 | * entries are 0 for languages that don't know that ngram at all. 114 | * The array is in the order of the {@link #getLanguageList()} language list, and has exactly that size. 115 | * impl note: this way the caller can handle it more efficient than returning an empty array. 116 | */ 117 | @Nullable 118 | public double[] getProbabilities(String ngram) { 119 | return wordLangProbMap.get(ngram); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/RemoveMinorityScriptsTextFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | import java.util.HashMap; 20 | import java.util.HashSet; 21 | import java.util.Map; 22 | import java.util.Set; 23 | 24 | /** 25 | * Removes text written in scripts that are not the dominant script of the text. 26 | * 27 | * TODO this does not do special handling for Japanese (3 scripts) and Korean (2 scripts), they should be 28 | * counted together and kept. 29 | * 30 | * @author Fabian Kessler 31 | */ 32 | public class RemoveMinorityScriptsTextFilter implements TextFilter { 33 | 34 | private final double threshold; 35 | 36 | /** 37 | * If a script has less than this fraction of content compared to the most used one, its text is removed. 38 | * 39 | * Example: Latin 10%, Cyrillic 80%, Common 10% (punctuation n'stuff). Now 10 is put in relation to 80. 40 | * 41 | * @param threshold 0-1, suggested value is 0.3. If smaller then removed, equal remains. 
42 | */ 43 | public static RemoveMinorityScriptsTextFilter forThreshold(double threshold) { 44 | return new RemoveMinorityScriptsTextFilter(threshold); 45 | } 46 | 47 | private RemoveMinorityScriptsTextFilter(double threshold) { 48 | this.threshold = threshold; 49 | } 50 | 51 | @Override 52 | public String filter(CharSequence text) { 53 | Map counts = countByScript(text); 54 | if (counts.size()<=1) { 55 | //nothing to do 56 | return text.toString(); 57 | } else { 58 | long most = findMost(counts); 59 | Set toRemove = new HashSet<>(); 60 | for (Map.Entry entry : counts.entrySet()) { 61 | if (entry.getValue()==most) continue; 62 | double ratio = entry.getValue().doubleValue() / most; 63 | if (ratio <= threshold) { 64 | toRemove.add(entry.getKey()); 65 | } 66 | } 67 | if (toRemove.isEmpty()) { 68 | return text.toString(); 69 | } else { 70 | return remove(text, toRemove); 71 | } 72 | } 73 | } 74 | 75 | private String remove(CharSequence text, Set toRemove) { 76 | StringBuilder remaining = new StringBuilder(); 77 | Character.UnicodeScript last = null; 78 | for (int i=0; i counts) { 98 | long max = 0L; 99 | for (Long aLong : counts.values()) { 100 | if (aLong > max) max = aLong; 101 | } 102 | return max; 103 | } 104 | 105 | private Map countByScript(CharSequence text) { 106 | Map counter = new HashMap<>(); 107 | Character.UnicodeScript last = null; 108 | for (int i=0; i counter, Character.UnicodeScript unicodeScript) { 130 | Long number = counter.get(unicodeScript); 131 | if (number==null) { 132 | counter.put(unicodeScript, 1L); 133 | } else { 134 | counter.put(unicodeScript, number+1); 135 | } 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/text/TextObject.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you 
may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.text; 18 | 19 | import com.optimaize.langdetect.cybozu.util.CharNormalizer; 20 | import com.google.common.annotations.Beta; 21 | import org.jetbrains.annotations.NotNull; 22 | 23 | import java.io.IOException; 24 | import java.io.Reader; 25 | 26 | /** 27 | * A convenient text object implementing CharSequence and Appendable. 28 | * 29 | * This is an ideal object to use for learning text to create {@link com.optimaize.langdetect.profiles.LanguageProfile}s, 30 | * as well as to pass it in to {@link com.optimaize.langdetect.LanguageDetector#detect}. 31 | * 32 | * To get one, use a TextObjectFactory (through a TextObjectFactoryBuilder). 33 | * 34 | * Example use: 35 | * //create the factory once: 36 | * TextObjectFactory textObjectFactory = new TextObjectFactoryBuilder() 37 | * .withTextFilter(UrlTextFilter.getInstance()) 38 | * .build(); 39 | * //then create as many text objects as you like: 40 | * TextObject inputText = textObjectFactory.create().append("deutsche Text").append(" ").append("blah blah"); 41 | * 42 | * All append() methods go through the {@code textFilter}. 43 | * 44 | * Equals/hashCode are not implemented as of now on purpose. You may want to call toString() and compare that. 
45 | * 46 | * @author Fabian Kessler 47 | */ 48 | @Beta 49 | public class TextObject implements CharSequence, Appendable { 50 | 51 | @NotNull 52 | private final TextFilter textFilter; 53 | 54 | @NotNull 55 | private final StringBuilder stringBuilder; 56 | 57 | private final int maxTextLength; 58 | 59 | 60 | /** 61 | * @param maxTextLength 0 for no limit 62 | */ 63 | public TextObject(@NotNull TextFilter textFilter, int maxTextLength) { 64 | this.textFilter = textFilter; 65 | this.maxTextLength = maxTextLength; 66 | this.stringBuilder = new StringBuilder(); 67 | } 68 | 69 | 70 | /** 71 | * Append the target text for language detection. 72 | * This method read the text from specified input reader. 73 | * If the total size of target text exceeds the limit size, 74 | * the rest is ignored. 75 | * 76 | * @param reader the input reader (BufferedReader as usual) 77 | * @throws java.io.IOException Can't read the reader. 78 | */ 79 | public TextObject append(Reader reader) throws IOException { 80 | char[] buf = new char[1024]; 81 | while (reader.ready() && (maxTextLength==0 || stringBuilder.length()0 && stringBuilder.length()>=maxTextLength) return this; 98 | 99 | text = textFilter.filter(text); 100 | 101 | //unfortunately this code can't be put into a TextFilter because: 102 | //1) the limit could not be detected early, a lot of work would be done to waste time and memory 103 | //2) the last character of the existing string builder could not be seen. if it is a space, we don't want 104 | // to add yet another space. 105 | char pre = stringBuilder.length()==0 ? 
0 : stringBuilder.charAt(stringBuilder.length()-1); 106 | for (int i=0; i languageProfiles = new LanguageProfileReader().readAllBuiltIn(); 48 | 49 | shortDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) 50 | .shortTextAlgorithm(100) 51 | .withProfiles(languageProfiles) 52 | .build(); 53 | 54 | longDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) 55 | .shortTextAlgorithm(0) 56 | .withProfiles(new LanguageProfileReader().readAllBuiltIn()) 57 | .build(); 58 | } 59 | 60 | @Test(dataProvider = "shortCleanTexts") 61 | public void shortTextAlgo(String expectedLanguage, CharSequence text) throws IOException { 62 | assertEquals(shortDetector.getProbabilities(text).get(0).getLocale().getLanguage(), expectedLanguage); 63 | //the detect() method doesn't have enough confidence for all these short texts. 64 | } 65 | 66 | @Test(dataProvider = "shortCleanTexts") 67 | public void longTextAlgoWorkingOnShortText(String expectedLanguage, CharSequence text) throws IOException { 68 | assertEquals(longDetector.getProbabilities(text).get(0).getLocale().getLanguage(), expectedLanguage); 69 | //the detect() method doesn't have enough confidence for all these short texts. 
70 | } 71 | 72 | @Test(dataProvider = "longerWikipediaTexts") 73 | public void longTextAlgoWorkingOnLongText(String expectedLanguage, CharSequence text) throws IOException { 74 | assertEquals(longDetector.getProbabilities(text).get(0).getLocale().getLanguage(), expectedLanguage); 75 | assertEquals(longDetector.detect(text).get().getLanguage(), expectedLanguage); 76 | } 77 | 78 | @DataProvider 79 | protected Object[][] shortCleanTexts() { 80 | return new Object[][] { 81 | {"en", shortCleanText("This is some English text.")}, 82 | {"fr", shortCleanText("Ceci est un texte français.")}, 83 | {"nl", shortCleanText("Dit is een Nederlandse tekst.")}, 84 | {"de", shortCleanText("Dies ist eine deutsche Text")}, 85 | {"km", shortCleanText("សព្វវចនាធិប្បាយសេរីសម្រាប់អ្នកទាំងអស់គ្នា។" + "នៅក្នុងវិគីភីឌាភាសាខ្មែរឥឡូវនេះមាន ១១៩៨រូបភាព សមាជិក១៥៣៣៣នាក់ និងមាន៤៥៨៣អត្ថបទ។")}, 86 | {"bg", shortCleanText("Европа не трябва да стартира нов конкурентен маратон и изход с приватизация")}, 87 | {"wa", shortCleanText("Çouchal c' est on tecse pår e walon.")}, 88 | }; 89 | } 90 | private CharSequence shortCleanText(CharSequence text) { 91 | return CommonTextObjectFactories.forDetectingShortCleanText().forText( text ); 92 | } 93 | 94 | @DataProvider 95 | protected Object[][] longerWikipediaTexts() { 96 | return new Object[][] { 97 | {"de", largeText(readText("/texts/de-wikipedia-Deutschland.txt"))}, 98 | {"fr", largeText(readText("/texts/fr-wikipedia-France.txt"))}, 99 | {"it", largeText(readText("/texts/it-wikipedia-Italia.txt"))}, 100 | }; 101 | } 102 | 103 | private CharSequence readText(String path) { 104 | try (InputStream inputStream = DataLanguageDetectorImplTest.class.getResourceAsStream(path)) { 105 | try (BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { 106 | StringBuilder sb = new StringBuilder(); 107 | String str; 108 | while ((str = in.readLine()) != null) { 109 | sb.append(str); 110 | } 111 | return sb.toString(); 112 | } 113 | 
} catch (IOException e) { 114 | throw new RuntimeException(e); 115 | } 116 | } 117 | 118 | private CharSequence largeText(CharSequence text) { 119 | return CommonTextObjectFactories.forDetectingOnLargeText().forText( text ); 120 | } 121 | 122 | 123 | 124 | 125 | } 126 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/profiles/LanguageProfileReaderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.profiles; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | import com.optimaize.langdetect.i18n.LdLocale; 21 | import org.junit.Test; 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.util.HashSet; 26 | import java.util.List; 27 | import java.util.Set; 28 | 29 | import static org.hamcrest.Matchers.*; 30 | import static org.junit.Assert.*; 31 | 32 | /** 33 | * @author Fabian Kessler 34 | * @author François ROLAND 35 | */ 36 | public class LanguageProfileReaderTest { 37 | 38 | private static final File PROFILE_DIR = new File(new File(new File(new File("src"), "main"), "resources"), "languages"); 39 | 40 | 41 | /* 42 | * In case someone creates new language profiles then these numbers need to be adjusted. 
43 | */ 44 | 45 | @Test 46 | public void readEnFile() throws IOException { 47 | checkProfileFile("en", 3, 2301, 26164, 3774627); 48 | } 49 | 50 | @Test 51 | public void readBnFile() throws IOException { 52 | checkProfileFile("bn", 3, 2846, 198, 22964); 53 | } 54 | 55 | @Test 56 | public void readFrFile() throws IOException { 57 | checkProfileFile("fr", 3, 2232, 6653, 1120211); 58 | } 59 | 60 | @Test 61 | public void readNlFile() throws IOException { 62 | checkProfileFile("nl", 3, 2163, 5640, 1373884); 63 | } 64 | 65 | private static void checkProfileFile(String language, int nWordSize, int freqSize, long minFreq, long maxFreq) throws IOException { 66 | File profileFile = new File(PROFILE_DIR, language); 67 | final LanguageProfile languageProfile = new LanguageProfileReader().read(profileFile); 68 | assertThat(languageProfile, is(notNullValue())); 69 | assertThat(languageProfile.getLocale().getLanguage(), is(equalTo(language))); 70 | assertEquals(languageProfile.getGramLengths().size(), nWordSize); 71 | assertEquals(languageProfile.getGramLengths(), ImmutableList.of(1, 2, 3)); 72 | assertEquals(languageProfile.getNumGrams(), freqSize); 73 | 74 | assertTrue(languageProfile.getMinGramCount(nWordSize) < languageProfile.getMaxGramCount(nWordSize)); 75 | assertEquals(languageProfile.getMinGramCount(nWordSize), minFreq); 76 | assertEquals(languageProfile.getMaxGramCount(nWordSize), maxFreq); 77 | } 78 | 79 | 80 | @Test 81 | public void readFromDir() throws IOException { 82 | List read = new LanguageProfileReader().read(ImmutableList.of("de", "fr")); 83 | assertEquals(read.size(), 2); 84 | } 85 | 86 | @Test 87 | public void readFromDirWithClassloader() throws IOException { 88 | List read = new LanguageProfileReader().read( 89 | LanguageProfileReaderTest.class.getClassLoader(), 90 | "languages", 91 | ImmutableList.of("de", "fr") 92 | ); 93 | assertEquals(read.size(), 2); 94 | } 95 | 96 | 97 | @Test 98 | public void read() throws IOException { 99 | List read = new 
LanguageProfileReader().read(ImmutableList.of("de", "fr")); 100 | assertEquals(read.size(), 2); 101 | } 102 | 103 | @Test 104 | public void read_folder() throws IOException { 105 | List read = new LanguageProfileReader().read("languages", ImmutableList.of("de", "fr")); 106 | assertEquals(read.size(), 2); 107 | } 108 | 109 | @Test 110 | public void read_classpathAndFolder() throws IOException { 111 | List read = new LanguageProfileReader().read(LanguageProfileReaderTest.class.getClassLoader(), "languages", ImmutableList.of("de", "fr")); 112 | assertEquals(read.size(), 2); 113 | } 114 | 115 | @Test 116 | public void readAllBuiltIn() throws IOException { 117 | verify_readAllBuiltIn(new LanguageProfileReader().readAllBuiltIn()); 118 | } 119 | private void verify_readAllBuiltIn(List profiles) { 120 | assertEquals(profiles.size(), 71); //adjust this number when adding more languages 121 | Set allLangs = new HashSet<>(); 122 | for (LanguageProfile profile : profiles) { 123 | assertFalse("Duplicate language: "+profile.getLocale(), allLangs.contains(profile.getLocale())); 124 | allLangs.add(profile.getLocale()); 125 | } 126 | assertTrue(allLangs.contains(LdLocale.fromString("de"))); 127 | assertTrue(allLangs.contains(LdLocale.fromString("zh-CN"))); 128 | assertTrue(allLangs.contains(LdLocale.fromString("zh-TW"))); 129 | } 130 | 131 | 132 | @Test 133 | public void loadProfilesFromClasspath() throws IOException { 134 | List result = new LanguageProfileReader().read(this.getClass().getClassLoader(), "languages", ImmutableList.of("en", "fr", "nl", "de")); 135 | assertEquals(result.size(), 4); 136 | } 137 | 138 | @Test 139 | public void loadProfilesFromFile() throws IOException { 140 | List result = new LanguageProfileReader().readAll(new File(new File(new File(new File("src"), "main"), "resources"), "languages")); 141 | assertEquals(result.size(), 71); //adjust this number when adding more languages 142 | } 143 | 144 | } 145 | 
-------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/profiles/BuiltInLanguages.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nicole Torres 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.profiles; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | import com.optimaize.langdetect.i18n.LdLocale; 21 | 22 | import java.util.ArrayList; 23 | import java.util.List; 24 | 25 | /** 26 | * @author Nicole Torres 27 | */ 28 | public class BuiltInLanguages { 29 | 30 | private static final List languages; 31 | private static final List shortTextLanguages; 32 | 33 | static { 34 | List names = new ArrayList<>(); 35 | 36 | //sorted alphabetically 37 | names.add(LdLocale.fromString("af")); 38 | names.add(LdLocale.fromString("an")); 39 | names.add(LdLocale.fromString("ar")); 40 | names.add(LdLocale.fromString("ast")); 41 | names.add(LdLocale.fromString("be")); 42 | names.add(LdLocale.fromString("bg")); 43 | names.add(LdLocale.fromString("bn")); 44 | names.add(LdLocale.fromString("br")); 45 | names.add(LdLocale.fromString("ca")); 46 | names.add(LdLocale.fromString("cs")); 47 | names.add(LdLocale.fromString("cy")); 48 | names.add(LdLocale.fromString("da")); 49 | names.add(LdLocale.fromString("de")); 50 | names.add(LdLocale.fromString("el")); 51 | 
names.add(LdLocale.fromString("en")); 52 | names.add(LdLocale.fromString("es")); 53 | names.add(LdLocale.fromString("et")); 54 | names.add(LdLocale.fromString("eu")); 55 | names.add(LdLocale.fromString("fa")); 56 | names.add(LdLocale.fromString("fi")); 57 | names.add(LdLocale.fromString("fr")); 58 | names.add(LdLocale.fromString("ga")); 59 | names.add(LdLocale.fromString("gl")); 60 | names.add(LdLocale.fromString("gu")); 61 | names.add(LdLocale.fromString("he")); 62 | names.add(LdLocale.fromString("hi")); 63 | names.add(LdLocale.fromString("hr")); 64 | names.add(LdLocale.fromString("ht")); 65 | names.add(LdLocale.fromString("hu")); 66 | names.add(LdLocale.fromString("id")); 67 | names.add(LdLocale.fromString("is")); 68 | names.add(LdLocale.fromString("it")); 69 | names.add(LdLocale.fromString("ja")); 70 | names.add(LdLocale.fromString("km")); 71 | names.add(LdLocale.fromString("kn")); 72 | names.add(LdLocale.fromString("ko")); 73 | names.add(LdLocale.fromString("lt")); 74 | names.add(LdLocale.fromString("lv")); 75 | names.add(LdLocale.fromString("mk")); 76 | names.add(LdLocale.fromString("ml")); 77 | names.add(LdLocale.fromString("mr")); 78 | names.add(LdLocale.fromString("ms")); 79 | names.add(LdLocale.fromString("mt")); 80 | names.add(LdLocale.fromString("ne")); 81 | names.add(LdLocale.fromString("nl")); 82 | names.add(LdLocale.fromString("no")); 83 | names.add(LdLocale.fromString("oc")); 84 | names.add(LdLocale.fromString("pa")); 85 | names.add(LdLocale.fromString("pl")); 86 | names.add(LdLocale.fromString("pt")); 87 | names.add(LdLocale.fromString("ro")); 88 | names.add(LdLocale.fromString("ru")); 89 | names.add(LdLocale.fromString("sk")); 90 | names.add(LdLocale.fromString("sl")); 91 | names.add(LdLocale.fromString("so")); 92 | names.add(LdLocale.fromString("sq")); 93 | names.add(LdLocale.fromString("sr")); 94 | names.add(LdLocale.fromString("sv")); 95 | names.add(LdLocale.fromString("sw")); 96 | names.add(LdLocale.fromString("ta")); 97 | 
names.add(LdLocale.fromString("te")); 98 | names.add(LdLocale.fromString("th")); 99 | names.add(LdLocale.fromString("tl")); 100 | names.add(LdLocale.fromString("tr")); 101 | names.add(LdLocale.fromString("uk")); 102 | names.add(LdLocale.fromString("ur")); 103 | names.add(LdLocale.fromString("vi")); 104 | names.add(LdLocale.fromString("wa")); 105 | names.add(LdLocale.fromString("yi")); 106 | names.add(LdLocale.fromString("zh-CN")); 107 | names.add(LdLocale.fromString("zh-TW")); 108 | 109 | languages = ImmutableList.copyOf(names); 110 | } 111 | 112 | static { 113 | List texts = new ArrayList<>(); 114 | texts.add("cs"); 115 | texts.add("da"); 116 | texts.add("de"); 117 | texts.add("en"); 118 | texts.add("es"); 119 | texts.add("fi"); 120 | texts.add("fr"); 121 | texts.add("id"); 122 | texts.add("it"); 123 | texts.add("nl"); 124 | texts.add("no"); 125 | texts.add("pl"); 126 | texts.add("pt"); 127 | texts.add("ro"); 128 | texts.add("sv"); 129 | texts.add("tr"); 130 | texts.add("vi"); 131 | shortTextLanguages = ImmutableList.copyOf(texts); 132 | } 133 | 134 | /** 135 | * Returns the languages for which the library provides full profiles. 136 | * Full profiles are generated from regular text, usually Wikipedia abstracts. 137 | * @return immutable 138 | */ 139 | public static List getLanguages() { 140 | return languages; 141 | } 142 | 143 | /** 144 | * Returns the languages for which the library provides profiles created from short text. 145 | * Twitter was used as source by @shuyo. 146 | * Far fewer languages have short text profiles as of now. 
147 | * @return immutable 148 | */ 149 | public static List getShortTextLanguages() { 150 | return shortTextLanguages; 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/cybozu/util/LangProfile.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Nakatani Shuyo 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * This file has been modified by François ROLAND. 19 | */ 20 | 21 | package com.optimaize.langdetect.cybozu.util; 22 | 23 | import org.jetbrains.annotations.NotNull; 24 | 25 | import java.io.Serializable; 26 | import java.util.HashMap; 27 | import java.util.Iterator; 28 | import java.util.Map; 29 | import java.util.Set; 30 | 31 | /** 32 | * {@link LangProfile} is a Language Profile Class. 33 | * Users don't use this class directly. 34 | * 35 | * TODO split into builder and immutable class. 36 | * 37 | * TODO currently this only makes n-grams with the space before a word included. no n-gram with the space after the word. 38 | * Example: "foo" creates " fo" as 3gram, but not "oo ". Either this is a bug, or if intended then needs documentation. 
39 | * 40 | * @author Nakatani Shuyo 41 | * @deprecated replaced by LanguageProfile 42 | */ 43 | @Deprecated 44 | public class LangProfile implements Serializable { 45 | 46 | private static final long serialVersionUID = 1L; 47 | 48 | /** 49 | * n-grams that occur less than this often can be removed using omitLessFreq(). 50 | * This number can change, see LESS_FREQ_RATIO. 51 | */ 52 | private static final int MINIMUM_FREQ = 2; 53 | 54 | /** 55 | * Explanation by example: 56 | * 57 | * If the most frequent n-gram occurs 1 mio times, then 58 | * 1'000'000 / this (100'000) = 10. 59 | * 10 is larger than MINIMUM_FREQ (2), thus MINIMUM_FREQ remains at 2. 60 | * All n-grams that occur less than 2 times can be removed as noise using omitLessFreq(). 61 | * 62 | * If the most frequent n-gram occurs 5000 times, then 63 | * 5'000 / this (100'000) = 0.05. 64 | * 0.05 is smaller than MINIMUM_FREQ (2), thus MINIMUM_FREQ becomes 0. 65 | * No n-grams are removed because of insignificance when calling omitLessFreq(). 66 | */ 67 | private static final int LESS_FREQ_RATIO = 100000; 68 | 69 | /** 70 | * The language name (identifier). 71 | */ 72 | private String name = null; 73 | 74 | /** 75 | * Key = ngram, value = count. 76 | * All n-grams are in here (1-gram, 2-gram, 3-gram). 77 | */ 78 | private Map freq = new HashMap<>(); 79 | 80 | /** 81 | * Tells how many occurrences of n-grams exist per gram length. 82 | * When making 1grams, 2grams and 3grams (currently) then this contains 3 entries where 83 | * element 0 = number occurrences of 1-grams 84 | * element 1 = number occurrences of 2-grams 85 | * element 2 = number occurrences of 3-grams 86 | * Example: if there are 57 1-grams (English language has about that many) and the training text is 87 | * fairly long, then this number is in the millions. 
88 | */ 89 | private int[] nWords = new int[NGram.N_GRAM]; 90 | 91 | /** 92 | * Constructor for JSONIC 93 | */ 94 | public LangProfile() {} 95 | 96 | /** 97 | * Normal Constructor 98 | * @param name language name 99 | */ 100 | public LangProfile(String name) { 101 | this.setName(name); 102 | } 103 | 104 | /** 105 | * Add n-gram to profile 106 | * @param gram 107 | */ 108 | public void add(@NotNull String gram) { 109 | if (name == null) throw new IllegalStateException(); 110 | int len = gram.length(); 111 | if (len < 1 || len > NGram.N_GRAM) { 112 | throw new IllegalArgumentException("ngram length must be 1-3 but was "+len+": >>>"+gram+"<< keys = freq.keySet(); 139 | int roman = 0; 140 | for(Iterator i = keys.iterator(); i.hasNext(); ){ 141 | String key = i.next(); 142 | int count = freq.get(key); 143 | if (count <= threshold) { 144 | nWords[key.length()-1] -= count; 145 | i.remove(); 146 | } else { 147 | if (key.matches("^[A-Za-z]$")) { 148 | roman += count; 149 | } 150 | } 151 | } 152 | 153 | // roman check 154 | if (roman < nWords[0] / 3) { 155 | Set keys2 = freq.keySet(); 156 | for(Iterator i = keys2.iterator(); i.hasNext(); ){ 157 | String key = i.next(); 158 | if (key.matches(".*[A-Za-z].*")) { 159 | nWords[key.length()-1] -= freq.get(key); 160 | i.remove(); 161 | } 162 | } 163 | } 164 | } 165 | 166 | public String getName() { 167 | return name; 168 | } 169 | 170 | public void setName(String name) { 171 | this.name = name; 172 | } 173 | 174 | public Map getFreq() { 175 | return freq; 176 | } 177 | 178 | public void setFreq(Map freq) { 179 | this.freq = freq; 180 | } 181 | 182 | public int[] getNWords() { 183 | return nWords; 184 | } 185 | 186 | public void setNWords(int[] nWords) { 187 | this.nWords = nWords; 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # language-detector 2 | 3 | Language Detection Library 
for Java 4 | 5 | 6 | com.optimaize.languagedetector 7 | language-detector 8 | 0.6 9 | 10 | 11 | 12 | ## Language Support 13 | 14 | ### 71 Built-in Language Profiles 15 | 16 | 1. af Afrikaans 17 | 1. an Aragonese 18 | 1. ar Arabic 19 | 1. ast Asturian 20 | 1. be Belarusian 21 | 1. br Breton 22 | 1. ca Catalan 23 | 1. bg Bulgarian 24 | 1. bn Bengali 25 | 1. cs Czech 26 | 1. cy Welsh 27 | 1. da Danish 28 | 1. de German 29 | 1. el Greek 30 | 1. en English 31 | 1. es Spanish 32 | 1. et Estonian 33 | 1. eu Basque 34 | 1. fa Persian 35 | 1. fi Finnish 36 | 1. fr French 37 | 1. ga Irish 38 | 1. gl Galician 39 | 1. gu Gujarati 40 | 1. he Hebrew 41 | 1. hi Hindi 42 | 1. hr Croatian 43 | 1. ht Haitian 44 | 1. hu Hungarian 45 | 1. id Indonesian 46 | 1. is Icelandic 47 | 1. it Italian 48 | 1. ja Japanese 49 | 1. km Khmer 50 | 1. kn Kannada 51 | 1. ko Korean 52 | 1. lt Lithuanian 53 | 1. lv Latvian 54 | 1. mk Macedonian 55 | 1. ml Malayalam 56 | 1. mr Marathi 57 | 1. ms Malay 58 | 1. mt Maltese 59 | 1. ne Nepali 60 | 1. nl Dutch 61 | 1. no Norwegian 62 | 1. oc Occitan 63 | 1. pa Punjabi 64 | 1. pl Polish 65 | 1. pt Portuguese 66 | 1. ro Romanian 67 | 1. ru Russian 68 | 1. sk Slovak 69 | 1. sl Slovene 70 | 1. so Somali 71 | 1. sq Albanian 72 | 1. sr Serbian 73 | 1. sv Swedish 74 | 1. sw Swahili 75 | 1. ta Tamil 76 | 1. te Telugu 77 | 1. th Thai 78 | 1. tl Tagalog 79 | 1. tr Turkish 80 | 1. uk Ukrainian 81 | 1. ur Urdu 82 | 1. vi Vietnamese 83 | 1. wa Walloon 84 | 1. yi Yiddish 85 | 1. zh-cn Simplified Chinese 86 | 1. zh-tw Traditional Chinese 87 | 88 | User danielnaber has made available a profile for Esperanto on his website, see open tasks. 89 | 90 | There are two kinds of profiles. The standard ones created from Wikipedia articles and similar. 91 | And the "short text" profiles created from Twitter tweets. 
Fewer language profiles exist for the 92 | short text; more could be made available, see https://github.com/optimaize/language-detector/issues/57 93 | 94 | ### Other Languages 95 | 96 | You can create a language profile for your own language easily. 97 | See https://github.com/optimaize/language-detector/blob/master/src/main/resources/README.md 98 | 99 | 100 | ## How it Works 101 | 102 | The software uses language profiles which were created based on common text for each language. 103 | N-grams (http://en.wikipedia.org/wiki/N-gram) were then extracted from that text, and that's what is stored in the profiles. 104 | 105 | When trying to figure out in what language a certain text is written, the program goes through the same process: 106 | It creates the same kind of n-grams of the input text. Then it compares their relative frequencies, and finds the 107 | language that matches best. 108 | 109 | 110 | ### Challenges 111 | 112 | This software does not work as well when the input text to analyze is short or unclean, for example tweets. 113 | 114 | When a text is written in multiple languages, the default algorithm of this software is not appropriate. 115 | You can try to split the text (by sentence or paragraph) and detect the individual parts. Running the language guesser 116 | on the whole text will just tell you the language that is most dominant, in the best case. 117 | 118 | This software cannot handle it well when the input text is in none of the expected (and supported) languages. 119 | For example if you only load the language profiles from English and German, but the text is written in French, 120 | the program may pick the more likely one, or say it doesn't know. (An improvement would be to clearly detect that 121 | it's unlikely one of the supported languages.) 122 | 123 | If you are looking for a language detector / language guesser library in Java, this seems to be the best open source 124 | library you can get at this time. 
If it doesn't need to be Java, you may want to take a look at https://code.google.com/p/cld2/ 125 | 126 | 127 | ## How to Use 128 | 129 | #### Language Detection for your Text 130 | 131 | //load all languages: 132 | List languageProfiles = new LanguageProfileReader().readAllBuiltIn(); 133 | 134 | //build language detector: 135 | LanguageDetector languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()) 136 | .withProfiles(languageProfiles) 137 | .build(); 138 | 139 | //create a text object factory 140 | TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); 141 | 142 | //query: 143 | TextObject textObject = textObjectFactory.forText("my text"); 144 | Optional lang = languageDetector.detect(textObject); 145 | 146 | 147 | #### Creating Language Profiles for your Training Text 148 | 149 | See https://github.com/optimaize/language-detector/wiki/Creating-Language-Profiles 150 | 151 | 152 | ## How You Can Help 153 | 154 | If your language is not supported yet, then you can provide clean "training text", that is, common text written in your 155 | language. The text should be fairly long (a couple of pages at the very least). If you can provide that, please open 156 | a ticket. 157 | 158 | If your language is supported already, but not identified clearly all the time, you can still provide such training 159 | text. We might then be able to improve detection for your language. 160 | 161 | If you're a programmer, dig in the source and see what you can improve. Check the open tasks. 162 | 163 | 164 | ## Memory Consumption 165 | 166 | Loading all 71 language profiles uses 74MB ram to store the data in memory. 167 | For memory considerations see https://github.com/optimaize/language-detector/wiki/Memory-Consumption 168 | 169 | 170 | ## History and Changes 171 | 172 | This project is a fork of a fork, the original author is Nakatani Shuyo. 
173 | For details, see https://github.com/optimaize/language-detector/wiki/History-and-Changes 174 | 175 | 176 | ## Where it's used 177 | 178 | An adapted version of this is used by the http://www.NameAPI.org server. 179 | 180 | https://www.languagetool.org/ is proof-reading software for LibreOffice/OpenOffice, for the Desktop and for Firefox. 181 | 182 | 183 | 184 | ## License 185 | 186 | Apache 2 (business friendly) 187 | 188 | 189 | 190 | ## Authors 191 | 192 | Nakatani Shuyo, Fabian Kessler, Francois ROLAND, Robert Theis 193 | 194 | For details, see https://github.com/optimaize/language-detector/wiki/Authors 195 | 196 | 197 | ## For Maven Users 198 | 199 | The project is in Maven Central (http://search.maven.org/#artifactdetails%7Ccom.optimaize.languagedetector%7Clanguage-detector%7C0.4%7Cjar); this is the latest version: 200 | 201 | 202 | com.optimaize.languagedetector 203 | language-detector 204 | 0.6 205 | 206 | -------------------------------------------------------------------------------- /src/test/java/com/optimaize/langdetect/ngram/NgramExtractorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | import com.google.common.base.Stopwatch; 20 | import org.junit.Test; 21 | 22 | import java.util.*; 23 | 24 | import static org.junit.Assert.assertEquals; 25 | import static org.junit.Assert.assertTrue; 26 | 27 | /** 28 | * @author Fabian Kessler 29 | */ 30 | public class NgramExtractorTest { 31 | 32 | @Test 33 | public void extractGrams_1() { 34 | String text = "Foo bar"; 35 | List ngrams = NgramExtractor.gramLength(1).extractGrams(text); 36 | assertEquals(ngrams.size(), text.length()); 37 | assertEquals(ngrams, Arrays.asList("F","o","o"," ","b","a","r")); 38 | } 39 | 40 | @Test 41 | public void extractGrams_2() { 42 | String text = "Foo bar"; 43 | List ngrams = NgramExtractor.gramLength(2).extractGrams(text); 44 | assertEquals(ngrams.size(), text.length() -1); 45 | assertEquals(ngrams, Arrays.asList("Fo","oo","o "," b","ba","ar")); 46 | } 47 | 48 | @Test 49 | public void extractGrams_3() { 50 | String text = "Foo bar"; 51 | List ngrams = NgramExtractor.gramLength(3).extractGrams(text); 52 | assertEquals(ngrams.size(), text.length()-2); 53 | } 54 | 55 | @Test 56 | public void extractGrams_6() { 57 | String text = "Foo bar"; 58 | List ngrams = NgramExtractor.gramLength(6).extractGrams(text); 59 | assertEquals(ngrams.size(), text.length()-5); 60 | } 61 | 62 | @Test 63 | public void extractGrams_7() { 64 | String text = "Foo bar"; 65 | List ngrams = NgramExtractor.gramLength(7).extractGrams(text); 66 | assertEquals(ngrams.size(), text.length()-6); 67 | } 68 | 69 | @Test 70 | public void extractGrams_8() { 71 | String text = "Foo bar"; 72 | List ngrams = NgramExtractor.gramLength(8).extractGrams(text); 73 | assertTrue(ngrams.isEmpty()); 74 | } 75 | 76 | 77 | 78 | @Test 79 | public void stressTestAlgo2() { 80 | NgramExtractor ngramExtractor = NgramExtractor.gramLengths(1, 2, 3); 81 | String text = "Foo bar hello world and so on nana nunu dada dudu asdf asdf akewf köjvnawer aisdfj awejfr iajdsöfj ewi 
adjsköfjwei ajsdökfj ief asd"; 82 | Stopwatch stopwatch = Stopwatch.createStarted(); 83 | for (int i=0; i<100000; i++) { 84 | ngramExtractor.extractGrams(text); 85 | } 86 | System.out.println(stopwatch); //876.6ms 87 | } 88 | 89 | 90 | @Test 91 | public void extractGrams_threeSizesAtOnce() { 92 | String text = "Foo bar"; 93 | 94 | List expected = NgramExtractor.gramLengths(1, 2, 3).extractGrams(text); 95 | Collections.sort(expected); 96 | 97 | List separate = new ArrayList<>(); 98 | separate.addAll(NgramExtractor.gramLength(1).extractGrams(text)); 99 | separate.addAll(NgramExtractor.gramLength(2).extractGrams(text)); 100 | separate.addAll(NgramExtractor.gramLength(3).extractGrams(text)); 101 | Collections.sort(separate); 102 | 103 | assertEquals(expected, separate); 104 | } 105 | 106 | @Test 107 | public void extractGrams_threeSizesAtOnce_short() { 108 | List ngrams = NgramExtractor.gramLengths(1, 2, 3).extractGrams("a"); 109 | assertEquals(ngrams.size(), 1); 110 | 111 | ngrams = NgramExtractor.gramLengths(1, 2, 3).extractGrams(""); 112 | assertEquals(ngrams.size(), 0); 113 | } 114 | 115 | 116 | 117 | @Test 118 | public void extractCountedGrams_single_1() { 119 | Map grams = NgramExtractor.gramLength(1).extractCountedGrams("Foo"); 120 | assertEquals(grams.size(), 2); 121 | } 122 | 123 | @Test 124 | public void extractCountedGrams_single_2() { 125 | Map grams = NgramExtractor.gramLengths(2).extractCountedGrams("Foo bar"); 126 | assertEquals(grams.size(), 6); 127 | 128 | grams = NgramExtractor.gramLengths(2).extractCountedGrams("aaaa"); 129 | assertEquals(grams, Collections.singletonMap("aa",3)); 130 | } 131 | 132 | @Test 133 | public void extractCountedGrams_list_1() { 134 | String text = "Foo bar dies ist ein längerer deutscher Text, und Texte sind üblicherweise auch gerne gross geschrieben und so nämlich."; 135 | 136 | Map one = NgramExtractor.gramLength(1).extractCountedGrams(text); 137 | Map two = NgramExtractor.gramLengths(2).extractCountedGrams(text); 138 | 
Map three = NgramExtractor.gramLengths(3).extractCountedGrams(text); 139 | Map combined = new HashMap<>(); 140 | combined.putAll(one); 141 | combined.putAll(two); 142 | combined.putAll(three); 143 | 144 | Map combined2 = NgramExtractor.gramLengths(1, 2, 3).extractCountedGrams(text); 145 | assertEquals(combined, combined2); 146 | } 147 | 148 | 149 | @Test 150 | public void extractGramsWithPadding_1() { 151 | String text = "Foo bar"; 152 | List ngrams = NgramExtractor.gramLength(1).textPadding(' ').extractGrams(text); 153 | assertEquals(ngrams.size(), text.length()+2); 154 | assertEquals(ngrams, Arrays.asList(" ","F","o","o"," ","b","a","r"," ")); 155 | } 156 | 157 | @Test 158 | public void extractGramsWithPaddingAndFilter_1() { 159 | String text = "Foo bar"; 160 | List ngrams = NgramExtractor 161 | .gramLength(1) 162 | .filter(StandardNgramFilter.getInstance()) 163 | .textPadding(' ') 164 | .extractGrams(text); 165 | assertEquals(ngrams, Arrays.asList("F","o","o","b","a","r")); 166 | } 167 | 168 | @Test 169 | public void extractGramsWithPadding_2() { 170 | String text = "Foo bar"; 171 | List ngrams = NgramExtractor.gramLength(2).textPadding(' ').extractGrams(text); 172 | assertEquals(ngrams.size(), text.length() +1); 173 | assertEquals(ngrams, Arrays.asList(" F","Fo","oo","o "," b","ba","ar","r ")); 174 | } 175 | 176 | } 177 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/profiles/LanguageProfileImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.profiles; 18 | 19 | import com.google.common.collect.ImmutableMap; 20 | import com.google.common.collect.Iterables; 21 | import com.optimaize.langdetect.i18n.LdLocale; 22 | import org.jetbrains.annotations.NotNull; 23 | 24 | import java.util.*; 25 | 26 | /** 27 | *

This class is immutable.

*
* @author Fabian Kessler
*/
public final class LanguageProfileImpl implements LanguageProfile {

    /** The locale this profile describes. */
    @NotNull
    private final LdLocale locale;

    /**
     * Key = gram length, value = the n-grams of that length mapped to their occurrence count.
     * Both the outer map and every inner map are immutable copies, so the profile cannot
     * change after construction.
     */
    @NotNull
    private final Map<Integer, Map<String, Integer>> ngrams;

    /** Aggregates derived once from {@link #ngrams} at construction time. */
    @NotNull
    private final Stats stats;

    private static final class Stats {
        /**
         * Key = gram length (1-3 or so).
         * Value = number of all occurrences of these grams combined.
         */
        @NotNull
        private final Map<Integer, Long> numOccurrences;

        /**
         * Key = gram length (1-3 or so).
         * Value = number of occurrences of the n-gram that occurs the least often.
         * This can be 1, or larger if a cutoff was applied to remove infrequent grams.
         * There is no entry for a gram length that has no grams.
         */
        @NotNull
        private final Map<Integer, Long> minGramCounts;

        /**
         * Key = gram length (1-3 or so).
         * Value = number of occurrences of the n-gram that occurs the most often.
         * There is no entry for a gram length that has no grams.
         */
        @NotNull
        private final Map<Integer, Long> maxGramCounts;

        Stats(@NotNull Map<Integer, Long> numOccurrences,
              @NotNull Map<Integer, Long> minGramCounts,
              @NotNull Map<Integer, Long> maxGramCounts) {
            this.numOccurrences = ImmutableMap.copyOf(numOccurrences);
            this.minGramCounts = ImmutableMap.copyOf(minGramCounts);
            this.maxGramCounts = ImmutableMap.copyOf(maxGramCounts);
        }
    }


    /**
     * Use the builder.
     *
     * @param locale the locale of the profile.
     * @param ngrams key = gram length, value = n-gram to occurrence count. The content is copied,
     *               later changes to the passed maps do not affect this profile.
     */
    LanguageProfileImpl(@NotNull LdLocale locale,
                        @NotNull Map<Integer, Map<String, Integer>> ngrams) {
        this.locale = locale;
        this.ngrams = deepCopy(ngrams);
        // Compute the stats from our own copy so they can never disagree with the stored grams.
        this.stats = makeStats(this.ngrams);
    }

    /**
     * Copies the outer map and each inner map into immutable maps, keeping the iteration order.
     */
    private static Map<Integer, Map<String, Integer>> deepCopy(Map<Integer, Map<String, Integer>> source) {
        ImmutableMap.Builder<Integer, Map<String, Integer>> copy = ImmutableMap.builder();
        for (Map.Entry<Integer, Map<String, Integer>> entry : source.entrySet()) {
            copy.put(entry.getKey(), ImmutableMap.copyOf(entry.getValue()));
        }
        return copy.build();
    }

    /**
     * Computes the total, minimum and maximum occurrence counts per gram length.
     */
    private static Stats makeStats(Map<Integer, Map<String, Integer>> ngrams) {
        Map<Integer, Long> numOccurrences = new HashMap<>(6);
        Map<Integer, Long> minGramCounts = new HashMap<>(6);
        Map<Integer, Long> maxGramCounts = new HashMap<>(6);
        for (Map.Entry<Integer, Map<String, Integer>> entry : ngrams.entrySet()) {
            Collection<Integer> counts = entry.getValue().values();
            long total = 0;
            long min = Long.MAX_VALUE;
            long max = Long.MIN_VALUE;
            for (int count : counts) {
                total += count;
                min = Math.min(min, count);
                max = Math.max(max, count);
            }
            numOccurrences.put(entry.getKey(), total);
            // A gram length without grams has no min/max. Storing null here would make
            // ImmutableMap.copyOf() fail; the getters report 0 for a missing entry instead.
            if (!counts.isEmpty()) {
                minGramCounts.put(entry.getKey(), min);
                maxGramCounts.put(entry.getKey(), max);
            }
        }
        return new Stats(numOccurrences, minGramCounts, maxGramCounts);
    }


    @NotNull
    @Override
    public LdLocale getLocale() {
        return locale;
    }

    /**
     * @return the gram lengths contained in this profile, in ascending order, as a new modifiable list.
     */
    @NotNull @Override
    public List<Integer> getGramLengths() {
        List<Integer> lengths = new ArrayList<>(ngrams.keySet());
        Collections.sort(lengths);
        return lengths;
    }

    /**
     * @return the occurrence count of the gram, 0 if the gram is not in this profile.
     */
    @Override
    public int getFrequency(String gram) {
        Map<String, Integer> grams = ngrams.get(gram.length());
        if (grams == null) {
            return 0;
        }
        Integer freq = grams.get(gram);
        return (freq == null) ? 0 : freq;
    }

    /**
     * @return the number of distinct grams of the given length, 0 if there are none.
     * @throws IllegalArgumentException if gramLength is smaller than 1.
     */
    @Override
    public int getNumGrams(int gramLength) {
        if (gramLength < 1) {
            throw new IllegalArgumentException("" + gramLength);
        }
        Map<String, Integer> grams = ngrams.get(gramLength);
        return (grams == null) ? 0 : grams.size();
    }

    /**
     * @return the number of distinct grams over all gram lengths.
     */
    @Override
    public int getNumGrams() {
        int total = 0;
        for (Map<String, Integer> grams : ngrams.values()) {
            total += grams.size();
        }
        return total;
    }

    /**
     * @return the sum of the occurrence counts of all grams of that length, 0 if there are none.
     */
    @Override
    public long getNumGramOccurrences(int gramLength) {
        Long value = stats.numOccurrences.get(gramLength);
        return (value == null) ? 0 : value;
    }

    /**
     * @return the smallest occurrence count among the grams of that length, 0 if there are none.
     */
    @Override
    public long getMinGramCount(int gramLength) {
        Long value = stats.minGramCounts.get(gramLength);
        return (value == null) ? 0 : value;
    }

    /**
     * @return the largest occurrence count among the grams of that length, 0 if there are none.
     */
    @Override
    public long getMaxGramCount(int gramLength) {
        Long value = stats.maxGramCounts.get(gramLength);
        return (value == null) ? 0 : value;
    }


    /**
     * @return all grams of all lengths with their occurrence count, one gram length after the other.
     */
    @NotNull @Override
    public Iterable<Map.Entry<String, Integer>> iterateGrams() {
        List<Set<Map.Entry<String, Integer>>> entrySets = new ArrayList<>(ngrams.size());
        for (Map<String, Integer> grams : ngrams.values()) {
            entrySets.add(grams.entrySet());
        }
        return Iterables.concat(entrySets);
    }

    /**
     * @return the grams of the given length with their occurrence count, empty if this profile
     *         has no grams of that length.
     */
    @NotNull @Override
    public Iterable<Map.Entry<String, Integer>> iterateGrams(int gramLength) {
        Map<String, Integer> grams = ngrams.get(gramLength);
        if (grams == null) {
            // Consistent with the other accessors, an unknown gram length is "nothing", not an error.
            return Collections.<Map.Entry<String, Integer>>emptySet();
        }
        return grams.entrySet();
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("LanguageProfile{locale=");
        sb.append(locale);
        for (Integer gramLength : getGramLengths()) {
            sb.append(",");
            sb.append(gramLength);
            sb.append("-grams=");
            sb.append(getNumGrams(gramLength));
        }
        sb.append("}");
        return sb.toString();
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        LanguageProfileImpl that = (LanguageProfileImpl) o;
        // stats is derived from ngrams, so it does not take part in equality.
        return locale.equals(that.locale) && ngrams.equals(that.ngrams);
    }

    @Override
    public int hashCode() {
        int result = locale.hashCode();
        result = 31 * result + ngrams.hashCode();
        return result;
    }
}
-------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/ngram/NgramExtractor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.ngram; 18 | 19 | import com.google.common.collect.ImmutableList; 20 | import org.jetbrains.annotations.NotNull; 21 | import org.jetbrains.annotations.Nullable; 22 | 23 | import java.util.*; 24 | 25 | /** 26 | * Class for extracting n-grams out of a text. 27 | * 28 | * This class is immutable. 29 | * 30 | * @author Fabian Kessler 31 | */ 32 | public class NgramExtractor { 33 | 34 | @NotNull 35 | private final List gramLengths; 36 | @Nullable 37 | private final NgramFilter filter; 38 | @Nullable 39 | private final Character textPadding; 40 | 41 | public static NgramExtractor gramLength(int gramLength) { 42 | return new NgramExtractor(ImmutableList.of(gramLength), null, null); 43 | } 44 | public static NgramExtractor gramLengths(Integer... 
gramLength) { 45 | return new NgramExtractor(Arrays.asList(gramLength), null, null); 46 | } 47 | 48 | public NgramExtractor filter(NgramFilter filter) { 49 | return new NgramExtractor(this.gramLengths, filter, this.textPadding); 50 | } 51 | 52 | /** 53 | * To ensure having border grams, this character is added to the left and right of the text. 54 | * 55 | *

Example: when textPadding is a space ' ' then a text input "foo" becomes " foo ", ensuring that n-grams like " f" 56 | * are created.

57 | * 58 | *

If the text already has that character in that position (e.g. it already starts with it), it is not added there.

59 | * 60 | * @param textPadding for example a space ' '. 61 | */ 62 | public NgramExtractor textPadding(char textPadding) { 63 | return new NgramExtractor(this.gramLengths, this.filter, textPadding); 64 | } 65 | 66 | private NgramExtractor(@NotNull List gramLengths, @Nullable NgramFilter filter, @Nullable Character textPadding) { 67 | if (gramLengths.isEmpty()) throw new IllegalArgumentException(); 68 | this.gramLengths = ImmutableList.copyOf(gramLengths); 69 | this.filter = filter; 70 | this.textPadding = textPadding; 71 | } 72 | 73 | public List getGramLengths() { 74 | return gramLengths; 75 | } 76 | 77 | /** 78 | * Creates the n-grams for a given text in the order they occur. 79 | * 80 | *

Example: NgramExtractor.gramLength(2).extractGrams("Foo bar") => [Fo,oo,o , b,ba,ar]

81 | * 82 | * @param text 83 | * @return The grams, empty if the input was empty or if none for that gramLength fits. 84 | */ 85 | @NotNull 86 | public List extractGrams(@NotNull CharSequence text) { 87 | text = applyPadding(text); 88 | int len = text.length(); 89 | 90 | //the actual size will be totalNumGrams or less (filter) 91 | int totalNumGrams = 0; 92 | for (Integer gramLength : gramLengths) { 93 | int num = len - (gramLength - 1); 94 | if (num >= 1) { //yes can be negative 95 | totalNumGrams += num; 96 | } 97 | } 98 | if (totalNumGrams <= 0) { 99 | return Collections.emptyList(); 100 | } 101 | List grams = new ArrayList<>(totalNumGrams); 102 | 103 | for (Integer gramLength : gramLengths) { 104 | int numGrams = len - (gramLength -1); 105 | if (numGrams >= 1) { //yes can be negative 106 | for (int pos=0; pos extractCountedGrams(@NotNull CharSequence text) { 124 | text = applyPadding(text); 125 | int len = text.length(); 126 | 127 | int initialCapacity = 0; 128 | for (Integer gramLength : gramLengths) { 129 | initialCapacity += guessNumDistinctiveGrams(len, gramLength); 130 | } 131 | 132 | Map grams = new LinkedHashMap<>(initialCapacity); 133 | for (Integer gramLength : gramLengths) { 134 | _extractCounted(text, gramLength, len, grams); 135 | } 136 | return grams; 137 | } 138 | 139 | 140 | private void _extractCounted(CharSequence text, int gramLength, int len, Map grams) { 141 | int endPos = len - (gramLength -1); 142 | for (int pos=0; posThis is usually used to load built-in profiles, shipped with the jar.

58 | * 59 | * @param classLoader the ClassLoader to load the profiles from. Use {@code MyClass.class.getClassLoader()} 60 | * @param profileDirectory profile directory path inside the classpath. The default profiles are in "languages". 61 | * @param profileFileNames for example ["en", "fr", "de"]. 62 | */ 63 | public List read(ClassLoader classLoader, String profileDirectory, Collection profileFileNames) throws IOException { 64 | List loaded = new ArrayList<>(profileFileNames.size()); 65 | for (String profileFileName : profileFileNames) { 66 | String path = makePathForClassLoader(profileDirectory, profileFileName); 67 | try (InputStream in = classLoader.getResourceAsStream(path)) { 68 | if (in == null) { 69 | throw new IOException("No language file available named "+profileFileName+" at " + path + "!"); 70 | } 71 | loaded.add( read(in) ); 72 | } 73 | } 74 | return loaded; 75 | } 76 | 77 | private String makePathForClassLoader(String profileDirectory, String fileName) { 78 | //WITHOUT slash before the profileDirectory when using the classloader! 79 | return profileDirectory + '/' + fileName; 80 | } 81 | 82 | /** 83 | * Same as {@link #read(ClassLoader, String, java.util.Collection)} using the class loader of this class. 84 | */ 85 | public List read(String profileDirectory, Collection profileFileNames) throws IOException { 86 | return read(LanguageProfileReader.class.getClassLoader(), profileDirectory, profileFileNames); 87 | } 88 | 89 | /** 90 | * Same as {@link #read(ClassLoader, String, java.util.Collection)} using the class loader of this class, 91 | * and the default profiles directory of this library. 
92 | */ 93 | public List read(Collection profileFileNames) throws IOException { 94 | return read(LanguageProfileReader.class.getClassLoader(), PROFILES_DIR, profileFileNames); 95 | } 96 | 97 | @NotNull 98 | public LanguageProfile readBuiltIn(@NotNull LdLocale locale) throws IOException { 99 | String filename = makeProfileFileName(locale); 100 | String path = makePathForClassLoader(PROFILES_DIR, filename); 101 | try (InputStream in = LanguageProfileReader.class.getClassLoader().getResourceAsStream(path)) { 102 | if (in == null) { 103 | throw new IOException("No language file available named "+filename+" at " + path + "!"); 104 | } 105 | return read(in); 106 | } 107 | } 108 | 109 | @NotNull 110 | private String makeProfileFileName(@NotNull LdLocale locale) { 111 | return locale.toString(); 112 | } 113 | 114 | @NotNull 115 | public List readBuiltIn(@NotNull Collection languages) throws IOException { 116 | List profileNames = new ArrayList<>(); 117 | for (LdLocale locale : languages) { 118 | profileNames.add(makeProfileFileName(locale)); 119 | } 120 | return read(LanguageProfileReader.class.getClassLoader(), PROFILES_DIR, profileNames); 121 | } 122 | 123 | /** 124 | * @deprecated renamed to readAllBuiltIn() 125 | */ 126 | public List readAll() throws IOException { 127 | return readAllBuiltIn(); 128 | } 129 | /** 130 | * Reads all built-in language profiles from the "languages" folder (shipped with the jar). 131 | */ 132 | public List readAllBuiltIn() throws IOException { 133 | List loaded = new ArrayList<>(); 134 | for (LdLocale locale : BuiltInLanguages.getLanguages()) { 135 | loaded.add(readBuiltIn(locale)); 136 | } 137 | return loaded; 138 | } 139 | 140 | /** 141 | * Loads all profiles from the specified directory. 142 | * 143 | * Do not use this method for files distributed within a jar. 144 | * 145 | * @param path profile directory path 146 | * @return empty if there is no language file in it. 
147 | */ 148 | public List readAll(File path) throws IOException { 149 | if (!path.exists()) { 150 | throw new IOException("No such folder: "+path); 151 | } 152 | if (!path.canRead()) { 153 | throw new IOException("Folder not readable: "+path); 154 | } 155 | File[] listFiles = path.listFiles(new FileFilter() { 156 | @Override 157 | public boolean accept(File pathname) { 158 | return looksLikeLanguageProfileFile(pathname); 159 | } 160 | }); 161 | if (listFiles == null) { 162 | throw new IOException("Failed reading from folder: " + path); 163 | } 164 | 165 | List profiles = new ArrayList<>(listFiles.length); 166 | for (File file: listFiles) { 167 | if (!looksLikeLanguageProfileFile(file)) { 168 | continue; 169 | } 170 | profiles.add(read(file)); 171 | } 172 | return profiles; 173 | } 174 | 175 | private boolean looksLikeLanguageProfileFile(File file) { 176 | if (!file.isFile()) { 177 | return false; 178 | } 179 | return looksLikeLanguageProfileName(file.getName()); 180 | } 181 | private boolean looksLikeLanguageProfileName(String fileName) { 182 | if (fileName.contains(".")) { 183 | return false; 184 | } 185 | try { 186 | LdLocale.fromString(fileName); 187 | return true; 188 | } catch (Exception e) { 189 | return false; 190 | } 191 | } 192 | 193 | } 194 | -------------------------------------------------------------------------------- /src/main/java/com/optimaize/langdetect/i18n/LdLocale.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 Fabian Kessler 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.optimaize.langdetect.i18n; 18 | 19 | import com.google.common.base.Optional; 20 | import com.google.common.base.Splitter; 21 | import org.jetbrains.annotations.NotNull; 22 | 23 | import java.util.List; 24 | 25 | /** 26 | * A language-detector implementation of a Locale, similar to the java.util.Locale. 27 | * 28 | *

It represents an IETF BCP 47 tag, but does not implement all the features. Features can be added as needed.

29 | * 30 | *

It is constructed through the {@link #fromString} factory method. The {@link #toString()} method 31 | * produces a parseable and persistable string.

32 | * 33 | *

The class is immutable.

34 | * 35 | *

The java.util.Locale cannot be used because it has issues for historical reasons, notably the 36 | * script code conversion for Hebrew, Yiddish and Indonesian, and more. If one needs a Locale, 37 | * it is simple to create one based on this object.
38 | * The ICU ULocale cannot be used because a) it has issues too (for our use case) and b) we're not 39 | * using ICU in here [yet].

40 | * 41 | *

This class does not perform any modifications on the input. The input is used as is, and the getters 42 | * return it in exactly the same way. No standardization, canonicalization, cleaning.

43 | * 44 | *

The input is validated syntactically, but not for code existence. For example the script code must 45 | * be a valid ISO 15924 like "Latn" or "Cyrl", in correct case. But whether the code exists or not is not checked. 46 | * These code standards are not fixed, simply because regional entities like countries can change for political 47 | * reasons, and languages are living entities. Therefore certain codes may exist at some point in time only 48 | * (be introduced late, or be deprecated or removed, or even be re-assigned another meaning). 49 | * It is not up to us to decide whether Kosovo is a country in 2015 or not. 50 | * If one needs to only work with a certain range of acceptable codes, they can validate the codes through other 51 | * classes that have knowledge about the codes. 52 | *

53 | * 54 | *

Language: as for BCP 47, the ISO 639-1 code must be used if there is one. For example "fr" for French. 55 | * If not, the ISO 639-3 code should be used. It is highly discouraged to use 639-2. 56 | * Right now this class enforces a 2 or 3 char code, but this may be relaxed in the future.

57 | * 58 | *

Script: Only ISO 15924, no discussion.

59 | * 60 | *

Region: same as for BCP 47. That means ISO 3166-1 alpha-2 and "UN M.49". 61 | * I can imagine relaxing it in the future to also allow 3166-2 codes. 62 | * In most cases the "region" is a "country".

63 | * 64 | * @author fabian kessler 65 | */ 66 | public final class LdLocale { 67 | 68 | @NotNull 69 | private final String language; 70 | @NotNull 71 | private final Optional script; 72 | @NotNull 73 | private final Optional region; 74 | 75 | private LdLocale(@NotNull String language, @NotNull Optional script, @NotNull Optional region) { 76 | this.language = language; 77 | this.script = script; 78 | this.region = region; 79 | } 80 | 81 | /** 82 | * @param string The output of the toString() method. 83 | * @return either a new or possibly a cached (immutable) instance. 84 | */ 85 | @NotNull 86 | public static LdLocale fromString(@NotNull String string) { 87 | if (string==null || string.isEmpty()) throw new IllegalArgumentException("At least a language is required!"); 88 | 89 | String language = null; 90 | Optional script = null; 91 | Optional region = null; 92 | 93 | List strings = Splitter.on('-').splitToList(string); 94 | for (int i=0; i>>"+chunk+"<<>>"+s+"<< getScript() { 166 | return script; 167 | } 168 | 169 | /** 170 | * @return ISO 3166-1 or UN M.49 code, eg "DE" or 150, see class header. 171 | */ 172 | @NotNull 173 | public Optional getRegion() { 174 | return region; 175 | } 176 | 177 | 178 | 179 | @Override //generated-code 180 | public boolean equals(Object o) { 181 | if (this == o) return true; 182 | if (o == null || getClass() != o.getClass()) return false; 183 | 184 | LdLocale ldLocale = (LdLocale) o; 185 | 186 | if (!language.equals(ldLocale.language)) return false; 187 | if (!region.equals(ldLocale.region)) return false; 188 | if (!script.equals(ldLocale.script)) return false; 189 | 190 | return true; 191 | } 192 | 193 | @Override //generated-code 194 | public int hashCode() { 195 | int result = language.hashCode(); 196 | result = 31 * result + script.hashCode(); 197 | result = 31 * result + region.hashCode(); 198 | return result; 199 | } 200 | } 201 | --------------------------------------------------------------------------------