├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── enhancement_or_feature_request.md ├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── README.md ├── assets └── banner.png ├── elasticsearch ├── pom.xml └── src │ └── main │ ├── assemblies │ └── plugin.xml │ ├── java │ └── com │ │ └── infinilabs │ │ └── elasticsearch │ │ └── analysis │ │ ├── AnalysisPinyinPlugin.java │ │ ├── ESPinyinConfig.java │ │ ├── PinyinAbbreviationsTokenizerFactory.java │ │ ├── PinyinAnalyzerProvider.java │ │ ├── PinyinTokenFilterFactory.java │ │ └── PinyinTokenizerFactory.java │ └── resources │ └── plugin-descriptor.properties ├── opensearch ├── pom.xml └── src │ └── main │ ├── assemblies │ └── plugin.xml │ ├── java │ └── com │ │ └── infinilabs │ │ └── opensearch │ │ └── analysis │ │ ├── AnalysisPinyinPlugin.java │ │ ├── ESPinyinConfig.java │ │ ├── PinyinAbbreviationsTokenizerFactory.java │ │ ├── PinyinAnalyzerProvider.java │ │ ├── PinyinTokenFilterFactory.java │ │ └── PinyinTokenizerFactory.java │ └── resources │ └── plugin-descriptor.properties ├── pinyin-core ├── pom.xml └── src │ ├── main │ ├── java │ │ ├── com │ │ │ └── infinilabs │ │ │ │ └── pinyin │ │ │ │ └── analysis │ │ │ │ ├── ChineseUtil.java │ │ │ │ ├── ConfigErrorException.java │ │ │ │ ├── PinyinAlphabetTokenizer.java │ │ │ │ ├── PinyinAnalyzer.java │ │ │ │ ├── PinyinConfig.java │ │ │ │ ├── PinyinTokenFilter.java │ │ │ │ ├── PinyinTokenizer.java │ │ │ │ └── TermItem.java │ │ └── org │ │ │ └── nlpcn │ │ │ └── commons │ │ │ └── lang │ │ │ ├── pinyin │ │ │ ├── CaseType.java │ │ │ ├── Pinyin.java │ │ │ ├── PinyinFormat.java │ │ │ ├── PinyinFormatter.java │ │ │ ├── PinyinUtil.java │ │ │ ├── PinyinWord.java │ │ │ ├── ToneType.java │ │ │ └── YuCharType.java │ │ │ ├── tire │ │ │ ├── GetWord.java │ │ │ ├── SmartGetWord.java │ │ │ ├── domain │ │ │ │ ├── Forest.java │ │ │ │ ├── SmartForest.java │ │ │ │ └── Value.java │ │ │ └── library │ │ │ │ └── Library.java │ │ │ └── util │ │ │ ├── AnsjArrays.java │ │ │ ├── 
CollectionUtil.java │ │ │ ├── FileFinder.java │ │ │ ├── FileIterator.java │ │ │ ├── IOUtil.java │ │ │ ├── MD5.java │ │ │ ├── MapCount.java │ │ │ ├── MapFactory.java │ │ │ ├── MurmurHash.java │ │ │ ├── ObjConver.java │ │ │ ├── StringUtil.java │ │ │ ├── WordAlert.java │ │ │ ├── WordWeight.java │ │ │ ├── logging │ │ │ ├── JakartaCommonsLoggingImpl.java │ │ │ ├── Jdk14LoggingImpl.java │ │ │ ├── Log.java │ │ │ ├── Log4j2Impl.java │ │ │ ├── Log4jImpl.java │ │ │ ├── LogFactory.java │ │ │ ├── NoLoggingImpl.java │ │ │ ├── Resources.java │ │ │ └── SLF4JImpl.java │ │ │ └── tuples │ │ │ ├── Decade.java │ │ │ ├── Ennead.java │ │ │ ├── KeyValue.java │ │ │ ├── LabelValue.java │ │ │ ├── Octet.java │ │ │ ├── Pair.java │ │ │ ├── Quartet.java │ │ │ ├── Quintet.java │ │ │ ├── Septet.java │ │ │ ├── Sextet.java │ │ │ ├── Triplet.java │ │ │ ├── Tuple.java │ │ │ ├── Unit.java │ │ │ └── valueintf │ │ │ ├── IValue0.java │ │ │ ├── IValue1.java │ │ │ ├── IValue2.java │ │ │ ├── IValue3.java │ │ │ ├── IValue4.java │ │ │ ├── IValue5.java │ │ │ ├── IValue6.java │ │ │ ├── IValue7.java │ │ │ ├── IValue8.java │ │ │ ├── IValue9.java │ │ │ ├── IValueKey.java │ │ │ ├── IValueLabel.java │ │ │ └── IValueValue.java │ └── resources │ │ ├── pinyin.txt │ │ ├── pinyin_alphabet.dict │ │ └── polyphone.txt │ └── test │ ├── java │ ├── com │ │ └── infinilabs │ │ │ └── pinyin │ │ │ └── analysis │ │ │ ├── PinyinAlphabetTokenizerTest.java │ │ │ └── PinyinAnalysisTest.java │ └── org │ │ └── nlpcn │ │ └── commons │ │ └── lang │ │ ├── TestUtils.java │ │ ├── pinyin │ │ └── PinyinTest.java │ │ ├── tire │ │ └── splitWord │ │ │ ├── AllWordTest.java │ │ │ ├── ForestTest.java │ │ │ ├── GetWordTest.java │ │ │ ├── LibraryTest.java │ │ │ └── SmartGetWordTest.java │ │ └── util │ │ ├── FileFinderTest.java │ │ ├── IOUtilTest.java │ │ ├── StringUtilTest.java │ │ ├── WordAlertTest.java │ │ ├── WordWeightTest.java │ │ └── logging │ │ └── NLPLoggerTest.java │ └── resources │ ├── library.txt │ ├── log4j.properties │ ├── test.json │ └── 
test_pinyin.dic └── pom.xml /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a bug report to help fix a problem 4 | --- 5 | 6 | ### Description 7 | 8 | A description of what the bug is. 9 | 10 | ### Steps to reproduce 11 | 12 | 1. First step 13 | 2. Second step 14 | 3. Third step 15 | 16 | Provide any configuration or code snippets that help reproduce the issue. 17 | 18 | ### Expected behavior 19 | 20 | A description of what you expected to happen. 21 | 22 | ### Actual behavior 23 | 24 | A description of what happens instead. 25 | 26 | ### Environment 27 | 28 | - Versions: [e.g. Elasticsearch 8.0.0] 29 | - Operating system and version: [e.g. macOS 10.14, Windows 10, Ubuntu 18.04] 30 | - [Linux] Desktop Environment and/or Window Manager: [e.g. Gnome, LXDE, i3] 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement_or_feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement or feature request 3 | about: Suggest an enhancement or feature 4 | --- 5 | 6 | ### Problem description 7 | 8 | A description of a problem, workflow or integration that your suggestion would solve. 9 | If the problem is OS-specific, include that information here. 10 | 11 | ### Preferred solution 12 | 13 | A description of what changes should be made to solve the problem. 14 | 15 | ### Alternatives 16 | 17 | A description of any alternative solutions or enhancements considered. 18 | 19 | ### Additional Information (optional) 20 | 21 | If applicable, add screenshots to help demonstrate the problem or proposed solution. 22 | Code examples or related links are useful, too. 
23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | /work 3 | /logs 4 | /.idea 5 | /target 6 | .DS_Store 7 | *.iml 8 | /.project 9 | /.settings 10 | /.classpath 11 | /*.ipr 12 | /*.iws 13 | /*/target 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | jdk: 3 | - oraclejdk8 4 | install: true 5 | script: 6 | - sudo apt-get update && sudo apt-get install oracle-java8-installer 7 | - java -version 8 | language: java 9 | script: mvn clean package 10 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at contact@infini.ltd. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/infinilabs/analysis-pinyin/2d58347db2db6533bf31bd2d9be9c66b5e2c32a8/assets/banner.png -------------------------------------------------------------------------------- /elasticsearch/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | analysis-pinyin 7 | com.infinilabs 8 | 1.0 9 | 10 | 4.0.0 11 | 12 | elasticsearch-analysis-pinyin 13 | ${elasticsearch.version} 14 | Pinyin Analysis for Elasticsearch 15 | jar 16 | 17 | 9.0.0 18 | 1.8 19 | 20 | analysis-pinyin 21 | com.infinilabs.elasticsearch.analysis.AnalysisPinyinPlugin 22 | true 23 | UTF-8 24 | 25 | 26 | 27 | 28 | 29 | com.infinilabs 30 | pinyin-core 31 | ${project.parent.version} 32 | 33 | 34 | 35 | org.elasticsearch 36 | elasticsearch 37 | ${elasticsearch.version} 38 | 
compile 39 | 40 | 41 | 42 | 43 | 44 | 45 | maven-assembly-plugin 46 | 3.6.0 47 | 48 | false 49 | ${project.build.directory}/releases/ 50 | 51 | elasticsearch/src/main/assemblies/plugin.xml 52 | 53 | 54 | 55 | 56 | distro-assembly 57 | package 58 | 59 | single 60 | 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /elasticsearch/src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/../elasticsearch/src/main/resources/plugin-descriptor.properties 11 | 12 | true 13 | 14 | 15 | 16 | 17 | / 18 | true 19 | true 20 | 21 | org.elasticsearch:elasticsearch 22 | 23 | 24 | 25 | / 26 | true 27 | true 28 | 29 | org.apache.lucene:lucene-pinyin 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/com/infinilabs/elasticsearch/analysis/AnalysisPinyinPlugin.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.elasticsearch.analysis; 2 | 3 | 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.elasticsearch.index.analysis.*; 6 | import org.elasticsearch.indices.analysis.AnalysisModule; 7 | import org.elasticsearch.plugins.AnalysisPlugin; 8 | import org.elasticsearch.plugins.Plugin; 9 | 10 | import java.util.Collections; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | 15 | public class AnalysisPinyinPlugin extends Plugin implements AnalysisPlugin { 16 | 17 | @Override 18 | public Map> getTokenizers() { 19 | Map> extra = new HashMap<>(); 20 | extra.put("pinyin", PinyinTokenizerFactory::new); 21 | extra.put("pinyin_first_letter", PinyinAbbreviationsTokenizerFactory::new); 22 | return extra; 23 | } 24 | 25 | @Override 26 | public Map> getTokenFilters() { 27 | Map> extra = new HashMap<>(); 28 | extra.put("pinyin", 
PinyinTokenFilterFactory::new); 29 | return extra; 30 | } 31 | 32 | @Override 33 | public Map>> getAnalyzers() { 34 | return Collections.singletonMap("pinyin", PinyinAnalyzerProvider::new); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/com/infinilabs/elasticsearch/analysis/ESPinyinConfig.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.elasticsearch.analysis; 2 | 3 | import com.infinilabs.pinyin.analysis.PinyinConfig; 4 | import org.elasticsearch.common.settings.Settings; 5 | 6 | public class ESPinyinConfig extends PinyinConfig { 7 | public ESPinyinConfig() { 8 | } 9 | 10 | public ESPinyinConfig(Settings settings) { 11 | this.keepFirstLetter = settings.getAsBoolean("keep_first_letter", true); 12 | this.keepSeparateFirstLetter = settings.getAsBoolean("keep_separate_first_letter", false); 13 | this.keepFullPinyin = settings.getAsBoolean("keep_full_pinyin", true); 14 | this.keepJoinedFullPinyin = settings.getAsBoolean("keep_joined_full_pinyin", false); 15 | this.keepNoneChinese = settings.getAsBoolean("keep_none_chinese", true); 16 | this.keepNoneChineseTogether = settings.getAsBoolean("keep_none_chinese_together", true); 17 | this.noneChinesePinyinTokenize = settings.getAsBoolean("none_chinese_pinyin_tokenize", true); 18 | this.keepOriginal = settings.getAsBoolean("keep_original", false); 19 | this.LimitFirstLetterLength = settings.getAsInt("limit_first_letter_length", 16); 20 | this.lowercase = settings.getAsBoolean("lowercase", true); 21 | this.trimWhitespace = settings.getAsBoolean("trim_whitespace", true); 22 | this.keepNoneChineseInFirstLetter = settings.getAsBoolean("keep_none_chinese_in_first_letter", true); 23 | this.keepNoneChineseInJoinedFullPinyin = settings.getAsBoolean("keep_none_chinese_in_joined_full_pinyin", false); 24 | this.removeDuplicateTerm = settings.getAsBoolean("remove_duplicated_term", false); 25 
| this.fixedPinyinOffset = settings.getAsBoolean("fixed_pinyin_offset", false); 26 | this.ignorePinyinOffset = settings.getAsBoolean("ignore_pinyin_offset", true); 27 | this.keepSeparateChinese = settings.getAsBoolean("keep_separate_chinese", false); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/com/infinilabs/elasticsearch/analysis/PinyinAbbreviationsTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.elasticsearch.analysis; 2 | 3 | import com.infinilabs.pinyin.analysis.PinyinConfig; 4 | import com.infinilabs.pinyin.analysis.PinyinTokenizer; 5 | import org.apache.lucene.analysis.Tokenizer; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 10 | 11 | public class PinyinAbbreviationsTokenizerFactory extends AbstractTokenizerFactory { 12 | 13 | public PinyinAbbreviationsTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 14 | super(name); 15 | } 16 | 17 | @Override 18 | public Tokenizer create() { 19 | PinyinConfig config=new ESPinyinConfig(); 20 | config.keepFirstLetter=true; 21 | config.keepFullPinyin=false; 22 | config.keepNoneChinese=false; 23 | config.keepNoneChineseTogether=true; 24 | config.noneChinesePinyinTokenize=false; 25 | config.keepOriginal=false; 26 | config.lowercase=true; 27 | config.trimWhitespace=true; 28 | config.keepNoneChineseInFirstLetter=true; 29 | return new PinyinTokenizer(config); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/com/infinilabs/elasticsearch/analysis/PinyinAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package 
com.infinilabs.elasticsearch.analysis; 2 | 3 | import com.infinilabs.pinyin.analysis.PinyinAnalyzer; 4 | import com.infinilabs.pinyin.analysis.PinyinConfig; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 9 | import org.elasticsearch.injection.api.Inject; 10 | 11 | 12 | /* 13 | * Provider for the PinyinAnalyzer. 14 | */ 15 | public class PinyinAnalyzerProvider extends AbstractIndexAnalyzerProvider { 16 | 17 | private final PinyinAnalyzer analyzer; 18 | private PinyinConfig config; 19 | 20 | @Inject 21 | public PinyinAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 22 | super(name); 23 | config=new ESPinyinConfig(settings); 24 | analyzer = new PinyinAnalyzer(config); 25 | } 26 | 27 | @Override 28 | public PinyinAnalyzer get() { 29 | return this.analyzer; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/com/infinilabs/elasticsearch/analysis/PinyinTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.elasticsearch.analysis; 2 | 3 | 4 | import com.infinilabs.pinyin.analysis.PinyinConfig; 5 | import com.infinilabs.pinyin.analysis.PinyinTokenFilter; 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.index.IndexSettings; 10 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 11 | 12 | public class PinyinTokenFilterFactory extends AbstractTokenFilterFactory { 13 | private PinyinConfig config; 14 | 15 | 16 | public PinyinTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 17 | super(name); 18 | 
config=new ESPinyinConfig(settings); 19 | } 20 | 21 | @Override 22 | public TokenStream create(TokenStream tokenStream) { 23 | return new PinyinTokenFilter(tokenStream, config); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /elasticsearch/src/main/java/com/infinilabs/elasticsearch/analysis/PinyinTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.elasticsearch.analysis; 2 | 3 | import com.infinilabs.pinyin.analysis.PinyinConfig; 4 | import com.infinilabs.pinyin.analysis.PinyinTokenizer; 5 | import org.apache.lucene.analysis.Tokenizer; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 10 | 11 | public class PinyinTokenizerFactory extends AbstractTokenizerFactory { 12 | 13 | private PinyinConfig config; 14 | 15 | public PinyinTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 16 | super(name); 17 | config=new ESPinyinConfig(settings); 18 | } 19 | 20 | @Override 21 | public Tokenizer create() { 22 | return new PinyinTokenizer(config); 23 | } 24 | } 25 | 26 | -------------------------------------------------------------------------------- /elasticsearch/src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 
6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # .jar <-- classes, resources, dependencies 21 | # .jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=${project.version} 38 | # 39 | # 'name': the plugin name 40 | name=${plugin.name} 41 | 42 | # 43 | # 'classname': the name of the class to load, fully-qualified. 44 | classname=${elasticsearch.plugin.classname} 45 | # 46 | # 'java.version' version of java the code is built against 47 | # use the system property java.specification.version 48 | # version string must be a sequence of nonnegative decimal integers 49 | # separated by "."'s and may have leading zeros 50 | java.version=${maven.compiler.target} 51 | # 52 | # 'elasticsearch.version' version of elasticsearch compiled against 53 | # You will have to release a new version of the plugin for each new 54 | # elasticsearch release. This version is checked when the plugin 55 | # is loaded so Elasticsearch will refuse to start in the presence of 56 | # plugins with the incorrect elasticsearch.version. 
57 | elasticsearch.version=${elasticsearch.version} 58 | -------------------------------------------------------------------------------- /opensearch/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | analysis-pinyin 7 | com.infinilabs 8 | 1.0 9 | 10 | 4.0.0 11 | opensearch-analysis-pinyin 12 | ${opensearch.version} 13 | Pinyin Analysis for OpenSearch 14 | jar 15 | 16 | 17 | 2.0.1 18 | 1.8 19 | analysis-pinyin 20 | com.infinilabs.opensearch.analysis.AnalysisPinyinPlugin 21 | UTF-8 22 | 23 | 24 | 25 | 26 | com.infinilabs 27 | pinyin-core 28 | ${project.parent.version} 29 | 30 | 31 | org.opensearch 32 | opensearch 33 | ${opensearch.version} 34 | compile 35 | 36 | 37 | 38 | 39 | 40 | 41 | maven-assembly-plugin 42 | 3.6.0 43 | 44 | false 45 | ${project.build.directory}/releases/ 46 | 47 | opensearch/src/main/assemblies/plugin.xml 48 | 49 | 50 | 51 | 52 | distro-assembly 53 | package 54 | 55 | single 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /opensearch/src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | plugin 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/../opensearch/src/main/resources/plugin-descriptor.properties 11 | 12 | true 13 | 14 | 15 | 16 | 17 | / 18 | true 19 | true 20 | 21 | org.opensearch:opensearch 22 | 23 | 24 | 25 | / 26 | true 27 | true 28 | 29 | org.apache.lucene:lucene-pinyin 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /opensearch/src/main/java/com/infinilabs/opensearch/analysis/AnalysisPinyinPlugin.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.opensearch.analysis; 2 | 3 | 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.opensearch.index.analysis.*; 6 | import 
org.opensearch.indices.analysis.AnalysisModule; 7 | import org.opensearch.plugins.AnalysisPlugin; 8 | import org.opensearch.plugins.Plugin; 9 | 10 | import java.util.Collections; 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | 15 | public class AnalysisPinyinPlugin extends Plugin implements AnalysisPlugin { 16 | 17 | @Override 18 | public Map> getTokenizers() { 19 | Map> extra = new HashMap<>(); 20 | extra.put("pinyin", PinyinTokenizerFactory::new); 21 | extra.put("pinyin_first_letter", PinyinAbbreviationsTokenizerFactory::new); 22 | return extra; 23 | } 24 | 25 | @Override 26 | public Map> getTokenFilters() { 27 | Map> extra = new HashMap<>(); 28 | extra.put("pinyin", PinyinTokenFilterFactory::new); 29 | return extra; 30 | } 31 | 32 | @Override 33 | public Map>> getAnalyzers() { 34 | return Collections.singletonMap("pinyin", PinyinAnalyzerProvider::new); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /opensearch/src/main/java/com/infinilabs/opensearch/analysis/ESPinyinConfig.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.opensearch.analysis; 2 | 3 | import com.infinilabs.pinyin.analysis.PinyinConfig; 4 | import org.opensearch.common.settings.Settings; 5 | 6 | public class ESPinyinConfig extends PinyinConfig { 7 | public ESPinyinConfig() { 8 | } 9 | 10 | public ESPinyinConfig(Settings settings) { 11 | this.keepFirstLetter = settings.getAsBoolean("keep_first_letter", true); 12 | this.keepSeparateFirstLetter = settings.getAsBoolean("keep_separate_first_letter", false); 13 | this.keepFullPinyin = settings.getAsBoolean("keep_full_pinyin", true); 14 | this.keepJoinedFullPinyin = settings.getAsBoolean("keep_joined_full_pinyin", false); 15 | this.keepNoneChinese = settings.getAsBoolean("keep_none_chinese", true); 16 | this.keepNoneChineseTogether = settings.getAsBoolean("keep_none_chinese_together", true); 17 | 
this.noneChinesePinyinTokenize = settings.getAsBoolean("none_chinese_pinyin_tokenize", true); 18 | this.keepOriginal = settings.getAsBoolean("keep_original", false); 19 | this.LimitFirstLetterLength = settings.getAsInt("limit_first_letter_length", 16); 20 | this.lowercase = settings.getAsBoolean("lowercase", true); 21 | this.trimWhitespace = settings.getAsBoolean("trim_whitespace", true); 22 | this.keepNoneChineseInFirstLetter = settings.getAsBoolean("keep_none_chinese_in_first_letter", true); 23 | this.keepNoneChineseInJoinedFullPinyin = settings.getAsBoolean("keep_none_chinese_in_joined_full_pinyin", false); 24 | this.removeDuplicateTerm = settings.getAsBoolean("remove_duplicated_term", false); 25 | this.fixedPinyinOffset = settings.getAsBoolean("fixed_pinyin_offset", false); 26 | this.ignorePinyinOffset = settings.getAsBoolean("ignore_pinyin_offset", true); 27 | this.keepSeparateChinese = settings.getAsBoolean("keep_separate_chinese", false); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /opensearch/src/main/java/com/infinilabs/opensearch/analysis/PinyinAbbreviationsTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.opensearch.analysis; 2 | 3 | import com.infinilabs.pinyin.analysis.PinyinConfig; 4 | import com.infinilabs.pinyin.analysis.PinyinTokenizer; 5 | import org.apache.lucene.analysis.Tokenizer; 6 | import org.opensearch.common.settings.Settings; 7 | import org.opensearch.env.Environment; 8 | import org.opensearch.index.IndexSettings; 9 | import org.opensearch.index.analysis.AbstractTokenizerFactory; 10 | 11 | public class PinyinAbbreviationsTokenizerFactory extends AbstractTokenizerFactory { 12 | 13 | public PinyinAbbreviationsTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 14 | super(indexSettings, settings, name); 15 | } 16 | 17 | @Override 18 | public Tokenizer create() { 
19 | PinyinConfig config=new ESPinyinConfig(); 20 | config.keepFirstLetter=true; 21 | config.keepFullPinyin=false; 22 | config.keepNoneChinese=false; 23 | config.keepNoneChineseTogether=true; 24 | config.noneChinesePinyinTokenize=false; 25 | config.keepOriginal=false; 26 | config.lowercase=true; 27 | config.trimWhitespace=true; 28 | config.keepNoneChineseInFirstLetter=true; 29 | return new PinyinTokenizer(config); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /opensearch/src/main/java/com/infinilabs/opensearch/analysis/PinyinAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.opensearch.analysis; 2 | 3 | import com.infinilabs.pinyin.analysis.PinyinAnalyzer; 4 | import com.infinilabs.pinyin.analysis.PinyinConfig; 5 | import org.opensearch.common.inject.Inject; 6 | import org.opensearch.common.settings.Settings; 7 | import org.opensearch.env.Environment; 8 | import org.opensearch.index.IndexSettings; 9 | import org.opensearch.index.analysis.AbstractIndexAnalyzerProvider; 10 | 11 | /** 12 | * 13 | */ 14 | public class PinyinAnalyzerProvider extends AbstractIndexAnalyzerProvider { 15 | 16 | private final PinyinAnalyzer analyzer; 17 | private PinyinConfig config; 18 | 19 | @Inject 20 | public PinyinAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 21 | super(indexSettings, name, settings); 22 | config = new ESPinyinConfig(settings); 23 | analyzer = new PinyinAnalyzer(config); 24 | } 25 | 26 | @Override 27 | public PinyinAnalyzer get() { 28 | return this.analyzer; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /opensearch/src/main/java/com/infinilabs/opensearch/analysis/PinyinTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.opensearch.analysis; 2 | 3 | 4 | 
import com.infinilabs.pinyin.analysis.PinyinConfig;
import com.infinilabs.pinyin.analysis.PinyinTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenFilterFactory;

/**
 * Token filter factory: reads a pinyin configuration from the filter settings
 * once and wraps every incoming stream in a {@link PinyinTokenFilter} that
 * shares it.
 */
public class PinyinTokenFilterFactory extends AbstractTokenFilterFactory {

    /** Configuration parsed from the settings passed to the constructor. */
    private PinyinConfig config;

    public PinyinTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.config = new ESPinyinConfig(settings);
    }

    /**
     * @param tokenStream upstream tokens to be filtered
     * @return {@code tokenStream} wrapped in a pinyin filter
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        final PinyinTokenFilter filter = new PinyinTokenFilter(tokenStream, this.config);
        return filter;
    }
}

--------------------------------------------------------------------------------
/opensearch/src/main/java/com/infinilabs/opensearch/analysis/PinyinTokenizerFactory.java:
--------------------------------------------------------------------------------
package com.infinilabs.opensearch.analysis;

import com.infinilabs.pinyin.analysis.PinyinConfig;
import com.infinilabs.pinyin.analysis.PinyinTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.opensearch.common.settings.Settings;
import org.opensearch.env.Environment;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.analysis.AbstractTokenizerFactory;

/**
 * Tokenizer factory: reads a pinyin configuration from the tokenizer settings
 * once and passes it to every {@link PinyinTokenizer} it creates.
 */
public class PinyinTokenizerFactory extends AbstractTokenizerFactory {

    /** Configuration parsed from the settings passed to the constructor. */
    private PinyinConfig config;

    public PinyinTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, settings, name);
        this.config = new ESPinyinConfig(settings);
    }

    /** @return a new tokenizer that uses this factory's configuration */
    @Override
    public Tokenizer create() {
        final PinyinTokenizer tokenizer = new PinyinTokenizer(this.config);
        return tokenizer;
    }
}
25
| 26 | -------------------------------------------------------------------------------- /opensearch/src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # The OpenSearch Contributors require contributions made to 5 | # this file be licensed under the Apache-2.0 license or a 6 | # compatible open source license. 7 | # 8 | # Modifications Copyright OpenSearch Contributors. See 9 | # GitHub history for details. 10 | # 11 | 12 | # OpenSearch plugin descriptor file 13 | # This file must exist as 'plugin-descriptor.properties' inside a plugin. 14 | # 15 | ### example plugin for "foo" 16 | # 17 | # foo.zip <-- zip file for the plugin, with this structure: 18 | # |____ .jar <-- classes, resources, dependencies 19 | # |____ .jar <-- any number of jars 20 | # |____ plugin-descriptor.properties <-- example contents below: 21 | # 22 | # classname=foo.bar.BazPlugin 23 | # description=My cool plugin 24 | # version=6.0 25 | # opensearch.version=6.0 26 | # java.version=1.8 27 | # 28 | ### mandatory elements for all plugins: 29 | # 30 | # 'description': simple summary of the plugin 31 | description=${description} 32 | # 33 | # 'version': plugin's version 34 | version=${project.version} 35 | # 36 | # 'name': the plugin name 37 | name=${plugin.name} 38 | # 39 | # 'classname': the name of the class to load, fully-qualified 40 | classname=${opensearch.plugin.classname} 41 | # 42 | # 'java.version': version of java the code is built against 43 | # use the system property java.specification.version 44 | # version string must be a sequence of nonnegative decimal integers 45 | # separated by "."'s and may have leading zeros 46 | java.version=${maven.compiler.target} 47 | # 48 | # 'opensearch.version': semantic version of opensearch the plugin is compatible with 49 | # does not include -SNAPSHOT if compiled against a snapshot build 50 | 
opensearch.version=${opensearch.version} 51 | # 52 | ### optional elements for plugins: 53 | # 54 | # 'custom.foldername': the custom name of the folder in which the plugin is installed 55 | custom.foldername= 56 | # 57 | # 'extended.plugins': other plugins this plugin extends through SPI 58 | extended.plugins= 59 | # 60 | # 'has.native.controller': whether or not the plugin has a native controller 61 | has.native.controller= 62 | -------------------------------------------------------------------------------- /pinyin-core/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | analysis-pinyin 7 | com.infinilabs 8 | 1.0 9 | 10 | 4.0.0 11 | 12 | pinyin-core 13 | jar 14 | 15 | 16 | 1.8 17 | 1.8 18 | UTF-8 19 | 20 | 21 | 22 | 23 | org.apache.lucene 24 | lucene-core 25 | provided 26 | ${lucene.version} 27 | 28 | 29 | org.apache.lucene 30 | lucene-analysis-common 31 | provided 32 | ${lucene.version} 33 | 34 | 35 | 36 | org.slf4j 37 | slf4j-api 38 | 1.7.7 39 | provided 40 | 41 | 42 | 43 | commons-logging 44 | commons-logging 45 | 1.2 46 | provided 47 | 48 | 49 | 50 | log4j 51 | log4j 52 | 1.2.17 53 | provided 54 | 55 | 56 | org.apache.logging.log4j 57 | log4j-api 58 | 2.5 59 | provided 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/com/infinilabs/pinyin/analysis/ChineseUtil.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.pinyin.analysis; 2 | 3 | import org.nlpcn.commons.lang.util.StringUtil; 4 | 5 | import java.util.ArrayList; 6 | import java.util.Collections; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | 10 | public class ChineseUtil { 11 | /** 12 | * 汉字始 13 | */ 14 | public static char CJK_UNIFIED_IDEOGRAPHS_START = '\u4E00'; 15 | /** 16 | * 汉字止 17 | */ 18 | public static char CJK_UNIFIED_IDEOGRAPHS_END = '\u9FA5'; 19 | 20 | public static List 
segmentChinese(String str){ 21 | if (StringUtil.isBlank(str)) { 22 | return Collections.emptyList(); 23 | } 24 | 25 | List lists = str.length()<=32767?new ArrayList<>(str.length()):new LinkedList<>(); 26 | for (int i=0;i=CJK_UNIFIED_IDEOGRAPHS_START&&c<=CJK_UNIFIED_IDEOGRAPHS_END){ 29 | lists.add(String.valueOf(c)); 30 | } 31 | else{ 32 | lists.add(null); 33 | } 34 | 35 | } 36 | return lists; 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/com/infinilabs/pinyin/analysis/ConfigErrorException.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.pinyin.analysis; 2 | 3 | /** 4 | * Created by medcl on 16/8/22. 5 | */ 6 | public class ConfigErrorException extends RuntimeException { 7 | private final String mesage; 8 | 9 | public ConfigErrorException(String message) { 10 | this.mesage=message; 11 | } 12 | public String getMessage() { 13 | return this.mesage; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/com/infinilabs/pinyin/analysis/PinyinAlphabetTokenizer.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.pinyin.analysis; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.util.*; 7 | 8 | /** 9 | * Created by medcl on 16/10/13. 
10 | */ 11 | public class PinyinAlphabetTokenizer { 12 | 13 | private static final int PINYIN_MAX_LENGTH = 6; 14 | 15 | public static List walk(String text) { 16 | return segPinyinStr(text); 17 | } 18 | 19 | private static List segPinyinStr(String content) { 20 | String pinyinStr = content; 21 | pinyinStr = pinyinStr.toLowerCase(); 22 | // 按非letter切分 23 | List pinyinStrList = splitByNoletter(pinyinStr); 24 | List pinyinList = new ArrayList<>(); 25 | for (String pinyinText : pinyinStrList) { 26 | if (pinyinText.length() == 1) { 27 | pinyinList.add(pinyinText); 28 | } else { 29 | List forward = positiveMaxMatch(pinyinText, PINYIN_MAX_LENGTH); 30 | if (forward.size() == 1) { // 前向只切出1个的话,没有必要再做逆向分词 31 | pinyinList.addAll(forward); 32 | } else { 33 | // 分别正向、逆向最大匹配,选出最短的作为最优结果 34 | List backward = reverseMaxMatch(pinyinText, PINYIN_MAX_LENGTH); 35 | if (forward.size() <= backward.size()) { 36 | pinyinList.addAll(forward); 37 | } else { 38 | pinyinList.addAll(backward); 39 | } 40 | } 41 | } 42 | } 43 | return pinyinList; 44 | } 45 | 46 | private static List splitByNoletter(String pinyinStr) { 47 | List pinyinStrList = new ArrayList<>(); 48 | StringBuffer sb = new StringBuffer(); 49 | boolean lastWord = true; 50 | for (char c : pinyinStr.toCharArray()) { 51 | if ((c > 96 && c < 123) || (c > 64 && c < 91)) { 52 | if (!lastWord){ 53 | pinyinStrList.add(sb.toString()); 54 | sb.setLength(0); 55 | } 56 | sb.append(c); 57 | lastWord = true; 58 | } else { 59 | if (lastWord & sb.length()>0) { 60 | pinyinStrList.add(sb.toString()); 61 | sb.setLength(0); 62 | } 63 | sb.append(c); 64 | lastWord = false; 65 | } 66 | } 67 | if (sb.length() > 0) { 68 | pinyinStrList.add(sb.toString()); 69 | } 70 | return pinyinStrList; 71 | 72 | } 73 | 74 | private static List positiveMaxMatch(String pinyinText, int maxLength) { 75 | 76 | List pinyinList = new ArrayList<>(); 77 | StringBuffer noMatchBuffer = new StringBuffer(); 78 | for (int start = 0; start < pinyinText.length(); ) { 79 | int end = 
start + maxLength; 80 | if (end > pinyinText.length()) { 81 | end = pinyinText.length(); 82 | } 83 | if (start == end) { 84 | break; 85 | } 86 | String sixStr = pinyinText.substring(start, end); 87 | boolean match = false; 88 | for (int j = 0; j < sixStr.length(); j++) { 89 | String guess = sixStr.substring(0, sixStr.length() - j); 90 | if (PinyinAlphabetDict.getInstance().match(guess)) { 91 | pinyinList.add(guess); 92 | start += guess.length(); 93 | match = true; 94 | break; 95 | } 96 | } 97 | if (!match) { //没命中,向后移动一位 98 | noMatchBuffer.append(sixStr.substring(0, 1)); 99 | start++; 100 | }else { // 命中,加上之前没命中的,并清空 101 | if (noMatchBuffer.length() > 0) { 102 | pinyinList.add(noMatchBuffer.toString()); 103 | noMatchBuffer.setLength(0); 104 | } 105 | } 106 | } 107 | if (noMatchBuffer.length() > 0) { 108 | pinyinList.add(noMatchBuffer.toString()); 109 | noMatchBuffer.setLength(0); 110 | } 111 | 112 | return pinyinList; 113 | } 114 | 115 | private static List reverseMaxMatch(String pinyinText, int maxLength) { 116 | List pinyinList = new ArrayList<>(); 117 | StringBuffer noMatchBuffer = new StringBuffer(); 118 | for (int end = pinyinText.length(); end >= 0; ) { 119 | int start = end - maxLength; 120 | if (start < 0) { 121 | start = 0; 122 | } 123 | if (start == end) { 124 | break; 125 | } 126 | boolean match = false; 127 | String sixStr = pinyinText.substring(start, end); 128 | for (int j = 0; j < sixStr.length(); j++) { 129 | String guess = sixStr.substring(j); 130 | if (PinyinAlphabetDict.getInstance().match(guess)) { 131 | pinyinList.add(guess); 132 | end -= guess.length(); 133 | match = true; 134 | break; 135 | } 136 | } 137 | if (!match) { //一个也没命中 138 | noMatchBuffer.append(sixStr.substring(sixStr.length() - 1)); 139 | end--; 140 | } else { 141 | if (noMatchBuffer.length() > 0) { 142 | pinyinList.add(noMatchBuffer.toString()); 143 | noMatchBuffer.setLength(0); 144 | } 145 | } 146 | } 147 | 148 | if (noMatchBuffer.length() > 0) { 149 | 
pinyinList.add(noMatchBuffer.toString()); 150 | noMatchBuffer.setLength(0); 151 | } 152 | // reverse 保持切词顺序 153 | Collections.reverse(pinyinList); 154 | return pinyinList; 155 | } 156 | 157 | 158 | } 159 | 160 | class PinyinAlphabetDict { 161 | 162 | private static final String fileName = "/pinyin_alphabet.dict"; 163 | 164 | private Set alphabet = new HashSet(); 165 | 166 | private static PinyinAlphabetDict instance; 167 | 168 | private PinyinAlphabetDict() { 169 | InputStream in = PinyinAlphabetDict.class.getResourceAsStream(fileName); 170 | BufferedReader reader = new BufferedReader(new InputStreamReader(in)); 171 | try { 172 | String line; 173 | while (null != (line = reader.readLine())) { 174 | if (line.trim().length() > 0) { 175 | alphabet.add(line); 176 | } 177 | } 178 | } catch (Exception ex) { 179 | throw new RuntimeException("read pinyin dic error.", ex); 180 | } finally { 181 | try { 182 | reader.close(); 183 | } catch (Exception ignored) { 184 | } 185 | } 186 | } 187 | 188 | public static PinyinAlphabetDict getInstance() { 189 | if (instance == null) { 190 | synchronized (PinyinAlphabetDict.class) { 191 | if (instance == null) { 192 | instance = new PinyinAlphabetDict(); 193 | } 194 | } 195 | } 196 | return instance; 197 | } 198 | 199 | public boolean match(String c) { 200 | return alphabet.contains(c); 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/com/infinilabs/pinyin/analysis/PinyinAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.pinyin.analysis; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | 5 | /** 6 | * Created by IntelliJ IDEA. 
7 | * User: Medcl' 8 | * Date: 12-5-22 9 | * Time: 上午10:39 10 | */ 11 | public final class PinyinAnalyzer extends Analyzer { 12 | 13 | private PinyinConfig config; 14 | 15 | public PinyinAnalyzer(PinyinConfig config) { 16 | this.config=config; 17 | } 18 | 19 | @Override 20 | protected TokenStreamComponents createComponents(String fieldName) { 21 | return new TokenStreamComponents(new PinyinTokenizer(config)); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/com/infinilabs/pinyin/analysis/PinyinConfig.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.pinyin.analysis; 2 | 3 | /** 4 | * Created by medcl on 15/11/26. 5 | */ 6 | public class PinyinConfig { 7 | 8 | public boolean lowercase=true; 9 | public boolean trimWhitespace=true; 10 | public boolean keepNoneChinese=true; 11 | public boolean keepNoneChineseInFirstLetter =true; 12 | public boolean keepNoneChineseInJoinedFullPinyin =false; 13 | public boolean keepOriginal=false; 14 | public boolean keepFirstLetter=true; 15 | public boolean keepSeparateFirstLetter=false; 16 | public boolean keepNoneChineseTogether=true; 17 | public boolean noneChinesePinyinTokenize =true; 18 | public int LimitFirstLetterLength=16; 19 | public boolean keepFullPinyin=true; 20 | public boolean keepJoinedFullPinyin =false; 21 | public boolean removeDuplicateTerm=false; 22 | public boolean fixedPinyinOffset =false; 23 | // after 6.0, offset is strictly constrained, overlapped tokens are not allowed, with this parameter, overlapped token will allowed by ignore offset, please note, all position related query or highlight will become incorrect, you should use multi fields and specify different settings for different query purpose. if you need offset, please set it to false. default: true. 
24 | public boolean ignorePinyinOffset =true; 25 | public boolean keepSeparateChinese=false; 26 | 27 | 28 | } 29 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/com/infinilabs/pinyin/analysis/TermItem.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.pinyin.analysis; 2 | 3 | /** 4 | * Created by IntelliJ IDEA. 5 | * User: Medcl' 6 | * Date: 12-5-21 7 | * Time: 下午5:53 8 | */ 9 | 10 | public class TermItem implements Comparable{ 11 | String term; 12 | int startOffset; 13 | int endOffset; 14 | int position; 15 | public TermItem(String term,int startOffset,int endOffset,int position){ 16 | this.term=term; 17 | this.startOffset=startOffset; 18 | this.endOffset=endOffset; 19 | this.position=position; 20 | } 21 | 22 | @Override 23 | public String toString() { 24 | return term; 25 | } 26 | 27 | @Override 28 | public int compareTo(TermItem o) { 29 | return this.position-o.position; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/pinyin/CaseType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * File : CaseType.java 3 | * Created : 2014年1月22日 4 | * By : luhuiguo 5 | */ 6 | package org.nlpcn.commons.lang.pinyin; 7 | 8 | /** 9 | * Define the output case of Pinyin string 10 | * 11 | *

12 | * This class provides several options for outputted cases of Pinyin string, 13 | * which are listed below. For example, Chinese character '民' 14 | * 15 | * 16 | * 17 | * 18 | * 19 | * 20 | * 21 | * 22 | * 23 | * 24 | * 25 | * 26 | * 27 | * 28 | * 29 | * 30 | * 31 | * 32 | *
* Options    | Output
 * LOWERCASE  | min2
 * UPPERCASE  | MIN2
 * CAPITALIZE | Min2
33 | * 34 | * @author luhuiguo 35 | */ 36 | public enum CaseType { 37 | 38 | /** 39 | * '民' -> min2 40 | */ 41 | LOWERCASE, 42 | 43 | /** 44 | * '民' -> MIN2 45 | */ 46 | UPPERCASE, 47 | 48 | /** 49 | * '民' -> Min2 50 | */ 51 | CAPITALIZE; 52 | 53 | } 54 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/pinyin/Pinyin.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.pinyin; 2 | 3 | import java.util.List; 4 | 5 | public class Pinyin { 6 | 7 | /** 8 | * 拼音返回 9 | * 10 | * @param str 11 | * @return [chang, jiang, cheng, zhang] 12 | */ 13 | 14 | public static List pinyin(String str) { 15 | return PinyinUtil.INSTANCE.convert(str, PinyinFormat.TONELESS_PINYIN_FORMAT); 16 | } 17 | 18 | /** 19 | * 取得每个字的首字符 20 | * 21 | * @param str 22 | * @return [c, j, c, z] 23 | */ 24 | public static List firstChar(String str) { 25 | return PinyinUtil.INSTANCE.convert(str, PinyinFormat.ABBR_PINYIN_FORMAT); 26 | } 27 | 28 | /** 29 | * 取得每个字的帶音標 30 | * 31 | * @param str 32 | * @return [cháng, jiāng, chéng, zhăng] 33 | */ 34 | public static List unicodePinyin(String str) { 35 | return PinyinUtil.INSTANCE.convert(str, PinyinFormat.UNICODE_PINYIN_FORMAT); 36 | } 37 | 38 | /** 39 | * 要音標的拼音 40 | * 41 | * @param str 42 | * @return [chang2, jiang1, cheng2, zhang3] 43 | */ 44 | public static List tonePinyin(String str) { 45 | return PinyinUtil.INSTANCE.convert(str, PinyinFormat.DEFAULT_PINYIN_FORMAT); 46 | } 47 | 48 | /** 49 | * list 转换为字符串 50 | * 51 | * @param list 52 | * @param spearator 53 | * @return 54 | */ 55 | public static String list2String(List list, String spearator) { 56 | StringBuilder sb = new StringBuilder(); 57 | boolean flag = true; 58 | for (String string : list) { 59 | if (string == null) { 60 | string = "NULL"; 61 | } 62 | 63 | if (flag) { 64 | sb.append(string); 65 | flag = false; 66 | } else { 67 | sb.append(spearator); 68 | 
sb.append(string); 69 | } 70 | 71 | } 72 | return sb.toString(); 73 | } 74 | 75 | /** 76 | * list 转换为字符串 默认空格 77 | * 78 | * @param list 79 | * @return 80 | */ 81 | public static String list2String(List list) { 82 | return list2String(list, " "); 83 | } 84 | 85 | /** 86 | * 动态增加到拼音词典中 87 | * 88 | * @param word 89 | * 大长今 90 | * @param pinyins 91 | * ['da4', 'chang2' ,'jing1'] 92 | */ 93 | public static void insertPinyin(String word, String[] pinyins) { 94 | PinyinUtil.INSTANCE.insertPinyin(word, pinyins); 95 | } 96 | 97 | /** 98 | * list 转换为字符串 默认空格,忽略null 99 | * 100 | * @param list 101 | * @return 102 | */ 103 | public static String list2StringSkipNull(List list) { 104 | return list2StringSkipNull(list, " "); 105 | } 106 | 107 | /** 108 | * list 转换为字符串 109 | * 110 | * @param list 111 | * @param spearator 112 | * @return 113 | */ 114 | public static String list2StringSkipNull(List list, String spearator) { 115 | StringBuilder sb = new StringBuilder(); 116 | boolean flag = true; 117 | for (String string : list) { 118 | if (string == null) { 119 | continue; 120 | } 121 | 122 | if (flag) { 123 | sb.append(string); 124 | flag = false; 125 | } else { 126 | sb.append(spearator); 127 | sb.append(string); 128 | } 129 | 130 | } 131 | return sb.toString(); 132 | } 133 | } -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/pinyin/PinyinFormat.java: -------------------------------------------------------------------------------- 1 | /** 2 | * File : PinyinFormat.java 3 | * Created : 2014年1月22日 4 | * By : luhuiguo 5 | */ 6 | package org.nlpcn.commons.lang.pinyin; 7 | 8 | /** 9 | * 10 | * @author luhuiguo 11 | */ 12 | public class PinyinFormat { 13 | 14 | public static final PinyinFormat DEFAULT_PINYIN_FORMAT = new PinyinFormat(); 15 | 16 | public static final PinyinFormat UNICODE_PINYIN_FORMAT = new PinyinFormat( 17 | YuCharType.WITH_U_UNICODE, ToneType.WITH_TONE_MARK); 18 | 19 | public static final 
PinyinFormat TONELESS_PINYIN_FORMAT = new PinyinFormat( 20 | YuCharType.WITH_V, ToneType.WITHOUT_TONE); 21 | 22 | public static final PinyinFormat ABBR_PINYIN_FORMAT = new PinyinFormat( 23 | YuCharType.WITH_U_AND_COLON, ToneType.WITH_ABBR, 24 | CaseType.LOWERCASE, "", true); 25 | 26 | private YuCharType yuCharType = YuCharType.WITH_U_AND_COLON; 27 | 28 | private ToneType toneType = ToneType.WITH_TONE_NUMBER; 29 | 30 | private CaseType caseType = CaseType.LOWERCASE; 31 | 32 | private String separator = " "; 33 | 34 | private boolean onlyPinyin = false; 35 | 36 | public PinyinFormat() { 37 | super(); 38 | } 39 | 40 | public PinyinFormat(YuCharType yuCharType, ToneType toneType, 41 | CaseType caseType, String separator, boolean onlyPinyin) { 42 | super(); 43 | this.yuCharType = yuCharType; 44 | this.toneType = toneType; 45 | this.caseType = caseType; 46 | this.separator = separator; 47 | this.onlyPinyin = onlyPinyin; 48 | } 49 | 50 | public PinyinFormat(YuCharType yuCharType, ToneType toneType, 51 | CaseType caseType, String separator) { 52 | this(yuCharType, toneType, caseType, separator, false); 53 | } 54 | 55 | public PinyinFormat(YuCharType yuCharType, ToneType toneType, 56 | CaseType caseType) { 57 | this(yuCharType, toneType, caseType, " "); 58 | } 59 | 60 | public PinyinFormat(YuCharType yuCharType, ToneType toneType) { 61 | this(yuCharType, toneType, CaseType.LOWERCASE); 62 | } 63 | 64 | public YuCharType getYuCharType() { 65 | return yuCharType; 66 | } 67 | 68 | public void setYuCharType(YuCharType yuCharType) { 69 | this.yuCharType = yuCharType; 70 | } 71 | 72 | public CaseType getCaseType() { 73 | return caseType; 74 | } 75 | 76 | public void setCaseType(CaseType caseType) { 77 | this.caseType = caseType; 78 | } 79 | 80 | public ToneType getToneType() { 81 | return toneType; 82 | } 83 | 84 | public void setToneType(ToneType toneType) { 85 | this.toneType = toneType; 86 | } 87 | 88 | public String getSeparator() { 89 | return separator; 90 | } 91 | 92 | 
public void setSeparator(String separator) { 93 | this.separator = separator; 94 | } 95 | 96 | public boolean isOnlyPinyin() { 97 | return onlyPinyin; 98 | } 99 | 100 | public void setOnlyPinyin(boolean onlyPinyin) { 101 | this.onlyPinyin = onlyPinyin; 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/pinyin/PinyinFormatter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * File : PinyinFormatter.java 3 | * Created : 2014年1月22日 4 | * By : luhuiguo 5 | */ 6 | package org.nlpcn.commons.lang.pinyin; 7 | 8 | /** 9 | * 10 | * @author luhuiguo 11 | */ 12 | public class PinyinFormatter { 13 | 14 | public static String formatPinyin(String pinyinStr, PinyinFormat format) { 15 | 16 | if (ToneType.WITH_ABBR == format.getToneType()) { 17 | 18 | pinyinStr = abbr(pinyinStr); 19 | 20 | } else { 21 | 22 | if ((ToneType.WITH_TONE_MARK == format.getToneType()) 23 | && ((YuCharType.WITH_V == format.getYuCharType()) || (YuCharType.WITH_U_AND_COLON == format 24 | .getYuCharType()))) { 25 | // ToneType.WITH_TONE_MARK force YuCharType.WITH_U_UNICODE 26 | format.setYuCharType(YuCharType.WITH_U_UNICODE); 27 | 28 | // throw new BadPinyinFormatException( 29 | // "tone marks cannot be added to v or u:"); 30 | } 31 | 32 | switch (format.getToneType()) { 33 | case WITHOUT_TONE: 34 | pinyinStr = pinyinStr.replaceAll("[1-5]", ""); 35 | break; 36 | case WITH_TONE_MARK: 37 | pinyinStr = pinyinStr.replaceAll("u:", "v"); 38 | pinyinStr = convertToneNumber2ToneMark(pinyinStr); 39 | break; 40 | 41 | default: 42 | break; 43 | 44 | } 45 | 46 | switch (format.getYuCharType()) { 47 | case WITH_V: 48 | pinyinStr = pinyinStr.replaceAll("u:", "v"); 49 | break; 50 | case WITH_U_UNICODE: 51 | pinyinStr = pinyinStr.replaceAll("u:", "ü"); 52 | break; 53 | 54 | default: 55 | break; 56 | 57 | } 58 | } 59 | 60 | switch (format.getCaseType()) { 61 | case 
UPPERCASE: 62 | pinyinStr = pinyinStr.toUpperCase(); 63 | break; 64 | case CAPITALIZE: 65 | pinyinStr = capitalize(pinyinStr); 66 | break; 67 | 68 | default: 69 | break; 70 | 71 | } 72 | 73 | return pinyinStr; 74 | } 75 | 76 | public static String abbr(String str) { 77 | 78 | if (str == null || str.length() == 0) { 79 | return str; 80 | } 81 | 82 | return str.substring(0, 1); 83 | } 84 | 85 | public static String capitalize(String str) { 86 | int strLen; 87 | if (str == null || (strLen = str.length()) == 0) { 88 | return str; 89 | } 90 | return new StringBuilder(strLen) 91 | .append(Character.toTitleCase(str.charAt(0))) 92 | .append(str.substring(1)).toString(); 93 | } 94 | 95 | /** 96 | * Convert tone numbers to tone marks using Unicode
97 | *
98 | * 99 | * Algorithm for determining location of tone mark
100 | * 101 | * A simple algorithm for determining the vowel on which the tone mark 102 | * appears is as follows:
103 | * 104 | *

    105 | *
  1. First, look for an "a" or an "e". If either vowel appears, it takes 106 | * the tone mark. There are no possible pinyin syllables that contain both 107 | * an "a" and an "e". 108 | * 109 | *
  2. If there is no "a" or "e", look for an "ou". If "ou" appears, then 110 | * the "o" takes the tone mark. 111 | * 112 | *
  3. If none of the above cases hold, then the last vowel in the syllable 113 | * takes the tone mark. 114 | * 115 | *
116 | * 117 | * @param pinyinStr 118 | * the ascii represention with tone numbers 119 | * @return the unicode represention with tone marks 120 | */ 121 | private static String convertToneNumber2ToneMark(final String pinyinStr) { 122 | String lowerCasePinyinStr = pinyinStr.toLowerCase(); 123 | if (lowerCasePinyinStr.matches("[a-z]*[1-5]?")) { 124 | final char defautlCharValue = '$'; 125 | final int defautlIndexValue = -1; 126 | 127 | char unmarkedVowel = defautlCharValue; 128 | int indexOfUnmarkedVowel = defautlIndexValue; 129 | 130 | final char charA = 'a'; 131 | final char charE = 'e'; 132 | final String ouStr = "ou"; 133 | final String allUnmarkedVowelStr = "aeiouv"; 134 | final String allMarkedVowelStr = "āáăàaēéĕèeīíĭìiōóŏòoūúŭùuǖǘǚǜü"; 135 | 136 | if (lowerCasePinyinStr.matches("[a-z]*[1-5]")) { 137 | 138 | int tuneNumber = Character.getNumericValue(lowerCasePinyinStr 139 | .charAt(lowerCasePinyinStr.length() - 1)); 140 | 141 | int indexOfA = lowerCasePinyinStr.indexOf(charA); 142 | int indexOfE = lowerCasePinyinStr.indexOf(charE); 143 | int ouIndex = lowerCasePinyinStr.indexOf(ouStr); 144 | 145 | if (-1 != indexOfA) { 146 | indexOfUnmarkedVowel = indexOfA; 147 | unmarkedVowel = charA; 148 | } else if (-1 != indexOfE) { 149 | indexOfUnmarkedVowel = indexOfE; 150 | unmarkedVowel = charE; 151 | } else if (-1 != ouIndex) { 152 | indexOfUnmarkedVowel = ouIndex; 153 | unmarkedVowel = ouStr.charAt(0); 154 | } else { 155 | for (int i = lowerCasePinyinStr.length() - 1; i >= 0; i--) { 156 | if (String.valueOf(lowerCasePinyinStr.charAt(i)) 157 | .matches("[" + allUnmarkedVowelStr + "]")) { 158 | indexOfUnmarkedVowel = i; 159 | unmarkedVowel = lowerCasePinyinStr.charAt(i); 160 | break; 161 | } 162 | } 163 | } 164 | 165 | if ((defautlCharValue != unmarkedVowel) 166 | && (defautlIndexValue != indexOfUnmarkedVowel)) { 167 | int rowIndex = allUnmarkedVowelStr.indexOf(unmarkedVowel); 168 | int columnIndex = tuneNumber - 1; 169 | 170 | int vowelLocation = rowIndex * 5 + 
columnIndex; 171 | 172 | char markedVowel = allMarkedVowelStr.charAt(vowelLocation); 173 | 174 | StringBuffer resultBuffer = new StringBuffer(); 175 | 176 | resultBuffer.append(lowerCasePinyinStr.substring(0, 177 | indexOfUnmarkedVowel).replaceAll("v", "ü")); 178 | resultBuffer.append(markedVowel); 179 | resultBuffer.append(lowerCasePinyinStr.substring( 180 | indexOfUnmarkedVowel + 1, 181 | lowerCasePinyinStr.length() - 1).replaceAll("v", 182 | "ü")); 183 | 184 | return resultBuffer.toString(); 185 | 186 | } else 187 | // error happens in the procedure of locating vowel 188 | { 189 | return lowerCasePinyinStr; 190 | } 191 | } else 192 | // input string has no any tune number 193 | { 194 | // only replace v with ü (umlat) character 195 | return lowerCasePinyinStr.replaceAll("v", "ü"); 196 | } 197 | } else 198 | // bad format 199 | { 200 | return lowerCasePinyinStr; 201 | } 202 | } 203 | } 204 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/pinyin/PinyinUtil.java: -------------------------------------------------------------------------------- 1 | /** 2 | * File : Pinyin.java 3 | * Created : 2014年1月22日 4 | * By : luhuiguo 5 | */ 6 | package org.nlpcn.commons.lang.pinyin; 7 | 8 | import java.io.BufferedInputStream; 9 | import java.io.BufferedReader; 10 | import java.io.IOException; 11 | import java.io.InputStreamReader; 12 | import java.nio.charset.StandardCharsets; 13 | import java.util.ArrayList; 14 | import java.util.Collections; 15 | import java.util.LinkedList; 16 | import java.util.List; 17 | 18 | import org.nlpcn.commons.lang.tire.SmartGetWord; 19 | import org.nlpcn.commons.lang.tire.domain.SmartForest; 20 | import org.nlpcn.commons.lang.util.StringUtil; 21 | 22 | /** 23 | * 24 | * @author luhuiguo 25 | * @author ansj 26 | */ 27 | enum PinyinUtil { 28 | 29 | INSTANCE; 30 | 31 | public static final String PINYIN_MAPPING_FILE = "/pinyin.txt"; 32 | public static final String 
POLYPHONE_MAPPING_FILE = "/polyphone.txt"; 33 | 34 | public static final String EMPTY = ""; 35 | public static final String SHARP = "#"; 36 | public static final String EQUAL = "="; 37 | public static final String COMMA = ","; 38 | public static final String SPACE = " "; 39 | 40 | public static final char CJK_UNIFIED_IDEOGRAPHS_START = '\u4E00'; 41 | public static final char CJK_UNIFIED_IDEOGRAPHS_END = '\u9FA5'; 42 | 43 | private List pinyinDict = null; 44 | 45 | private SmartForest polyphoneDict = null; 46 | 47 | private int maxLen = 2; 48 | 49 | PinyinUtil() { 50 | loadPinyinMapping(); 51 | loadPolyphoneMapping(); 52 | } 53 | 54 | public void loadPinyinMapping() { 55 | 56 | pinyinDict = new ArrayList(); 57 | 58 | try { 59 | BufferedReader in = new BufferedReader( 60 | new InputStreamReader(new BufferedInputStream(getClass().getResourceAsStream(PINYIN_MAPPING_FILE)), StandardCharsets.UTF_8)); 61 | String line = null; 62 | while (null != (line = in.readLine())) { 63 | if (line.length() == 0 || line.startsWith(SHARP)) { 64 | continue; 65 | } 66 | String[] pair = line.split(EQUAL); 67 | 68 | if (pair.length < 2) { 69 | pinyinDict.add(EMPTY); 70 | } else { 71 | pinyinDict.add(pair[1]); 72 | } 73 | } 74 | 75 | in.close(); 76 | 77 | } catch (IOException e) { 78 | e.printStackTrace(); 79 | } 80 | } 81 | 82 | public void loadPolyphoneMapping() { 83 | 84 | polyphoneDict = new SmartForest(); 85 | 86 | try { 87 | BufferedReader in = new BufferedReader( 88 | new InputStreamReader(new BufferedInputStream(getClass().getResourceAsStream(POLYPHONE_MAPPING_FILE)), StandardCharsets.UTF_8)); 89 | 90 | String line = null; 91 | while (null != (line = in.readLine())) { 92 | // line = line.trim(); 93 | if (line.length() == 0 || line.startsWith(SHARP)) { 94 | continue; 95 | } 96 | String[] pair = line.split(EQUAL); 97 | 98 | if (pair.length < 2) { 99 | continue; 100 | } 101 | maxLen = maxLen < pair[0].length() ? 
pair[0].length() : maxLen; 102 | 103 | polyphoneDict.add(pair[0], pair[1].split(SPACE)); 104 | 105 | } 106 | 107 | in.close(); 108 | 109 | } catch (IOException e) { 110 | e.printStackTrace(); 111 | } 112 | } 113 | 114 | public String[] toUnformattedPinyin(char ch) { 115 | 116 | if (ch >= CJK_UNIFIED_IDEOGRAPHS_START && ch <= CJK_UNIFIED_IDEOGRAPHS_END) { 117 | String pinyinStr = pinyinDict.get(ch - CJK_UNIFIED_IDEOGRAPHS_START); 118 | return pinyinStr.split(COMMA); 119 | 120 | } else { 121 | return null; 122 | } 123 | } 124 | 125 | public String[] toFormattedPinyin(char ch, PinyinFormat format) { 126 | String[] pinyinStrArray = toUnformattedPinyin(ch); 127 | if (null != pinyinStrArray) { 128 | for (int i = 0; i < pinyinStrArray.length; i++) { 129 | pinyinStrArray[i] = PinyinFormatter.formatPinyin(pinyinStrArray[i], format); 130 | } 131 | return pinyinStrArray; 132 | } else 133 | return null; 134 | } 135 | 136 | public String toPinyin(char ch) { 137 | String[] pinyinStrArray = toUnformattedPinyin(ch); 138 | 139 | if (null != pinyinStrArray && pinyinStrArray.length > 0) { 140 | return pinyinStrArray[0]; 141 | } 142 | return null; 143 | } 144 | 145 | public String toPinyin(char ch, PinyinFormat format) { 146 | 147 | String[] pinyinStrArray = null; 148 | 149 | pinyinStrArray = toFormattedPinyin(ch, format); 150 | 151 | if (null != pinyinStrArray && pinyinStrArray.length > 0) { 152 | return pinyinStrArray[0]; 153 | } 154 | return null; 155 | } 156 | 157 | public List convert(String str, PinyinFormat format) { 158 | 159 | if (StringUtil.isBlank(str)) { 160 | return Collections.emptyList(); 161 | } 162 | 163 | SmartGetWord word = polyphoneDict.getWord(str); 164 | 165 | List lists = new LinkedList(); 166 | 167 | String temp = null; 168 | int beginOffe = 0; 169 | while ((temp = word.getFrontWords()) != null) { 170 | 171 | for (int i = beginOffe; i < word.offe; i++) { 172 | lists.add(toPinyin(str.charAt(i), format)); 173 | } 174 | 175 | for (String t : word.getParam()) { 176 
/**
 * A pinyin syllable split into its text and an optional trailing tone digit.
 */
public class PinyinWord {
    /** Syllable text without the trailing digit, or the whole input when it has none. */
    public String py;
    /** Value of the trailing digit; stays 0 when the input has no trailing digit. */
    public int tone;

    /**
     * Parses a syllable such as "bei3" into {@code py = "bei"}, {@code tone = 3}.
     * Input that does not end in a digit, including the empty string, is kept
     * whole in {@code py}.
     *
     * @param pinyinStr syllable text, optionally ending in one digit
     */
    PinyinWord(String pinyinStr) {
        int last = pinyinStr.length() - 1;
        // Guard the empty string: substring(0, -1) would throw.
        if (last >= 0) {
            char c = pinyinStr.charAt(last);
            if (c >= '0' && c <= '9') {
                this.py = pinyinStr.substring(0, last);
                this.tone = c - '0';
                return;
            }
        }
        this.py = pinyinStr;
    }

    /**
     * Wraps a single character as a toneless entry.
     *
     * @param c character stored as {@code py}
     */
    public PinyinWord(char c) {
        this.py = String.valueOf(c);
    }

    /**
     * @return {@code py} followed by the tone digit when the tone is greater
     *         than 0, otherwise {@code py} alone
     */
    @Override
    public String toString() {
        return tone > 0 ? this.py + tone : this.py;
    }

    public static void main(String[] args) {
        System.out.println(new PinyinWord("bei3"));
    }
}

12 | * Chinese has four pitched tones and a "toneless" tone. They are called Píng(平, 13 | * flat), Shǎng(上, rise), Qù(去, high drop), Rù(入, drop) and Qing(轻, toneless). 14 | * Usually, we use 1, 2, 3, 4 and 5 to represent them. 15 | * 16 | *

17 | * This class provides several options for output of Chinese tones, which are 18 | * listed below. For example, Chinese character '打' 19 | * 20 | * 21 | * 22 | * 23 | * 24 | * 25 | * 26 | * 27 | * 28 | * 29 | * 30 | * 31 | * 32 | * 33 | * 34 | * 35 | * 36 | * 37 | *
Options | Output
WITH_TONE_NUMBER | da3
WITHOUT_TONE | da
WITH_TONE_MARK | dǎ
38 | * 39 | * @author luhuiguo 40 | */ 41 | public enum ToneType { 42 | 43 | /** 44 | * '打' -> da3 45 | */ 46 | WITH_TONE_NUMBER, 47 | 48 | /** 49 | * '打' -> da 50 | */ 51 | WITHOUT_TONE, 52 | 53 | /** 54 | * '打' -> dǎ 55 | */ 56 | WITH_TONE_MARK, 57 | 58 | /** 59 | * '打' -> d 60 | */ 61 | WITH_ABBR; 62 | 63 | } 64 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/pinyin/YuCharType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * File : YuCharType.java 3 | * Created : 2014年1月22日 4 | * By : luhuiguo 5 | */ 6 | package org.nlpcn.commons.lang.pinyin; 7 | 8 | /** 9 | * Define the output format of character 'ü' 10 | * 11 | *

12 | * 'ü' is a special character of Hanyu Pinyin, which can not be simply 13 | * represented by English letters. In Hanyu Pinyin, such characters include 'ü', 14 | * 'üe', 'üan', and 'ün'. 15 | * 16 | *

17 | * This class provides several options for output of 'ü', which are listed 18 | * below. 19 | * 20 | * 21 | * 22 | * 23 | * 24 | * 25 | * 26 | * 27 | * 28 | * 29 | * 30 | * 31 | * 32 | * 33 | * 34 | * 35 | * 36 | * 37 | *
Options | Output
WITH_U_AND_COLON | u:
WITH_V | v
WITH_U_UNICODE | ü
38 | * 39 | * @author luhuiguo 40 | */ 41 | public enum YuCharType { 42 | 43 | /** 44 | * The option indicates that the output of 'ü' is "u:". 45 | */ 46 | WITH_U_AND_COLON, 47 | 48 | /** 49 | * The option indicates that the output of 'ü' is "v". 50 | */ 51 | WITH_V, 52 | 53 | /** 54 | * The option indicates that the output of 'ü' is "ü" in Unicode form. 55 | */ 56 | WITH_U_UNICODE; 57 | 58 | } 59 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/tire/GetWord.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.tire; 2 | 3 | import org.nlpcn.commons.lang.tire.domain.Forest; 4 | 5 | /** 6 | * 基本的string【】 类 7 | * 8 | * @author ansj 9 | * 10 | */ 11 | public class GetWord extends SmartGetWord { 12 | 13 | public GetWord(Forest forest, char[] chars) { 14 | super(forest, chars); 15 | } 16 | 17 | public GetWord(Forest forest, String content) { 18 | super(forest, content); 19 | } 20 | 21 | public String getParam(int i) { 22 | final String[] param = this.getParam(); 23 | if (param == null || i >= param.length) { 24 | return null; 25 | } else { 26 | return param[i]; 27 | } 28 | } 29 | 30 | public String[] getParams() { 31 | return this.getParam(); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/tire/SmartGetWord.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.tire; 2 | 3 | import org.nlpcn.commons.lang.tire.domain.SmartForest; 4 | 5 | public class SmartGetWord { 6 | private static final String EMPTYSTRING = ""; 7 | public int offe; 8 | byte status = 0; 9 | int root = 0; 10 | int i = this.root; 11 | boolean isBack = false; 12 | private SmartForest forest; 13 | private char[] chars; 14 | private String str; 15 | private int tempOffe; 16 | private T 
param; 17 | private SmartForest branch; 18 | 19 | public SmartGetWord(SmartForest forest, String content) { 20 | this.chars = content.toCharArray(); 21 | this.forest = forest; 22 | this.branch = forest; 23 | } 24 | 25 | public SmartGetWord(SmartForest forest, char[] chars) { 26 | this.chars = chars; 27 | this.forest = forest; 28 | this.branch = forest; 29 | } 30 | 31 | public String getAllWords() { 32 | String temp = this.allWords(); 33 | 34 | temp = checkNumberOrEnglish(temp); 35 | 36 | while (EMPTYSTRING.equals(temp)) { 37 | temp = this.allWords(); 38 | temp = checkNumberOrEnglish(temp); 39 | } 40 | return temp; 41 | } 42 | 43 | /** 44 | * 验证一个词语的左右边.不是英文和数字 45 | * 46 | * @param temp 47 | * @return 48 | */ 49 | private String checkNumberOrEnglish(String temp) { 50 | 51 | if (temp == null || temp == EMPTYSTRING) { 52 | return temp; 53 | } 54 | 55 | // 先验证最左面 56 | 57 | char l = temp.charAt(0); 58 | 59 | if (l < 127 && offe > 0) { 60 | if (checkSame(l, chars[offe - 1])) { 61 | return EMPTYSTRING; 62 | } 63 | } 64 | 65 | char r = l; 66 | 67 | if (temp.length() > 1) { 68 | r = temp.charAt(temp.length() - 1); 69 | } 70 | 71 | if (r < 127 && (offe + temp.length()) < chars.length) { 72 | if (checkSame(r, chars[offe + temp.length()])) { 73 | return EMPTYSTRING; 74 | } 75 | } 76 | 77 | return temp; 78 | } 79 | 80 | /** 81 | * 验证两个char是否都是数字或者都是英文 82 | * 83 | * @param l 84 | * @param c 85 | * @return 86 | */ 87 | private boolean checkSame(char l, char c) { 88 | 89 | if (isE(l) && isE(c)) { 90 | return true; 91 | } 92 | 93 | if (isNum(l) && isNum(c)) { 94 | return true; 95 | } 96 | 97 | return false; 98 | } 99 | 100 | public String getFrontWords() { 101 | String temp = null; 102 | do { 103 | temp = this.frontWords(); 104 | temp = checkNumberOrEnglish(temp); 105 | } while (EMPTYSTRING.equals(temp)); 106 | return temp; 107 | } 108 | 109 | private Integer tempJLen = null; 110 | 111 | private String allWords() { 112 | 113 | for (; i < chars.length;) { 114 | if (tempJLen == null) 
{ 115 | branch = branch.getBranch(chars[i]); 116 | } 117 | if (branch == null) { 118 | branch = forest; 119 | i++; 120 | continue; 121 | } 122 | 123 | for (int j = i + (tempJLen == null ? 0 : tempJLen); j < chars.length; j++) { 124 | if (j > i) { 125 | branch = branch.getBranch(chars[j]); 126 | } 127 | if (branch == null) { 128 | branch = forest; 129 | i++; 130 | tempJLen = null; 131 | return EMPTYSTRING; 132 | } 133 | 134 | switch (branch.getStatus()) { 135 | case 2: 136 | offe = i; 137 | param = branch.getParam(); 138 | tempJLen = j - i + 1; 139 | return new String(chars, i, j - i + 1); 140 | case 3: 141 | offe = i; 142 | param = branch.getParam(); 143 | branch = forest; 144 | tempJLen = null; 145 | i++; 146 | return new String(chars, i - 1, j - i + 2); 147 | } 148 | 149 | } 150 | 151 | i++; 152 | branch = forest; 153 | tempJLen = null; 154 | return EMPTYSTRING; 155 | 156 | } 157 | 158 | return null; 159 | 160 | } 161 | 162 | private String frontWords() { 163 | for (; this.i < this.chars.length + 1; this.i++) { 164 | if (i == chars.length) { 165 | this.branch = null; 166 | } else { 167 | this.branch = this.branch.getBranch(this.chars[this.i]); 168 | } 169 | if (this.branch == null) { 170 | this.branch = this.forest; 171 | if (this.isBack) { 172 | this.offe = this.root; 173 | this.str = new String(this.chars, this.root, this.tempOffe); 174 | if (this.str.length() == 0) { 175 | this.root += 1; 176 | this.i = this.root; 177 | } else { 178 | this.i = (this.root + this.tempOffe); 179 | this.root = this.i; 180 | } 181 | this.isBack = false; 182 | return this.str; 183 | } 184 | this.i = this.root; 185 | this.root += 1; 186 | } else { 187 | switch (this.branch.getStatus()) { 188 | case 2: 189 | this.isBack = true; 190 | this.tempOffe = (this.i - this.root + 1); 191 | this.param = this.branch.getParam(); 192 | break; 193 | case 3: 194 | this.offe = this.root; 195 | this.str = new String(this.chars, this.root, this.i - this.root + 1); 196 | String temp = this.str; 197 | 
this.param = this.branch.getParam(); 198 | this.branch = this.forest; 199 | this.isBack = false; 200 | if (temp.length() > 0) { 201 | this.i += 1; 202 | this.root = this.i; 203 | } else { 204 | this.i = (this.root + 1); 205 | } 206 | return this.str; 207 | } 208 | } 209 | } 210 | this.tempOffe += this.chars.length; 211 | return null; 212 | } 213 | 214 | public boolean isE(char c) { 215 | if ((c >= 'A') && (c <= 'z')) { 216 | return true; 217 | } 218 | return false; 219 | } 220 | 221 | public boolean isNum(char c) { 222 | if ((c >= '0') && (c <= '9')) { 223 | return true; 224 | } 225 | return false; 226 | } 227 | 228 | public void reset(String content) { 229 | this.offe = 0; 230 | this.status = 0; 231 | this.root = 0; 232 | this.i = this.root; 233 | this.isBack = false; 234 | this.tempOffe = 0; 235 | this.chars = content.toCharArray(); 236 | this.branch = this.forest; 237 | } 238 | 239 | /** 240 | * 参数 241 | * 242 | * @return 243 | */ 244 | public T getParam() { 245 | return this.param; 246 | } 247 | 248 | } 249 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/tire/domain/Forest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.tire.domain; 2 | 3 | import org.nlpcn.commons.lang.tire.GetWord; 4 | 5 | public class Forest extends SmartForest { 6 | 7 | private static final long serialVersionUID = -4616310486272978650L; 8 | 9 | public Forest() { 10 | }; 11 | 12 | public Forest(char c, int status, String[] param) { 13 | super(c, status, param); 14 | } 15 | 16 | public SmartForest get(char c) { 17 | return this.getBranch(c); 18 | } 19 | 20 | public SmartForest getBranch(char c) { 21 | return super.getBranch(c); 22 | } 23 | 24 | public GetWord getWord(String str) { 25 | return getWord(str.toCharArray()); 26 | } 27 | 28 | public GetWord getWord(char[] chars) { 29 | return new GetWord(this, chars); 30 | } 31 | 32 | public 
/**
 * A dictionary entry: a keyword plus zero or more string parameters, with a
 * tab-separated text form.
 */
public class Value {
    private static final String TAB = "\t";
    private String keyword;
    private String[] paramers = new String[0];

    /**
     * @param keyword  the entry's keyword
     * @param paramers parameters; a null array is treated as no parameters
     */
    public Value(String keyword, String... paramers) {
        this.keyword = keyword;
        if (paramers != null) {
            this.paramers = paramers;
        }
    }

    /**
     * Parses the tab-separated form produced by {@link #toString()}.
     *
     * @param temp keyword followed by tab-separated parameters
     */
    public Value(String temp) {
        String[] strs = temp.split(TAB);
        this.keyword = strs[0];
        if (strs.length > 1) {
            this.paramers = Arrays.copyOfRange(strs, 1, strs.length);
        }
    }

    public String getKeyword() {
        return keyword;
    }

    public void setKeyword(String keyword) {
        this.keyword = keyword;
    }

    public String[] getParamers() {
        return paramers;
    }

    /**
     * @param paramers new parameters; null is stored as an empty array, as in
     *                 the constructor, so {@link #toString()} cannot throw
     */
    public void setParamers(String[] paramers) {
        this.paramers = paramers == null ? new String[0] : paramers;
    }

    /**
     * @return the keyword followed by each parameter, each preceded by a tab
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(keyword);
        for (String paramer : paramers) {
            sb.append(TAB);
            sb.append(paramer);
        }
        return sb.toString();
    }

}
org.nlpcn.commons.lang.tire.domain.SmartForest; 10 | import org.nlpcn.commons.lang.tire.domain.Value; 11 | import org.nlpcn.commons.lang.util.IOUtil; 12 | 13 | public class Library { 14 | 15 | public static Forest makeForest(String path) throws Exception { 16 | return makeForest(new FileInputStream(path)); 17 | } 18 | 19 | public static Forest makeForest(String path, String encoding) throws Exception { 20 | return makeForest(new FileInputStream(path), encoding); 21 | } 22 | 23 | public static Forest makeForest(InputStream inputStream) throws Exception { 24 | return makeForest(IOUtil.getReader(inputStream, "UTF-8")); 25 | } 26 | 27 | public static Forest makeForest(InputStream inputStream, String encoding) throws Exception { 28 | return makeForest(IOUtil.getReader(inputStream, encoding)); 29 | } 30 | 31 | public static Forest makeForest(BufferedReader br) throws Exception { 32 | return makeLibrary(br, new Forest()); 33 | } 34 | 35 | /** 36 | * 传入value数组.构造树 37 | * 38 | * @param values 39 | * @return 40 | */ 41 | public static Forest makeForest(List values) { 42 | Forest forest = new Forest(); 43 | for (Value value : values) { 44 | insertWord(forest, value.toString()); 45 | } 46 | return forest; 47 | } 48 | 49 | /** 50 | * 词典树的构造方法 51 | * 52 | * @param br 53 | * @param forest 54 | * @return 55 | * @throws Exception 56 | */ 57 | private static Forest makeLibrary(BufferedReader br, Forest forest) throws Exception { 58 | if (br == null) 59 | return forest; 60 | try { 61 | String temp = null; 62 | while ((temp = br.readLine()) != null) { 63 | insertWord(forest, temp); 64 | } 65 | } catch (Exception e) { 66 | e.printStackTrace(); 67 | } finally { 68 | br.close(); 69 | } 70 | return forest; 71 | } 72 | 73 | public static void insertWord(Forest forest, Value value) { 74 | insertWord(forest, value.getKeyword(), value.getParamers()); 75 | } 76 | 77 | /** 78 | * 插入一个词 79 | * 80 | * @param forest 81 | * @param temp 82 | */ 83 | public static void insertWord(Forest forest, 
String temp) { 84 | String[] param = temp.split("\t"); 85 | 86 | temp = param[0]; 87 | 88 | String[] resultParams = new String[param.length - 1]; 89 | for (int j = 1; j < param.length; j++) { 90 | resultParams[j - 1] = param[j]; 91 | } 92 | 93 | insertWord(forest, temp, resultParams); 94 | } 95 | 96 | private static void insertWord(Forest forest, String temp, String... param) { 97 | SmartForest branch = forest; 98 | char[] chars = temp.toCharArray(); 99 | for (int i = 0; i < chars.length; i++) { 100 | if (chars.length == i + 1) { 101 | branch.add(new Forest(chars[i], 3, param)); 102 | } else { 103 | branch.add(new Forest(chars[i], 1, null)); 104 | } 105 | branch = branch.getBranch(chars[i]); 106 | } 107 | } 108 | 109 | /** 110 | * 删除一个词 111 | * 112 | * @param forest 113 | * @param word 114 | */ 115 | public static void removeWord(Forest forest, String word) { 116 | SmartForest branch = forest; 117 | char[] chars = word.toCharArray(); 118 | 119 | for (int i = 0; i < chars.length; i++) { 120 | if (branch == null) 121 | return; 122 | if (chars.length == i + 1) { 123 | branch.add(new Forest(chars[i], -1, null)); 124 | } 125 | branch = branch.getBranch(chars[i]); 126 | } 127 | } 128 | } -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/AnsjArrays.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util; 2 | 3 | import java.lang.reflect.Array; 4 | import java.util.Arrays; 5 | 6 | import org.nlpcn.commons.lang.tire.domain.SmartForest; 7 | 8 | @SuppressWarnings("all") 9 | public class AnsjArrays { 10 | private static final int INSERTIONSORT_THRESHOLD = 7; 11 | 12 | /** 13 | * 二分法查找.摘抄了jdk的东西..只不过把他的自动装箱功能给去掉了 14 | * 15 | * @param branches 16 | * branches 17 | * @param c 18 | * char 19 | * @return idx 20 | */ 21 | 22 | public static > int binarySearch(T[] branches, char c) { 23 | int high = branches.length - 1; 24 | if 
/**
 * Sorts {@code dest[low, high)} with a merge sort that alternates between
 * two buffers, ordering nodes by comparing each node against the next
 * node's {@code getC()} character.
 *
 * The callers in this class pass {@code src} as a copy of the data in
 * {@code dest}. Each recursion level swaps the roles of the two arrays, so
 * the halves are sorted into {@code src} and then merged back into
 * {@code dest}.
 *
 * @param src  auxiliary buffer holding the same elements as dest
 * @param dest array that receives the sorted result
 * @param low  first index (inclusive) of the range in dest
 * @param high last index (exclusive) of the range in dest
 * @param off  offset added to low/high to address the same range in src
 *             (0 for a full-array sort, -fromIndex for a sub-range sort)
 */
private static void mergeSort(SmartForest[] src, SmartForest[] dest, int low, int high, int off) {
    int length = high - low;

    // Insertion sort on smallest arrays
    if (length < INSERTIONSORT_THRESHOLD) {
        for (int i = low; i < high; i++)
            for (int j = i; j > low && (dest[j - 1]).compareTo(dest[j].getC()) > 0; j--)
                swap(dest, j, j - 1);
        return;
    }

    // Recursively sort halves of dest into src
    int destLow = low;
    int destHigh = high;
    // From here on low/high/mid index src; destLow/destHigh keep the dest range.
    low += off;
    high += off;
    int mid = (low + high) >>> 1;
    // Roles are swapped and the offset negated: the halves end up sorted in src.
    mergeSort(dest, src, low, mid, -off);
    mergeSort(dest, src, mid, high, -off);

    // If list is already sorted, just copy from src to dest. This is an
    // optimization that results in faster sorts for nearly ordered lists.
    if (src[mid - 1].compareTo(src[mid].getC()) <= 0) {
        System.arraycopy(src, low, dest, destLow, length);
        return;
    }

    // Merge sorted halves (now in src) into dest
    for (int i = destLow, p = low, q = mid; i < destHigh; i++) {
        // Take from the left half while it has elements and its head is not
        // greater than the right head; "<=" prefers the left half on ties.
        if (q >= high || p < mid && src[p].compareTo(src[q].getC()) <= 0)
            dest[i] = src[p++];
        else
            dest[i] = src[q++];
    }
}
/**
 * Small helpers for building and ordering maps.
 */
public class CollectionUtil {
    /**
     * Returns the entries of a map ordered by value.
     *
     * Values are compared through {@link Comparable}. The comparison result
     * is multiplied by {@code sort}, so a positive {@code sort} gives
     * descending order, a negative one ascending order, and 0 leaves the
     * entries in the map's iteration order.
     *
     * @param map  map whose entries are sorted; values must be Comparable
     * @param sort direction multiplier (conventionally 1 or -1)
     * @return a new list holding the map's entries in sorted order
     */
    public static <K, V> List<Map.Entry<K, V>> sortMapByValue(Map<K, V> map, final int sort) {
        List<Map.Entry<K, V>> orderList = new ArrayList<Map.Entry<K, V>>(map.entrySet());
        Collections.sort(orderList, new Comparator<Map.Entry<K, V>>() {
            @Override
            @SuppressWarnings("unchecked")
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
                return (((Comparable<V>) o2.getValue()).compareTo(o1.getValue())) * sort;
            }
        });
        return orderList;
    }

    /**
     * Builds a mutable HashMap holding a single entry.
     *
     * @param k1 key
     * @param v1 value
     * @return a new map containing only {@code k1 -> v1}
     */
    public static <K, V> Map<K, V> as(K k1, V v1) {
        Map<K, V> result = new HashMap<K, V>();
        result.put(k1, v1);
        return result;
    }

}
System.getProperty("path.separator"); 24 | private static final String[] PATHS_PROPERTIES = new String[] { "java.class.path", "java.library.path" }; 25 | 26 | public static List fileDir = new ArrayList(); 27 | 28 | static { 29 | fileDir.add(new File("").getAbsoluteFile()); 30 | } 31 | 32 | /** 33 | * 输入一个文件名或者文件的最后路径寻找文件 default deep Integer.max 34 | * 35 | * @param 36 | * @return 37 | */ 38 | public static File find(String lastPath) { 39 | return find(lastPath, Integer.MAX_VALUE); 40 | } 41 | 42 | /** 43 | * 输入一个文件名或者文件的最后路径寻找文件 44 | * 45 | * @param 46 | * @return 47 | */ 48 | public static File find(String lastPath, int deep) { 49 | 50 | // 先深度查找 51 | for (File file : fileDir) { 52 | if (file.exists() && file.canRead()) { 53 | file = findByFile(file, lastPath, deep); 54 | if (file != null) { 55 | return file; 56 | } 57 | } 58 | } 59 | // 再从基本几个目录中查找 60 | for (String pathProperties : PATHS_PROPERTIES) { 61 | String[] propertyPath = System.getProperty(pathProperties).split(SEPARATOR); 62 | for (String path : propertyPath) { 63 | File file = new File(path); 64 | try { 65 | if (file.canRead() && file.exists()) { 66 | file = findByFile(file, lastPath, deep); 67 | if (file != null) { 68 | return file; 69 | } 70 | } 71 | } catch (AccessControlException e) { 72 | LOG.info(path + " not access to visit"); 73 | } 74 | } 75 | } 76 | return null; 77 | } 78 | 79 | /** 80 | * 根据一个文件深度查找 81 | * 82 | * @param file 83 | * @param lastPath 84 | * @param deep integer.max 85 | * @return 86 | */ 87 | public static File findByFile(File file, String lastPath) { 88 | return findByFile(file, lastPath, Integer.MAX_VALUE); 89 | } 90 | 91 | /** 92 | * 根据一个文件深度查找 93 | * 94 | * @param file 95 | * @param lastPath 96 | * @param deep 97 | * @return 98 | */ 99 | public static File findByFile(File file, String lastPath, int deep) { 100 | if (deep == 0 || !file.exists() || !file.canRead()) { 101 | return null; 102 | } 103 | if (file.getAbsolutePath().endsWith(lastPath)) { 104 | return file; 105 | } 
106 | if (file.isDirectory()) { 107 | 108 | File[] listFiles = file.listFiles(); 109 | if (listFiles != null && listFiles.length > 0) { 110 | for (File file2 : listFiles) { 111 | File temp = findByFile(file2, lastPath, deep - 1); 112 | if (temp != null) { 113 | return temp; 114 | } 115 | } 116 | } 117 | } 118 | return null; 119 | } 120 | 121 | } 122 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/FileIterator.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.Closeable; 5 | import java.io.FileNotFoundException; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.UnsupportedEncodingException; 9 | import java.util.Iterator; 10 | 11 | /** 12 | * 文件迭代器 13 | * 14 | * @author ansj 15 | */ 16 | public class FileIterator implements Iterator, Closeable { 17 | String temp = null; 18 | private BufferedReader br = null; 19 | 20 | protected FileIterator(String path, String charEncoding) throws UnsupportedEncodingException, FileNotFoundException { 21 | br = IOUtil.getReader(path, charEncoding); 22 | } 23 | 24 | protected FileIterator(InputStream is, String charEncoding) throws UnsupportedEncodingException, FileNotFoundException { 25 | br = IOUtil.getReader(is, charEncoding); 26 | } 27 | 28 | @Override 29 | public boolean hasNext() { 30 | if (temp == null) { 31 | try { 32 | temp = br.readLine(); 33 | } catch (IOException e) { 34 | // TODO Auto-generated catch block 35 | e.printStackTrace(); 36 | } 37 | if (temp == null) { 38 | return false; 39 | } else { 40 | return true; 41 | } 42 | } else { 43 | return true; 44 | } 45 | } 46 | 47 | public String readLine() { 48 | try { 49 | if (temp == null) { 50 | temp = br.readLine(); 51 | } 52 | return temp; 53 | } catch (IOException e) { 54 | // TODO Auto-generated catch block 55 | 
/**
 * MD5 digest helper.
 */
public class MD5 {
    /**
     * Computes the MD5 digest of a string's UTF-8 bytes. MD5 is a hash, not
     * encryption.
     *
     * @param str text to hash
     * @return the digest as 32 lowercase hexadecimal characters, or
     *         {@code null} if the MD5 algorithm is unavailable
     */
    public static String code(String str) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            // Fixed charset: the platform default would make the digest of
            // non-ASCII text differ from machine to machine.
            byte[] byteDigest = md.digest(str.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            StringBuilder buf = new StringBuilder(byteDigest.length * 2);
            for (byte b : byteDigest) {
                int unsigned = b & 0xff;
                if (unsigned < 16) {
                    buf.append('0');
                }
                buf.append(Integer.toHexString(unsigned));
            }
            // Full 32-character form; characters 8..24 give the 16-character form.
            return buf.toString();
        } catch (NoSuchAlgorithmException e) {
            e.printStackTrace();
            return null;
        }
    }
}
java.util.Iterator; 7 | import java.util.Map; 8 | import java.util.Map.Entry; 9 | 10 | /** 11 | * 用map做的计数器. 12 | * 13 | * @param 14 | * @author ansj 15 | */ 16 | public class MapCount implements Serializable { 17 | private static final long serialVersionUID = 1L; 18 | private HashMap hm = null; 19 | 20 | public MapCount() { 21 | hm = new HashMap(); 22 | } 23 | 24 | public MapCount(HashMap hm) { 25 | this.hm = hm; 26 | } 27 | 28 | public MapCount(int initialCapacity) { 29 | hm = new HashMap(initialCapacity); 30 | } 31 | 32 | /** 33 | * 增加一个元素 34 | * 35 | * @param t 36 | * @param n 37 | */ 38 | public void add(T t, double n) { 39 | Double value = null; 40 | if ((value = hm.get(t)) != null) { 41 | hm.put(t, value + n); 42 | } else { 43 | hm.put(t, Double.valueOf(n)); 44 | } 45 | } 46 | 47 | /** 48 | * 兼容旧的api 49 | * 50 | * @param t 51 | * @param n 52 | */ 53 | public void add(T t, int n) { 54 | add(t, (double) n); 55 | } 56 | 57 | /** 58 | * 计数增加.默认为1 59 | * 60 | * @param t 61 | */ 62 | public void add(T t) { 63 | this.add(t, 1); 64 | } 65 | 66 | /** 67 | * map的大小 68 | * 69 | * @return 70 | */ 71 | public int size() { 72 | return hm.size(); 73 | } 74 | 75 | /** 76 | * 删除一个元素 77 | * 78 | * @param t 79 | */ 80 | public void remove(T t) { 81 | hm.remove(t); 82 | } 83 | 84 | /** 85 | * 得道内部的map 86 | * 87 | * @return 88 | */ 89 | public HashMap get() { 90 | return this.hm; 91 | } 92 | 93 | /** 94 | * 将map序列化为词典格式 95 | * 96 | * @return 97 | */ 98 | public String getDic() { 99 | Iterator> iterator = this.hm.entrySet().iterator(); 100 | StringBuilder sb = new StringBuilder(); 101 | Entry next = null; 102 | while (iterator.hasNext()) { 103 | next = iterator.next(); 104 | sb.append(next.getKey()); 105 | sb.append("\t"); 106 | sb.append(next.getValue()); 107 | sb.append("\n"); 108 | } 109 | return sb.toString(); 110 | } 111 | 112 | /** 113 | * 批量增加 114 | * 115 | * @param hs 116 | */ 117 | public void addAll(Collection collection) { 118 | for (T t : collection) { 119 | 
/**
 * Fluent builder for maps: pick the map implementation with a static
 * factory, chain {@link #a(Object, Object)} calls, then call {@link #toMap()}.
 *
 * @author ansj
 *
 * @param <K> key type
 * @param <V> value type
 */
public class MapFactory<K, V> {

    private Map<K, V> map = null;

    private MapFactory() {
    }

    /**
     * @return a builder backed by a new HashMap
     */
    public static <K, V> MapFactory<K, V> hashMap() {
        MapFactory<K, V> mf = new MapFactory<K, V>();
        mf.map = new HashMap<K, V>();
        return mf;
    }

    /**
     * @return a builder backed by a new TreeMap (keys kept in sorted order)
     */
    public static <K, V> MapFactory<K, V> treeMap() {
        MapFactory<K, V> mf = new MapFactory<K, V>();
        mf.map = new TreeMap<K, V>();
        return mf;
    }

    /**
     * Puts one entry into the map being built.
     *
     * @param k key
     * @param v value
     * @return this builder, for chaining
     */
    public MapFactory<K, V> a(K k, V v) {
        map.put(k, v);
        return this;
    }

    /**
     * @return the map that was built (the live instance, not a copy)
     */
    public Map<K, V> toMap() {
        return map;
    }
}
/**
 * Murmur hash 2.0.
 *
 * The murmur hash is a relatively fast hash function for platforms with
 * efficient multiplication. This is a re-implementation of the original C
 * code plus some convenience overloads.
 *
 * Public domain.
 *
 * @author Viliam Holub
 * @version 1.0.2
 */
public final class MurmurHash {

    // All methods are static; not instantiable.
    private MurmurHash() {}

    /**
     * Generates a 32 bit hash from the first {@code length} bytes of a byte
     * array, using the given seed.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes of {@code data} to hash
     * @param seed   initial seed value
     * @return 32 bit hash of the given bytes
     */
    public static int hash32(final byte[] data, int length, int seed) {
        // 'm' and 'r' are mixing constants generated offline.
        // They're not really 'magic', they just happen to work well.
        final int m = 0x5bd1e995;
        final int r = 24;

        // Initialize the hash from the seed and the input length.
        int h = seed ^ length;
        final int length4 = length / 4;

        // Body: consume the input four bytes at a time, little-endian.
        for (int i = 0; i < length4; i++) {
            final int i4 = i * 4;
            int k = (data[i4] & 0xff)
                    + ((data[i4 + 1] & 0xff) << 8)
                    + ((data[i4 + 2] & 0xff) << 16)
                    + ((data[i4 + 3] & 0xff) << 24);
            k *= m;
            k ^= k >>> r;
            k *= m;
            h *= m;
            h ^= k;
        }

        // Tail: the remaining 1-3 bytes. The cases fall through on purpose.
        final int tail = length & ~3;
        switch (length % 4) {
        case 3:
            h ^= (data[tail + 2] & 0xff) << 16;
            // fall through
        case 2:
            h ^= (data[tail + 1] & 0xff) << 8;
            // fall through
        case 1:
            h ^= (data[tail] & 0xff);
            h *= m;
            break;
        default:
            break;
        }

        // Final avalanche.
        h ^= h >>> 13;
        h *= m;
        h ^= h >>> 15;

        return h;
    }

    /**
     * Generates a 32 bit hash from a byte array with the default seed.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes of {@code data} to hash
     * @return 32 bit hash of the given bytes
     */
    public static int hash32(final byte[] data, int length) {
        return hash32(data, length, 0x9747b28c);
    }

    /**
     * Generates a 32 bit hash from a string.
     *
     * The string is encoded with the platform default charset, so the result
     * for non-ASCII text can differ between JVM configurations.
     *
     * @param text string to hash
     * @return 32 bit hash of the given string
     */
    public static int hash32(final String text) {
        final byte[] bytes = text.getBytes();
        return hash32(bytes, bytes.length);
    }

    /**
     * Generates a 32 bit hash from a substring.
     *
     * @param text   string to hash
     * @param from   starting index
     * @param length length of the substring to hash
     * @return 32 bit hash of the given substring
     */
    public static int hash32(final String text, int from, int length) {
        return hash32(text.substring(from, from + length));
    }

    /**
     * Generates a 64 bit hash from the first {@code length} bytes of a byte
     * array, using the given seed.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes of {@code data} to hash
     * @param seed   initial seed value
     * @return 64 bit hash of the given bytes
     */
    public static long hash64(final byte[] data, int length, int seed) {
        final long m = 0xc6a4a7935bd1e995L;
        final int r = 47;

        // The seed is treated as an unsigned 32 bit value.
        long h = (seed & 0xffffffffL) ^ (length * m);

        final int length8 = length / 8;

        // Body: consume the input eight bytes at a time, little-endian.
        for (int i = 0; i < length8; i++) {
            final int i8 = i * 8;
            long k = ((long) data[i8] & 0xff)
                    + (((long) data[i8 + 1] & 0xff) << 8)
                    + (((long) data[i8 + 2] & 0xff) << 16)
                    + (((long) data[i8 + 3] & 0xff) << 24)
                    + (((long) data[i8 + 4] & 0xff) << 32)
                    + (((long) data[i8 + 5] & 0xff) << 40)
                    + (((long) data[i8 + 6] & 0xff) << 48)
                    + (((long) data[i8 + 7] & 0xff) << 56);

            k *= m;
            k ^= k >>> r;
            k *= m;

            h ^= k;
            h *= m;
        }

        // Tail: the remaining 1-7 bytes. The cases fall through on purpose.
        final int tail = length & ~7;
        switch (length % 8) {
        case 7:
            h ^= (long) (data[tail + 6] & 0xff) << 48;
            // fall through
        case 6:
            h ^= (long) (data[tail + 5] & 0xff) << 40;
            // fall through
        case 5:
            h ^= (long) (data[tail + 4] & 0xff) << 32;
            // fall through
        case 4:
            h ^= (long) (data[tail + 3] & 0xff) << 24;
            // fall through
        case 3:
            h ^= (long) (data[tail + 2] & 0xff) << 16;
            // fall through
        case 2:
            h ^= (long) (data[tail + 1] & 0xff) << 8;
            // fall through
        case 1:
            h ^= (long) (data[tail] & 0xff);
            h *= m;
            break;
        default:
            break;
        }

        // Final avalanche.
        h ^= h >>> r;
        h *= m;
        h ^= h >>> r;

        return h;
    }

    /**
     * Generates a 64 bit hash from a byte array with the default seed.
     *
     * @param data   byte array to hash
     * @param length number of leading bytes of {@code data} to hash
     * @return 64 bit hash of the given bytes
     */
    public static long hash64(final byte[] data, int length) {
        return hash64(data, length, 0xe17a1465);
    }

    /**
     * Generates a 64 bit hash from a string.
     *
     * The string is encoded with the platform default charset, so the result
     * for non-ASCII text can differ between JVM configurations.
     *
     * @param text string to hash
     * @return 64 bit hash of the given string
     */
    public static long hash64(final String text) {
        final byte[] bytes = text.getBytes();
        return hash64(bytes, bytes.length);
    }

    /**
     * Generates a 64 bit hash from a substring.
     *
     * @param text   string to hash
     * @param from   starting index
     * @param length length of the substring to hash
     * @return 64 bit hash of the given substring
     */
    public static long hash64(final String text, int from, int length) {
        return hash64(text.substring(from, from + length));
    }
}
154 | * 155 | * @param data byte array to hash 156 | * @param length length of the array to hash 157 | * @return 64 bit hash of the given string 158 | */ 159 | public static long hash64(final byte[] data, int length) { 160 | return hash64(data, length, 0xe17a1465); 161 | } 162 | 163 | /** 164 | * Generates 64 bit hash from a string. 165 | * 166 | * @param text string to hash 167 | * @return 64 bit hash of the given string 168 | */ 169 | public static long hash64(final String text) { 170 | final byte[] bytes = text.getBytes(); 171 | return hash64(bytes, bytes.length); 172 | } 173 | 174 | /** 175 | * Generates 64 bit hash from a substring. 176 | * 177 | * @param text string to hash 178 | * @param from starting index 179 | * @param length length of the substring to hash 180 | * @return 64 bit hash of the given array 181 | */ 182 | public static long hash64(final String text, int from, int length) { 183 | return hash64(text.substring( from, from+length)); 184 | } 185 | } -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/ObjConver.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util; 2 | 3 | import java.text.ParseException; 4 | import java.text.SimpleDateFormat; 5 | import java.util.Date; 6 | 7 | public class ObjConver { 8 | 9 | public static final String DEFFAULT_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss"; 10 | 11 | public static Double getDouble(String value) { 12 | return castToDouble(value); 13 | } 14 | 15 | public static Double getDoubleValue(String value) { 16 | if (StringUtil.isBlank(value)) { 17 | return 0D; 18 | } 19 | return castToDouble(value); 20 | } 21 | 22 | public static Float getFloat(String value) { 23 | if (StringUtil.isBlank(value)) { 24 | return null; 25 | } 26 | return castToFloat(value); 27 | } 28 | 29 | public static Float getFloatValue(String value) { 30 | return castToFloat(value).floatValue(); 
31 | } 32 | 33 | public static Integer getInteger(String value) { 34 | return castToInteger(value); 35 | } 36 | 37 | public static int getIntValue(String value) { 38 | if (StringUtil.isBlank(value)) { 39 | return 0; 40 | } 41 | return castToInteger(value); 42 | } 43 | 44 | public static Date getDate(String value) { 45 | if (StringUtil.isBlank(value)) { 46 | return null; 47 | } 48 | return castToDate(value); 49 | } 50 | 51 | public static Long getLong(String value) { 52 | return castToLong(value); 53 | } 54 | 55 | public static long getLongValue(String value) { 56 | if (StringUtil.isBlank(value)) { 57 | return 0L; 58 | } 59 | return castToLong(value); 60 | } 61 | 62 | public static Boolean getBoolean(String value) { 63 | return castToBoolean(value); 64 | } 65 | 66 | public static boolean getBooleanValue(String value) { 67 | if (StringUtil.isBlank(value)) { 68 | return false; 69 | } 70 | return castToBoolean(value); 71 | 72 | } 73 | 74 | public static final Float castToFloat(Object value) { 75 | if (value == null) { 76 | return null; 77 | } 78 | 79 | if (value instanceof Number) { 80 | return ((Number) value).floatValue(); 81 | } 82 | 83 | if (value instanceof String) { 84 | String strVal = value.toString(); 85 | if (strVal.length() == 0) { 86 | return null; 87 | } 88 | 89 | return Float.parseFloat(strVal); 90 | } 91 | 92 | throw new ClassCastException("can not cast to float, value : " + value); 93 | } 94 | 95 | public static final Double castToDouble(Object value) { 96 | if (value == null) { 97 | return null; 98 | } 99 | if (value instanceof Number) { 100 | return ((Number) value).doubleValue(); 101 | } else if (value instanceof String) { 102 | String strVal = value.toString(); 103 | if (strVal.length() == 0) { 104 | return null; 105 | } 106 | return Double.parseDouble(strVal); 107 | } 108 | throw new RuntimeException("can not cast to double, value : " + value); 109 | } 110 | 111 | public static final Date castToDate(Object value) { 112 | if (value == null) { 113 | 
return null; 114 | } 115 | long longValue = -1; 116 | 117 | if(value instanceof Date){ 118 | return (Date) value ; 119 | }else if (value instanceof Number) { 120 | longValue = ((Number) value).longValue(); 121 | } else if (value instanceof String) { 122 | String strVal = (String) value; 123 | 124 | if (strVal.indexOf('-') != -1) { 125 | String format = null; 126 | if (strVal.length() == DEFFAULT_DATE_FORMAT.length()) { 127 | format = DEFFAULT_DATE_FORMAT; 128 | } else if (strVal.length() == 10) { 129 | format = "yyyy-MM-dd"; 130 | } else if (strVal.length() == "yyyy-MM-dd HH".length()) { 131 | format = "yyyy-MM-dd HH"; 132 | } else if (strVal.length() == "yyyy-MM-dd HH:mm".length()) { 133 | format = "yyyy-MM-dd HH:mm"; 134 | } else if (strVal.length() == "yyyy-MM-dd HH:mm:ss".length()) { 135 | format = "yyyy-MM-dd HH:mm:ss"; 136 | } else if (strVal.length() == "yyyy-MM-dd HH:mm:SSS".length()) { 137 | format = "yyyy-MM-dd HH:mm:ss.SSS"; 138 | } else { 139 | return null; 140 | } 141 | 142 | SimpleDateFormat dateFormat = new SimpleDateFormat(format); 143 | try { 144 | return dateFormat.parse(strVal); 145 | } catch (ParseException e) { 146 | throw new RuntimeException("can not cast to Date, value : " + strVal); 147 | } 148 | } 149 | 150 | if (strVal.length() == 0) { 151 | return null; 152 | } 153 | 154 | longValue = Long.parseLong(strVal); 155 | } 156 | 157 | if (longValue < 0) { 158 | throw new ClassCastException("can not cast to Date, value : " + value); 159 | } 160 | 161 | return new Date(longValue); 162 | } 163 | 164 | public static final Long castToLong(Object value) { 165 | if (value == null) { 166 | return null; 167 | } 168 | 169 | if (value instanceof Number) { 170 | return ((Number) value).longValue(); 171 | } 172 | 173 | if (value instanceof String) { 174 | String strVal = (String) value; 175 | if (strVal.length() == 0) { 176 | return null; 177 | } 178 | 179 | try { 180 | return Long.parseLong(strVal); 181 | } catch (NumberFormatException ex) { 182 | } 183 | 
184 | Date date = castToDate(strVal); 185 | 186 | if (date != null) { 187 | return date.getTime(); 188 | } 189 | } 190 | 191 | throw new ClassCastException("can not cast to long, value : " + value); 192 | } 193 | 194 | public static final Integer castToInteger(Object value) { 195 | if (value == null) { 196 | return null; 197 | } 198 | 199 | if (value instanceof Integer) { 200 | return (Integer) value; 201 | } 202 | 203 | if (value instanceof Number) { 204 | return ((Number) value).intValue(); 205 | } 206 | 207 | if (value instanceof String) { 208 | String strVal = (String) value; 209 | if (strVal.length() == 0) { 210 | return null; 211 | } 212 | 213 | return Integer.parseInt(strVal); 214 | } 215 | 216 | throw new ClassCastException("can not cast to int, value : " + value); 217 | } 218 | 219 | public static final Boolean castToBoolean(Object value) { 220 | if (value == null) { 221 | return null; 222 | } 223 | 224 | if (value instanceof Boolean) { 225 | return (Boolean) value; 226 | } 227 | 228 | if (value instanceof Number) { 229 | return ((Number) value).intValue() == 1; 230 | } 231 | 232 | if (value instanceof String) { 233 | String str = (String) value; 234 | if (str.length() == 0) { 235 | return null; 236 | } 237 | 238 | if ("true".equalsIgnoreCase(str)) { 239 | return Boolean.TRUE; 240 | } 241 | if ("false".equalsIgnoreCase(str)) { 242 | return Boolean.FALSE; 243 | } 244 | 245 | if ("1".equalsIgnoreCase(str)) { 246 | return Boolean.TRUE; 247 | } 248 | } 249 | 250 | throw new ClassCastException("can not cast to int, value : " + value); 251 | } 252 | 253 | private static Character castToCharacter(Object value) { 254 | 255 | if (value instanceof Character) { 256 | return (Character) value; 257 | } 258 | 259 | if (value instanceof Number) { 260 | return (char) ((Number) value).intValue(); 261 | } 262 | 263 | if (value != null) { 264 | return value.toString().trim().charAt(0); 265 | } 266 | 267 | return null; 268 | } 269 | 270 | /** 271 | * 将一个对象转换为对应的类 272 | * 273 
/**
 * Assorted string helpers: blank checks, joining, trimming, regex shortcuts
 * and a lightweight HTML tag stripper.
 */
public class StringUtil {

    private static final char DY = '\'';
    private static final char DH = ',';
    /** Per-ASCII-character action table used by {@link #rmHtmlTag(String)}. */
    private static final int[] filter = new int[128];
    /** For characters that open a skippable span, the character that closes it. */
    private static final int[] filterEnd = new int[128];
    private static final String EMPTY = "";
    private static final String NULL = "null";

    static {
        // '<' opens a tag that is skipped up to the matching '>' (practically unbounded look-ahead).
        filter['<'] = Integer.MAX_VALUE / 2;
        filterEnd['<'] = '>';

        // '&' opens an entity that is skipped if a ';' follows within the 10-character window.
        filter['&'] = 10;
        filterEnd['&'] = ';';

        // -1: always dropped.
        filter[';'] = -1;
        filter['\n'] = -1;
        filter['\r'] = -1;
        filter['\t'] = -1;

        // 1: runs of the same character are collapsed to one.
        filter[' '] = 1;
        filter['*'] = 1;
        filter['-'] = 1;
        filter['.'] = 1;
        filter['#'] = 1;
    }

    /**
     * Removes HTML tags and entities from the input, drops ';', tabs and line
     * breaks, and collapses runs of ' ', '*', '-', '.' and '#'.
     *
     * @param input text that may contain markup
     * @return the stripped text; "" for null/blank input
     */
    public static String rmHtmlTag(String input) {
        if (isBlank(input)) {
            return "";
        }
        int length = input.length();
        int tl = 0;
        StringBuilder sb = new StringBuilder();
        char c = 0;
        for (int i = 0; i < length; i++) {
            c = input.charAt(i);

            // Non-ASCII characters are outside the filter table and always kept.
            if (c > 127) {
                sb.append(c);
                continue;
            }

            switch (filter[c]) {
            case -1:
                break;
            case 0:
                sb.append(c);
                break;
            case 1:
                // Emit the character once (never as the very first output character),
                // then skip the rest of the run.
                if (sb.length() > 0 && sb.charAt(sb.length() - 1) != c)
                    sb.append(c);
                do {
                    i++;
                } while (i < length && input.charAt(i) == c);

                // Step back so the loop increment lands on the first character after the run.
                if (i < length || input.charAt(length - 1) != c)
                    i--;
                break;
            default:
                // Look ahead, at most filter[c] characters, for the closing character.
                tl = filter[c] + i;
                int tempOff = i;
                boolean flag = false;
                char end = (char) filterEnd[c];
                for (i++; i < length && i < tl; i++) {
                    c = input.charAt(i);
                    if (c > 127)
                        continue;
                    if (c == end) {
                        flag = true;
                        break;
                    }
                }
                // No closing character found: keep the opener as literal text and rewind.
                if (!flag) {
                    i = tempOff;
                    sb.append(input.charAt(i));
                }
                break;
            }
        }
        return sb.toString();
    }

    /**
     * @param cs text to test
     * @return true if the text is null, empty, or only whitespace
     */
    public static boolean isBlank(CharSequence cs) {
        if (cs == null) {
            return true;
        }
        for (int i = 0, strLen = cs.length(); i < strLen; i++) {
            if (!Character.isWhitespace(cs.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param cs text to test
     * @return true if the text contains at least one non-whitespace character
     */
    public static boolean isNotBlank(CharSequence cs) {
        return !isBlank(cs);
    }

    /**
     * Turns a comma-separated list into a quoted list suitable for a SQL
     * {@code IN (...)} clause, e.g. {@code "a, b"} becomes {@code 'a','b'}.
     * Blank fields are skipped.
     *
     * NOTE: single quotes inside the fields are not escaped; do not pass
     * untrusted input to this method - use bind parameters instead.
     *
     * @param str comma-separated values
     * @return the quoted, comma-separated values
     */
    public static String makeSqlInString(String str) {
        StringBuilder sb = new StringBuilder();
        for (String raw : str.split(",")) {
            String field = raw.trim();
            if (isNotBlank(field)) {
                // Separate from the previous emitted field, so skipped blanks never leave a dangling comma.
                if (sb.length() > 0) {
                    sb.append(DH);
                }
                sb.append(DY);
                sb.append(field);
                sb.append(DY);
            }
        }
        return sb.toString();
    }

    /**
     * @param str source string
     * @return the characters of the string, sorted ascending
     */
    public static char[] sortCharArray(String str) {
        char[] chars = str.toCharArray();
        Arrays.sort(chars);
        return chars;
    }

    /** Joins the ints with the given separator; "" for an empty array. */
    public static String joiner(int[] ints, String split) {
        if (ints.length == 0) {
            return EMPTY;
        }
        StringBuilder sb = new StringBuilder(String.valueOf(ints[0]));
        for (int i = 1; i < ints.length; i++) {
            sb.append(split);
            sb.append(ints[i]);
        }
        return sb.toString();
    }

    /** Joins the doubles with the given separator; "" for an empty array. */
    public static String joiner(double[] doubles, String split) {
        if (doubles.length == 0) {
            return EMPTY;
        }
        StringBuilder sb = new StringBuilder(String.valueOf(doubles[0]));
        for (int i = 1; i < doubles.length; i++) {
            sb.append(split);
            sb.append(doubles[i]);
        }
        return sb.toString();
    }

    /** Joins the floats with the given separator; "" for an empty array. */
    public static String joiner(float[] floats, String split) {
        if (floats.length == 0) {
            return EMPTY;
        }
        StringBuilder sb = new StringBuilder(String.valueOf(floats[0]));
        for (int i = 1; i < floats.length; i++) {
            sb.append(split);
            sb.append(floats[i]);
        }
        return sb.toString();
    }

    /** Joins the longs with the given separator; "" for an empty array. */
    public static String joiner(long[] longs, String split) {
        if (longs.length == 0) {
            return EMPTY;
        }
        StringBuilder sb = new StringBuilder(String.valueOf(longs[0]));
        for (int i = 1; i < longs.length; i++) {
            sb.append(split);
            sb.append(longs[i]);
        }
        return sb.toString();
    }

    /**
     * @return {@code obj.toString()}, or the text "null" for a null reference
     */
    public static String toString(Object obj) {
        if (obj == null) {
            return NULL;
        } else {
            return obj.toString();
        }
    }

    /**
     * Joins the elements of a collection with the given separator. Null
     * elements are rendered as "null", including the first one.
     *
     * @param c     elements to join
     * @param split separator
     * @return the joined text; "" for an empty collection
     */
    public static String joiner(Collection<?> c, String split) {
        Iterator<?> iterator = c.iterator();
        if (!iterator.hasNext()) {
            return EMPTY;
        }
        StringBuilder sb = new StringBuilder(toString(iterator.next()));
        while (iterator.hasNext()) {
            sb.append(split);
            sb.append(toString(iterator.next()));
        }
        return sb.toString();
    }

    /**
     * @param chars characters to test
     * @return true if the array is null, empty, or only whitespace
     */
    public static boolean isBlank(char[] chars) {
        if (chars == null) {
            return true;
        }
        for (char ch : chars) {
            if (!Character.isWhitespace(ch)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns the first match of a regular expression.
     *
     * @param regex pattern
     * @param input text to search
     * @return the first matching substring, or null if there is none
     */
    public static String matcherFirst(String regex, String input) {
        Matcher matcher = Pattern.compile(regex).matcher(input);
        if (matcher.find()) {
            return input.substring(matcher.start(), matcher.end());
        } else {
            return null;
        }
    }

    /** Characters removed by {@link #trim(String)}: whitespace, BOM, NBSP and the ideographic space. */
    private static boolean isTrimmable(char ch) {
        return Character.isWhitespace(ch) || ch == 65279 || ch == 160 || ch == 12288;
    }

    /**
     * Extended trim: in addition to whitespace it strips the byte-order mark
     * (U+FEFF), the non-breaking space (U+00A0) and the full-width
     * ideographic space (U+3000) from both ends.
     *
     * @param value text to trim; may be null
     * @return the trimmed text, the same instance if nothing was removed, or
     *         null for null input
     */
    public static String trim(String value) {
        if (value == null) {
            return value;
        }

        int len = value.length();
        int st = 0;

        while ((st < len) && isTrimmable(value.charAt(st))) {
            st++;
        }
        // Examine the LAST remaining character when trimming the tail.
        while ((st < len) && isTrimmable(value.charAt(len - 1))) {
            len--;
        }
        return ((st > 0) || (len < value.length())) ? value.substring(st, len) : value;
    }

    /**
     * Returns every match of a regular expression.
     *
     * @param regex pattern
     * @param input text to search
     * @return all matching substrings in order of occurrence; empty if none
     */
    public static List<String> matcherAll(String regex, String input) {
        List<String> result = new ArrayList<String>();
        Matcher matcher = Pattern.compile(regex).matcher(input);
        while (matcher.find()) {
            result.add(input.substring(matcher.start(), matcher.end()));
        }
        return result;
    }

    /**
     * Returns the last match of a regular expression.
     *
     * @param regex pattern
     * @param input text to search
     * @return the last matching substring, or null if there is none
     */
    public static String matcherLast(String regex, String input) {
        List<String> result = matcherAll(regex, input);
        if (result.size() == 0) {
            return null;
        } else {
            return result.get(result.size() - 1);
        }
    }
}
/**
 * Character normalization: folds full-width Latin letters and digits to their
 * ASCII forms and lower-cases ASCII upper-case letters.
 */
public class WordAlert {

    /** Code point of the full-width lower-case 'a' (U+FF41). */
    public static final int MIN_LOWER = 65345;
    /** Code point of the full-width lower-case 'z' (U+FF5A). */
    public static final int MAX_LOWER = 65370;
    /** Offset subtracted from a full-width lower-case letter to get ASCII lower case. */
    public static final int LOWER_GAP = 65248;
    /** Code point of the full-width upper-case 'A' (U+FF21). */
    public static final int MIN_UPPER = 65313;
    /** Code point of the full-width upper-case 'Z' (U+FF3A). */
    public static final int MAX_UPPER = 65338;
    /** Offset subtracted from a full-width upper-case letter to get ASCII lower case. */
    public static final int UPPER_GAP = 65216;
    /** Code point of ASCII 'A'. */
    public static final int MIN_UPPER_E = 65;
    /** Code point of ASCII 'Z'. */
    public static final int MAX_UPPER_E = 90;
    /** Offset subtracted from an ASCII upper-case letter to get lower case. */
    public static final int UPPER_GAP_E = -32;
    /** Code point of the full-width digit '0' (U+FF10). */
    public static final int MIN_UPPER_N = 65296;
    /** Code point of the full-width digit '9' (U+FF19). */
    public static final int MAX_UPPER_N = 65305;
    /** Offset subtracted from a full-width digit to get the ASCII digit. */
    public static final int UPPER_GAP_N = 65248;

    /** Lookup table: normalized form of each char, or 0 where no mapping is defined. */
    private static final char[] CHARCOVER = new char[65536];

    static {
        mapRange(MIN_LOWER, MAX_LOWER, LOWER_GAP);
        mapRange(MIN_UPPER, MAX_UPPER, UPPER_GAP);
        mapRange(MIN_UPPER_E, MAX_UPPER_E, UPPER_GAP_E);
        mapRange(MIN_UPPER_N, MAX_UPPER_N, UPPER_GAP_N);
        // ASCII digits and lower-case letters map to themselves.
        mapRange('0', '9', 0);
        mapRange('a', 'z', 0);
    }

    /** Fills CHARCOVER[first..last] with (code - gap). */
    private static void mapRange(int first, int last, int gap) {
        for (int code = first; code <= last; code++) {
            CHARCOVER[code] = (char) (code - gap);
        }
    }

    /** Folds one letter to ASCII lower case; any other character is returned unchanged. */
    private static char foldLetter(char ch) {
        if (ch >= MIN_LOWER && ch <= MAX_LOWER) {
            return (char) (ch - LOWER_GAP);
        }
        if (ch >= MIN_UPPER && ch <= MAX_UPPER) {
            return (char) (ch - UPPER_GAP);
        }
        if (ch >= MIN_UPPER_E && ch <= MAX_UPPER_E) {
            return (char) (ch - UPPER_GAP_E);
        }
        return ch;
    }

    /** Folds one full-width digit to its ASCII digit; any other character is returned unchanged. */
    private static char foldDigit(char ch) {
        if (ch >= MIN_UPPER_N && ch <= MAX_UPPER_N) {
            return (char) (ch - UPPER_GAP_N);
        }
        return ch;
    }

    /**
     * Folds full-width letters and ASCII upper-case letters to ASCII lower
     * case, in place, over {@code end} characters starting at {@code start}.
     *
     * @param chars array to normalize; it is modified
     * @param start index of the first character
     * @param end   number of characters to process (a length, not an end index)
     * @return the normalized region as a new String
     */
    public static String alertEnglish(char[] chars, int start, int end) {
        final int stop = start + end;
        for (int idx = start; idx < stop; idx++) {
            chars[idx] = foldLetter(chars[idx]);
        }
        return new String(chars, start, end);
    }

    /**
     * String variant of {@link #alertEnglish(char[], int, int)}.
     *
     * @param temp  source text
     * @param start index of the first character
     * @param end   number of characters to process (a length, not an end index)
     * @return the normalized region
     */
    public static String alertEnglish(String temp, int start, int end) {
        StringBuilder out = new StringBuilder();
        final int stop = start + end;
        int idx = start;
        while (idx < stop) {
            out.append(foldLetter(temp.charAt(idx)));
            idx++;
        }
        return out.toString();
    }

    /**
     * Folds full-width digits to ASCII digits, in place, over {@code end}
     * characters starting at {@code start}.
     *
     * @param chars array to normalize; it is modified
     * @param start index of the first character
     * @param end   number of characters to process (a length, not an end index)
     * @return the normalized region as a new String
     */
    public static String alertNumber(char[] chars, int start, int end) {
        final int stop = start + end;
        for (int idx = start; idx < stop; idx++) {
            chars[idx] = foldDigit(chars[idx]);
        }
        return new String(chars, start, end);
    }

    /**
     * String variant of {@link #alertNumber(char[], int, int)}.
     *
     * @param temp  source text
     * @param start index of the first character
     * @param end   number of characters to process (a length, not an end index)
     * @return the normalized region
     */
    public static String alertNumber(String temp, int start, int end) {
        StringBuilder out = new StringBuilder();
        final int stop = start + end;
        int idx = start;
        while (idx < stop) {
            out.append(foldDigit(temp.charAt(idx)));
            idx++;
        }
        return out.toString();
    }

    /**
     * Normalizes a whole string through the lookup table; characters without
     * a table entry are copied unchanged.
     *
     * @param str text to normalize
     * @return the normalized characters
     */
    public static char[] alertStr(String str) {
        final int size = str.length();
        char[] normalized = new char[size];
        for (int idx = 0; idx < size; idx++) {
            char source = str.charAt(idx);
            char mapped = CHARCOVER[source];
            normalized[idx] = mapped > 0 ? mapped : source;
        }
        return normalized;
    }

    /**
     * @param word text to test
     * @return true if every character is an ASCII or full-width Latin letter
     *         (true for the empty string)
     */
    public static boolean isEnglish(String word) {
        for (int idx = 0, size = word.length(); idx < size; idx++) {
            char ch = word.charAt(idx);
            boolean letter = (ch >= 'a' && ch <= 'z')
                    || (ch >= MIN_LOWER && ch <= MAX_LOWER)
                    || (ch >= MIN_UPPER && ch <= MAX_UPPER)
                    || (ch >= MIN_UPPER_E && ch <= MAX_UPPER_E);
            if (!letter) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param word text to test
     * @return true if every character is an ASCII digit, a full-width digit
     *         or '.' (true for the empty string)
     */
    public static boolean isNumber(String word) {
        for (int idx = 0, size = word.length(); idx < size; idx++) {
            char ch = word.charAt(idx);
            boolean numeric = (ch >= '0' && ch <= '9')
                    || (ch >= MIN_UPPER_N && ch <= MAX_UPPER_N)
                    || ch == '.';
            if (!numeric) {
                return false;
            }
        }
        return true;
    }

    /**
     * Normalizes a single character through the lookup table.
     *
     * @param c character to normalize
     * @return the normalized character, or the NUL character (0) when the
     *         table has no mapping for it
     */
    public static char CharCover(char c) {
        return CHARCOVER[c];
    }

}
recyclingCount != null && mc.get().size() >= maxCount) { 53 | recycling(); 54 | } 55 | } 56 | 57 | public void add(String word, String target) { 58 | add(word, target, 1); 59 | } 60 | 61 | public void add(String word, String target, double weight) { 62 | if (x2mat.containsKey(target)) { 63 | x2mat.get(target).add(word, weight); 64 | } else { 65 | x2mat.put(target, new MapCount()); 66 | x2mat.get(target).add(word, weight); 67 | } 68 | x2mc.add(target, 1); 69 | add(word, weight); 70 | } 71 | 72 | /** 73 | * 导出词频统计结果 74 | * 75 | * @return 76 | */ 77 | public Map export() { 78 | Map result = new HashMap(); 79 | result.putAll(mc.get()); 80 | return result; 81 | } 82 | 83 | /** 84 | * 导出IDF统计结果 85 | * 86 | * @return 87 | */ 88 | public Map exportIDF() { 89 | 90 | Map result = new HashMap(); 91 | 92 | for (Entry entry : mc.get().entrySet()) { 93 | result.put(entry.getKey(), Math.log(allFreq / entry.getValue())); 94 | } 95 | 96 | return result; 97 | } 98 | 99 | public HashMap> exportChiSquare() { 100 | 101 | HashMap> x2final = new HashMap>(); 102 | 103 | double sum = allFreq; 104 | 105 | Double a, b, c, d; 106 | 107 | for (Entry> iter1 : x2mat.entrySet()) { 108 | String target = iter1.getKey(); 109 | for (Entry iter2 : iter1.getValue().get().entrySet()) { 110 | String name = iter2.getKey(); 111 | a = iter2.getValue(); 112 | b = x2mc.get().get(target) - a; 113 | c = mc.get().get(name) - a; 114 | d = sum - b - c + a; 115 | Double x2stat = Math.pow(a * d - b * c, 2) / (a + c) / (b + d); 116 | if (x2final.get(target) != null) { 117 | x2final.get(target).add(name, x2stat); 118 | } else { 119 | x2final.put(target, new MapCount()); 120 | x2final.get(target).add(name, x2stat); 121 | } 122 | } 123 | } 124 | 125 | return x2final; 126 | 127 | } 128 | 129 | /** 130 | * 回收 131 | */ 132 | private void recycling() { 133 | List> list = CollectionUtil.sortMapByValue(mc.get(), -1); 134 | Set targetSet = x2mat.keySet(); 135 | String word; 136 | for (int i = 0; i < recyclingCount; i++) { 137 
| word = list.get(i).getKey(); 138 | allFreq -= mc.get().remove(word); // 从全局中移除数字 139 | for (String target : targetSet) { 140 | Double r2 = x2mat.get(target).get().remove(word); 141 | if (r2 != null) { 142 | x2mc.add(target, -r2); 143 | } 144 | } 145 | } 146 | } 147 | 148 | } 149 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/JakartaCommonsLoggingImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package org.nlpcn.commons.lang.util.logging; 17 | 18 | import org.apache.commons.logging.Log; 19 | import org.apache.commons.logging.LogFactory; 20 | 21 | public class JakartaCommonsLoggingImpl implements org.nlpcn.commons.lang.util.logging.Log { 22 | 23 | private Log log; 24 | 25 | /** 26 | * @since 0.2.1 27 | * @param log 28 | */ 29 | public JakartaCommonsLoggingImpl(Log log){ 30 | this.log = log; 31 | } 32 | 33 | public JakartaCommonsLoggingImpl(String loggerName){ 34 | log = LogFactory.getLog(loggerName); 35 | } 36 | 37 | public boolean isDebugEnabled() { 38 | return log.isDebugEnabled(); 39 | } 40 | 41 | public void error(String s, Throwable e) { 42 | log.error(s, e); 43 | } 44 | 45 | public void error(String s) { 46 | log.error(s); 47 | } 48 | 49 | public void debug(String s) { 50 | log.debug(s); 51 | } 52 | 53 | public void debug(String s, Throwable e) { 54 | log.debug(s, e); 55 | } 56 | 57 | public void warn(String s) { 58 | log.warn(s); 59 | } 60 | 61 | @Override 62 | public void warn(String s, Throwable e) { 63 | log.warn(s, e); 64 | } 65 | 66 | 67 | @Override 68 | public boolean isInfoEnabled() { 69 | return log.isInfoEnabled(); 70 | } 71 | 72 | @Override 73 | public void info(String msg) { 74 | log.info(msg); 75 | } 76 | 77 | 78 | @Override 79 | public boolean isWarnEnabled() { 80 | return log.isWarnEnabled(); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/Jdk14LoggingImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package org.nlpcn.commons.lang.util.logging; 17 | 18 | import java.util.logging.Level; 19 | import java.util.logging.Logger; 20 | 21 | public class Jdk14LoggingImpl implements Log { 22 | 23 | private Logger log; 24 | 25 | private String loggerName; 26 | 27 | public Jdk14LoggingImpl(String loggerName){ 28 | this.loggerName = loggerName; 29 | log = Logger.getLogger(loggerName); 30 | } 31 | 32 | public boolean isDebugEnabled() { 33 | return log.isLoggable(Level.FINE); 34 | } 35 | 36 | public void error(String s, Throwable e) { 37 | log.logp(Level.SEVERE, loggerName, Thread.currentThread().getStackTrace()[1].getMethodName(), s, e); 38 | } 39 | 40 | public void error(String s) { 41 | log.logp(Level.SEVERE, loggerName, Thread.currentThread().getStackTrace()[1].getMethodName(), s); 42 | } 43 | 44 | public void debug(String s) { 45 | log.logp(Level.FINE, loggerName, Thread.currentThread().getStackTrace()[1].getMethodName(), s); 46 | } 47 | 48 | public void debug(String s, Throwable e) { 49 | log.logp(Level.FINE, loggerName, Thread.currentThread().getStackTrace()[1].getMethodName(), s, e); 50 | } 51 | 52 | public void warn(String s) { 53 | log.logp(Level.WARNING, loggerName, Thread.currentThread().getStackTrace()[1].getMethodName(), s); 54 | } 55 | 56 | @Override 57 | public void warn(String s, Throwable e) { 58 | log.logp(Level.WARNING, loggerName, Thread.currentThread().getStackTrace()[1].getMethodName(), s, e); 59 | } 60 | 61 | @Override 62 | public boolean isInfoEnabled() { 63 | return log.isLoggable(Level.INFO); 64 | 
} 65 | 66 | @Override 67 | public void info(String msg) { 68 | log.logp(Level.INFO, loggerName, Thread.currentThread().getStackTrace()[1].getMethodName(), msg); 69 | } 70 | 71 | @Override 72 | public boolean isWarnEnabled() { 73 | return log.isLoggable(Level.WARNING); 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/Log.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * Minimal logging facade implemented by the backend-specific adapters in this package.
 *
 * <p>Four levels are exposed: debug, info, warn and error. Debug, info and warn each
 * have an "is enabled" query; error has none.</p>
 */
public interface Log {

    // ---- debug ----

    /** @return whether debug output is enabled */
    boolean isDebugEnabled();

    /** @param msg message to log at debug level */
    void debug(String msg);

    /**
     * @param msg message to log at debug level
     * @param e   throwable to log with the message
     */
    void debug(String msg, Throwable e);

    // ---- info ----

    /** @return whether info output is enabled */
    boolean isInfoEnabled();

    /** @param msg message to log at info level */
    void info(String msg);

    // ---- warn ----

    /** @return whether warn output is enabled */
    boolean isWarnEnabled();

    /** @param msg message to log at warn level */
    void warn(String msg);

    /**
     * @param msg message to log at warn level
     * @param e   throwable to log with the message
     */
    void warn(String msg, Throwable e);

    // ---- error ----

    /** @param msg message to log at error level */
    void error(String msg);

    /**
     * @param msg message to log at error level
     * @param e   throwable to log with the message
     */
    void error(String msg, Throwable e);

}
21 | */ 22 | public class Log4j2Impl implements Log { 23 | 24 | private Logger log; 25 | 26 | private int errorCount; 27 | private int warnCount; 28 | private int infoCount; 29 | private int debugCount; 30 | 31 | /** 32 | * @since 0.2.21 33 | * @param log 34 | */ 35 | public Log4j2Impl(Logger log){ 36 | this.log = log; 37 | } 38 | 39 | public Log4j2Impl(String loggerName){ 40 | log = LogManager.getLogger(loggerName); 41 | } 42 | 43 | public Logger getLog() { 44 | return log; 45 | } 46 | 47 | public boolean isDebugEnabled() { 48 | return log.isDebugEnabled(); 49 | } 50 | 51 | public void error(String s, Throwable e) { 52 | errorCount++; 53 | log.error(s, e); 54 | } 55 | 56 | public void error(String s) { 57 | errorCount++; 58 | log.error(s); 59 | } 60 | 61 | public void debug(String s) { 62 | debugCount++; 63 | log.debug(s); 64 | } 65 | 66 | public void debug(String s, Throwable e) { 67 | debugCount++; 68 | log.debug(s, e); 69 | } 70 | 71 | public void warn(String s) { 72 | log.warn(s); 73 | warnCount++; 74 | } 75 | 76 | public void warn(String s, Throwable e) { 77 | log.warn(s, e); 78 | warnCount++; 79 | } 80 | 81 | public int getWarnCount() { 82 | return warnCount; 83 | } 84 | 85 | public int getErrorCount() { 86 | return errorCount; 87 | } 88 | 89 | public void resetStat() { 90 | errorCount = 0; 91 | warnCount = 0; 92 | infoCount = 0; 93 | debugCount = 0; 94 | } 95 | 96 | public int getDebugCount() { 97 | return debugCount; 98 | } 99 | 100 | public boolean isInfoEnabled() { 101 | return log.isInfoEnabled(); 102 | } 103 | 104 | public void info(String msg) { 105 | infoCount++; 106 | log.info(msg); 107 | } 108 | 109 | public boolean isWarnEnabled() { 110 | return log.isEnabled(Level.WARN); 111 | } 112 | 113 | public int getInfoCount() { 114 | return infoCount; 115 | } 116 | 117 | public String toString() { 118 | return log.toString(); 119 | } 120 | 121 | } 122 | -------------------------------------------------------------------------------- 
/pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/Log4jImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package org.nlpcn.commons.lang.util.logging; 17 | 18 | import org.apache.log4j.Level; 19 | import org.apache.log4j.Logger; 20 | 21 | public class Log4jImpl implements Log { 22 | 23 | private static final String callerFQCN = Log4jImpl.class.getName(); 24 | 25 | private Logger log; 26 | 27 | private int errorCount; 28 | private int warnCount; 29 | private int infoCount; 30 | private int debugCount; 31 | 32 | /** 33 | * @since 0.2.21 34 | * @param log 35 | */ 36 | public Log4jImpl(Logger log){ 37 | this.log = log; 38 | } 39 | 40 | public Log4jImpl(String loggerName){ 41 | log = Logger.getLogger(loggerName); 42 | } 43 | 44 | public Logger getLog() { 45 | return log; 46 | } 47 | 48 | public boolean isDebugEnabled() { 49 | return log.isDebugEnabled(); 50 | } 51 | 52 | public void error(String s, Throwable e) { 53 | errorCount++; 54 | log.log(callerFQCN, Level.ERROR, s, e); 55 | } 56 | 57 | public void error(String s) { 58 | errorCount++; 59 | log.log(callerFQCN, Level.ERROR, s, null); 60 | } 61 | 62 | public void debug(String s) { 63 | debugCount++; 64 | log.log(callerFQCN, Level.DEBUG, s, null); 65 | } 66 | 67 | public void debug(String s, Throwable e) { 68 
| debugCount++; 69 | log.log(callerFQCN, Level.DEBUG, s, e); 70 | } 71 | 72 | public void warn(String s) { 73 | log.log(callerFQCN, Level.WARN, s, null); 74 | warnCount++; 75 | } 76 | 77 | public void warn(String s, Throwable e) { 78 | log.log(callerFQCN, Level.WARN, s, e); 79 | warnCount++; 80 | } 81 | 82 | public int getWarnCount() { 83 | return warnCount; 84 | } 85 | 86 | public int getErrorCount() { 87 | return errorCount; 88 | } 89 | 90 | public void resetStat() { 91 | errorCount = 0; 92 | warnCount = 0; 93 | infoCount = 0; 94 | debugCount = 0; 95 | } 96 | 97 | public int getDebugCount() { 98 | return debugCount; 99 | } 100 | 101 | public boolean isInfoEnabled() { 102 | return log.isInfoEnabled(); 103 | } 104 | 105 | public void info(String msg) { 106 | infoCount++; 107 | log.log(callerFQCN, Level.INFO, msg, null); 108 | } 109 | 110 | public boolean isWarnEnabled() { 111 | return log.isEnabledFor(Level.WARN); 112 | } 113 | 114 | public int getInfoCount() { 115 | return infoCount; 116 | } 117 | 118 | public String toString() { 119 | return log.toString(); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/LogFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package org.nlpcn.commons.lang.util.logging; 17 | 18 | import java.lang.reflect.Constructor; 19 | 20 | @SuppressWarnings("rawtypes") 21 | public class LogFactory { 22 | 23 | private static Constructor logConstructor; 24 | 25 | static { 26 | String logType = System.getProperty("druid.logType"); 27 | if (logType != null) { 28 | if (logType.equalsIgnoreCase("slf4j")) { 29 | tryImplementation("org.slf4j.Logger", "org.nlpcn.commons.lang.util.logging.SLF4JImpl"); 30 | } else if (logType.equalsIgnoreCase("log4j")) { 31 | tryImplementation("org.apache.log4j.Logger", "org.nlpcn.commons.lang.util.logging.Log4jImpl"); 32 | } else if (logType.equalsIgnoreCase("log4j2")) { 33 | tryImplementation("org.apache.logging.log4j.Logger", "org.nlpcn.commons.lang.util.logging.Log4j2Impl"); 34 | } else if (logType.equalsIgnoreCase("commonsLog")) { 35 | tryImplementation("org.apache.commons.logging.LogFactory", "org.nlpcn.commons.lang.util.logging.JakartaCommonsLoggingImpl"); 36 | } else if (logType.equalsIgnoreCase("jdkLog")) { 37 | tryImplementation("java.util.logging.Logger", "org.nlpcn.commons.lang.util.logging.Jdk14LoggingImpl"); 38 | } 39 | } 40 | // 优先选择log4j,而非Apache Common Logging. 
因为后者无法设置真实Log调用者的信息 41 | tryImplementation("org.apache.log4j.Logger", "org.nlpcn.commons.lang.util.logging.Log4jImpl"); 42 | tryImplementation("org.apache.logging.log4j.Logger", "org.nlpcn.commons.lang.util.logging.Log4j2Impl"); 43 | tryImplementation("org.slf4j.Logger", "org.nlpcn.commons.lang.util.logging.SLF4JImpl"); 44 | tryImplementation("org.apache.commons.logging.LogFactory", "org.nlpcn.commons.lang.util.logging.JakartaCommonsLoggingImpl"); 45 | tryImplementation("java.util.logging.Logger", "org.nlpcn.commons.lang.util.logging.Jdk14LoggingImpl"); 46 | 47 | if (logConstructor == null) { 48 | try { 49 | logConstructor = NoLoggingImpl.class.getConstructor(String.class); 50 | } catch (Exception e) { 51 | throw new IllegalStateException(e.getMessage(), e); 52 | } 53 | } 54 | } 55 | 56 | @SuppressWarnings("unchecked") 57 | private static void tryImplementation(String testClassName, String implClassName) { 58 | if (logConstructor != null) { 59 | return; 60 | } 61 | 62 | try { 63 | Resources.classForName(testClassName); 64 | Class implClass = Resources.classForName(implClassName); 65 | logConstructor = implClass.getConstructor(new Class[] { String.class }); 66 | 67 | Class declareClass = logConstructor.getDeclaringClass(); 68 | if (!Log.class.isAssignableFrom(declareClass)) { 69 | logConstructor = null; 70 | } 71 | 72 | try { 73 | if (null != logConstructor) { 74 | logConstructor.newInstance(LogFactory.class.getName()); 75 | } 76 | } catch (Throwable t) { 77 | logConstructor = null; 78 | } 79 | 80 | } catch (Throwable t) { 81 | // skip 82 | } 83 | } 84 | 85 | public static Log getLog(Class clazz) { 86 | return getLog(clazz.getName()); 87 | } 88 | 89 | public static Log getLog(String loggerName) { 90 | try { 91 | return (Log) logConstructor.newInstance(loggerName); 92 | } catch (Throwable t) { 93 | throw new RuntimeException("Error creating logger for logger '" + loggerName + "'. 
Cause: " + t, t); 94 | } 95 | } 96 | 97 | /** 98 | * 获取log默认当前类,不支持android 99 | * @return 100 | */ 101 | public static Log getLog() { 102 | StackTraceElement[] sts = Thread.currentThread().getStackTrace(); 103 | return getLog(sts[2].getClassName()); 104 | } 105 | 106 | @SuppressWarnings("unchecked") 107 | public static synchronized void selectLog4JLogging() { 108 | try { 109 | Resources.classForName("org.apache.log4j.Logger"); 110 | Class implClass = Resources.classForName("org.nlpcn.commons.lang.util.logging.Log4jImpl"); 111 | logConstructor = implClass.getConstructor(new Class[] { String.class }); 112 | } catch (Throwable t) { 113 | //ignore 114 | } 115 | } 116 | 117 | @SuppressWarnings("unchecked") 118 | public static synchronized void selectJavaLogging() { 119 | try { 120 | Resources.classForName("java.util.logging.Logger"); 121 | Class implClass = Resources.classForName("org.nlpcn.commons.lang.util.logging.Jdk14LoggingImpl"); 122 | logConstructor = implClass.getConstructor(new Class[] { String.class }); 123 | } catch (Throwable t) { 124 | //ignore 125 | } 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/NoLoggingImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package org.nlpcn.commons.lang.util.logging; 17 | 18 | public class NoLoggingImpl implements Log { 19 | 20 | private String loggerName; 21 | 22 | public NoLoggingImpl(String loggerName){ 23 | this.loggerName = loggerName; 24 | } 25 | 26 | public String getLoggerName() { 27 | return this.loggerName; 28 | } 29 | 30 | public boolean isDebugEnabled() { 31 | return false; 32 | } 33 | 34 | public void error(String s, Throwable e) { 35 | error(s); 36 | 37 | if (e != null) { 38 | e.printStackTrace(); 39 | } 40 | } 41 | 42 | public void error(String s) { 43 | if (s != null) { 44 | System.err.println(loggerName + " : " + s); 45 | } 46 | } 47 | 48 | public void debug(String s) { 49 | System.out.println(s); 50 | } 51 | 52 | public void debug(String s, Throwable e) { 53 | System.out.println(s+e!=null?e.getMessage():""); 54 | } 55 | 56 | public void warn(String s) { 57 | System.out.println(s); 58 | } 59 | 60 | @Override 61 | public void warn(String s, Throwable e) { 62 | System.out.println(s+","+e!=null?e.getMessage():""); 63 | } 64 | 65 | 66 | @Override 67 | public boolean isInfoEnabled() { 68 | return false; 69 | } 70 | 71 | @Override 72 | public void info(String s) { 73 | System.out.println(s); 74 | } 75 | 76 | @Override 77 | public boolean isWarnEnabled() { 78 | return false; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/Resources.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package org.nlpcn.commons.lang.util.logging; 17 | 18 | 19 | /** 20 | * A class to simplify access to resources through the classloader. 21 | */ 22 | public final class Resources extends Object { 23 | 24 | private static ClassLoader defaultClassLoader; 25 | 26 | private Resources(){ 27 | } 28 | 29 | /** 30 | * Returns the default classloader (may be null). 31 | * 32 | * @return The default classloader 33 | */ 34 | public static ClassLoader getDefaultClassLoader() { 35 | return defaultClassLoader; 36 | } 37 | 38 | /** 39 | * Sets the default classloader 40 | * 41 | * @param defaultClassLoader - the new default ClassLoader 42 | */ 43 | public static void setDefaultClassLoader(ClassLoader defaultClassLoader) { 44 | Resources.defaultClassLoader = defaultClassLoader; 45 | } 46 | 47 | /** 48 | * Loads a class 49 | * 50 | * @param className - the class to load 51 | * @return The loaded class 52 | * @throws ClassNotFoundException If the class cannot be found (duh!) 53 | */ 54 | public static Class classForName(String className) throws ClassNotFoundException { 55 | Class clazz = null; 56 | try { 57 | clazz = getClassLoader().loadClass(className); 58 | } catch (Exception e) { 59 | // Ignore. Failsafe below. 
60 | } 61 | if (clazz == null) { 62 | clazz = Class.forName(className); 63 | } 64 | return clazz; 65 | } 66 | 67 | private static ClassLoader getClassLoader() { 68 | if (defaultClassLoader != null) { 69 | return defaultClassLoader; 70 | } else { 71 | return Thread.currentThread().getContextClassLoader(); 72 | } 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/logging/SLF4JImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1999-2101 Alibaba Group Holding Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package org.nlpcn.commons.lang.util.logging; 17 | 18 | import org.slf4j.Logger; 19 | import org.slf4j.LoggerFactory; 20 | import org.slf4j.spi.LocationAwareLogger; 21 | 22 | public class SLF4JImpl implements Log { 23 | 24 | private static final String callerFQCN = SLF4JImpl.class.getName(); 25 | private static final Logger testLogger = LoggerFactory.getLogger(SLF4JImpl.class); 26 | static { 27 | // if the logger is not a LocationAwareLogger instance, it can not get correct stack StackTraceElement 28 | // so ignore this implementation. 
29 | if (!(testLogger instanceof LocationAwareLogger)) { 30 | throw new UnsupportedOperationException(testLogger.getClass() + " is not a suitable logger"); 31 | } 32 | } 33 | private LocationAwareLogger log; 34 | 35 | public SLF4JImpl(LocationAwareLogger log){ 36 | this.log = log; 37 | } 38 | 39 | public SLF4JImpl(String loggerName){ 40 | this.log = (LocationAwareLogger) LoggerFactory.getLogger(loggerName); 41 | } 42 | 43 | @Override 44 | public boolean isDebugEnabled() { 45 | return log.isDebugEnabled(); 46 | } 47 | 48 | @Override 49 | public void error(String msg, Throwable e) { 50 | log.log(null, callerFQCN, LocationAwareLogger.ERROR_INT, msg, null, e); 51 | } 52 | 53 | @Override 54 | public void error(String msg) { 55 | log.log(null, callerFQCN, LocationAwareLogger.ERROR_INT, msg, null, null); 56 | } 57 | 58 | @Override 59 | public boolean isInfoEnabled() { 60 | return log.isInfoEnabled(); 61 | } 62 | 63 | @Override 64 | public void info(String msg) { 65 | log.log(null, callerFQCN, LocationAwareLogger.INFO_INT, msg, null, null); 66 | } 67 | 68 | @Override 69 | public void debug(String msg) { 70 | log.log(null, callerFQCN, LocationAwareLogger.DEBUG_INT, msg, null, null); 71 | } 72 | 73 | @Override 74 | public void debug(String msg, Throwable e) { 75 | log.log(null, callerFQCN, LocationAwareLogger.ERROR_INT, msg, null, e); 76 | } 77 | 78 | @Override 79 | public boolean isWarnEnabled() { 80 | return log.isWarnEnabled(); 81 | } 82 | 83 | @Override 84 | public void warn(String msg) { 85 | log.log(null, callerFQCN, LocationAwareLogger.WARN_INT, msg, null, null); 86 | } 87 | 88 | @Override 89 | public void warn(String msg, Throwable e) { 90 | log.log(null, callerFQCN, LocationAwareLogger.WARN_INT, msg, null, e); 91 | } 92 | 93 | } 94 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/KeyValue.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples; 21 | 22 | import java.util.Collection; 23 | import java.util.Iterator; 24 | 25 | import org.nlpcn.commons.lang.util.tuples.valueintf.IValueKey; 26 | import org.nlpcn.commons.lang.util.tuples.valueintf.IValueValue; 27 | 28 | /** 29 | *

30 | * A tuple of two elements, with positions 0 and 1 renamed as "key" and 31 | * "value", respectively. 32 | *

/**
 * A tuple of two elements, with positions 0 and 1 renamed as "key" and
 * "value", respectively. Instances hold both elements in final fields;
 * {@link #setKey} and {@link #setValue} return new instances.
 *
 * NOTE(review): the type variables A, B, X and Y are used below without a
 * visible declaration; the generic parameter lists look to be missing from
 * this listing - confirm against the upstream javatuples source.
 *
 * @since 1.0
 *
 * @author Daniel Fernández
 *
 */
public final class KeyValue
        extends Tuple
        implements IValueKey,
                   IValueValue {

    private static final long serialVersionUID = 3460957157833872509L;

    /** Number of elements in this tuple; returned by {@link #getSize()}. */
    private static final int SIZE = 2;

    private final A key;
    private final B value;



    /**
     * Static factory equivalent to calling the constructor.
     *
     * @param key the element stored as key
     * @param value the element stored as value
     * @return a new tuple holding both elements
     */
    public static KeyValue with(final A key, final B value) {
        return new KeyValue(key,value);
    }


    /**
     * Create tuple from array. Array has to have exactly two elements.
     *
     * @param <X> the array component type
     * @param array the array to be converted to a tuple
     * @return the tuple
     * @throws IllegalArgumentException if the array is null or its length is not 2
     */
    public static KeyValue fromArray(final X[] array) {
        if (array == null) {
            throw new IllegalArgumentException("Array cannot be null");
        }
        if (array.length != 2) {
            throw new IllegalArgumentException("Array must have exactly 2 elements in order to create a KeyValue. Size is " + array.length);
        }
        return new KeyValue(array[0],array[1]);
    }


    /**
     * Create tuple from a collection; delegates to {@link #fromIterable(Iterable)},
     * so the collection must yield exactly two elements.
     */
    public static KeyValue fromCollection(final Collection collection) {
        return fromIterable(collection);
    }



    /**
     * Create tuple from an iterable that yields exactly two elements.
     *
     * @throws IllegalArgumentException if the iterable is null, or yields fewer
     *         or more than two elements
     */
    public static KeyValue fromIterable(final Iterable iterable) {
        return fromIterable(iterable, 0, true);
    }



    /**
     * Create tuple from the two elements found after skipping {@code index}
     * elements of the iterable. Size is not enforced: an element that is not
     * available is stored as null and further elements are left unread.
     *
     * @throws IllegalArgumentException if the iterable is null
     */
    public static KeyValue fromIterable(final Iterable iterable, int index) {
        return fromIterable(iterable, index, false);
    }



    /**
     * Shared implementation of the two public {@code fromIterable} variants.
     *
     * @param iterable source of the elements; must not be null
     * @param index number of leading elements to skip
     * @param exactSize when true, fewer than two available elements or any
     *        element left over after the second one is an error
     */
    private static KeyValue fromIterable(final Iterable iterable, int index, final boolean exactSize) {

        if (iterable == null) {
            throw new IllegalArgumentException("Iterable cannot be null");
        }

        boolean tooFewElements = false;

        X element0 = null;
        X element1 = null;

        final Iterator iter = iterable.iterator();

        // Skip the first 'index' elements; running out is recorded, not thrown here.
        int i = 0;
        while (i < index) {
            if (iter.hasNext()) {
                iter.next();
            } else {
                tooFewElements = true;
            }
            i++;
        }

        if (iter.hasNext()) {
            element0 = iter.next();
        } else {
            tooFewElements = true;
        }

        if (iter.hasNext()) {
            element1 = iter.next();
        } else {
            tooFewElements = true;
        }

        if (tooFewElements && exactSize) {
            throw new IllegalArgumentException("Not enough elements for creating a KeyValue (2 needed)");
        }

        // In exact mode nothing may remain after the second element.
        if (iter.hasNext() && exactSize) {
            throw new IllegalArgumentException("Iterable must have exactly 2 available elements in order to create a KeyValue.");
        }

        return new KeyValue(element0, element1);

    }


    /**
     * Creates the tuple; both elements are also passed to the {@code Tuple}
     * superclass constructor.
     */
    public KeyValue(
            final A key,
            final B value) {
        super(key, value);
        this.key = key;
        this.value = value;
    }


    /** @return the element stored as key */
    public A getKey() {
        return this.key;
    }


    /** @return the element stored as value */
    public B getValue() {
        return this.value;
    }


    /** @return the constant size of this tuple, 2 */
    @Override
    public int getSize() {
        return SIZE;
    }



    /** @return a new tuple with the given key and this tuple's value */
    public KeyValue setKey(final X key) {
        return new KeyValue(key, this.value);
    }


    /** @return a new tuple with this tuple's key and the given value */
    public KeyValue setValue(final Y value) {
        return new KeyValue(this.key, value);
    }





}
17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples; 21 | 22 | import java.util.Collection; 23 | import java.util.Iterator; 24 | 25 | import org.nlpcn.commons.lang.util.tuples.valueintf.IValueLabel; 26 | import org.nlpcn.commons.lang.util.tuples.valueintf.IValueValue; 27 | 28 | /** 29 | *

30 | * A tuple of two elements, with positions 0 and 1 renamed as "label" and 31 | * "value", respectively. 32 | *

/**
 * A tuple of two elements, with positions 0 and 1 renamed as "label" and
 * "value", respectively. Instances hold both elements in final fields;
 * {@link #setLabel} and {@link #setValue} return new instances.
 *
 * NOTE(review): the type variables A, B, X and Y are used below without a
 * visible declaration; the generic parameter lists look to be missing from
 * this listing - confirm against the upstream javatuples source.
 *
 * @since 1.0
 *
 * @author Daniel Fernández
 *
 */
public final class LabelValue
        extends Tuple
        implements IValueLabel,
                   IValueValue {

    private static final long serialVersionUID = 5055574980300695706L;

    /** Number of elements in this tuple; returned by {@link #getSize()}. */
    private static final int SIZE = 2;

    private final A label;
    private final B value;



    /**
     * Static factory equivalent to calling the constructor.
     *
     * @param label the element stored as label
     * @param value the element stored as value
     * @return a new tuple holding both elements
     */
    public static LabelValue with(final A label, final B value) {
        return new LabelValue(label,value);
    }


    /**
     * Create tuple from array. Array has to have exactly two elements.
     *
     * @param <X> the array component type
     * @param array the array to be converted to a tuple
     * @return the tuple
     * @throws IllegalArgumentException if the array is null or its length is not 2
     */
    public static LabelValue fromArray(final X[] array) {
        if (array == null) {
            throw new IllegalArgumentException("Array cannot be null");
        }
        if (array.length != 2) {
            throw new IllegalArgumentException("Array must have exactly 2 elements in order to create a LabelValue. Size is " + array.length);
        }
        return new LabelValue(array[0],array[1]);
    }


    /**
     * Create tuple from a collection; delegates to {@link #fromIterable(Iterable)},
     * so the collection must yield exactly two elements.
     */
    public static LabelValue fromCollection(final Collection collection) {
        return fromIterable(collection);
    }



    /**
     * Create tuple from an iterable that yields exactly two elements.
     *
     * @throws IllegalArgumentException if the iterable is null, or yields fewer
     *         or more than two elements
     */
    public static LabelValue fromIterable(final Iterable iterable) {
        return fromIterable(iterable, 0, true);
    }



    /**
     * Create tuple from the two elements found after skipping {@code index}
     * elements of the iterable. Size is not enforced: an element that is not
     * available is stored as null and further elements are left unread.
     *
     * @throws IllegalArgumentException if the iterable is null
     */
    public static LabelValue fromIterable(final Iterable iterable, int index) {
        return fromIterable(iterable, index, false);
    }



    /**
     * Shared implementation of the two public {@code fromIterable} variants.
     *
     * @param iterable source of the elements; must not be null
     * @param index number of leading elements to skip
     * @param exactSize when true, fewer than two available elements or any
     *        element left over after the second one is an error
     */
    private static LabelValue fromIterable(final Iterable iterable, int index, final boolean exactSize) {

        if (iterable == null) {
            throw new IllegalArgumentException("Iterable cannot be null");
        }

        boolean tooFewElements = false;

        X element0 = null;
        X element1 = null;

        final Iterator iter = iterable.iterator();

        // Skip the first 'index' elements; running out is recorded, not thrown here.
        int i = 0;
        while (i < index) {
            if (iter.hasNext()) {
                iter.next();
            } else {
                tooFewElements = true;
            }
            i++;
        }

        if (iter.hasNext()) {
            element0 = iter.next();
        } else {
            tooFewElements = true;
        }

        if (iter.hasNext()) {
            element1 = iter.next();
        } else {
            tooFewElements = true;
        }

        if (tooFewElements && exactSize) {
            throw new IllegalArgumentException("Not enough elements for creating a LabelValue (2 needed)");
        }

        // In exact mode nothing may remain after the second element.
        if (iter.hasNext() && exactSize) {
            throw new IllegalArgumentException("Iterable must have exactly 2 available elements in order to create a LabelValue.");
        }

        return new LabelValue(element0, element1);

    }




    /**
     * Creates the tuple; both elements are also passed to the {@code Tuple}
     * superclass constructor.
     */
    public LabelValue(
            final A label,
            final B value) {
        super(label, value);
        this.label = label;
        this.value = value;
    }


    /** @return the element stored as label */
    public A getLabel() {
        return this.label;
    }


    /** @return the element stored as value */
    public B getValue() {
        return this.value;
    }


    /** @return the constant size of this tuple, 2 */
    @Override
    public int getSize() {
        return SIZE;
    }



    /** @return a new tuple with the given label and this tuple's value */
    public LabelValue setLabel(final X label) {
        return new LabelValue(label, this.value);
    }


    /** @return a new tuple with this tuple's label and the given value */
    public LabelValue setValue(final Y value) {
        return new LabelValue(this.label, value);
    }





}
17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples; 21 | 22 | import java.io.Serializable; 23 | import java.util.ArrayList; 24 | import java.util.Arrays; 25 | import java.util.Collection; 26 | import java.util.Collections; 27 | import java.util.Iterator; 28 | import java.util.List; 29 | 30 | 31 | /** 32 | *

33 | * Abstract base class for all tuple classes. 34 | *

35 | * 36 | * @since 1.0 37 | * 38 | * @author Daniel Fernández 39 | * 40 | */ 41 | public abstract class Tuple implements Iterable, Serializable, Comparable { 42 | 43 | private static final long serialVersionUID = 5431085632328343101L; 44 | 45 | private final Object[] valueArray; 46 | private final List valueList; 47 | 48 | 49 | 50 | /** 51 | * 52 | * @deprecated Will be removed in 1.4. The "size" parameter is of no use at 53 | * this level, so use the simpler Tuple(values) constructor instead. 54 | */ 55 | @Deprecated 56 | protected Tuple(@SuppressWarnings("unused") final int size, final Object... values) { 57 | super(); 58 | this.valueArray = values; 59 | this.valueList = Arrays.asList(values); 60 | } 61 | 62 | 63 | 64 | protected Tuple(final Object... values) { 65 | super(); 66 | this.valueArray = values; 67 | this.valueList = Arrays.asList(values); 68 | } 69 | 70 | 71 | /** 72 | *

73 | * Return the size of the tuple. 74 | *

75 | * 76 | * @return the size of the tuple. 77 | */ 78 | public abstract int getSize(); 79 | 80 | 81 | /** 82 | *

83 | * Get the value at a specific position in the tuple. This method 84 | * has to return object, so using it you will lose the type-safety you 85 | * get with the getValueX() methods. 86 | *

87 | * 88 | * @param pos the position of the value to be retrieved. 89 | * @return the value 90 | */ 91 | public final Object getValue(final int pos) { 92 | if (pos >= getSize()) { 93 | throw new IllegalArgumentException( 94 | "Cannot retrieve position " + pos + " in " + this.getClass().getSimpleName() + 95 | ". Positions for this class start with 0 and end with " + (getSize() - 1)); 96 | } 97 | return this.valueArray[pos]; 98 | } 99 | 100 | 101 | 102 | public final Iterator iterator() { 103 | return this.valueList.iterator(); 104 | } 105 | 106 | 107 | @Override 108 | public final String toString() { 109 | return this.valueList.toString(); 110 | } 111 | 112 | 113 | public final boolean contains(final Object value) { 114 | for (final Object val : this.valueList) { 115 | if (val == null) { 116 | if (value == null) { 117 | return true; 118 | } 119 | } else { 120 | if (val.equals(value)) { 121 | return true; 122 | } 123 | } 124 | } 125 | return false; 126 | } 127 | 128 | 129 | public final boolean containsAll(final Collection collection) { 130 | for (final Object value : collection) { 131 | if (!contains(value)) { 132 | return false; 133 | } 134 | } 135 | return true; 136 | } 137 | 138 | 139 | public final boolean containsAll(final Object... 
values) { 140 | if (values == null) { 141 | throw new IllegalArgumentException("Values array cannot be null"); 142 | } 143 | for (final Object value : values) { 144 | if (!contains(value)) { 145 | return false; 146 | } 147 | } 148 | return true; 149 | } 150 | 151 | 152 | 153 | public final int indexOf(final Object value) { 154 | int i = 0; 155 | for (final Object val : this.valueList) { 156 | if (val == null) { 157 | if (value == null) { 158 | return i; 159 | } 160 | } else { 161 | if (val.equals(value)) { 162 | return i; 163 | } 164 | } 165 | i++; 166 | } 167 | return -1; 168 | } 169 | 170 | 171 | public final int lastIndexOf(final Object value) { 172 | for (int i = getSize() - 1; i >= 0; i--) { 173 | final Object val = this.valueList.get(i); 174 | if (val == null) { 175 | if (value == null) { 176 | return i; 177 | } 178 | } else { 179 | if (val.equals(value)) { 180 | return i; 181 | } 182 | } 183 | } 184 | return -1; 185 | } 186 | 187 | 188 | 189 | 190 | 191 | public final List toList() { 192 | return Collections.unmodifiableList(new ArrayList(this.valueList)); 193 | } 194 | 195 | 196 | 197 | public final Object[] toArray() { 198 | return this.valueArray.clone(); 199 | } 200 | 201 | 202 | 203 | @Override 204 | public final int hashCode() { 205 | final int prime = 31; 206 | int result = 1; 207 | result = prime * result 208 | + ((this.valueList == null) ? 
0 : this.valueList.hashCode()); 209 | return result; 210 | } 211 | 212 | 213 | 214 | @Override 215 | public final boolean equals(final Object obj) { 216 | if (this == obj) { 217 | return true; 218 | } 219 | if (obj == null) { 220 | return false; 221 | } 222 | if (getClass() != obj.getClass()) { 223 | return false; 224 | } 225 | final Tuple other = (Tuple) obj; 226 | return this.valueList.equals(other.valueList); 227 | } 228 | 229 | 230 | 231 | 232 | @SuppressWarnings({ "rawtypes", "unchecked" }) 233 | public int compareTo(final Tuple o) { 234 | 235 | final int tLen = this.valueArray.length; 236 | final Object[] oValues = o.valueArray; 237 | final int oLen = oValues.length; 238 | 239 | for (int i = 0; i < tLen && i < oLen; i++) { 240 | 241 | final Comparable tElement = (Comparable)this.valueArray[i]; 242 | final Comparable oElement = (Comparable)oValues[i]; 243 | 244 | final int comparison = tElement.compareTo(oElement); 245 | if (comparison != 0) { 246 | return comparison; 247 | } 248 | 249 | } 250 | 251 | return (Integer.valueOf(tLen)).compareTo(Integer.valueOf(oLen)); 252 | 253 | } 254 | 255 | 256 | 257 | } 258 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue0.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | 23 | /** 24 | *

25 | * Marker interface for tuples with a "0" value. 26 | *

27 | 28 | * @since 1.1 29 | * 30 | * @author Daniel Fernández 31 | * 32 | */ 33 | public interface IValue0 { 34 | 35 | public X getValue0(); 36 | 37 | } 38 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue1.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "1" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue1 { 33 | 34 | public X getValue1(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "2" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue2 { 33 | 34 | public X getValue2(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue3.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "3" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue3 { 33 | 34 | public X getValue3(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue4.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "4" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue4 { 33 | 34 | public X getValue4(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue5.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "5" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue5 { 33 | 34 | public X getValue5(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue6.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "6" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue6 { 33 | 34 | public X getValue6(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue7.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "7" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue7 { 33 | 34 | public X getValue7(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue8.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "8" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue8 { 33 | 34 | public X getValue8(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValue9.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "9" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValue9 { 33 | 34 | public X getValue9(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValueKey.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "key" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValueKey { 33 | 34 | public X getKey(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValueLabel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * ============================================================================= 19 | */ 20 | package org.nlpcn.commons.lang.util.tuples.valueintf; 21 | 22 | /** 23 | *

24 | * Marker interface for tuples with a "label" value. 25 | *

26 | 27 | * @since 1.1 28 | * 29 | * @author Daniel Fernández 30 | * 31 | */ 32 | public interface IValueLabel { 33 | 34 | public X getLabel(); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /pinyin-core/src/main/java/org/nlpcn/commons/lang/util/tuples/valueintf/IValueValue.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ============================================================================= 3 | * 4 | 5 | * Copyright (c) 2010, The JAVATUPLES team (http://www.javatuples.org) 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | * 19 | * ============================================================================= 20 | */ 21 | package org.nlpcn.commons.lang.util.tuples.valueintf; 22 | 23 | /** 24 | *

25 | * Marker interface for tuples with a "value" value. 26 | *

27 | 28 | * @since 1.1 29 | * 30 | * @author Daniel Fernández 31 | * 32 | */
public interface IValueValue<X> {

    /**
     * Returns the tuple's "value" value.
     *
     * @return the value, typed by this interface's type parameter {@code X}
     */
    public X getValue();

}
-------------------------------------------------------------------------------- /pinyin-core/src/main/resources/pinyin_alphabet.dict: -------------------------------------------------------------------------------- 1 | a 2 | ai 3 | an 4 | ang 5 | ao 6 | b 7 | ba 8 | bai 9 | ban 10 | bang 11 | bao 12 | bei 13 | ben 14 | beng 15 | bi 16 | bian 17 | biao 18 | bie 19 | bin 20 | bing 21 | bo 22 | bu 23 | c 24 | ca 25 | cai 26 | can 27 | cang 28 | cao 29 | ce 30 | cen 31 | ceng 32 | ch 33 | cha 34 | chai 35 | chan 36 | chang 37 | chao 38 | che 39 | chen 40 | cheng 41 | chi 42 | chong 43 | chou 44 | chu 45 | chua 46 | chuai 47 | chuan 48 | chuang 49 | chui 50 | chun 51 | chuo 52 | ci 53 | cong 54 | cou 55 | cu 56 | cuan 57 | cui 58 | cun 59 | cuo 60 | d 61 | da 62 | dai 63 | dan 64 | dang 65 | dao 66 | de 67 | dei 68 | den 69 | deng 70 | di 71 | dia 72 | dian 73 | diao 74 | die 75 | ding 76 | diu 77 | dong 78 | dou 79 | du 80 | duan 81 | dui 82 | dun 83 | duo 84 | e 85 | er 86 | en 87 | f 88 | fa 89 | fan 90 | fang 91 | fei 92 | fen 93 | feng 94 | fiao 95 | fo 96 | fou 97 | fu 98 | g 99 | ga 100 | gai 101 | gan 102 | gang 103 | gao 104 | ge 105 | gei 106 | gen 107 | geng 108 | gong 109 | gou 110 | gu 111 | gua 112 | guai 113 | guan 114 | guang 115 | gui 116 | gun 117 | guo 118 | h 119 | ha 120 | hai 121 | han 122 | hang 123 | hao 124 | he 125 | hei 126 | hen 127 | heng 128 | hong 129 | hou 130 | hu 131 | hua 132 | huai 133 | huan 134 | huang 135 | hui 136 | hun 137 | huo 138 | i 139 | j 140 | ja 141 | ji 142 | jia 143 | jian 144 | jiang 145 | jiao 146 | jie 147 | jin 148 | jing 149 | jiong 150 | jiu 151 | ju 152 | juan 153 | jue 154 | jun 155 | k 156 | ka 157 | kai 158 | kan 159 | kang 160 | kao 161 | ke 162 | kei 163 | ken 164 | keng 165 | kong 166 | kou 167 | ku 168 | kua 169 | kuai 170 | kuan 171 | kuang 172 |
kui 173 | kun 174 | kuo 175 | l 176 | la 177 | lai 178 | lan 179 | lang 180 | lao 181 | le 182 | lei 183 | leng 184 | li 185 | lia 186 | lian 187 | liang 188 | liao 189 | lie 190 | lin 191 | ling 192 | liu 193 | lo 194 | long 195 | lou 196 | lu 197 | luan 198 | lun 199 | luo 200 | lv 201 | lve 202 | lü 203 | lüe 204 | m 205 | ma 206 | mai 207 | man 208 | mang 209 | mao 210 | me 211 | mei 212 | men 213 | meng 214 | mi 215 | mian 216 | miao 217 | mie 218 | min 219 | ming 220 | miu 221 | mo 222 | mou 223 | mu 224 | n 225 | na 226 | nai 227 | nan 228 | nang 229 | nao 230 | ne 231 | nei 232 | nen 233 | neng 234 | ni 235 | nian 236 | niang 237 | niao 238 | nie 239 | nin 240 | ning 241 | niu 242 | nong 243 | nou 244 | nu 245 | nuan 246 | nun 247 | nuo 248 | nv 249 | nve 250 | nü 251 | nüe 252 | o 253 | ou 254 | p 255 | pa 256 | pai 257 | pan 258 | pang 259 | pao 260 | pei 261 | pen 262 | peng 263 | pi 264 | pian 265 | piao 266 | pie 267 | pin 268 | ping 269 | po 270 | pou 271 | pu 272 | q 273 | qi 274 | qia 275 | qian 276 | qiang 277 | qiao 278 | qie 279 | qin 280 | qing 281 | qiong 282 | qiu 283 | qu 284 | quan 285 | que 286 | qun 287 | r 288 | ran 289 | rang 290 | rao 291 | re 292 | ren 293 | reng 294 | ri 295 | rong 296 | rou 297 | ru 298 | ruan 299 | rui 300 | run 301 | ruo 302 | s 303 | sa 304 | sai 305 | san 306 | sang 307 | sao 308 | se 309 | sen 310 | seng 311 | sh 312 | sha 313 | shai 314 | shan 315 | shang 316 | shao 317 | she 318 | shei 319 | shen 320 | sheng 321 | shi 322 | shou 323 | shu 324 | shua 325 | shuai 326 | shuan 327 | shuang 328 | shui 329 | shun 330 | shuo 331 | si 332 | song 333 | sou 334 | su 335 | suan 336 | sui 337 | sun 338 | suo 339 | t 340 | ta 341 | tai 342 | tan 343 | tang 344 | tao 345 | te 346 | teng 347 | ti 348 | tian 349 | tiao 350 | tie 351 | ting 352 | tong 353 | tou 354 | tu 355 | tuan 356 | tui 357 | tun 358 | tuo 359 | u 360 | v 361 | w 362 | wa 363 | wai 364 | wan 365 | wang 366 | wei 367 | wen 368 | weng 369 | wo 370 | wu 371 | 
x 372 | xi 373 | xia 374 | xian 375 | xiang 376 | xiao 377 | xie 378 | xin 379 | xing 380 | xiong 381 | xiu 382 | xu 383 | xuan 384 | xue 385 | xun 386 | y 387 | ya 388 | yai 389 | yan 390 | yang 391 | yao 392 | ye 393 | yi 394 | yin 395 | ying 396 | yo 397 | yong 398 | you 399 | yu 400 | yuan 401 | yue 402 | yun 403 | z 404 | za 405 | zai 406 | zan 407 | zang 408 | zao 409 | ze 410 | zei 411 | zen 412 | zeng 413 | zh 414 | zha 415 | zhai 416 | zhan 417 | zhang 418 | zhao 419 | zhe 420 | zhei 421 | zhen 422 | zheng 423 | zhi 424 | zhong 425 | zhou 426 | zhu 427 | zhua 428 | zhuai 429 | zhuan 430 | zhuang 431 | zhui 432 | zhun 433 | zhuo 434 | zi 435 | zong 436 | zou 437 | zu 438 | zuan 439 | zui 440 | zun 441 | zuo 442 | ü 443 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/com/infinilabs/pinyin/analysis/PinyinAlphabetTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package com.infinilabs.pinyin.analysis; 2 | 3 | import java.util.Arrays; 4 | 5 | import org.junit.Assert; 6 | import org.junit.Test; 7 | 8 | /** 9 | * 拼音串切分,很难做到最好,认为取最少切分是最好的 10 | * 11 | * @author shenyanchao 12 | * @since 2018-10-08 12:22 13 | */ 14 | public class PinyinAlphabetTokenizerTest { 15 | 16 | @Test 17 | public void walk() throws Exception { 18 | 19 | Assert.assertEquals(Arrays.asList("xian").toString(), PinyinAlphabetTokenizer.walk("xian").toString()); 20 | Assert.assertEquals(Arrays.asList("wo", "shi", "liang").toString(), 21 | PinyinAlphabetTokenizer.walk("woshiliang").toString()); 22 | 23 | Assert.assertEquals(Arrays.asList("zhong", "hua", "ren", "min", "gong", "he", "guo").toString(), 24 | PinyinAlphabetTokenizer.walk("zhonghuarenmingongheguo").toString()); 25 | Assert.assertEquals( 26 | Arrays.asList("5", "zhong", "hua", "ren", "89", "min", "gong", "he", "guo", "234").toString(), 27 | PinyinAlphabetTokenizer.walk("5zhonghuaren89mingongheguo234").toString()); 28 | } 29 | 
30 | } -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/TestUtils.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang; 2 | 3 | import java.util.List; 4 | 5 | import org.nlpcn.commons.lang.pinyin.Pinyin; 6 | 7 | public class TestUtils { 8 | 9 | public static String mainResources(final String file) { 10 | return System.getProperties().getProperty("user.dir") + "/src/main/resources" 11 | + (file.startsWith("/") ? file : "/" + file); 12 | } 13 | 14 | public static String testResources(final String file) { 15 | return System.getProperties().getProperty("user.dir") + "/src/test/resources" 16 | + (file.startsWith("/") ? file : "/" + file); 17 | } 18 | 19 | public static void main(String[] args) { 20 | List parseStr = Pinyin.unicodePinyin("日往月来"); 21 | System.out.println(parseStr); 22 | parseStr = Pinyin.pinyin("日往月来"); 23 | System.out.println(parseStr); 24 | parseStr = Pinyin.tonePinyin("日往月来"); 25 | System.out.println(parseStr); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/pinyin/PinyinTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.pinyin; 2 | 3 | import java.util.List; 4 | 5 | import org.junit.Assert; 6 | import org.junit.Test; 7 | 8 | public class PinyinTest { 9 | 10 | String str = "正品行货 正品行货 码完代码,他起身关上电脑,用滚烫的开水为自己泡制一碗腾着热气的老坛酸菜面。中国的程序员更偏爱拉上窗帘,在黑暗中享受这独特的美食。这是现代工业给一天辛苦劳作的人最好的馈赠。南方一带生长的程序员虽然在京城多年,但仍口味清淡,他们往往不加料包,由脸颊自然淌下的热泪补充恰当的盐分。他们相信,用这种方式,能够抹平思考着现在是不是过去想要的未来而带来的大部分忧伤…小李的父亲在年轻的时候也是从爷爷手里接收了祖传的代码,不过令人惊讶的是,到了小李这一代,很多东西都遗失了,但是程序员苦逼的味道保存的是如此的完整。 就在24小时之前,最新的需求从PM处传来,为了得到这份自然的馈赠,码农们开机、写码、调试、重构,四季轮回的等待换来这难得的丰收时刻。码农知道,需求的保鲜期只有短短的两天,码农们要以最快的速度对代码进行精致的加工,任何一个需求都可能在24小时之后失去原本的活力,变成一文不值的垃圾创意。"; 11 | 12 | // String str = "點下面繁體字按鈕進行在線轉換" ; 13 | 14 | 15 
| /** 16 | * Dynamically registers a pinyin reading: converts the sample text with tonePinyin, registers hang2/huo4 for "行货" through insertPinyin, converts again and compares element 2 of both results. NOTE(review): assertNotSame checks object identity, not string content - confirm that this is the intended check. 17 | */ 18 | @Test 19 | public void testInsertPinyin(){ 20 | List result1 = (Pinyin.tonePinyin(str)); 21 | System.out.println("result1:"+result1); 22 | Pinyin.insertPinyin("行货", new String[]{"hang2","huo4"}); 23 | List result2 = (Pinyin.tonePinyin(str)); 24 | System.out.println("result2:"+result2); 25 | Assert.assertNotSame(result1.get(2), result2.get(2)); 26 | 27 | } 28 | 29 | /** 30 | * Converts a pinyin list to a String: prints the unicodePinyin list, then list2String and list2StringSkipNull of it. Makes no assertions. 31 | */ 32 | @Test 33 | public void testList2String(){ 34 | List list = Pinyin.unicodePinyin(str); 35 | 36 | System.out.println(list); 37 | 38 | System.out.println(Pinyin.list2String(list)); 39 | 40 | System.out.println(Pinyin.list2StringSkipNull(list)); 41 | } 42 | 43 | 44 | /** Checks that Pinyin.unicodePinyin(str) yields one list element per char of the input. NOTE(review): JUnit declares assertEquals(expected, actual); the arguments in this class are passed in the opposite order, which only affects failure messages. */ @Test 45 | public void testStr2Pinyin() { 46 | List parseStr = Pinyin.unicodePinyin(str); 47 | System.out.println(parseStr); 48 | Assert.assertEquals(parseStr.size(), str.length()); 49 | } 50 | 51 | /** 52 | * Checks that Pinyin.pinyin(str) yields one list element per char of the input. 53 | * 54 | * Example of the output shape given by the original author: 55 | * ['zhong3','guo4'] 56 | */ 57 | @Test 58 | public void testPinyinStr() { 59 | List result = Pinyin.pinyin(str); 60 | System.out.println(result); 61 | Assert.assertEquals(result.size(), str.length()); 62 | 63 | } 64 | 65 | /** 66 | * Pinyin of every character, without tones. 67 | * 68 | * NOTE(review): this calls the same Pinyin.pinyin(str) as testPinyinStr and makes the same size assertion; confirm whether a tone-less variant was meant. 69 | */ 70 | @Test 71 | public void testPinyinWithoutTone() { 72 | List result = Pinyin.pinyin(str); 73 | System.out.println(result); 74 | Assert.assertEquals(result.size(), str.length()); 75 | } 76 | 77 | /** 78 | * First character of every character's pinyin. 79 | * 80 | * Checks that Pinyin.firstChar(str) yields one list element 81 | * per char of the input. 82 | */ 83 | @Test 84 | public void testStr2FirstCharArr() { 85 | List result = Pinyin.firstChar(str); 86 | System.out.println(result); 87 | Assert.assertEquals(result.size(), str.length()); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/tire/splitWord/AllWordTest.java: -------------------------------------------------------------------------------- 1 | package
org.nlpcn.commons.lang.tire.splitWord; 2 | 3 | import org.nlpcn.commons.lang.tire.SmartGetWord; 4 | import org.nlpcn.commons.lang.tire.domain.SmartForest; 5 | import org.nlpcn.commons.lang.util.StringUtil; 6 | 7 | /** Manual driver (not a JUnit test): fills a SmartForest in code, then prints every word returned by getAllWords() with its parameter, followed by the elapsed milliseconds. */ public class AllWordTest { 8 | public static void main(String[] args) { 9 | /** 10 | * Dictionary construction: one word per line followed by its parameters; it can be read from a file or from a reader stream. Here the entries are added directly in code. 11 | */ 12 | long start = System.currentTimeMillis(); 13 | SmartForest forest = new SmartForest(); 14 | 15 | forest.add("中国", 3); 16 | 17 | forest.add("android", 3); 18 | 19 | forest.add("java", 3); 20 | 21 | forest.add("jav", 3); 22 | 23 | forest.add("中国人", 3); 24 | forest.add("国人", 3); 25 | 26 | forest.add("0",3); 27 | forest.add("3",3); 28 | 29 | String content = " Android-java-中国人00000000000000 1230 013 33333"; 30 | 31 | 32 | content = StringUtil.rmHtmlTag(content); 33 | 34 | for (int i = 0; i < 1; i++) { 35 | /* The text is lower-cased before matching; the dictionary entries above are lower case. */ SmartGetWord udg = forest.getWord(content.toLowerCase().toCharArray()); 36 | 37 | String temp; 38 | while ((temp = udg.getAllWords()) != null) { 39 | System.out.println(temp + "\t" + udg.getParam()); 40 | } 41 | } 42 | System.out.println(System.currentTimeMillis() - start); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/tire/splitWord/ForestTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.tire.splitWord; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStreamReader; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | //import org.nlpcn.commons.lang.dic.DicManager; 11 | import org.nlpcn.commons.lang.tire.GetWord; 12 | import org.nlpcn.commons.lang.tire.domain.Forest; 13 | import org.nlpcn.commons.lang.tire.library.Library; 14 | import org.nlpcn.commons.lang.util.IOUtil; 15 | 16 | /** JUnit test for front-word matching with Forest and GetWord. */ public class ForestTest { 17 | 18 | /** Adds ten branches to a Forest, runs getWord over a sample sentence and asserts the successive getFrontWords() results. */ @Test 19 | public void test() throws Exception { 20 | 21
| Forest f = new Forest(); 22 | 23 | f.addBranch("5", null); 24 | f.addBranch("2", null); 25 | f.addBranch("0", null); 26 | f.addBranch("12", null); 27 | f.addBranch("23", null); 28 | f.addBranch("abc12", null); 29 | f.addBranch("abc", null); 30 | f.addBranch("解放军", null); 31 | f.addBranch("解放", null); 32 | f.addBranch("解放军强渡长江", null); 33 | 34 | GetWord word = f.getWord("  5月20日,解放军强渡渭河123abc123"); 35 | 36 | Assert.assertEquals(word.getFrontWords(), "5"); 37 | Assert.assertEquals(word.getFrontWords(), "解放军"); 38 | //TODO: known bug - presumably "abc" should be the next front word (see the disabled assertion below), but the test currently asserts null instead. 39 | //Assert.assertEquals(word.getFrontWords(), "abc"); 40 | Assert.assertEquals(word.getFrontWords(), null); 41 | 42 | } 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/tire/splitWord/GetWordTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.tire.splitWord; 2 | 3 | import org.nlpcn.commons.lang.tire.GetWord; 4 | import org.nlpcn.commons.lang.tire.domain.Forest; 5 | import org.nlpcn.commons.lang.tire.library.Library; 6 | import org.nlpcn.commons.lang.util.StringUtil; 7 | 8 | import java.io.BufferedReader; 9 | import java.io.StringReader; 10 | 11 | /** 12 | * Manual driver (not a JUnit test) for Forest and GetWord using an in-memory dictionary. Created by ansj on 3/30/14. 13 | */ 14 | public class GetWordTest { 15 | public static void main(String[] args) throws Exception { 16 | /** 17 | * Dictionary construction: one word per line followed by its parameters; it can be read from a file or from a reader stream.
18 | */ 19 | long start = System.currentTimeMillis(); 20 | /* Tab-separated dictionary lines, lower-cased as a whole before parsing. */ String dic = "android\t10\t孙健\nc\t100\nC++\t10\nc++\t5\nc#\t100\nVC++\t100".toLowerCase(); 21 | System.out.println(dic); 22 | Forest forest = Library.makeForest(new BufferedReader(new StringReader(dic))); 23 | /** 24 | * Remove a word 25 | */ 26 | Library.removeWord(forest, "中国"); 27 | /** 28 | * Add a new word 29 | */ 30 | Library.insertWord(forest, "中国人"); 31 | String content = "Android--中国人"; 32 | content = StringUtil.rmHtmlTag(content); 33 | 34 | for (int i = 0; i < 1; i++) { 35 | GetWord udg = forest.getWord(content.toLowerCase().toCharArray()); 36 | 37 | String temp = null; 38 | while ((temp = udg.getFrontWords()) != null) { 39 | /* Prints the word, the first two entries of getParam(), then the offe field. NOTE(review): this indexes getParam()[0] and [1] for every match, including "中国人", which was inserted without explicit parameters - confirm that two entries are always present. */ System.out.println(temp + "\t\t" + udg.getParam()[0] + "\t\t" + udg.getParam()[1]); 40 | System.out.println(udg.offe); 41 | } 42 | } 43 | System.out.println(System.currentTimeMillis() - start); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/tire/splitWord/LibraryTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.tire.splitWord; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | import org.nlpcn.commons.lang.tire.domain.Forest; 6 | import org.nlpcn.commons.lang.tire.library.Library; 7 | 8 | 9 | /** JUnit test for building a Forest from a dictionary file. */ public class LibraryTest { 10 | 11 | /** Builds a Forest from src/test/resources/library.txt (charset "utf-8") and checks that a branch exists for "上访". NOTE(review): the path is relative, so it presumably relies on the module directory being the working directory - confirm. */ @Test 12 | public void test() throws Exception { 13 | Forest makeForest = Library.makeForest("src/test/resources/library.txt","utf-8") ; 14 | 15 | Assert.assertNotNull(makeForest.getBranch("上访")); 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/tire/splitWord/SmartGetWordTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.tire.splitWord; 2 | 3 | import org.junit.Test; 4 | import
org.nlpcn.commons.lang.tire.SmartGetWord; 5 | import org.nlpcn.commons.lang.tire.domain.SmartForest; 6 | import org.nlpcn.commons.lang.util.StringUtil; 7 | 8 | /** 9 | * Smoke test for SmartForest and SmartGetWord: it prints matches and makes no assertions. By ansj on 3/30/14. 10 | */ 11 | public class SmartGetWordTest { 12 | 13 | @Test 14 | public void test() { 15 | /** 16 | * Dictionary construction: one word per line followed by its parameters; it can be read from a file or from a reader stream. Here the entries are added directly in code. 17 | */ 18 | long start = System.currentTimeMillis(); 19 | SmartForest forest = new SmartForest(); 20 | 21 | forest.add("中国", 3); 22 | 23 | forest.add("android", 3); 24 | 25 | forest.add("java", 3); 26 | 27 | forest.add("中国人", 3); 28 | 29 | String content = " Android-java-中国人"; 30 | 31 | 32 | /* "中国人" is removed again before matching, while "中国" stays in the forest. */ forest.remove("中国人") ; 33 | 34 | content = StringUtil.rmHtmlTag(content); 35 | 36 | for (int i = 0; i < 1; i++) { 37 | SmartGetWord udg = forest.getWord(content.toLowerCase().toCharArray()); 38 | 39 | String temp; 40 | while ((temp = udg.getFrontWords()) != null) { 41 | System.out.println(temp + "\t" + udg.getParam()); 42 | } 43 | } 44 | System.out.println(System.currentTimeMillis() - start); 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/util/FileFinderTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | 7 | import java.io.File; 8 | 9 | /** JUnit test for FileFinder.findByFile. */ public class FileFinderTest { 10 | 11 | /** Expects findByFile to locate "FileFinder.java" under "./" when the third argument is 10 and to return null when it is 9. NOTE(review): the third argument is presumably a maximum search depth, which would tie this test to the working directory and the package nesting - confirm. */ @Test 12 | public void test() { 13 | 14 | File find = (FileFinder.findByFile(new File("./"), "FileFinder.java",10)); 15 | 16 | Assert.assertNotNull(find); 17 | 18 | find = (FileFinder.findByFile(new File("./"), "FileFinder.java",9)); 19 | 20 | Assert.assertNull(find); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/util/IOUtilTest.java: -------------------------------------------------------------------------------- 1 |
package org.nlpcn.commons.lang.util; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | /** JUnit test for the list write/read helpers of IOUtil. */ public class IOUtilTest { 12 | 13 | /** Round trip: writes the strings "0" to "99" to list.tmp with writeList, reads them back with readFile2List, compares both as arrays, then deletes list.tmp. NOTE(review): the delete is not reached when the assertion throws, so a failing run leaves list.tmp in the working directory. */ @Test 14 | public void testWriteList() throws IOException { 15 | List list = new ArrayList() ; 16 | for (int i = 0; i < 100; i++) { 17 | list.add(String.valueOf(i)) ; 18 | } 19 | 20 | IOUtil.writeList(list, "list.tmp", "utf-8"); 21 | 22 | 23 | List readFile2List = IOUtil.readFile2List("list.tmp", "utf-8") ; 24 | 25 | Assert.assertArrayEquals(list.toArray(), readFile2List.toArray()); 26 | 27 | new File("list.tmp").delete(); 28 | 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/util/StringUtilTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.Collections; 6 | import java.util.List; 7 | 8 | import org.junit.Assert; 9 | import org.junit.Test; 10 | 11 | /** JUnit tests and printed usage examples for StringUtil. */ public class StringUtilTest { 12 | 13 | /** Prints the results of isBlank, rmHtmlTag and makeSqlInString for sample inputs; makes no assertions. */ @Test 14 | public void test() { 15 | // In passing: how to check whether a string is blank 16 | System.out.println(StringUtil.isBlank(" \t")); // result --> true 17 | 18 | // HTML clean-up 19 | System.out.println(StringUtil.rmHtmlTag("hello ansjmy name is ")); // result 20 | // -->hello 21 | // ansj 22 | 23 | System.out.println(StringUtil.rmHtmlTag("hello ansj hello kk ")); 24 | 25 | // Convert a comma-separated string into the value list of a SQL IN clause 26 | System.out.println(StringUtil.makeSqlInString("ansj,2134,123,123,123")); 27 | // result --> 'ansj','2134','123','123','123' 28 | } 29 | 30 | /** Checks that joiner produces "1,2,3,4,5,6,7" both for an int array and for a List holding the same numbers. */ @Test 31 | public void joinerTest() { 32 | int[] ints = new int[] { 1, 2, 3, 4, 5, 6, 7 }; 33 | Assert.assertEquals(StringUtil.joiner(ints, ","), "1,2,3,4,5,6,7"); 34 | 35 | List list = new ArrayList(); 36 | 37 | for (int i : ints) { 38 |
list.add(i); 39 | } 40 | 41 | Assert.assertEquals(StringUtil.joiner(list, ","), "1,2,3,4,5,6,7"); 42 | } 43 | 44 | /** Checks that StringUtil.trim reduces a string made only of U+FEFF (char 65279), U+00A0 and U+3000 to length 0. */ @Test 45 | public void trimTest(){ 46 | String str = new String(new char[]{(char)65279,'\u00A0','\u3000'}) ; 47 | 48 | Assert.assertEquals(StringUtil.trim(str).length(), 0); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/util/WordAlertTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util; 2 | 3 | import org.junit.Test; 4 | 5 | 6 | /** 7 | * Smoke test for WordAlert.alertStr: prints the converted sample string and makes no assertions. 8 | * @author ansj 9 | * 10 | */ 11 | public class WordAlertTest { 12 | 13 | @Test 14 | public void test() { 15 | String str = "az AZ AZ az 09•" ; 16 | char[] result = WordAlert.alertStr(str) ; 17 | System.out.println(new String(result));//az az az az 09· 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/util/WordWeightTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map.Entry; 5 | 6 | import org.junit.Test; 7 | 8 | /** Smoke tests for WordWeight: every test prints an export and makes no assertions. */ public class WordWeightTest { 9 | 10 | /* Instance under test, created with the no-argument constructor; recyclingTest replaces it. */ private WordWeight ww = new WordWeight(); 11 | 12 | /** Adds "a" twice, "b" three times and "c" once, then prints export(). */ @Test 13 | public void exportTest() { 14 | ww.add("a"); 15 | ww.add("a"); 16 | ww.add("b"); 17 | ww.add("b"); 18 | ww.add("b"); 19 | ww.add("c"); 20 | System.out.println(ww.export()); 21 | } 22 | 23 | /** Adds the same words as exportTest, then prints exportIDF(). */ @Test 24 | public void exportIDFTest() { 25 | ww.add("a"); 26 | ww.add("a"); 27 | ww.add("b"); 28 | ww.add("b"); 29 | ww.add("b"); 30 | ww.add("c"); 31 | System.out.println(ww.exportIDF()); 32 | } 33 | 34 | /** Adds words through the two-argument add (second argument "t1" or "t2"), then prints each entry of exportChiSquare(). */ @Test 35 | public void exportChiSquareTest() { 36 | ww.add("a", "t1"); 37 | ww.add("a", "t2"); 38 | ww.add("b", "t1"); 39 | ww.add("b", "t1"); 40 | ww.add("b", "t2"); 41 | ww.add("c", "t2");
42 | HashMap> exportChiSquare = ww.exportChiSquare(); 43 | for (Entry> entry : exportChiSquare.entrySet()) { 44 | System.out.println(entry.getKey() + "\t" + entry.getValue().get()); 45 | } 46 | } 47 | 48 | /** Replaces ww with new WordWeight(5, 3), adds ten word/tag pairs and prints export(), exportIDF() and every entry of exportChiSquare(). NOTE(review): the meaning of the constructor arguments 5 and 3 is not visible here; judging by the test name they presumably bound how many entries are kept before recycling - confirm against WordWeight. */ @Test 49 | public void recyclingTest() { 50 | ww = new WordWeight(5, 3); 51 | ww.add("a", "t1"); 52 | ww.add("a", "t2"); 53 | ww.add("b", "t1"); 54 | ww.add("b", "t1"); 55 | ww.add("b", "t2"); 56 | ww.add("c", "t2"); 57 | ww.add("d", "t2"); 58 | ww.add("e", "t2"); 59 | ww.add("f", "t2"); 60 | ww.add("f", "t2"); 61 | System.out.println(ww.export()); 62 | System.out.println(ww.exportIDF()); 63 | HashMap> exportChiSquare = ww.exportChiSquare(); 64 | for (Entry> entry : exportChiSquare.entrySet()) { 65 | System.out.println(entry.getKey() + "\t" + entry.getValue().get()); 66 | } 67 | 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /pinyin-core/src/test/java/org/nlpcn/commons/lang/util/logging/NLPLoggerTest.java: -------------------------------------------------------------------------------- 1 | package org.nlpcn.commons.lang.util.logging; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | 6 | /** Manual check of the Log facade obtained from LogFactory. */ public class NLPLoggerTest { 7 | 8 | /** Logs one message at info, warn, error and debug, then warn, error and debug again with an attached Exception. Marked with Ignore and makes no assertions. */ @Test 9 | @Ignore 10 | public void test() { 11 | Log logger = LogFactory.getLog(NLPLoggerTest.class) ; 12 | logger.info("info hello nlpcn!"); 13 | logger.warn("warn hello nlpcn!"); 14 | logger.error("error hello nlpcn!"); 15 | logger.debug("debug hello nlpcn!"); 16 | 17 | 18 | logger.warn("warn hello nlpcn!",new Exception("ansj")); 19 | logger.error("error hello nlpcn!",new Exception("ansj")); 20 | logger.debug("debug hello nlpcn!",new Exception("ansj")); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /pinyin-core/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=DEBUG, stdout 2 |
log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n 5 | -------------------------------------------------------------------------------- /pinyin-core/src/test/resources/test_pinyin.dic: -------------------------------------------------------------------------------- 1 | 〇 ling2 2 | 一 yi1 3 | 丁 ding1 4 | 丂 kao3 5 | 七 qi1 6 | 丄 shang4 7 | 丅 xia4 8 | 丆 none0 9 | 万 wan4 10 | 丈 zhang4 11 | 三 san1 12 | 上 shang4 13 | 下 xia4 14 | 丌 ji1 15 | 不 bu4 16 | 与 yu3 17 | 丏 mian3 18 | 丐 gai4 19 | 丑 chou3 20 | 丒 chou3 21 | 专 zhuan1 22 | 且 qie3 23 | 丕 pi1 24 | 世 shi4 25 | 丗 shi4 26 | 丘 qiu1 27 | 丙 bing3 28 | 业 ye4 29 | 丛 cong2 30 | 东 dong1 31 | 丝 si1 32 | 丞 cheng2 33 | 丟 diu1 34 | 龙 long2 35 | 麝 she4 --------------------------------------------------------------------------------