├── .gitignore ├── config └── index.js ├── index.js ├── lib ├── hanlp-1.3.2 │ ├── gson-2.2.2.jar │ ├── hanlp-1.3.2.jar │ ├── hanlp-1.3.2-sources.jar │ └── src-java │ │ ├── node │ │ ├── VarArgs.class │ │ ├── CastingUtils.class │ │ ├── NodeJsException.class │ │ ├── NodeDynamicProxyClass.class │ │ ├── NodeJsException.java │ │ ├── CastingUtils.java │ │ ├── VarArgs.java │ │ └── NodeDynamicProxyClass.java │ │ └── hanLP.properties └── index.js ├── .dockerignore ├── Dockerfile ├── package.json ├── scripts └── build-docker-image.sh ├── app.js ├── examples ├── conversion.js ├── extract.js └── tokenizer.js ├── router.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | data 3 | -------------------------------------------------------------------------------- /config/index.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | host : "0.0.0.0", 3 | port : 3000 4 | } -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * hanlp toolkit 3 | */ 4 | 5 | module.exports = require("./lib/index"); -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/gson-2.2.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/hanlp-api/HEAD/lib/hanlp-1.3.2/gson-2.2.2.jar -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/hanlp-1.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/hanlp-api/HEAD/lib/hanlp-1.3.2/hanlp-1.3.2.jar -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | */sftp-config.json 2 | */*.sublime-* 3 | .* 4 | node_modules/ 5 | logs/* 6 | scripts/ 7 | .vscode 8 | */dist 9 | -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/hanlp-1.3.2-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/hanlp-api/HEAD/lib/hanlp-1.3.2/hanlp-1.3.2-sources.jar -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/VarArgs.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/hanlp-api/HEAD/lib/hanlp-1.3.2/src-java/node/VarArgs.class -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/CastingUtils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/hanlp-api/HEAD/lib/hanlp-1.3.2/src-java/node/CastingUtils.class -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/NodeJsException.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/hanlp-api/HEAD/lib/hanlp-1.3.2/src-java/node/NodeJsException.class -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/NodeDynamicProxyClass.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hailiang-wang/hanlp-api/HEAD/lib/hanlp-1.3.2/src-java/node/NodeDynamicProxyClass.class -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/NodeJsException.java: -------------------------------------------------------------------------------- 1 | package node; 2 | 3 | public class NodeJsException extends RuntimeException { 4 | public NodeJsException(String message) { 5 | super(message); 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:7.9.0 2 | MAINTAINER Hain Wang 3 | 4 | RUN apt-get update 5 | RUN apt-get install openjdk-7-jdk -yy 6 | 7 | RUN npm install -g cnpm --registry=https://registry.npm.taobao.org 8 | RUN /bin/bash -c "mkdir -p /hanlp-api" 9 | COPY . /hanlp-api 10 | WORKDIR /hanlp-api 11 | RUN cnpm install 12 | 13 | ENTRYPOINT ["node"] 14 | CMD ["app.js"] 15 | 16 | EXPOSE 3001 -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-hanlp", 3 | "version": "1.0.2", 4 | "description": "HanLP for nodejs", 5 | "main": "index.js", 6 | "dependencies": {}, 7 | "devDependencies": { 8 | "body-parser": "^1.17.1", 9 | "express": "^4.15.2", 10 | "java": "^0.8.0", 11 | "underscore": "^1.8.3" 12 | }, 13 | "scripts": { 14 | "test": "echo \"Error: no test specified\" && exit 1" 15 | }, 16 | "keywords": [ 17 | "hanlp", 18 | "node-hanlp" 19 | ], 20 | "author": "chanre", 21 | "license": "ISC" 22 | } 23 | -------------------------------------------------------------------------------- /scripts/build-docker-image.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ########################################### 3 | # Build Docker Image 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | # functions 9 | 10 | # main 11 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 12 | cd $baseDir/.. 13 | 14 | # Version key/value should be on his own line 15 | PACKAGE_VERSION=$(cat package.json \ 16 | | grep version \ 17 | | head -1 \ 18 | | awk -F: '{ print $2 }' \ 19 | | sed 's/[",]//g' | xargs) 20 | 21 | echo $PACKAGE_VERSION 22 | docker build --force-rm=true --tag samurais/hanlp-api:$PACKAGE_VERSION . 23 | -------------------------------------------------------------------------------- /app.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * @authors chandre (chandre21cn@gmail.com) 4 | * @date 2017-04-09 21:01:20 5 | * @version $Id$ 6 | */ 7 | 8 | const express = require('express'); 9 | const bodyParser = require('body-parser'); 10 | const app = express(); 11 | 12 | let router = require("./router"); 13 | 14 | // 收藏图标 15 | app.get("/favicon.ico", (req, res) => { 16 | res.send(""); 17 | }); 18 | 19 | app.use(bodyParser.json()); 20 | app.use(bodyParser.urlencoded({ extended: true })); 21 | app.use("/" , router ) 22 | 23 | let config = require("./config") 24 | // 启动HTTP服务 25 | let server = app.listen({ 26 | port : config.port || 3000, 27 | host : config.host || "0.0.0.0" 28 | }, () => { 29 | console.log('HanLP Service listening at http://%s:%s', server.address().address, server.address().port); 30 | }); -------------------------------------------------------------------------------- /examples/conversion.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 文本信息提取 3 | * @authors chandre (chandre21cn@gmail.com) 4 | * @date 2017-04-08 18:24:04 5 | * @version 1.0.0 6 | */ 7 | 8 | 9 | const Hanlp = require("../lib/index"); 10 | const HanLP = new Hanlp(); 11 | 12 | // [ConversionFont 简繁转换] 13 | console.log("\n============================= 繁体 =============================") 14 | var text = "用笔记本电脑写程序"; 15 | var txt = HanLP.ConversionFont( text , "ft" ); 16 | console.log( `${text} >>>> ${txt}` ) 17 | 18 | console.log("\n============================= 简体 =============================") 19 | var text = "「以後等妳當上皇后,就能買士多啤梨慶祝了」"; 20 | var txt = HanLP.ConversionFont( text ); 21 | console.log( `${text} >>>> ${txt}` ) 22 | 23 | 24 | // [Pinyin 拼音转换] 25 | console.log("\n============================= 拼音转换 =============================") 26 | var text = "用笔记本电脑写程序"; 27 | ['num','tone','outtone','shengmu','yunmu','head'].forEach(function(item){ 28 | var txt = HanLP.Pinyin( text , item ); 29 | console.log( `${item} >>>> ${txt}` ) 30 | }) 31 | -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/CastingUtils.java: -------------------------------------------------------------------------------- 1 | package node; 2 | 3 | import java.lang.reflect.Method; 4 | 5 | public class CastingUtils { 6 | public static void cast(Method method, Object[] args) throws Throwable { 7 | Class[] methodParameterTypes = method.getParameterTypes(); 8 | if (methodParameterTypes.length != args.length) { 9 | throw new Exception("Method argument length mismatch. Expecting " + methodParameterTypes.length + " found " + args.length); 10 | } 11 | for (int i = 0; i < methodParameterTypes.length; i++) { 12 | args[i] = cast(args[i], methodParameterTypes[i]); 13 | } 14 | } 15 | 16 | public static Object cast(Object o, Class t) { 17 | if (o == null) { 18 | return null; 19 | } 20 | 21 | Class oClass = o.getClass(); 22 | if (oClass == Integer.class) { 23 | Integer i = (Integer) o; 24 | if (t == Double.class) { 25 | return i.doubleValue(); 26 | } 27 | } else if (oClass == Double.class) { 28 | Double d = (Double) o; 29 | if (t == Integer.class) { 30 | return d.intValue(); 31 | } 32 | } 33 | 34 | return o; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/hanLP.properties: -------------------------------------------------------------------------------- 1 | #本配置文件中的路径的根目录,根目录+其他路径=绝对路径 2 | #Windows用户请注意,路径分隔符统一使用/ 3 | root=/node-hanlp 4 | #核心词典路径 5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt 6 | #2元语法词典路径 7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt 8 | #停用词词典路径 9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt 10 | #同义词词典路径 11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt 12 | #人名词典路径 13 | PersonDictionaryPath=data/dictionary/person/nr.txt 14 | #人名词典转移矩阵路径 15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt 16 | #繁简词典根目录 17 | tcDictionaryRoot=data/dictionary/tc 18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。 19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除。所有词典统一使用UTF-8编码。 20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf; 21 | #CRF分词模型路径 22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt 23 | #HMM分词模型 24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin 25 | #分词结果是否展示词性 26 | ShowTermNature=true 27 | #IO适配器,实现com.hankcs.hanlp.corpus.io.IIOAdapter接口以在不同的平台(Hadoop、Redis等)上运行HanLP 28 | #默认的IO适配器如下,该适配器是基于普通文件系统的。 29 | #IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter 30 | -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/VarArgs.java: -------------------------------------------------------------------------------- 1 | package node; 2 | 3 | import java.lang.reflect.Constructor; 4 | import java.lang.reflect.Method; 5 | import java.lang.reflect.Array; 6 | import java.util.Arrays; 7 | 8 | public class VarArgs { 9 | public static Object[] getVarArgs(Method method, Object[] args) { 10 | if (method.isVarArgs()) { 11 | Class[] methodParameterTypes = method.getParameterTypes(); 12 | return getVarArgs(args, methodParameterTypes); 13 | } 14 | return args; 15 | } 16 | 17 | public static Object[] getVarArgs(Constructor constructor, Object[] args) { 18 | if (constructor.isVarArgs()) { 19 | Class[] constructorParameterTypes = constructor.getParameterTypes(); 20 | return getVarArgs(args, constructorParameterTypes); 21 | } 22 | return args; 23 | } 24 | 25 | public static Object[] getVarArgs(Object[] args, Class[] methodParameterTypes) { 26 | if(args.length == methodParameterTypes.length 27 | && args[args.length - 1].getClass().equals(methodParameterTypes[methodParameterTypes.length - 1])) { 28 | return args; 29 | } 30 | 31 | Object[] newArgs = new Object[methodParameterTypes.length]; 32 | System.arraycopy(args, 0, newArgs, 0, methodParameterTypes.length - 1); 33 | Class varArgComponentType = methodParameterTypes[methodParameterTypes.length - 1].getComponentType(); 34 | int varArgLength = args.length - methodParameterTypes.length + 1; 35 | Object[] varArgsArray = (Object[])Array.newInstance(varArgComponentType, varArgLength); 36 | // System.out.println("varArgComponentType: " + varArgComponentType); 37 | // System.out.println("varArgsArray: " + Arrays.asList(varArgsArray).toString()); 38 | // System.out.println("args: " + Arrays.asList(args).toString()); 39 | System.arraycopy(args, methodParameterTypes.length - 1, varArgsArray, 0, varArgLength); 40 | newArgs[methodParameterTypes.length - 1] = varArgsArray; 41 | return newArgs; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /lib/hanlp-1.3.2/src-java/node/NodeDynamicProxyClass.java: -------------------------------------------------------------------------------- 1 | package node; 2 | 3 | import java.lang.reflect.Method; 4 | 5 | public class NodeDynamicProxyClass implements java.lang.reflect.InvocationHandler { 6 | private static final Method EQUALS; 7 | private static final Method HASHCODE; 8 | static { 9 | try { 10 | EQUALS = Object.class.getMethod("equals", Object.class); 11 | HASHCODE = Object.class.getMethod("hashCode"); 12 | } catch (NoSuchMethodException e) { 13 | throw new ExceptionInInitializerError(e); 14 | } 15 | } 16 | 17 | private native Object callJs(long ptr, java.lang.reflect.Method m, Object[] args) throws Throwable; 18 | private native void unref(long ptr) throws Throwable; 19 | public final long ptr; 20 | 21 | public NodeDynamicProxyClass(String path, long ptr) { 22 | try{ 23 | Runtime.getRuntime().load(path); 24 | }catch(Exception e){ 25 | System.out.println(e.toString()); 26 | } 27 | this.ptr = ptr; 28 | } 29 | 30 | @Override 31 | public Object invoke(Object proxy, java.lang.reflect.Method m, Object[] args) throws Throwable 32 | { 33 | try { 34 | Object result = callJs(this.ptr, m, args); 35 | //if(result == null) { 36 | // System.out.println("invoke: null"); 37 | //} else { 38 | // System.out.println("invoke: " + result + " class: " + result.getClass() + " to string: " + result.toString()); 39 | //} 40 | return result; 41 | } catch (NoSuchMethodError e) { 42 | // use 'vanilla' implementations otherwise - the object that persists between multiple invocations is 43 | // 'this', not the 'proxy' argument, so we operate on this. 44 | if (EQUALS.equals(m)) { 45 | // need to check if the arg is a Proxy, and if so, if its invocation handler == this! 46 | return args[0] == proxy; 47 | } else if (HASHCODE.equals(m)) { 48 | return System.identityHashCode(proxy); 49 | } else if ("unref".equals(m.getName()) && m.getParameterTypes().length == 0 && m.getReturnType() == Void.TYPE) { 50 | this.unref(); 51 | } 52 | throw e; 53 | } 54 | } 55 | 56 | public void unref() throws Throwable { 57 | unref(this.ptr); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /examples/extract.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 文本信息提取 3 | * @authors chandre (chandre21cn@gmail.com) 4 | * @date 2017-04-08 18:24:04 5 | * @version 1.0.0 6 | */ 7 | 8 | 9 | const Hanlp = require("../lib/index"); 10 | const HanLP = new Hanlp(); 11 | 12 | var text = [ 13 | '腾讯科技讯据外电报道美国电动车制造商特斯拉周日表示该公司第一季度电动车的交付总量同比增长69%,达到创纪录的2.5万辆,超出市场分析师此前的预期。', 14 | '特斯拉表示,该公司第一季度交付了1.345万辆轿车型Model S电动车,以及11,550辆SUV型Model X电动车,精确的交付量为2.5418万辆。特斯拉第一季度电动车交付量的提升,将有力的支撑公司此前制定的上半年交付5万辆电动车的目标。彭博社的统计数据显示,市场分析师此前平均预计,特斯拉第一季度电动车的交付总量为2.42万辆。', 15 | '投资公司Robert W. Baird & Co.分析师本·卡罗尔(Ben Kallo)就此表示,“市场目前把注意力都集中到了Model 3的量产问题,不过对特斯拉而言,第一季度电动车的交付量创出历史新高是一个非常好的消息。2.5万辆的交付量是我们能想到的最好数据,市场目前指望着Model 3推动特斯拉电动车销量的继续增长。”', 16 | '特斯拉此前曾表示,由于公司从去年10月底至12月初遇到制造方面的挑战,导致去年第四季度的电动车交付量未达市场预期。最终,大约2750辆电动车因为运输延迟或消费者未能实物提货,而未被计入到当季的交付量当中。特斯拉在周日的声明中还表示,大约有4650辆电动车在第一季度末转运给消费者,将被计入到第二季度的交付量当中。', 17 | '今年是特斯拉的关键一年,因为该公司首款量产、廉价版电动车Model 3将开始投产。受益于Model 3的市场前景和腾讯买入特斯拉5%股权的推动,特斯拉的股价在今年已累计上涨了30%。至本周五收盘时,特斯拉市值约为454亿美元,仅距年产数百万辆汽车的福特汽车相差8.7亿美元。', 18 | '作为美国最年轻的上市汽车制造商,特斯拉目前尚未向市场证明该公司有能力量产电动车。特斯拉首席执行官伊隆·马斯克(Elon Musk)曾表示,特斯拉明年要实现年产量50万辆的目标,不过这一雄心勃勃的计划将受制于该公司位于内华达州的超级电池工厂能否顺利投产。截至目前,特斯拉尚未给出今年全年的电动车出货量目标。', 19 | '特斯拉目前正在对产品线和价格结构进行调整,从而为今年7月量产Model 3进行准备。按照计划,特斯拉将从今年年底开始向美国市场销售Model 3。市场当前预计Model 3的起售价格将为3.5万美元左右。特斯拉在上月中旬决定停售售价为7.45万美元的低端版Model S轿车。4月16日之后,公司不再销售该款汽车。低端版Model S配备75kWh电池组,不过特斯拉用软件锁定电池,只提供60kWh的续航能力。特斯拉用户如果购买该汽车,可以通过升级软件解锁电池,让75kWh完全发挥作用,不过软件需要花钱购买。为什么停售该款汽车?原因是大多数客户最终会选择升级到75kWh,特斯拉想精简产品线。', 20 | '特斯拉目前公布的数据只是初步数据,可能会在该公司今年5月发布财报时出现略微变化。特斯拉会每个季度发布全球销售数据,而不是像传统汽车制造商那样每月发布一次汽车销售数据。交付数据只包括已转交给消费者、且所有文书工作都是正确的电动车。' 21 | ]; 22 | 23 | // [Keyword 关键词提取] 24 | console.log("\n============================= 关键词提取 =============================") 25 | var words = HanLP.Keyword( text.join("") , 3 ); 26 | console.log(words) 27 | 28 | // [Phrase 短语提取] 29 | console.log("\n============================= 短语提取 =============================") 30 | var words = HanLP.Phrase( text.join("") , 2 ); 31 | console.log(words) 32 | 33 | // [Summary 提取文章摘要] 34 | console.log("\n============================= 提取文章摘要 =============================") 35 | var words = HanLP.Summary( text.join("") , 3 ); 36 | console.log(words) 37 | 38 | 39 | // [ Suggester 文本推荐(句子级别,从一系列句子中挑出与输入句子最相似的那一个)] 40 | console.log("\n============================= 文本推荐 =============================") 41 | var words = HanLP.Suggester( text , ["价格","业绩"] , 1 ); 42 | console.log(words) -------------------------------------------------------------------------------- /examples/tokenizer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 分词 3 | * @authors chandre (chandre21cn@gmail.com) 4 | * @date 2017-04-08 18:24:04 5 | * @version 1.0.0 6 | */ 7 | 8 | const Hanlp = require("../lib/index"); 9 | const HanLP = new Hanlp(); 10 | 11 | var text = [ 12 | '腾讯科技讯据外电报道美国电动车制造商特斯拉周日表示该公司第一季度电动车的交付总量同比增长69%,达到创纪录的2.5万辆,超出市场分析师此前的预期。', 13 | '特斯拉表示,该公司第一季度交付了1.345万辆轿车型Model S电动车,以及11,550辆SUV型Model X电动车,精确的交付量为2.5418万辆。特斯拉第一季度电动车交付量的提升,将有力的支撑公司此前制定的上半年交付5万辆电动车的目标。彭博社的统计数据显示,市场分析师此前平均预计,特斯拉第一季度电动车的交付总量为2.42万辆。', 14 | '投资公司Robert W. Baird & Co.分析师本·卡罗尔(Ben Kallo)就此表示,“市场目前把注意力都集中到了Model 3的量产问题,不过对特斯拉而言,第一季度电动车的交付量创出历史新高是一个非常好的消息。2.5万辆的交付量是我们能想到的最好数据,市场目前指望着Model 3推动特斯拉电动车销量的继续增长。”', 15 | '特斯拉此前曾表示,由于公司从去年10月底至12月初遇到制造方面的挑战,导致去年第四季度的电动车交付量未达市场预期。最终,大约2750辆电动车因为运输延迟或消费者未能实物提货,而未被计入到当季的交付量当中。特斯拉在周日的声明中还表示,大约有4650辆电动车在第一季度末转运给消费者,将被计入到第二季度的交付量当中。', 16 | '今年是特斯拉的关键一年,因为该公司首款量产、廉价版电动车Model 3将开始投产。受益于Model 3的市场前景和腾讯买入特斯拉5%股权的推动,特斯拉的股价在今年已累计上涨了30%。至本周五收盘时,特斯拉市值约为454亿美元,仅距年产数百万辆汽车的福特汽车相差8.7亿美元。', 17 | '作为美国最年轻的上市汽车制造商,特斯拉目前尚未向市场证明该公司有能力量产电动车。特斯拉首席执行官伊隆·马斯克(Elon Musk)曾表示,特斯拉明年要实现年产量50万辆的目标,不过这一雄心勃勃的计划将受制于该公司位于内华达州的超级电池工厂能否顺利投产。截至目前,特斯拉尚未给出今年全年的电动车出货量目标。', 18 | '特斯拉目前正在对产品线和价格结构进行调整,从而为今年7月量产Model 3进行准备。按照计划,特斯拉将从今年年底开始向美国市场销售Model 3。市场当前预计Model 3的起售价格将为3.5万美元左右。特斯拉在上月中旬决定停售售价为7.45万美元的低端版Model S轿车。4月16日之后,公司不再销售该款汽车。低端版Model S配备75kWh电池组,不过特斯拉用软件锁定电池,只提供60kWh的续航能力。特斯拉用户如果购买该汽车,可以通过升级软件解锁电池,让75kWh完全发挥作用,不过软件需要花钱购买。为什么停售该款汽车?原因是大多数客户最终会选择升级到75kWh,特斯拉想精简产品线。', 19 | '特斯拉目前公布的数据只是初步数据,可能会在该公司今年5月发布财报时出现略微变化。特斯拉会每个季度发布全球销售数据,而不是像传统汽车制造商那样每月发布一次汽车销售数据。交付数据只包括已转交给消费者、且所有文书工作都是正确的电动车。' 20 | ]; 21 | 22 | // [Tokenizer 标准分词] 23 | console.log("\n============================= 标准分词 =============================") 24 | var words = HanLP.Tokenizer("商品和服务"); 25 | console.log(words) 26 | 27 | // [NLPTokenizer NLP分词] 28 | console.log("\n============================= NLP分词 =============================") 29 | var words = HanLP.NLPTokenizer("中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程"); 30 | console.log(words) 31 | 32 | // [IndexTokenizer 索引分词] 33 | console.log("\n============================= 索引分词 =============================") 34 | var words = HanLP.IndexTokenizer("主副食品"); 35 | console.log(words) 36 | 37 | 38 | // [ShortSegment 最短路分词] 39 | console.log("\n============================= 最短路分词 =============================") 40 | var words = HanLP.ShortSegment( "今天,刘志军案的关键人物,山西女商人丁书苗在市二中院出庭受审。" ); 41 | console.log(words) 42 | 43 | // [NShortSegment N-最短分词] 44 | console.log("\n============================= N-最短分词 =============================") 45 | var words = HanLP.NShortSegment( "刘喜杰石国祥会见吴亚琴先进事迹报告团成员" ); 46 | console.log(words) 47 | 48 | // [SpeedTokenizer 极速词典分词] 49 | console.log("\n============================= 极速词典分词 =============================") 50 | var words = HanLP.SpeedTokenizer( "江西鄱阳湖干枯,中国最大淡水湖变成大草原" ); 51 | console.log(words) 52 | 53 | 54 | // [CRFTokenizer CRF分词] 55 | console.log("\n============================= CRF分词 =============================") 56 | text.forEach( ( item ) => { 57 | var words = HanLP.CRFTokenizer( item ); 58 | console.log(words) 59 | }) 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /router.js: -------------------------------------------------------------------------------- 1 | const _ = require("underscore"); 2 | const express = require('express'); 3 | const Hanlp = require("./lib/index"); 4 | 5 | const router = express.Router(); 6 | const HanLP = new Hanlp(); 7 | 8 | 9 | 10 | // 路由中间件 11 | router.use(function(req, res, next) { 12 | if (req.method==="GET") return next(); 13 | let param = req.body; 14 | if ( !_.has( param , "content" ) || _.isEmpty( param["content"] ) ) { 15 | return res.status(500).send({ 16 | status : "error", 17 | msg : "请求失败!" 18 | }) 19 | }; 20 | next(); 21 | }); 22 | 23 | // 分词 24 | router.post("/tokenizer", ( req , res ) => { 25 | let param = req.body, 26 | words = null, 27 | type = _.isEmpty(param.type) ? "standard" : param.type; 28 | 29 | type = type.toLowerCase(); 30 | switch (type) { 31 | case "crf" : //CRF分词 32 | words = HanLP.CRFTokenizer( param.content ); 33 | break; 34 | case "nostopword" : //去除停用词分词 35 | words = HanLP.NoStopWord( param.content ); 36 | break; 37 | case "nlp" : //NLP分词 38 | words = HanLP.NLPTokenizer( param.content ); 39 | break; 40 | case "index" : //索引分词 41 | words = HanLP.IndexTokenizer( param.content ); 42 | break; 43 | case "short" : //最短路分词 44 | words = HanLP.ShortSegment( param.content ); 45 | break; 46 | case "nshort" : //N-最短分词 47 | words = HanLP.NShortSegment( param.content ); 48 | break; 49 | case "speed" : //极速词典分词 50 | words = HanLP.SpeedTokenizer( param.content ); 51 | break; 52 | case "standard" : //标准分词 53 | default: 54 | words = HanLP.Tokenizer( param.content ); 55 | } 56 | res.send({ 57 | status : "success", 58 | data : words 59 | }); 60 | }) 61 | 62 | // 关键词 63 | router.post("/keyword", ( req , res ) => { 64 | let param = req.body, 65 | num = _.isEmpty(param.num) || _.isNaN( parseInt(param.num) ) ? 3 : parseInt(param.num), 66 | words = HanLP.Keyword( param.content , num ); 67 | res.send({ 68 | status : "success", 69 | data : words 70 | }); 71 | }) 72 | 73 | // 摘要 74 | router.post("/summary", ( req , res ) => { 75 | let param = req.body, 76 | num = _.isEmpty(param.num) || _.isNaN( parseInt(param.num) ) ? 3 : parseInt(param.num), 77 | words = HanLP.Summary( param.content , num ); 78 | res.send({ 79 | status : "success", 80 | data : words 81 | }); 82 | }) 83 | 84 | // 短语提取 85 | router.post("/phrase", ( req , res ) => { 86 | let param = req.body, 87 | num = _.isEmpty(param.num) || _.isNaN( parseInt(param.num) ) ? 3 : parseInt(param.num), 88 | words = HanLP.Phrase( param.content , num ); 89 | res.send({ 90 | status : "success", 91 | data : words 92 | }); 93 | }); 94 | 95 | // 关键词、摘要 96 | router.post("/query", ( req , res ) => { 97 | let param = req.body; 98 | let num = _.isEmpty(param.num) || _.isNaN( parseInt(param.num) ) ? 3 : parseInt(param.num) ; 99 | res.send({ 100 | status : "success", 101 | data : { 102 | keyword : HanLP.Keyword( param.content , num ), 103 | summary : HanLP.Summary( param.content , num ), 104 | } 105 | }) 106 | }) 107 | 108 | // 简、繁、拼音转换 109 | router.post("/conversion" , (req , res ) => { 110 | let param = req.body, 111 | data = null, 112 | type = _.isEmpty(param.type) ? "py" : param.type; 113 | type = type.toLowerCase(); 114 | switch (type) { 115 | case "ft" : //繁体 116 | data = HanLP.ConversionFont( param.content , "ft" ); 117 | break; 118 | case "jt" : //简体 119 | data = HanLP.ConversionFont( param.content , "jt" ); 120 | break; 121 | case "py" : //拼音转换 122 | default: 123 | data = HanLP.Pinyin( param.content , "outtone" ); 124 | } 125 | res.send({ 126 | status : "success", 127 | data : data 128 | }) 129 | }) 130 | 131 | module.exports = router; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Docker Pulls](https://img.shields.io/docker/pulls/samurais/hanlp-api.svg?maxAge=2592000)](https://hub.docker.com/r/samurais/hanlp-api/) [![Docker Stars](https://img.shields.io/docker/stars/samurais/hanlp-api.svg?maxAge=2592000)](https://hub.docker.com/r/samurais/hanlp-api/) [![Docker Layers](https://images.microbadger.com/badges/image/samurais/hanlp-api.svg)](https://microbadger.com/#/images/samurais/hanlp-api) [![](https://images.microbadger.com/badges/version/samurais/hanlp-api.svg)](https://microbadger.com/images/samurais/hanlp-api "Get your own version badge on microbadger.com") 2 | 3 | HanLP 自然语言处理 for nodejs 4 | ===== 5 | * 支持中文分词(N-最短路分词、CRF分词、索引分词、用户自定义词典、词性标注),命名实体识别(中国人名、音译人名、日本人名、地名、实体机构名识别),关键词提取,自动摘要,短语提取,拼音转换,简繁转换,文本推荐,依存句法分析(MaxEnt依存句法分析、CRF依存句法分析) 6 | * 官方文档:http://www.hankcs.com/nlp/hanlp.html 7 | 8 | ### 环境要求 9 | java 1.8 10 | nodejs >= 6 11 | 12 | ### docker 13 | 14 | * build image 15 | ``` 16 | cd node-hanlp 17 | ./scripts/build-docker-image.sh 18 | ``` 19 | 20 | Or pull image 21 | ``` 22 | docker pull samurais/hanlp-api:1.0.0 23 | ``` 24 | 25 | * start container 26 | ``` 27 | docker run -it --rm -p 3002:3000 samurais/hanlp-api:1.0.0 28 | ``` 29 | 30 | * access service 31 | 32 | ``` 33 | POST /tokenizer HTTP/1.1 34 | Host: localhost:3002 35 | Content-Type: application/json 36 | 37 | { 38 | "type": "nlp", 39 | "content": "刘德华和张学友创作了很多流行歌曲" 40 | } 41 | 42 | RESPONSE 43 | { 44 | "status": "success", 45 | "data": [ 46 | { 47 | "word": "刘德华", 48 | "nature": "nr", 49 | "offset": 0 50 | }, 51 | { 52 | "word": "和", 53 | "nature": "cc", 54 | "offset": 0 55 | }, 56 | { 57 | "word": "张学友", 58 | "nature": "nr", 59 | "offset": 0 60 | }, 61 | { 62 | "word": "创作", 63 | "nature": "v", 64 | "offset": 0 65 | }, 66 | { 67 | "word": "了", 68 | "nature": "ule", 69 | "offset": 0 70 | }, 71 | { 72 | "word": "很多", 73 | "nature": "m", 74 | "offset": 0 75 | }, 76 | { 77 | "word": "流行歌曲", 78 | "nature": "n", 79 | "offset": 0 80 | } 81 | ] 82 | } 83 | ``` 84 | 85 | * Other APIs 86 | 87 | - tokenizer 分词 88 | - keyword 关键词 89 | - summary 摘要 90 | - phrase 短语提取 91 | - query 关键词、摘要 92 | - conversion 简、繁、拼音转换 93 | 94 | [源码](/router.js) 95 | 96 | ### node module 97 | 98 | * Install 99 | 100 | ``` 101 | npm install node-hanlp 102 | ``` 103 | 104 | * Config 105 | - 配置文件路径 node_modules/node-hanlp/lib/src-java/hanLP.proerties 106 | - **请修改root为您的目录路径** 107 | 108 | - 词典文件目录 ./data 109 | - 请下载词典 https://pan.baidu.com/s/1pKUVNYF 放入 ./data (约800MB文件) 目录下 110 | 111 | * Usage 112 | 113 | ```js 114 | const Hanlp = require("node-hanlp"); 115 | //分词库初始化及配置 116 | const HanLP = new Hanlp({ 117 | CustomDict : true, //使用自定义词典 118 | NameRecognize : true, //中国人名识别 119 | TranslatedNameRecognize : true , //音译人名识别 120 | JapaneseNameRecognize : true, //日本人名识别 121 | PlaceRecognize : true , //地名识别 122 | OrgRecognize : true //机构名识别 123 | }); 124 | let words = HanLP.Tokenizer("商品和服务"); 125 | ``` 126 | 127 | ### 标准分词 HanLP.Tokenizer( text ) 128 | @param String text [文本] 129 | @ruten Object 130 | ```js 131 | let words = HanLP.Tokenizer("商品和服务"); 132 | 133 | [ 134 | { word: '商品', nature: 'n', offset: 0 }, 135 | { word: '和', nature: 'cc', offset: 0 }, 136 | { word: '服务', nature: 'vn', offset: 0 } 137 | ] 138 | ``` 139 | ### NLP分词 HanLP.NLPTokenizer( text ) 140 | @param String text [文本] 141 | @ruten Object 142 | ```js 143 | let words = HanLP.NLPTokenizer("中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程"); 144 | 145 | [ 146 | { word: '中国科学院计算技术研究所', nature: 'nt', offset: 0 }, 147 | { word: '的', nature: 'ude1', offset: 0 }, 148 | { word: '宗成庆', nature: 'nr', offset: 0 }, 149 | { word: '教授', nature: 'nnt', offset: 0 }, 150 | ... 151 | ] 152 | ``` 153 | ### 索引分词 HanLP.IndexTokenizer( text ) 154 | @param String text [文本] 155 | @ruten Object 156 | ```js 157 | let words = HanLP.IndexTokenizer("主副食品"); 158 | 159 | [ 160 | { word: '主副食品', nature: 'n', offset: 0 }, 161 | { word: '主副食', nature: 'j', offset: 0 }, 162 | { word: '副食', nature: 'n', offset: 1 }, 163 | { word: '副食品', nature: 'n', offset: 1 }, 164 | { word: '食品', nature: 'n', offset: 2 } 165 | ] 166 | ``` 167 | ### CRF分词 HanLP.CRFTokenizer( text ) 168 | @param String text [文本] 169 | @ruten Object 170 | ```js 171 | let words = HanLP.CRFTokenizer("你好,欢迎使用HanLP汉语处理包!"); 172 | 173 | [ 174 | { word: '你好', nature: 'vl', offset: 0 }, 175 | { word: ',', nature: 'w', offset: 0 }, 176 | { word: '欢迎', nature: 'v', offset: 0 }, 177 | { word: '使用', nature: 'v', offset: 0 }, 178 | { word: 'HanLP', nature: 'nz', offset: 0 }, 179 | { word: '汉语', nature: 'gi', offset: 0 }, 180 | ... 181 | ] 182 | ``` 183 | ### 去除停用词分词 HanLP.NoStopWord( text ) 184 | @param String text [文本] 185 | @ruten Object 186 | ```js 187 | let words = HanLP.NoStopWord("你好,欢迎使用HanLP汉语处理包!"); 188 | 189 | [ 190 | { word: '你好', nature: 'vl', offset: 0 }, 191 | { word: '欢迎', nature: 'v', offset: 0 }, 192 | { word: '使用', nature: 'v', offset: 0 }, 193 | { word: 'HanLP', nature: 'nz', offset: 0 }, 194 | { word: '汉语', nature: 'gi', offset: 0 }, 195 | ... 196 | ] 197 | ``` 198 | 199 | ### 最短路分词 HanLP.ShortSegment( text ) 200 | @param String text [文本] 201 | @ruten Object 202 | ```js 203 | let words = HanLP.ShortSegment("今天,刘志军案的关键人物,山西女商人丁书苗在市二中院出庭受审。"); 204 | 205 | [ 206 | { word: '今天', nature: 't', offset: 0 }, 207 | { word: ',', nature: 'w', offset: 0 }, 208 | { word: '刘志军', nature: 'nr', offset: 0 }, 209 | { word: '案', nature: 'ng', offset: 0 }, 210 | { word: '的', nature: 'ude1', offset: 0 }, 211 | { word: '关键', nature: 'n', offset: 0 }, 212 | ... 213 | ] 214 | ``` 215 | ### N-最短分词 HanLP.NShortSegment( text ) 216 | @param String text [文本] 217 | @ruten Object 218 | ```js 219 | let words = HanLP.NShortSegment("刘喜杰石国祥会见吴亚琴先进事迹报告团成员"); 220 | 221 | [ 222 | { word: '刘喜杰', nature: 'nr', offset: 0 }, 223 | { word: '石国祥', nature: 'nr', offset: 0 }, 224 | { word: '会见', nature: 'v', offset: 0 }, 225 | { word: '吴亚琴', nature: 'nr', offset: 0 }, 226 | { word: '先进', nature: 'a', offset: 0 }, 227 | ... 228 | ] 229 | ``` 230 | ### 极速词典分词 HanLP.SpeedTokenizer( text ) 231 | @param String text [文本] 232 | @ruten Object 233 | ```js 234 | let words = HanLP.SpeedTokenizer("江西鄱阳湖干枯,中国最大淡水湖变成大草原"); 235 | 236 | [ 237 | { word: '江西', offset: 0 }, 238 | { word: '鄱阳湖', offset: 2 }, 239 | { word: '干枯', offset: 5 }, 240 | { word: ',', offset: 7 }, 241 | { word: '中国', offset: 8 }, 242 | ] 243 | ``` 244 | ### 关键词提取 HanLP.Keyword( text , nTop ) 245 | @param String text [文本] 246 | @param Number nTop [关键词个数,默认5个] 247 | @ruten Object 248 | ```js 249 | let words = HanLP.Keyword("江西鄱阳湖干枯,中国最大淡水湖变成大草原" , 3); 250 | 251 | [ '中国', '最大', '淡水湖' ] 252 | ``` 253 | 254 | ### 短语提取 HanLP.Phrase( text , nTop ) 255 | @param String text [文本] 256 | @param Number nTop [短语个数,默认3个] 257 | @ruten Object 258 | ```js 259 | let words = HanLP.Phrase("江西鄱阳湖干枯,中国最大淡水湖变成大草原" , 2 ); 260 | 261 | [ '中国最大', '变成草原' ] 262 | ``` 263 | ### 提取文章摘要 HanLP.Summary( text , nTop ) 264 | @param String text [文本] 265 | @param Number nTop [文章摘要条数,默认3条] 266 | @ruten Object 267 | ```js 268 | let text = "据美国福克斯新闻报道,俄罗斯黑海舰队一艘护卫舰格里戈罗维奇海军上将号,正在驶向美国军舰发射导弹攻击叙利亚的区域。该护卫舰是俄罗斯最先进的护卫舰,2016年才刚服役,除防空、反舰导弹外,也可以发射巡航导弹。格里戈罗维奇海军上将号原定于本周访问叙利亚的塔尔图斯港。" 269 | 270 | let words = HanLP.Summary( text , 3); 271 | 272 | [ 273 | '俄罗斯黑海舰队一艘护卫舰格里戈罗维奇海军上将号', 274 | '格里戈罗维奇海军上将号原定于本周访问叙利亚的塔尔图斯港', 275 | '正在驶向美国军舰发射导弹攻击叙利亚的区域' 276 | ] 277 | ``` 278 | ### 文本推荐 HanLP.Suggester( list, words, Ntop ) 279 | @param Array list 句子列表 280 | @param Array words 词语 281 | @param Number nTop 相似句子推荐个数,默认1个 282 | @ruten Object 283 | 284 | 句子级别,从一系列句子中挑出与输入句子最相似的那一个 285 | 286 | ### 语义距离 HanLP.WordDistance( words ) 287 | @param Array words 词 288 | @ruten Object 289 | 290 | ### 简繁转换 HanLP.ConversionFont( text , type ) 291 | @param String text 文本 292 | @ruten String type 类型 jt简体|ft繁体,默认jt 293 | @ruten String 294 | 295 | ### 拼音转换 HanLP.Pinyin( text , type ) 296 | @param String text 文本 297 | @ruten String type 类型 类型 num数字音调|tone符号音调|outtone无音调|shengmu声母|yunmu韵母|head输入法头,默认outtone 298 | @ruten Object 299 | 300 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * HanLP node版汉语言处理包 3 | * @authors chandre (chandre21cn@gmail.com) 4 | * @date 2017-04-08 18:24:04 5 | * @version 1.0.0 6 | */ 7 | 8 | const java = require("java"); 9 | const _ = require("underscore"); 10 | const path = require("path"); 11 | 12 | const BaseDir = path.resolve( __dirname , "./hanlp-1.3.2" ); 13 | java.options = ['-Xms1024m','-Xmx1024m']; 14 | java.classpath[1] = BaseDir + "/src-java"; 15 | java.asyncOptions = { 16 | asyncSuffix: undefined, 17 | syncSuffix: "Sync", 18 | ifReadOnlySuffix: "_alt", 19 | }; 20 | 21 | java.classpath.push( BaseDir + "/hanlp-1.3.2.jar" ); 22 | java.classpath.push( BaseDir + "/gson-2.2.2.jar" ); 23 | 24 | const HanLP = java.import('com.hankcs.hanlp.HanLP'); 25 | const Gson = java.newInstanceSync('com.google.gson.Gson'); 26 | 27 | module.exports = class HANLP { 28 | 29 | constructor( args ) { 30 | 31 | let opts = this.options = _.extend({ 32 | CustomDict : true, //使用自定义词典 33 | NameRecognize : true, //中国人名识别 34 | TranslatedNameRecognize : true , //音译人名识别 35 | JapaneseNameRecognize : true, //日本人名识别 36 | PlaceRecognize : true , //地名识别 37 | OrgRecognize : true //机构名识别 38 | }, args ); 39 | 40 | let Tokenizer = java.import('com.hankcs.hanlp.tokenizer.StandardTokenizer'); 41 | 42 | // 识别设置 43 | Tokenizer.SEGMENT.enableCustomDictionarySync( opts.CustomDict ) //使用自定义词典 44 | .enableNameRecognizeSync(opts.NameRecognize) //中国人名识别 45 | .enableTranslatedNameRecognizeSync(opts.TranslatedNameRecognize) //音译人名识别 46 | .enableJapaneseNameRecognizeSync(opts.JapaneseNameRecognize) //日本人名识别 47 | .enablePlaceRecognizeSync( opts.PlaceRecognize ) //地名识别 48 | .enableOrganizationRecognizeSync( opts.OrgRecognize ); //机构名识别 49 | } 50 | 51 | /** 52 | * [Tokenizer 标准分词] 53 | * @param {[String]} text [文本] 54 | * @return Object 55 | */ 56 | Tokenizer ( text ) { 57 | let StandardTokenizer = java.import('com.hankcs.hanlp.tokenizer.StandardTokenizer'); 58 | let words = StandardTokenizer.segmentSync( text ); 59 | return JSON.parse( Gson.toJsonSync(words) ); 60 | } 61 | 62 | /** 63 | * [NLPTokenizer NLP分词] 64 | * @param {[String]} text [文本] 65 | * @return Object 66 | */ 67 | NLPTokenizer ( text ) { 68 | let NLPTokenizer = java.import('com.hankcs.hanlp.tokenizer.NLPTokenizer'); 69 | let words = NLPTokenizer.segmentSync( text ); 70 | return JSON.parse( Gson.toJsonSync(words) ); 71 | } 72 | 73 | /** 74 | * [IndexTokenizer 索引分词] 75 | * @param {[String]} text [文本] 76 | * @return Object 77 | */ 78 | IndexTokenizer ( text ) { 79 | let IndexTokenizer = java.import('com.hankcs.hanlp.tokenizer.IndexTokenizer'); 80 | let words = IndexTokenizer.segmentSync( text ); 81 | return JSON.parse( Gson.toJsonSync(words) ); 82 | } 83 | 84 | 85 | /** 86 | * [CRFTokenizer CRF分词] 87 | * @param {[String]} text [文本] 88 | * @return Object 89 | */ 90 | CRFTokenizer ( text ) { 91 | let CRFSegment = java.newInstanceSync('com.hankcs.hanlp.seg.CRF.CRFSegment'); 92 | let words = CRFSegment.segSync( text ); 93 | return JSON.parse( Gson.toJsonSync(words) ); 94 | } 95 | 96 | /** 97 | * [ShortSegment 最短路分词] 98 | * @param {[String]} text [文本] 99 | * @return Object 100 | */ 101 | ShortSegment ( text ) { 102 | let opts = this.options, 103 | ShortSegment = java.newInstanceSync('com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment'); 104 | 105 | let words = ShortSegment.segSync( text ); 106 | return JSON.parse( Gson.toJsonSync(words) ); 107 | } 108 | 109 | /** 110 | * [NShortSegment N-最短分词] 111 | * @param {[String]} text [文本] 112 | * @return Object 113 | */ 114 | NShortSegment ( text ) { 115 | let NShortSegment = java.newInstanceSync('com.hankcs.hanlp.seg.NShort.NShortSegment'); 116 | let words = NShortSegment.segSync( text ); 117 | return JSON.parse( Gson.toJsonSync(words) ); 118 | } 119 | 120 | /** 121 | * [SpeedTokenizer 极速词典分词] 122 | * @param {[String]} text [文本] 123 | * @return Object 124 | */ 125 | SpeedTokenizer ( text ) { 126 | let SpeedTokenizer = java.import('com.hankcs.hanlp.tokenizer.SpeedTokenizer'); 127 | let words = SpeedTokenizer.segmentSync( text ); 128 | return JSON.parse( Gson.toJsonSync(words) ); 129 | } 130 | 131 | /** 132 | * [NoStopWord 去除停用词分词] 133 | * @param {[String]} text [文本] 134 | * @return Object 135 | */ 136 | NoStopWord ( text ) { 137 | let opts = this.options, 138 | Tokenizer = java.import('com.hankcs.hanlp.tokenizer.StandardTokenizer'), 139 | StopWordDict = java.import('com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary'); 140 | let words = Tokenizer.segmentSync( text ); 141 | StopWordDict.applySync( words ); 142 | return JSON.parse( Gson.toJsonSync( words ) ); 143 | } 144 | 145 | /** 146 | * [Keyword 关键词提取] 147 | * @param {[String]} text [文本] 148 | * @param {[Number]} Ntop [关键词个数,默认5个] 149 | * @return Object 150 | */ 151 | Keyword ( text , Ntop ) { 152 | let words = HanLP.extractKeywordSync( text , Ntop || 5 ); 153 | return JSON.parse( Gson.toJsonSync(words) ); 154 | } 155 | 156 | /** 157 | * [Phrase 短语提取] 158 | * @param {[String]} text [文本] 159 | * @param {[Number]} Ntop [短语个数,默认3个] 160 | * @return Object 161 | */ 162 | Phrase ( text , Ntop ) { 163 | let words = HanLP.extractPhraseSync( text , Ntop || 3 ); 164 | return JSON.parse( Gson.toJsonSync(words) ); 165 | } 166 | 167 | /** 168 | * [Summary 提取文章摘要] 169 | * @param {[String]} text [文本] 170 | * @param {[Number]} Ntop [文章摘要个数,默认3个] 171 | * @return Object 172 | */ 173 | Summary ( text , Ntop ) { 174 | let words = HanLP.extractSummarySync( text , Ntop || 3 ); 175 | return JSON.parse( Gson.toJsonSync(words) ); 176 | } 177 | 178 | /** 179 | * [ Suggester 文本推荐(句子级别,从一系列句子中挑出与输入句子最相似的那一个)] 180 | * @param {[Array]} list [句子列表] 181 | * @param {[Array]} words [词语] 182 | * @param {[Number]} Ntop [相似句子个数,默认1个] 183 | * @return Object 184 | */ 185 | Suggester ( list, words, Ntop ) { 186 | let Suggester = java.newInstanceSync('com.hankcs.hanlp.suggest.Suggester'); 187 | list = _.isArray( list ) ? list : new Array( list ); 188 | words = _.isArray( words ) ? words : new Array( words ); 189 | 190 | list.forEach( ( item , i ) => { 191 | Suggester.addSentenceSync( item ); 192 | }); 193 | 194 | let data = []; 195 | words.forEach( (item , i)=> { 196 | data[i] = { 197 | word : item, 198 | value : JSON.parse( Gson.toJsonSync( Suggester.suggestSync( item , Ntop || 1 ) ) ), 199 | }; 200 | }); 201 | return data; 202 | } 203 | 204 | /** 205 | * [WordDistance 语义距离] 206 | * @param {[Array]} text [词] 207 | * @return Object 208 | */ 209 | WordDistance ( words ) { 210 | words = _.isArray( words ) ? words : new Array( words ); 211 | let data = [], 212 | SynonymDictionary = java.newInstanceSync('com.hankcs.hanlp.dictionary.CoreSynonymDictionary'); 213 | 214 | words.forEach( ( wordA , i ) => { 215 | let tmp = []; 216 | words.forEach( ( wordB , index ) => { 217 | tmp[index] = JSON.parse( Gson.toJsonSync( SynonymDictionary.distanceSync( wordA , wordB ) ) ) 218 | }); 219 | data[i]= { 220 | word : wordA, 221 | value : tmp 222 | }; 223 | }); 224 | 225 | return data 226 | } 227 | 228 | /** 229 | * [ConversionFont 简繁转换] 230 | * @param {[type]} text [文本] 231 | * @param {[type]} type [类型 jt简体|ft繁体 默认jt ] 232 | * @return Object 233 | */ 234 | ConversionFont ( text , type ) { 235 | type = type || "jt" 236 | if ( type.toLowerCase() === "ft" ) { 237 | return HanLP.convertToTraditionalChineseSync( text ); 238 | } else { 239 | return HanLP.convertToSimplifiedChineseSync( text ); 240 | } 241 | } 242 | 243 | /** 244 | * [Pinyin 拼音转换] 245 | * @param {[type]} text [文本] 246 | * @param {[type]} type [类型 num数字音调|tone符号音调|outtone无音调|shengmu声母|yunmu韵母|head输入法头 默认outtone ] 247 | * @return Object 248 | */ 249 | Pinyin ( text , type ) { 250 | type = type || "jt" 251 | let PinYin = HanLP.convertToPinyinListSync( text ).toArraySync(); 252 | type = type.toLowerCase(); 253 | 254 | return PinYin.map( (item , i ) => { 255 | switch (type) { 256 | case "num": 257 | return item.toStringSync(); 258 | break; 259 | case "shengmu": 260 | return item.getShengmuSync().toStringSync(); 261 | break; 262 | case "yunmu": 263 | return item.getYunmuSync().toStringSync(); 264 | break; 265 | case "head": 266 | return item.getHeadSync().toStringSync(); 267 | break; 268 | case "tone": 269 | return item.getPinyinWithToneMarkSync(); 270 | case "outtone": 271 | default : 272 | return item.getPinyinWithoutToneSync(); 273 | } 274 | }) 275 | } 276 | 277 | } --------------------------------------------------------------------------------