├── .checker ├── README.md ├── scripts │ ├── before-install.sh │ ├── install-annotators.sh │ ├── install-benepar.sh │ ├── install-cabocha.sh │ ├── install-crf.sh │ ├── install-depccg.sh │ ├── install-jar.sh │ ├── install-juman.sh │ ├── install-knp.sh │ ├── install-mecab.sh │ ├── install-other-languages.sh │ ├── install-syntaxnet.sh │ ├── install-udpipe.sh │ ├── run-test.sh │ └── set-env.sh ├── setup.cfg └── tests │ ├── basetest.py │ ├── benepar │ └── test_benepar.py │ ├── cabocha │ └── test_cabocha.py │ ├── comparison.py │ ├── constant.py │ ├── corenlp │ ├── test_berkeleyparser_dcoref.py │ ├── test_dcoref.py │ ├── test_ssplit.py │ └── test_tokenize.py │ ├── corenlp_other_languages │ ├── test_chinese_coref.py │ └── test_french_depparse.py │ ├── depccg │ └── test_depccg_ccg.py │ ├── example_test.py │ ├── juman │ └── test_juman.py │ ├── knp │ └── test_knp.py │ ├── mecab │ └── test_mecab.py │ ├── syntaxnet │ └── test_syntaxnet.py │ └── udpipe │ ├── test_udpipe_parse.py │ └── test_udpipe_tokenize.py ├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── bin ├── sbt └── sbt-launch.jar ├── build.sbt ├── docker-compose.yml ├── dockers ├── knp │ └── Dockerfile └── syntaxnet │ └── Dockerfile ├── jar └── easyccg.jar ├── project ├── build.properties ├── buildinfo.sbt └── plugins.sbt ├── python ├── README.md ├── pipeline_example.py ├── pyjigg │ ├── __init__.py │ └── pipeline.py └── setup.py ├── script └── release.sh └── src ├── main ├── resources │ └── python │ │ ├── _depccg.py │ │ ├── bene_par.py │ │ └── udpipe.py └── scala │ └── jigg │ ├── ml │ ├── Example.scala │ ├── FeatureBase.scala │ ├── FeatureIndexer.scala │ ├── FeatureUtil.scala │ ├── LinearClassifier.scala │ ├── LogLinearAdaGradL1.scala │ ├── LogLinearClassifier.scala │ ├── LogLinearSGD.scala │ ├── OnlineLogLinearTrainer.scala │ ├── OnlineTrainer.scala │ ├── Perceptron.scala │ ├── WeightVector.scala │ └── keras │ │ ├── Convolution1D.scala │ │ ├── Dense.scala │ │ ├── Embedding.scala │ │ ├── Empty.scala │ │ ├── Flatten.scala │ │ ├── Functor.scala │ │ ├── KerasModel.scala │ │ ├── KerasParser.scala │ │ ├── README.md │ │ ├── Relu.scala │ │ ├── Sigmoid.scala │ │ ├── Softmax.scala │ │ └── Tanh.scala │ ├── nlp │ └── ccg │ │ ├── CCGBank.scala │ │ ├── CCGBank2EnjuXML.scala │ │ ├── CCGBankToCabochaFormat.scala │ │ ├── CalcCoverage.scala │ │ ├── EvalParser.scala │ │ ├── EvalSuperTagger.scala │ │ ├── GoldBunsetsuDepInCabocha.scala │ │ ├── LoadDumpedTaggerModel.scala │ │ ├── Opts.scala │ │ ├── OutputCategoryList.scala │ │ ├── ParserModel.scala │ │ ├── ParserRunner.scala │ │ ├── ParserTrainer.scala │ │ ├── README.md │ │ ├── RenderCCGDerivation.scala │ │ ├── SuperTaggerModel.scala │ │ ├── SuperTaggerRunner.scala │ │ ├── SuperTaggerTrainer.scala │ │ ├── TrainParser.scala │ │ ├── TrainSuperTagger.scala │ │ ├── lexicon │ │ ├── Bunsetsu.scala │ │ ├── CCGBankReader.scala │ │ ├── CabochaReader.scala │ │ ├── Category.scala │ │ ├── CategoryDictionary.scala │ │ ├── CategoryFeature.scala │ │ ├── CategoryManager.scala │ │ ├── CategoryParser.scala │ │ ├── CategoryTree.scala │ │ ├── Derivation.scala │ │ ├── Dictionary.scala │ │ ├── Direction.scala │ │ ├── JapaneseDictionary.scala │ │ ├── MecabReader.scala │ │ ├── Numbered.scala │ │ ├── NumberedManager.scala │ │ ├── ParseTree.scala │ │ ├── ParseTreeConverer.scala │ │ ├── PoS.scala │ │ ├── Sentence.scala │ │ ├── SimpleDictionary.scala │ │ ├── Slash.scala │ │ └── Word.scala │ │ ├── package.scala │ │ ├── parser │ │ ├── Action.scala │ │ ├── BeamSearchDecoder.scala │ │ ├── 
HeadFinder.scala │ │ ├── KBestDecoder.scala │ │ ├── Oracle.scala │ │ ├── Rule.scala │ │ ├── ShiftReduceFeature.scala │ │ ├── ShiftReduceFeatureExtractors.scala │ │ ├── State.scala │ │ ├── TransitionBasedParser.scala │ │ └── package.scala │ │ └── tagger │ │ ├── MaxentMultiTagger.scala │ │ ├── SuperTaggingFeature.scala │ │ ├── SuperTaggingFeatureExtractors.scala │ │ ├── UserDefinedFeatureExtractors.scala │ │ └── package.scala │ ├── pipeline │ ├── AnnotatingInParallel.scala │ ├── Annotation.scala │ ├── AnnotationError.scala │ ├── Annotator.scala │ ├── ArgumentError.scala │ ├── BeneParAnnotator.scala │ ├── BerkeleyParserAnnotator.scala │ ├── BunsetsuKerasAnnotator.scala │ ├── CCGParseAnnotator.scala │ ├── CabochaAnnotator.scala │ ├── CandCAnnotator.scala │ ├── DepCCGAnnotator.scala │ ├── DocumentAnnotator.scala │ ├── DocumentKNPAnnotator.scala │ ├── EasyCCGAnnotator.scala │ ├── IOCommunicator.scala │ ├── JumanAnnotator.scala │ ├── KNPAnnotator.scala │ ├── KuromojiAnnotator.scala │ ├── MecabAnnotator.scala │ ├── OutputConverter.scala │ ├── Pipeline.scala │ ├── PipelineServer.scala │ ├── PropsHolder.scala │ ├── RegexDocumentAnnotator.scala │ ├── RegexSentenceAnnotator.scala │ ├── Requirement.scala │ ├── SentencesAnnotator.scala │ ├── SimpleKNPAnnotator.scala │ ├── SpaceTokenizerAnnotator.scala │ ├── SsplitKerasAnnotator.scala │ ├── StanfordCollapsedDependenciesAnnotator.scala │ ├── StanfordCoreNLPAnnotator.scala │ ├── StanfordTypedDependenciesAnnotator.scala │ ├── SyntaxNetAnnotator.scala │ ├── SystemDict.scala │ ├── UDPipeAnnotator.scala │ └── UnmanagedAnnotators.scala │ └── util │ ├── ArgumentsParser.scala │ ├── CoNLLUtil.scala │ ├── HDF5Object.scala │ ├── IDGenerator.scala │ ├── IOUtil.scala │ ├── JSONUtil.scala │ ├── LogUtil.scala │ ├── LookupTable.scala │ ├── Normalizer.scala │ ├── Prop.java │ ├── PropertiesUtil.scala │ ├── ResourceUtil.scala │ ├── TreesUtil.scala │ └── XMLUtil.scala └── test ├── resources ├── data │ ├── Japanese.small.lexicon │ ├── Japanese.unkVerb.lexicon │ ├── json │ │ ├── english.ssplit.test.json │ │ └── japanese.ssplit.test.json │ ├── keras │ │ ├── bunsetsu_model.h5 │ │ ├── jpnLookupCharacter.json │ │ ├── jpnLookupWords.json │ │ └── ssplit_model.h5 │ ├── ml │ │ └── keras │ │ │ ├── convolution1d │ │ │ ├── convolution1d_gold.csv │ │ │ ├── convolution1d_input.csv │ │ │ └── convolution1d_model.h5 │ │ │ ├── dense │ │ │ ├── dense_gold.csv │ │ │ ├── dense_input.csv │ │ │ └── dense_model.h5 │ │ │ ├── embedding │ │ │ ├── embedding_gold.csv │ │ │ ├── embedding_input.csv │ │ │ └── embedding_model.h5 │ │ │ ├── flatten │ │ │ ├── flatten_gold.csv │ │ │ ├── flatten_input.csv │ │ │ └── flatten_model.h5 │ │ │ └── kerasModel │ │ │ ├── kerasModel_gold.csv │ │ │ ├── kerasModel_input.csv │ │ │ └── kerasModel_model.h5 │ ├── template.small.lst │ ├── template.unkVerb.lst │ └── xml │ │ ├── english.ssplit.spaceTokenize.gold.xml │ │ ├── english.ssplit.test.xml │ │ ├── japanese.ssplit.kuromoji.gold.xml │ │ └── japanese.ssplit.test.xml └── script │ └── create_small_lst_from_lexicon.py └── scala └── jigg ├── ml └── keras │ ├── Convolution1DSpec.scala │ ├── DenseSpec.scala │ ├── EmbeddingSpec.scala │ ├── FlattenSpec.scala │ ├── KerasModelSpec.scala │ └── KerasParserTest.scala ├── nlp └── ccg │ ├── lexicon │ ├── BunsetsuTest.scala │ ├── CCGBankReaderTest.scala │ ├── CategoryFeatureTest.scala │ ├── CategoryManagerTest.scala │ ├── CategoryParserTest.scala │ └── JapaneseDictionaryTest.scala │ └── parser │ ├── KBestDecoderTest.scala │ ├── OracleTest.scala │ ├── ParsedSentence.scala │ └── RuleTest.scala 
├── pipeline ├── AnnotatorSpec.scala ├── BaseAnnotatorSpec.scala ├── BeneParAnnotatorSpec.scala ├── BerkeleyParserAnnotatorSpec.scala ├── BunsetsuKerasAnnotatorTest.scala ├── CabochaAnnotatorSpec.scala ├── DepCCGAnnotatorSpec.scala ├── DocumentKNPAnnotatorSpec.scala ├── EasyCCGAnnotatorSpec.scala ├── IntermediateInputSpec.scala ├── JumanAnnotatorSpec.scala ├── KuromojiAnnotatorSpec.scala ├── MecabAnnotatorSpec.scala ├── PipelineSpec.scala ├── RegexSentenceAnnotatorTest.scala ├── RequirementSpec.scala ├── SimpleKNPAnnotatorSpec.scala ├── SsplitKerasAnnotatorTest.scala ├── StanfordTypedDependenciesAnnotatorSpec.scala └── SyntaxNetAnnotatorSpec.scala └── util ├── CoNLLUtilSpec.scala ├── JSONUtilSpec.scala ├── TreesUtilSpec.scala └── XMLUtilSpec.scala /.checker/scripts/before-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # upgrade c++ 4 | # add repository 5 | sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y 6 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F 7 | 8 | sudo apt update -y && sudo apt install g++-4.9 gcc-4.9 -y 9 | 10 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 10 11 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 20 12 | sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 10 13 | sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 20 14 | 15 | sudo rm /usr/bin/cpp 16 | 17 | sudo update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-4.8 10 18 | sudo update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-4.9 20 19 | sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30 20 | sudo update-alternatives --set cc /usr/bin/gcc 21 | sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30 22 | sudo update-alternatives --set c++ /usr/bin/g++ 23 | -------------------------------------------------------------------------------- /.checker/scripts/install-annotators.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ${ANNOTATORS} == "udpipe" ];then 4 | echo "Install UDPIPE" 5 | ./.checker/scripts/install-udpipe.sh 6 | elif [ ${ANNOTATORS} == "depccg" ];then 7 | echo "Install DEPCCG" 8 | ./.checker/scripts/install-depccg.sh 9 | elif [ ${ANNOTATORS} == "mecab" ];then 10 | echo "Install MECAB" 11 | ./.checker/scripts/install-mecab.sh 12 | elif [ ${ANNOTATORS} == "cabocha" ];then 13 | echo "Install CABOCHA" 14 | ./.checker/scripts/install-mecab.sh 15 | ./.checker/scripts/install-crf.sh 16 | ./.checker/scripts/install-cabocha.sh 17 | elif [ ${ANNOTATORS} == "juman" ];then 18 | echo "Install JUMAN" 19 | ./.checker/scripts/install-juman.sh 20 | elif [ ${ANNOTATORS} == "knp" ];then 21 | echo "Install KNP" 22 | ./.checker/scripts/install-knp.sh 23 | elif [ ${ANNOTATORS} == "corenlp" ];then 24 | echo "Install CORENLP" 25 | ./.checker/scripts/install-jar.sh 26 | elif [ ${ANNOTATORS} == "corenlp_other_languages" ];then 27 | echo "Install CORENLP OTHER LANGUAGE" 28 | ./.checker/scripts/install-jar.sh 29 | ./.checker/scripts/install-other-languages.sh 30 | elif [ ${ANNOTATORS} == "benepar" ];then 31 | echo "Install BENEPAR" 32 | ./.checker/scripts/install-jar.sh 33 | ./.checker/scripts/install-benepar.sh 34 | elif [ ${ANNOTATORS} == "syntaxnet" ];then 35 | echo "Install SYNTAXNET" 36 | ./.checker/scripts/install-jar.sh 37 | ./.checker/scripts/install-syntaxnet.sh 38 | fi 39 |
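40 | # The ANNOTATORS variable is provided by the CI environment (see the env: list in .travis.yml). 41 | # For a local run it can be set explicitly before calling this script, e.g.: 42 | # ANNOTATORS=mecab ./.checker/scripts/install-annotators.sh && .checker/scripts/run-test.sh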
-------------------------------------------------------------------------------- /.checker/scripts/install-benepar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | pip install cython numpy 6 | pip install benepar[cpu] 7 | 8 | python -c 'import benepar; benepar.download("benepar_en2")' 9 | -------------------------------------------------------------------------------- /.checker/scripts/install-cabocha.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | source ./.checker/scripts/set-env.sh 6 | 7 | home_dir=`pwd ./` 8 | 9 | url="https://github.com/taku910/cabocha/archive/master.zip" 10 | file=master.zip 11 | dir=cabocha-master 12 | 13 | # download 14 | wget ${url} 15 | 16 | # unpack 17 | unzip ${file} 18 | 19 | # compile 20 | cd ${home_dir}"/"${dir} 21 | ./autogen.sh 22 | ./configure --with-charset=UTF8 23 | make 24 | make check 25 | sudo make install 26 | 27 | cd ${home_dir} 28 | -------------------------------------------------------------------------------- /.checker/scripts/install-crf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | 7 | # To get file id, you singup google account. 8 | url="https://drive.google.com/uc?export=view&id=0B4y35FiV1wh7QVR6VXJ5dWExSTQ" 9 | file=CRF++-0.58.tar.gz 10 | dir=CRF++-0.58 11 | 12 | wget ${url} -O ${file} 13 | 14 | tar -zxvf ${file} 15 | 16 | cd ${home_dir}"/"${dir} 17 | ./configure 18 | make 19 | sudo make install 20 | 21 | cd ${home_dir} 22 | -------------------------------------------------------------------------------- /.checker/scripts/install-depccg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | 7 | pip install cython numpy 8 | pip install depccg 9 | 10 | depccg_en download 11 | depccg_ja download 12 | 13 | # en_model_url=http://cl.naist.jp/~masashi-y/resources/depccg/en_hf_tri.tar.gz 14 | # ja_model_url=http://cl.naist.jp/~masashi-y/resources/depccg/ja_hf_ccgbank.tar.gz 15 | # en_model=en_hf_tri.tar.gz 16 | # ja_model=ja_hf_ccgbank.tar.gz 17 | 18 | # model_dir="depccg/models" 19 | # src_dir="depccg/src" 20 | 21 | # # Install cython & chainer. 22 | # pip install -U pip cython 23 | # pip install chainer 24 | # pip install scrapy 25 | 26 | # # Git clone the depccg repository 27 | # git clone https://github.com/masashi-y/depccg.git 28 | 29 | # # download model file. 30 | # wget ${en_model_url} 31 | # wget ${ja_model_url} 32 | 33 | # # make directory saved model file 34 | # mkdir ${model_dir} 35 | # mv ${en_model} ${ja_model} ${model_dir} 36 | 37 | # # compile 38 | # # A default g++ version is 4.8 in Ubuntu 14.04. 39 | # # In depccg compile, it requires the version >= 4.9. 40 | # export CC=g++-4.9 41 | # cd ${home_dir}"/"${src_dir} 42 | # python setup.py build_ext --inplace 43 | 44 | # ln -s depccg*.so depccg.so 45 | 46 | # # unpack model files. 
47 | # cd ${home_dir}"/"${model_dir} 48 | # tar -zxvf ${en_model} 49 | # tar -zxvf ${ja_model} 50 | 51 | # cd ${home_dir} 52 | -------------------------------------------------------------------------------- /.checker/scripts/install-jar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | jar_dir="jar/" 7 | 8 | 9 | # download stanford corenlp 10 | url=http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip 11 | zip=stanford-corenlp-full-2018-10-05.zip 12 | dir=stanford-corenlp-full-2018-10-05 13 | file=stanford-corenlp-3.9.2.jar 14 | file_model=stanford-corenlp-3.9.2-models.jar 15 | 16 | # download Stanford CoreNLP models 17 | wget ${url} 18 | 19 | # unpack 20 | unzip ${zip} 21 | 22 | cp ${dir}"/"${file} ${jar_dir} 23 | cp ${dir}"/"${file_model} ${jar_dir} 24 | 25 | 26 | # create jigg jar file 27 | jigg_file="target/jigg-assembly-0.8.0.jar" 28 | ./bin/sbt assembly 29 | cp ${jigg_file} ${jar_dir} 30 | 31 | 32 | # download jigg-models 33 | jigg_models="jigg-models.jar" 34 | wget https://github.com/mynlp/jigg-models/raw/master/jigg-models.jar 35 | mv ${jigg_models} ${jar_dir} 36 | -------------------------------------------------------------------------------- /.checker/scripts/install-juman.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | 7 | url=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 8 | file=juman-7.01.tar.bz2 9 | dir=juman-7.01 10 | 11 | # download 12 | wget ${url} 13 | 14 | # unpack bz2 file 15 | tar -jxvf ${file} 16 | 17 | # build 18 | cd ${dir} 19 | ./configure 20 | make 21 | sudo make install 22 | 23 | cd ${home_dir} 24 | -------------------------------------------------------------------------------- /.checker/scripts/install-knp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | docker build -t jigg/jigg:knp -f dockers/knp/Dockerfile . 6 | -------------------------------------------------------------------------------- /.checker/scripts/install-mecab.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | sudo apt install libmecab2 libmecab-dev mecab mecab-ipadic-utf8 mecab-ipadic mecab-utils 6 | -------------------------------------------------------------------------------- /.checker/scripts/install-other-languages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | jar_dir="jar" 4 | 5 | # chinese model jar file 6 | wget http://nlp.stanford.edu/software/stanford-chinese-corenlp-2018-10-05-models.jar 7 | mv stanford-chinese-corenlp-2018-10-05-models.jar ${jar_dir} 8 | 9 | # french model jar file 10 | wget http://nlp.stanford.edu/software/stanford-french-corenlp-2018-10-05-models.jar 11 | mv stanford-french-corenlp-2018-10-05-models.jar ${jar_dir} 12 | 13 | 14 | -------------------------------------------------------------------------------- /.checker/scripts/install-syntaxnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | docker build -t jigg/jigg:syntaxnet -f dockers/syntaxnet/Dockerfile . 
6 | -------------------------------------------------------------------------------- /.checker/scripts/install-udpipe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | pip install ufal.udpipe 6 | 7 | # model download 8 | curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2364/udpipe-ud-2.0-170801.zip 9 | 10 | # unpack 11 | unzip udpipe-ud-2.0-170801.zip 12 | 13 | # rename model directory 14 | mv udpipe-ud-2.0-170801 udpipe-ud-model 15 | -------------------------------------------------------------------------------- /.checker/scripts/run-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | source .checker/scripts/set-env.sh 6 | 7 | # run a unit test for the files under the directory `.checker/tests/${ANNOTATORS}`. 8 | python3 -m unittest discover -s .checker/tests/${ANNOTATORS} 9 | -------------------------------------------------------------------------------- /.checker/scripts/set-env.sh: -------------------------------------------------------------------------------- 1 | export JIGG_VERSION="0.8.0" 2 | export CORENLP_VERSION="3.9.2" 3 | export IVY2_CACHE_DIR="${HOME}/.ivy2/cache" 4 | 5 | export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:$LD_LIBRARY_PATH 6 | -------------------------------------------------------------------------------- /.checker/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 -------------------------------------------------------------------------------- /.checker/tests/constant.py: -------------------------------------------------------------------------------- 1 | JIGG_VERSION = "0.8.0" 2 | CORENLP_VERSION = "3.9.2" 3 | 4 | JIGG_JAR = "target/jigg-assembly-{}.jar".format(JIGG_VERSION) 5 | JIGG_MODEL_JAR = "jigg-models.jar" 6 | 7 | CORENLP_MODEL_JAR = "stanford-corenlp-{}-models.jar".format(CORENLP_VERSION) 8 | 9 | 10 | # URL 11 | # juman 12 | JUMAN_MAIN_URL = "http://nlp.ist.i.kyoto-u.ac.jp/?JUMAN" 13 | JUMAN_DOWNLOAD_URL = "http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2" 14 | 15 | # knp 16 | KNP_MAIN_URL = "http://nlp.ist.i.kyoto-u.ac.jp/?KNP" 17 | KNP_DOWNLOAD_URL = "http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/knp/knp-4.19.tar.bz2" 18 | 19 | # CRF 20 | CRF_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7QVR6VXJ5dWExSTQ" 21 | 22 | # cabocha 23 | CABOCHA_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7SDd1Q1dUQkZQaUU" 24 | 25 | # depccg 26 | DEPCCG_EN_MODLE_URL = "http://cl.naist.jp/~masashi-y/resources/depccg/en_hf_tri.tar.gz" 27 | DEPCCG_JA_MODEL_URL = "http://cl.naist.jp/~masashi-y/resources/depccg/ja_hf_ccgbank.tar.gz" 28 | DEPCCG_GIT_URL = "https://github.com/masashi-y/depccg.git" 29 | 30 | # udpipe 31 | UDPIPE_MODEL_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2364/udpipe-ud-2.0-170801.zip" 32 | -------------------------------------------------------------------------------- /.checker/tests/corenlp/test_ssplit.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestSsplit(BaseTest): 8 | 9 | def setUp(self): 10 | 11 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891." 
12 | 13 | self.expected_text = """ 14 | 15 | 16 | 17 | 18 | Stanford University is located in California. 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | It is a great university, founded in 1891. 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | """ 47 | 48 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize,ssplit]' 49 | 50 | def test_ssplit(self): 51 | self.check_equal(self.exe, self.input_text, self.expected_text) 52 | -------------------------------------------------------------------------------- /.checker/tests/corenlp/test_tokenize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestTokenize(BaseTest): 8 | 9 | def setUp(self): 10 | 11 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891." 12 | 13 | self.expected_text = """ 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | """ 41 | 42 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]' 43 | 44 | def test_tokenize(self): 45 | self.check_equal(self.exe, self.input_text, self.expected_text) 46 | -------------------------------------------------------------------------------- /.checker/tests/example_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestName(BaseTest): 8 | ''' 9 | This is an exmaple (or a based) file of unittest. You want 10 | to add the new test file, please copy this file and edit 11 | it as the following. 12 | 13 | 1. Copy this file 14 | please, copy this file as the following command: 15 | ``` 16 | cp example_test.py {ANNOTATORS}/test_***.py 17 | ``` 18 | The {ANNOTATORS} is annotator name. 19 | You need to name the file like `test_***.py`. `***` is any name. 20 | Note the head to the file name must give the `test`. For example, 21 | `test_tokenize.py`. 22 | 2. Change the class name 23 | For each the test case, You change the class name from 24 | TestName to Test***. `***` is any name, for example, 25 | Tokenize, Ssplit, ... etc. 26 | 3. Change three variables in the setUp() function 27 | - self.input_text : a sample text using for test 28 | - self.expected_text : an expected output text by test run 29 | - self.exe : an execution command 30 | This program runs with the sbt runMain command. For example, 31 | `sbt "runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]"`. 32 | You set the part of "runMain ~" in the variable `self.exe`. 33 | 4. Change the function name. 34 | For each the test case, You also change the function name 35 | from test_name to test_***. `***` is any name, for example, 36 | tokenize, ssplit, ... etc. Note that the head of the 37 | function name must give the `test`. 38 | 39 | For example, the case of the annotator `pos`: 40 | 1. file name -> test_pos.py 41 | 2. class name -> class TestPos(BaseTest): 42 | 3. variables -> 43 | self.input_text = "This is a sample text." 44 | self.expected_text = "[the result text]" 45 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize,ssplit,pos]' 46 | 4. 
function name -> def test_pos(self): 47 | ''' 48 | def setUp(self): 49 | # Set an input (sample) text 50 | self.input_text = "" 51 | 52 | # Set an expected text 53 | self.expected_text = "" 54 | 55 | # Set a execution command 56 | # You need to change the `-annotators` term according to the test case. 57 | # For example, the case of annotation `lemma`, corenlp[tokenize,ssplit,pos,lemma]. 58 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]' 59 | 60 | def test_name(self): 61 | # A function check_equal() is defined on the superclass BaseTest. 62 | self.check_equal(self.exe, self.input_text, self.expected_text) 63 | -------------------------------------------------------------------------------- /.checker/tests/udpipe/test_udpipe_tokenize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestUDpipeTokenize(BaseTest): 8 | 9 | def setUp(self): 10 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891." 11 | 12 | self.expected_text = r""" 13 | 14 | 15 | 16 | 17 | Stanford University is located in California. 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | It is a great university, founded in 1891. 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | """ 46 | 47 | self.exe = 'runMain jigg.pipeline.Pipeline ' \ 48 | + '-annotators udpipe[tokenize] ' \ 49 | + '-udpipe.model udpipe-ud-model/english-ud-2.0-170801.udpipe ' 50 | 51 | def test_udpipe_tokenize(self): 52 | self.check_equal(self.exe, self.input_text, self.expected_text) 53 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !bin 3 | !project 4 | !python 5 | !script 6 | !src 7 | !build.sbt 8 | !jar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *.pyc 4 | .lock* 5 | .waf* 6 | *.class 7 | build/ 8 | target/ 9 | .idea/ 10 | models/ 11 | tools/ 12 | download 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - 2.11.8 5 | 6 | dist: trusty 7 | 8 | env: 9 | - ANNOTATORS=corenlp 10 | - ANNOTATORS=corenlp_other_languages 11 | - ANNOTATORS=udpipe 12 | - ANNOTATORS=depccg 13 | - ANNOTATORS=juman 14 | - ANNOTATORS=knp 15 | - ANNOTATORS=mecab 16 | - ANNOTATORS=cabocha 17 | - ANNOTATORS=benepar 18 | # - ANNOTATORS=syntaxnet 19 | 20 | before_install: 21 | - ./.checker/scripts/before-install.sh 22 | - pyenv global system 3.6 23 | - virtualenv --python=python3.6 .venv 24 | - source .venv/bin/activate 25 | - pip install --upgrade pip 26 | 27 | install: 28 | - ./.checker/scripts/install-annotators.sh 29 | 30 | script: 31 | - .checker/scripts/run-test.sh 32 | 33 | branches: 34 | only: 35 | - master -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8 2 | 3 | WORKDIR /jigg 4 | 5 | # Download dependencies 6 | COPY build.sbt /jigg/ 7 | COPY project/*.sbt project/build.properties /jigg/project/ 8 | COPY bin /jigg/bin 9 | 
RUN bin/sbt update 10 | 11 | # Build 12 | COPY src /jigg/src 13 | COPY jar /jigg/jar 14 | RUN bin/sbt assembly 15 | 16 | # Run a simple test 17 | RUN echo "テレビで自転車で走っている少女を見た" |\ 18 | java -Xms1024M -Xmx1024M -cp "target/*:jar/jigg-models.jar" \ 19 | jigg.pipeline.Pipeline -annotators ssplit,kuromoji,jaccg 20 | -------------------------------------------------------------------------------- /bin/sbt: -------------------------------------------------------------------------------- 1 | java -Dfile.encoding=UTF-8 -Xms512M -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=384M -jar `dirname $0`/sbt-launch.jar "$@" 2 | -------------------------------------------------------------------------------- /bin/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/bin/sbt-launch.jar -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | jigg: 4 | build: . 5 | ports: 6 | - 8080:8080 7 | entrypoint: 8 | - java 9 | - -Xms1024M 10 | - -Xmx1024M 11 | - -cp 12 | - "target/*:jar/*" 13 | - jigg.pipeline.PipelineServer 14 | - -host 15 | - 0.0.0.0 16 | volumes: 17 | - ./script:/jigg/script 18 | - ./jar:/jigg/jar 19 | -------------------------------------------------------------------------------- /dockers/knp/Dockerfile: -------------------------------------------------------------------------------- 1 | # If you build a image using this file, please run the following command at a directory `jigg/`, 2 | # ``` 3 | # docker build -t {image name}:{tag} -f docker/knp/Dockerfile . 4 | # ``` 5 | FROM jigg/jigg-dockers:knp 6 | 7 | WORKDIR /jigg 8 | 9 | ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/bin:/usr/local/lib 10 | ENV PATH $PATH:$HOME/usr/bin 11 | 12 | COPY build.sbt /jigg/ 13 | COPY project/*.sbt project/build.properties /jigg/project/ 14 | COPY bin /jigg/bin 15 | RUN bin/sbt update 16 | 17 | # Build 18 | COPY src /jigg/src 19 | COPY jar /jigg/jar 20 | RUN bin/sbt assembly -------------------------------------------------------------------------------- /dockers/syntaxnet/Dockerfile: -------------------------------------------------------------------------------- 1 | # If you build a image using this file, please run the following command at a directory 'jigg/', 2 | # ``` 3 | # docker build -t {image name}:{tag} -f dockers/syntaxnet/Dockerfile . 
4 | # ``` 5 | 6 | FROM tensorflow/syntaxnet 7 | 8 | WORKDIR /jigg 9 | 10 | RUN apt-get update -y && apt-get install -y less wget tar bzip2 unzip sudo make gcc g++ libz-dev 11 | 12 | # install jigg 13 | COPY build.sbt /jigg/ 14 | COPY project/*.sbt project/build.properties /jigg/project/ 15 | COPY bin /jigg/bin 16 | RUN bin/sbt update 17 | 18 | # Build 19 | COPY src /jigg/src 20 | COPY jar /jigg/jar 21 | RUN bin/sbt assembly 22 | -------------------------------------------------------------------------------- /jar/easyccg.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/jar/easyccg.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.0 2 | -------------------------------------------------------------------------------- /project/buildinfo.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.7.0") 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | 3 | // for sbt-sonatype (https://github.com/xerial/sbt-sonatype) 4 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.1") 5 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0") 6 | -------------------------------------------------------------------------------- /python/pipeline_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyjigg import Pipeline 4 | import xml.etree.ElementTree as ET 5 | import json 6 | 7 | '''Example to use Jigg from python. 8 | 9 | Before using this, users must start the PipelineServer in a command line, e.g.: 10 | $ cd jigg-0.6.2/ 11 | $ java -Xmx4g -cp "*" jigg.pipeline.PipelineServer 12 | ''' 13 | 14 | if __name__ == '__main__': 15 | pipeline = Pipeline('http://localhost:8080') 16 | 17 | text1 = """This is the first sentence. This is the second sentence.""" 18 | 19 | text2 = """This is the third sentence. 
This is the forth sentence.""" 20 | 21 | output1 = pipeline.annotate(text1, { 22 | 'annotators': 'corenlp[tokenize,ssplit]', 23 | 'outputFormat': 'xml'}) 24 | print ET.tostring(output1) 25 | 26 | output2 = pipeline.annotate(text2, { 27 | 'annotators': 'corenlp[tokenize,ssplit]', 28 | 'outputFormat': 'json'}) 29 | print json.dumps(output2, indent=4) 30 | -------------------------------------------------------------------------------- /python/pyjigg/__init__.py: -------------------------------------------------------------------------------- 1 | from pyjigg.pipeline import Pipeline 2 | -------------------------------------------------------------------------------- /python/pyjigg/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import xml.etree.ElementTree as ET 4 | import json 5 | import requests 6 | 7 | JIGG = 'jigg-0.6.2' 8 | 9 | class Pipeline: 10 | 11 | def __init__(self, server_url): 12 | if server_url[-1] == '/': 13 | server_url = server_url[:-1] 14 | self.server_url = server_url 15 | 16 | def annotate(self, text, properties=None): 17 | assert isinstance(text, str) 18 | if properties is None: 19 | properties = {} 20 | else: 21 | assert isinstance(properties, dict) 22 | 23 | # Checks that the Jigg Pipeline server is started. 24 | try: 25 | requests.get(self.server_url) 26 | except requests.exceptions.ConnectionError: 27 | raise Exception('Check whether you have started the Jigg\'s PipelineServer e.g.\n' 28 | '$ cd %s/ \n' 29 | '$ java -Xmx4g -cp "*" jigg.pipeline.PipelineServer' % (JIGG)) 30 | 31 | url = self.server_url + '/annotate' 32 | text = text.encode() 33 | data = properties.copy() 34 | data['q'] = text 35 | r = requests.post(url, data=data) 36 | output = r.text 37 | if ('outputFormat' in properties and properties['outputFormat'] == 'json'): 38 | try: 39 | output = json.loads(output, encoding='utf-8', strict=True) 40 | except: 41 | pass 42 | else: 43 | try: 44 | output = ET.fromstring(output) 45 | except: 46 | pass 47 | 48 | return output 49 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name = "pyjigg", 5 | packages=['pyjigg'], 6 | version = "0.1.0", 7 | ) 8 | -------------------------------------------------------------------------------- /script/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Usage: ./script/release.sh (e.g., 0.7.2) 4 | 5 | version=$1 6 | corenlp_url='http://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip' 7 | corenlp_model='stanford-corenlp-3.9.1-models.jar' 8 | jigg_url='git@github.com:mynlp/jigg.git' 9 | 10 | corenlp_zip=${corenlp_url##*/} 11 | corenlp_dir=${corenlp_zip%.*} 12 | 13 | if [[ ! -e jigg-${version} ]]; then mkdir jigg-${version}; fi 14 | cd jigg-${version} 15 | 16 | # get jigg, if needed 17 | if [[ ! -e jigg ]]; then 18 | git clone $jigg_url 19 | fi 20 | 21 | # add corenlp model 22 | if [[ ! -e ${corenlp_dir} ]]; then 23 | wget ${corenlp_url} -O ${corenlp_zip} 24 | unzip ${corenlp_zip} 25 | mv ${corenlp_dir}/${corenlp_model} jigg 26 | fi 27 | 28 | # add assembled jigg 29 | if [[ ! 
-e jigg/jigg-$1.jar ]]; then 30 | cd jigg 31 | ./bin/sbt assembly 32 | mv target/jigg-assembly-$1.jar jigg-$1.jar 33 | ./bin/sbt clean 34 | cd ../ 35 | fi 36 | 37 | for f in 'src/test' '.checker' '.git' 'project' 'target'; do 38 | if [[ -e jigg/$f ]]; then 39 | rm -rf jigg/$f 40 | fi 41 | done 42 | 43 | if [[ -e jigg/.git ]]; then 44 | rm -rf jigg/.git 45 | fi 46 | 47 | # if [[ -e jigg/src/test ]]; then 48 | # rm -rf jigg/src/test 49 | # fi 50 | 51 | # if [[ -e jigg/.checker ]]; then rm -rf jigg/.checker; fi 52 | 53 | # if [[ -e jigg/project ]]; then rm -rf jigg/project; fi 54 | # if [[ -e jigg/target ]]; then rm -rf jigg/target; fi 55 | 56 | # add jigg models (berkeley parser model inside) 57 | if [[ ! -e jigg/jigg-models.jar ]]; then 58 | cd jigg 59 | wget https://github.com/mynlp/jigg-models/raw/master/jigg-models.jar 60 | cd ../ 61 | fi 62 | 63 | mv jigg jigg-${version} 64 | zip -r jigg-${version}.zip jigg-${version} 65 | -------------------------------------------------------------------------------- /src/main/resources/python/bene_par.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function, unicode_literals 3 | import sys 4 | 5 | import benepar 6 | 7 | # In Python2, wrap sys.stdin and sys.stdout to work with unicode. 8 | if sys.version_info[0] < 3: 9 | import codecs 10 | import locale 11 | encoding = locale.getpreferredencoding() 12 | sys.stdin = codecs.getreader(encoding)(sys.stdin) 13 | sys.stdout = codecs.getwriter(encoding)(sys.stdout) 14 | 15 | if sys.version_info.major == 3: 16 | raw_input = input 17 | 18 | model = sys.argv[1] # maybe "benepar_en" 19 | 20 | parser = benepar.Parser(model) 21 | 22 | def parse(tokens, tags): 23 | sentence = list(zip(tokens, tags)) 24 | parse_raw, tags_raw, sentence = next(parser._batched_parsed_raw([(tokens, sentence)])) 25 | tree = parser._make_nltk_tree(sentence, tags_raw, *parse_raw) 26 | return tree 27 | 28 | while True: 29 | tokens = raw_input() 30 | tags = raw_input() 31 | 32 | tokens = tokens.split(' ') 33 | tags = tags.split(' ') 34 | 35 | tree = parse(tokens, tags) 36 | print(tree) 37 | print("END") 38 | -------------------------------------------------------------------------------- /src/main/resources/python/udpipe.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function, unicode_literals 3 | import sys 4 | 5 | from ufal.udpipe import Model, Pipeline, ProcessingError 6 | 7 | # In Python2, wrap sys.stdin and sys.stdout to work with unicode. 8 | if sys.version_info[0] < 3: 9 | import codecs 10 | import locale 11 | encoding = locale.getpreferredencoding() 12 | sys.stdin = codecs.getreader(encoding)(sys.stdin) 13 | sys.stdout = codecs.getwriter(encoding)(sys.stdout) 14 | 15 | if sys.version_info.major == 3: 16 | raw_input = input 17 | 18 | # To reduce the overhead we divide the patterns of a possible pipeline into 3 cases. 
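# Mode names are matched by substring below: a mode containing 'tok' reads raw text and tokenizes it (otherwise CoNLL-U input is expected), 'pos' enables the tagger, 'par' enables the dependency parser, and 'all' runs tokenizer, tagger and parser together.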
19 | _MODE_ = ['all', 'tok|pos', 'pos|par', 'tok', 'pos', 'par'] 20 | 21 | model = sys.argv[1] 22 | mode = sys.argv[2] # one of _MODE_ 23 | 24 | model = Model.load(model) 25 | 26 | if mode == 'all' or mode.find('tok') >= 0: input_format = 'tokenize' 27 | else: input_format = 'conllu' 28 | output_format = 'conllu' 29 | 30 | if mode == 'all' or mode.find('pos') >= 0: pos = Pipeline.DEFAULT 31 | else: pos = Pipeline.NONE 32 | 33 | if mode == 'all' or mode.find('par') >= 0: parse = Pipeline.DEFAULT 34 | else: parse = Pipeline.NONE 35 | 36 | pipeline = Pipeline( 37 | model, input_format, pos, parse, output_format) 38 | error = ProcessingError() 39 | 40 | while True: 41 | inputs = [] 42 | while True: 43 | line = raw_input() 44 | if line == '####EOD####': break 45 | inputs.append(line) 46 | 47 | result = pipeline.process('\n'.join(inputs), error) 48 | print(result) 49 | print('END') 50 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/Example.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | case class Example[L](featVec:Array[Int], label:L) 20 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/FeatureBase.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | trait FeatureBase 20 | 21 | // Unlabeld feature, but not limited to: user may want to create features always with label (e.g., in structured classification exam). In such case, please include label to this class and ignore LabeldFeature. 
22 | trait Feature extends FeatureBase { 23 | type LabelType 24 | type DictionaryType 25 | def assignLabel(label:LabelType): LabeledFeature[LabelType] 26 | def concat(items:Any*): String = items.mkString("_###_") 27 | } 28 | 29 | trait LabeledFeature[L] extends FeatureBase { 30 | def unlabeled: Feature 31 | def label: L 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/FeatureIndexer.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.{HashMap, ArrayBuffer} 20 | 21 | @SerialVersionUID(1L) 22 | trait FeatureIndexer[Feature] extends Serializable { 23 | def size: Int 24 | 25 | /** Mutable indexing method which may add a new entry into the backbone map 26 | */ 27 | def getIndex(key: Feature): Int 28 | 29 | /** Immutable indexing, -1 for unknown entry. 30 | */ 31 | def get(key: Feature) = getIndex(key) 32 | } 33 | 34 | @SerialVersionUID(1L) 35 | class ExactFeatureIndexer[Feature](val map: HashMap[Feature, Int]) extends FeatureIndexer[Feature] { 36 | 37 | def size: Int = map.size 38 | 39 | def getIndex(key: Feature) = map.getOrElseUpdate(key, map.size) 40 | 41 | override def get(key: Feature) = map.getOrElse(key, -1) 42 | } 43 | 44 | /** FeatureIndexer with hash trick. Hash value is calculated with MurmurHash3. 45 | * 46 | * Pros of this approach are: 47 | * 1) Very memory efficient; we don't have to hold a hashmap for millions of feature objects; 48 | * 2) Small loading time of model. 49 | * 50 | * The expense is a small loss of accuracy but usually this is really small... 
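 * A minimal usage sketch with the defaults (hashCode-based hasher, table size just below 2^24): * val indexer = HashedFeatureIndexer[String]() * val idx = indexer.getIndex("w=dog") // abs(hash) modulo a fixed prime, so the indexer never grows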
51 | */ 52 | @SerialVersionUID(1L) 53 | class HashedFeatureIndexer[Feature] private( 54 | val maxFeatureSize: Int, 55 | val hasher: (Feature => Int)) extends FeatureIndexer[Feature] { 56 | 57 | def size = maxFeatureSize 58 | 59 | def getIndex(key: Feature) = (math.abs(hasher(key)) % maxFeatureSize) 60 | } 61 | 62 | object HashedFeatureIndexer { 63 | def apply[Feature]( 64 | maxFeatureSize: Int = (2 << 23), 65 | hasher: (Feature => Int) = {f: Feature => f.hashCode()}) = { 66 | 67 | val biggestPrimeBelow = primes.takeWhile(maxFeatureSize > _).last 68 | new HashedFeatureIndexer[Feature](biggestPrimeBelow, hasher) 69 | } 70 | 71 | private lazy val primes = 2 #:: sieve(3) 72 | 73 | private def sieve(n: Int): Stream[Int] = 74 | if (primes.takeWhile(p => p*p <= n).exists(n % _ == 0)) sieve(n + 2) 75 | else n #:: sieve(n + 2) 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/FeatureUtil.scala: -------------------------------------------------------------------------------- 1 | // package jigg.ml 2 | 3 | // import scala.collection.mutable.{Map => mMap} 4 | // import scala.collection.mutable.AnyRefMap 5 | 6 | // trait FeatureUtil[Feature <: AnyRef] { 7 | // type FeatureIndexer = AnyRefMap[Feature, Int] 8 | 9 | // def getIndex(indexer: FeatureIndexer, key: Feature) = indexer.getOrElseUpdate(key, indexer.size) 10 | 11 | // def removeIndexes(indexer: FeatureIndexer, idxs: Seq[Int]): Unit = { 12 | // val features = indexer.toSeq.sortWith(_._2 < _._2).map(_._1) 13 | // val originalSize = indexer.size 14 | // (0 to idxs.size) foreach { i => 15 | // val idx = if (i == idxs.size) originalSize else idxs(i) 16 | // val lastIdx = if (i == 0) -1 else idxs(i - 1) 17 | // (lastIdx + 1 until idx) foreach { f => indexer(features(f)) -= i } 18 | // if (i != idxs.size) indexer -= features(idx) 19 | // } 20 | // } 21 | // def removeElemsOver(indexer: FeatureIndexer, lastIdx: Int) = indexer.toSeq.foreach { 22 | // case (feature, idx) => 23 | // indexer -= feature 24 | // } 25 | // } 26 | 27 | // // example usage: 28 | // object FeatureUtilExample { 29 | // case class MyFeature(unlabeled: String, label: Int) 30 | // object FU extends FeatureUtil[MyFeature] 31 | 32 | // def run = { 33 | // val indexer = new FU.FeatureIndexer 34 | // FU.getIndex(indexer, MyFeature("hoge", 10)) 35 | // } 36 | // } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LinearClassifier.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | trait Classifier[L] { 20 | 21 | protected val weights: WeightVector[Float] 22 | 23 | def predict(examples: Seq[Example[L]]): (L, Float) 24 | } 25 | 26 | trait LinearClassifier[L] extends Classifier[L] { 27 | 28 | override def predict(examples: Seq[Example[L]]): (L, Float) = 29 | if (examples.isEmpty) (null.asInstanceOf[L], 0F) 30 | else examples.map { e => (e.label, featureScore(e.featVec)) }.maxBy(_._2) 31 | 32 | def featureScore(feature: Array[Int]): Float = { 33 | var a = 0F 34 | var i = 0 35 | while (i < feature.size) { 36 | a += weight(feature(i)) 37 | i += 1 38 | } 39 | a 40 | } 41 | /** Control the behavior of the access to weight. 42 | * You *MUST* use this method to access weight inside the classifier, and *NEVER* call like weights(i) directly (except updating the value) 43 | * This is because in some classifiers, such as AdaGradL1, the values must be preprocessed (e.g., lazy update) before used. 44 | * You can add such a preprocessing by overriding this method in a subclass. 45 | */ 46 | protected def weight(idx: Int): Float = weights(idx) 47 | } 48 | 49 | /** A classifier in which weight vector backbone is implemented by array, hopefully faster than growable counterpart. 50 | */ 51 | class FixedClassifier[L](val array: Array[Float]) extends LinearClassifier[L] { 52 | override val weights = new FixedWeightVector(array) 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LogLinearAdaGradL1.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | abstract class LogLinearAdaGradL1[L](val lambda: Float, val eta: Float) extends OnlineLogLinearTrainer[L] { 20 | 21 | private[this] val lastUpdates = WeightVector.growable[Float]() 22 | private[this] val diagGt = WeightVector.growable[Float]() 23 | 24 | override protected def weight(idx: Int): Float = 25 | if (lastUpdates(idx) == time) weights(idx) 26 | else { 27 | val currentXti = weights(idx) 28 | if (currentXti == 0.0F) 0.0F 29 | else { 30 | val t0 = lastUpdates(idx) 31 | assert(time != 0) 32 | val ht0ii = 1.0 + Math.sqrt(diagGt(idx)) 33 | val newWeight = Math.signum(currentXti) * Math.max( 34 | 0.0, Math.abs(currentXti) - (lambda * eta / ht0ii) * (time - t0)) 35 | weights(idx) = newWeight.toFloat 36 | lastUpdates(idx) = time 37 | newWeight.toFloat 38 | } 39 | } 40 | 41 | override def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit = { 42 | // Here, we negate the gradient. This is because original formulation by Duch et al. 43 | // minimizes the objective, while we maximize the objective. 
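// With g = -derivative, the loop below applies AdaGrad with L1 regularization via soft-thresholding: // G_ii += g*g; x_i <- sign(x_i - eta*g/(1 + sqrt(G_ii))) * max(0, |x_i - eta*g/(1 + sqrt(G_ii))| - lambda*eta/(1 + sqrt(G_ii)))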
44 | val gti = -derivative 45 | val deltaDiagGti = gti * gti // these are shared by all i below, so we cache here 46 | 47 | val feats = e.featVec 48 | var j = 0 49 | while (j < feats.size) { 50 | val i = feats(j) 51 | 52 | //val xti = weight(i) // This automatically perform lazy update of the target weight 53 | val xti = weights(i) // weighs(i) must be lazy-updated at calculating label scores, so we can skip 54 | diagGt(i) += deltaDiagGti 55 | val htii = 1.0 + Math.sqrt(diagGt(i)) 56 | val etaOverHtii = eta / htii 57 | val tempXti = xti - etaOverHtii * gti 58 | 59 | weights(i) = (Math.signum(tempXti) * Math.max(0.0, Math.abs(tempXti) - lambda * etaOverHtii)).toFloat 60 | lastUpdates(i) = time + 1 61 | 62 | j += 1 63 | } 64 | } 65 | override def postProcess: Unit = { 66 | (0 until weights.size).foreach { weight(_) } 67 | } 68 | } 69 | 70 | class FixedLogLinearAdaGradL1[L](val weightArray: Array[Float], lambda: Float, eta: Float) extends LogLinearAdaGradL1(lambda, eta) { 71 | override val weights = new FixedWeightVector(weightArray) 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LogLinearClassifier.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** Augument LinearClassifier with a method to return label probabilities. 20 | * (implies loss function of log loss) 21 | */ 22 | trait LogLinearClassifier[L] extends LinearClassifier[L] { 23 | val weights: WeightVector[Float] 24 | 25 | def labelProbs(examples: Seq[Example[L]]): Array[Float] = { 26 | val unnormalized: Array[Float] = examples.map { e => 27 | val p = Math.exp(featureScore(e.featVec)).toFloat 28 | if (p < 1e-100) 1e-100F else p 29 | }.toArray 30 | val z = unnormalized.sum 31 | unnormalized.map(_ / z) 32 | } 33 | } 34 | 35 | class FixedLogLinerClassifier[L](val weightArray: Array[Float]) extends LogLinearClassifier[L] { 36 | override val weights = new FixedWeightVector(weightArray) 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LogLinearSGD.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | abstract class LogLinearSGD[L](val a: Float) extends OnlineLogLinearTrainer[L] { 20 | 21 | def stepSize = Math.pow(time + 1, -a).toFloat // avoid the overflow 22 | def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit = { 23 | val dw = stepSize * derivative 24 | val feats = e.featVec 25 | var i = 0 26 | while (i < feats.size) { 27 | weights(feats(i)) += dw 28 | i += 1 29 | } 30 | } 31 | } 32 | 33 | class FixedLogLinearSGD[L](val weightArray: Array[Float], a: Float) extends LogLinearSGD(a) { 34 | 35 | override val weights = new FixedWeightVector(weightArray) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/OnlineLogLinearTrainer.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** This trait exploits the common procedure in trainers of log-linear models. 20 | */ 21 | trait OnlineLogLinearTrainer[L] extends OnlineTrainer[L] with LogLinearClassifier[L] { 22 | var time: Int = 0 23 | 24 | override def update(examples: Seq[Example[L]], gold:L): Unit = { 25 | val dist = labelProbs(examples) 26 | var i = 0 27 | while (i < examples.size) { 28 | val e = examples(i) 29 | val p = dist(i) 30 | val derivative = if (e.label == gold) (1 - p) else -p 31 | updateExampleWeights(e, gold, derivative) 32 | i += 1 33 | } 34 | reguralizeWeights(examples) 35 | time += 1 36 | } 37 | def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit 38 | def reguralizeWeights(examples: Seq[Example[L]]): Unit = {} // Some algorithms reguralize weights after temporalily updating the values and this method defines that postprocessing. See LogLinearSGDCumulativeL1 for example. 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/OnlineTrainer.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** A trait which support parameter update, and the interface of Classifier. 20 | * Currently two subclasses exists: OnlineLoglinearTrainer is used for log-linear models, while Perceptron is used to train the perceptron including structured perceptron with beam-search. 
21 | */ 22 | trait OnlineTrainer[L] extends Classifier[L] { 23 | def update(examples: Seq[Example[L]], gold:L): Unit 24 | def postProcess: Unit = Unit 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/Perceptron.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | 21 | trait Perceptron[L] extends LinearClassifier[L] with OnlineTrainer[L] { 22 | 23 | def averageWeights: WeightVector[Float] 24 | 25 | var c = 1.0F 26 | 27 | override def update(examples: Seq[Example[L]], gold: L): Unit = { 28 | val pred = predict(examples)._1 29 | if (pred != gold) { 30 | var i = 0 31 | while (i < examples.size) { 32 | val label = examples(i).label 33 | if (label == pred) updateFeatureWeighs(examples(i).featVec, -1.0F) 34 | else if (label == gold) updateFeatureWeighs(examples(i).featVec, 1.0F) 35 | i += 1 36 | } 37 | } 38 | c += 1.0F 39 | } 40 | def updateFeatureWeighs(featVec: Array[Int], scale: Float): Unit = featVec.foreach { f => 41 | weights(f) += scale 42 | averageWeights(f) += scale * c 43 | } 44 | def update(predFeatVec:Array[Int], goldFeatVec:Array[Int]): Unit = { 45 | updateFeatureWeighs(predFeatVec, -1.0F) 46 | updateFeatureWeighs(goldFeatVec, 1.0F) 47 | c += 1.0F 48 | } 49 | def takeAverage: Unit = (0 until weights.size) foreach { i => 50 | weights(i) -= averageWeights(i) / c 51 | } 52 | } 53 | 54 | class FixedPerceptron[L](val weightArray: Array[Float]) extends Perceptron[L] { 55 | 56 | override val weights = new FixedWeightVector(weightArray) 57 | override val averageWeights = new FixedWeightVector(new Array[Float](weights.size)) 58 | } 59 | 60 | class GrowablePerceptron[L](val weightArray: ArrayBuffer[Float]) extends Perceptron[L] { 61 | 62 | override val weights = new GrowableWeightVector(weightArray) 63 | override val averageWeights = WeightVector.growable[Float](weights.size) 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/WeightVector.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | 21 | @SerialVersionUID(1L) 22 | trait WeightVector[@specialized(Int, Double, Float) A] extends Serializable { 23 | def apply(idx: Int): A 24 | def update(idx: Int, elem: A): Unit 25 | def size: Int 26 | 27 | def seq: IndexedSeq[A] // indexed seq from a backbone data structure 28 | } 29 | 30 | object WeightVector { 31 | def growable[A](initialSize: Int = 0)(implicit numeric: Numeric[A]) = new GrowableWeightVector[A](new ArrayBuffer[A](initialSize))(numeric) 32 | } 33 | 34 | class FixedWeightVector[@specialized(Int, Double, Float) A](val array: Array[A]) extends WeightVector[A] { 35 | def apply(idx: Int) = array(idx) 36 | def update(idx: Int, elem: A) = array(idx) = elem 37 | def size = array.size 38 | 39 | def seq = array 40 | } 41 | 42 | class GrowableWeightVector[@specialized(Int, Double, Float) A](val array: ArrayBuffer[A])(implicit numeric: Numeric[A]) extends WeightVector[A] { 43 | def apply(idx: Int) = if (idx >= size || idx < 0) numeric.zero else array(idx) 44 | def update(idx: Int, elem: A) = { 45 | if (idx >= array.size) array ++= List.fill(idx - array.size + 1)(numeric.zero) 46 | array(idx) = elem 47 | } 48 | def size = array.size 49 | 50 | def seq = array 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Dense.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.{DenseMatrix, DenseVector} 20 | import ucar.nc2.{Variable, Group} 21 | 22 | class Dense(inputDim: Int, outputDim: Int) extends Functor{ 23 | 24 | override def functorName = "Dense" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { 27 | val z = data * w 28 | for (i <- 0 until data.rows){ 29 | z(i, ::) :+= b.t 30 | } 31 | z 32 | } 33 | 34 | private val w = DenseMatrix.zeros[Float](inputDim, outputDim) 35 | private val b = DenseVector.zeros[Float](outputDim) 36 | 37 | def h5load(weight: Variable, bias: Variable): Unit = { 38 | val weightData = weight.read 39 | val weightIndex = weightData.getIndex 40 | val biasData = bias.read 41 | val biasIndex = biasData.getIndex 42 | for(y <- 0 until inputDim) 43 | for(x <- 0 until outputDim){ 44 | w(y, x) = weightData.getFloat(weightIndex.set(y, x)) 45 | if(y == 0) 46 | b(x) = biasData.getFloat(biasIndex.set(x)) 47 | } 48 | } 49 | 50 | override def toString: String = "Dense: {inputDim: " + inputDim + ", outputDim: " + outputDim + "}" 51 | 52 | def head: String = w(0 until 2, ::).toString 53 | } 54 | 55 | object Dense{ 56 | def apply(inputDim:Int, outputDim:Int) = new Dense(inputDim, outputDim) 57 | 58 | def apply(configs: Map[String, Any], weightGroups: Group): Dense = { 59 | val layerName = configs("name").toString 60 | val params = weightGroups.findGroup(layerName) 61 | val weightNames = params.findAttribute("weight_names") 62 | val weight = params.findVariable(weightNames.getStringValue(0)) 63 | val bias = params.findVariable(weightNames.getStringValue(1)) 64 | val dims = weight.getDimensions 65 | if(dims.size != 2){ 66 | throw new IllegalArgumentException("invalid dimension for Dense class") 67 | } 68 | 69 | val d = new Dense(dims.get(0).getLength, dims.get(1).getLength) 70 | d.h5load(weight, bias) 71 | d 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Embedding.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.{DenseMatrix, DenseVector} 20 | import ucar.nc2.{Variable, Group} 21 | 22 | class Embedding(vocabulary: Int, outDim: Int) extends Functor{ 23 | 24 | override def functorName = "Embedding" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { 27 | val arrayOfId = data.reshape(data.size, 1) 28 | val length = arrayOfId.size 29 | val z = DenseMatrix.zeros[Float](length, outDim) 30 | for(i <- 0 until length){ 31 | z(i, ::) := w(arrayOfId(i, 0).asInstanceOf[Int]).t 32 | } 33 | z 34 | } 35 | 36 | private val w = new Array[DenseVector[Float]](vocabulary).map(_ => DenseVector.zeros[Float](outDim)) 37 | 38 | def h5load(weight: Variable):Unit = { 39 | val weightData = weight.read 40 | val weightIndex = weightData.getIndex 41 | for(y <- 0 until vocabulary) 42 | for(x <- 0 until outDim) 43 | w(y)(x) = weightData.getFloat(weightIndex.set(y, x)) 44 | } 45 | 46 | } 47 | 48 | object Embedding{ 49 | def apply(vocabulary: Int, outDim: Int) = new Embedding(vocabulary, outDim) 50 | 51 | def apply(configs: Map[String, Any], weightGroups: Group): Embedding = { 52 | val layerName = configs("name").toString 53 | val params = weightGroups.findGroup(layerName) 54 | val weightNames = params.findAttribute("weight_names") 55 | val weight = params.findVariable(weightNames.getStringValue(0)) 56 | val dims = weight.getDimensions 57 | if(dims.size != 2){ 58 | throw new IllegalArgumentException("Invalid dimension for Embedding class") 59 | } 60 | val e = new Embedding(dims.get(0).getLength, dims.get(1).getLength) 61 | e.h5load(weight) 62 | e 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Empty.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | object Empty extends Functor{ 22 | 23 | override def functorName = "Empty" 24 | 25 | override final def convert(data: DenseMatrix[Float]):DenseMatrix[Float] = data 26 | 27 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Flatten.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | object Flatten extends Functor{ 22 | 23 | override def functorName = "Flatten" 24 | 25 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.t.toDenseVector.toDenseMatrix 26 | 27 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Functor.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | trait Functor { 22 | 23 | def functorName: String 24 | def convert(data: DenseMatrix[Float]): DenseMatrix[Float] 25 | override def toString: String = functorName 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/KerasModel.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | import jigg.util.HDF5Object 21 | import org.json4s.jackson.JsonMethods._ 22 | import org.json4s.{DefaultFormats, _} 23 | 24 | class KerasModel(model: HDF5Object) { 25 | 26 | private val kerasAttribute = model.checkAndGetAttribute("keras_version") 27 | private val modelAttribute = model.checkAndGetAttribute("model_config") 28 | 29 | private val weightGroups = model.checkAndGetGroup("model_weights") 30 | 31 | def parseConfigToSeq(config: String): Seq[Map[String, Any]] = { 32 | val jsonValue = parse(config) 33 | implicit val formats = DefaultFormats 34 | val jsonList = jsonValue.extract[Map[String, Any]] 35 | jsonList("config").asInstanceOf[Seq[Map[String, Any]]] 36 | } 37 | 38 | private val modelValues = parseConfigToSeq(modelAttribute.getValue(0).toString) 39 | 40 | def getConfigs(x: Map[String, Any]): Map[String, Any] = x("config").asInstanceOf[Map[String,Any]] 41 | 42 | def constructNetwork(values: Seq[Map[String, Any]]): Seq[Functor] = values.map{ 43 | x => { 44 | val configs = getConfigs(x) 45 | val functor = x("class_name").toString match { 46 | case "Activation" => 47 | configs("activation").toString match{ 48 | case "relu" => Relu 49 | case "softmax" => Softmax 50 | case "sigmoid" => Sigmoid 51 | case "tanh" => Tanh 52 | } 53 | case "Convolution1D" => 54 | Convolution1D(configs, weightGroups) 55 | case "Dense" => 56 | Dense(configs, weightGroups) 57 | case "Embedding" => 58 | Embedding(configs, weightGroups) 59 | case "Flatten" => Flatten 60 | case _ => Empty 61 | } 62 | functor 63 | } 64 | } 65 | 66 | private val graph:Seq[Functor] = constructNetwork(modelValues) 67 | 68 | def convert(input: DenseMatrix[Float]): DenseMatrix[Float] = callFunctors(input, graph) 69 | 70 | private def callFunctors(input: DenseMatrix[Float], unprocessed:Seq[Functor]): DenseMatrix[Float] = unprocessed match { 71 | case functor :: tail => 72 | val interOutput = functor.convert(input) 73 | callFunctors(interOutput, tail) 74 | case Nil => input 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/KerasParser.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.argmax 20 | import jigg.ml.keras._ 21 | import jigg.util.LookupTable 22 | 23 | import scala.xml.Node 24 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 25 | 26 | class KerasParser(model: KerasModel, table: LookupTable) { 27 | 28 | /* 29 | * BIO tag 30 | * B : Begin of segment. Value is 0. 31 | * I : Continuation or end of segment. Value is 1. 32 | * O : Others. Value is 2. 33 | */ 34 | private val tagset:Map[Int, String] = Map(0 -> "B", 1 -> "I", 2 -> "O") 35 | 36 | def parsing(str: String): Array[(Int, Int)] = { 37 | // For dummy input to indicate boundaries of sentence. 
38 | val s = "\n" + str + "\n\n" 39 | val inputData = table.encodeCharacter(s) 40 | val outputData = model.convert(inputData) 41 | 42 | val tags = for { 43 | i <- 1 until outputData.rows - 2 44 | maxID = argmax(outputData(i, ::)) 45 | } yield maxID 46 | 47 | getOffsets(tags.toArray) 48 | } 49 | 50 | def parsing(tokens: Node): Array[Array[String]] = { 51 | // For dummy input to indicate boundaries of sentence. 52 | val words = Array("\n").union( 53 | (tokens \\ "tokens").flatMap(x => x \\ "@lemma").toArray.map(x => x.toString)).union(Array("\n\n")) 54 | val ids = (tokens \\ "tokens").flatMap(x => x \\ "@id").toArray.map(x => x.toString) 55 | 56 | val inputData = table.encodeWords(words) 57 | val outputData = model.convert(inputData) 58 | 59 | val tags = for { 60 | i <- 1 until outputData.rows - 2 61 | maxID = argmax(outputData(i, ::)) 62 | } yield maxID 63 | 64 | val ranges = getOffsets(tags.toArray) 65 | 66 | ranges.map(x => ids.slice(x._1, x._2)) 67 | } 68 | 69 | def getOffsets(data: Array[Int]): Array[(Int, Int)]= { 70 | val ranges = ArrayBuffer[(Int, Int)]() 71 | var bpos = -1 72 | 73 | for(i <- data.indices){ 74 | tagset(data(i)) match{ 75 | case "B" => 76 | if(bpos >= 0) 77 | ranges += ((bpos, i)) 78 | bpos = i 79 | case "I" if i == 0 || bpos == -2 => 80 | bpos = i 81 | case "O" => 82 | if (bpos >= 0) 83 | ranges += ((bpos, i)) 84 | bpos = -2 85 | case _ if i == data.indices.last => 86 | ranges += ((bpos, i + 1)) 87 | case _ => 88 | } 89 | } 90 | ranges.toArray 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/README.md: -------------------------------------------------------------------------------- 1 | # KerasParser 2 | 3 | ## Abstract 4 | - Main class: jigg.ml.keras.KerasParser 5 | - KerasParser requires a model file and a lookup-table file. 6 | 7 | ## Requirements 8 | ### Model file 9 | - Model file must be generated by [keras](https://keras.io) 10 | - HDF5 is only supported 11 | - Required output class style: BIO 12 | - Tag `B` corresponds to `0`. 13 | - Tag `I` corresponds to `1`. 14 | - Tag `O` corresponds to `2`. 15 | - The following keras's functions are supported. 16 | - Layer 17 | - Dense 18 | - Embedding 19 | - Convolution1D 20 | - Flatten 21 | - Activation 22 | - Relu 23 | - Sigmoid 24 | - Softmax 25 | - Tanh 26 | 27 | ### Lookup table 28 | - Field construction 29 | - `_lookup` 30 | - `_key2id`: Convert character/word to ID 31 | - key: Target character/word 32 | - value: ID number of target character/word 33 | - `_id2key`: Convert ID to chracter/word 34 | - key: ID number of target chracter/word 35 | - value: Target character/word 36 | - Table shoud contain following elements: 37 | 38 | | ID | Value | 39 | |:---|:------| 40 | |0 | UNKNOWN | 41 | |1 | new line (`\n`) | 42 | |2 | half space (` `) | 43 | 44 | #### Example 45 | ```json 46 | {"_lookup":{ 47 | "_key2id": { 48 | "UNKNOWN": "0", 49 | "\n": "1", 50 | " " : "2", 51 | "Additional elements": "3..." 52 | }, 53 | "_id2key": { 54 | "0": "UNKNOWN", 55 | "1": "\n", 56 | "2": " ", 57 | "3..." 
: "Additional elements" 58 | } 59 | } 60 | } 61 | ``` 62 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Relu.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | object Relu extends Functor{ 22 | 23 | override def functorName = "Relu" 24 | 25 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map(x => 26 | if(x > 0.0.toFloat) x else 0.0.toFloat 27 | ) 28 | 29 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Sigmoid.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | import breeze.numerics.exp 21 | 22 | object Sigmoid extends Functor { 23 | 24 | override def functorName = "Sigmoid" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map{x => (1.0 / (1.0 + exp(-x))).toFloat} 27 | 28 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Softmax.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.{DenseVector, DenseMatrix, softmax} 20 | import breeze.numerics.exp 21 | 22 | object Softmax extends Functor{ 23 | 24 | override def functorName = "Softmax" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { 27 | for(y <- 0 until data.rows){ 28 | val v = data(y, ::) 29 | data(y, ::) := (exp(v) :/= exp(softmax(v))) 30 | } 31 | data 32 | } 33 | 34 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Tanh.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | import breeze.numerics.tanh 21 | 22 | object Tanh extends Functor{ 23 | 24 | override def functorName = "Tanh" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map{ x => tanh(x)} 27 | 28 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/CCGBank2EnjuXML.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import lexicon._ 20 | import jigg.util.IOUtil 21 | 22 | import breeze.config.{CommandLineParser, Help} 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | import scala.sys.process.Process 26 | 27 | import java.io.{File, FileWriter} 28 | 29 | 30 | object CCGBank2EnjuXML { 31 | 32 | case class Opts( 33 | @Help(text="Path to CCGBank file") ccgBank: File = new File(""), 34 | @Help(text="Path to output (xml)") output: File = new File(""), 35 | @Help(text="Number of sentences") numSentences: Int = 50 36 | ) 37 | 38 | def main(args:Array[String]) = { 39 | val opts = CommandLineParser.readIn[Opts](args) 40 | 41 | val dict = new JapaneseDictionary(new Word2CategoryDictionary) 42 | 43 | val conv = new JapaneseParseTreeConverter(dict) 44 | 45 | val reader = new CCGBankReader 46 | 47 | val instances: Seq[(TaggedSentence, Derivation)] = 48 | reader.takeLines(IOUtil.openIterator(opts.ccgBank.getPath), opts.numSentences).toSeq.map { line => 49 | val trees = reader.readParseFragments(line).map { conv.toLabelTree(_) } 50 | (conv.toSentenceFromLabelTrees(trees), conv.toFragmentalDerivation(trees)) 51 | } 52 | 53 | val fw = new FileWriter(opts.output.getPath) 54 | 55 | instances.zipWithIndex foreach { case ((s, d), i) => fw.write(d.renderEnjuXML(s, i) + "\n") } 56 | 57 | fw.flush 58 | fw.close 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/CCGBankToCabochaFormat.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | 20 | import lexicon._ 21 | 22 | import breeze.config.{CommandLineParser, Help} 23 | 24 | import scala.sys.process.Process 25 | 26 | import java.io.{File, FileWriter, ByteArrayInputStream} 27 | 28 | /** Creates Cabocha-formatted CCGBank sentences. 29 | * 30 | * The output of this is required when evaluating bunsetsu-dependency of CCG parser. 31 | * When new CCGBank is released, currently, we have to manually run this class to get the correct data. 
32 | */ 33 | object CCGBankToCabochaFormat { 34 | 35 | case class Opts( 36 | @Help(text="Path to CCGBank file") ccgbank: File = new File(""), 37 | @Help(text="Path to output") output: File = new File(""), 38 | @Help(text="Cabocha command (path to cabocha)") cabocha: String = "cabocha" 39 | ) 40 | 41 | type Tree = ParseTree[NodeLabel] 42 | 43 | def main(args:Array[String]) = { 44 | val opts = CommandLineParser.readIn[Opts](args) 45 | 46 | val dict = new JapaneseDictionary() 47 | val extractors = TreeExtractor( 48 | new JapaneseParseTreeConverter(dict), 49 | new CCGBankReader) 50 | 51 | val trees = extractors.readTrees(opts.ccgbank, -1, true) 52 | val rawString = trees map (extractors.treeConv.toSentenceFromLabelTree) map (_.wordSeq.mkString("")) mkString ("\n") 53 | val is = new java.io.ByteArrayInputStream(rawString.getBytes("UTF-8")) 54 | val out = (Process(s"${opts.cabocha} -f1") #< is).lineStream_! 55 | 56 | val os = jigg.util.IOUtil.openOut(opts.output.getPath) 57 | out foreach { line => 58 | os.write(line + "\n") 59 | } 60 | os.flush 61 | os.close 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/GoldBunsetsuDepInCabocha.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | import jigg.util.IOUtil 21 | 22 | import breeze.config.{CommandLineParser, Help} 23 | 24 | import java.io.{File, FileWriter} 25 | 26 | /** Input: CCGBank file (e.g., train.ccgbank) from stdin. 27 | * Output: Gold bunsetsu dependencies according to the CCGBank in CoNLL format. 
28 | */ 29 | object GoldBunsetsuDepInCoNLL { 30 | 31 | case class Opts( 32 | @Help(text="Path to Cabocha file (same sentences with the CCGBank file)") cabocha: File = new File("") 33 | ) 34 | 35 | def main(args:Array[String]) = { 36 | val opts = CommandLineParser.readIn[Opts](args) 37 | 38 | val dict = new JapaneseDictionary(new Word2CategoryDictionary) 39 | 40 | val conv = new JapaneseParseTreeConverter(dict) 41 | val parseTrees = new CCGBankReader() 42 | .readParseTrees(IOUtil.openStandardIterator, -1, true) 43 | .map(conv.toLabelTree _).toSeq 44 | val goldDerivs = parseTrees.map(conv.toDerivation) 45 | val sentences = parseTrees.map(conv.toSentenceFromLabelTree) 46 | 47 | val bunsetsuSentencesWithPredHead = 48 | new CabochaReader(sentences).readSentences(opts.cabocha.getPath) 49 | 50 | val bunsetsuSentencesWithGoldHead = 51 | bunsetsuSentencesWithPredHead zip goldDerivs map { case (sentence, deriv) => 52 | BunsetsuSentence(sentence.bunsetsuSeq).parseWithCCGDerivation(deriv) 53 | } 54 | for (sentence <- bunsetsuSentencesWithGoldHead) { 55 | println(sentence.renderInCoNLL) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/Opts.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | 21 | import jigg.ml 22 | 23 | import breeze.config.Help 24 | 25 | import java.io.File 26 | 27 | object Opts { 28 | 29 | @Help(text="About CCGBank") 30 | case class BankInfo( 31 | @Help(text="Language (ja|en)") lang: String = "ja", 32 | @Help(text="Path to CCGBank directory (if this is set, files in this dir are used as default values of train/dev and others)") dir: File = new File(""), 33 | @Help(text="# training instances, -1 for all") trainSize: Int = -1, 34 | @Help(text="# test instances, -1 for all") testSize: Int = -1, 35 | @Help(text="# dev instances, -1 for all") devSize: Int = -1 36 | ) 37 | 38 | @Help(text="About category dictionary") 39 | case class DictParams( 40 | @Help(text="How to look up category candidates? 
(for Japanese only) (surfaceOnly|surfaceAndPoS|surfaceAndSecondFineTag|surfaceAndSecondWithConj)") 41 | lookupMethod: String = "surfaceAndSecondWithConj", 42 | @Help(text="Whether using lexicon files to create word -> category mappings") 43 | useLexiconFiles: Boolean = true, 44 | @Help(text="Minimum number of occurences for registering as lexicalized entry") 45 | unkThreathold: Int = 30 46 | ) { 47 | 48 | val categoryDictinoary = lookupMethod match { 49 | case "surfaceOnly" => new Word2CategoryDictionary 50 | case "surfaceAndPoS" => new WordPoS2CategoryDictionary 51 | case "surfaceAndSecondFineTag" => new WordSecondFineTag2CategoryDictionary 52 | case "surfaceAndSecondWithConj" => new WordSecondWithConj2CategoryDictionary 53 | case _ => sys.error("unknown lookUpMethod") 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/ParserRunner.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | import parser.{ActionLabel, KBestDecoder} 21 | import jigg.ml.FixedPerceptron 22 | 23 | import breeze.config.{CommandLineParser, Help} 24 | 25 | import scala.collection.mutable.{ArraySeq} 26 | 27 | import java.io.File 28 | 29 | 30 | class ParserRunner(model: ParserModel, params: ParserRunner.Params) { 31 | 32 | val tagger = new SuperTaggerRunner(model.taggerModel, params.tagger) 33 | val perceptron = new FixedPerceptron[ActionLabel](model.weights) 34 | val decoder = model.mkDecoder(params.beam, perceptron) 35 | 36 | val preferConnected = params.preferConnected 37 | 38 | def decode[S<:TaggedSentence](sentences: Array[S]): Array[Derivation] = { 39 | 40 | val predDerivations = sentences.zipWithIndex map { 41 | case (sentence, i) => 42 | if (i % 100 == 0) 43 | System.err.print(i + "\t/" + sentences.size + " have been processed.\r") 44 | decodeOne(sentence) 45 | } 46 | System.err.println() 47 | predDerivations 48 | } 49 | 50 | def decodeOne[S<:TaggedSentence](sentence: S): Derivation = 51 | kBestDerivations(sentence, 1)(0)._1 52 | 53 | def kBestDerivations[S<:TaggedSentence](sentence: S, k: Int) 54 | : Seq[(Derivation, Double)] = { 55 | val superTaggedSentence = tagger.assignKBest(sentence) 56 | 57 | decoder match { 58 | case decoder: KBestDecoder => 59 | decoder predictKbest (k, superTaggedSentence, preferConnected) 60 | case decoder => 61 | Seq(decoder predict superTaggedSentence) 62 | } 63 | } 64 | } 65 | 66 | object ParserRunner { 67 | 68 | @Help(text="Params for testing/evaluating parser") 69 | case class Params( 70 | @Help(text="Beam size") beam: Int = 32, 71 | @Help(text="Prefer connected derivation at prediction") preferConnected: Boolean = true, 72 | tagger: SuperTaggerRunner.Params = new SuperTaggerRunner.Params() 73 | ) 74 | } 75 | -------------------------------------------------------------------------------- 
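The ParserRunner above wires a FixedPerceptron and a beam-search decoder behind a small decode/kBestDerivations API. As an editorial aside (not a file in this repository), the hedged sketch below shows how that API might be driven once a trained ParserModel is available; the deserialization step mirrors SuperTaggerModel.loadFrom in the next file and is an assumption, since ParserModel's own loading helper is not shown in this listing.

```scala
// Editor's illustrative sketch -- not part of the repository. It assumes a trained
// ParserModel was serialized with java.io.ObjectOutputStream, analogous to
// SuperTaggerModel.saveTo/loadFrom; adjust the loading step to whatever ParserModel
// actually provides.
import jigg.nlp.ccg.{ParserModel, ParserRunner}
import jigg.nlp.ccg.lexicon.Derivation

object KBestParseSketch {
  def main(args: Array[String]): Unit = {
    val in = jigg.util.IOUtil.openBinIn(args(0))      // path to a saved ParserModel (assumed)
    val model = in.readObject.asInstanceOf[ParserModel]
    in.close()

    val runner = new ParserRunner(model, ParserRunner.Params(beam = 64))
    // Given tagged sentences (e.g., produced by MecabReader in the lexicon package),
    // decode() returns one Derivation per sentence, while kBestDerivations() also
    // exposes the decoder scores:
    // val derivs: Array[Derivation] = runner.decode(sentences)
    // val top10: Seq[(Derivation, Double)] = runner.kBestDerivations(sentences(0), 10)
  }
}
```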
/src/main/scala/jigg/nlp/ccg/SuperTaggerModel.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import tagger.{LF=>Feature, MaxEntMultiTagger, MaxEntMultiTaggerTrainer, FeatureExtractors} 20 | import lexicon._ 21 | import jigg.ml._ 22 | 23 | import scala.collection.mutable.HashMap 24 | 25 | case class SuperTaggerModel( 26 | dict: Dictionary, 27 | featureMap: HashMap[Feature, Int], 28 | weights: WeightVec, 29 | extractors: FeatureExtractors) { self => 30 | 31 | def reduceFeatures(): SuperTaggerModel = { 32 | 33 | val buffer = weights.asInstanceOf[GrowableWeightVector[Float]].array // 0 1.0 2.0 0 0 1.0 ... 34 | val activeIdxs = buffer.zipWithIndex filter (_._1 != 0) map (_._2) // 1 2 5 35 | println(s"# features reduced from ${buffer.size} to ${activeIdxs.size}") 36 | val idxMap = activeIdxs.zipWithIndex.toMap // {1->0, 2->1 5->2} 37 | 38 | val newFeatureMap = featureMap collect { 39 | case (f, oldIdx) if idxMap.isDefinedAt(oldIdx) => (f, idxMap(oldIdx)) 40 | } 41 | val newWeights = new FixedWeightVector[Float](activeIdxs.map(buffer).toArray) 42 | 43 | this copy (featureMap = newFeatureMap, weights = newWeights) 44 | } 45 | 46 | def mkMultiTaggerTrainer(classifierTrainer: OnlineLogLinearTrainer[Int]) = 47 | new MaxEntMultiTaggerTrainer(mkIndexer(), extractors, classifierTrainer, dict) 48 | 49 | def mkMultiTagger() = 50 | new MaxEntMultiTagger(mkIndexer(), extractors, mkClassifier(), dict) 51 | 52 | def mkClassifier() = new LogLinearClassifier[Int] { 53 | override val weights = self.weights 54 | } 55 | 56 | private def mkIndexer() = new ExactFeatureIndexer(featureMap) 57 | } 58 | 59 | object SuperTaggerModel { 60 | 61 | def saveTo(path: String, model: SuperTaggerModel) = { 62 | System.err.println("Saving tagger model to " + path) 63 | val os = jigg.util.IOUtil.openBinOut(path) 64 | os.writeObject(model) 65 | os.close 66 | } 67 | 68 | def loadFrom(path: String): SuperTaggerModel = { 69 | jigg.util.LogUtil.track("Loading supertagger model ...") { 70 | val in = jigg.util.IOUtil.openBinIn(path) 71 | val model = in.readObject.asInstanceOf[SuperTaggerModel] 72 | in.close 73 | model 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/SuperTaggerRunner.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | 21 | import breeze.config.{CommandLineParser, Help} 22 | 23 | import scala.collection.mutable.{ArraySeq} 24 | 25 | import java.io.File 26 | 27 | 28 | class SuperTaggerRunner(model: SuperTaggerModel, params: SuperTaggerRunner.Params) { 29 | 30 | val tagger = model.mkMultiTagger() 31 | 32 | def assignKBests[S<:TaggedSentence](sentences: Array[S]): ArraySeq[S#AssignedSentence] = 33 | sentences map (assignKBest) 34 | 35 | def assignKBest[S<:TaggedSentence](s: S): S#AssignedSentence = 36 | s assignCands (tagger candSeq(s, params.beta, params.maxK)) 37 | } 38 | 39 | object SuperTaggerRunner { 40 | 41 | @Help(text="Params for testing/evaluating super tagger") 42 | case class Params( 43 | // @Help(text="Load model path") model: SuperTaggerModel: SuperTaggerModel, 44 | @Help(text="Beta for decising the threshold of k-best at prediction") beta: Double = 0.001, 45 | @Help(text="Maximum number of k, -1 for no limit") maxK: Int = -1 46 | ) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/TrainParser.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import breeze.config.CommandLineParser 20 | 21 | object TrainParser { 22 | 23 | import ParserTrainer.Params 24 | 25 | def main(args: Array[String]) = { 26 | 27 | val params = CommandLineParser.readIn[Params](args) 28 | val trainer = mkTrainer(params) 29 | trainer.trainAndSave() 30 | } 31 | 32 | def mkTrainer(params: Params): ParserTrainer = params.bank.lang match { 33 | case "ja" => new JapaneseParserTrainer(params) 34 | case "en" => new EnglishParserTrainer(params) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/TrainSuperTagger.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import breeze.config.CommandLineParser 20 | 21 | object TrainSuperTagger { 22 | 23 | import SuperTaggerTrainer.Params 24 | 25 | def main(args: Array[String]) = { 26 | 27 | val params = CommandLineParser.readIn[Params](args) 28 | val trainer = mkTrainer(params) 29 | trainer.trainAndSave() 30 | } 31 | 32 | def mkTrainer(params: Params): SuperTaggerTrainer = params.bank.lang match { 33 | case "ja" => new JapaneseSuperTaggerTrainer(params) 34 | case "en" => new EnglishSuperTaggerTrainer(params) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/CabochaReader.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.io.Source 20 | 21 | class CabochaReader[S<:TaggedSentence](ccgSentences: Seq[S]) { 22 | def readSentences(path: String): Seq[ParsedBunsetsuSentence] = { 23 | val bunsetsuStart = """\* (\d+) (-?\d+)[A-Z].*""".r 24 | def addBunsetsuTo(curSent: List[(String, Int)], curBunsetsu: List[String]) = curBunsetsu.reverse match { 25 | case Nil => curSent 26 | case headIdx :: tail => (tail.mkString(""), headIdx.toInt) :: curSent 27 | } 28 | 29 | val bunsetsuSegedSentences: List[List[(String, Int)]] = 30 | scala.io.Source.fromFile(path).getLines.filter(_ != "").foldLeft( 31 | (List[List[(String, Int)]](), List[(String, Int)](), List[String]())) { 32 | case ((processed, curSent, curBunsetsu), line) => line match { 33 | case bunsetsuStart(_, nextHeadIdx) => 34 | (processed, addBunsetsuTo(curSent, curBunsetsu), nextHeadIdx :: Nil) // use first elem as the head idx 35 | case "EOS" => (addBunsetsuTo(curSent, curBunsetsu).reverse :: processed, Nil, Nil) 36 | case word => (processed, curSent, word.split("\t")(0) :: curBunsetsu) 37 | } 38 | }._1.reverse 39 | 40 | ccgSentences.zip(bunsetsuSegedSentences).map { case (ccgSentence, bunsetsuSentence) => 41 | val bunsetsuSegCharIdxs: List[Int] = bunsetsuSentence.map { _._1.size }.scanLeft(0)(_+_).tail // 5 10 ... 42 | val ccgWordSegCharIdxs: List[Int] = ccgSentence.wordSeq.toList.map { _.v.size }.scanLeft(0)(_+_).tail // 2 5 7 10 ... 43 | 44 | assert(bunsetsuSegCharIdxs.last == ccgWordSegCharIdxs.last) 45 | val bunsetsuSegWordIdxs: List[Int] = ccgWordSegCharIdxs.zipWithIndex.foldLeft((List[Int](), 0)) { // 1 3 ... 
46 | case ((segWordIdxs, curBunsetsuIdx), (wordIdx, i)) => 47 | if (wordIdx >= bunsetsuSegCharIdxs(curBunsetsuIdx)) (i :: segWordIdxs, curBunsetsuIdx + 1) 48 | else (segWordIdxs, curBunsetsuIdx) // wait until wordIdx exceeds the next bunsetsu segment 49 | }._1.reverse 50 | val bunsetsuSeq = bunsetsuSegWordIdxs.zip(-1 :: bunsetsuSegWordIdxs).map { case (bunsetsuIdx, prevIdx) => 51 | val offset = prevIdx + 1 52 | Bunsetsu(offset, 53 | ccgSentence.wordSeq.slice(offset, bunsetsuIdx + 1), 54 | ccgSentence.posSeq.slice(offset, bunsetsuIdx + 1)) 55 | } 56 | ParsedBunsetsuSentence(bunsetsuSeq, bunsetsuSentence.map { _._2 }) 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Category.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import Slash._ 19 | 20 | sealed trait Category extends Numbered[Unit] { 21 | override def v:Unit = {} 22 | def toStringNoFeature: String 23 | } 24 | 25 | @SerialVersionUID(6748884927580538343L) 26 | case class AtomicCategory(override val id:Int, base:String, feature:CategoryFeature) extends Category { 27 | override def toString = feature.toString match { 28 | case "" => base 29 | case s => base + "[" + s + "]" 30 | } 31 | 32 | override def toStringNoFeature = base 33 | } 34 | @SerialVersionUID(3754315949719248198L) 35 | case class ComplexCategory(override val id:Int, 36 | left:Category, right:Category, 37 | slash:Slash) extends Category { 38 | def toStringChild(child:Category) = child match { 39 | case AtomicCategory(_,_,_) => child.toString 40 | case ComplexCategory(_,_,_,_) => "(" + child.toString + ")" 41 | } 42 | override def toString = toStringChild(left) + slash + toStringChild(right) 43 | 44 | def toStringChildNoFeature(child:Category) = child match { 45 | case AtomicCategory(_,_,_) => child.toStringNoFeature 46 | case ComplexCategory(_,_,_,_) => "(" + child.toStringNoFeature + ")" 47 | } 48 | override def toStringNoFeature = toStringChildNoFeature(left) + slash + toStringChildNoFeature(right) 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/CategoryManager.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.HashMap 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | class CategoryManager extends StringBaseNumberedManager[Category] with OptionReturner[Category] { 23 | override def createWithId(original:Category): Category = original match { 24 | case AtomicCategory(id, base, avm) => AtomicCategory(newId, base, avm) 25 | case ComplexCategory(id, left, right, slash) => 26 | val leftWithId = assignID(left) 27 | val rightWithId = assignID(right) 28 | ComplexCategory(newId, leftWithId, rightWithId, slash) 29 | } 30 | override def getOrNone(str:String): Option[Category] = str2objIndex.get(str) match { 31 | case Some(i) => Some(objects(i)) 32 | case None => canonicalMap.get(createCanonicalInstance(str)) 33 | } 34 | 35 | override def createCanonicalInstance(str:String): Category = JapaneseCategoryParser.parse(str) 36 | 37 | // This is used when candidate shift category is empty 38 | // It sometimes happen if for example, PoS not registered in the dictionary is detected. 39 | val unkCategory = getOrCreate("UNK") 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/CategoryTree.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import Slash._ 19 | 20 | case class CategoryTree(var surface:String, slash:Slash, left:CategoryTree, right:CategoryTree) { 21 | def isLeaf = left == null && right == null 22 | def setSurface:CategoryTree = { 23 | def childSurface(child:CategoryTree) = 24 | if (child.isLeaf) child.surface else '(' + child.surface + ')' 25 | 26 | if (isLeaf) assert(surface != null) 27 | else surface = childSurface(left) + slash + childSurface(right) 28 | this 29 | } 30 | def foreachLeaf(f:CategoryTree=>Any):Unit = { 31 | if (isLeaf) f(this) 32 | else List(left,right).foreach(_.foreachLeaf(f)) 33 | } 34 | } 35 | 36 | object CategoryTree { 37 | def createLeaf(surface:String) = CategoryTree(surface, null, null, null) 38 | def createInternal(slash:Slash, left:CategoryTree , right:CategoryTree) = 39 | CategoryTree(null, slash, left, right) 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Direction.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | object Direction extends Enumeration { 20 | type Direction = Value; val Left, Right = Value 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/MecabReader.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.io.Source 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | /** Read the output of mecab with -Ochasen option. 23 | */ 24 | class MecabReader(dict:Dictionary) { 25 | def toPoSTaggedSentence(lines:Seq[String]) = { 26 | val terminalSeq = lines.map { line => 27 | val splitted = line.split('\t') 28 | val word = dict.getWordOrCreate(splitted(0)) 29 | val base = dict.getWordOrCreate(splitted(2)) 30 | 31 | val conjStr = if (splitted.size > 6) splitted(5) else "_" 32 | val posStr = splitted(3) + "/" + conjStr 33 | 34 | val pos = dict.getPoSOrCreate(posStr) 35 | (word, base, pos) 36 | } 37 | new PoSTaggedSentence( 38 | terminalSeq.map(_._1), 39 | terminalSeq.map(_._2), 40 | terminalSeq.map(_._3)) 41 | } 42 | def readSentences(in:Source, n:Int): Array[PoSTaggedSentence] = { 43 | val sentences = new ArrayBuffer[PoSTaggedSentence] 44 | 45 | val sentenceLines = new ArrayBuffer[String] 46 | 47 | takeLines(in, n).foreach { _ match { 48 | case "EOS" => 49 | sentences += toPoSTaggedSentence(sentenceLines) 50 | sentenceLines.clear 51 | case line => 52 | sentenceLines += line 53 | }} 54 | sentences.toArray 55 | } 56 | def readSentences(path:String, n:Int): Array[PoSTaggedSentence] = 57 | readSentences(Source.fromFile(path), n) 58 | def takeLines(in:Source, n:Int): Iterator[String] = 59 | for (line <- in.getLines.filter(_!="") match { 60 | case lines if (n == -1) => lines 61 | case lines => lines.take(n) }) yield line 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Numbered.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | trait Numbered[T] { 20 | def id:Int 21 | def v:T 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/PoS.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** 20 | * Internal representation of Part-of-Speech. 21 | * The trait gives some methods to access the information, which might be used in some languages. 22 | * For example, hierar is a sequence of FineTag, which is assumed to represent the hierarchy of that PoS. 23 | * To enable using these different types of tags transparently (which is useful in, e.g., feature extraction), a Conjugation or FineTag itself is also a PoS. 24 | * WARNING: all PoSs have to have unique ids to be distinguished, so it is assumed that the surface forms of conj, hierar, and the PoS itself (full surface) are disjoint; if, for example, a FineTag has the same surface as a Conjugation, the dictionary discards the latter one. One solution to this problem is to add a symbol to each type of PoS, e.g., adding the suffix 'F' to all FineTag instances when drawing from or inserting into the dictionary. 
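 *
 * Editor's illustration (hypothetical ids and surfaces, not taken from any bundled dictionary):
 * a Japanese PoS such as "動詞-自立/基本形" could be assembled roughly as
 *
 *   val conj   = Conjugation(10, "基本形")
 *   val coarse = FineTag(11, "動詞")
 *   val fine   = FineTag(12, "動詞-自立")
 *   val pos    = JapanesePoS(13, "動詞-自立/基本形", conj,
 *                            hierar = Seq(coarse, fine),
 *                            hierarConj = Seq(FineWithConjugation(14, "動詞/基本形"),
 *                                             FineWithConjugation(15, "動詞-自立/基本形")))
 *
 * so that pos.first is the coarse tag, pos.second the fine tag, and pos.firstWithConj also
 * carries the conjugation. In practice such instances are created through the PoS manager of a
 * Dictionary, which assigns the ids.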
25 | */ 26 | sealed trait PoS extends Numbered[String] { 27 | def conj:PoS = sys.error("conj is not defined in this PoS class.") 28 | def hierar:Seq[PoS] = sys.error("hierar is not defined in this PoS class.") 29 | def hierarConj:Seq[PoS] = sys.error("hierarConj is not defined in this PoS class.") 30 | def first = hierar(0) 31 | def second = if (hierar.size < 2) first else hierar(1) 32 | def third = if (hierar.size < 3) second else hierar(2) 33 | 34 | def firstWithConj = hierarConj(0) 35 | def secondWithConj = if (hierarConj.size < 2) firstWithConj else hierarConj(1) 36 | def thirdWithConj = if (hierarConj.size < 3) secondWithConj else hierarConj(2) 37 | } 38 | trait OptionalPoS extends PoS 39 | trait MainPoS extends PoS 40 | 41 | case class Conjugation(override val id:Int, override val v:String) extends OptionalPoS { 42 | override def toString = v 43 | } 44 | case class FineTag(override val id:Int, override val v:String) extends OptionalPoS { 45 | override def toString = v 46 | } 47 | case class FineWithConjugation(override val id:Int, override val v:String) extends OptionalPoS { 48 | override def toString = v 49 | } 50 | case class SimplePoS(override val id:Int, override val v:String) extends MainPoS { 51 | override def toString = v 52 | } 53 | case class JapanesePoS(override val id:Int, 54 | override val v:String, 55 | override val conj:PoS, 56 | override val hierar:Seq[PoS], 57 | override val hierarConj:Seq[PoS]) extends MainPoS { 58 | override def toString = v 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/SimpleDictionary.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | class SimpleDictionary extends Dictionary(new Word2CategoryDictionary) { 20 | override val posManager = new PoSManager { 21 | def createWithId(original: PoS) = SimplePoS(newId, original.v) 22 | def createCanonicalInstance(str:String) = SimplePoS(0, str) 23 | } 24 | override val categoryManager = new CategoryManager { 25 | override def createCanonicalInstance(str: String): Category = EnglishCategoryParser.parse(str) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Slash.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | object Slash extends Enumeration { 20 | type Slash = Value 21 | val Left = Value("\\") 22 | val Right = Value("/") 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Word.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | trait Word extends Numbered[String] { 20 | // additional information is defined in function; may or may not be overridden in val by subclasses 21 | def classId:Int = throw new RuntimeException("classId is not defined in this Word class.") 22 | def assignClass(classId:Int):Word = this // default do nothing 23 | // some morphological information extracted from the surface form might be included ? (e.g., for morphological rich languages) 24 | } 25 | 26 | case class SimpleWord(override val id:Int, override val v:String) extends Word { 27 | override def assignClass(classId:Int) = ClassedWord(id, v, classId) 28 | override def toString = v 29 | } 30 | case class ClassedWord(override val id:Int, 31 | override val v:String, 32 | override val classId:Int) extends Word { 33 | override def assignClass(classId:Int) = ClassedWord(id, v, classId) 34 | override def toString = v + "[" + classId + "]" 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/package.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package object ccg { 20 | type WeightVec = jigg.ml.WeightVector[Float] 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/Action.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import jigg.nlp.ccg.lexicon.{Category, Dictionary} 20 | import jigg.nlp.ccg.lexicon.Direction.Direction 21 | 22 | /** 23 | * action and corresponding label; for speed reason, label should not have the actual object such as category, so we convert Action object into corresponding Label object when filling feature templates 24 | */ 25 | sealed trait Action { def toLabel:ActionLabel } 26 | sealed trait ActionLabel { 27 | def mkString(dict:Dictionary):String 28 | } 29 | 30 | // shift the category with categoryId of the head of buffer 31 | case class Shift(category:Category) extends Action { override def toLabel = ShiftLabel(category.id) } 32 | 33 | @SerialVersionUID(-6619103978469031483L) 34 | case class ShiftLabel(id:Int) extends ActionLabel { 35 | override def mkString(dict:Dictionary) = "SHIFT(" + dict.getCategory(id) + ")" 36 | } 37 | 38 | // combine two top nodes on the stack into a node which has categoryId 39 | case class Combine(category:Category, headDir:Direction, ruleType:String) extends Action { override def toLabel = CombineLabel(category.id) } 40 | 41 | @SerialVersionUID(-1350486416817206332L) 42 | case class CombineLabel(id:Int) extends ActionLabel { 43 | override def mkString(dict:Dictionary) = "COMBINE(" + dict.getCategory(id) + ")" 44 | } 45 | 46 | // unary change to a node with categoryId 47 | case class Unary(category:Category, ruleType:String) extends Action { override def toLabel = UnaryLabel(category.id) } 48 | 49 | @SerialVersionUID(-3492899016953622825L) 50 | case class UnaryLabel(id:Int) extends ActionLabel { 51 | def mkString(dict:Dictionary) = "UNARY(" + dict.getCategory(id) + ")" 52 | } 53 | 54 | case class Finish() extends Action { override def toLabel = FinishLabel() } 55 | 56 | @SerialVersionUID(-6536578690403443069L) 57 | case class FinishLabel() extends ActionLabel { 58 | def mkString(dict:Dictionary) = "FINISH" 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/HeadFinder.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.HashMap 20 | import jigg.nlp.ccg.lexicon.{PoS, JapanesePoS, Category} 21 | import jigg.nlp.ccg.lexicon.Direction._ 22 | 23 | trait HeadFinder extends Serializable { 24 | type NodeInfo = HeadFinder.NodeInfo 25 | def get(left:NodeInfo, right:NodeInfo): Direction 26 | } 27 | object HeadFinder { 28 | case class NodeInfo(pos:PoS, category:Category, headCategory:Category) 29 | } 30 | 31 | case class EnglishHeadFinder(children2dir: Map[(Int, Int), Direction]) extends HeadFinder { 32 | def get(left:NodeInfo, right:NodeInfo) = 33 | children2dir.get(left.category.id, right.category.id) match { 34 | case Some(dir) => dir 35 | case _ => Left 36 | } 37 | } 38 | 39 | object EnglishHeadFinder { 40 | import jigg.nlp.ccg.lexicon.{ParseTree, NodeLabel, BinaryTree, NonterminalLabel} 41 | def createFromParseTrees(trees: Seq[ParseTree[NodeLabel]]): EnglishHeadFinder = { 42 | val map = new HashMap[(Int, Int), Direction] 43 | trees.foreach { _.foreachTree { _ match { 44 | case BinaryTree(left, right, NonterminalLabel(dir, _, _)) => 45 | map += (left.label.category.id, right.label.category.id) -> dir 46 | case _ => 47 | }}} 48 | EnglishHeadFinder(map.toMap) 49 | } 50 | } 51 | 52 | object JapaneseHeadFinder extends HeadFinder { 53 | val Symbol = "記号" 54 | def get(left:NodeInfo, right:NodeInfo) = { 55 | val leftPos = left.pos.first.v 56 | val rightPos = right.pos.first.v 57 | if (rightPos == Symbol) Left else Right 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/KBestDecoder.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import jigg.nlp.ccg.lexicon.{Derivation, CandAssignedSentence} 20 | 21 | case class WrappedAction(v: Action, isGold:Boolean, partialFeatures:LabeledFeatures = LabeledFeatures()) 22 | 23 | case class StatePath(state:State, waction: WrappedAction, prev: Option[StatePath] = None, score:Double = 0) { 24 | def actionPath = expand.map(_.waction) 25 | def expand = expandRecur(Nil) 26 | private def expandRecur(seq: List[StatePath]): List[StatePath] = prev match { 27 | case None => seq // always ignoring the initial state 28 | case Some(prev) => prev.expandRecur(this :: seq) 29 | } 30 | def lighten = this.copy(waction = waction.copy(partialFeatures = LabeledFeatures())) 31 | } 32 | 33 | trait KBestDecoder { 34 | 35 | trait ACandidate { 36 | def path: StatePath 37 | def score: Double 38 | def isConnected: Boolean = path.state.s1 == None 39 | } 40 | 41 | val comparePreferringConnected: (ACandidate, ACandidate) => Boolean = { 42 | case (a, b) if a.isConnected && !b.isConnected => true 43 | case (a, b) if !a.isConnected && b.isConnected => false 44 | case (a, b) => a.score > b.score 45 | } 46 | 47 | def search(sentence: CandAssignedSentence): Seq[ACandidate] 48 | 49 | def predict(sentence: CandAssignedSentence): (Derivation, Double) = { 50 | val c = search(sentence).sortWith(_.score > _.score)(0) 51 | (c.path.state.toDerivation, c.score) 52 | } 53 | 54 | /** If a fully connected tree is found, return the one with the maximum score; otherwise return the maximum-score unconnected tree. 55 | */ 56 | def predictConnected(sentence: CandAssignedSentence): (Derivation, Double) = { 57 | val c = search(sentence).sortWith(comparePreferringConnected)(0) 58 | (c.path.state.toDerivation, c.score) 59 | } 60 | 61 | /** Return k-best trees according to the final state score. 62 | * 63 | * @param preferConnected if true, fully connected trees are placed at the top of the results even if they are not the maximum-score trees. 64 | */ 65 | def predictKbest(k: Int, sentence: CandAssignedSentence, preferConnected: Boolean = false): Seq[(Derivation, Double)] = { 66 | val sorted = preferConnected match { 67 | case true => search(sentence).sortWith(comparePreferringConnected) 68 | case false => search(sentence).sortWith(_.score > _.score) 69 | } 70 | sorted.take(k) map { c => (c.path.state.toDerivation, c.score) } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/Rule.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import jigg.nlp.ccg.lexicon.{Category, Derivation, Point, UnaryChildPoint, BinaryChildrenPoints, AppliedRule} 20 | 21 | import scala.collection.mutable.{HashMap, HashSet} 22 | import java.io.{ObjectOutputStream, ObjectInputStream} 23 | 24 | trait Rule { 25 | def unify(left:Category, right:Category): Option[Array[(Category, String)]] 26 | def raise(child:Category): Option[Array[(Category, String)]] 27 | def headFinder:HeadFinder 28 | } 29 | 30 | // rules are restricted to CFG rules extracted from the training CCGBank 31 | case class CFGRule(val binaryRules:Map[(Int,Int), Array[(Category, String)]], // category ids -> (category, ruleType) 32 | val unaryRules:Map[Int, Array[(Category, String)]], 33 | override val headFinder:HeadFinder) extends Rule { 34 | def unify(left:Category, right:Category):Option[Array[(Category, String)]] = binaryRules.get((left.id, right.id)) 35 | def raise(child:Category):Option[Array[(Category, String)]] = unaryRules.get(child.id) 36 | } 37 | 38 | object CFGRule { 39 | def extractRulesFromDerivations(derivations: Array[Derivation], headFinder:HeadFinder): CFGRule = { 40 | val binaryRules = new HashMap[(Int, Int), HashSet[(Category, String)]] 41 | val unaryRules = new HashMap[Int, HashSet[(Category, String)]] 42 | 43 | derivations.foreach { deriv => 44 | deriv.foreachPoint({ point:Point => deriv.get(point) match { 45 | case Some(AppliedRule(UnaryChildPoint(child), ruleType)) => 46 | val parents = unaryRules.getOrElseUpdate(child.category.id, new HashSet[(Category, String)]) 47 | parents += ((point.category, ruleType)) 48 | case Some(AppliedRule(BinaryChildrenPoints(left, right), ruleType)) => 49 | val parents = binaryRules.getOrElseUpdate((left.category.id, right.category.id), new HashSet[(Category, String)]) 50 | parents += ((point.category, ruleType)) 51 | case _ => 52 | }}) 53 | } 54 | new CFGRule(binaryRules.map { case (k, v) => k -> v.toArray }.toMap, 55 | unaryRules.map { case (k, v) => k -> v.toArray }.toMap, 56 | headFinder) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/package.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package object parser { 20 | type UF = ShiftReduceUnlabeledFeature 21 | type LF = ShiftReduceFeature 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/tagger/UserDefinedFeatureExtractors.scala: -------------------------------------------------------------------------------- 1 | // package jigg.nlp.ccg.tagger 2 | 3 | // import jigg.nlp.ccg.lexicon.{Dictionary, JapaneseDictionary} 4 | 5 | // import scala.collection.mutable.ArrayBuffer 6 | 7 | // // this is the example to define new features and the extractor that extracts that features 8 | 9 | // object NewTemplate extends Enumeration { 10 | // type NewTemplate = Value 11 | // val w_p = Value 12 | // } 13 | 14 | // case class UnigramWordPoSFeature[T](word:Int, pos:Int, tmpl:T) extends FeatureOnDictionary { 15 | // override def mkString(dict:Dictionary) = concat(tmpl, dict.getWord(word)) 16 | // } 17 | 18 | // class UnigramSecondLevelFineExtractor(val windowSize:Int) extends FeatureExtractor { 19 | // def addFeatures(c:Context, features:ArrayBuffer[UF]) = { 20 | // features += UnigramWordPoSFeature(c.word(0), c.pos(0), NewTemplate.w_p) 21 | // } 22 | // } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/tagger/package.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package object tagger { 20 | type UF = SuperTaggingUnlabeledFeature 21 | type LF = SuperTaggingFeature 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/Annotation.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | 20 | /** Currently, this trait is useful to assign unique id 21 | * for each annotation. 
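 *
 * Editor's note: a minimal usage sketch (ids shown assume fresh counters as produced by
 * jigg.util.IDGenerator; they are illustrative, not captured output):
 *
 *   Annotation.Sentence.nextId   // "s0"
 *   Annotation.Sentence.nextId   // "s1"
 *   Annotation.Token.nextId      // "t0"
 *
 * Each singleton object below keeps its own counter, so ids are unique within each
 * annotation type ("d0", "d1", ... for documents, "s0", "s1", ... for sentences, and so on).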
22 | */ 23 | abstract class Annotation(val idPrefix: String) { 24 | val idGen = jigg.util.IDGenerator(idPrefix) 25 | def nextId: String = idGen.next 26 | } 27 | 28 | object Annotation { 29 | 30 | object Document extends Annotation("d") 31 | 32 | object Sentence extends Annotation("s") 33 | 34 | object Token extends Annotation("t") 35 | 36 | object Dependency extends Annotation("dep") 37 | 38 | object CCG extends Annotation("ccg") 39 | 40 | object NE extends Annotation("ne") 41 | 42 | object Mention extends Annotation("me") 43 | 44 | object Coreference extends Annotation("cr") 45 | 46 | object PredArg extends Annotation("pa") 47 | 48 | object ParseSpan extends Annotation("sp") 49 | object CCGSpan extends Annotation("ccgsp") 50 | 51 | object Chunk extends Annotation("ch") 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/AnnotationError.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | class AnnotationError(msg: String) extends RuntimeException(msg) 20 | 21 | class ProcessError(msg: String) extends AnnotationError(msg) 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/ArgumentError.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | class ArgumentError(msg: String) extends RuntimeException(msg) 20 | 21 | class RequirementError(msg: String) extends RuntimeException(msg) 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/DocumentAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.xml.{Elem, Node} 20 | import jigg.util.XMLUtil.RichNode 21 | 22 | /** A trait for an annotator which modifies a document node. Use this trait if an annotator 23 | * is a document-level annotator. 24 | */ 25 | trait DocumentAnnotator extends Annotator { 26 | override def annotate(annotation: Node): Node = { 27 | 28 | annotation.replaceAll("root") { case e: Elem => 29 | val newChild = Annotator.makePar(e.child, nThreads).map { c => 30 | c match { 31 | case c if c.label == "document" => 32 | try newDocumentAnnotation(c) catch { 33 | case e: AnnotationError => 34 | System.err.println(s"Failed to annotate a document by $name.") 35 | Annotator.annotateError(c, name, e) 36 | } 37 | case c => c 38 | } 39 | }.seq 40 | e.copy(child = newChild) 41 | } 42 | } 43 | 44 | def newDocumentAnnotation(sentence: Node): Node 45 | } 46 | 47 | trait SeqDocumentAnnotator extends DocumentAnnotator { 48 | override def nThreads = 1 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/RegexDocumentAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Takafumi Sakakibara and Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | 22 | class RegexDocumentAnnotator(override val name: String, override val props: Properties) extends Annotator { 23 | 24 | @Prop(gloss = "Regular expression to segment documents") var pattern = """\n{2,}""" 25 | readProps() 26 | 27 | private[this] val documentIDGen = jigg.util.IDGenerator("d") 28 | override def annotate(annotation: Node): Node = { 29 | val raw = annotation.text 30 | 31 | var offset = 0 32 | 33 | val documents = raw.split(pattern).map { str => 34 | val n = { str } 39 | offset += str.size 40 | n 41 | } 42 | 43 | { documents } 44 | } 45 | 46 | override def requires = Set() 47 | override def requirementsSatisfied = Set(Requirement.Dsplit) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SentencesAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.xml.{Elem, Node} 20 | import jigg.util.XMLUtil.RichNode 21 | 22 | /** A trait for an annotator which modifies a sentence node. 23 | * 24 | * If an annotator is a sentence-level annotator such as a parser or POS tagger, it should 25 | * extend this trait, and usually all you need to do is implement the 26 | * newSentenceAnnotation method, which rewrites a sentence node and returns a new one. 27 | * 28 | * This annotates the given sentences in parallel. If you want to avoid this, perhaps 29 | * because the annotator is not thread-safe, use [[jigg.pipeline.SeqSentencesAnnotator]] 30 | * instead, which annotates sequentially. 31 | */ 32 | trait SentencesAnnotator extends Annotator { 33 | def annotate(annotation: Node): Node = { 34 | 35 | annotation.replaceAll("sentences") { case e: Elem => 36 | val annotatedChild = Annotator.makePar(e.child, nThreads).map { 37 | case s if s.label == "sentence" => 38 | try newSentenceAnnotation(s) catch { 39 | case e: AnnotationError => 40 | System.err.println(s"Failed to annotate a sentence by $name.") 41 | Annotator.annotateError(s, name, e) 42 | } 43 | case s => s 44 | }.seq 45 | e.copy(child = annotatedChild) 46 | } 47 | } 48 | 49 | def newSentenceAnnotation(sentence: Node): Node 50 | } 51 | 52 | /** This trait annotates the inputs sequentially. 53 | */ 54 | trait SeqSentencesAnnotator extends SentencesAnnotator { 55 | override def nThreads = 1 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SimpleKNPAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Takafumi Sakakibara and Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml._ 21 | 22 | class SimpleKNPAnnotator(override val name: String, override val props: Properties) 23 | extends KNPAnnotator with AnnotatingSentencesInParallel { self=> 24 | 25 | @Prop(gloss = "Use this command to launch KNP (-tab is automatically added. -anaphora is not compatible with this annotator. In that case, use knpDoc instead). 
Version >= 4.12 is assumed.") var command = "knp" 26 | readProps() 27 | 28 | localAnnotators // instantiate lazy val here 29 | 30 | def mkLocalAnnotator = new SimpleKNPLocalAnnotator 31 | 32 | class SimpleKNPLocalAnnotator 33 | extends SentencesAnnotator with LocalAnnotator with BaseKNPLocalAnnotator { 34 | override def defaultArgs = Seq("-tab") 35 | 36 | val knp = mkIO() 37 | 38 | override def newSentenceAnnotation(sentence: Node): Node = { 39 | val sentenceId = (sentence \ "@id").toString 40 | 41 | val knpResult = runKNP(sentence, None) 42 | annotateSentenceNode(sentence, knpResult, sentenceId, _ => sentenceId) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SpaceTokenizerAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import scala.xml.{Node, Elem, Text, Atom} 22 | import jigg.util.XMLUtil.RichNode 23 | 24 | /** This simple annotator just segments a sentence by spaces, i.e., 25 | * assuming the input sentence is already correctly tokenized. 26 | */ 27 | class SpaceTokenizerAnnotator(override val name: String, override val props: Properties) 28 | extends SentencesAnnotator { 29 | 30 | override def newSentenceAnnotation(sentence: Node): Node = { 31 | 32 | val sindex = sentence \@ "id" 33 | val text = sentence.text 34 | val range = (0 until text.size) 35 | 36 | def isSpace(c: Char) = c == ' ' || c == '\t' 37 | 38 | val begins = 0 +: (1 until text.size).filter { i => isSpace(text(i-1)) && !isSpace(text(i)) } 39 | 40 | val ends = begins map { 41 | range indexWhere (i=>isSpace(text(i)), _) match { 42 | case -1 => text.size 43 | case e => e 44 | } 45 | } 46 | 47 | val tokenSeq = begins.zip(ends).zipWithIndex map { case ((b, e), i) => 48 | 53 | } 54 | val tokens = { tokenSeq } 55 | sentence addChild tokens 56 | } 57 | 58 | override def requires = Set(Requirement.Ssplit) 59 | override def requirementsSatisfied = Set(Requirement.Tokenize) 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SystemDict.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | sealed trait SystemDic 20 | 21 | object SystemDic { 22 | case object ipadic extends SystemDic 23 | case object jumandic extends SystemDic 24 | case object unidic extends SystemDic 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/UnmanagedAnnotators.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /** A singleton managing the collection of `UnmanagedAnnotator`. 4 | * 5 | * See the documentation of `UnmanagedAnnotator` for its role. `list` is the essential object, 6 | * which preserves the mapping from annotator names to `UnmanagedAnnotator`s. If you 7 | * want to support a new annotator that depends on an unmanaged library, add it to the 8 | * `list`. 9 | */ 10 | object UnmanagedAnnotators { 11 | 12 | /** Information about an annotator that wraps software which runs on the JVM but is not 13 | * included as a managed library via Maven. 14 | * 15 | * When assembling, such external unmanaged jars are not included, so a user has to 16 | * explicitly add them to the class path. Each UnmanagedAnnotator object helps to 17 | * describe how to use it. For example, its default message, implemented in 18 | * `DefaultUnmanagedAnnotator`, tells the URL of the library jar file. 19 | */ 20 | trait UnmanagedAnnotator[A] { 21 | def name: String 22 | def clazz: Class[A] 23 | 24 | def msg: String 25 | } 26 | 27 | case class DefaultUnmanagedAnnotator[A]( 28 | val name: String, val clazz: Class[A], url: String) extends UnmanagedAnnotator[A] { 29 | 30 | def msg = s"""Failed to launch $name. Maybe the necessary jar file is not included in 31 | the current class path. This might be solved by adding jar/* into your class path, 32 | e.g., call jigg like: 33 | 34 | > java -cp "jigg-xxx.jar:jar/*" jigg.pipeline.Pipeline ... 35 | 36 | If the error still remains, the necessary jar file is missing. You can download it 37 | from ${url}. Try e.g., 38 | 39 | > wget -P jar/ $url 40 | 41 | and then run the above command. 42 | """ 43 | } 44 | 45 | val list = Map( 46 | "easyccg" -> DefaultUnmanagedAnnotator( 47 | "easyccg", 48 | classOf[EasyCCGAnnotator], 49 | "https://github.com/mikelewis0/easyccg/raw/master/easyccg.jar")) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/ArgumentsParser.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | 21 | object ArgumentsParser { 22 | def parse(args: List[String]): Properties = parseRecur(new Properties, args) 23 | 24 | private def parseRecur(props: Properties, args: List[String]): Properties = args match { 25 | case ArgKey(key) :: next => next match { 26 | case ArgKey(nextKey) :: tail => // -key1 -key2 ... => key1 is boolean value 27 | putTrue(props, key) 28 | parseRecur(props, next) 29 | case value :: tail => 30 | key match { 31 | case "props" => props.load(jigg.util.IOUtil.openIn(value)) 32 | case _ => props.put(key, value) 33 | } 34 | parseRecur(props, tail) 35 | case Nil => 36 | putTrue(props, key) 37 | parseRecur(props, next) 38 | } 39 | case _ => props 40 | } 41 | def putTrue(props: Properties, key: String) = props.put(key, "true") 42 | 43 | object ArgKey { 44 | def unapply(key: String): Option[String] = key match { 45 | case x if x.size > 1 && x(0) == '-' && x.drop(1).forall(x=>x.isDigit || x=='.') => None // -10.0, -1, etc are not key 46 | case x if x.size > 1 && x(0) == '-' && x(1) == '-' => Some(x.substring(2)) 47 | case x if x.size > 1 && x(0) == '-' => Some(x.substring(1)) // we don't catch if x.size == 1, ('-' is recognized as some value) 48 | case _ => None 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/HDF5Object.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import ucar.nc2.{Attribute, Group, NetcdfFile} 20 | 21 | class HDF5Object(rootGroup: Group) { 22 | 23 | def checkAndGetAttribute(name: String): Attribute = Option(rootGroup.findAttribute(name)) match { 24 | case Some(x) => x 25 | case None => throw new IllegalArgumentException("cannot get " + name + " attribute from input model file") 26 | } 27 | 28 | def checkAndGetGroup(name: String): Group = Option(rootGroup.findGroup(name)) match { 29 | case Some(x) => x 30 | case None => throw new IllegalArgumentException("cannot get " + name + " group from input model file") 31 | } 32 | 33 | } 34 | 35 | object HDF5Object { 36 | 37 | // Load from a path on the file system 38 | def fromFile(path: String): HDF5Object = { 39 | val file = NetcdfFile.open(path, null) 40 | mkObj(file) 41 | } 42 | 43 | // Load from class loader 44 | def fromResource(path: String): HDF5Object = { 45 | val file = 46 | NetcdfFile.openInMemory(IOUtil.findResource(path).toURI) 47 | mkObj(file) 48 | } 49 | 50 | private def mkObj(file: NetcdfFile) = { 51 | val group = file.getRootGroup 52 | new HDF5Object(group) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/IDGenerator.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | // trait IDGeneratorBase { 20 | // def next(): String 21 | // } 22 | 23 | // case class IDGenerator(prefix: String) extends IDGeneratorBase { 24 | // private[this] val stream = Stream.from(0).iterator 25 | // def next() = prefix + stream.next 26 | // } 27 | 28 | case class IDGenerator(toId: Int=>String) { 29 | private[this] var stream = Stream.from(0).iterator 30 | def next() = toId(stream.next) 31 | def reset() = stream = Stream.from(0).iterator 32 | } 33 | 34 | object IDGenerator { 35 | def apply(prefix: String): IDGenerator = IDGenerator(prefix + _) 36 | } 37 | 38 | /** Not thread-safe but little overhead 39 | */ 40 | case class LocalIDGenerator(toId: Int=>String) { 41 | var i = 0 42 | def next() = { 43 | val n = toId(i) 44 | i += 1 45 | n 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/LogUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | object LogUtil { 20 | /** A helper to measure time. 21 | * If multiple commands are nested, use multipleTrack. 22 | * 23 | * TODO: Integrate track and multipleTrack to automatically choose indent and appropriate format. 24 | * Currently track[A](beginMessage: String, ...) "manually" handles the indent level. 25 | */ 26 | def track[A](message: String)(body: => A): A = { 27 | // System.out.print(message) 28 | // val (result, time) = recordTime { body } 29 | // System.out.println("done [%.1f sec]".format(time)) 30 | // result 31 | track(message, "done", 0) { body } 32 | } 33 | 34 | def multipleTrack[A](message: String)(body: => A): A = { 35 | // System.out.println("{ " + message) 36 | // val (result, time) = recordTime { body } 37 | // System.out.println("} [%.1f sec]".format(time)) 38 | // result 39 | track(message + " {\n", "}", 0) { body } 40 | } 41 | 42 | def track[A](beginMessage: String, endMessage: String, indent: Int)(body: => A): A = { 43 | def print(raw: String) = { 44 | (0 until indent) foreach { _ => System.out.print(" ") } 45 | System.out.print(raw) 46 | } 47 | print(beginMessage) 48 | val (result, time) = recordTime { body } 49 | System.out.println(endMessage + " [%.1f sec]".format(time)) 50 | result 51 | } 52 | 53 | def recordTime[A](body: => A): (A, Double) = { 54 | val before = System.currentTimeMillis 55 | val result = body 56 | val time = (System.currentTimeMillis - before).toDouble / 1000.0 57 | (result, time) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/LookupTable.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io.Reader 20 | 21 | import breeze.linalg.DenseMatrix 22 | import org.json4s.{DefaultFormats, _} 23 | import org.json4s.jackson.JsonMethods 24 | import org.json4s.JsonAST.JValue 25 | 26 | class LookupTable(rawTable: JValue) { 27 | 28 | implicit private val formats = DefaultFormats 29 | private val tables = rawTable.extract[Map[String, Map[String, Map[String, String]]]] 30 | 31 | private val key2id = tables("_lookup")("_key2id") 32 | private val id2key = tables("_lookup")("_id2key") 33 | 34 | // For raw text 35 | def encodeCharacter(str: String): DenseMatrix[Float] = { 36 | val strArray = str.map{x => 37 | // Note: For skipping unknown character, this encoder returns dummy id. 38 | key2id.getOrElse(x.toString, "3").toFloat 39 | }.toArray 40 | new DenseMatrix[Float](1, str.length, strArray) 41 | } 42 | 43 | // For list of words 44 | def encodeWords(words: Array[String]): DenseMatrix[Float] = { 45 | val wordsArray = words.map{x => 46 | // Note: For skipping unknown words, this encoder returns dummy id. 
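      // Editor's note: "3" below is the dummy id returned for out-of-vocabulary entries, so an
      // unknown word is encoded as 3.0f instead of raising an error. For instance (hypothetical
      // table contents), encodeWords(Array("猫", "qwerty")) would yield a 1 x 2 matrix whose
      // second cell is 3.0f when "qwerty" is not in the loaded table.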
47 | key2id.getOrElse(x.toString, "3").toFloat 48 | } 49 | new DenseMatrix[Float](1, words.length, wordsArray) 50 | } 51 | 52 | def decode(data: DenseMatrix[Float]): Array[String] = 53 | data.map{x => id2key.getOrElse(x.toInt.toString, "NONE")}.toArray 54 | 55 | def getId(key: String): Int = key2id.getOrElse(key, "0").toInt 56 | def getId(key: Char): Int = getId(key.toString) 57 | 58 | def getKey(id: Int): String = id2key.getOrElse(id.toString, "UNKNOWN") 59 | } 60 | 61 | 62 | object LookupTable { 63 | 64 | // Load from a path on the file system 65 | def fromFile(path: String) = mkTable(IOUtil.openIn(path)) 66 | 67 | // Load from class loader 68 | def fromResource(path: String) = mkTable(IOUtil.openResourceAsReader(path)) 69 | 70 | private def mkTable(input: Reader) = { 71 | val j = try { JsonMethods.parse(input) } finally { input.close } 72 | new LookupTable(j) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/Normalizer.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import com.ibm.icu.text.Transliterator 20 | 21 | object Normalizer { 22 | 23 | /** Replace all halfwidth ASCII characters (< 0x7F) with their fullwidth counterparts. 24 | * 25 | * Useful for preprocessing in some Japanese software such as JUMAN and KNP. 26 | * 27 | * NOTE: We do not touch hankaku kana characters since they make alignment to the 28 | * original text more involved. 29 | */ 30 | def hanZenAscii(text: String) = text map { 31 | case c if c <= 0x7F => hanzenTrans.transliterate(c + "")(0) 32 | case c => c 33 | } 34 | private val hanzenTrans = Transliterator.getInstance("Halfwidth-Fullwidth") 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/Prop.java: -------------------------------------------------------------------------------- 1 | package jigg.util; 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.lang.annotation.*; 20 | 21 | @Retention(RetentionPolicy.RUNTIME) 22 | public @interface Prop { 23 | // String name() default ""; 24 | String gloss() default ""; 25 | boolean required() default false; 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/PropertiesUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.collection.JavaConversions._ 21 | 22 | object PropertiesUtil { 23 | def findProperty(key: String, props: Properties): Option[String] = props.getProperty(key) match { 24 | case null => None 25 | case value => Some(value) 26 | } 27 | def safeFind(key: String, props: Properties): String = findProperty(key, props).getOrElse { sys.error(s"$key property is required!" ) } 28 | 29 | def getBoolean(key: String, props: Properties): Option[Boolean] = findProperty(key, props) map { 30 | case "true" => true 31 | case "false" => false 32 | case _ => sys.error(s"Property $key should be true or false") 33 | } 34 | 35 | def filter(props: Properties)(f: (String, String)=>Boolean): Seq[(String, String)] = 36 | props.stringPropertyNames.toSeq 37 | .map { k => (k, props.getProperty(k)) } 38 | .filter { case (k, v) => f(k, v) } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/ResourceUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | import java.io.File 4 | 5 | object ResourceUtil { 6 | 7 | /** Read a python script found in `resources/python/xxx.py`. Since these files cannot 8 | * be executed directly we create a temporary file by copying the script first, and 9 | * return the resulting temp file. 10 | * 11 | * @param name script name, corresponding to `xxx.py`. 12 | */ 13 | def readPython(name: String): File = { 14 | val script = File.createTempFile("jigg", ".py") 15 | script.deleteOnExit 16 | val stream = getClass.getResourceAsStream(s"/python/${name}") 17 | IOUtil.writing(script.getPath) { o => 18 | scala.io.Source.fromInputStream(stream).getLines foreach { line => 19 | o.write(line + "\n") 20 | } 21 | } 22 | script 23 | } 24 | 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/TreesUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | import scala.xml._ 21 | 22 | import jigg.pipeline.Annotation 23 | 24 | object TreesUtil { 25 | 26 | def streeToNode(tree: String, sentence: Node, annotator: String) = { 27 | val tokens = tree.replaceAllLiterally("(", " ( ").replaceAllLiterally(")", " ) ").trim.split("\\s+") 28 | 29 | val tokenSeq = (sentence \ "tokens").head \ "token" 30 | var tokIdx = -1 31 | def nextTokId = { tokIdx += 1; tokenSeq(tokIdx) \@ "id" } 32 | 33 | val spans = new ArrayBuffer[Node] 34 | 35 | // Fill in spans; return the id of constructed subtree, and the arrived index. 36 | def readTopdown(idx: Int): (String, Int) = { 37 | 38 | def collectChildren(curChildren: List[String], cur: Int): (Seq[String], Int) = 39 | tokens(cur) match { 40 | case ")" => 41 | (curChildren.reverse, cur) 42 | case "(" => 43 | val (nextChildId, nextIdx) = readTopdown(cur) 44 | collectChildren(nextChildId :: curChildren, nextIdx) 45 | } 46 | 47 | tokens(idx) match { 48 | case "(" => 49 | def skipParen(i: Int = 0): Int = { 50 | if (tokens(idx + i) == "(") skipParen(i + 1) 51 | else i 52 | } 53 | val parenCount = skipParen() 54 | 55 | val labelIdx = idx + parenCount 56 | val label = tokens(labelIdx) 57 | 58 | val (children, closeIdx) = tokens(labelIdx + 1) match { 59 | case "(" => collectChildren(Nil, labelIdx + 1) 60 | case word => (Nil, labelIdx + 1 + 1) 61 | } 62 | val thisId = children match { 63 | case Nil => nextTokId 64 | case children => Annotation.ParseSpan.nextId 65 | } 66 | if (!children.isEmpty) { 67 | val childStr = children mkString " " 68 | spans += 69 | } 70 | for (i <- 0 until parenCount) { assert(tokens(closeIdx + i) == ")") } 71 | (thisId, closeIdx + parenCount) 72 | } 73 | } 74 | 75 | val (rootId, _) = readTopdown(0) 76 | { spans } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/resources/data/Japanese.small.lexicon: -------------------------------------------------------------------------------- 1 | @UNK@/フィラー/_ S1/S1 NP[nc,adv]1/NP[nc,adv]1 NP[nc,nm]1/NP[nc,nm]1 2 | @UNK@/副詞-一般/_ S1/S1 NP[nc,nm]1/NP[nc,nm]1 S[nm,stem] NP[nc,adv]1/NP[nc,adv]1 3 | @UNK@/副詞-助詞類接続/_ S1/S1 NP[nc,nm]1/NP[nc,nm]1 S[nm,stem] 4 | あふれる/動詞-自立/基本形 S[nm,stem]\NP[ga,nm,ga]-base_verb_rule S[nm,stem]\NP[ga,nm,ga]-adnominal_verb_rule -------------------------------------------------------------------------------- /src/test/resources/data/Japanese.unkVerb.lexicon: -------------------------------------------------------------------------------- 1 | @UNK@/動詞-非自立/仮定形 S[nm,hyp]\S[nm,cont]sem 2 | @UNK@/動詞-非自立/体言接続特殊 S[nm,attr]\S[nm,neg]sem 3 | @UNK@/動詞-非自立/体言接続特殊2 S[adn,attr] 4 | @UNK@/動詞-非自立/基本形 S[nm,base]\S[nm,cont]sem S[adn,base]\S[nm,cont]sem S[nm,base] S[nm,base]\NP[ga,nm,ga] NP[nc,nm]1/NP[nc,nm]1 5 | @UNK@/動詞-非自立/未然ウ接続 S[nm,neg]\S[nm,cont]sem 6 | @UNK@/動詞-非自立/未然形 S[nm,neg]\S[nm,cont]sem S[nm,neg] S[nm,neg]\S[nm,r]sem 7 | @UNK@/動詞-非自立/連用タ接続 S[nm,cont]\S[nm,cont]sem S[nm,cont] 8 | @UNK@/動詞-非自立/連用形 S[nm,cont]\S[nm,cont]sem S[adv,cont]\S[nm,cont]sem S[nm,cont] S[adn,cont] S[adn,cont]\S[nm,cont]sem 
S[nm,cont]\NP[ga,nm,ga] -------------------------------------------------------------------------------- /src/test/resources/data/json/english.ssplit.test.json: -------------------------------------------------------------------------------- 1 | { 2 | ".tag" : "root", 3 | ".child" : [ { 4 | ".tag" : "document", 5 | "id" : "d0", 6 | ".child" : [ { 7 | ".tag" : "sentences", 8 | ".child" : [ { 9 | ".tag" : "sentence", 10 | "text" : "Alice asked her mother to cook a cake.", 11 | "id" : "s0", 12 | "characterOffsetBegin" : "0", 13 | "characterOffsetEnd" : "38" 14 | }, { 15 | ".tag" : "sentence", 16 | "text" : "Bob saw a girl in the garden with a telescope.", 17 | "id" : "s1", 18 | "characterOffsetBegin" : "39", 19 | "characterOffsetEnd" : "85" 20 | } ] 21 | } ] 22 | } ] 23 | } -------------------------------------------------------------------------------- /src/test/resources/data/json/japanese.ssplit.test.json: -------------------------------------------------------------------------------- 1 | { 2 | ".tag" : "root", 3 | ".child" : [ { 4 | ".tag" : "document", 5 | "id" : "d0", 6 | ".child" : [ { 7 | ".tag" : "sentences", 8 | ".child" : [ { 9 | ".tag" : "sentence", 10 | "text" : "自転車で走っている少女を見た", 11 | "id" : "s0", 12 | "characterOffsetBegin" : "0", 13 | "characterOffsetEnd" : "14" 14 | }, { 15 | ".tag" : "sentence", 16 | "text" : "テレビで走っている少女を見た", 17 | "id" : "s1", 18 | "characterOffsetBegin" : "15", 19 | "characterOffsetEnd" : "29" 20 | } ] 21 | } ] 22 | } ] 23 | } -------------------------------------------------------------------------------- /src/test/resources/data/keras/bunsetsu_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/keras/bunsetsu_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/keras/ssplit_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/keras/ssplit_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/convolution1d/convolution1d_gold.csv: -------------------------------------------------------------------------------- 1 | -0.288217455148697,0.681861579418182 2 | -0.538490712642670,0.062052655965090 3 | -0.318091481924057,-0.074813574552536 4 | -0.023546881973743,0.040708515793085 5 | -0.485583871603012,0.224703624844551 6 | -0.450441420078278,0.002716975519434 7 | -0.176823571324348,0.489799916744232 8 | -0.123186729848385,0.057490978389978 9 | -0.336253672838211,-0.084099449217319 10 | 0.059555754065514,0.000320440391079 11 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/convolution1d/convolution1d_input.csv: -------------------------------------------------------------------------------- 1 | 0.027738961009708,0.393455303479803,0.694816228560713,0.157559454348151,0.214884384043615,0.005565078182797,0.949002280200014,0.690369967699377,0.998176256773562,0.204425396011438,0.845982123544135,0.818198829832328 2 | 0.252301884057857,0.437311847167796,0.104436208603942,0.763925291392123,0.870987562303758,0.079435648160725,0.142875224317561,0.170360773159227,0.387373867227415,0.745431984723710,0.479836153327895,0.744296844299619 3 | 
0.883415945353071,0.697078201963215,0.606604317884067,0.777094318509148,0.956809131373719,0.018343700379643,0.692863164913816,0.107627736723910,0.595232367723716,0.618970512903785,0.748639111184423,0.941869156250547 4 | 0.035042201371063,0.700113249200931,0.717126347279872,0.511744032438561,0.247658441044617,0.576820124281050,0.047399750738226,0.067116874648913,0.175494795121527,0.240304085868729,0.603887921839716,0.537397181554857 5 | 0.554501767544110,0.411117180527812,0.648722795158795,0.508408218827410,0.785647318386747,0.947404977871054,0.113110476551426,0.936072327771750,0.863526769665361,0.172236633875255,0.715443984726397,0.869742300523170 6 | 0.331881976191941,0.174389983798250,0.974055309053648,0.952572967439939,0.395194463615389,0.979596804619930,0.126419143266621,0.028127155855804,0.377202820144004,0.788029009784025,0.143934466920253,0.885531232719449 7 | 0.082605263961736,0.816844068389051,0.742036051284236,0.448338330763183,0.231913187967981,0.324263082007595,0.095113194171922,0.575291246962427,0.402043739476673,0.773164202330256,0.978885567374195,0.531234497631943 8 | 0.797474806333550,0.770689995657307,0.286838584369559,0.272812118439933,0.522711445247614,0.557358959671089,0.655063150020376,0.613348870624681,0.903721040494730,0.676600535740517,0.862388024752785,0.483734729571592 9 | 0.511364975233000,0.956982804048265,0.489405080608254,0.946988783071462,0.304099907120206,0.159633845243493,0.441705350104236,0.014337837348216,0.609972921479224,0.159291332076170,0.521437544993183,0.863046123179579 10 | 0.043232549851898,0.273736339785920,0.378312369831591,0.953767858492059,0.200604482875413,0.810072095098931,0.391870443803649,0.639344286225899,0.677303032937693,0.276362747713528,0.359063987058490,0.334056036907750 11 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/convolution1d/convolution1d_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/convolution1d/convolution1d_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/dense/dense_gold.csv: -------------------------------------------------------------------------------- 1 | -0.265054643154144,0.819157660007477 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/dense/dense_input.csv: -------------------------------------------------------------------------------- 1 | 0.919222086072171,0.268580028843516,0.850487637208910,0.195140088357300,0.915650682096673,0.694448840619902,0.686364957159918,0.845189174009755,0.515407551460194,0.707307670736291 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/dense/dense_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/dense/dense_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/embedding/embedding_gold.csv: -------------------------------------------------------------------------------- 1 | -0.024064350873232,0.015874337404966 2 | -0.032138548791409,0.035715412348509 3 | -0.009305894374847,0.047007892280817 4 | 
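The Keras fixtures above and below come in triples: an `*_input.csv` matrix, an `*_gold.csv` matrix of expected outputs, and an `*_model.h5` file holding the serialized layer. Each layer spec further down in this dump (Convolution1DSpec, DenseSpec, EmbeddingSpec, FlattenSpec, KerasModelSpec) consumes its triple with the same load–convert–compare steps; a minimal sketch of that shared pattern follows. The `KerasFixtureCheck` object and its `checkLayer` helper are hypothetical (not part of the repository) and assume only the `HDF5Object`, `KerasModel`, and breeze `csvread` calls that those specs themselves use.

import java.io.File

import breeze.linalg.csvread
import breeze.numerics.abs

import jigg.ml.keras.KerasModel
import jigg.util.HDF5Object

// Hypothetical helper mirroring the pattern shared by the layer specs:
// load <name>_model.h5 from resources, read the input/gold CSVs, run
// KerasModel.convert, and require element-wise agreement within 1e-6.
object KerasFixtureCheck {

  private def findPath(p: String): String =
    getClass.getClassLoader.getResource(p).getPath

  def checkLayer(name: String): Boolean = {
    val dir = s"./data/ml/keras/$name"
    val model = new KerasModel(HDF5Object.fromResource(s"$dir/${name}_model.h5"))
    val input = csvread(new File(findPath(s"$dir/${name}_input.csv")), separator = ',').map(_.toFloat)
    val gold  = csvread(new File(findPath(s"$dir/${name}_gold.csv")), separator = ',').map(_.toFloat)
    abs(model.convert(input) - gold).forall(_ < 1e-6f)
  }
}

// e.g. KerasFixtureCheck.checkLayer("dense") and checkLayer("embedding") are
// expected to hold, which is exactly what the individual specs assert.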
-------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/embedding/embedding_input.csv: -------------------------------------------------------------------------------- 1 | 4.000000000000000,3.000000000000000,6.000000000000000 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/embedding/embedding_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/embedding/embedding_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/flatten/flatten_gold.csv: -------------------------------------------------------------------------------- 1 | 0.483355849981308,0.272490352392197,0.915887176990509,0.335418432950974,0.778468728065491,0.853674173355103 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/flatten/flatten_input.csv: -------------------------------------------------------------------------------- 1 | 0.483355847870847,0.272490343423817 2 | 0.915887187299997,0.335418421687206 3 | 0.778468739455691,0.853674144810384 4 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/flatten/flatten_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/flatten/flatten_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/kerasModel/kerasModel_gold.csv: -------------------------------------------------------------------------------- 1 | 0.066982857882977,0.864855527877808,0.068161644041538 2 | 0.036359727382660,0.940843880176544,0.022796416655183 3 | 0.000093939248472,0.024136895313859,0.975769102573395 4 | 0.000007191142231,0.037699114531279,0.962293744087219 5 | 0.859113097190857,0.130854964256287,0.010032005608082 6 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/kerasModel/kerasModel_input.csv: -------------------------------------------------------------------------------- 1 | 0.000000000000000,6.000000000000000,6.000000000000000,2.000000000000000,6.000000000000000 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/kerasModel/kerasModel_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/kerasModel/kerasModel_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/template.small.lst: -------------------------------------------------------------------------------- 1 | NP[nc,nm]1/NP[nc,nm]1 NP[nc,nm]1/NP[nc,nm]1 2 | S[nm,stem]\NP[ga,nm,ga]-base_verb_rule S[nm,base]\NP[ga,nm,ga] 3 | S[nm,stem]\NP[ga,nm,ga]-adnominal_verb_rule S[adn,base]\NP[ga,nm,ga] 4 | S1/S1 S1/S1 5 | NP[nc,adv]1/NP[nc,adv]1 NP[nc,adv]1/NP[nc,adv]1 6 | S[nm,stem] S[nm,stem] 7 | -------------------------------------------------------------------------------- 
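For reference on the data formats just shown: each line of `Japanese.small.lexicon` / `Japanese.unkVerb.lexicon` pairs a `surface/POS/conjugation` key with the space-separated category templates it may receive, and each line of the `template.*.lst` files maps one template to its full category string (tab-separated, as written and read by `create_small_lst_from_lexicon.py` further down). A small sketch of reading the two files into maps, assuming exactly that layout, is below; the `SmallLexiconData` object and its method names are illustrative only, not part of the repository.

import scala.io.Source

// Illustrative loaders, assuming the layout visible above:
//   lexicon line : "<word>/<pos>/<conj> <template> <template> ..."  (space-separated)
//   template line: "<template>\t<full category string>"             (tab-separated)
object SmallLexiconData {

  def loadLexicon(path: String): Map[String, Seq[String]] =
    Source.fromFile(path, "UTF-8").getLines().map { line =>
      val cols = line.trim.split(" ")
      cols.head -> cols.tail.toSeq
    }.toMap

  def loadTemplates(path: String): Map[String, String] =
    Source.fromFile(path, "UTF-8").getLines().map { line =>
      val Array(tmpl, cat) = line.trim.split("\t")
      tmpl -> cat
    }.toMap
}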
/src/test/resources/data/template.unkVerb.lst: -------------------------------------------------------------------------------- 1 | S[adn,attr] S[adn,attr] 2 | S[adn,base]\S[nm,cont]sem S[adn,base]\S[nm,cont]sem 3 | S[adv,cont]\S[nm,cont]sem S[adv,cont]\S[nm,cont]sem 4 | S[adn,cont] S[adn,cont] 5 | S[adn,cont]\S[nm,cont]sem S[adn,cont]\S[nm,cont]sem 6 | S[nm,hyp]\S[nm,cont]sem S[nm,hyp]\S[nm,cont]sem 7 | S[nm,attr]\S[nm,neg]sem S[nm,attr]\S[nm,neg]sem 8 | S[nm,base] S[nm,base] 9 | S[nm,base]\S[nm,cont]sem S[nm,base]\S[nm,cont]sem 10 | S[nm,base]\NP[ga,nm,ga] S[nm,base]\NP[ga,nm,ga] 11 | S[nm,neg] S[nm,neg] 12 | S[nm,neg]\S[nm,cont]sem S[nm,neg]\S[nm,cont]sem 13 | S[nm,neg]\S[nm,r]sem S[nm,neg]\S[nm,r]sem 14 | S[nm,cont]\S[nm,cont]sem S[nm,cont]\S[nm,cont]sem 15 | S[nm,cont] S[nm,cont] 16 | S[nm,cont]\S[nm,cont]sem S[nm,cont]\S[nm,cont]sem 17 | S[nm,cont]\NP[ga,nm,ga] S[nm,cont]\NP[ga,nm,ga] 18 | NP[nc,nm]1/NP[nc,nm]1 NP[nc,nm]1/NP[nc,nm]1 19 | -------------------------------------------------------------------------------- /src/test/resources/data/xml/english.ssplit.spaceTokenize.gold.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Alice asked her mother to cook a cake. 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | Bob saw a girl in the garden with a telescope. 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/test/resources/data/xml/english.ssplit.test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Alice asked her mother to cook a cake. 6 | Bob saw a girl in the garden with a telescope. 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/test/resources/data/xml/japanese.ssplit.test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 自転車で走っている少女を見た 6 | テレビで走っている少女を見た 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/test/resources/script/create_small_lst_from_lexicon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' This script is used for creating data used for test 4 | 5 | Output is already included in resources/data directory 6 | as `template.small.lst`, so usually this file is unnecessary. 
7 | 8 | Example usage from the project root directory is 9 | ./src/test/resources/script/create_small_lst_from_lexicon.py \ 10 | ./ccgbank/template.lst 11 | 12 | ''' 13 | 14 | import sys, os 15 | 16 | if __name__ == '__main__': 17 | if len(sys.argv) < 2: 18 | print "usage", sys.argv[0], "full_template_lst" 19 | exit() 20 | 21 | data_dir = os.path.abspath(os.path.dirname(__file__))+'/../data' 22 | small_lexicon_path = data_dir+'/Japanese.small.lexicon' 23 | output_path = data_dir+'/template.small.lst' 24 | 25 | cat_tmps = [] 26 | for line in open(small_lexicon_path): 27 | cat_tmps += line.strip().split(' ')[1:] 28 | cat_tmps = set(cat_tmps) 29 | 30 | with open(output_path, 'w') as f: 31 | for line in open(sys.argv[1]): 32 | line = line.strip().split('\t') 33 | cat_tmp = line[0] 34 | cat_str = line[1] 35 | 36 | if cat_tmp in cat_tmps: 37 | f.write("%s\t%s\n" % (cat_tmp, cat_str)) 38 | 39 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/Convolution1DSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class Convolution1DSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/convolution1d/convolution1d_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/convolution1d/convolution1d_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/convolution1d/convolution1d_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/DenseSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | 28 | class DenseSpec extends FlatSpec with Matchers{ 29 | 30 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 31 | 32 | "convert" should "load model and convert input matrix" in { 33 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/dense/dense_model.h5") 34 | val model = new KerasModel(hdf5) 35 | val inputData = csvread(new File(findPath("./data/ml/keras/dense/dense_input.csv")),separator = ',').map{x => x.toFloat} 36 | val goldData = csvread(new File(findPath("./data/ml/keras/dense/dense_gold.csv")),separator = ',').map{x => x.toFloat} 37 | 38 | val output = model.convert(inputData) 39 | 40 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 41 | 42 | diff should be (true) 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/EmbeddingSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class EmbeddingSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/embedding/embedding_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/embedding/embedding_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/embedding/embedding_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/FlattenSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class FlattenSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/flatten/flatten_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/flatten/flatten_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/flatten/flatten_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/KerasModelSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class KerasModelSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/kerasModel/kerasModel_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/kerasModel/kerasModel_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/kerasModel/kerasModel_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/KerasParserTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest.FunSuite 22 | import org.scalatest.Matchers._ 23 | 24 | import jigg.util.{HDF5Object, LookupTable} 25 | 26 | class KerasParserTest extends FunSuite{ 27 | 28 | val model = new KerasModel(HDF5Object.fromResource("./data/keras/ssplit_model.h5")) 29 | val table = LookupTable.fromResource("data/keras/jpnLookupCharacter.json") 30 | 31 | val parser = new KerasParser(model, table) 32 | 33 | test("get an offset list from pattern1") { 34 | val pattern = Array[Int](0,1,1,0,1,1) 35 | val ranges = parser.getOffsets(pattern) 36 | ranges should be (Array[(Int, Int)]((0,3),(3,6))) 37 | } 38 | 39 | test("get an offset list from pattern2") { 40 | val pattern = Array[Int](0,1,1,2,2,0,1,1) 41 | val ranges = parser.getOffsets(pattern) 42 | ranges should be (Array[(Int, Int)]((0,3),(5,8))) 43 | } 44 | 45 | test("get an offset list from pattern3") { 46 | val pattern = Array[Int](0,1,1,2,0,1,1,2) 47 | val ranges = parser.getOffsets(pattern) 48 | ranges should be (Array[(Int, Int)]((0,3),(4,7))) 49 | 50 | } 51 | 52 | test("get an offset list from pattern4") { 53 | val pattern = Array[Int](2,2,0,1,1,2,0,1,1,2) 54 | val ranges = parser.getOffsets(pattern) 55 | ranges should be (Array[(Int, Int)]((2,5),(6,9))) 56 | } 57 | 58 | test("get an offset list from pattern5") { 59 | val pattern = Array[Int](1,1,1,0,1,1) 60 | val ranges = parser.getOffsets(pattern) 61 | ranges should be (Array[(Int, Int)]((0,3),(3,6))) 62 | } 63 | 64 | test("get an offset list from pattern6") { 65 | val pattern = Array[Int](2,2,1,1,1,0,1,1) 66 | val ranges = parser.getOffsets(pattern) 67 | ranges should be (Array[(Int, Int)]((2,5),(5,8))) 68 | } 69 | 70 | test("get an offset list from pattern7") { 71 | val pattern = Array[Int](0,1,1,0,0,1,1) 72 | val ranges = parser.getOffsets(pattern) 73 | ranges should be (Array[(Int, Int)]((0,3),(3,4),(4,7))) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/BunsetsuTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import org.scalatest.FunSuite 20 | import org.scalatest.Matchers._ 21 | 22 | class BunsetsuTest extends FunSuite { 23 | test("A gold derivation with cabocha bunsetsu-segments recover gold dependencies") { 24 | import jigg.nlp.ccg.parser.ParsedSentences 25 | val parsedSentences = new ParsedSentences 26 | val (sentence, derivation) = parsedSentences.simpleSentenceAndDerivation 27 | 28 | val bunsetsuSentence = BunsetsuSentence(Array( 29 | Bunsetsu(0, sentence.wordSeq.slice(0, 2), sentence.posSeq.slice(0, 2)), // 政権 に 30 | Bunsetsu(2, sentence.wordSeq.slice(2, 4), sentence.posSeq.slice(2, 4)), // 影響 を 31 | Bunsetsu(4, sentence.wordSeq.slice(4, 5), sentence.posSeq.slice(4, 5)), // 及ぼす 32 | Bunsetsu(5, sentence.wordSeq.slice(5, 6), sentence.posSeq.slice(5, 6)))) // こと 33 | 34 | val parsed = bunsetsuSentence.parseWithCCGDerivation(derivation) 35 | parsed.headSeq should equal (Seq(2, 2, 3, -1)) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/CategoryFeatureTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import org.scalatest.FunSuite 19 | import org.scalatest.Matchers._ 20 | import scala.collection.mutable.HashSet 21 | 22 | class JPCategoryFeatureTest extends FunSuite { 23 | test("equal test") { 24 | val feat1 = JPCategoryFeature.createFromValues(List("adn","attr","ga")) 25 | val feat2 = JPCategoryFeature.createFromValues(List("nm","attr","ga")) 26 | val feat3 = JPCategoryFeature.createFromValues(List("adn","attr")) 27 | val feat4 = JPCategoryFeature.createFromValues(List("adn","attr","ga")) 28 | 29 | feat1.kvs should equal (feat4.kvs) 30 | feat1.kvs should not equal (feat2.kvs) 31 | feat1.kvs should not equal (feat3.kvs) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/CategoryManagerTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | import org.scalatest.FunSuite 19 | import org.scalatest.Matchers._ 20 | 21 | class CategoryManagerTest extends FunSuite { 22 | test("the same child node should be assiged the same id") { 23 | val manager = new CategoryManager // Constructor automatically creates unknown category which is assigned id 0 24 | 25 | val cat = JapaneseCategoryParser.parse("NP[case=o,mod=nm]/NP[case=o,mod=nm]") 26 | manager.assignID(cat) match { 27 | case ComplexCategory(id, left, right, _) => { 28 | left.id should equal (1) 29 | right.id should equal (1) 30 | id should equal (2) 31 | } 32 | case _ => fail() // should not occur 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/CategoryParserTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import org.scalatest.FunSuite 19 | import org.scalatest.Matchers._ 20 | 21 | class CategoryParserTest extends FunSuite { 22 | test("extractCategoryFeature") { 23 | val reader = new JapaneseCategoryParser.JapaneseReader 24 | val ni_nm = reader.extractCategoryFeature("ni,nm") 25 | ni_nm.toString should equal ("mod=nm,case=ni") 26 | //assert(ni_nm.toString == "mod=nm,case=ni") 27 | } 28 | 29 | test("createAomicCategory") { 30 | val cat1Str = "NP[case=nc,mod=nm]{I1}" 31 | val cat1 = JapaneseCategoryParser.parse(cat1Str) 32 | cat1.toString should equal ("NP[mod=nm,case=nc]") 33 | 34 | val cat2Str = "(((S[mod=adn,form=base]{I1}\\NP[case=ni,mod=nm]{I2}){I1})\\NP[case=o,mod=nm]{I3}){I1}_I1(unk,I3,I2,_)" 35 | val cat2 = JapaneseCategoryParser.parse(cat2Str) 36 | cat2.toString should equal ("(S[mod=adn,form=base]\\NP[mod=nm,case=ni])\\NP[mod=nm,case=o]") 37 | 38 | 39 | val cat3Str = "(NP[case=X1,mod=X2,fin=f]{I1}/NP[case=X1,mod=X2,fin=f]{I1}){I2}_none" 40 | val cat3 = JapaneseCategoryParser.parse(cat3Str) 41 | cat3.toString should equal ("NP[fin=f]/NP[fin=f]") 42 | } 43 | 44 | // These are obsolute tests for previous version 45 | // test("createComplexCategory") { 46 | // JapaneseCategoryParser.parse("NP[nc,nm]1//NP[nc,nm]1").toString should equal("NP[mod=nm,case=nc]/NP[mod=nm,case=nc]") 47 | // JapaneseCategoryParser.parse("(S[nm,stem,nm]\NP[nc,nm])/NP[nc,nm]").toString should equal( 48 | // """(S[mod=nm,form=stem]\NP[mod=nm,case=nc])/NP[mod=nm,case=nc]""") 49 | // JapaneseCategoryParser.parse("(((S\NP)/NP[nc,nm])\(S[nm,stem]1/NP[o,nm]sem))/NP[nc,nm]1").toString should equal( 50 | // """(((S\NP)/NP[mod=nm,case=nc])\(S[mod=nm,form=stem]/NP[mod=nm,case=o]))/NP[mod=nm,case=nc]""") 51 | // JapaneseCategoryParser.parse("S1/S1").toString should equal("S/S") 52 | // JapaneseCategoryParser.parse("(S2/S2)1/(S3/S3)1").toString should equal("(S/S)/(S/S)") 53 | // } 54 | } 55 | -------------------------------------------------------------------------------- 
/src/test/scala/jigg/nlp/ccg/parser/RuleTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import org.scalatest.FunSuite 20 | import org.scalatest.Matchers._ 21 | 22 | class RuleTest extends FunSuite { 23 | val parsedSentences = new ParsedSentences 24 | val dict = parsedSentences.dict 25 | def cat(str:String) = dict.getCategory(str).get 26 | 27 | test("extract all rules from derivations") { 28 | val (sentence, derivation) = parsedSentences.simpleSentenceAndDerivation 29 | 30 | val rule = CFGRule.extractRulesFromDerivations(Array(derivation), JapaneseHeadFinder) 31 | rule.unify(cat("(NP[case=nc,mod=X1]{I1}/NP[case=nc,mod=X1]{I1}){I2}"), cat("NP[case=nc,mod=nm]{I1}_none")).get should contain (cat("NP[case=nc,mod=nm]{I1}"), ">") 32 | rule.raise(cat("S[mod=adn,form=base]{I1}")).get should contain (cat("(NP[case=nc,mod=X1]{I1}/NP[case=nc,mod=X1]{I1}){I2}"), "ADN") 33 | rule.unify(cat("NP[case=ni,mod=nm]{I1}"), cat("(S[mod=adn,form=base]{I1}\\NP[case=ni,mod=nm]{I2}){I1}")).get should contain (cat("S[mod=adn,form=base]{I1}"), "<") 34 | 35 | rule.unify(cat("NP[case=nc,mod=nm]{I1}_none"), cat("(NP[case=o,mod=nm]{I1}\\NP[case=nc,mod=nm]{I1}){I2}_none")).get should contain (cat("NP[case=o,mod=nm]{I1}"), "<") 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/AnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | import org.scalatest._ 22 | 23 | import jigg.util.Prop 24 | 25 | class NothingAnnotator(override val name: String, override val props: Properties) extends Annotator { 26 | 27 | @Prop(gloss = "gloss of variable1", required=true) var variable1 = "" 28 | readProps() 29 | 30 | def annotate(node: Node) = node 31 | } 32 | 33 | class AnnotatorSpec extends FlatSpec with Matchers { 34 | 35 | "Opt variable" should "be customizable with property file" in { 36 | val props = new Properties 37 | props.setProperty("nothing.variable1", "hoge") 38 | 39 | val annotator = new NothingAnnotator("nothing", props) 40 | 41 | annotator.variable1 should be("hoge") 42 | } 43 | 44 | "Annotator" should "throws an exception during initProps if required variable is missed" in { 45 | val props = new Properties 46 | try { 47 | val annotator = new NothingAnnotator("nothing", props) 48 | fail() 49 | } catch { 50 | case e: ArgumentError => 51 | case _: Throwable => fail() 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/BaseAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import org.scalactic.Equality 21 | import org.scalatest._ 22 | import scala.xml._ 23 | 24 | trait BaseAnnotatorSpec extends FlatSpec with Matchers { 25 | 26 | val sameElem = new Equality[Node] { 27 | import scala.xml.Utility.trim 28 | override def areEqual(a: Node, b: Any) = b match { 29 | case n: Node => trim(a) == trim(n) 30 | case _ => false 31 | } 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/BeneParAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest._ 22 | import scala.xml._ 23 | 24 | class BeneParAnnotatorSpec extends BaseAnnotatorSpec { 25 | 26 | class AnnotatorStub(output: String) extends BeneParAnnotator("benepar", new Properties) { 27 | override def mkLocalAnnotator = new LocalBeneParAnnotator { 28 | override def mkCommunicator = new StubExternalCommunicator(output) 29 | } 30 | assert(nThreads == 1) 31 | } 32 | 33 | Annotation.ParseSpan.idGen.reset() 34 | 35 | "BeneParAnnotator" should "convert a s-tree output of benepar into a node" in { 36 | val doc = 37 | 38 | 39 | 40 | He ate pizza . 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | val output = """(S (NP (PRP He)) (VP (VBD ate) (NN pizza)) (. .)) 52 | END""" 53 | 54 | val ann = new AnnotatorStub(output) 55 | val annotation = ann.annotate(doc) 56 | 57 | val s = annotation \\ "sentence" 58 | 59 | (s \ "parse").head should equal( 60 | 61 | 62 | 63 | ) (decided by sameElem) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/BunsetsuKerasAnnotatorTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest.FunSuite 22 | import org.scalatest.Matchers._ 23 | 24 | import scala.xml.{NodeSeq, Node} 25 | 26 | class BunsetsuKerasAnnotatorTest extends FunSuite { 27 | 28 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 29 | 30 | def segment(node: Node, properties: Properties): NodeSeq = { 31 | val bunsetsuSplitter = new IPABunsetsuKerasAnnotator("bunsetsuKeras", properties) 32 | bunsetsuSplitter.mkLocalAnnotator.newSentenceAnnotation(node) 33 | } 34 | 35 | val properties = new Properties 36 | properties.setProperty("bunsetsuKeras.model", findPath("./data/keras/bunsetsu_model.h5")) 37 | properties.setProperty("bunsetsuKeras.table", findPath("data/keras/jpnLookupWords.json")) 38 | 39 | test("do chunking") { 40 | 41 | val chunks = segment(Sentences.xml("oneSentence"),properties) \\ "chunk" 42 | 43 | chunks.length should be (2) 44 | } 45 | 46 | object Sentences { 47 | val xml = Map("oneSentence" -> 48 | 49 | 梅が咲いた。 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | ) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/KuromojiAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Takafumi Sakakibara and Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | import org.scalatest._ 22 | 23 | import com.atilika.kuromoji.{TokenBase, TokenizerBase} 24 | import com.atilika.kuromoji.ipadic.{Token=>IToken, Tokenizer=>ITokenizer} 25 | 26 | class KuromojiAnnotatorSpec extends FlatSpec with Matchers { 27 | 28 | "Annotator" should "assign token id using sentence id" in { 29 | 30 | val annotator = KuromojiAnnotator.fromProps("kuromoji", new Properties) 31 | 32 | val sentence = 33 | val annotated = annotator newSentenceAnnotation sentence 34 | 35 | val tokenId = annotated \\ "token" \@ "id" 36 | tokenId should be ("a_0") 37 | } 38 | 39 | "TokenAnnotator" should "segment into tokens" in { 40 | val annotator = KuromojiAnnotator.fromProps("kuromoji[tokenize]", new Properties) 41 | 42 | val sentence = 43 | val annotated = annotator newSentenceAnnotation sentence 44 | 45 | val token = annotated \\ "token" 46 | token \@ "form" should be ("あ") 47 | token \@ "pos" should be ("") 48 | } 49 | 50 | "POSAnnotator" should "assign POS tags" in { 51 | val annotator = KuromojiAnnotator.fromProps("kuromoji[pos]", new Properties) 52 | 53 | val sentence = 54 | 55 | 56 | 57 | 58 | 59 | val annotated = annotator newSentenceAnnotation sentence 60 | 61 | val token = annotated \\ "token" 62 | token \@ "pos" should not be ("") 63 | token \@ "dummy" should be ("a") // not removed (overriden) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/MecabAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | import org.scalatest._ 22 | 23 | class MecabAnnotatorSpec extends BaseAnnotatorSpec { 24 | 25 | def stubCom(output: String) = new StubExternalCommunicator(output) 26 | def mapCom(responces: Map[String, String]) = new MapStubExternalCommunicator(responces) 27 | 28 | def newIPA(mkCom: ()=>IOCommunicator, threads: Int = 1, p: Properties = new Properties) = 29 | new IPAMecabAnnotator("mecab", p) { 30 | override def mkLocalAnnotator = new IPALocalMecabAnnotator { 31 | override def mkCommunicator = mkCom() 32 | } 33 | override def nThreads = threads 34 | } 35 | 36 | "Annotator with nThreads=1" should "be able to annotate one sentence" in { 37 | val s = "a" 38 | val in = a 39 | val out = """a 名詞,固有名詞,組織,*,*,*,* 40 | EOS""" 41 | val annotator = newIPA(()=>stubCom(out), threads=1) 42 | val result = annotator.annotate(in) 43 | val tokens = result \\ "token" 44 | tokens.size should be(1) 45 | (tokens(0) \@ "pos") should be("名詞") 46 | 47 | result \\ "tokens" \@ "annotators" should be("mecab") 48 | } 49 | 50 | "Annotator with nThreads=2" should "annotate in parallel" in { 51 | val responces = Map( 52 | "a" -> """a 名詞,固有名詞,*,*,*,*,* 53 | EOS""", 54 | "b" -> """b 動詞,*,*,*,*,*,* 55 | EOS""", 56 | "c" -> """c 形容詞,*,*,*,*,*,* 57 | EOS""" 58 | ) 59 | val in = 60 | 61 | 62 | a 63 | b 64 | c 65 | 66 | 67 | 68 | 69 | val annotator = newIPA(()=>mapCom(responces), threads=2) 70 | val result = annotator.annotate(in) 71 | 72 | val sentences = result \\ "sentence" 73 | sentences.size should be(3) 74 | ((sentences(0) \\ "token")(0) \@ "form") should be("a") 75 | ((sentences(1) \\ "token")(0) \@ "form") should be("b") 76 | ((sentences(2) \\ "token")(0) \@ "form") should be("c") 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/PipelineSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2018 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | import org.scalatest._ 21 | import scala.xml._ 22 | import jigg.util.{XMLUtil, JSONUtil} 23 | 24 | class PipelineSpec extends BaseAnnotatorSpec { 25 | 26 | class StubMecabAnnotator(n: String, p: Properties) 27 | extends IPAMecabAnnotator(n, p) { 28 | override def mkLocalAnnotator = new IPALocalMecabAnnotator { 29 | override def mkCommunicator = new StubExternalCommunicator("aaa") 30 | } 31 | } 32 | 33 | class DummyPipeline(p: Properties) extends Pipeline(p) { 34 | override def getAnnotator(name: String) = name match { 35 | case "dummy" => new StubMecabAnnotator(name, p) 36 | case _ => super.getAnnotator(name) 37 | } 38 | } 39 | 40 | "-Threads option" should "be able to customize each annotator's number of threads" in { 41 | val p = new Properties 42 | p.setProperty("annotators", "ssplit,dummy") 43 | p.setProperty("nThreads", "2") 44 | p.setProperty("dummy.nThreads", "4") 45 | 46 | val pipeline = new DummyPipeline(p) 47 | 48 | val annotators = pipeline.annotatorList 49 | annotators(0).name should equal("ssplit") 50 | annotators(0).nThreads should equal(2) 51 | annotators(1).name should equal("dummy") 52 | annotators(1).nThreads should equal(4) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/RequirementSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import org.scalatest._ 21 | 22 | class RequirementSpec extends FlatSpec with Matchers { 23 | 24 | "Tokenize" should "be satisfied when TokenizeWithIPA is satisfied" in { 25 | 26 | val satisfied = RequirementSet(JaRequirement.TokenizeWithIPA) 27 | val requires: Set[Requirement] = Set(Requirement.Tokenize) 28 | 29 | val lacked = satisfied.lackedIn(requires) 30 | lacked shouldBe empty 31 | } 32 | 33 | "TokenizedWithIPA" should "not be satisifed when Tokenize is satisfied" in { 34 | 35 | val satisfied = RequirementSet(Requirement.Tokenize) 36 | val requires: Set[Requirement] = Set(JaRequirement.TokenizeWithIPA) 37 | 38 | val lacked = satisfied.lackedIn(requires) 39 | lacked shouldBe Set(JaRequirement.TokenizeWithIPA) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/SyntaxNetAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest._ 22 | import scala.xml._ 23 | 24 | class SyntaxNetAnnotatorSpec extends BaseAnnotatorSpec { 25 | 26 | class POSAnnotatorStub(output: String) extends 27 | SyntaxNetPOSAnnotator("syntaxnetpos", new Properties) { 28 | 29 | override def run(input: String) = output.split("\n").toStream 30 | } 31 | 32 | "POSAnnotator" should "annotate all sentences across documents" in { 33 | 34 | val root = 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | val output = """1 a _ A A _ 0 A _ _ 50 | 51 | 1 b _ B B _ 0 B _ _ 52 | 2 c _ C C _ 0 C _ _ 53 | 54 | 1 c _ D D _ 0 D _ _ 55 | """ 56 | 57 | val annotator = new POSAnnotatorStub(output) 58 | val annotated = annotator.annotate(root) 59 | 60 | annotated should equal ( 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | ) (decided by sameElem) 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/jigg/util/JSONUtilSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | package jigg.pipeline 4 | 5 | import org.scalatest.FunSuite 6 | import org.scalatest.Matchers._ 7 | 8 | class JSONUtilSpec extends FunSuite{ 9 | import org.json4s._ 10 | import org.json4s.jackson.JsonMethods._ 11 | 12 | val testNode = 13 | 14 | 15 | Test Node 16 | 17 | 18 | val goldJSON = 19 | parse( 20 | """ 21 | { 22 | ".tag" : "root", 23 | ".child" : [ { 24 | ".tag" : "document", 25 | "id" : "d0", 26 | "text" : "Test Node" 27 | } ] 28 | } 29 | """ 30 | ) 31 | 32 | /** 33 | * For handling a backslash. 34 | */ 35 | val testNodeForBackslash = 36 | 37 | 38 | Test Node 39 | 40 | 41 | 42 | val goldJSONForBackSlash = 43 | parse( 44 | """{".tag":"root",".child": 45 | [{".tag":"document","id":"d0\\N","text":"Test Node"} 46 | ] 47 | }""" 48 | ) 49 | 50 | /** 51 | * For handling escaped strings. 
52 | */ 53 | val testNodeForEscaping = 54 | 55 | "}> 56 | {"quot\" amp&"} 57 | 58 | 59 | {"new line\n \n tab\t \t carriage return\r \r backslash\\ \\"} 60 | 61 | 62 | 63 | val goldJSONForEscaping = 64 | parse( 65 | """{".tag":"root",".child": 66 | [{".tag":"document","id":"","text":"quot\" amp&"}, 67 | {".tag":"document", "id":"d1", "text": "new line\n \n tab\t \t carriage return\r \r backslash\\ \\"} 68 | ] 69 | }""" 70 | ) 71 | 72 | val testJSONForEscaping = 73 | parse( 74 | """{".tag":"root",".child": 75 | [{".tag":"document","id":"<d0>","text":"&Test Node"amp;"} 76 | ] 77 | }""" 78 | ) 79 | 80 | /** 81 | * Unit testing toJSON 82 | */ 83 | test("toJSON should generate formatted String object from scala.xml.Node"){ 84 | parse(JSONUtil.toJSON(testNode)) should be (goldJSON) 85 | parse(JSONUtil.toJSON(testNodeForBackslash)) should be (goldJSONForBackSlash) 86 | parse(JSONUtil.toJSON(testNodeForEscaping)) should be (goldJSONForEscaping) 87 | } 88 | /** 89 | * Unit testing JSON to XML 90 | */ 91 | test("toXML should generate xml.Node"){ 92 | val xmlFromJSON = JSONUtil.toXML(goldJSON) 93 | val xmlFromJSONWithBackslash = JSONUtil.toXML(goldJSONForBackSlash) 94 | val xmlFromJSONWithEscapeChar = JSONUtil.toXML(testJSONForEscaping) 95 | xmlFromJSON should be ({"Test Node"}) 96 | xmlFromJSONWithBackslash should be ({"Test Node"}) 97 | xmlFromJSONWithEscapeChar should be ("}>{"&Test Node\"amp;"}) 98 | } 99 | } 100 | --------------------------------------------------------------------------------
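JSONUtilSpec above pins down the XML↔JSON round-trip behaviour of `jigg.util.JSONUtil`. As a quick usage reference, a minimal sketch built only on the two calls the spec exercises — `JSONUtil.toJSON` (scala.xml.Node to JSON string) and `JSONUtil.toXML` (parsed json4s `JValue` back to a Node) — is given below; the `JsonRoundTrip` wrapper object is illustrative, not part of the repository.

import scala.xml.Node

import org.json4s.jackson.JsonMethods.parse

import jigg.util.JSONUtil

// Round-trip sketch: serialize a jigg-style annotation node to JSON,
// then convert the parsed JSON back into XML.
object JsonRoundTrip {
  def roundTrip(node: Node): Node = {
    val json: String = JSONUtil.toJSON(node)  // Node -> JSON string
    JSONUtil.toXML(parse(json))               // JValue -> Node
  }
}

// e.g. JsonRoundTrip.roundTrip(<root><document id="d0">Test Node</document></root>)
// is expected to give back an equivalent <root> node, which is the property
// the spec's toJSON/toXML tests check.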