├── .checker
├── README.md
├── scripts
│ ├── before-install.sh
│ ├── install-annotators.sh
│ ├── install-benepar.sh
│ ├── install-cabocha.sh
│ ├── install-crf.sh
│ ├── install-depccg.sh
│ ├── install-jar.sh
│ ├── install-juman.sh
│ ├── install-knp.sh
│ ├── install-mecab.sh
│ ├── install-other-languages.sh
│ ├── install-syntaxnet.sh
│ ├── install-udpipe.sh
│ ├── run-test.sh
│ └── set-env.sh
├── setup.cfg
└── tests
│ ├── basetest.py
│ ├── benepar
│ └── test_benepar.py
│ ├── cabocha
│ └── test_cabocha.py
│ ├── comparison.py
│ ├── constant.py
│ ├── corenlp
│ ├── test_berkeleyparser_dcoref.py
│ ├── test_dcoref.py
│ ├── test_ssplit.py
│ └── test_tokenize.py
│ ├── corenlp_other_languages
│ ├── test_chinese_coref.py
│ └── test_french_depparse.py
│ ├── depccg
│ └── test_depccg_ccg.py
│ ├── example_test.py
│ ├── juman
│ └── test_juman.py
│ ├── knp
│ └── test_knp.py
│ ├── mecab
│ └── test_mecab.py
│ ├── syntaxnet
│ └── test_syntaxnet.py
│ └── udpipe
│ ├── test_udpipe_parse.py
│ └── test_udpipe_tokenize.py
├── .dockerignore
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── bin
├── sbt
└── sbt-launch.jar
├── build.sbt
├── docker-compose.yml
├── dockers
├── knp
│ └── Dockerfile
└── syntaxnet
│ └── Dockerfile
├── jar
└── easyccg.jar
├── project
├── build.properties
├── buildinfo.sbt
└── plugins.sbt
├── python
├── README.md
├── pipeline_example.py
├── pyjigg
│ ├── __init__.py
│ └── pipeline.py
└── setup.py
├── script
└── release.sh
└── src
├── main
├── resources
│ └── python
│ │ ├── _depccg.py
│ │ ├── bene_par.py
│ │ └── udpipe.py
└── scala
│ └── jigg
│ ├── ml
│ ├── Example.scala
│ ├── FeatureBase.scala
│ ├── FeatureIndexer.scala
│ ├── FeatureUtil.scala
│ ├── LinearClassifier.scala
│ ├── LogLinearAdaGradL1.scala
│ ├── LogLinearClassifier.scala
│ ├── LogLinearSGD.scala
│ ├── OnlineLogLinearTrainer.scala
│ ├── OnlineTrainer.scala
│ ├── Perceptron.scala
│ ├── WeightVector.scala
│ └── keras
│ │ ├── Convolution1D.scala
│ │ ├── Dense.scala
│ │ ├── Embedding.scala
│ │ ├── Empty.scala
│ │ ├── Flatten.scala
│ │ ├── Functor.scala
│ │ ├── KerasModel.scala
│ │ ├── KerasParser.scala
│ │ ├── README.md
│ │ ├── Relu.scala
│ │ ├── Sigmoid.scala
│ │ ├── Softmax.scala
│ │ └── Tanh.scala
│ ├── nlp
│ └── ccg
│ │ ├── CCGBank.scala
│ │ ├── CCGBank2EnjuXML.scala
│ │ ├── CCGBankToCabochaFormat.scala
│ │ ├── CalcCoverage.scala
│ │ ├── EvalParser.scala
│ │ ├── EvalSuperTagger.scala
│ │ ├── GoldBunsetsuDepInCabocha.scala
│ │ ├── LoadDumpedTaggerModel.scala
│ │ ├── Opts.scala
│ │ ├── OutputCategoryList.scala
│ │ ├── ParserModel.scala
│ │ ├── ParserRunner.scala
│ │ ├── ParserTrainer.scala
│ │ ├── README.md
│ │ ├── RenderCCGDerivation.scala
│ │ ├── SuperTaggerModel.scala
│ │ ├── SuperTaggerRunner.scala
│ │ ├── SuperTaggerTrainer.scala
│ │ ├── TrainParser.scala
│ │ ├── TrainSuperTagger.scala
│ │ ├── lexicon
│ │ ├── Bunsetsu.scala
│ │ ├── CCGBankReader.scala
│ │ ├── CabochaReader.scala
│ │ ├── Category.scala
│ │ ├── CategoryDictionary.scala
│ │ ├── CategoryFeature.scala
│ │ ├── CategoryManager.scala
│ │ ├── CategoryParser.scala
│ │ ├── CategoryTree.scala
│ │ ├── Derivation.scala
│ │ ├── Dictionary.scala
│ │ ├── Direction.scala
│ │ ├── JapaneseDictionary.scala
│ │ ├── MecabReader.scala
│ │ ├── Numbered.scala
│ │ ├── NumberedManager.scala
│ │ ├── ParseTree.scala
│ │ ├── ParseTreeConverer.scala
│ │ ├── PoS.scala
│ │ ├── Sentence.scala
│ │ ├── SimpleDictionary.scala
│ │ ├── Slash.scala
│ │ └── Word.scala
│ │ ├── package.scala
│ │ ├── parser
│ │ ├── Action.scala
│ │ ├── BeamSearchDecoder.scala
│ │ ├── HeadFinder.scala
│ │ ├── KBestDecoder.scala
│ │ ├── Oracle.scala
│ │ ├── Rule.scala
│ │ ├── ShiftReduceFeature.scala
│ │ ├── ShiftReduceFeatureExtractors.scala
│ │ ├── State.scala
│ │ ├── TransitionBasedParser.scala
│ │ └── package.scala
│ │ └── tagger
│ │ ├── MaxentMultiTagger.scala
│ │ ├── SuperTaggingFeature.scala
│ │ ├── SuperTaggingFeatureExtractors.scala
│ │ ├── UserDefinedFeatureExtractors.scala
│ │ └── package.scala
│ ├── pipeline
│ ├── AnnotatingInParallel.scala
│ ├── Annotation.scala
│ ├── AnnotationError.scala
│ ├── Annotator.scala
│ ├── ArgumentError.scala
│ ├── BeneParAnnotator.scala
│ ├── BerkeleyParserAnnotator.scala
│ ├── BunsetsuKerasAnnotator.scala
│ ├── CCGParseAnnotator.scala
│ ├── CabochaAnnotator.scala
│ ├── CandCAnnotator.scala
│ ├── DepCCGAnnotator.scala
│ ├── DocumentAnnotator.scala
│ ├── DocumentKNPAnnotator.scala
│ ├── EasyCCGAnnotator.scala
│ ├── IOCommunicator.scala
│ ├── JumanAnnotator.scala
│ ├── KNPAnnotator.scala
│ ├── KuromojiAnnotator.scala
│ ├── MecabAnnotator.scala
│ ├── OutputConverter.scala
│ ├── Pipeline.scala
│ ├── PipelineServer.scala
│ ├── PropsHolder.scala
│ ├── RegexDocumentAnnotator.scala
│ ├── RegexSentenceAnnotator.scala
│ ├── Requirement.scala
│ ├── SentencesAnnotator.scala
│ ├── SimpleKNPAnnotator.scala
│ ├── SpaceTokenizerAnnotator.scala
│ ├── SsplitKerasAnnotator.scala
│ ├── StanfordCollapsedDependenciesAnnotator.scala
│ ├── StanfordCoreNLPAnnotator.scala
│ ├── StanfordTypedDependenciesAnnotator.scala
│ ├── SyntaxNetAnnotator.scala
│ ├── SystemDict.scala
│ ├── UDPipeAnnotator.scala
│ └── UnmanagedAnnotators.scala
│ └── util
│ ├── ArgumentsParser.scala
│ ├── CoNLLUtil.scala
│ ├── HDF5Object.scala
│ ├── IDGenerator.scala
│ ├── IOUtil.scala
│ ├── JSONUtil.scala
│ ├── LogUtil.scala
│ ├── LookupTable.scala
│ ├── Normalizer.scala
│ ├── Prop.java
│ ├── PropertiesUtil.scala
│ ├── ResourceUtil.scala
│ ├── TreesUtil.scala
│ └── XMLUtil.scala
└── test
├── resources
├── data
│ ├── Japanese.small.lexicon
│ ├── Japanese.unkVerb.lexicon
│ ├── json
│ │ ├── english.ssplit.test.json
│ │ └── japanese.ssplit.test.json
│ ├── keras
│ │ ├── bunsetsu_model.h5
│ │ ├── jpnLookupCharacter.json
│ │ ├── jpnLookupWords.json
│ │ └── ssplit_model.h5
│ ├── ml
│ │ └── keras
│ │ │ ├── convolution1d
│ │ │ ├── convolution1d_gold.csv
│ │ │ ├── convolution1d_input.csv
│ │ │ └── convolution1d_model.h5
│ │ │ ├── dense
│ │ │ ├── dense_gold.csv
│ │ │ ├── dense_input.csv
│ │ │ └── dense_model.h5
│ │ │ ├── embedding
│ │ │ ├── embedding_gold.csv
│ │ │ ├── embedding_input.csv
│ │ │ └── embedding_model.h5
│ │ │ ├── flatten
│ │ │ ├── flatten_gold.csv
│ │ │ ├── flatten_input.csv
│ │ │ └── flatten_model.h5
│ │ │ └── kerasModel
│ │ │ ├── kerasModel_gold.csv
│ │ │ ├── kerasModel_input.csv
│ │ │ └── kerasModel_model.h5
│ ├── template.small.lst
│ ├── template.unkVerb.lst
│ └── xml
│ │ ├── english.ssplit.spaceTokenize.gold.xml
│ │ ├── english.ssplit.test.xml
│ │ ├── japanese.ssplit.kuromoji.gold.xml
│ │ └── japanese.ssplit.test.xml
└── script
│ └── create_small_lst_from_lexicon.py
└── scala
└── jigg
├── ml
└── keras
│ ├── Convolution1DSpec.scala
│ ├── DenseSpec.scala
│ ├── EmbeddingSpec.scala
│ ├── FlattenSpec.scala
│ ├── KerasModelSpec.scala
│ └── KerasParserTest.scala
├── nlp
└── ccg
│ ├── lexicon
│ ├── BunsetsuTest.scala
│ ├── CCGBankReaderTest.scala
│ ├── CategoryFeatureTest.scala
│ ├── CategoryManagerTest.scala
│ ├── CategoryParserTest.scala
│ └── JapaneseDictionaryTest.scala
│ └── parser
│ ├── KBestDecoderTest.scala
│ ├── OracleTest.scala
│ ├── ParsedSentence.scala
│ └── RuleTest.scala
├── pipeline
├── AnnotatorSpec.scala
├── BaseAnnotatorSpec.scala
├── BeneParAnnotatorSpec.scala
├── BerkeleyParserAnnotatorSpec.scala
├── BunsetsuKerasAnnotatorTest.scala
├── CabochaAnnotatorSpec.scala
├── DepCCGAnnotatorSpec.scala
├── DocumentKNPAnnotatorSpec.scala
├── EasyCCGAnnotatorSpec.scala
├── IntermediateInputSpec.scala
├── JumanAnnotatorSpec.scala
├── KuromojiAnnotatorSpec.scala
├── MecabAnnotatorSpec.scala
├── PipelineSpec.scala
├── RegexSentenceAnnotatorTest.scala
├── RequirementSpec.scala
├── SimpleKNPAnnotatorSpec.scala
├── SsplitKerasAnnotatorTest.scala
├── StanfordTypedDependenciesAnnotatorSpec.scala
└── SyntaxNetAnnotatorSpec.scala
└── util
├── CoNLLUtilSpec.scala
├── JSONUtilSpec.scala
├── TreesUtilSpec.scala
└── XMLUtilSpec.scala
/.checker/scripts/before-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # upgrade c++
4 | # add repository
5 | sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
6 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F
7 |
8 | sudo apt update -y && sudo apt install g++-4.9 gcc-4.9 -y
9 |
10 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 10
11 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 20
12 | sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 10
13 | sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 20
14 |
15 | sudo rm /usr/bin/cpp
16 |
17 | sudo update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-4.8 10
18 | sudo update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-4.9 20
19 | sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30
20 | sudo update-alternatives --set cc /usr/bin/gcc
21 | sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30
22 | sudo update-alternatives --set c++ /usr/bin/g++
23 |
--------------------------------------------------------------------------------
/.checker/scripts/install-annotators.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ ${ANNOTATORS} == "udpipe" ];then
4 | echo "Install UDPIPE"
5 | ./.checker/scripts/install-udpipe.sh
6 | elif [ ${ANNOTATORS} == "depccg" ];then
7 | echo "Install DEPCCG"
8 | ./.checker/scripts/install-depccg.sh
9 | elif [ ${ANNOTATORS} == "mecab" ];then
10 | echo "Install MECAB"
11 | ./.checker/scripts/install-mecab.sh
12 | elif [ ${ANNOTATORS} == "cabocha" ];then
13 | echo "Install CABOCHA"
14 | ./.checker/scripts/install-mecab.sh
15 | ./.checker/scripts/install-crf.sh
16 | ./.checker/scripts/install-cabocha.sh
17 | elif [ ${ANNOTATORS} == "juman" ];then
18 | echo "Install JUMAN"
19 | ./.checker/scripts/install-juman.sh
20 | elif [ ${ANNOTATORS} == "knp" ];then
21 | echo "Install KNP"
22 | ./.checker/scripts/install-knp.sh
23 | elif [ ${ANNOTATORS} == "corenlp" ];then
24 | echo "Install CORENLP"
25 | ./.checker/scripts/install-jar.sh
26 | elif [ ${ANNOTATORS} == "corenlp_other_languages" ];then
27 | echo "Install CORENLP OTHER LANGUAGE"
28 | ./.checker/scripts/install-jar.sh
29 | ./.checker/scripts/install-other-languages.sh
30 | elif [ ${ANNOTATORS} == "benepar" ];then
31 | echo "Install BENEPAR"
32 | ./.checker/scripts/install-jar.sh
33 | ./.checker/scripts/install-benepar.sh
34 | elif [ ${ANNOTATORS} == "syntaxnet" ];then
35 | echo "Install SYNTAXNET"
36 | ./.checker/scripts/install-jar.sh
37 | ./.checker/scripts/install-syntaxnet.sh
38 | fi
39 |
--------------------------------------------------------------------------------
/.checker/scripts/install-benepar.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | pip install cython numpy
6 | pip install benepar[cpu]
7 |
8 | python -c 'import benepar; benepar.download("benepar_en2")'
9 |
--------------------------------------------------------------------------------
/.checker/scripts/install-cabocha.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | source ./.checker/scripts/set-env.sh
6 |
7 | home_dir=$(pwd)
8 |
9 | url="https://github.com/taku910/cabocha/archive/master.zip"
10 | file=master.zip
11 | dir=cabocha-master
12 |
13 | # download
14 | wget ${url}
15 |
16 | # unpack
17 | unzip ${file}
18 |
19 | # compile
20 | cd ${home_dir}"/"${dir}
21 | ./autogen.sh
22 | ./configure --with-charset=UTF8
23 | make
24 | make check
25 | sudo make install
26 |
27 | cd ${home_dir}
28 |
--------------------------------------------------------------------------------
/.checker/scripts/install-crf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | home_dir=$(pwd)
6 |
7 | # To get the file id, you need to sign up for a Google account.
8 | url="https://drive.google.com/uc?export=view&id=0B4y35FiV1wh7QVR6VXJ5dWExSTQ"
9 | file=CRF++-0.58.tar.gz
10 | dir=CRF++-0.58
11 |
12 | wget ${url} -O ${file}
13 |
14 | tar -zxvf ${file}
15 |
16 | cd ${home_dir}"/"${dir}
17 | ./configure
18 | make
19 | sudo make install
20 |
21 | cd ${home_dir}
22 |
--------------------------------------------------------------------------------
/.checker/scripts/install-depccg.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | home_dir=$(pwd)
6 |
7 | pip install cython numpy
8 | pip install depccg
9 |
10 | depccg_en download
11 | depccg_ja download
12 |
13 | # en_model_url=http://cl.naist.jp/~masashi-y/resources/depccg/en_hf_tri.tar.gz
14 | # ja_model_url=http://cl.naist.jp/~masashi-y/resources/depccg/ja_hf_ccgbank.tar.gz
15 | # en_model=en_hf_tri.tar.gz
16 | # ja_model=ja_hf_ccgbank.tar.gz
17 |
18 | # model_dir="depccg/models"
19 | # src_dir="depccg/src"
20 |
21 | # # Install cython & chainer.
22 | # pip install -U pip cython
23 | # pip install chainer
24 | # pip install scrapy
25 |
26 | # # Git clone the depccg repository
27 | # git clone https://github.com/masashi-y/depccg.git
28 |
29 | # # download model file.
30 | # wget ${en_model_url}
31 | # wget ${ja_model_url}
32 |
33 | # # make directory saved model file
34 | # mkdir ${model_dir}
35 | # mv ${en_model} ${ja_model} ${model_dir}
36 |
37 | # # compile
38 | # # A default g++ version is 4.8 in Ubuntu 14.04.
39 | # # In depccg compile, it requires the version >= 4.9.
40 | # export CC=g++-4.9
41 | # cd ${home_dir}"/"${src_dir}
42 | # python setup.py build_ext --inplace
43 |
44 | # ln -s depccg*.so depccg.so
45 |
46 | # # unpack model files.
47 | # cd ${home_dir}"/"${model_dir}
48 | # tar -zxvf ${en_model}
49 | # tar -zxvf ${ja_model}
50 |
51 | # cd ${home_dir}
52 |
--------------------------------------------------------------------------------
/.checker/scripts/install-jar.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | home_dir=$(pwd)
6 | jar_dir="jar/"
7 |
8 |
9 | # download stanford corenlp
10 | url=http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip
11 | zip=stanford-corenlp-full-2018-10-05.zip
12 | dir=stanford-corenlp-full-2018-10-05
13 | file=stanford-corenlp-3.9.2.jar
14 | file_model=stanford-corenlp-3.9.2-models.jar
15 |
16 | # download Stanford CoreNLP models
17 | wget ${url}
18 |
19 | # unpack
20 | unzip ${zip}
21 |
22 | cp ${dir}"/"${file} ${jar_dir}
23 | cp ${dir}"/"${file_model} ${jar_dir}
24 |
25 |
26 | # create jigg jar file
27 | jigg_file="target/jigg-assembly-0.8.0.jar"
28 | ./bin/sbt assembly
29 | cp ${jigg_file} ${jar_dir}
30 |
31 |
32 | # download jigg-models
33 | jigg_models="jigg-models.jar"
34 | wget https://github.com/mynlp/jigg-models/raw/master/jigg-models.jar
35 | mv ${jigg_models} ${jar_dir}
36 |
--------------------------------------------------------------------------------
/.checker/scripts/install-juman.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | home_dir=$(pwd)
6 |
7 | url=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2
8 | file=juman-7.01.tar.bz2
9 | dir=juman-7.01
10 |
11 | # download
12 | wget ${url}
13 |
14 | # unpack bz2 file
15 | tar -jxvf ${file}
16 |
17 | # build
18 | cd ${dir}
19 | ./configure
20 | make
21 | sudo make install
22 |
23 | cd ${home_dir}
24 |
--------------------------------------------------------------------------------
/.checker/scripts/install-knp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | docker build -t jigg/jigg:knp -f dockers/knp/Dockerfile .
6 |
--------------------------------------------------------------------------------
/.checker/scripts/install-mecab.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | sudo apt install libmecab2 libmecab-dev mecab mecab-ipadic-utf8 mecab-ipadic mecab-utils
6 |
--------------------------------------------------------------------------------
/.checker/scripts/install-other-languages.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | jar_dir="jar"
4 |
5 | # chinese model jar file
6 | wget http://nlp.stanford.edu/software/stanford-chinese-corenlp-2018-10-05-models.jar
7 | mv stanford-chinese-corenlp-2018-10-05-models.jar ${jar_dir}
8 |
9 | # french model jar file
10 | wget http://nlp.stanford.edu/software/stanford-french-corenlp-2018-10-05-models.jar
11 | mv stanford-french-corenlp-2018-10-05-models.jar ${jar_dir}
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.checker/scripts/install-syntaxnet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | docker build -t jigg/jigg:syntaxnet -f dockers/syntaxnet/Dockerfile .
6 |
--------------------------------------------------------------------------------
/.checker/scripts/install-udpipe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | pip install ufal.udpipe
6 |
7 | # model download
8 | curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2364/udpipe-ud-2.0-170801.zip
9 |
10 | # unpack
11 | unzip udpipe-ud-2.0-170801.zip
12 |
13 | # rename model directory
14 | mv udpipe-ud-2.0-170801 udpipe-ud-model
15 |
--------------------------------------------------------------------------------
/.checker/scripts/run-test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | source .checker/scripts/set-env.sh
6 |
7 | # run a unit test for the files under the directory `.checker/tests/${ANNOTATORS}`.
8 | python3 -m unittest discover -s .checker/tests/${ANNOTATORS}
9 |
--------------------------------------------------------------------------------
/.checker/scripts/set-env.sh:
--------------------------------------------------------------------------------
1 | export JIGG_VERSION="0.8.0"
2 | export CORENLP_VERSION="3.9.2"
3 | export IVY2_CACHE_DIR="${HOME}/.ivy2/cache"
4 |
5 | export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:$LD_LIBRARY_PATH
6 |
--------------------------------------------------------------------------------
/.checker/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
--------------------------------------------------------------------------------
/.checker/tests/constant.py:
--------------------------------------------------------------------------------
1 | JIGG_VERSION = "0.8.0"
2 | CORENLP_VERSION = "3.9.2"
3 |
4 | JIGG_JAR = "target/jigg-assembly-{}.jar".format(JIGG_VERSION)
5 | JIGG_MODEL_JAR = "jigg-models.jar"
6 |
7 | CORENLP_MODEL_JAR = "stanford-corenlp-{}-models.jar".format(CORENLP_VERSION)
8 |
9 |
10 | # URL
11 | # juman
12 | JUMAN_MAIN_URL = "http://nlp.ist.i.kyoto-u.ac.jp/?JUMAN"
13 | JUMAN_DOWNLOAD_URL = "http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2"
14 |
15 | # knp
16 | KNP_MAIN_URL = "http://nlp.ist.i.kyoto-u.ac.jp/?KNP"
17 | KNP_DOWNLOAD_URL = "http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/knp/knp-4.19.tar.bz2"
18 |
19 | # CRF
20 | CRF_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7QVR6VXJ5dWExSTQ"
21 |
22 | # cabocha
23 | CABOCHA_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7SDd1Q1dUQkZQaUU"
24 |
25 | # depccg
26 | DEPCCG_EN_MODLE_URL = "http://cl.naist.jp/~masashi-y/resources/depccg/en_hf_tri.tar.gz"
27 | DEPCCG_JA_MODEL_URL = "http://cl.naist.jp/~masashi-y/resources/depccg/ja_hf_ccgbank.tar.gz"
28 | DEPCCG_GIT_URL = "https://github.com/masashi-y/depccg.git"
29 |
30 | # udpipe
31 | UDPIPE_MODEL_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2364/udpipe-ud-2.0-170801.zip"
32 |
--------------------------------------------------------------------------------
/.checker/tests/corenlp/test_ssplit.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append(".checker/tests")
3 |
4 | from basetest import BaseTest
5 |
6 |
7 | class TestSsplit(BaseTest):
8 |
9 | def setUp(self):
10 |
11 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891."
12 |
13 | self.expected_text = """
14 |
15 |
16 |
17 |
18 | Stanford University is located in California.
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 | It is a great university, founded in 1891.
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 | """
47 |
48 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize,ssplit]'
49 |
50 | def test_ssplit(self):
51 | self.check_equal(self.exe, self.input_text, self.expected_text)
52 |
--------------------------------------------------------------------------------
/.checker/tests/corenlp/test_tokenize.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append(".checker/tests")
3 |
4 | from basetest import BaseTest
5 |
6 |
7 | class TestTokenize(BaseTest):
8 |
9 | def setUp(self):
10 |
11 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891."
12 |
13 | self.expected_text = """
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 | """
41 |
42 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]'
43 |
44 | def test_tokenize(self):
45 | self.check_equal(self.exe, self.input_text, self.expected_text)
46 |
--------------------------------------------------------------------------------
/.checker/tests/example_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append(".checker/tests")
3 |
4 | from basetest import BaseTest
5 |
6 |
7 | class TestName(BaseTest):
8 |     '''
9 |     This is an example (template) unittest file. If you want to
10 |     add a new test file, please copy this file and edit it as
11 |     described below.
12 |
13 |     1. Copy this file
14 |        Copy this file with the following command:
15 |        ```
16 |        cp example_test.py {ANNOTATORS}/test_***.py
17 |        ```
18 |        {ANNOTATORS} is the annotator name.
19 |        The file must be named `test_***.py`, where `***` is any name.
20 |        Note that the file name must start with `test`, for example,
21 |        `test_tokenize.py`.
22 |     2. Change the class name
23 |        For each test case, change the class name from
24 |        TestName to Test***, where `***` is any name, for example,
25 |        Tokenize, Ssplit, etc.
26 |     3. Change three variables in the setUp() function
27 |        - self.input_text : a sample text used for the test
28 |        - self.expected_text : the output text expected from the test run
29 |        - self.exe : an execution command
30 |        This program runs with the sbt runMain command. For example,
31 |        `sbt "runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]"`.
32 |        Set the "runMain ~" part in the variable `self.exe`.
33 |     4. Change the function name
34 |        For each test case, also change the function name
35 |        from test_name to test_***, where `***` is any name, for example,
36 |        tokenize, ssplit, etc. Note that the function name
37 |        must start with `test`.
38 |
39 |     For example, for the annotator `pos`:
40 |       1. file name -> test_pos.py
41 |       2. class name -> class TestPos(BaseTest):
42 |       3. variables ->
43 |          self.input_text = "This is a sample text."
44 |          self.expected_text = "[the result text]"
45 |          self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize,ssplit,pos]'
46 |       4. function name -> def test_pos(self):
47 |     '''
48 | def setUp(self):
49 | # Set an input (sample) text
50 | self.input_text = ""
51 |
52 | # Set an expected text
53 | self.expected_text = ""
54 |
55 |         # Set an execution command.
56 |         # You need to change the `-annotators` option according to the test case.
57 |         # For example, for the annotator `lemma`, use corenlp[tokenize,ssplit,pos,lemma].
58 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]'
59 |
60 | def test_name(self):
61 |         # The function check_equal() is defined in the superclass BaseTest.
62 | self.check_equal(self.exe, self.input_text, self.expected_text)
63 |
--------------------------------------------------------------------------------
/.checker/tests/udpipe/test_udpipe_tokenize.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append(".checker/tests")
3 |
4 | from basetest import BaseTest
5 |
6 |
7 | class TestUDpipeTokenize(BaseTest):
8 |
9 | def setUp(self):
10 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891."
11 |
12 | self.expected_text = r"""
13 |
14 |
15 |
16 |
17 | Stanford University is located in California.
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 | It is a great university, founded in 1891.
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 | """
46 |
47 | self.exe = 'runMain jigg.pipeline.Pipeline ' \
48 | + '-annotators udpipe[tokenize] ' \
49 | + '-udpipe.model udpipe-ud-model/english-ud-2.0-170801.udpipe '
50 |
51 | def test_udpipe_tokenize(self):
52 | self.check_equal(self.exe, self.input_text, self.expected_text)
53 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | *
2 | !bin
3 | !project
4 | !python
5 | !script
6 | !src
7 | !build.sbt
8 | !jar
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.o
3 | *.pyc
4 | .lock*
5 | .waf*
6 | *.class
7 | build/
8 | target/
9 | .idea/
10 | models/
11 | tools/
12 | download
13 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 |
3 | scala:
4 | - 2.11.8
5 |
6 | dist: trusty
7 |
8 | env:
9 | - ANNOTATORS=corenlp
10 | - ANNOTATORS=corenlp_other_languages
11 | - ANNOTATORS=udpipe
12 | - ANNOTATORS=depccg
13 | - ANNOTATORS=juman
14 | - ANNOTATORS=knp
15 | - ANNOTATORS=mecab
16 | - ANNOTATORS=cabocha
17 | - ANNOTATORS=benepar
18 | # - ANNOTATORS=syntaxnet
19 |
20 | before_install:
21 | - ./.checker/scripts/before-install.sh
22 | - pyenv global system 3.6
23 | - virtualenv --python=python3.6 .venv
24 | - source .venv/bin/activate
25 | - pip install --upgrade pip
26 |
27 | install:
28 | - ./.checker/scripts/install-annotators.sh
29 |
30 | script:
31 | - .checker/scripts/run-test.sh
32 |
33 | branches:
34 | only:
35 | - master
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM openjdk:8
2 |
3 | WORKDIR /jigg
4 |
5 | # Download dependencies
6 | COPY build.sbt /jigg/
7 | COPY project/*.sbt project/build.properties /jigg/project/
8 | COPY bin /jigg/bin
9 | RUN bin/sbt update
10 |
11 | # Build
12 | COPY src /jigg/src
13 | COPY jar /jigg/jar
14 | RUN bin/sbt assembly
15 |
16 | # Run a simple test
17 | RUN echo "テレビで自転車で走っている少女を見た" |\
18 | java -Xms1024M -Xmx1024M -cp "target/*:jar/jigg-models.jar" \
19 | jigg.pipeline.Pipeline -annotators ssplit,kuromoji,jaccg
20 |
--------------------------------------------------------------------------------
/bin/sbt:
--------------------------------------------------------------------------------
1 | java -Dfile.encoding=UTF-8 -Xms512M -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=384M -jar `dirname $0`/sbt-launch.jar "$@"
2 |
--------------------------------------------------------------------------------
/bin/sbt-launch.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/bin/sbt-launch.jar
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | jigg:
4 | build: .
5 | ports:
6 | - 8080:8080
7 | entrypoint:
8 | - java
9 | - -Xms1024M
10 | - -Xmx1024M
11 | - -cp
12 | - "target/*:jar/*"
13 | - jigg.pipeline.PipelineServer
14 | - -host
15 | - 0.0.0.0
16 | volumes:
17 | - ./script:/jigg/script
18 | - ./jar:/jigg/jar
19 |
--------------------------------------------------------------------------------
/dockers/knp/Dockerfile:
--------------------------------------------------------------------------------
1 | # To build an image from this file, run the following command in the `jigg/` directory:
2 | # ```
3 | # docker build -t {image name}:{tag} -f dockers/knp/Dockerfile .
4 | # ```
5 | FROM jigg/jigg-dockers:knp
6 |
7 | WORKDIR /jigg
8 |
9 | ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/bin:/usr/local/lib
10 | ENV PATH $PATH:$HOME/usr/bin
11 |
12 | COPY build.sbt /jigg/
13 | COPY project/*.sbt project/build.properties /jigg/project/
14 | COPY bin /jigg/bin
15 | RUN bin/sbt update
16 |
17 | # Build
18 | COPY src /jigg/src
19 | COPY jar /jigg/jar
20 | RUN bin/sbt assembly
--------------------------------------------------------------------------------
/dockers/syntaxnet/Dockerfile:
--------------------------------------------------------------------------------
1 | # To build an image from this file, run the following command in the 'jigg/' directory:
2 | # ```
3 | # docker build -t {image name}:{tag} -f dockers/syntaxnet/Dockerfile .
4 | # ```
5 |
6 | FROM tensorflow/syntaxnet
7 |
8 | WORKDIR /jigg
9 |
10 | RUN apt-get update -y && apt-get install -y less wget tar bzip2 unzip sudo make gcc g++ libz-dev
11 |
12 | # install jigg
13 | COPY build.sbt /jigg/
14 | COPY project/*.sbt project/build.properties /jigg/project/
15 | COPY bin /jigg/bin
16 | RUN bin/sbt update
17 |
18 | # Build
19 | COPY src /jigg/src
20 | COPY jar /jigg/jar
21 | RUN bin/sbt assembly
22 |
--------------------------------------------------------------------------------
/jar/easyccg.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/jar/easyccg.jar
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=1.1.0
2 |
--------------------------------------------------------------------------------
/project/buildinfo.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.7.0")
2 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
2 |
3 | // for sbt-sonatype (https://github.com/xerial/sbt-sonatype)
4 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.1")
5 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0")
6 |
--------------------------------------------------------------------------------
/python/pipeline_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from pyjigg import Pipeline
4 | import xml.etree.ElementTree as ET
5 | import json
6 |
7 | '''Example to use Jigg from python.
8 |
9 | Before using this, users must start the PipelineServer in a command line, e.g.:
10 | $ cd jigg-0.6.2/
11 | $ java -Xmx4g -cp "*" jigg.pipeline.PipelineServer
12 | '''
13 |
14 | if __name__ == '__main__':
15 | pipeline = Pipeline('http://localhost:8080')
16 |
17 | text1 = """This is the first sentence. This is the second sentence."""
18 |
19 | text2 = """This is the third sentence. This is the forth sentence."""
20 |
21 | output1 = pipeline.annotate(text1, {
22 | 'annotators': 'corenlp[tokenize,ssplit]',
23 | 'outputFormat': 'xml'})
24 |     print(ET.tostring(output1))
25 |
26 | output2 = pipeline.annotate(text2, {
27 | 'annotators': 'corenlp[tokenize,ssplit]',
28 | 'outputFormat': 'json'})
29 |     print(json.dumps(output2, indent=4))
30 |
--------------------------------------------------------------------------------
/python/pyjigg/__init__.py:
--------------------------------------------------------------------------------
1 | from pyjigg.pipeline import Pipeline
2 |
--------------------------------------------------------------------------------
/python/pyjigg/pipeline.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import xml.etree.ElementTree as ET
4 | import json
5 | import requests
6 |
7 | JIGG = 'jigg-0.6.2'
8 |
9 | class Pipeline:
10 |
11 | def __init__(self, server_url):
12 | if server_url[-1] == '/':
13 | server_url = server_url[:-1]
14 | self.server_url = server_url
15 |
16 | def annotate(self, text, properties=None):
17 | assert isinstance(text, str)
18 | if properties is None:
19 | properties = {}
20 | else:
21 | assert isinstance(properties, dict)
22 |
23 | # Checks that the Jigg Pipeline server is started.
24 | try:
25 | requests.get(self.server_url)
26 | except requests.exceptions.ConnectionError:
27 | raise Exception('Check whether you have started the Jigg\'s PipelineServer e.g.\n'
28 | '$ cd %s/ \n'
29 | '$ java -Xmx4g -cp "*" jigg.pipeline.PipelineServer' % (JIGG))
30 |
31 | url = self.server_url + '/annotate'
32 | text = text.encode()
33 | data = properties.copy()
34 | data['q'] = text
35 | r = requests.post(url, data=data)
36 | output = r.text
37 | if ('outputFormat' in properties and properties['outputFormat'] == 'json'):
38 | try:
39 | output = json.loads(output, encoding='utf-8', strict=True)
40 | except:
41 | pass
42 | else:
43 | try:
44 | output = ET.fromstring(output)
45 | except:
46 | pass
47 |
48 | return output
49 |
--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name = "pyjigg",
5 | packages=['pyjigg'],
6 | version = "0.1.0",
7 | )
8 |
--------------------------------------------------------------------------------
/script/release.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Usage: ./script/release.sh <version> (e.g., ./script/release.sh 0.7.2)
4 |
5 | version=$1
6 | corenlp_url='http://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip'
7 | corenlp_model='stanford-corenlp-3.9.1-models.jar'
8 | jigg_url='git@github.com:mynlp/jigg.git'
9 |
10 | corenlp_zip=${corenlp_url##*/}
11 | corenlp_dir=${corenlp_zip%.*}
12 |
13 | if [[ ! -e jigg-${version} ]]; then mkdir jigg-${version}; fi
14 | cd jigg-${version}
15 |
16 | # get jigg, if needed
17 | if [[ ! -e jigg ]]; then
18 | git clone $jigg_url
19 | fi
20 |
21 | # add corenlp model
22 | if [[ ! -e ${corenlp_dir} ]]; then
23 | wget ${corenlp_url} -O ${corenlp_zip}
24 | unzip ${corenlp_zip}
25 | mv ${corenlp_dir}/${corenlp_model} jigg
26 | fi
27 |
28 | # add assembled jigg
29 | if [[ ! -e jigg/jigg-$1.jar ]]; then
30 | cd jigg
31 | ./bin/sbt assembly
32 | mv target/jigg-assembly-$1.jar jigg-$1.jar
33 | ./bin/sbt clean
34 | cd ../
35 | fi
36 |
37 | for f in 'src/test' '.checker' '.git' 'project' 'target'; do
38 | if [[ -e jigg/$f ]]; then
39 | rm -rf jigg/$f
40 | fi
41 | done
42 |
43 | if [[ -e jigg/.git ]]; then
44 | rm -rf jigg/.git
45 | fi
46 |
47 | # if [[ -e jigg/src/test ]]; then
48 | # rm -rf jigg/src/test
49 | # fi
50 |
51 | # if [[ -e jigg/.checker ]]; then rm -rf jigg/.checker; fi
52 |
53 | # if [[ -e jigg/project ]]; then rm -rf jigg/project; fi
54 | # if [[ -e jigg/target ]]; then rm -rf jigg/target; fi
55 |
56 | # add jigg models (berkeley parser model inside)
57 | if [[ ! -e jigg/jigg-models.jar ]]; then
58 | cd jigg
59 | wget https://github.com/mynlp/jigg-models/raw/master/jigg-models.jar
60 | cd ../
61 | fi
62 |
63 | mv jigg jigg-${version}
64 | zip -r jigg-${version}.zip jigg-${version}
65 |
--------------------------------------------------------------------------------
/src/main/resources/python/bene_par.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function, unicode_literals
3 | import sys
4 |
5 | import benepar
6 |
7 | # In Python2, wrap sys.stdin and sys.stdout to work with unicode.
8 | if sys.version_info[0] < 3:
9 | import codecs
10 | import locale
11 | encoding = locale.getpreferredencoding()
12 | sys.stdin = codecs.getreader(encoding)(sys.stdin)
13 | sys.stdout = codecs.getwriter(encoding)(sys.stdout)
14 |
15 | if sys.version_info.major == 3:
16 | raw_input = input
17 |
18 | model = sys.argv[1] # maybe "benepar_en"
19 |
20 | parser = benepar.Parser(model)
21 |
22 | def parse(tokens, tags):
23 | sentence = list(zip(tokens, tags))
24 | parse_raw, tags_raw, sentence = next(parser._batched_parsed_raw([(tokens, sentence)]))
25 | tree = parser._make_nltk_tree(sentence, tags_raw, *parse_raw)
26 | return tree
27 |
28 | while True:
29 | tokens = raw_input()
30 | tags = raw_input()
31 |
32 | tokens = tokens.split(' ')
33 | tags = tags.split(' ')
34 |
35 | tree = parse(tokens, tags)
36 | print(tree)
37 | print("END")
38 |
--------------------------------------------------------------------------------
/src/main/resources/python/udpipe.py:
--------------------------------------------------------------------------------
1 |
2 | from __future__ import print_function, unicode_literals
3 | import sys
4 |
5 | from ufal.udpipe import Model, Pipeline, ProcessingError
6 |
7 | # In Python2, wrap sys.stdin and sys.stdout to work with unicode.
8 | if sys.version_info[0] < 3:
9 | import codecs
10 | import locale
11 | encoding = locale.getpreferredencoding()
12 | sys.stdin = codecs.getreader(encoding)(sys.stdin)
13 | sys.stdout = codecs.getwriter(encoding)(sys.stdout)
14 |
15 | if sys.version_info.major == 3:
16 | raw_input = input
17 |
18 | # To reduce the overhead, we divide the possible pipeline patterns into the following cases.
19 | _MODE_ = ['all', 'tok|pos', 'pos|par', 'tok', 'pos', 'par']
20 |
21 | model = sys.argv[1]
22 | mode = sys.argv[2] # one of _MODE_
23 |
24 | model = Model.load(model)
25 |
26 | if mode == 'all' or mode.find('tok') >= 0: input_format = 'tokenize'
27 | else: input_format = 'conllu'
28 | output_format = 'conllu'
29 |
30 | if mode == 'all' or mode.find('pos') >= 0: pos = Pipeline.DEFAULT
31 | else: pos = Pipeline.NONE
32 |
33 | if mode == 'all' or mode.find('par') >= 0: parse = Pipeline.DEFAULT
34 | else: parse = Pipeline.NONE
35 |
36 | pipeline = Pipeline(
37 | model, input_format, pos, parse, output_format)
38 | error = ProcessingError()
39 |
40 | while True:
41 | inputs = []
42 | while True:
43 | line = raw_input()
44 | if line == '####EOD####': break
45 | inputs.append(line)
46 |
47 | result = pipeline.process('\n'.join(inputs), error)
48 | print(result)
49 | print('END')
50 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/Example.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | case class Example[L](featVec:Array[Int], label:L)
20 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/FeatureBase.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | trait FeatureBase
20 |
21 | // An unlabeled feature, though not necessarily: a user may want to create features that always carry a label (e.g., in structured classification). In that case, include the label in this class and ignore LabeledFeature.
22 | trait Feature extends FeatureBase {
23 | type LabelType
24 | type DictionaryType
25 | def assignLabel(label:LabelType): LabeledFeature[LabelType]
26 | def concat(items:Any*): String = items.mkString("_###_")
27 | }
28 |
29 | trait LabeledFeature[L] extends FeatureBase {
30 | def unlabeled: Feature
31 | def label: L
32 | }
33 |
--------------------------------------------------------------------------------
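The Feature/LabeledFeature split above is meant to be filled in by concrete feature classes. A minimal sketch follows (UnigramFeature and LabeledUnigram are hypothetical names, not part of the repository; it assumes the compiled jigg.ml classes are on the classpath): an unlabeled feature produces its labeled counterpart via assignLabel.

```scala
import jigg.ml.{Feature, LabeledFeature}

// Hypothetical labeled counterpart: pairs the unlabeled feature with a label.
case class LabeledUnigram(unlabeled: UnigramFeature, label: String) extends LabeledFeature[String]

// Hypothetical unlabeled feature over a single word.
case class UnigramFeature(word: String) extends Feature {
  type LabelType = String
  type DictionaryType = Nothing                 // unused in this sketch
  def assignLabel(label: String) = LabeledUnigram(this, label)
  override def toString = concat("w", word)     // "w_###_the" for word = "the"
}
```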
/src/main/scala/jigg/ml/FeatureIndexer.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.collection.mutable.{HashMap, ArrayBuffer}
20 |
21 | @SerialVersionUID(1L)
22 | trait FeatureIndexer[Feature] extends Serializable {
23 | def size: Int
24 |
25 | /** Mutable indexing method which may add a new entry into the backbone map
26 | */
27 | def getIndex(key: Feature): Int
28 |
29 | /** Immutable indexing, -1 for unknown entry.
30 | */
31 | def get(key: Feature) = getIndex(key)
32 | }
33 |
34 | @SerialVersionUID(1L)
35 | class ExactFeatureIndexer[Feature](val map: HashMap[Feature, Int]) extends FeatureIndexer[Feature] {
36 |
37 | def size: Int = map.size
38 |
39 | def getIndex(key: Feature) = map.getOrElseUpdate(key, map.size)
40 |
41 | override def get(key: Feature) = map.getOrElse(key, -1)
42 | }
43 |
44 | /** FeatureIndexer with hash trick. Hash value is calculated with MurmurHash3.
45 | *
46 | * Pros of this approach are:
47 | * 1) Very memory efficient; we don't have to hold a hashmap for millions of feature objects;
48 | * 2) Small loading time of model.
49 | *
50 | * The expense is a small loss of accuracy but usually this is really small...
51 | */
52 | @SerialVersionUID(1L)
53 | class HashedFeatureIndexer[Feature] private(
54 | val maxFeatureSize: Int,
55 | val hasher: (Feature => Int)) extends FeatureIndexer[Feature] {
56 |
57 | def size = maxFeatureSize
58 |
59 | def getIndex(key: Feature) = (math.abs(hasher(key)) % maxFeatureSize)
60 | }
61 |
62 | object HashedFeatureIndexer {
63 | def apply[Feature](
64 | maxFeatureSize: Int = (2 << 23),
65 | hasher: (Feature => Int) = {f: Feature => f.hashCode()}) = {
66 |
67 | val biggestPrimeBelow = primes.takeWhile(maxFeatureSize > _).last
68 | new HashedFeatureIndexer[Feature](biggestPrimeBelow, hasher)
69 | }
70 |
71 | private lazy val primes = 2 #:: sieve(3)
72 |
73 | private def sieve(n: Int): Stream[Int] =
74 | if (primes.takeWhile(p => p*p <= n).exists(n % _ == 0)) sieve(n + 2)
75 | else n #:: sieve(n + 2)
76 | }
77 |
--------------------------------------------------------------------------------
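A minimal sketch (not part of the repository) contrasting the two indexers defined above: ExactFeatureIndexer keeps an explicit feature-to-index map, while HashedFeatureIndexer maps any feature into a fixed range by hashing and stores no map at all.

```scala
import jigg.ml.{ExactFeatureIndexer, HashedFeatureIndexer}
import scala.collection.mutable.HashMap

object IndexerSketch extends App {
  val exact = new ExactFeatureIndexer(new HashMap[String, Int])
  exact.getIndex("w=the")                     // 0, added to the map
  exact.getIndex("w=cat")                     // 1
  println(exact.get("w=dog"))                 // -1: immutable lookup of an unseen feature

  val hashed = HashedFeatureIndexer[String]() // size = the largest prime below 2 << 23
  println(hashed.getIndex("w=dog"))           // always in [0, size); nothing is stored
}
```

As the comment above notes, the hashed variant trades a small chance of feature collisions for constant memory and fast model loading.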
/src/main/scala/jigg/ml/FeatureUtil.scala:
--------------------------------------------------------------------------------
1 | // package jigg.ml
2 |
3 | // import scala.collection.mutable.{Map => mMap}
4 | // import scala.collection.mutable.AnyRefMap
5 |
6 | // trait FeatureUtil[Feature <: AnyRef] {
7 | // type FeatureIndexer = AnyRefMap[Feature, Int]
8 |
9 | // def getIndex(indexer: FeatureIndexer, key: Feature) = indexer.getOrElseUpdate(key, indexer.size)
10 |
11 | // def removeIndexes(indexer: FeatureIndexer, idxs: Seq[Int]): Unit = {
12 | // val features = indexer.toSeq.sortWith(_._2 < _._2).map(_._1)
13 | // val originalSize = indexer.size
14 | // (0 to idxs.size) foreach { i =>
15 | // val idx = if (i == idxs.size) originalSize else idxs(i)
16 | // val lastIdx = if (i == 0) -1 else idxs(i - 1)
17 | // (lastIdx + 1 until idx) foreach { f => indexer(features(f)) -= i }
18 | // if (i != idxs.size) indexer -= features(idx)
19 | // }
20 | // }
21 | // def removeElemsOver(indexer: FeatureIndexer, lastIdx: Int) = indexer.toSeq.foreach {
22 | // case (feature, idx) =>
23 | // indexer -= feature
24 | // }
25 | // }
26 |
27 | // // example usage:
28 | // object FeatureUtilExample {
29 | // case class MyFeature(unlabeled: String, label: Int)
30 | // object FU extends FeatureUtil[MyFeature]
31 |
32 | // def run = {
33 | // val indexer = new FU.FeatureIndexer
34 | // FU.getIndex(indexer, MyFeature("hoge", 10))
35 | // }
36 | // }
37 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/LinearClassifier.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | trait Classifier[L] {
20 |
21 | protected val weights: WeightVector[Float]
22 |
23 | def predict(examples: Seq[Example[L]]): (L, Float)
24 | }
25 |
26 | trait LinearClassifier[L] extends Classifier[L] {
27 |
28 | override def predict(examples: Seq[Example[L]]): (L, Float) =
29 | if (examples.isEmpty) (null.asInstanceOf[L], 0F)
30 | else examples.map { e => (e.label, featureScore(e.featVec)) }.maxBy(_._2)
31 |
32 | def featureScore(feature: Array[Int]): Float = {
33 | var a = 0F
34 | var i = 0
35 | while (i < feature.size) {
36 | a += weight(feature(i))
37 | i += 1
38 | }
39 | a
40 | }
41 |   /** Controls how weight values are read.
42 |     * You *MUST* use this method to read weights inside the classifier, and *NEVER* access weights(i) directly (except when updating a value).
43 |     * This is because in some classifiers, such as AdaGradL1, the values must be preprocessed (e.g., lazily updated) before use.
44 |     * You can add such preprocessing by overriding this method in a subclass.
45 |     */
46 | protected def weight(idx: Int): Float = weights(idx)
47 | }
48 |
49 | /** A classifier in which weight vector backbone is implemented by array, hopefully faster than growable counterpart.
50 | */
51 | class FixedClassifier[L](val array: Array[Float]) extends LinearClassifier[L] {
52 | override val weights = new FixedWeightVector(array)
53 | }
54 |
--------------------------------------------------------------------------------
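A minimal sketch (the weights are made up for illustration) of how prediction works with the FixedClassifier above: each candidate label carries the indices of its active features, and predict returns the label whose feature weights sum highest.

```scala
import jigg.ml.{Example, FixedClassifier}

object ClassifierSketch extends App {
  // Hypothetical weight vector learned elsewhere; index i holds the weight of feature i.
  val classifier = new FixedClassifier[String](Array(0.5f, -0.2f, 1.0f, 0.1f))
  val candidates = Seq(
    Example(Array(0, 1), "NOUN"),   // score 0.5 - 0.2 = 0.3
    Example(Array(2, 3), "VERB"))   // score 1.0 + 0.1 = 1.1
  println(classifier.predict(candidates))  // (VERB,1.1)
}
```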
/src/main/scala/jigg/ml/LogLinearAdaGradL1.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | abstract class LogLinearAdaGradL1[L](val lambda: Float, val eta: Float) extends OnlineLogLinearTrainer[L] {
20 |
21 | private[this] val lastUpdates = WeightVector.growable[Float]()
22 | private[this] val diagGt = WeightVector.growable[Float]()
23 |
24 | override protected def weight(idx: Int): Float =
25 | if (lastUpdates(idx) == time) weights(idx)
26 | else {
27 | val currentXti = weights(idx)
28 | if (currentXti == 0.0F) 0.0F
29 | else {
30 | val t0 = lastUpdates(idx)
31 | assert(time != 0)
32 | val ht0ii = 1.0 + Math.sqrt(diagGt(idx))
33 | val newWeight = Math.signum(currentXti) * Math.max(
34 | 0.0, Math.abs(currentXti) - (lambda * eta / ht0ii) * (time - t0))
35 | weights(idx) = newWeight.toFloat
36 | lastUpdates(idx) = time
37 | newWeight.toFloat
38 | }
39 | }
40 |
41 | override def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit = {
42 |     // Here, we negate the gradient. This is because the original formulation by Duchi et al.
43 |     // minimizes the objective, while we maximize it.
44 | val gti = -derivative
45 | val deltaDiagGti = gti * gti // these are shared by all i below, so we cache here
46 |
47 | val feats = e.featVec
48 | var j = 0
49 | while (j < feats.size) {
50 | val i = feats(j)
51 |
52 | //val xti = weight(i) // This automatically perform lazy update of the target weight
53 |       val xti = weights(i) // weights(i) has already been lazily updated when computing the label scores, so we can skip weight(i) here
54 | diagGt(i) += deltaDiagGti
55 | val htii = 1.0 + Math.sqrt(diagGt(i))
56 | val etaOverHtii = eta / htii
57 | val tempXti = xti - etaOverHtii * gti
58 |
59 | weights(i) = (Math.signum(tempXti) * Math.max(0.0, Math.abs(tempXti) - lambda * etaOverHtii)).toFloat
60 | lastUpdates(i) = time + 1
61 |
62 | j += 1
63 | }
64 | }
65 | override def postProcess: Unit = {
66 | (0 until weights.size).foreach { weight(_) }
67 | }
68 | }
69 |
70 | class FixedLogLinearAdaGradL1[L](val weightArray: Array[Float], lambda: Float, eta: Float) extends LogLinearAdaGradL1(lambda, eta) {
71 | override val weights = new FixedWeightVector(weightArray)
72 | }
73 |
--------------------------------------------------------------------------------
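A minimal sketch (the ToyAdaGrad subclass and all hyperparameters are made up for illustration) of driving the lazily regularized trainer above. Because the L1 penalty is applied lazily when a weight is read, postProcess should be called once training is done so that every weight is brought up to date.

```scala
import jigg.ml._

object AdaGradSketch extends App {
  // Concrete trainer: the abstract class only needs a weight vector to be supplied.
  class ToyAdaGrad[L](size: Int) extends LogLinearAdaGradL1[L](lambda = 0.01f, eta = 0.1f) {
    override val weights = new FixedWeightVector(new Array[Float](size))
  }

  val trainer = new ToyAdaGrad[String](16)
  val candidates = Seq(Example(Array(0, 1), "A"), Example(Array(2, 3), "B"))
  for (_ <- 0 until 20) trainer.update(candidates, "A")
  trainer.postProcess                    // flush the pending lazy L1 updates
  println(trainer.predict(candidates))   // "A" should now score highest
}
```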
/src/main/scala/jigg/ml/LogLinearClassifier.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | /** Augments LinearClassifier with a method to return label probabilities.
20 |  * (This implies a log loss function.)
21 | */
22 | trait LogLinearClassifier[L] extends LinearClassifier[L] {
23 | val weights: WeightVector[Float]
24 |
25 | def labelProbs(examples: Seq[Example[L]]): Array[Float] = {
26 | val unnormalized: Array[Float] = examples.map { e =>
27 | val p = Math.exp(featureScore(e.featVec)).toFloat
28 | if (p < 1e-100) 1e-100F else p
29 | }.toArray
30 | val z = unnormalized.sum
31 | unnormalized.map(_ / z)
32 | }
33 | }
34 |
35 | class FixedLogLinerClassifier[L](val weightArray: Array[Float]) extends LogLinearClassifier[L] {
36 | override val weights = new FixedWeightVector(weightArray)
37 | }
38 |
--------------------------------------------------------------------------------
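A minimal sketch (the weights are made up) of labelProbs above: the linear scores of the candidates are exponentiated and normalized into a distribution, i.e. a softmax over the candidate labels.

```scala
import jigg.ml.{Example, FixedLogLinerClassifier}

object LogLinearSketch extends App {
  val classifier = new FixedLogLinerClassifier[String](Array(1.0f, 0.0f, 2.0f))
  val candidates = Seq(Example(Array(0, 1), "A"), Example(Array(2), "B"))
  // scores: A = 1.0 + 0.0, B = 2.0 -> probs exp(1)/(exp(1)+exp(2)) and exp(2)/(exp(1)+exp(2))
  println(classifier.labelProbs(candidates).toList)  // roughly List(0.269, 0.731)
}
```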
/src/main/scala/jigg/ml/LogLinearSGD.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | abstract class LogLinearSGD[L](val a: Float) extends OnlineLogLinearTrainer[L] {
20 |
21 |   def stepSize = Math.pow(time + 1, -a).toFloat // time + 1 avoids an infinite step at time = 0
22 | def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit = {
23 | val dw = stepSize * derivative
24 | val feats = e.featVec
25 | var i = 0
26 | while (i < feats.size) {
27 | weights(feats(i)) += dw
28 | i += 1
29 | }
30 | }
31 | }
32 |
33 | class FixedLogLinearSGD[L](val weightArray: Array[Float], a: Float) extends LogLinearSGD(a) {
34 |
35 | override val weights = new FixedWeightVector(weightArray)
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/OnlineLogLinearTrainer.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | /** This trait exploits the common procedure in trainers of log-linear models.
20 | */
21 | trait OnlineLogLinearTrainer[L] extends OnlineTrainer[L] with LogLinearClassifier[L] {
22 | var time: Int = 0
23 |
24 | override def update(examples: Seq[Example[L]], gold:L): Unit = {
25 | val dist = labelProbs(examples)
26 | var i = 0
27 | while (i < examples.size) {
28 | val e = examples(i)
29 | val p = dist(i)
30 | val derivative = if (e.label == gold) (1 - p) else -p
31 | updateExampleWeights(e, gold, derivative)
32 | i += 1
33 | }
34 | reguralizeWeights(examples)
35 | time += 1
36 | }
37 | def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit
38 |   def reguralizeWeights(examples: Seq[Example[L]]): Unit = {} // Some algorithms regularize weights after temporarily updating the values; this method defines that postprocessing. See LogLinearSGDCumulativeL1 for an example.
39 | }
40 |
--------------------------------------------------------------------------------
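A minimal sketch (the ToySGD subclass is made up; LogLinearSGD is the concrete update rule from LogLinearSGD.scala above) of the shared training loop defined by this trait: update computes the label distribution, turns it into per-candidate derivatives ((1 - p) for the gold label, -p otherwise), and hands them to updateExampleWeights.

```scala
import jigg.ml._

object TrainerSketch extends App {
  class ToySGD[L](size: Int) extends LogLinearSGD[L](0.1f) {
    override val weights = new FixedWeightVector(new Array[Float](size))
  }

  val trainer = new ToySGD[String](16)
  val candidates = Seq(Example(Array(0, 1), "A"), Example(Array(2, 3), "B"))
  for (_ <- 0 until 20) trainer.update(candidates, "A")
  println(trainer.labelProbs(candidates).toList)  // the probability of "A" grows toward 1
}
```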
/src/main/scala/jigg/ml/OnlineTrainer.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | /** A trait which supports parameter updates, in addition to providing the Classifier interface.
20 |  * Currently two subclasses exist: OnlineLogLinearTrainer is used for log-linear models, while Perceptron is used to train perceptrons, including the structured perceptron with beam search.
21 | */
22 | trait OnlineTrainer[L] extends Classifier[L] {
23 | def update(examples: Seq[Example[L]], gold:L): Unit
24 |   def postProcess: Unit = ()
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/Perceptron.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.collection.mutable.ArrayBuffer
20 |
21 | trait Perceptron[L] extends LinearClassifier[L] with OnlineTrainer[L] {
22 |
23 | def averageWeights: WeightVector[Float]
24 |
25 | var c = 1.0F
26 |
27 | override def update(examples: Seq[Example[L]], gold: L): Unit = {
28 | val pred = predict(examples)._1
29 | if (pred != gold) {
30 | var i = 0
31 | while (i < examples.size) {
32 | val label = examples(i).label
33 | if (label == pred) updateFeatureWeighs(examples(i).featVec, -1.0F)
34 | else if (label == gold) updateFeatureWeighs(examples(i).featVec, 1.0F)
35 | i += 1
36 | }
37 | }
38 | c += 1.0F
39 | }
40 | def updateFeatureWeighs(featVec: Array[Int], scale: Float): Unit = featVec.foreach { f =>
41 | weights(f) += scale
42 | averageWeights(f) += scale * c
43 | }
44 | def update(predFeatVec:Array[Int], goldFeatVec:Array[Int]): Unit = {
45 | updateFeatureWeighs(predFeatVec, -1.0F)
46 | updateFeatureWeighs(goldFeatVec, 1.0F)
47 | c += 1.0F
48 | }
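  // Averaged-perceptron trick: averageWeights caches (update * c), so that
  // weights(i) - averageWeights(i) / c yields the time-averaged weight vector (see takeAverage below).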
49 | def takeAverage: Unit = (0 until weights.size) foreach { i =>
50 | weights(i) -= averageWeights(i) / c
51 | }
52 | }
53 |
54 | class FixedPerceptron[L](val weightArray: Array[Float]) extends Perceptron[L] {
55 |
56 | override val weights = new FixedWeightVector(weightArray)
57 | override val averageWeights = new FixedWeightVector(new Array[Float](weights.size))
58 | }
59 |
60 | class GrowablePerceptron[L](val weightArray: ArrayBuffer[Float]) extends Perceptron[L] {
61 |
62 | override val weights = new GrowableWeightVector(weightArray)
63 | override val averageWeights = WeightVector.growable[Float](weights.size)
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/WeightVector.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.collection.mutable.ArrayBuffer
20 |
21 | @SerialVersionUID(1L)
22 | trait WeightVector[@specialized(Int, Double, Float) A] extends Serializable {
23 | def apply(idx: Int): A
24 | def update(idx: Int, elem: A): Unit
25 | def size: Int
26 |
27 | def seq: IndexedSeq[A] // indexed seq from a backbone data structure
28 | }
29 |
30 | object WeightVector {
31 | def growable[A](initialSize: Int = 0)(implicit numeric: Numeric[A]) = new GrowableWeightVector[A](new ArrayBuffer[A](initialSize))(numeric)
32 | }
33 |
34 | class FixedWeightVector[@specialized(Int, Double, Float) A](val array: Array[A]) extends WeightVector[A] {
35 | def apply(idx: Int) = array(idx)
36 | def update(idx: Int, elem: A) = array(idx) = elem
37 | def size = array.size
38 |
39 | def seq = array
40 | }
41 |
42 | class GrowableWeightVector[@specialized(Int, Double, Float) A](val array: ArrayBuffer[A])(implicit numeric: Numeric[A]) extends WeightVector[A] {
43 | def apply(idx: Int) = if (idx >= size || idx < 0) numeric.zero else array(idx)
44 | def update(idx: Int, elem: A) = {
45 | if (idx >= array.size) array ++= List.fill(idx - array.size + 1)(numeric.zero)
46 | array(idx) = elem
47 | }
48 | def size = array.size
49 |
50 | def seq = array
51 | }
52 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Dense.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.{DenseMatrix, DenseVector}
20 | import ucar.nc2.{Variable, Group}
21 |
22 | class Dense(inputDim: Int, outputDim: Int) extends Functor{
23 |
24 | override def functorName = "Dense"
25 |
26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = {
27 | val z = data * w
28 | for (i <- 0 until data.rows){
29 | z(i, ::) :+= b.t
30 | }
31 | z
32 | }
33 |
34 | private val w = DenseMatrix.zeros[Float](inputDim, outputDim)
35 | private val b = DenseVector.zeros[Float](outputDim)
36 |
37 | def h5load(weight: Variable, bias: Variable): Unit = {
38 | val weightData = weight.read
39 | val weightIndex = weightData.getIndex
40 | val biasData = bias.read
41 | val biasIndex = biasData.getIndex
42 | for(y <- 0 until inputDim)
43 | for(x <- 0 until outputDim){
44 | w(y, x) = weightData.getFloat(weightIndex.set(y, x))
45 | if(y == 0)
46 | b(x) = biasData.getFloat(biasIndex.set(x))
47 | }
48 | }
49 |
50 | override def toString: String = "Dense: {inputDim: " + inputDim + ", outputDim: " + outputDim + "}"
51 |
52 | def head: String = w(0 until 2, ::).toString
53 | }
54 |
55 | object Dense{
56 | def apply(inputDim:Int, outputDim:Int) = new Dense(inputDim, outputDim)
57 |
58 | def apply(configs: Map[String, Any], weightGroups: Group): Dense = {
59 | val layerName = configs("name").toString
60 | val params = weightGroups.findGroup(layerName)
61 | val weightNames = params.findAttribute("weight_names")
62 | val weight = params.findVariable(weightNames.getStringValue(0))
63 | val bias = params.findVariable(weightNames.getStringValue(1))
64 | val dims = weight.getDimensions
65 | if(dims.size != 2){
66 | throw new IllegalArgumentException("invalid dimension for Dense class")
67 | }
68 |
69 | val d = new Dense(dims.get(0).getLength, dims.get(1).getLength)
70 | d.h5load(weight, bias)
71 | d
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Embedding.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.{DenseMatrix, DenseVector}
20 | import ucar.nc2.{Variable, Group}
21 |
22 | class Embedding(vocabulary: Int, outDim: Int) extends Functor{
23 |
24 | override def functorName = "Embedding"
25 |
26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = {
27 | val arrayOfId = data.reshape(data.size, 1)
28 | val length = arrayOfId.size
29 | val z = DenseMatrix.zeros[Float](length, outDim)
30 | for(i <- 0 until length){
31 | z(i, ::) := w(arrayOfId(i, 0).asInstanceOf[Int]).t
32 | }
33 | z
34 | }
35 |
36 | private val w = new Array[DenseVector[Float]](vocabulary).map(_ => DenseVector.zeros[Float](outDim))
37 |
38 | def h5load(weight: Variable):Unit = {
39 | val weightData = weight.read
40 | val weightIndex = weightData.getIndex
41 | for(y <- 0 until vocabulary)
42 | for(x <- 0 until outDim)
43 | w(y)(x) = weightData.getFloat(weightIndex.set(y, x))
44 | }
45 |
46 | }
47 |
48 | object Embedding{
49 | def apply(vocabulary: Int, outDim: Int) = new Embedding(vocabulary, outDim)
50 |
51 | def apply(configs: Map[String, Any], weightGroups: Group): Embedding = {
52 | val layerName = configs("name").toString
53 | val params = weightGroups.findGroup(layerName)
54 | val weightNames = params.findAttribute("weight_names")
55 | val weight = params.findVariable(weightNames.getStringValue(0))
56 | val dims = weight.getDimensions
57 | if(dims.size != 2){
58 | throw new IllegalArgumentException("Invalid dimension for Embedding class")
59 | }
60 | val e = new Embedding(dims.get(0).getLength, dims.get(1).getLength)
61 | e.h5load(weight)
62 | e
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Empty.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.DenseMatrix
20 |
21 | object Empty extends Functor{
22 |
23 | override def functorName = "Empty"
24 |
25 | override final def convert(data: DenseMatrix[Float]):DenseMatrix[Float] = data
26 |
27 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x)
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Flatten.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.DenseMatrix
20 |
21 | object Flatten extends Functor{
22 |
23 | override def functorName = "Flatten"
24 |
25 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.t.toDenseVector.toDenseMatrix
26 |
27 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x)
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Functor.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.DenseMatrix
20 |
21 | trait Functor {
22 |
23 | def functorName: String
24 | def convert(data: DenseMatrix[Float]): DenseMatrix[Float]
25 | override def toString: String = functorName
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/KerasModel.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.DenseMatrix
20 | import jigg.util.HDF5Object
21 | import org.json4s.jackson.JsonMethods._
22 | import org.json4s.{DefaultFormats, _}
23 |
24 | class KerasModel(model: HDF5Object) {
25 |
26 | private val kerasAttribute = model.checkAndGetAttribute("keras_version")
27 | private val modelAttribute = model.checkAndGetAttribute("model_config")
28 |
29 | private val weightGroups = model.checkAndGetGroup("model_weights")
30 |
31 | def parseConfigToSeq(config: String): Seq[Map[String, Any]] = {
32 | val jsonValue = parse(config)
33 | implicit val formats = DefaultFormats
34 | val jsonList = jsonValue.extract[Map[String, Any]]
35 | jsonList("config").asInstanceOf[Seq[Map[String, Any]]]
36 | }
37 |
38 | private val modelValues = parseConfigToSeq(modelAttribute.getValue(0).toString)
39 |
40 | def getConfigs(x: Map[String, Any]): Map[String, Any] = x("config").asInstanceOf[Map[String,Any]]
41 |
42 | def constructNetwork(values: Seq[Map[String, Any]]): Seq[Functor] = values.map{
43 | x => {
44 | val configs = getConfigs(x)
45 | val functor = x("class_name").toString match {
46 | case "Activation" =>
47 | configs("activation").toString match{
48 | case "relu" => Relu
49 | case "softmax" => Softmax
50 | case "sigmoid" => Sigmoid
51 | case "tanh" => Tanh
52 | }
53 | case "Convolution1D" =>
54 | Convolution1D(configs, weightGroups)
55 | case "Dense" =>
56 | Dense(configs, weightGroups)
57 | case "Embedding" =>
58 | Embedding(configs, weightGroups)
59 | case "Flatten" => Flatten
60 | case _ => Empty
61 | }
62 | functor
63 | }
64 | }
65 |
66 | private val graph:Seq[Functor] = constructNetwork(modelValues)
67 |
68 | def convert(input: DenseMatrix[Float]): DenseMatrix[Float] = callFunctors(input, graph)
69 |
70 | private def callFunctors(input: DenseMatrix[Float], unprocessed:Seq[Functor]): DenseMatrix[Float] = unprocessed match {
71 | case functor :: tail =>
72 | val interOutput = functor.convert(input)
73 | callFunctors(interOutput, tail)
74 | case Nil => input
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/KerasParser.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.argmax
20 | import jigg.ml.keras._
21 | import jigg.util.LookupTable
22 |
23 | import scala.xml.Node
24 | import scala.collection.mutable.{ArrayBuffer, ListBuffer}
25 |
26 | class KerasParser(model: KerasModel, table: LookupTable) {
27 |
28 | /*
29 |    * BIO tags
30 |    * B : Beginning of a segment. Value is 0.
31 |    * I : Continuation or end of a segment. Value is 1.
32 |    * O : Outside of a segment. Value is 2.
33 | */
34 | private val tagset:Map[Int, String] = Map(0 -> "B", 1 -> "I", 2 -> "O")
35 |
36 | def parsing(str: String): Array[(Int, Int)] = {
37 |     // Dummy characters are added to mark the sentence boundaries.
38 | val s = "\n" + str + "\n\n"
39 | val inputData = table.encodeCharacter(s)
40 | val outputData = model.convert(inputData)
41 |
42 | val tags = for {
43 | i <- 1 until outputData.rows - 2
44 | maxID = argmax(outputData(i, ::))
45 | } yield maxID
46 |
47 | getOffsets(tags.toArray)
48 | }
49 |
50 | def parsing(tokens: Node): Array[Array[String]] = {
51 |     // Dummy tokens are added to mark the sentence boundaries.
52 | val words = Array("\n").union(
53 | (tokens \\ "tokens").flatMap(x => x \\ "@lemma").toArray.map(x => x.toString)).union(Array("\n\n"))
54 | val ids = (tokens \\ "tokens").flatMap(x => x \\ "@id").toArray.map(x => x.toString)
55 |
56 | val inputData = table.encodeWords(words)
57 | val outputData = model.convert(inputData)
58 |
59 | val tags = for {
60 | i <- 1 until outputData.rows - 2
61 | maxID = argmax(outputData(i, ::))
62 | } yield maxID
63 |
64 | val ranges = getOffsets(tags.toArray)
65 |
66 | ranges.map(x => ids.slice(x._1, x._2))
67 | }
68 |
69 | def getOffsets(data: Array[Int]): Array[(Int, Int)]= {
70 | val ranges = ArrayBuffer[(Int, Int)]()
71 | var bpos = -1
72 |
73 | for(i <- data.indices){
74 | tagset(data(i)) match{
75 | case "B" =>
76 | if(bpos >= 0)
77 | ranges += ((bpos, i))
78 | bpos = i
79 | case "I" if i == 0 || bpos == -2 =>
80 | bpos = i
81 | case "O" =>
82 | if (bpos >= 0)
83 | ranges += ((bpos, i))
84 | bpos = -2
85 | case _ if i == data.indices.last =>
86 | ranges += ((bpos, i + 1))
87 | case _ =>
88 | }
89 | }
90 | ranges.toArray
91 | }
92 | }
93 |
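The BIO decoding above (`getOffsets`) turns a sequence of integer tags into half-open `(begin, end)` spans. A minimal sketch of the expected behaviour; since `getOffsets` only inspects its argument, the parser is instantiated with placeholder constructor arguments purely for this illustration:

```scala
import jigg.ml.keras.KerasParser

object GetOffsetsDemo extends App {
  // Placeholder arguments: getOffsets never touches the model or the lookup table.
  val parser = new KerasParser(null, null)

  // The tag sequence B I I O B I, encoded with the tagset above (0 = B, 1 = I, 2 = O).
  val spans = parser.getOffsets(Array(0, 1, 1, 2, 0, 1))

  // Two half-open spans: indices 0..2 form the first segment, 4..5 the second.
  println(spans.mkString(", ")) // (0,3), (4,6)
}
```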
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/README.md:
--------------------------------------------------------------------------------
1 | # KerasParser
2 |
3 | ## Abstract
4 | - Main class: jigg.ml.keras.KerasParser
5 | - KerasParser requires a model file and a lookup-table file.
6 |
7 | ## Requirements
8 | ### Model file
9 | - The model file must be generated by [Keras](https://keras.io)
10 | - Only the HDF5 format is supported
11 | - Required output class style: BIO
12 | - Tag `B` corresponds to `0`.
13 | - Tag `I` corresponds to `1`.
14 | - Tag `O` corresponds to `2`.
15 | - The following Keras layers and activations are supported.
16 | - Layer
17 | - Dense
18 | - Embedding
19 | - Convolution1D
20 | - Flatten
21 | - Activation
22 | - Relu
23 | - Sigmoid
24 | - Softmax
25 | - Tanh
26 |
27 | ### Lookup table
28 | - Field structure
29 | - `_lookup`
30 | - `_key2id`: Convert character/word to ID
31 | - key: Target character/word
32 | - value: ID number of target character/word
33 |     - `_id2key`: Convert ID to character/word
34 |       - key: ID number of target character/word
35 | - value: Target character/word
36 | - The table should contain the following elements:
37 |
38 | | ID | Value |
39 | |:---|:------|
40 | |0 | UNKNOWN |
41 | |1 | new line (`\n`) |
42 | |2 | half space (` `) |
43 |
44 | #### Example
45 | ```json
46 | {"_lookup":{
47 | "_key2id": {
48 | "UNKNOWN": "0",
49 | "\n": "1",
50 | " " : "2",
51 | "Additional elements": "3..."
52 | },
53 | "_id2key": {
54 | "0": "UNKNOWN",
55 | "1": "\n",
56 | "2": " ",
57 | "3..." : "Additional elements"
58 | }
59 | }
60 | }
61 | ```
62 |
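## Usage sketch

A minimal sketch of how the pieces fit together, assuming a model file and a lookup-table file as described above. The file names are placeholders, and the `HDF5Object`/`LookupTable` loader calls are assumptions about `jigg.util` (check that package for the actual factory methods); only the `KerasModel` and `KerasParser` constructors and the `parsing` method come from this package.

```scala
import jigg.ml.keras.{KerasModel, KerasParser}
import jigg.util.{HDF5Object, LookupTable}

// Hypothetical loaders; the real factory methods live in jigg.util and may differ.
val hdf5  = HDF5Object.fromFile("path/to/model.h5")     // assumption
val table = LookupTable.fromFile("path/to/table.json")  // assumption

val parser = new KerasParser(new KerasModel(hdf5), table)

// parsing(str) returns (begin, end) character offsets of the detected segments.
val spans: Array[(Int, Int)] = parser.parsing("input text to be segmented")
```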
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Relu.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.DenseMatrix
20 |
21 | object Relu extends Functor{
22 |
23 | override def functorName = "Relu"
24 |
25 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map(x =>
26 | if(x > 0.0.toFloat) x else 0.0.toFloat
27 | )
28 |
29 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x)
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Sigmoid.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.DenseMatrix
20 | import breeze.numerics.exp
21 |
22 | object Sigmoid extends Functor {
23 |
24 | override def functorName = "Sigmoid"
25 |
26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map{x => (1.0 / (1.0 + exp(-x))).toFloat}
27 |
28 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x)
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Softmax.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.{DenseVector, DenseMatrix, softmax}
20 | import breeze.numerics.exp
21 |
22 | object Softmax extends Functor{
23 |
24 | override def functorName = "Softmax"
25 |
26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = {
27 | for(y <- 0 until data.rows){
28 | val v = data(y, ::)
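      // breeze.linalg.softmax computes log-sum-exp, so exp(softmax(v)) is the normalizer sum_j exp(v_j).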
29 | data(y, ::) := (exp(v) :/= exp(softmax(v)))
30 | }
31 | data
32 | }
33 |
34 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x)
35 |
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/ml/keras/Tanh.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.linalg.DenseMatrix
20 | import breeze.numerics.tanh
21 |
22 | object Tanh extends Functor{
23 |
24 | override def functorName = "Tanh"
25 |
26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map{ x => tanh(x)}
27 |
28 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x)
29 |
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/CCGBank2EnjuXML.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import lexicon._
20 | import jigg.util.IOUtil
21 |
22 | import breeze.config.{CommandLineParser, Help}
23 |
24 | import scala.collection.mutable.ArrayBuffer
25 | import scala.sys.process.Process
26 |
27 | import java.io.{File, FileWriter}
28 |
29 |
30 | object CCGBank2EnjuXML {
31 |
32 | case class Opts(
33 | @Help(text="Path to CCGBank file") ccgBank: File = new File(""),
34 | @Help(text="Path to output (xml)") output: File = new File(""),
35 | @Help(text="Number of sentences") numSentences: Int = 50
36 | )
37 |
38 | def main(args:Array[String]) = {
39 | val opts = CommandLineParser.readIn[Opts](args)
40 |
41 | val dict = new JapaneseDictionary(new Word2CategoryDictionary)
42 |
43 | val conv = new JapaneseParseTreeConverter(dict)
44 |
45 | val reader = new CCGBankReader
46 |
47 | val instances: Seq[(TaggedSentence, Derivation)] =
48 | reader.takeLines(IOUtil.openIterator(opts.ccgBank.getPath), opts.numSentences).toSeq.map { line =>
49 | val trees = reader.readParseFragments(line).map { conv.toLabelTree(_) }
50 | (conv.toSentenceFromLabelTrees(trees), conv.toFragmentalDerivation(trees))
51 | }
52 |
53 | val fw = new FileWriter(opts.output.getPath)
54 |
55 | instances.zipWithIndex foreach { case ((s, d), i) => fw.write(d.renderEnjuXML(s, i) + "\n") }
56 |
57 | fw.flush
58 | fw.close
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/CCGBankToCabochaFormat.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 |
20 | import lexicon._
21 |
22 | import breeze.config.{CommandLineParser, Help}
23 |
24 | import scala.sys.process.Process
25 |
26 | import java.io.{File, FileWriter, ByteArrayInputStream}
27 |
28 | /** Creates Cabocha-formatted CCGBank sentences.
29 | *
30 |  * The output of this tool is required when evaluating the bunsetsu-dependency accuracy of the CCG parser.
31 |  * Currently, when a new CCGBank is released, this class has to be run manually to regenerate that data.
32 | */
33 | object CCGBankToCabochaFormat {
34 |
35 | case class Opts(
36 | @Help(text="Path to CCGBank file") ccgbank: File = new File(""),
37 | @Help(text="Path to output") output: File = new File(""),
38 | @Help(text="Cabocha command (path to cabocha)") cabocha: String = "cabocha"
39 | )
40 |
41 | type Tree = ParseTree[NodeLabel]
42 |
43 | def main(args:Array[String]) = {
44 | val opts = CommandLineParser.readIn[Opts](args)
45 |
46 | val dict = new JapaneseDictionary()
47 | val extractors = TreeExtractor(
48 | new JapaneseParseTreeConverter(dict),
49 | new CCGBankReader)
50 |
51 | val trees = extractors.readTrees(opts.ccgbank, -1, true)
52 | val rawString = trees map (extractors.treeConv.toSentenceFromLabelTree) map (_.wordSeq.mkString("")) mkString ("\n")
53 | val is = new java.io.ByteArrayInputStream(rawString.getBytes("UTF-8"))
54 | val out = (Process(s"${opts.cabocha} -f1") #< is).lineStream_!
55 |
56 | val os = jigg.util.IOUtil.openOut(opts.output.getPath)
57 | out foreach { line =>
58 | os.write(line + "\n")
59 | }
60 | os.flush
61 | os.close
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/GoldBunsetsuDepInCabocha.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import lexicon._
20 | import jigg.util.IOUtil
21 |
22 | import breeze.config.{CommandLineParser, Help}
23 |
24 | import java.io.{File, FileWriter}
25 |
26 | /** Input: CCGBank file (e.g., train.ccgbank) from stdin.
27 | * Output: Gold bunsetsu dependencies according to the CCGBank in CoNLL format.
28 | */
29 | object GoldBunsetsuDepInCoNLL {
30 |
31 | case class Opts(
32 | @Help(text="Path to Cabocha file (same sentences with the CCGBank file)") cabocha: File = new File("")
33 | )
34 |
35 | def main(args:Array[String]) = {
36 | val opts = CommandLineParser.readIn[Opts](args)
37 |
38 | val dict = new JapaneseDictionary(new Word2CategoryDictionary)
39 |
40 | val conv = new JapaneseParseTreeConverter(dict)
41 | val parseTrees = new CCGBankReader()
42 | .readParseTrees(IOUtil.openStandardIterator, -1, true)
43 | .map(conv.toLabelTree _).toSeq
44 | val goldDerivs = parseTrees.map(conv.toDerivation)
45 | val sentences = parseTrees.map(conv.toSentenceFromLabelTree)
46 |
47 | val bunsetsuSentencesWithPredHead =
48 | new CabochaReader(sentences).readSentences(opts.cabocha.getPath)
49 |
50 | val bunsetsuSentencesWithGoldHead =
51 | bunsetsuSentencesWithPredHead zip goldDerivs map { case (sentence, deriv) =>
52 | BunsetsuSentence(sentence.bunsetsuSeq).parseWithCCGDerivation(deriv)
53 | }
54 | for (sentence <- bunsetsuSentencesWithGoldHead) {
55 | println(sentence.renderInCoNLL)
56 | }
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/Opts.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import lexicon._
20 |
21 | import jigg.ml
22 |
23 | import breeze.config.Help
24 |
25 | import java.io.File
26 |
27 | object Opts {
28 |
29 | @Help(text="About CCGBank")
30 | case class BankInfo(
31 | @Help(text="Language (ja|en)") lang: String = "ja",
32 | @Help(text="Path to CCGBank directory (if this is set, files in this dir are used as default values of train/dev and others)") dir: File = new File(""),
33 | @Help(text="# training instances, -1 for all") trainSize: Int = -1,
34 | @Help(text="# test instances, -1 for all") testSize: Int = -1,
35 | @Help(text="# dev instances, -1 for all") devSize: Int = -1
36 | )
37 |
38 | @Help(text="About category dictionary")
39 | case class DictParams(
40 | @Help(text="How to look up category candidates? (for Japanese only) (surfaceOnly|surfaceAndPoS|surfaceAndSecondFineTag|surfaceAndSecondWithConj)")
41 | lookupMethod: String = "surfaceAndSecondWithConj",
42 | @Help(text="Whether using lexicon files to create word -> category mappings")
43 | useLexiconFiles: Boolean = true,
44 |     @Help(text="Minimum number of occurrences for registering as a lexicalized entry")
45 | unkThreathold: Int = 30
46 | ) {
47 |
48 | val categoryDictinoary = lookupMethod match {
49 | case "surfaceOnly" => new Word2CategoryDictionary
50 | case "surfaceAndPoS" => new WordPoS2CategoryDictionary
51 | case "surfaceAndSecondFineTag" => new WordSecondFineTag2CategoryDictionary
52 | case "surfaceAndSecondWithConj" => new WordSecondWithConj2CategoryDictionary
53 | case _ => sys.error("unknown lookUpMethod")
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/ParserRunner.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import lexicon._
20 | import parser.{ActionLabel, KBestDecoder}
21 | import jigg.ml.FixedPerceptron
22 |
23 | import breeze.config.{CommandLineParser, Help}
24 |
25 | import scala.collection.mutable.{ArraySeq}
26 |
27 | import java.io.File
28 |
29 |
30 | class ParserRunner(model: ParserModel, params: ParserRunner.Params) {
31 |
32 | val tagger = new SuperTaggerRunner(model.taggerModel, params.tagger)
33 | val perceptron = new FixedPerceptron[ActionLabel](model.weights)
34 | val decoder = model.mkDecoder(params.beam, perceptron)
35 |
36 | val preferConnected = params.preferConnected
37 |
38 | def decode[S<:TaggedSentence](sentences: Array[S]): Array[Derivation] = {
39 |
40 | val predDerivations = sentences.zipWithIndex map {
41 | case (sentence, i) =>
42 | if (i % 100 == 0)
43 | System.err.print(i + "\t/" + sentences.size + " have been processed.\r")
44 | decodeOne(sentence)
45 | }
46 | System.err.println()
47 | predDerivations
48 | }
49 |
50 | def decodeOne[S<:TaggedSentence](sentence: S): Derivation =
51 | kBestDerivations(sentence, 1)(0)._1
52 |
53 | def kBestDerivations[S<:TaggedSentence](sentence: S, k: Int)
54 | : Seq[(Derivation, Double)] = {
55 | val superTaggedSentence = tagger.assignKBest(sentence)
56 |
57 | decoder match {
58 | case decoder: KBestDecoder =>
59 | decoder predictKbest (k, superTaggedSentence, preferConnected)
60 | case decoder =>
61 | Seq(decoder predict superTaggedSentence)
62 | }
63 | }
64 | }
65 |
66 | object ParserRunner {
67 |
68 | @Help(text="Params for testing/evaluating parser")
69 | case class Params(
70 | @Help(text="Beam size") beam: Int = 32,
71 | @Help(text="Prefer connected derivation at prediction") preferConnected: Boolean = true,
72 | tagger: SuperTaggerRunner.Params = new SuperTaggerRunner.Params()
73 | )
74 | }
75 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/SuperTaggerModel.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import tagger.{LF=>Feature, MaxEntMultiTagger, MaxEntMultiTaggerTrainer, FeatureExtractors}
20 | import lexicon._
21 | import jigg.ml._
22 |
23 | import scala.collection.mutable.HashMap
24 |
25 | case class SuperTaggerModel(
26 | dict: Dictionary,
27 | featureMap: HashMap[Feature, Int],
28 | weights: WeightVec,
29 | extractors: FeatureExtractors) { self =>
30 |
31 | def reduceFeatures(): SuperTaggerModel = {
32 |
33 | val buffer = weights.asInstanceOf[GrowableWeightVector[Float]].array // 0 1.0 2.0 0 0 1.0 ...
34 | val activeIdxs = buffer.zipWithIndex filter (_._1 != 0) map (_._2) // 1 2 5
35 | println(s"# features reduced from ${buffer.size} to ${activeIdxs.size}")
36 | val idxMap = activeIdxs.zipWithIndex.toMap // {1->0, 2->1 5->2}
37 |
38 | val newFeatureMap = featureMap collect {
39 | case (f, oldIdx) if idxMap.isDefinedAt(oldIdx) => (f, idxMap(oldIdx))
40 | }
41 | val newWeights = new FixedWeightVector[Float](activeIdxs.map(buffer).toArray)
42 |
43 | this copy (featureMap = newFeatureMap, weights = newWeights)
44 | }
45 |
46 | def mkMultiTaggerTrainer(classifierTrainer: OnlineLogLinearTrainer[Int]) =
47 | new MaxEntMultiTaggerTrainer(mkIndexer(), extractors, classifierTrainer, dict)
48 |
49 | def mkMultiTagger() =
50 | new MaxEntMultiTagger(mkIndexer(), extractors, mkClassifier(), dict)
51 |
52 | def mkClassifier() = new LogLinearClassifier[Int] {
53 | override val weights = self.weights
54 | }
55 |
56 | private def mkIndexer() = new ExactFeatureIndexer(featureMap)
57 | }
58 |
59 | object SuperTaggerModel {
60 |
61 | def saveTo(path: String, model: SuperTaggerModel) = {
62 | System.err.println("Saving tagger model to " + path)
63 | val os = jigg.util.IOUtil.openBinOut(path)
64 | os.writeObject(model)
65 | os.close
66 | }
67 |
68 | def loadFrom(path: String): SuperTaggerModel = {
69 | jigg.util.LogUtil.track("Loading supertagger model ...") {
70 | val in = jigg.util.IOUtil.openBinIn(path)
71 | val model = in.readObject.asInstanceOf[SuperTaggerModel]
72 | in.close
73 | model
74 | }
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/SuperTaggerRunner.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import lexicon._
20 |
21 | import breeze.config.{CommandLineParser, Help}
22 |
23 | import scala.collection.mutable.{ArraySeq}
24 |
25 | import java.io.File
26 |
27 |
28 | class SuperTaggerRunner(model: SuperTaggerModel, params: SuperTaggerRunner.Params) {
29 |
30 | val tagger = model.mkMultiTagger()
31 |
32 | def assignKBests[S<:TaggedSentence](sentences: Array[S]): ArraySeq[S#AssignedSentence] =
33 | sentences map (assignKBest)
34 |
35 | def assignKBest[S<:TaggedSentence](s: S): S#AssignedSentence =
36 | s assignCands (tagger candSeq(s, params.beta, params.maxK))
37 | }
38 |
39 | object SuperTaggerRunner {
40 |
41 | @Help(text="Params for testing/evaluating super tagger")
42 | case class Params(
43 | // @Help(text="Load model path") model: SuperTaggerModel: SuperTaggerModel,
44 |     @Help(text="Beta for deciding the k-best threshold at prediction") beta: Double = 0.001,
45 | @Help(text="Maximum number of k, -1 for no limit") maxK: Int = -1
46 | )
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/TrainParser.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.config.CommandLineParser
20 |
21 | object TrainParser {
22 |
23 | import ParserTrainer.Params
24 |
25 | def main(args: Array[String]) = {
26 |
27 | val params = CommandLineParser.readIn[Params](args)
28 | val trainer = mkTrainer(params)
29 | trainer.trainAndSave()
30 | }
31 |
32 | def mkTrainer(params: Params): ParserTrainer = params.bank.lang match {
33 | case "ja" => new JapaneseParserTrainer(params)
34 | case "en" => new EnglishParserTrainer(params)
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/TrainSuperTagger.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import breeze.config.CommandLineParser
20 |
21 | object TrainSuperTagger {
22 |
23 | import SuperTaggerTrainer.Params
24 |
25 | def main(args: Array[String]) = {
26 |
27 | val params = CommandLineParser.readIn[Params](args)
28 | val trainer = mkTrainer(params)
29 | trainer.trainAndSave()
30 | }
31 |
32 | def mkTrainer(params: Params): SuperTaggerTrainer = params.bank.lang match {
33 | case "ja" => new JapaneseSuperTaggerTrainer(params)
34 | case "en" => new EnglishSuperTaggerTrainer(params)
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/CabochaReader.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.io.Source
20 |
21 | class CabochaReader[S<:TaggedSentence](ccgSentences: Seq[S]) {
22 | def readSentences(path: String): Seq[ParsedBunsetsuSentence] = {
23 | val bunsetsuStart = """\* (\d+) (-?\d+)[A-Z].*""".r
24 | def addBunsetsuTo(curSent: List[(String, Int)], curBunsetsu: List[String]) = curBunsetsu.reverse match {
25 | case Nil => curSent
26 | case headIdx :: tail => (tail.mkString(""), headIdx.toInt) :: curSent
27 | }
28 |
29 | val bunsetsuSegedSentences: List[List[(String, Int)]] =
30 | scala.io.Source.fromFile(path).getLines.filter(_ != "").foldLeft(
31 | (List[List[(String, Int)]](), List[(String, Int)](), List[String]())) {
32 | case ((processed, curSent, curBunsetsu), line) => line match {
33 | case bunsetsuStart(_, nextHeadIdx) =>
34 | (processed, addBunsetsuTo(curSent, curBunsetsu), nextHeadIdx :: Nil) // use first elem as the head idx
35 | case "EOS" => (addBunsetsuTo(curSent, curBunsetsu).reverse :: processed, Nil, Nil)
36 | case word => (processed, curSent, word.split("\t")(0) :: curBunsetsu)
37 | }
38 | }._1.reverse
39 |
40 | ccgSentences.zip(bunsetsuSegedSentences).map { case (ccgSentence, bunsetsuSentence) =>
41 | val bunsetsuSegCharIdxs: List[Int] = bunsetsuSentence.map { _._1.size }.scanLeft(0)(_+_).tail // 5 10 ...
42 | val ccgWordSegCharIdxs: List[Int] = ccgSentence.wordSeq.toList.map { _.v.size }.scanLeft(0)(_+_).tail // 2 5 7 10 ...
43 |
44 | assert(bunsetsuSegCharIdxs.last == ccgWordSegCharIdxs.last)
45 | val bunsetsuSegWordIdxs: List[Int] = ccgWordSegCharIdxs.zipWithIndex.foldLeft((List[Int](), 0)) { // 1 3 ...
46 | case ((segWordIdxs, curBunsetsuIdx), (wordIdx, i)) =>
47 | if (wordIdx >= bunsetsuSegCharIdxs(curBunsetsuIdx)) (i :: segWordIdxs, curBunsetsuIdx + 1)
48 | else (segWordIdxs, curBunsetsuIdx) // wait until wordIdx exceeds the next bunsetsu segment
49 | }._1.reverse
50 | val bunsetsuSeq = bunsetsuSegWordIdxs.zip(-1 :: bunsetsuSegWordIdxs).map { case (bunsetsuIdx, prevIdx) =>
51 | val offset = prevIdx + 1
52 | Bunsetsu(offset,
53 | ccgSentence.wordSeq.slice(offset, bunsetsuIdx + 1),
54 | ccgSentence.posSeq.slice(offset, bunsetsuIdx + 1))
55 | }
56 | ParsedBunsetsuSentence(bunsetsuSeq, bunsetsuSentence.map { _._2 })
57 | }
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/Category.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 | import Slash._
19 |
20 | sealed trait Category extends Numbered[Unit] {
21 | override def v:Unit = {}
22 | def toStringNoFeature: String
23 | }
24 |
25 | @SerialVersionUID(6748884927580538343L)
26 | case class AtomicCategory(override val id:Int, base:String, feature:CategoryFeature) extends Category {
27 | override def toString = feature.toString match {
28 | case "" => base
29 | case s => base + "[" + s + "]"
30 | }
31 |
32 | override def toStringNoFeature = base
33 | }
34 | @SerialVersionUID(3754315949719248198L)
35 | case class ComplexCategory(override val id:Int,
36 | left:Category, right:Category,
37 | slash:Slash) extends Category {
38 | def toStringChild(child:Category) = child match {
39 | case AtomicCategory(_,_,_) => child.toString
40 | case ComplexCategory(_,_,_,_) => "(" + child.toString + ")"
41 | }
42 | override def toString = toStringChild(left) + slash + toStringChild(right)
43 |
44 | def toStringChildNoFeature(child:Category) = child match {
45 | case AtomicCategory(_,_,_) => child.toStringNoFeature
46 | case ComplexCategory(_,_,_,_) => "(" + child.toStringNoFeature + ")"
47 | }
48 | override def toStringNoFeature = toStringChildNoFeature(left) + slash + toStringChildNoFeature(right)
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/CategoryManager.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.collection.mutable.HashMap
20 | import scala.collection.mutable.ArrayBuffer
21 |
22 | class CategoryManager extends StringBaseNumberedManager[Category] with OptionReturner[Category] {
23 | override def createWithId(original:Category): Category = original match {
24 | case AtomicCategory(id, base, avm) => AtomicCategory(newId, base, avm)
25 | case ComplexCategory(id, left, right, slash) =>
26 | val leftWithId = assignID(left)
27 | val rightWithId = assignID(right)
28 | ComplexCategory(newId, leftWithId, rightWithId, slash)
29 | }
30 | override def getOrNone(str:String): Option[Category] = str2objIndex.get(str) match {
31 | case Some(i) => Some(objects(i))
32 | case None => canonicalMap.get(createCanonicalInstance(str))
33 | }
34 |
35 | override def createCanonicalInstance(str:String): Category = JapaneseCategoryParser.parse(str)
36 |
37 |   // This is used when the set of candidate categories for a shift action is empty.
38 |   // That sometimes happens when, for example, a PoS not registered in the dictionary is encountered.
39 | val unkCategory = getOrCreate("UNK")
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/CategoryTree.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 | import Slash._
19 |
20 | case class CategoryTree(var surface:String, slash:Slash, left:CategoryTree, right:CategoryTree) {
21 | def isLeaf = left == null && right == null
22 | def setSurface:CategoryTree = {
23 | def childSurface(child:CategoryTree) =
24 | if (child.isLeaf) child.surface else '(' + child.surface + ')'
25 |
26 | if (isLeaf) assert(surface != null)
27 | else surface = childSurface(left) + slash + childSurface(right)
28 | this
29 | }
30 | def foreachLeaf(f:CategoryTree=>Any):Unit = {
31 | if (isLeaf) f(this)
32 | else List(left,right).foreach(_.foreachLeaf(f))
33 | }
34 | }
35 |
36 | object CategoryTree {
37 | def createLeaf(surface:String) = CategoryTree(surface, null, null, null)
38 | def createInternal(slash:Slash, left:CategoryTree , right:CategoryTree) =
39 | CategoryTree(null, slash, left, right)
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/Direction.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | object Direction extends Enumeration {
20 | type Direction = Value; val Left, Right = Value
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/MecabReader.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.io.Source
20 | import scala.collection.mutable.ArrayBuffer
21 |
 22 | /** Reads the output of MeCab run with the -Ochasen option.
23 | */
24 | class MecabReader(dict:Dictionary) {
25 | def toPoSTaggedSentence(lines:Seq[String]) = {
26 | val terminalSeq = lines.map { line =>
27 | val splitted = line.split('\t')
28 | val word = dict.getWordOrCreate(splitted(0))
29 | val base = dict.getWordOrCreate(splitted(2))
30 |
31 | val conjStr = if (splitted.size > 6) splitted(5) else "_"
32 | val posStr = splitted(3) + "/" + conjStr
33 |
34 | val pos = dict.getPoSOrCreate(posStr)
35 | (word, base, pos)
36 | }
37 | new PoSTaggedSentence(
38 | terminalSeq.map(_._1),
39 | terminalSeq.map(_._2),
40 | terminalSeq.map(_._3))
41 | }
42 | def readSentences(in:Source, n:Int): Array[PoSTaggedSentence] = {
43 | val sentences = new ArrayBuffer[PoSTaggedSentence]
44 |
45 | val sentenceLines = new ArrayBuffer[String]
46 |
47 | takeLines(in, n).foreach { _ match {
48 | case "EOS" =>
49 | sentences += toPoSTaggedSentence(sentenceLines)
50 | sentenceLines.clear
51 | case line =>
52 | sentenceLines += line
53 | }}
54 | sentences.toArray
55 | }
56 | def readSentences(path:String, n:Int): Array[PoSTaggedSentence] =
57 | readSentences(Source.fromFile(path), n)
58 | def takeLines(in:Source, n:Int): Iterator[String] =
59 | for (line <- in.getLines.filter(_!="") match {
60 | case lines if (n == -1) => lines
61 | case lines => lines.take(n) }) yield line
62 |
63 | }
64 |
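A hedged usage sketch for the MecabReader listed above (not part of the file): with n = -1 it reads every sentence in the `mecab -Ochasen` output; the Dictionary instance and the path are assumed to come from elsewhere.

    import jigg.nlp.ccg.lexicon.{Dictionary, MecabReader, PoSTaggedSentence}

    // Sketch only: `dict` is any Dictionary built elsewhere; the path is hypothetical.
    def loadAll(dict: Dictionary, path: String): Array[PoSTaggedSentence] =
      new MecabReader(dict).readSentences(path, -1)  // n = -1 means "take all sentences"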
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/Numbered.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | trait Numbered[T] {
20 | def id:Int
21 | def v:T
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/PoS.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | /**
 20 |  * Internal representation of Part-of-Speech.
 21 |  * The trait provides some methods for accessing the information, which might be used in some languages.
 22 |  * For example, hierar is a sequence of FineTag, which is assumed to represent the hierarchy of that PoS.
 23 |  * To enable using these different types of tags transparently (useful in, e.g., feature extraction), a Conjugation or FineTag is itself also a PoS.
 24 |  * WARNING: all PoS objects must have unique ids to be distinguished, so it is assumed that the surface forms of conj, hierar, and the PoS itself (full surface) are disjoint; if, for example, a FineTag has the same surface as a Conjugation, the dictionary discards the latter. One solution to this problem is to add a marker to each type of PoS, e.g., a suffix 'F' to all FineTag instances when drawing from or inserting into the dictionary.
25 | */
26 | sealed trait PoS extends Numbered[String] {
27 | def conj:PoS = sys.error("conj is not defined in this PoS class.")
28 | def hierar:Seq[PoS] = sys.error("hierar is not defined in this PoS class.")
29 | def hierarConj:Seq[PoS] = sys.error("hierarConj is not defined in this PoS class.")
30 | def first = hierar(0)
31 | def second = if (hierar.size < 2) first else hierar(1)
32 | def third = if (hierar.size < 3) second else hierar(2)
33 |
34 | def firstWithConj = hierarConj(0)
35 | def secondWithConj = if (hierarConj.size < 2) firstWithConj else hierarConj(1)
36 | def thirdWithConj = if (hierarConj.size < 3) secondWithConj else hierarConj(2)
37 | }
38 | trait OptionalPoS extends PoS
39 | trait MainPoS extends PoS
40 |
41 | case class Conjugation(override val id:Int, override val v:String) extends OptionalPoS {
42 | override def toString = v
43 | }
44 | case class FineTag(override val id:Int, override val v:String) extends OptionalPoS {
45 | override def toString = v
46 | }
47 | case class FineWithConjugation(override val id:Int, override val v:String) extends OptionalPoS {
48 | override def toString = v
49 | }
50 | case class SimplePoS(override val id:Int, override val v:String) extends MainPoS {
51 | override def toString = v
52 | }
53 | case class JapanesePoS(override val id:Int,
54 | override val v:String,
55 | override val conj:PoS,
56 | override val hierar:Seq[PoS],
57 | override val hierarConj:Seq[PoS]) extends MainPoS {
58 | override def toString = v
59 | }
60 |
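A hedged illustration of the fallback accessors documented above; the ids and strings below are made up (normally the dictionary assigns them), and only show that `second` falls back to `first` when the hierarchy has a single level.

    import jigg.nlp.ccg.lexicon._

    // Sketch only: hand-built instances, not how the dictionary creates them.
    val conj = Conjugation(1, "基本形")
    val fine = FineTag(2, "一般")
    val pos  = JapanesePoS(3, "名詞-一般/基本形", conj,
                           hierar = Seq(fine),
                           hierarConj = Seq(FineWithConjugation(4, "一般/基本形")))
    assert(pos.first == fine)
    assert(pos.second == fine)  // hierar.size < 2, so second falls back to first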
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/SimpleDictionary.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | class SimpleDictionary extends Dictionary(new Word2CategoryDictionary) {
20 | override val posManager = new PoSManager {
21 | def createWithId(original: PoS) = SimplePoS(newId, original.v)
22 | def createCanonicalInstance(str:String) = SimplePoS(0, str)
23 | }
24 | override val categoryManager = new CategoryManager {
25 | override def createCanonicalInstance(str: String): Category = EnglishCategoryParser.parse(str)
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/Slash.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | object Slash extends Enumeration {
20 | type Slash = Value
21 | val Left = Value("\\")
22 | val Right = Value("/")
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/lexicon/Word.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | trait Word extends Numbered[String] {
 20 |   // additional information is defined as a method; subclasses may or may not override it with a val
21 | def classId:Int = throw new RuntimeException("classId is not defined in this Word class.")
22 | def assignClass(classId:Int):Word = this // default do nothing
 23 |   // some morphological information extracted from the surface form might be included here (e.g., for morphologically rich languages)
24 | }
25 |
26 | case class SimpleWord(override val id:Int, override val v:String) extends Word {
27 | override def assignClass(classId:Int) = ClassedWord(id, v, classId)
28 | override def toString = v
29 | }
30 | case class ClassedWord(override val id:Int,
31 | override val v:String,
32 | override val classId:Int) extends Word {
33 | override def assignClass(classId:Int) = ClassedWord(id, v, classId)
34 | override def toString = v + "[" + classId + "]"
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/package.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | package object ccg {
20 | type WeightVec = jigg.ml.WeightVector[Float]
21 | }
22 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/parser/Action.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.parser
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import jigg.nlp.ccg.lexicon.{Category, Dictionary}
20 | import jigg.nlp.ccg.lexicon.Direction.Direction
21 |
22 | /**
 23 |  * An action and its corresponding label; for speed reasons, a label should not hold an actual object such as a category, so we convert an Action object into the corresponding ActionLabel object when filling feature templates.
24 | */
25 | sealed trait Action { def toLabel:ActionLabel }
26 | sealed trait ActionLabel {
27 | def mkString(dict:Dictionary):String
28 | }
29 |
 30 | // shift the head of the buffer, assigning it the category identified by categoryId
31 | case class Shift(category:Category) extends Action { override def toLabel = ShiftLabel(category.id) }
32 |
33 | @SerialVersionUID(-6619103978469031483L)
34 | case class ShiftLabel(id:Int) extends ActionLabel {
35 | override def mkString(dict:Dictionary) = "SHIFT(" + dict.getCategory(id) + ")"
36 | }
37 |
 38 | // combine the two top nodes on the stack into a node with this categoryId
39 | case class Combine(category:Category, headDir:Direction, ruleType:String) extends Action { override def toLabel = CombineLabel(category.id) }
40 |
41 | @SerialVersionUID(-1350486416817206332L)
42 | case class CombineLabel(id:Int) extends ActionLabel {
43 | override def mkString(dict:Dictionary) = "COMBINE(" + dict.getCategory(id) + ")"
44 | }
45 |
46 | // unary change to a node with categoryId
47 | case class Unary(category:Category, ruleType:String) extends Action { override def toLabel = UnaryLabel(category.id) }
48 |
49 | @SerialVersionUID(-3492899016953622825L)
50 | case class UnaryLabel(id:Int) extends ActionLabel {
51 | def mkString(dict:Dictionary) = "UNARY(" + dict.getCategory(id) + ")"
52 | }
53 |
54 | case class Finish() extends Action { override def toLabel = FinishLabel() }
55 |
56 | @SerialVersionUID(-6536578690403443069L)
57 | case class FinishLabel() extends ActionLabel {
58 | def mkString(dict:Dictionary) = "FINISH"
59 | }
60 |
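A hedged sketch of the Action-to-label conversion described in the comment above: an Action carries rich objects such as Category, while its label keeps only an id that the Dictionary can render back into a readable string.

    import jigg.nlp.ccg.parser.Shift
    import jigg.nlp.ccg.lexicon.{Category, Dictionary}

    // Sketch only: `cat` and `dict` are assumed to come from an existing model.
    def shiftLabelString(cat: Category, dict: Dictionary): String =
      Shift(cat).toLabel.mkString(dict)  // e.g. "SHIFT(NP)" for an NP-like category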
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/parser/HeadFinder.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.parser
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.collection.mutable.HashMap
20 | import jigg.nlp.ccg.lexicon.{PoS, JapanesePoS, Category}
21 | import jigg.nlp.ccg.lexicon.Direction._
22 |
23 | trait HeadFinder extends Serializable {
24 | type NodeInfo = HeadFinder.NodeInfo
25 | def get(left:NodeInfo, right:NodeInfo): Direction
26 | }
27 | object HeadFinder {
28 | case class NodeInfo(pos:PoS, category:Category, headCategory:Category)
29 | }
30 |
31 | case class EnglishHeadFinder(children2dir: Map[(Int, Int), Direction]) extends HeadFinder {
32 | def get(left:NodeInfo, right:NodeInfo) =
33 | children2dir.get(left.category.id, right.category.id) match {
34 | case Some(dir) => dir
35 | case _ => Left
36 | }
37 | }
38 |
39 | object EnglishHeadFinder {
40 | import jigg.nlp.ccg.lexicon.{ParseTree, NodeLabel, BinaryTree, NonterminalLabel}
41 | def createFromParseTrees(trees: Seq[ParseTree[NodeLabel]]): EnglishHeadFinder = {
42 | val map = new HashMap[(Int, Int), Direction]
43 | trees.foreach { _.foreachTree { _ match {
44 | case BinaryTree(left, right, NonterminalLabel(dir, _, _)) =>
45 | map += (left.label.category.id, right.label.category.id) -> dir
46 | case _ =>
47 | }}}
48 | EnglishHeadFinder(map.toMap)
49 | }
50 | }
51 |
52 | object JapaneseHeadFinder extends HeadFinder {
53 | val Symbol = "記号"
54 | def get(left:NodeInfo, right:NodeInfo) = {
55 | val leftPos = left.pos.first.v
56 | val rightPos = right.pos.first.v
57 | if (rightPos == Symbol) Left else Right
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/parser/KBestDecoder.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.parser
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import jigg.nlp.ccg.lexicon.{Derivation, CandAssignedSentence}
20 |
21 | case class WrappedAction(v: Action, isGold:Boolean, partialFeatures:LabeledFeatures = LabeledFeatures())
22 |
23 | case class StatePath(state:State, waction: WrappedAction, prev: Option[StatePath] = None, score:Double = 0) {
24 | def actionPath = expand.map(_.waction)
25 | def expand = expandRecur(Nil)
26 | private def expandRecur(seq: List[StatePath]): List[StatePath] = prev match {
27 | case None => seq // always ignoring the initial state
28 | case Some(prev) => prev.expandRecur(this :: seq)
29 | }
30 | def lighten = this.copy(waction = waction.copy(partialFeatures = LabeledFeatures()))
31 | }
32 |
33 | trait KBestDecoder {
34 |
35 | trait ACandidate {
36 | def path: StatePath
37 | def score: Double
38 | def isConnected: Boolean = path.state.s1 == None
39 | }
40 |
41 | val comparePreferringConnected: (ACandidate, ACandidate) => Boolean = {
42 | case (a, b) if a.isConnected && !b.isConnected => true
43 | case (a, b) if !a.isConnected && b.isConnected => false
44 | case (a, b) => a.score > b.score
45 | }
46 |
47 | def search(sentence: CandAssignedSentence): Seq[ACandidate]
48 |
49 | def predict(sentence: CandAssignedSentence): (Derivation, Double) = {
50 | val c = search(sentence).sortWith(_.score > _.score)(0)
51 | (c.path.state.toDerivation, c.score)
52 | }
53 |
 54 |   /** If a fully connected tree is found, return the one with the maximum score; otherwise return the unconnected tree with the maximum score.
55 | */
56 | def predictConnected(sentence: CandAssignedSentence): (Derivation, Double) = {
57 | val c = search(sentence).sortWith(comparePreferringConnected)(0)
58 | (c.path.state.toDerivation, c.score)
59 | }
60 |
61 | /** Return k-best trees according to the final state score.
62 | *
 63 |    * @param preferConnected if true, fully connected trees are placed before unconnected ones even if they do not have the maximum score.
64 | */
65 | def predictKbest(k: Int, sentence: CandAssignedSentence, preferConnected: Boolean = false): Seq[(Derivation, Double)] = {
66 | val sorted = preferConnected match {
67 | case true => search(sentence).sortWith(comparePreferringConnected)
68 | case false => search(sentence).sortWith(_.score > _.score)
69 | }
70 | sorted.take(k) map { c => (c.path.state.toDerivation, c.score) }
71 | }
72 | }
73 |
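A hedged usage sketch of predictKbest above: ask a concrete decoder for its three best derivations, placing fully connected trees first.

    import jigg.nlp.ccg.parser.KBestDecoder
    import jigg.nlp.ccg.lexicon.{CandAssignedSentence, Derivation}

    // Sketch only: `decoder` and `sentence` are assumed to exist already.
    def top3(decoder: KBestDecoder, sentence: CandAssignedSentence): Seq[(Derivation, Double)] =
      decoder.predictKbest(3, sentence, preferConnected = true)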
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/parser/Rule.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.parser
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import jigg.nlp.ccg.lexicon.{Category, Derivation, Point, UnaryChildPoint, BinaryChildrenPoints, AppliedRule}
20 |
21 | import scala.collection.mutable.{HashMap, HashSet}
22 | import java.io.{ObjectOutputStream, ObjectInputStream}
23 |
24 | trait Rule {
25 | def unify(left:Category, right:Category): Option[Array[(Category, String)]]
26 | def raise(child:Category): Option[Array[(Category, String)]]
27 | def headFinder:HeadFinder
28 | }
29 |
30 | // rules are restricted to CFG rules extracted from the training CCGBank
31 | case class CFGRule(val binaryRules:Map[(Int,Int), Array[(Category, String)]], // category ids -> (category, ruleType)
32 | val unaryRules:Map[Int, Array[(Category, String)]],
33 | override val headFinder:HeadFinder) extends Rule {
34 | def unify(left:Category, right:Category):Option[Array[(Category, String)]] = binaryRules.get((left.id, right.id))
35 | def raise(child:Category):Option[Array[(Category, String)]] = unaryRules.get(child.id)
36 | }
37 |
38 | object CFGRule {
39 | def extractRulesFromDerivations(derivations: Array[Derivation], headFinder:HeadFinder): CFGRule = {
40 | val binaryRules = new HashMap[(Int, Int), HashSet[(Category, String)]]
41 | val unaryRules = new HashMap[Int, HashSet[(Category, String)]]
42 |
43 | derivations.foreach { deriv =>
44 | deriv.foreachPoint({ point:Point => deriv.get(point) match {
45 | case Some(AppliedRule(UnaryChildPoint(child), ruleType)) =>
46 | val parents = unaryRules.getOrElseUpdate(child.category.id, new HashSet[(Category, String)])
47 | parents += ((point.category, ruleType))
48 | case Some(AppliedRule(BinaryChildrenPoints(left, right), ruleType)) =>
49 | val parents = binaryRules.getOrElseUpdate((left.category.id, right.category.id), new HashSet[(Category, String)])
50 | parents += ((point.category, ruleType))
51 | case _ =>
52 | }})
53 | }
54 | new CFGRule(binaryRules.map { case (k, v) => k -> v.toArray }.toMap,
55 | unaryRules.map { case (k, v) => k -> v.toArray }.toMap,
56 | headFinder)
57 | }
58 | }
59 |
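A hedged sketch of how a CFGRule is queried: after extractRulesFromDerivations, unify is a pure table lookup keyed by the two children's category ids, and None means that pair of categories was never observed in training.

    import jigg.nlp.ccg.parser.CFGRule
    import jigg.nlp.ccg.lexicon.Category

    // Sketch only: a small helper that flattens the Option away.
    def observedParents(rule: CFGRule, left: Category, right: Category): Array[(Category, String)] =
      rule.unify(left, right).getOrElse(Array.empty)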
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/parser/package.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | package object parser {
20 | type UF = ShiftReduceUnlabeledFeature
21 | type LF = ShiftReduceFeature
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/tagger/UserDefinedFeatureExtractors.scala:
--------------------------------------------------------------------------------
1 | // package jigg.nlp.ccg.tagger
2 |
3 | // import jigg.nlp.ccg.lexicon.{Dictionary, JapaneseDictionary}
4 |
5 | // import scala.collection.mutable.ArrayBuffer
6 |
 7 | // // this is an example of defining new features and the extractor that extracts those features
8 |
9 | // object NewTemplate extends Enumeration {
10 | // type NewTemplate = Value
11 | // val w_p = Value
12 | // }
13 |
14 | // case class UnigramWordPoSFeature[T](word:Int, pos:Int, tmpl:T) extends FeatureOnDictionary {
15 | // override def mkString(dict:Dictionary) = concat(tmpl, dict.getWord(word))
16 | // }
17 |
18 | // class UnigramSecondLevelFineExtractor(val windowSize:Int) extends FeatureExtractor {
19 | // def addFeatures(c:Context, features:ArrayBuffer[UF]) = {
20 | // features += UnigramWordPoSFeature(c.word(0), c.pos(0), NewTemplate.w_p)
21 | // }
22 | // }
23 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/nlp/ccg/tagger/package.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | package object tagger {
20 | type UF = SuperTaggingUnlabeledFeature
21 | type LF = SuperTaggingFeature
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/Annotation.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 |
 20 | /** Currently, this class is used to assign a unique id
 21 |   * to each annotation.
22 | */
23 | abstract class Annotation(val idPrefix: String) {
24 | val idGen = jigg.util.IDGenerator(idPrefix)
25 | def nextId: String = idGen.next
26 | }
27 |
28 | object Annotation {
29 |
30 | object Document extends Annotation("d")
31 |
32 | object Sentence extends Annotation("s")
33 |
34 | object Token extends Annotation("t")
35 |
36 | object Dependency extends Annotation("dep")
37 |
38 | object CCG extends Annotation("ccg")
39 |
40 | object NE extends Annotation("ne")
41 |
42 | object Mention extends Annotation("me")
43 |
44 | object Coreference extends Annotation("cr")
45 |
46 | object PredArg extends Annotation("pa")
47 |
48 | object ParseSpan extends Annotation("sp")
49 | object CCGSpan extends Annotation("ccgsp")
50 |
51 | object Chunk extends Annotation("ch")
52 | }
53 |
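A hedged sketch of the id assignment above: each Annotation object hands out ids with its own prefix, backed by the IDGenerator defined in jigg/util/IDGenerator.scala.

    import jigg.pipeline.Annotation

    val sid0 = Annotation.Sentence.nextId  // "s0" on the first call
    val sid1 = Annotation.Sentence.nextId  // "s1" on the next call
    val tid0 = Annotation.Token.nextId     // "t0"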
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/AnnotationError.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | class AnnotationError(msg: String) extends RuntimeException(msg)
20 |
21 | class ProcessError(msg: String) extends AnnotationError(msg)
22 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/ArgumentError.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | class ArgumentError(msg: String) extends RuntimeException(msg)
20 |
21 | class RequirementError(msg: String) extends RuntimeException(msg)
22 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/DocumentAnnotator.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2017 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.xml.{Elem, Node}
20 | import jigg.util.XMLUtil.RichNode
21 |
22 | /** A trait for an annotator which modifies a document node. Use this trait if an annotator
23 | * is a document-level annotator.
24 | */
25 | trait DocumentAnnotator extends Annotator {
26 | override def annotate(annotation: Node): Node = {
27 |
28 | annotation.replaceAll("root") { case e: Elem =>
29 | val newChild = Annotator.makePar(e.child, nThreads).map { c =>
30 | c match {
31 | case c if c.label == "document" =>
32 | try newDocumentAnnotation(c) catch {
33 | case e: AnnotationError =>
34 | System.err.println(s"Failed to annotate a document by $name.")
35 | Annotator.annotateError(c, name, e)
36 | }
37 | case c => c
38 | }
39 | }.seq
40 | e.copy(child = newChild)
41 | }
42 | }
43 |
44 | def newDocumentAnnotation(sentence: Node): Node
45 | }
46 |
47 | trait SeqDocumentAnnotator extends DocumentAnnotator {
48 | override def nThreads = 1
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/RegexDocumentAnnotator.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2016 Takafumi Sakakibara and Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import scala.xml.Node
21 |
22 | class RegexDocumentAnnotator(override val name: String, override val props: Properties) extends Annotator {
23 |
24 | @Prop(gloss = "Regular expression to segment documents") var pattern = """\n{2,}"""
25 | readProps()
26 |
27 | private[this] val documentIDGen = jigg.util.IDGenerator("d")
28 | override def annotate(annotation: Node): Node = {
29 | val raw = annotation.text
30 |
31 | var offset = 0
32 |
 33 |     val documents = raw.split(pattern).map { str =>
 34 |       val n =
 35 |         <document
 36 |           id={ documentIDGen.next() }
 37 |           characterOffsetBegin={ offset+"" }
 38 |           characterOffsetEnd={ (offset + str.size)+"" }>{ str }</document>
 39 |       offset += str.size
 40 |       n
 41 |     }
 42 | 
 43 |     <root>{ documents }</root>
44 | }
45 |
46 | override def requires = Set()
47 | override def requirementsSatisfied = Set(Requirement.Dsplit)
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/SentencesAnnotator.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2017 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.xml.{Elem, Node}
20 | import jigg.util.XMLUtil.RichNode
21 |
22 | /** A trait for an annotator which modifies a sentence node.
23 | *
 24 |   * If an annotator is a sentence-level annotator such as a parser or POS tagger, it should
 25 |   * extend this trait; usually all you need to do is implement the
 26 |   * newSentenceAnnotation method, which rewrites a sentence node and returns a new one.
 27 |   *
 28 |   * This annotates the given sentences in parallel. If you want to avoid this, perhaps
 29 |   * because the annotator is not thread-safe, use [[jigg.pipeline.SeqSentencesAnnotator]]
 30 |   * instead, which annotates sequentially.
31 | */
32 | trait SentencesAnnotator extends Annotator {
33 | def annotate(annotation: Node): Node = {
34 |
35 | annotation.replaceAll("sentences") { case e: Elem =>
36 | val annotatedChild = Annotator.makePar(e.child, nThreads).map {
37 | case s if s.label == "sentence" =>
38 | try newSentenceAnnotation(s) catch {
39 | case e: AnnotationError =>
40 | System.err.println(s"Failed to annotate a document by $name.")
41 | Annotator.annotateError(s, name, e)
42 | }
43 | case s => s
44 | }.seq
45 | e.copy(child = annotatedChild)
46 | }
47 | }
48 |
49 | def newSentenceAnnotation(sentence: Node): Node
50 | }
51 |
52 | /** This trait annotates the inputs sequentially.
53 | */
54 | trait SeqSentencesAnnotator extends SentencesAnnotator {
55 | override def nThreads = 1
56 | }
57 |
58 |
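A minimal hedged sketch of a sentence-level annotator as described above: it leaves every sentence node untouched, follows the (name, props) constructor pattern used by the other annotators in this repository, and assumes the remaining Annotator members have suitable defaults.

    import java.util.Properties
    import scala.xml.Node
    import jigg.pipeline.{SentencesAnnotator, Requirement}

    class NoOpSentencesAnnotator(override val name: String, override val props: Properties)
      extends SentencesAnnotator {
      override def newSentenceAnnotation(sentence: Node): Node = sentence
      override def requires = Set(Requirement.Ssplit)
      override def requirementsSatisfied = Set()
    }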
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/SimpleKNPAnnotator.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2015 Takafumi Sakakibara and Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import scala.xml._
21 |
22 | class SimpleKNPAnnotator(override val name: String, override val props: Properties)
23 | extends KNPAnnotator with AnnotatingSentencesInParallel { self=>
24 |
25 | @Prop(gloss = "Use this command to launch KNP (-tab is automatically added. -anaphora is not compatible with this annotator. In that case, use knpDoc instead). Version >= 4.12 is assumed.") var command = "knp"
26 | readProps()
27 |
28 | localAnnotators // instantiate lazy val here
29 |
30 | def mkLocalAnnotator = new SimpleKNPLocalAnnotator
31 |
32 | class SimpleKNPLocalAnnotator
33 | extends SentencesAnnotator with LocalAnnotator with BaseKNPLocalAnnotator {
34 | override def defaultArgs = Seq("-tab")
35 |
36 | val knp = mkIO()
37 |
38 | override def newSentenceAnnotation(sentence: Node): Node = {
39 | val sentenceId = (sentence \ "@id").toString
40 |
41 | val knpResult = runKNP(sentence, None)
42 | annotateSentenceNode(sentence, knpResult, sentenceId, _ => sentenceId)
43 | }
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/SpaceTokenizerAnnotator.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 |
21 | import scala.xml.{Node, Elem, Text, Atom}
22 | import jigg.util.XMLUtil.RichNode
23 |
24 | /** This simple annotator just segments a sentence by spaces, i.e.,
25 | * assuming the input sentence is already correctly tokenized.
26 | */
27 | class SpaceTokenizerAnnotator(override val name: String, override val props: Properties)
28 | extends SentencesAnnotator {
29 |
30 | override def newSentenceAnnotation(sentence: Node): Node = {
31 |
32 | val sindex = sentence \@ "id"
33 | val text = sentence.text
34 | val range = (0 until text.size)
35 |
36 | def isSpace(c: Char) = c == ' ' || c == '\t'
37 |
38 | val begins = 0 +: (1 until text.size).filter { i => isSpace(text(i-1)) && !isSpace(text(i)) }
39 |
40 | val ends = begins map {
41 | range indexWhere (i=>isSpace(text(i)), _) match {
42 | case -1 => text.size
43 | case e => e
44 | }
45 | }
46 |
 47 |     val tokenSeq = begins.zip(ends).zipWithIndex map { case ((b, e), i) =>
 48 |       <token
 49 |         id={ sindex + "_" + i }
 50 |         form={ text.substring(b, e) }
 51 |         characterOffsetBegin={ b+"" }
 52 |         characterOffsetEnd={ e+"" }/>
 53 |     }
 54 |     val tokens = <tokens annotators={ name }>{ tokenSeq }</tokens>
 55 |     sentence addChild tokens
56 | }
57 |
58 | override def requires = Set(Requirement.Ssplit)
59 | override def requirementsSatisfied = Set(Requirement.Tokenize)
60 | }
61 |
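A hedged worked example of the span computation in newSentenceAnnotation above, on an illustrative input that is not from the source:

    // text   = "a  bb\tc"
    // begins = Seq(0, 3, 6)   // index 0 plus every non-space character preceded by a space
    // ends   = Seq(1, 5, 7)   // the next space index, or text.size when none remains
    // tokens = "a", "bb", "c"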
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/SystemDict.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | sealed trait SystemDic
20 |
21 | object SystemDic {
22 | case object ipadic extends SystemDic
23 | case object jumandic extends SystemDic
24 | case object unidic extends SystemDic
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/pipeline/UnmanagedAnnotators.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /** A singleton managing the collection of `UnmanagedAnnotator`.
4 | *
  5 |   * See the documentation of `UnmanagedAnnotator` for its role. `list` is the essential object,
  6 |   * which holds the mapping from an annotator name to an `UnmanagedAnnotator`. If you
7 | * want to support a new annotator that depends on an unmanaged library, add it to the
8 | * `list`.
9 | */
10 | object UnmanagedAnnotators {
11 |
 12 |   /** Information about an annotator that wraps software which runs on the JVM but is not
 13 |     * included as a managed library via Maven.
 14 |     *
 15 |     * When assembling, such external unmanaged jars are not included, so a user has to
 16 |     * explicitly add them to the class path. Each UnmanagedAnnotator object helps to
 17 |     * describe how to use it. For example, its default message, implemented in
 18 |     * `DefaultUnmanagedAnnotator`, tells the URL of the library jar file.
 19 |     */
19 | */
20 | trait UnmanagedAnnotator[A] {
21 | def name: String
22 | def clazz: Class[A]
23 |
24 | def msg: String
25 | }
26 |
27 | case class DefaultUnmanagedAnnotator[A](
28 | val name: String, val clazz: Class[A], url: String) extends UnmanagedAnnotator[A] {
29 |
30 | def msg = s"""Failed to launch $name. Maybe the necessary jar file is not included in
31 | the current class path. This might be solved by adding jar/* into your class path,
 32 |   e.g., call jigg like:
 33 | 
 34 |     > java -cp "jigg-xxx.jar:jar/*" jigg.pipeline.Pipeline ...
 35 | 
 36 |   If the error still remains, the necessary jar file is missing. You can download it
 37 |   from ${url}. Try e.g.,
 38 | 
 39 |     > wget -P jar/ $url
 40 | 
 41 |   and run the above command again.
42 | """
43 | }
44 |
45 | val list = Map(
46 | "easyccg" -> DefaultUnmanagedAnnotator(
47 | "easyccg",
48 | classOf[EasyCCGAnnotator],
49 | "https://github.com/mikelewis0/easyccg/raw/master/easyccg.jar"))
50 | }
51 |
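A hedged sketch of what registering a new unmanaged annotator would look like, following the doc comment above; the name, class, and URL below are entirely hypothetical:

    // "someparser" -> DefaultUnmanagedAnnotator(
    //   "someparser", classOf[SomeParserAnnotator],
    //   "https://example.com/someparser.jar")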
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/ArgumentsParser.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 |
21 | object ArgumentsParser {
22 | def parse(args: List[String]): Properties = parseRecur(new Properties, args)
23 |
24 | private def parseRecur(props: Properties, args: List[String]): Properties = args match {
25 | case ArgKey(key) :: next => next match {
26 | case ArgKey(nextKey) :: tail => // -key1 -key2 ... => key1 is boolean value
27 | putTrue(props, key)
28 | parseRecur(props, next)
29 | case value :: tail =>
30 | key match {
31 | case "props" => props.load(jigg.util.IOUtil.openIn(value))
32 | case _ => props.put(key, value)
33 | }
34 | parseRecur(props, tail)
35 | case Nil =>
36 | putTrue(props, key)
37 | parseRecur(props, next)
38 | }
39 | case _ => props
40 | }
41 | def putTrue(props: Properties, key: String) = props.put(key, "true")
42 |
43 | object ArgKey {
44 | def unapply(key: String): Option[String] = key match {
 45 |       case x if x.size > 1 && x(0) == '-' && x.drop(1).forall(x=>x.isDigit || x=='.') => None // -10.0, -1, etc. are not keys
46 | case x if x.size > 1 && x(0) == '-' && x(1) == '-' => Some(x.substring(2))
 47 |       case x if x.size > 1 && x(0) == '-' => Some(x.substring(1)) // we don't match when x.size == 1 ('-' alone is treated as a value)
48 | case _ => None
49 | }
50 | }
51 | }
52 |
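A hedged usage sketch of the argument parsing above (the keys are only illustrative): a key followed by a value is stored as-is, and a key followed by another key, or by nothing, becomes the boolean flag "true".

    import jigg.util.ArgumentsParser

    val props = ArgumentsParser.parse(List("-annotators", "ssplit,mecab", "-help"))
    // props.getProperty("annotators") == "ssplit,mecab"
    // props.getProperty("help")       == "true"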
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/HDF5Object.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
 10 |     http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
 16 |   limitations under the License.
17 | */
18 |
19 | import ucar.nc2.{Attribute, Group, NetcdfFile}
20 |
21 | class HDF5Object(rootGroup: Group) {
22 |
23 | def checkAndGetAttribute(name: String): Attribute = Option(rootGroup.findAttribute(name)) match {
24 | case Some(x) => x
25 | case None => throw new IllegalArgumentException("cannot get " + name + " attribute from input model file")
26 | }
27 |
28 | def checkAndGetGroup(name: String): Group = Option(rootGroup.findGroup(name)) match {
29 | case Some(x) => x
30 | case None => throw new IllegalArgumentException("cannot get " + name + " group from input model file")
31 | }
32 |
33 | }
34 |
35 | object HDF5Object {
36 |
37 | // Load from a path on the file system
38 | def fromFile(path: String): HDF5Object = {
39 | val file = NetcdfFile.open(path, null)
40 | mkObj(file)
41 | }
42 |
43 | // Load from class loader
44 | def fromResource(path: String): HDF5Object = {
45 | val file =
46 | NetcdfFile.openInMemory(IOUtil.findResource(path).toURI)
47 | mkObj(file)
48 | }
49 |
50 | private def mkObj(file: NetcdfFile) = {
51 | val group = file.getRootGroup
52 | new HDF5Object(group)
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/IDGenerator.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | // trait IDGeneratorBase {
20 | // def next(): String
21 | // }
22 |
23 | // case class IDGenerator(prefix: String) extends IDGeneratorBase {
24 | // private[this] val stream = Stream.from(0).iterator
25 | // def next() = prefix + stream.next
26 | // }
27 |
28 | case class IDGenerator(toId: Int=>String) {
29 | private[this] var stream = Stream.from(0).iterator
30 | def next() = toId(stream.next)
31 | def reset() = stream = Stream.from(0).iterator
32 | }
33 |
34 | object IDGenerator {
35 | def apply(prefix: String): IDGenerator = IDGenerator(prefix + _)
36 | }
37 |
38 | /** Not thread-safe but little overhead
39 | */
40 | case class LocalIDGenerator(toId: Int=>String) {
41 | var i = 0
42 | def next() = {
43 | val n = toId(i)
44 | i += 1
45 | n
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/LogUtil.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | object LogUtil {
20 | /** A helper to measure time.
21 | * If multiple commands are nested, use multipleTrack.
22 | *
23 | * TODO: Integrate track and multipleTrack to automatically choose indent and appropriate format.
24 | * Currently track[A](beginMessage: String, ...) "manually" handles the indent level.
25 | */
26 | def track[A](message: String)(body: => A): A = {
27 | // System.out.print(message)
28 | // val (result, time) = recordTime { body }
29 | // System.out.println("done [%.1f sec]".format(time))
30 | // result
31 | track(message, "done", 0) { body }
32 | }
33 |
34 | def multipleTrack[A](message: String)(body: => A): A = {
35 | // System.out.println("{ " + message)
36 | // val (result, time) = recordTime { body }
37 | // System.out.println("} [%.1f sec]".format(time))
38 | // result
39 | track(message + " {\n", "}", 0) { body }
40 | }
41 |
42 | def track[A](beginMessage: String, endMessage: String, indent: Int)(body: => A): A = {
43 | def print(raw: String) = {
44 | (0 until indent) foreach { _ => System.out.print(" ") }
45 | System.out.print(raw)
46 | }
47 | print(beginMessage)
48 | val (result, time) = recordTime { body }
49 | System.out.println(endMessage + " [%.1f sec]".format(time))
50 | result
51 | }
52 |
53 | def recordTime[A](body: => A): (A, Double) = {
54 | val before = System.currentTimeMillis
55 | val result = body
56 | val time = (System.currentTimeMillis - before).toDouble / 1000.0
57 | (result, time)
58 | }
59 | }
60 |
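A hedged usage sketch of the timing helper above: the message is printed, the body is evaluated, its result is returned, and "done [x.x sec]" is appended with the elapsed time.

    import jigg.util.LogUtil

    val total = LogUtil.track("summing... ") { (1 to 1000000).sum }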
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/LookupTable.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
 10 |     http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
 16 |   limitations under the License.
17 | */
18 |
19 | import java.io.Reader
20 |
21 | import breeze.linalg.DenseMatrix
22 | import org.json4s.{DefaultFormats, _}
23 | import org.json4s.jackson.JsonMethods
24 | import org.json4s.JsonAST.JValue
25 |
26 | class LookupTable(rawTable: JValue) {
27 |
28 | implicit private val formats = DefaultFormats
29 | private val tables = rawTable.extract[Map[String, Map[String, Map[String, String]]]]
30 |
31 | private val key2id = tables("_lookup")("_key2id")
32 | private val id2key = tables("_lookup")("_id2key")
33 |
34 | // For raw text
35 | def encodeCharacter(str: String): DenseMatrix[Float] = {
36 | val strArray = str.map{x =>
 37 |       // Note: for unknown characters, this encoder returns a dummy id.
38 | key2id.getOrElse(x.toString, "3").toFloat
39 | }.toArray
40 | new DenseMatrix[Float](1, str.length, strArray)
41 | }
42 |
43 | // For list of words
44 | def encodeWords(words: Array[String]): DenseMatrix[Float] = {
45 | val wordsArray = words.map{x =>
 46 |       // Note: for unknown words, this encoder returns a dummy id.
47 | key2id.getOrElse(x.toString, "3").toFloat
48 | }
49 | new DenseMatrix[Float](1, words.length, wordsArray)
50 | }
51 |
52 | def decode(data: DenseMatrix[Float]): Array[String] =
53 | data.map{x => id2key.getOrElse(x.toInt.toString, "NONE")}.toArray
54 |
55 | def getId(key: String): Int = key2id.getOrElse(key, "0").toInt
56 | def getId(key: Char): Int = getId(key.toString)
57 |
58 | def getKey(id: Int): String = id2key.getOrElse(id.toString, "UNKNOWN")
59 | }
60 |
61 |
62 | object LookupTable {
63 |
64 | // Load from a path on the file system
65 | def fromFile(path: String) = mkTable(IOUtil.openIn(path))
66 |
67 | // Load from class loader
68 | def fromResource(path: String) = mkTable(IOUtil.openResourceAsReader(path))
69 |
70 | private def mkTable(input: Reader) = {
71 | val j = try { JsonMethods.parse(input) } finally { input.close }
72 | new LookupTable(j)
73 | }
74 | }
75 |
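A hedged sketch of the JSON shape this class expects, inferred from the _lookup / _key2id / _id2key accesses above; the keys and ids are made up:

    import org.json4s.jackson.JsonMethods
    import jigg.util.LookupTable

    val table = new LookupTable(JsonMethods.parse(
      """{"_lookup": {"_key2id": {"a": "1", "b": "2"}, "_id2key": {"1": "a", "2": "b"}}}"""))
    val m = table.encodeCharacter("ab")  // a 1x2 DenseMatrix[Float] holding 1.0f and 2.0f
    table.decode(m)                      // Array("a", "b"); unknown ids decode to "NONE"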
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/Normalizer.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import com.ibm.icu.text.Transliterator
20 |
21 | object Normalizer {
22 |
 23 |   /** Replace all halfwidth ASCII characters (< 0x7F) with their fullwidth counterparts.
24 | *
25 | * Useful for preprocessing in some Japanese software such as JUMAN and KNP.
26 | *
27 | * NOTE: We do not touch hankaku kana characters since they make alignment to the
28 | * original text more involved.
29 | */
30 | def hanZenAscii(text: String) = text map {
31 | case c if c <= 0x7F => hanzenTrans.transliterate(c + "")(0)
32 | case c => c
33 | }
34 | private val hanzenTrans = Transliterator.getInstance("Halfwidth-Fullwidth")
35 | }
36 |
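A hedged example of the normalization above: halfwidth ASCII letters, digits, and spaces become their fullwidth counterparts, while everything else is left as-is.

    import jigg.util.Normalizer

    val z = Normalizer.hanZenAscii("ABC 123")  // "ＡＢＣ　１２３"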
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/Prop.java:
--------------------------------------------------------------------------------
1 | package jigg.util;
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.lang.annotation.*;
20 |
21 | @Retention(RetentionPolicy.RUNTIME)
22 | public @interface Prop {
23 | // String name() default "";
24 | String gloss() default "";
25 | boolean required() default false;
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/PropertiesUtil.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import scala.collection.JavaConversions._
21 |
22 | object PropertiesUtil {
23 | def findProperty(key: String, props: Properties): Option[String] = props.getProperty(key) match {
24 | case null => None
25 | case value => Some(value)
26 | }
27 | def safeFind(key: String, props: Properties): String = findProperty(key, props).getOrElse { sys.error(s"$key property is required!" ) }
28 |
29 | def getBoolean(key: String, props: Properties): Option[Boolean] = findProperty(key, props) map {
30 | case "true" => true
31 | case "false" => false
32 | case _ => sys.error(s"Property $key should be true or false")
33 | }
34 |
35 | def filter(props: Properties)(f: (String, String)=>Boolean): Seq[(String, String)] =
36 | props.stringPropertyNames.toSeq
37 | .map { k => (k, props.getProperty(k)) }
38 | .filter { case (k, v) => f(k, v) }
39 | }
40 |
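A small usage sketch of the helpers above (the mecab.* keys are made up purely for illustration; any dotted property names behave the same way):

    import java.util.Properties
    import jigg.util.PropertiesUtil

    val props = new Properties
    props.setProperty("mecab.command", "mecab")   // hypothetical key
    props.setProperty("mecab.verbose", "true")    // hypothetical key

    PropertiesUtil.findProperty("mecab.command", props)   // Some("mecab")
    PropertiesUtil.findProperty("mecab.dic", props)       // None
    PropertiesUtil.getBoolean("mecab.verbose", props)     // Some(true)
    PropertiesUtil.filter(props) { (k, _) => k.startsWith("mecab.") }
    // Seq(("mecab.command", "mecab"), ("mecab.verbose", "true")), order unspecified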
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/ResourceUtil.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | import java.io.File
4 |
5 | object ResourceUtil {
6 |
7 | /** Read a python script found in `resources/python/xxx.py`. Since these files cannot
8 | * be executed directly, we first copy the script into a temporary file and
9 | * return the resulting temp file.
10 | *
11 | * @param name script name, corresponding to `xxx.py`.
12 | */
13 | def readPython(name: String): File = {
14 | val script = File.createTempFile("jigg", ".py")
15 | script.deleteOnExit
16 | val stream = getClass.getResourceAsStream(s"/python/${name}")
17 | IOUtil.writing(script.getPath) { o =>
18 | scala.io.Source.fromInputStream(stream).getLines foreach { line =>
19 | o.write(line + "\n")
20 | }
21 | }
22 | script
23 | }
24 |
25 | }
26 |
27 |
28 |
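A brief usage sketch (assumed): readPython is intended for the scripts bundled under resources/python, such as udpipe.py, which have to be copied out of the jar before an external interpreter can run them:

    // Sketch only: materialize resources/python/udpipe.py as a temp file and
    // hand its path to an external Python process.
    val script = jigg.util.ResourceUtil.readPython("udpipe.py")
    val builder = new ProcessBuilder("python", script.getPath)
    // builder.start() would then run the copied script.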
--------------------------------------------------------------------------------
/src/main/scala/jigg/util/TreesUtil.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import scala.collection.mutable.ArrayBuffer
20 | import scala.xml._
21 |
22 | import jigg.pipeline.Annotation
23 |
24 | object TreesUtil {
25 |
26 | def streeToNode(tree: String, sentence: Node, annotator: String) = {
27 | val tokens = tree.replaceAllLiterally("(", " ( ").replaceAllLiterally(")", " ) ").trim.split("\\s+")
28 |
29 | val tokenSeq = (sentence \ "tokens").head \ "token"
30 | var tokIdx = -1
31 | def nextTokId = { tokIdx += 1; tokenSeq(tokIdx) \@ "id" }
32 |
33 | val spans = new ArrayBuffer[Node]
34 |
35 | // Fill in spans; return the id of the constructed subtree and the index reached.
36 | def readTopdown(idx: Int): (String, Int) = {
37 |
38 | def collectChildren(curChildren: List[String], cur: Int): (Seq[String], Int) =
39 | tokens(cur) match {
40 | case ")" =>
41 | (curChildren.reverse, cur)
42 | case "(" =>
43 | val (nextChildId, nextIdx) = readTopdown(cur)
44 | collectChildren(nextChildId :: curChildren, nextIdx)
45 | }
46 |
47 | tokens(idx) match {
48 | case "(" =>
49 | def skipParen(i: Int = 0): Int = {
50 | if (tokens(idx + i) == "(") skipParen(i + 1)
51 | else i
52 | }
53 | val parenCount = skipParen()
54 |
55 | val labelIdx = idx + parenCount
56 | val label = tokens(labelIdx)
57 |
58 | val (children, closeIdx) = tokens(labelIdx + 1) match {
59 | case "(" => collectChildren(Nil, labelIdx + 1)
60 | case word => (Nil, labelIdx + 1 + 1)
61 | }
62 | val thisId = children match {
63 | case Nil => nextTokId
64 | case children => Annotation.ParseSpan.nextId
65 | }
66 | if (!children.isEmpty) {
67 | val childStr = children mkString " "
68 | spans += <span id={ thisId } symbol={ label } children={ childStr }/>
69 | }
70 | for (i <- 0 until parenCount) { assert(tokens(closeIdx + i) == ")") }
71 | (thisId, closeIdx + parenCount)
72 | }
73 | }
74 |
75 | val (rootId, _) = readTopdown(0)
76 | <parse annotators={ annotator } root={ rootId }>{ spans }</parse>
77 | }
78 | }
79 |
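A rough usage sketch (the sentence XML and ids below are invented for illustration; the bracketed tree string matches the one used later in BeneParAnnotatorSpec, which consumes the result as a parse element). streeToNode walks the bracketed string, reuses the existing token ids for the preterminal leaves, and collects one span per internal node:

    // Hypothetical sentence node with pre-annotated tokens (ids are illustrative).
    val sentence =
      <sentence id="s0">
        <tokens>
          <token id="t0" form="He"/>
          <token id="t1" form="ate"/>
          <token id="t2" form="pizza"/>
          <token id="t3" form="."/>
        </tokens>
      </sentence>

    val parse = TreesUtil.streeToNode(
      "(S (NP (PRP He)) (VP (VBD ate) (NN pizza)) (. .))", sentence, "benepar")
    // The result wraps one <span/> per internal node; preterminal leaves reuse the
    // token ids t0..t3, so the tree stays aligned with the original tokens.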
--------------------------------------------------------------------------------
/src/test/resources/data/Japanese.small.lexicon:
--------------------------------------------------------------------------------
1 | @UNK@/フィラー/_ S1/S1 NP[nc,adv]1/NP[nc,adv]1 NP[nc,nm]1/NP[nc,nm]1
2 | @UNK@/副詞-一般/_ S1/S1 NP[nc,nm]1/NP[nc,nm]1 S[nm,stem] NP[nc,adv]1/NP[nc,adv]1
3 | @UNK@/副詞-助詞類接続/_ S1/S1 NP[nc,nm]1/NP[nc,nm]1 S[nm,stem]
4 | あふれる/動詞-自立/基本形 S[nm,stem]\NP[ga,nm,ga]-base_verb_rule S[nm,stem]\NP[ga,nm,ga]-adnominal_verb_rule
--------------------------------------------------------------------------------
/src/test/resources/data/Japanese.unkVerb.lexicon:
--------------------------------------------------------------------------------
1 | @UNK@/動詞-非自立/仮定形 S[nm,hyp]\S[nm,cont]sem
2 | @UNK@/動詞-非自立/体言接続特殊 S[nm,attr]\S[nm,neg]sem
3 | @UNK@/動詞-非自立/体言接続特殊2 S[adn,attr]
4 | @UNK@/動詞-非自立/基本形 S[nm,base]\S[nm,cont]sem S[adn,base]\S[nm,cont]sem S[nm,base] S[nm,base]\NP[ga,nm,ga] NP[nc,nm]1/NP[nc,nm]1
5 | @UNK@/動詞-非自立/未然ウ接続 S[nm,neg]\S[nm,cont]sem
6 | @UNK@/動詞-非自立/未然形 S[nm,neg]\S[nm,cont]sem S[nm,neg] S[nm,neg]\S[nm,r]sem
7 | @UNK@/動詞-非自立/連用タ接続 S[nm,cont]\S[nm,cont]sem S[nm,cont]
8 | @UNK@/動詞-非自立/連用形 S[nm,cont]\S[nm,cont]sem S[adv,cont]\S[nm,cont]sem S[nm,cont] S[adn,cont] S[adn,cont]\S[nm,cont]sem S[nm,cont]\NP[ga,nm,ga]
--------------------------------------------------------------------------------
/src/test/resources/data/json/english.ssplit.test.json:
--------------------------------------------------------------------------------
1 | {
2 | ".tag" : "root",
3 | ".child" : [ {
4 | ".tag" : "document",
5 | "id" : "d0",
6 | ".child" : [ {
7 | ".tag" : "sentences",
8 | ".child" : [ {
9 | ".tag" : "sentence",
10 | "text" : "Alice asked her mother to cook a cake.",
11 | "id" : "s0",
12 | "characterOffsetBegin" : "0",
13 | "characterOffsetEnd" : "38"
14 | }, {
15 | ".tag" : "sentence",
16 | "text" : "Bob saw a girl in the garden with a telescope.",
17 | "id" : "s1",
18 | "characterOffsetBegin" : "39",
19 | "characterOffsetEnd" : "85"
20 | } ]
21 | } ]
22 | } ]
23 | }
--------------------------------------------------------------------------------
/src/test/resources/data/json/japanese.ssplit.test.json:
--------------------------------------------------------------------------------
1 | {
2 | ".tag" : "root",
3 | ".child" : [ {
4 | ".tag" : "document",
5 | "id" : "d0",
6 | ".child" : [ {
7 | ".tag" : "sentences",
8 | ".child" : [ {
9 | ".tag" : "sentence",
10 | "text" : "自転車で走っている少女を見た",
11 | "id" : "s0",
12 | "characterOffsetBegin" : "0",
13 | "characterOffsetEnd" : "14"
14 | }, {
15 | ".tag" : "sentence",
16 | "text" : "テレビで走っている少女を見た",
17 | "id" : "s1",
18 | "characterOffsetBegin" : "15",
19 | "characterOffsetEnd" : "29"
20 | } ]
21 | } ]
22 | } ]
23 | }
--------------------------------------------------------------------------------
/src/test/resources/data/keras/bunsetsu_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/keras/bunsetsu_model.h5
--------------------------------------------------------------------------------
/src/test/resources/data/keras/ssplit_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/keras/ssplit_model.h5
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/convolution1d/convolution1d_gold.csv:
--------------------------------------------------------------------------------
1 | -0.288217455148697,0.681861579418182
2 | -0.538490712642670,0.062052655965090
3 | -0.318091481924057,-0.074813574552536
4 | -0.023546881973743,0.040708515793085
5 | -0.485583871603012,0.224703624844551
6 | -0.450441420078278,0.002716975519434
7 | -0.176823571324348,0.489799916744232
8 | -0.123186729848385,0.057490978389978
9 | -0.336253672838211,-0.084099449217319
10 | 0.059555754065514,0.000320440391079
11 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/convolution1d/convolution1d_input.csv:
--------------------------------------------------------------------------------
1 | 0.027738961009708,0.393455303479803,0.694816228560713,0.157559454348151,0.214884384043615,0.005565078182797,0.949002280200014,0.690369967699377,0.998176256773562,0.204425396011438,0.845982123544135,0.818198829832328
2 | 0.252301884057857,0.437311847167796,0.104436208603942,0.763925291392123,0.870987562303758,0.079435648160725,0.142875224317561,0.170360773159227,0.387373867227415,0.745431984723710,0.479836153327895,0.744296844299619
3 | 0.883415945353071,0.697078201963215,0.606604317884067,0.777094318509148,0.956809131373719,0.018343700379643,0.692863164913816,0.107627736723910,0.595232367723716,0.618970512903785,0.748639111184423,0.941869156250547
4 | 0.035042201371063,0.700113249200931,0.717126347279872,0.511744032438561,0.247658441044617,0.576820124281050,0.047399750738226,0.067116874648913,0.175494795121527,0.240304085868729,0.603887921839716,0.537397181554857
5 | 0.554501767544110,0.411117180527812,0.648722795158795,0.508408218827410,0.785647318386747,0.947404977871054,0.113110476551426,0.936072327771750,0.863526769665361,0.172236633875255,0.715443984726397,0.869742300523170
6 | 0.331881976191941,0.174389983798250,0.974055309053648,0.952572967439939,0.395194463615389,0.979596804619930,0.126419143266621,0.028127155855804,0.377202820144004,0.788029009784025,0.143934466920253,0.885531232719449
7 | 0.082605263961736,0.816844068389051,0.742036051284236,0.448338330763183,0.231913187967981,0.324263082007595,0.095113194171922,0.575291246962427,0.402043739476673,0.773164202330256,0.978885567374195,0.531234497631943
8 | 0.797474806333550,0.770689995657307,0.286838584369559,0.272812118439933,0.522711445247614,0.557358959671089,0.655063150020376,0.613348870624681,0.903721040494730,0.676600535740517,0.862388024752785,0.483734729571592
9 | 0.511364975233000,0.956982804048265,0.489405080608254,0.946988783071462,0.304099907120206,0.159633845243493,0.441705350104236,0.014337837348216,0.609972921479224,0.159291332076170,0.521437544993183,0.863046123179579
10 | 0.043232549851898,0.273736339785920,0.378312369831591,0.953767858492059,0.200604482875413,0.810072095098931,0.391870443803649,0.639344286225899,0.677303032937693,0.276362747713528,0.359063987058490,0.334056036907750
11 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/convolution1d/convolution1d_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/convolution1d/convolution1d_model.h5
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/dense/dense_gold.csv:
--------------------------------------------------------------------------------
1 | -0.265054643154144,0.819157660007477
2 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/dense/dense_input.csv:
--------------------------------------------------------------------------------
1 | 0.919222086072171,0.268580028843516,0.850487637208910,0.195140088357300,0.915650682096673,0.694448840619902,0.686364957159918,0.845189174009755,0.515407551460194,0.707307670736291
2 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/dense/dense_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/dense/dense_model.h5
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/embedding/embedding_gold.csv:
--------------------------------------------------------------------------------
1 | -0.024064350873232,0.015874337404966
2 | -0.032138548791409,0.035715412348509
3 | -0.009305894374847,0.047007892280817
4 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/embedding/embedding_input.csv:
--------------------------------------------------------------------------------
1 | 4.000000000000000,3.000000000000000,6.000000000000000
2 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/embedding/embedding_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/embedding/embedding_model.h5
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/flatten/flatten_gold.csv:
--------------------------------------------------------------------------------
1 | 0.483355849981308,0.272490352392197,0.915887176990509,0.335418432950974,0.778468728065491,0.853674173355103
2 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/flatten/flatten_input.csv:
--------------------------------------------------------------------------------
1 | 0.483355847870847,0.272490343423817
2 | 0.915887187299997,0.335418421687206
3 | 0.778468739455691,0.853674144810384
4 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/flatten/flatten_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/flatten/flatten_model.h5
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/kerasModel/kerasModel_gold.csv:
--------------------------------------------------------------------------------
1 | 0.066982857882977,0.864855527877808,0.068161644041538
2 | 0.036359727382660,0.940843880176544,0.022796416655183
3 | 0.000093939248472,0.024136895313859,0.975769102573395
4 | 0.000007191142231,0.037699114531279,0.962293744087219
5 | 0.859113097190857,0.130854964256287,0.010032005608082
6 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/kerasModel/kerasModel_input.csv:
--------------------------------------------------------------------------------
1 | 0.000000000000000,6.000000000000000,6.000000000000000,2.000000000000000,6.000000000000000
2 |
--------------------------------------------------------------------------------
/src/test/resources/data/ml/keras/kerasModel/kerasModel_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/kerasModel/kerasModel_model.h5
--------------------------------------------------------------------------------
/src/test/resources/data/template.small.lst:
--------------------------------------------------------------------------------
1 | NP[nc,nm]1/NP[nc,nm]1 NP[nc,nm]1/NP[nc,nm]1
2 | S[nm,stem]\NP[ga,nm,ga]-base_verb_rule S[nm,base]\NP[ga,nm,ga]
3 | S[nm,stem]\NP[ga,nm,ga]-adnominal_verb_rule S[adn,base]\NP[ga,nm,ga]
4 | S1/S1 S1/S1
5 | NP[nc,adv]1/NP[nc,adv]1 NP[nc,adv]1/NP[nc,adv]1
6 | S[nm,stem] S[nm,stem]
7 |
--------------------------------------------------------------------------------
/src/test/resources/data/template.unkVerb.lst:
--------------------------------------------------------------------------------
1 | S[adn,attr] S[adn,attr]
2 | S[adn,base]\S[nm,cont]sem S[adn,base]\S[nm,cont]sem
3 | S[adv,cont]\S[nm,cont]sem S[adv,cont]\S[nm,cont]sem
4 | S[adn,cont] S[adn,cont]
5 | S[adn,cont]\S[nm,cont]sem S[adn,cont]\S[nm,cont]sem
6 | S[nm,hyp]\S[nm,cont]sem S[nm,hyp]\S[nm,cont]sem
7 | S[nm,attr]\S[nm,neg]sem S[nm,attr]\S[nm,neg]sem
8 | S[nm,base] S[nm,base]
9 | S[nm,base]\S[nm,cont]sem S[nm,base]\S[nm,cont]sem
10 | S[nm,base]\NP[ga,nm,ga] S[nm,base]\NP[ga,nm,ga]
11 | S[nm,neg] S[nm,neg]
12 | S[nm,neg]\S[nm,cont]sem S[nm,neg]\S[nm,cont]sem
13 | S[nm,neg]\S[nm,r]sem S[nm,neg]\S[nm,r]sem
14 | S[nm,cont]\S[nm,cont]sem S[nm,cont]\S[nm,cont]sem
15 | S[nm,cont] S[nm,cont]
16 | S[nm,cont]\S[nm,cont]sem S[nm,cont]\S[nm,cont]sem
17 | S[nm,cont]\NP[ga,nm,ga] S[nm,cont]\NP[ga,nm,ga]
18 | NP[nc,nm]1/NP[nc,nm]1 NP[nc,nm]1/NP[nc,nm]1
19 |
--------------------------------------------------------------------------------
/src/test/resources/data/xml/english.ssplit.spaceTokenize.gold.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Alice asked her mother to cook a cake.
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | Bob saw a girl in the garden with a telescope.
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/src/test/resources/data/xml/english.ssplit.test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Alice asked her mother to cook a cake.
6 | Bob saw a girl in the garden with a telescope.
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/src/test/resources/data/xml/japanese.ssplit.test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 自転車で走っている少女を見た
6 | テレビで走っている少女を見た
7 |
8 |
9 |
--------------------------------------------------------------------------------
/src/test/resources/script/create_small_lst_from_lexicon.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | ''' This script creates the data used for tests.
4 |
5 | The output is already included in the resources/data directory
6 | as `template.small.lst`, so this file is usually unnecessary.
7 |
8 | Example usage from the project root directory is
9 | ./src/test/resources/script/create_small_lst_from_lexicon.py \
10 | ./ccgbank/template.lst
11 |
12 | '''
13 |
14 | import sys, os
15 |
16 | if __name__ == '__main__':
17 | if len(sys.argv) < 2:
18 | print "usage", sys.argv[0], "full_template_lst"
19 | exit()
20 |
21 | data_dir = os.path.abspath(os.path.dirname(__file__))+'/../data'
22 | small_lexicon_path = data_dir+'/Japanese.small.lexicon'
23 | output_path = data_dir+'/template.small.lst'
24 |
25 | cat_tmps = []
26 | for line in open(small_lexicon_path):
27 | cat_tmps += line.strip().split(' ')[1:]
28 | cat_tmps = set(cat_tmps)
29 |
30 | with open(output_path, 'w') as f:
31 | for line in open(sys.argv[1]):
32 | line = line.strip().split('\t')
33 | cat_tmp = line[0]
34 | cat_str = line[1]
35 |
36 | if cat_tmp in cat_tmps:
37 | f.write("%s\t%s\n" % (cat_tmp, cat_str))
38 |
39 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/ml/keras/Convolution1DSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.io._
20 | import org.scalatest._
21 |
22 | import jigg.util.HDF5Object
23 |
24 | import breeze.linalg.csvread
25 | import breeze.numerics.abs
26 |
27 | class Convolution1DSpec extends FlatSpec with Matchers{
28 |
29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath
30 |
31 | "convert" should "load model and convert input matrix" in {
32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/convolution1d/convolution1d_model.h5")
33 | val model = new KerasModel(hdf5)
34 | val inputData = csvread(new File(findPath("./data/ml/keras/convolution1d/convolution1d_input.csv")),separator = ',').map{x => x.toFloat}
35 | val goldData = csvread(new File(findPath("./data/ml/keras/convolution1d/convolution1d_gold.csv")),separator = ',').map{x => x.toFloat}
36 |
37 | val output = model.convert(inputData)
38 |
39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat)
40 |
41 | diff should be (true)
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/ml/keras/DenseSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.io._
20 | import org.scalatest._
21 |
22 | import jigg.util.HDF5Object
23 |
24 | import breeze.linalg.csvread
25 | import breeze.numerics.abs
26 |
27 |
28 | class DenseSpec extends FlatSpec with Matchers{
29 |
30 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath
31 |
32 | "convert" should "load model and convert input matrix" in {
33 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/dense/dense_model.h5")
34 | val model = new KerasModel(hdf5)
35 | val inputData = csvread(new File(findPath("./data/ml/keras/dense/dense_input.csv")),separator = ',').map{x => x.toFloat}
36 | val goldData = csvread(new File(findPath("./data/ml/keras/dense/dense_gold.csv")),separator = ',').map{x => x.toFloat}
37 |
38 | val output = model.convert(inputData)
39 |
40 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat)
41 |
42 | diff should be (true)
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/ml/keras/EmbeddingSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.io._
20 | import org.scalatest._
21 |
22 | import jigg.util.HDF5Object
23 |
24 | import breeze.linalg.csvread
25 | import breeze.numerics.abs
26 |
27 | class EmbeddingSpec extends FlatSpec with Matchers{
28 |
29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath
30 |
31 | "convert" should "load model and convert input matrix" in {
32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/embedding/embedding_model.h5")
33 | val model = new KerasModel(hdf5)
34 | val inputData = csvread(new File(findPath("./data/ml/keras/embedding/embedding_input.csv")),separator = ',').map{x => x.toFloat}
35 | val goldData = csvread(new File(findPath("./data/ml/keras/embedding/embedding_gold.csv")),separator = ',').map{x => x.toFloat}
36 |
37 | val output = model.convert(inputData)
38 |
39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat)
40 |
41 | diff should be (true)
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/ml/keras/FlattenSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.io._
20 | import org.scalatest._
21 |
22 | import jigg.util.HDF5Object
23 |
24 | import breeze.linalg.csvread
25 | import breeze.numerics.abs
26 |
27 | class FlattenSpec extends FlatSpec with Matchers{
28 |
29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath
30 |
31 | "convert" should "load model and convert input matrix" in {
32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/flatten/flatten_model.h5")
33 | val model = new KerasModel(hdf5)
34 | val inputData = csvread(new File(findPath("./data/ml/keras/flatten/flatten_input.csv")),separator = ',').map{x => x.toFloat}
35 | val goldData = csvread(new File(findPath("./data/ml/keras/flatten/flatten_gold.csv")),separator = ',').map{x => x.toFloat}
36 |
37 | val output = model.convert(inputData)
38 |
39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat)
40 |
41 | diff should be (true)
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/ml/keras/KerasModelSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.io._
20 | import org.scalatest._
21 |
22 | import jigg.util.HDF5Object
23 |
24 | import breeze.linalg.csvread
25 | import breeze.numerics.abs
26 |
27 | class KerasModelSpec extends FlatSpec with Matchers{
28 |
29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath
30 |
31 | "convert" should "load model and convert input matrix" in {
32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/kerasModel/kerasModel_model.h5")
33 | val model = new KerasModel(hdf5)
34 | val inputData = csvread(new File(findPath("./data/ml/keras/kerasModel/kerasModel_input.csv")),separator = ',').map{x => x.toFloat}
35 | val goldData = csvread(new File(findPath("./data/ml/keras/kerasModel/kerasModel_gold.csv")),separator = ',').map{x => x.toFloat}
36 |
37 | val output = model.convert(inputData)
38 |
39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat)
40 |
41 | diff should be (true)
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/ml/keras/KerasParserTest.scala:
--------------------------------------------------------------------------------
1 | package jigg.ml.keras
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 |
21 | import org.scalatest.FunSuite
22 | import org.scalatest.Matchers._
23 |
24 | import jigg.util.{HDF5Object, LookupTable}
25 |
26 | class KerasParserTest extends FunSuite{
27 |
28 | val model = new KerasModel(HDF5Object.fromResource("./data/keras/ssplit_model.h5"))
29 | val table = LookupTable.fromResource("data/keras/jpnLookupCharacter.json")
30 |
31 | val parser = new KerasParser(model, table)
32 |
33 | test("get an offset list from pattern1") {
34 | val pattern = Array[Int](0,1,1,0,1,1)
35 | val ranges = parser.getOffsets(pattern)
36 | ranges should be (Array[(Int, Int)]((0,3),(3,6)))
37 | }
38 |
39 | test("get an offset list from pattern2") {
40 | val pattern = Array[Int](0,1,1,2,2,0,1,1)
41 | val ranges = parser.getOffsets(pattern)
42 | ranges should be (Array[(Int, Int)]((0,3),(5,8)))
43 | }
44 |
45 | test("get an offset list from pattern3") {
46 | val pattern = Array[Int](0,1,1,2,0,1,1,2)
47 | val ranges = parser.getOffsets(pattern)
48 | ranges should be (Array[(Int, Int)]((0,3),(4,7)))
49 |
50 | }
51 |
52 | test("get an offset list from pattern4") {
53 | val pattern = Array[Int](2,2,0,1,1,2,0,1,1,2)
54 | val ranges = parser.getOffsets(pattern)
55 | ranges should be (Array[(Int, Int)]((2,5),(6,9)))
56 | }
57 |
58 | test("get an offset list from pattern5") {
59 | val pattern = Array[Int](1,1,1,0,1,1)
60 | val ranges = parser.getOffsets(pattern)
61 | ranges should be (Array[(Int, Int)]((0,3),(3,6)))
62 | }
63 |
64 | test("get an offset list from pattern6") {
65 | val pattern = Array[Int](2,2,1,1,1,0,1,1)
66 | val ranges = parser.getOffsets(pattern)
67 | ranges should be (Array[(Int, Int)]((2,5),(5,8)))
68 | }
69 |
70 | test("get an offset list from pattern7") {
71 | val pattern = Array[Int](0,1,1,0,0,1,1)
72 | val ranges = parser.getOffsets(pattern)
73 | ranges should be (Array[(Int, Int)]((0,3),(3,4),(4,7)))
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/nlp/ccg/lexicon/BunsetsuTest.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import org.scalatest.FunSuite
20 | import org.scalatest.Matchers._
21 |
22 | class BunsetsuTest extends FunSuite {
23 | test("A gold derivation with cabocha bunsetsu-segments recovers gold dependencies") {
24 | import jigg.nlp.ccg.parser.ParsedSentences
25 | val parsedSentences = new ParsedSentences
26 | val (sentence, derivation) = parsedSentences.simpleSentenceAndDerivation
27 |
28 | val bunsetsuSentence = BunsetsuSentence(Array(
29 | Bunsetsu(0, sentence.wordSeq.slice(0, 2), sentence.posSeq.slice(0, 2)), // 政権 に
30 | Bunsetsu(2, sentence.wordSeq.slice(2, 4), sentence.posSeq.slice(2, 4)), // 影響 を
31 | Bunsetsu(4, sentence.wordSeq.slice(4, 5), sentence.posSeq.slice(4, 5)), // 及ぼす
32 | Bunsetsu(5, sentence.wordSeq.slice(5, 6), sentence.posSeq.slice(5, 6)))) // こと
33 |
34 | val parsed = bunsetsuSentence.parseWithCCGDerivation(derivation)
35 | parsed.headSeq should equal (Seq(2, 2, 3, -1))
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/nlp/ccg/lexicon/CategoryFeatureTest.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 | import org.scalatest.FunSuite
19 | import org.scalatest.Matchers._
20 | import scala.collection.mutable.HashSet
21 |
22 | class JPCategoryFeatureTest extends FunSuite {
23 | test("equal test") {
24 | val feat1 = JPCategoryFeature.createFromValues(List("adn","attr","ga"))
25 | val feat2 = JPCategoryFeature.createFromValues(List("nm","attr","ga"))
26 | val feat3 = JPCategoryFeature.createFromValues(List("adn","attr"))
27 | val feat4 = JPCategoryFeature.createFromValues(List("adn","attr","ga"))
28 |
29 | feat1.kvs should equal (feat4.kvs)
30 | feat1.kvs should not equal (feat2.kvs)
31 | feat1.kvs should not equal (feat3.kvs)
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/nlp/ccg/lexicon/CategoryManagerTest.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 | import org.scalatest.FunSuite
19 | import org.scalatest.Matchers._
20 |
21 | class CategoryManagerTest extends FunSuite {
22 | test("the same child node should be assigned the same id") {
23 | val manager = new CategoryManager // The constructor automatically creates the unknown category, which is assigned id 0
24 |
25 | val cat = JapaneseCategoryParser.parse("NP[case=o,mod=nm]/NP[case=o,mod=nm]")
26 | manager.assignID(cat) match {
27 | case ComplexCategory(id, left, right, _) => {
28 | left.id should equal (1)
29 | right.id should equal (1)
30 | id should equal (2)
31 | }
32 | case _ => fail() // should not occur
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/nlp/ccg/lexicon/CategoryParserTest.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.lexicon
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 | import org.scalatest.FunSuite
19 | import org.scalatest.Matchers._
20 |
21 | class CategoryParserTest extends FunSuite {
22 | test("extractCategoryFeature") {
23 | val reader = new JapaneseCategoryParser.JapaneseReader
24 | val ni_nm = reader.extractCategoryFeature("ni,nm")
25 | ni_nm.toString should equal ("mod=nm,case=ni")
26 | //assert(ni_nm.toString == "mod=nm,case=ni")
27 | }
28 |
29 | test("createAtomicCategory") {
30 | val cat1Str = "NP[case=nc,mod=nm]{I1}"
31 | val cat1 = JapaneseCategoryParser.parse(cat1Str)
32 | cat1.toString should equal ("NP[mod=nm,case=nc]")
33 |
34 | val cat2Str = "(((S[mod=adn,form=base]{I1}\\NP[case=ni,mod=nm]{I2}){I1})\\NP[case=o,mod=nm]{I3}){I1}_I1(unk,I3,I2,_)"
35 | val cat2 = JapaneseCategoryParser.parse(cat2Str)
36 | cat2.toString should equal ("(S[mod=adn,form=base]\\NP[mod=nm,case=ni])\\NP[mod=nm,case=o]")
37 |
38 |
39 | val cat3Str = "(NP[case=X1,mod=X2,fin=f]{I1}/NP[case=X1,mod=X2,fin=f]{I1}){I2}_none"
40 | val cat3 = JapaneseCategoryParser.parse(cat3Str)
41 | cat3.toString should equal ("NP[fin=f]/NP[fin=f]")
42 | }
43 |
44 | // These are obsolete tests for a previous version
45 | // test("createComplexCategory") {
46 | // JapaneseCategoryParser.parse("NP[nc,nm]1//NP[nc,nm]1").toString should equal("NP[mod=nm,case=nc]/NP[mod=nm,case=nc]")
47 | // JapaneseCategoryParser.parse("(S[nm,stem,nm]\NP[nc,nm])/NP[nc,nm]").toString should equal(
48 | // """(S[mod=nm,form=stem]\NP[mod=nm,case=nc])/NP[mod=nm,case=nc]""")
49 | // JapaneseCategoryParser.parse("(((S\NP)/NP[nc,nm])\(S[nm,stem]1/NP[o,nm]sem))/NP[nc,nm]1").toString should equal(
50 | // """(((S\NP)/NP[mod=nm,case=nc])\(S[mod=nm,form=stem]/NP[mod=nm,case=o]))/NP[mod=nm,case=nc]""")
51 | // JapaneseCategoryParser.parse("S1/S1").toString should equal("S/S")
52 | // JapaneseCategoryParser.parse("(S2/S2)1/(S3/S3)1").toString should equal("(S/S)/(S/S)")
53 | // }
54 | }
55 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/nlp/ccg/parser/RuleTest.scala:
--------------------------------------------------------------------------------
1 | package jigg.nlp.ccg.parser
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import org.scalatest.FunSuite
20 | import org.scalatest.Matchers._
21 |
22 | class RuleTest extends FunSuite {
23 | val parsedSentences = new ParsedSentences
24 | val dict = parsedSentences.dict
25 | def cat(str:String) = dict.getCategory(str).get
26 |
27 | test("extract all rules from derivations") {
28 | val (sentence, derivation) = parsedSentences.simpleSentenceAndDerivation
29 |
30 | val rule = CFGRule.extractRulesFromDerivations(Array(derivation), JapaneseHeadFinder)
31 | rule.unify(cat("(NP[case=nc,mod=X1]{I1}/NP[case=nc,mod=X1]{I1}){I2}"), cat("NP[case=nc,mod=nm]{I1}_none")).get should contain (cat("NP[case=nc,mod=nm]{I1}"), ">")
32 | rule.raise(cat("S[mod=adn,form=base]{I1}")).get should contain (cat("(NP[case=nc,mod=X1]{I1}/NP[case=nc,mod=X1]{I1}){I2}"), "ADN")
33 | rule.unify(cat("NP[case=ni,mod=nm]{I1}"), cat("(S[mod=adn,form=base]{I1}\\NP[case=ni,mod=nm]{I2}){I1}")).get should contain (cat("S[mod=adn,form=base]{I1}"), "<")
34 |
35 | rule.unify(cat("NP[case=nc,mod=nm]{I1}_none"), cat("(NP[case=o,mod=nm]{I1}\\NP[case=nc,mod=nm]{I1}){I2}_none")).get should contain (cat("NP[case=o,mod=nm]{I1}"), "<")
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/AnnotatorSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import scala.xml.Node
21 | import org.scalatest._
22 |
23 | import jigg.util.Prop
24 |
25 | class NothingAnnotator(override val name: String, override val props: Properties) extends Annotator {
26 |
27 | @Prop(gloss = "gloss of variable1", required=true) var variable1 = ""
28 | readProps()
29 |
30 | def annotate(node: Node) = node
31 | }
32 |
33 | class AnnotatorSpec extends FlatSpec with Matchers {
34 |
35 | "Opt variable" should "be customizable with property file" in {
36 | val props = new Properties
37 | props.setProperty("nothing.variable1", "hoge")
38 |
39 | val annotator = new NothingAnnotator("nothing", props)
40 |
41 | annotator.variable1 should be("hoge")
42 | }
43 |
44 | "Annotator" should "throw an exception during initProps if a required variable is missing" in {
45 | val props = new Properties
46 | try {
47 | val annotator = new NothingAnnotator("nothing", props)
48 | fail()
49 | } catch {
50 | case e: ArgumentError =>
51 | case _: Throwable => fail()
52 | }
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/BaseAnnotatorSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import org.scalactic.Equality
21 | import org.scalatest._
22 | import scala.xml._
23 |
24 | trait BaseAnnotatorSpec extends FlatSpec with Matchers {
25 |
26 | val sameElem = new Equality[Node] {
27 | import scala.xml.Utility.trim
28 | override def areEqual(a: Node, b: Any) = b match {
29 | case n: Node => trim(a) == trim(n)
30 | case _ => false
31 | }
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/BeneParAnnotatorSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2017 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 |
21 | import org.scalatest._
22 | import scala.xml._
23 |
24 | class BeneParAnnotatorSpec extends BaseAnnotatorSpec {
25 |
26 | class AnnotatorStub(output: String) extends BeneParAnnotator("benepar", new Properties) {
27 | override def mkLocalAnnotator = new LocalBeneParAnnotator {
28 | override def mkCommunicator = new StubExternalCommunicator(output)
29 | }
30 | assert(nThreads == 1)
31 | }
32 |
33 | Annotation.ParseSpan.idGen.reset()
34 |
35 | "BeneParAnnotator" should "convert a s-tree output of benepar into a node" in {
36 | val doc =
37 |
38 |
39 |
40 | He ate pizza .
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 | val output = """(S (NP (PRP He)) (VP (VBD ate) (NN pizza)) (. .))
52 | END"""
53 |
54 | val ann = new AnnotatorStub(output)
55 | val annotation = ann.annotate(doc)
56 |
57 | val s = annotation \\ "sentence"
58 |
59 | (s \ "parse").head should equal(
60 |
61 |
62 |
63 | ) (decided by sameElem)
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/BunsetsuKerasAnnotatorTest.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2015 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 |
21 | import org.scalatest.FunSuite
22 | import org.scalatest.Matchers._
23 |
24 | import scala.xml.{NodeSeq, Node}
25 |
26 | class BunsetsuKerasAnnotatorTest extends FunSuite {
27 |
28 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath
29 |
30 | def segment(node: Node, properties: Properties): NodeSeq = {
31 | val bunsetsuSplitter = new IPABunsetsuKerasAnnotator("bunsetsuKeras", properties)
32 | bunsetsuSplitter.mkLocalAnnotator.newSentenceAnnotation(node)
33 | }
34 |
35 | val properties = new Properties
36 | properties.setProperty("bunsetsuKeras.model", findPath("./data/keras/bunsetsu_model.h5"))
37 | properties.setProperty("bunsetsuKeras.table", findPath("data/keras/jpnLookupWords.json"))
38 |
39 | test("do chunking") {
40 |
41 | val chunks = segment(Sentences.xml("oneSentence"),properties) \\ "chunk"
42 |
43 | chunks.length should be (2)
44 | }
45 |
46 | object Sentences {
47 | val xml = Map("oneSentence" ->
48 |
49 | 梅が咲いた。
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | )
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/KuromojiAnnotatorSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2015 Takafumi Sakakibara and Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import scala.xml.Node
21 | import org.scalatest._
22 |
23 | import com.atilika.kuromoji.{TokenBase, TokenizerBase}
24 | import com.atilika.kuromoji.ipadic.{Token=>IToken, Tokenizer=>ITokenizer}
25 |
26 | class KuromojiAnnotatorSpec extends FlatSpec with Matchers {
27 |
28 | "Annotator" should "assign token id using sentence id" in {
29 |
30 | val annotator = KuromojiAnnotator.fromProps("kuromoji", new Properties)
31 |
32 | val sentence = あ
33 | val annotated = annotator newSentenceAnnotation sentence
34 |
35 | val tokenId = annotated \\ "token" \@ "id"
36 | tokenId should be ("a_0")
37 | }
38 |
39 | "TokenAnnotator" should "segment into tokens" in {
40 | val annotator = KuromojiAnnotator.fromProps("kuromoji[tokenize]", new Properties)
41 |
42 | val sentence = あ
43 | val annotated = annotator newSentenceAnnotation sentence
44 |
45 | val token = annotated \\ "token"
46 | token \@ "form" should be ("あ")
47 | token \@ "pos" should be ("")
48 | }
49 |
50 | "POSAnnotator" should "assign POS tags" in {
51 | val annotator = KuromojiAnnotator.fromProps("kuromoji[pos]", new Properties)
52 |
53 | val sentence =
54 |
55 |
56 |
57 |
58 |
59 | val annotated = annotator newSentenceAnnotation sentence
60 |
61 | val token = annotated \\ "token"
62 | token \@ "pos" should not be ("")
63 | token \@ "dummy" should be ("a") // not removed (overridden)
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/MecabAnnotatorSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2017 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import scala.xml.Node
21 | import org.scalatest._
22 |
23 | class MecabAnnotatorSpec extends BaseAnnotatorSpec {
24 |
25 | def stubCom(output: String) = new StubExternalCommunicator(output)
26 | def mapCom(responses: Map[String, String]) = new MapStubExternalCommunicator(responses)
27 |
28 | def newIPA(mkCom: ()=>IOCommunicator, threads: Int = 1, p: Properties = new Properties) =
29 | new IPAMecabAnnotator("mecab", p) {
30 | override def mkLocalAnnotator = new IPALocalMecabAnnotator {
31 | override def mkCommunicator = mkCom()
32 | }
33 | override def nThreads = threads
34 | }
35 |
36 | "Annotator with nThreads=1" should "be able to annotate one sentence" in {
37 | val s = "a"
38 | val in = a
39 | val out = """a 名詞,固有名詞,組織,*,*,*,*
40 | EOS"""
41 | val annotator = newIPA(()=>stubCom(out), threads=1)
42 | val result = annotator.annotate(in)
43 | val tokens = result \\ "token"
44 | tokens.size should be(1)
45 | (tokens(0) \@ "pos") should be("名詞")
46 |
47 | result \\ "tokens" \@ "annotators" should be("mecab")
48 | }
49 |
50 | "Annotator with nThreads=2" should "annotate in parallel" in {
51 | val responses = Map(
52 | "a" -> """a 名詞,固有名詞,*,*,*,*,*
53 | EOS""",
54 | "b" -> """b 動詞,*,*,*,*,*,*
55 | EOS""",
56 | "c" -> """c 形容詞,*,*,*,*,*,*
57 | EOS"""
58 | )
59 | val in =
60 |
61 |
62 | a
63 | b
64 | c
65 |
66 |
67 |
68 |
69 | val annotator = newIPA(()=>mapCom(responses), threads=2)
70 | val result = annotator.annotate(in)
71 |
72 | val sentences = result \\ "sentence"
73 | sentences.size should be(3)
74 | ((sentences(0) \\ "token")(0) \@ "form") should be("a")
75 | ((sentences(1) \\ "token")(0) \@ "form") should be("b")
76 | ((sentences(2) \\ "token")(0) \@ "form") should be("c")
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/PipelineSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2018 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import org.scalatest._
21 | import scala.xml._
22 | import jigg.util.{XMLUtil, JSONUtil}
23 |
24 | class PipelineSpec extends BaseAnnotatorSpec {
25 |
26 | class StubMecabAnnotator(n: String, p: Properties)
27 | extends IPAMecabAnnotator(n, p) {
28 | override def mkLocalAnnotator = new IPALocalMecabAnnotator {
29 | override def mkCommunicator = new StubExternalCommunicator("aaa")
30 | }
31 | }
32 |
33 | class DummyPipeline(p: Properties) extends Pipeline(p) {
34 | override def getAnnotator(name: String) = name match {
35 | case "dummy" => new StubMecabAnnotator(name, p)
36 | case _ => super.getAnnotator(name)
37 | }
38 | }
39 |
40 | "-Threads option" should "be able to customize each annotator's number of threads" in {
41 | val p = new Properties
42 | p.setProperty("annotators", "ssplit,dummy")
43 | p.setProperty("nThreads", "2")
44 | p.setProperty("dummy.nThreads", "4")
45 |
46 | val pipeline = new DummyPipeline(p)
47 |
48 | val annotators = pipeline.annotatorList
49 | annotators(0).name should equal("ssplit")
50 | annotators(0).nThreads should equal(2)
51 | annotators(1).name should equal("dummy")
52 | annotators(1).nThreads should equal(4)
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/RequirementSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 | import org.scalatest._
21 |
22 | class RequirementSpec extends FlatSpec with Matchers {
23 |
24 | "Tokenize" should "be satisfied when TokenizeWithIPA is satisfied" in {
25 |
26 | val satisfied = RequirementSet(JaRequirement.TokenizeWithIPA)
27 | val requires: Set[Requirement] = Set(Requirement.Tokenize)
28 |
29 | val lacked = satisfied.lackedIn(requires)
30 | lacked shouldBe empty
31 | }
32 |
33 | "TokenizeWithIPA" should "not be satisfied when Tokenize is satisfied" in {
34 |
35 | val satisfied = RequirementSet(Requirement.Tokenize)
36 | val requires: Set[Requirement] = Set(JaRequirement.TokenizeWithIPA)
37 |
38 | val lacked = satisfied.lackedIn(requires)
39 | lacked shouldBe Set(JaRequirement.TokenizeWithIPA)
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/test/scala/jigg/pipeline/SyntaxNetAnnotatorSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.pipeline
2 |
3 | /*
4 | Copyright 2013-2016 Hiroshi Noji
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | */
18 |
19 | import java.util.Properties
20 |
21 | import org.scalatest._
22 | import scala.xml._
23 |
24 | class SyntaxNetAnnotatorSpec extends BaseAnnotatorSpec {
25 |
26 | class POSAnnotatorStub(output: String) extends
27 | SyntaxNetPOSAnnotator("syntaxnetpos", new Properties) {
28 |
29 | override def run(input: String) = output.split("\n").toStream
30 | }
31 |
32 | "POSAnnotator" should "annotate all sentences across documents" in {
33 |
34 | val root =
35 |   // [input XML literal not preserved in this dump: a <root> holding documents whose
36 |   //  sentences contain the tokens tagged by the stub output below ("a"; "b","c"; "c")]
49 | val output = """1 a _ A A _ 0 A _ _
50 |
51 | 1 b _ B B _ 0 B _ _
52 | 2 c _ C C _ 0 C _ _
53 |
54 | 1 c _ D D _ 0 D _ _
55 | """
56 |
57 | val annotator = new POSAnnotatorStub(output)
58 | val annotated = annotator.annotate(root)
59 |
60 | annotated should equal (
61 |   // [expected XML literal not preserved in this dump: the input root with the POS tags
62 |   //  from the stub output (A, B, C, D) attached to the corresponding tokens]
77 | ) (decided by sameElem)
78 | }
79 |
80 | }
81 |
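POSAnnotatorStub feeds the annotator CoNLL-style tagger output: one token per line with whitespace-separated fields, and sentences separated by blank lines. The sketch below is not the annotator's internal code; it only illustrates, in plain Scala, how such output can be split back into per-sentence groups of token lines.

    object StubOutputSplitSketch {
      // Group token lines into sentences; a blank line closes the current sentence.
      def sentences(output: String): Seq[Seq[String]] = {
        val (done, last) = output.split("\n").map(_.trim).foldLeft(
          (Vector.empty[Seq[String]], Vector.empty[String])) {
          case ((acc, cur), "")   => (if (cur.nonEmpty) acc :+ cur else acc, Vector.empty)
          case ((acc, cur), line) => (acc, cur :+ line)
        }
        if (last.nonEmpty) done :+ last else done
      }

      def main(args: Array[String]): Unit = {
        // Same shape as the stub output used in the spec above.
        val output = Seq(
          "1 a _ A A _ 0 A _ _",
          "",
          "1 b _ B B _ 0 B _ _",
          "2 c _ C C _ 0 C _ _",
          "",
          "1 c _ D D _ 0 D _ _"
        ).mkString("\n")

        // Prints three sentences with 1, 2 and 1 token line(s).
        sentences(output).zipWithIndex foreach { case (s, i) =>
          println(s"sentence $i: ${s.size} token line(s)")
        }
      }
    }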
--------------------------------------------------------------------------------
/src/test/scala/jigg/util/JSONUtilSpec.scala:
--------------------------------------------------------------------------------
1 | package jigg.util
2 |
3 | package jigg.pipeline
4 |
5 | import org.scalatest.FunSuite
6 | import org.scalatest.Matchers._
7 |
8 | class JSONUtilSpec extends FunSuite{
9 | import org.json4s._
10 | import org.json4s.jackson.JsonMethods._
11 |
12 | val testNode =
13 |   <root>
14 |     <document id="d0">
15 |       Test Node
16 |     </document>
17 |   </root>
18 | val goldJSON =
19 | parse(
20 | """
21 | {
22 | ".tag" : "root",
23 | ".child" : [ {
24 | ".tag" : "document",
25 | "id" : "d0",
26 | "text" : "Test Node"
27 | } ]
28 | }
29 | """
30 | )
31 |
32 | /**
33 | * For handling a backslash.
34 | */
35 | val testNodeForBackslash =
36 |   <root>
37 |     <document id={"d0\\N"}>
38 |       Test Node
39 |     </document>
40 |   </root>
41 |
42 | val goldJSONForBackSlash =
43 | parse(
44 | """{".tag":"root",".child":
45 | [{".tag":"document","id":"d0\\N","text":"Test Node"}
46 | ]
47 | }"""
48 | )
49 |
50 | /**
51 | * For handling escaped strings.
52 | */
53 | val testNodeForEscaping =
54 |   // [XML literal partially lost in this dump. Recoverable structure: a <root> with two
55 |   //  <document> elements; the first has an id containing characters that need escaping
56 |   //  and text {"quot\" amp&"}, the second has id "d1" and text
57 |   //  {"new line\n \n tab\t \t carriage return\r \r backslash\\ \\"}.
58 |   //  The expected serialization is goldJSONForEscaping below.]
59 |
60 |
61 |
62 |
63 | val goldJSONForEscaping =
64 | parse(
65 | """{".tag":"root",".child":
66 | [{".tag":"document","id":"","text":"quot\" amp&"},
67 | {".tag":"document", "id":"d1", "text": "new line\n \n tab\t \t carriage return\r \r backslash\\ \\"}
68 | ]
69 | }"""
70 | )
71 |
72 | val testJSONForEscaping =
73 | parse(
74 | """{".tag":"root",".child":
75 | [{".tag":"document","id":"<d0>","text":"&Test Node"amp;"}
76 | ]
77 | }"""
78 | )
79 |
80 | /**
81 | * Unit testing toJSON
82 | */
83 | test("toJSON should generate formatted String object from scala.xml.Node"){
84 | parse(JSONUtil.toJSON(testNode)) should be (goldJSON)
85 | parse(JSONUtil.toJSON(testNodeForBackslash)) should be (goldJSONForBackSlash)
86 | parse(JSONUtil.toJSON(testNodeForEscaping)) should be (goldJSONForEscaping)
87 | }
88 | /**
89 | * Unit testing JSON to XML
90 | */
91 | test("toXML should generate xml.Node"){
92 | val xmlFromJSON = JSONUtil.toXML(goldJSON)
93 | val xmlFromJSONWithBackslash = JSONUtil.toXML(goldJSONForBackSlash)
94 | val xmlFromJSONWithEscapeChar = JSONUtil.toXML(testJSONForEscaping)
95 | xmlFromJSON should be (<root><document id="d0">{"Test Node"}</document></root>)
96 | xmlFromJSONWithBackslash should be (<root><document id={"d0\\N"}>{"Test Node"}</document></root>)
97 | xmlFromJSONWithEscapeChar should be (<root><document id={"<d0>"}>{"&Test Node\"amp;"}</document></root>)
98 | }
99 | }
100 |
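The spec above treats JSONUtil as a two-way bridge: toJSON serializes a scala.xml.Node into a JSON string built from ".tag"/".child" plus attribute and text fields, and toXML rebuilds a node from the json4s JValue that parse returns. A minimal round-trip sketch under exactly those assumptions:

    import jigg.util.JSONUtil
    import org.json4s.jackson.JsonMethods.parse

    object JsonXmlRoundTripSketch {
      def main(args: Array[String]): Unit = {
        // A node with the same shape as testNode in the spec above.
        val node = <root><document id="d0">Test Node</document></root>

        val json = JSONUtil.toJSON(node)       // scala.xml.Node -> JSON string
        val back = JSONUtil.toXML(parse(json)) // JSON string -> JValue -> scala.xml.Node

        println(json)
        println(back)
      }
    }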
--------------------------------------------------------------------------------