├── .checker ├── README.md ├── scripts │ ├── before-install.sh │ ├── install-annotators.sh │ ├── install-benepar.sh │ ├── install-cabocha.sh │ ├── install-crf.sh │ ├── install-depccg.sh │ ├── install-jar.sh │ ├── install-juman.sh │ ├── install-knp.sh │ ├── install-mecab.sh │ ├── install-other-languages.sh │ ├── install-syntaxnet.sh │ ├── install-udpipe.sh │ ├── run-test.sh │ └── set-env.sh ├── setup.cfg └── tests │ ├── basetest.py │ ├── benepar │ └── test_benepar.py │ ├── cabocha │ └── test_cabocha.py │ ├── comparison.py │ ├── constant.py │ ├── corenlp │ ├── test_berkeleyparser_dcoref.py │ ├── test_dcoref.py │ ├── test_ssplit.py │ └── test_tokenize.py │ ├── corenlp_other_languages │ ├── test_chinese_coref.py │ └── test_french_depparse.py │ ├── depccg │ └── test_depccg_ccg.py │ ├── example_test.py │ ├── juman │ └── test_juman.py │ ├── knp │ └── test_knp.py │ ├── mecab │ └── test_mecab.py │ ├── syntaxnet │ └── test_syntaxnet.py │ └── udpipe │ ├── test_udpipe_parse.py │ └── test_udpipe_tokenize.py ├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── bin ├── sbt └── sbt-launch.jar ├── build.sbt ├── docker-compose.yml ├── dockers ├── knp │ └── Dockerfile └── syntaxnet │ └── Dockerfile ├── jar └── easyccg.jar ├── project ├── build.properties ├── buildinfo.sbt └── plugins.sbt ├── python ├── README.md ├── pipeline_example.py ├── pyjigg │ ├── __init__.py │ └── pipeline.py └── setup.py ├── script └── release.sh └── src ├── main ├── resources │ └── python │ │ ├── _depccg.py │ │ ├── bene_par.py │ │ └── udpipe.py └── scala │ └── jigg │ ├── ml │ ├── Example.scala │ ├── FeatureBase.scala │ ├── FeatureIndexer.scala │ ├── FeatureUtil.scala │ ├── LinearClassifier.scala │ ├── LogLinearAdaGradL1.scala │ ├── LogLinearClassifier.scala │ ├── LogLinearSGD.scala │ ├── OnlineLogLinearTrainer.scala │ ├── OnlineTrainer.scala │ ├── Perceptron.scala │ ├── WeightVector.scala │ └── keras │ │ ├── Convolution1D.scala │ │ ├── Dense.scala │ │ ├── Embedding.scala │ │ ├── Empty.scala │ │ ├── Flatten.scala │ │ ├── Functor.scala │ │ ├── KerasModel.scala │ │ ├── KerasParser.scala │ │ ├── README.md │ │ ├── Relu.scala │ │ ├── Sigmoid.scala │ │ ├── Softmax.scala │ │ └── Tanh.scala │ ├── nlp │ └── ccg │ │ ├── CCGBank.scala │ │ ├── CCGBank2EnjuXML.scala │ │ ├── CCGBankToCabochaFormat.scala │ │ ├── CalcCoverage.scala │ │ ├── EvalParser.scala │ │ ├── EvalSuperTagger.scala │ │ ├── GoldBunsetsuDepInCabocha.scala │ │ ├── LoadDumpedTaggerModel.scala │ │ ├── Opts.scala │ │ ├── OutputCategoryList.scala │ │ ├── ParserModel.scala │ │ ├── ParserRunner.scala │ │ ├── ParserTrainer.scala │ │ ├── README.md │ │ ├── RenderCCGDerivation.scala │ │ ├── SuperTaggerModel.scala │ │ ├── SuperTaggerRunner.scala │ │ ├── SuperTaggerTrainer.scala │ │ ├── TrainParser.scala │ │ ├── TrainSuperTagger.scala │ │ ├── lexicon │ │ ├── Bunsetsu.scala │ │ ├── CCGBankReader.scala │ │ ├── CabochaReader.scala │ │ ├── Category.scala │ │ ├── CategoryDictionary.scala │ │ ├── CategoryFeature.scala │ │ ├── CategoryManager.scala │ │ ├── CategoryParser.scala │ │ ├── CategoryTree.scala │ │ ├── Derivation.scala │ │ ├── Dictionary.scala │ │ ├── Direction.scala │ │ ├── JapaneseDictionary.scala │ │ ├── MecabReader.scala │ │ ├── Numbered.scala │ │ ├── NumberedManager.scala │ │ ├── ParseTree.scala │ │ ├── ParseTreeConverer.scala │ │ ├── PoS.scala │ │ ├── Sentence.scala │ │ ├── SimpleDictionary.scala │ │ ├── Slash.scala │ │ └── Word.scala │ │ ├── package.scala │ │ ├── parser │ │ ├── Action.scala │ │ ├── BeamSearchDecoder.scala │ │ ├── 
HeadFinder.scala │ │ ├── KBestDecoder.scala │ │ ├── Oracle.scala │ │ ├── Rule.scala │ │ ├── ShiftReduceFeature.scala │ │ ├── ShiftReduceFeatureExtractors.scala │ │ ├── State.scala │ │ ├── TransitionBasedParser.scala │ │ └── package.scala │ │ └── tagger │ │ ├── MaxentMultiTagger.scala │ │ ├── SuperTaggingFeature.scala │ │ ├── SuperTaggingFeatureExtractors.scala │ │ ├── UserDefinedFeatureExtractors.scala │ │ └── package.scala │ ├── pipeline │ ├── AnnotatingInParallel.scala │ ├── Annotation.scala │ ├── AnnotationError.scala │ ├── Annotator.scala │ ├── ArgumentError.scala │ ├── BeneParAnnotator.scala │ ├── BerkeleyParserAnnotator.scala │ ├── BunsetsuKerasAnnotator.scala │ ├── CCGParseAnnotator.scala │ ├── CabochaAnnotator.scala │ ├── CandCAnnotator.scala │ ├── DepCCGAnnotator.scala │ ├── DocumentAnnotator.scala │ ├── DocumentKNPAnnotator.scala │ ├── EasyCCGAnnotator.scala │ ├── IOCommunicator.scala │ ├── JumanAnnotator.scala │ ├── KNPAnnotator.scala │ ├── KuromojiAnnotator.scala │ ├── MecabAnnotator.scala │ ├── OutputConverter.scala │ ├── Pipeline.scala │ ├── PipelineServer.scala │ ├── PropsHolder.scala │ ├── RegexDocumentAnnotator.scala │ ├── RegexSentenceAnnotator.scala │ ├── Requirement.scala │ ├── SentencesAnnotator.scala │ ├── SimpleKNPAnnotator.scala │ ├── SpaceTokenizerAnnotator.scala │ ├── SsplitKerasAnnotator.scala │ ├── StanfordCollapsedDependenciesAnnotator.scala │ ├── StanfordCoreNLPAnnotator.scala │ ├── StanfordTypedDependenciesAnnotator.scala │ ├── SyntaxNetAnnotator.scala │ ├── SystemDict.scala │ ├── UDPipeAnnotator.scala │ └── UnmanagedAnnotators.scala │ └── util │ ├── ArgumentsParser.scala │ ├── CoNLLUtil.scala │ ├── HDF5Object.scala │ ├── IDGenerator.scala │ ├── IOUtil.scala │ ├── JSONUtil.scala │ ├── LogUtil.scala │ ├── LookupTable.scala │ ├── Normalizer.scala │ ├── Prop.java │ ├── PropertiesUtil.scala │ ├── ResourceUtil.scala │ ├── TreesUtil.scala │ └── XMLUtil.scala └── test ├── resources ├── data │ ├── Japanese.small.lexicon │ ├── Japanese.unkVerb.lexicon │ ├── json │ │ ├── english.ssplit.test.json │ │ └── japanese.ssplit.test.json │ ├── keras │ │ ├── bunsetsu_model.h5 │ │ ├── jpnLookupCharacter.json │ │ ├── jpnLookupWords.json │ │ └── ssplit_model.h5 │ ├── ml │ │ └── keras │ │ │ ├── convolution1d │ │ │ ├── convolution1d_gold.csv │ │ │ ├── convolution1d_input.csv │ │ │ └── convolution1d_model.h5 │ │ │ ├── dense │ │ │ ├── dense_gold.csv │ │ │ ├── dense_input.csv │ │ │ └── dense_model.h5 │ │ │ ├── embedding │ │ │ ├── embedding_gold.csv │ │ │ ├── embedding_input.csv │ │ │ └── embedding_model.h5 │ │ │ ├── flatten │ │ │ ├── flatten_gold.csv │ │ │ ├── flatten_input.csv │ │ │ └── flatten_model.h5 │ │ │ └── kerasModel │ │ │ ├── kerasModel_gold.csv │ │ │ ├── kerasModel_input.csv │ │ │ └── kerasModel_model.h5 │ ├── template.small.lst │ ├── template.unkVerb.lst │ └── xml │ │ ├── english.ssplit.spaceTokenize.gold.xml │ │ ├── english.ssplit.test.xml │ │ ├── japanese.ssplit.kuromoji.gold.xml │ │ └── japanese.ssplit.test.xml └── script │ └── create_small_lst_from_lexicon.py └── scala └── jigg ├── ml └── keras │ ├── Convolution1DSpec.scala │ ├── DenseSpec.scala │ ├── EmbeddingSpec.scala │ ├── FlattenSpec.scala │ ├── KerasModelSpec.scala │ └── KerasParserTest.scala ├── nlp └── ccg │ ├── lexicon │ ├── BunsetsuTest.scala │ ├── CCGBankReaderTest.scala │ ├── CategoryFeatureTest.scala │ ├── CategoryManagerTest.scala │ ├── CategoryParserTest.scala │ └── JapaneseDictionaryTest.scala │ └── parser │ ├── KBestDecoderTest.scala │ ├── OracleTest.scala │ ├── ParsedSentence.scala │ └── RuleTest.scala 
├── pipeline ├── AnnotatorSpec.scala ├── BaseAnnotatorSpec.scala ├── BeneParAnnotatorSpec.scala ├── BerkeleyParserAnnotatorSpec.scala ├── BunsetsuKerasAnnotatorTest.scala ├── CabochaAnnotatorSpec.scala ├── DepCCGAnnotatorSpec.scala ├── DocumentKNPAnnotatorSpec.scala ├── EasyCCGAnnotatorSpec.scala ├── IntermediateInputSpec.scala ├── JumanAnnotatorSpec.scala ├── KuromojiAnnotatorSpec.scala ├── MecabAnnotatorSpec.scala ├── PipelineSpec.scala ├── RegexSentenceAnnotatorTest.scala ├── RequirementSpec.scala ├── SimpleKNPAnnotatorSpec.scala ├── SsplitKerasAnnotatorTest.scala ├── StanfordTypedDependenciesAnnotatorSpec.scala └── SyntaxNetAnnotatorSpec.scala └── util ├── CoNLLUtilSpec.scala ├── JSONUtilSpec.scala ├── TreesUtilSpec.scala └── XMLUtilSpec.scala /.checker/scripts/before-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # upgrade c++ 4 | # add repository 5 | sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y 6 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F 7 | 8 | sudo apt update -y && sudo apt install g++-4.9 gcc-4.9 -y 9 | 10 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.8 10 11 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 20 12 | sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.8 10 13 | sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 20 14 | 15 | sudo rm /usr/bin/cpp 16 | 17 | sudo update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-4.8 10 18 | sudo update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-4.9 20 19 | sudo update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 30 20 | sudo update-alternatives --set cc /usr/bin/gcc 21 | sudo update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 30 22 | sudo update-alternatives --set c++ /usr/bin/g++ 23 | -------------------------------------------------------------------------------- /.checker/scripts/install-annotators.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ${ANNOTATORS} == "udpipe" ];then 4 | echo "Install UDPIPE" 5 | ./.checker/scripts/install-udpipe.sh 6 | elif [ ${ANNOTATORS} == "depccg" ];then 7 | echo "Install DEPCCG" 8 | ./.checker/scripts/install-depccg.sh 9 | elif [ ${ANNOTATORS} == "mecab" ];then 10 | echo "Install MECAB" 11 | ./.checker/scripts/install-mecab.sh 12 | elif [ ${ANNOTATORS} == "cabocha" ];then 13 | echo "Install CABOCHA" 14 | ./.checker/scripts/install-mecab.sh 15 | ./.checker/scripts/install-crf.sh 16 | ./.checker/scripts/install-cabocha.sh 17 | elif [ ${ANNOTATORS} == "juman" ];then 18 | echo "Install JUMAN" 19 | ./.checker/scripts/install-juman.sh 20 | elif [ ${ANNOTATORS} == "knp" ];then 21 | echo "Install KNP" 22 | ./.checker/scripts/install-knp.sh 23 | elif [ ${ANNOTATORS} == "corenlp" ];then 24 | echo "Install CORENLP" 25 | ./.checker/scripts/install-jar.sh 26 | elif [ ${ANNOTATORS} == "corenlp_other_languages" ];then 27 | echo "Install CORENLP OTHER LANGUAGE" 28 | ./.checker/scripts/install-jar.sh 29 | ./.checker/scripts/install-other-languages.sh 30 | elif [ ${ANNOTATORS} == "benepar" ];then 31 | echo "Install BENEPAR" 32 | ./.checker/scripts/install-jar.sh 33 | ./.checker/scripts/install-benepar.sh 34 | elif [ ${ANNOTATORS} == "syntaxnet" ];then 35 | echo "Install SYNTAXNET" 36 | ./.checker/scripts/install-jar.sh 37 | ./.checker/scripts/install-syntaxnet.sh 38 | fi 39 |
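40 | # The ANNOTATORS variable is provided by the CI environment (see the env: list in .travis.yml). 41 | # For a local run it can be set explicitly before calling this script, e.g.: 42 | # ANNOTATORS=mecab ./.checker/scripts/install-annotators.sh && .checker/scripts/run-test.sh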
-------------------------------------------------------------------------------- /.checker/scripts/install-benepar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | pip install cython numpy 6 | pip install benepar[cpu] 7 | 8 | python -c 'import benepar; benepar.download("benepar_en2")' 9 | -------------------------------------------------------------------------------- /.checker/scripts/install-cabocha.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | source ./.checker/scripts/set-env.sh 6 | 7 | home_dir=`pwd ./` 8 | 9 | url="https://github.com/taku910/cabocha/archive/master.zip" 10 | file=master.zip 11 | dir=cabocha-master 12 | 13 | # download 14 | wget ${url} 15 | 16 | # unpack 17 | unzip ${file} 18 | 19 | # compile 20 | cd ${home_dir}"/"${dir} 21 | ./autogen.sh 22 | ./configure --with-charset=UTF8 23 | make 24 | make check 25 | sudo make install 26 | 27 | cd ${home_dir} 28 | -------------------------------------------------------------------------------- /.checker/scripts/install-crf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | 7 | # To get file id, you singup google account. 8 | url="https://drive.google.com/uc?export=view&id=0B4y35FiV1wh7QVR6VXJ5dWExSTQ" 9 | file=CRF++-0.58.tar.gz 10 | dir=CRF++-0.58 11 | 12 | wget ${url} -O ${file} 13 | 14 | tar -zxvf ${file} 15 | 16 | cd ${home_dir}"/"${dir} 17 | ./configure 18 | make 19 | sudo make install 20 | 21 | cd ${home_dir} 22 | -------------------------------------------------------------------------------- /.checker/scripts/install-depccg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | 7 | pip install cython numpy 8 | pip install depccg 9 | 10 | depccg_en download 11 | depccg_ja download 12 | 13 | # en_model_url=http://cl.naist.jp/~masashi-y/resources/depccg/en_hf_tri.tar.gz 14 | # ja_model_url=http://cl.naist.jp/~masashi-y/resources/depccg/ja_hf_ccgbank.tar.gz 15 | # en_model=en_hf_tri.tar.gz 16 | # ja_model=ja_hf_ccgbank.tar.gz 17 | 18 | # model_dir="depccg/models" 19 | # src_dir="depccg/src" 20 | 21 | # # Install cython & chainer. 22 | # pip install -U pip cython 23 | # pip install chainer 24 | # pip install scrapy 25 | 26 | # # Git clone the depccg repository 27 | # git clone https://github.com/masashi-y/depccg.git 28 | 29 | # # download model file. 30 | # wget ${en_model_url} 31 | # wget ${ja_model_url} 32 | 33 | # # make directory saved model file 34 | # mkdir ${model_dir} 35 | # mv ${en_model} ${ja_model} ${model_dir} 36 | 37 | # # compile 38 | # # A default g++ version is 4.8 in Ubuntu 14.04. 39 | # # In depccg compile, it requires the version >= 4.9. 40 | # export CC=g++-4.9 41 | # cd ${home_dir}"/"${src_dir} 42 | # python setup.py build_ext --inplace 43 | 44 | # ln -s depccg*.so depccg.so 45 | 46 | # # unpack model files. 
47 | # cd ${home_dir}"/"${model_dir} 48 | # tar -zxvf ${en_model} 49 | # tar -zxvf ${ja_model} 50 | 51 | # cd ${home_dir} 52 | -------------------------------------------------------------------------------- /.checker/scripts/install-jar.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | jar_dir="jar/" 7 | 8 | 9 | # download stanford corenlp 10 | url=http://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip 11 | zip=stanford-corenlp-full-2018-10-05.zip 12 | dir=stanford-corenlp-full-2018-10-05 13 | file=stanford-corenlp-3.9.2.jar 14 | file_model=stanford-corenlp-3.9.2-models.jar 15 | 16 | # download Stanford CoreNLP models 17 | wget ${url} 18 | 19 | # unpack 20 | unzip ${zip} 21 | 22 | cp ${dir}"/"${file} ${jar_dir} 23 | cp ${dir}"/"${file_model} ${jar_dir} 24 | 25 | 26 | # create jigg jar file 27 | jigg_file="target/jigg-assembly-0.8.0.jar" 28 | ./bin/sbt assembly 29 | cp ${jigg_file} ${jar_dir} 30 | 31 | 32 | # download jigg-models 33 | jigg_models="jigg-models.jar" 34 | wget https://github.com/mynlp/jigg-models/raw/master/jigg-models.jar 35 | mv ${jigg_models} ${jar_dir} 36 | -------------------------------------------------------------------------------- /.checker/scripts/install-juman.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | home_dir=`pwd ./` 6 | 7 | url=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 8 | file=juman-7.01.tar.bz2 9 | dir=juman-7.01 10 | 11 | # download 12 | wget ${url} 13 | 14 | # unpack bz2 file 15 | tar -jxvf ${file} 16 | 17 | # build 18 | cd ${dir} 19 | ./configure 20 | make 21 | sudo make install 22 | 23 | cd ${home_dir} 24 | -------------------------------------------------------------------------------- /.checker/scripts/install-knp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | docker build -t jigg/jigg:knp -f dockers/knp/Dockerfile . 6 | -------------------------------------------------------------------------------- /.checker/scripts/install-mecab.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | sudo apt install libmecab2 libmecab-dev mecab mecab-ipadic-utf8 mecab-ipadic mecab-utils 6 | -------------------------------------------------------------------------------- /.checker/scripts/install-other-languages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | jar_dir="jar" 4 | 5 | # chinese model jar file 6 | wget http://nlp.stanford.edu/software/stanford-chinese-corenlp-2018-10-05-models.jar 7 | mv stanford-chinese-corenlp-2018-10-05-models.jar ${jar_dir} 8 | 9 | # french model jar file 10 | wget http://nlp.stanford.edu/software/stanford-french-corenlp-2018-10-05-models.jar 11 | mv stanford-french-corenlp-2018-10-05-models.jar ${jar_dir} 12 | 13 | 14 | -------------------------------------------------------------------------------- /.checker/scripts/install-syntaxnet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | docker build -t jigg/jigg:syntaxnet -f dockers/syntaxnet/Dockerfile . 
6 | -------------------------------------------------------------------------------- /.checker/scripts/install-udpipe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | pip install ufal.udpipe 6 | 7 | # model download 8 | curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2364/udpipe-ud-2.0-170801.zip 9 | 10 | # unpack 11 | unzip udpipe-ud-2.0-170801.zip 12 | 13 | # rename model directory 14 | mv udpipe-ud-2.0-170801 udpipe-ud-model 15 | -------------------------------------------------------------------------------- /.checker/scripts/run-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | source .checker/scripts/set-env.sh 6 | 7 | # run a unit test for the files under the directory `.checker/tests/${ANNOTATORS}`. 8 | python3 -m unittest discover -s .checker/tests/${ANNOTATORS} 9 | -------------------------------------------------------------------------------- /.checker/scripts/set-env.sh: -------------------------------------------------------------------------------- 1 | export JIGG_VERSION="0.8.0" 2 | export CORENLP_VERSION="3.9.2" 3 | export IVY2_CACHE_DIR="${HOME}/.ivy2/cache" 4 | 5 | export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:$LD_LIBRARY_PATH 6 | -------------------------------------------------------------------------------- /.checker/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 -------------------------------------------------------------------------------- /.checker/tests/constant.py: -------------------------------------------------------------------------------- 1 | JIGG_VERSION = "0.8.0" 2 | CORENLP_VERSION = "3.9.2" 3 | 4 | JIGG_JAR = "target/jigg-assembly-{}.jar".format(JIGG_VERSION) 5 | JIGG_MODEL_JAR = "jigg-models.jar" 6 | 7 | CORENLP_MODEL_JAR = "stanford-corenlp-{}-models.jar".format(CORENLP_VERSION) 8 | 9 | 10 | # URL 11 | # juman 12 | JUMAN_MAIN_URL = "http://nlp.ist.i.kyoto-u.ac.jp/?JUMAN" 13 | JUMAN_DOWNLOAD_URL = "http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2" 14 | 15 | # knp 16 | KNP_MAIN_URL = "http://nlp.ist.i.kyoto-u.ac.jp/?KNP" 17 | KNP_DOWNLOAD_URL = "http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/knp/knp-4.19.tar.bz2" 18 | 19 | # CRF 20 | CRF_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7QVR6VXJ5dWExSTQ" 21 | 22 | # cabocha 23 | CABOCHA_DOWNLOAD_URL = "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7SDd1Q1dUQkZQaUU" 24 | 25 | # depccg 26 | DEPCCG_EN_MODLE_URL = "http://cl.naist.jp/~masashi-y/resources/depccg/en_hf_tri.tar.gz" 27 | DEPCCG_JA_MODEL_URL = "http://cl.naist.jp/~masashi-y/resources/depccg/ja_hf_ccgbank.tar.gz" 28 | DEPCCG_GIT_URL = "https://github.com/masashi-y/depccg.git" 29 | 30 | # udpipe 31 | UDPIPE_MODEL_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2364/udpipe-ud-2.0-170801.zip" 32 | -------------------------------------------------------------------------------- /.checker/tests/corenlp/test_ssplit.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestSsplit(BaseTest): 8 | 9 | def setUp(self): 10 | 11 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891." 
12 | 13 | self.expected_text = """ 14 | 15 | 16 | 17 | 18 | Stanford University is located in California. 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | It is a great university, founded in 1891. 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | """ 47 | 48 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize,ssplit]' 49 | 50 | def test_ssplit(self): 51 | self.check_equal(self.exe, self.input_text, self.expected_text) 52 | -------------------------------------------------------------------------------- /.checker/tests/corenlp/test_tokenize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestTokenize(BaseTest): 8 | 9 | def setUp(self): 10 | 11 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891." 12 | 13 | self.expected_text = """ 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | """ 41 | 42 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]' 43 | 44 | def test_tokenize(self): 45 | self.check_equal(self.exe, self.input_text, self.expected_text) 46 | -------------------------------------------------------------------------------- /.checker/tests/example_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestName(BaseTest): 8 | ''' 9 | This is an exmaple (or a based) file of unittest. You want 10 | to add the new test file, please copy this file and edit 11 | it as the following. 12 | 13 | 1. Copy this file 14 | please, copy this file as the following command: 15 | ``` 16 | cp example_test.py {ANNOTATORS}/test_***.py 17 | ``` 18 | The {ANNOTATORS} is annotator name. 19 | You need to name the file like `test_***.py`. `***` is any name. 20 | Note the head to the file name must give the `test`. For example, 21 | `test_tokenize.py`. 22 | 2. Change the class name 23 | For each the test case, You change the class name from 24 | TestName to Test***. `***` is any name, for example, 25 | Tokenize, Ssplit, ... etc. 26 | 3. Change three variables in the setUp() function 27 | - self.input_text : a sample text using for test 28 | - self.expected_text : an expected output text by test run 29 | - self.exe : an execution command 30 | This program runs with the sbt runMain command. For example, 31 | `sbt "runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]"`. 32 | You set the part of "runMain ~" in the variable `self.exe`. 33 | 4. Change the function name. 34 | For each the test case, You also change the function name 35 | from test_name to test_***. `***` is any name, for example, 36 | tokenize, ssplit, ... etc. Note that the head of the 37 | function name must give the `test`. 38 | 39 | For example, the case of the annotator `pos`: 40 | 1. file name -> test_pos.py 41 | 2. class name -> class TestPos(BaseTest): 42 | 3. variables -> 43 | self.input_text = "This is a sample text." 44 | self.expected_text = "[the result text]" 45 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize,ssplit,pos]' 46 | 4. 
function name -> def test_pos(self): 47 | ''' 48 | def setUp(self): 49 | # Set an input (sample) text 50 | self.input_text = "" 51 | 52 | # Set an expected text 53 | self.expected_text = "" 54 | 55 | # Set a execution command 56 | # You need to change the `-annotators` term according to the test case. 57 | # For example, the case of annotation `lemma`, corenlp[tokenize,ssplit,pos,lemma]. 58 | self.exe = 'runMain jigg.pipeline.Pipeline -annotators corenlp[tokenize]' 59 | 60 | def test_name(self): 61 | # A function check_equal() is defined on the superclass BaseTest. 62 | self.check_equal(self.exe, self.input_text, self.expected_text) 63 | -------------------------------------------------------------------------------- /.checker/tests/udpipe/test_udpipe_tokenize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append(".checker/tests") 3 | 4 | from basetest import BaseTest 5 | 6 | 7 | class TestUDpipeTokenize(BaseTest): 8 | 9 | def setUp(self): 10 | self.input_text = "Stanford University is located in California. It is a great university, founded in 1891." 11 | 12 | self.expected_text = r""" 13 | 14 | 15 | 16 | 17 | Stanford University is located in California. 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | It is a great university, founded in 1891. 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | """ 46 | 47 | self.exe = 'runMain jigg.pipeline.Pipeline ' \ 48 | + '-annotators udpipe[tokenize] ' \ 49 | + '-udpipe.model udpipe-ud-model/english-ud-2.0-170801.udpipe ' 50 | 51 | def test_udpipe_tokenize(self): 52 | self.check_equal(self.exe, self.input_text, self.expected_text) 53 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !bin 3 | !project 4 | !python 5 | !script 6 | !src 7 | !build.sbt 8 | !jar -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.o 3 | *.pyc 4 | .lock* 5 | .waf* 6 | *.class 7 | build/ 8 | target/ 9 | .idea/ 10 | models/ 11 | tools/ 12 | download 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | 3 | scala: 4 | - 2.11.8 5 | 6 | dist: trusty 7 | 8 | env: 9 | - ANNOTATORS=corenlp 10 | - ANNOTATORS=corenlp_other_languages 11 | - ANNOTATORS=udpipe 12 | - ANNOTATORS=depccg 13 | - ANNOTATORS=juman 14 | - ANNOTATORS=knp 15 | - ANNOTATORS=mecab 16 | - ANNOTATORS=cabocha 17 | - ANNOTATORS=benepar 18 | # - ANNOTATORS=syntaxnet 19 | 20 | before_install: 21 | - ./.checker/scripts/before-install.sh 22 | - pyenv global system 3.6 23 | - virtualenv --python=python3.6 .venv 24 | - source .venv/bin/activate 25 | - pip install --upgrade pip 26 | 27 | install: 28 | - ./.checker/scripts/install-annotators.sh 29 | 30 | script: 31 | - .checker/scripts/run-test.sh 32 | 33 | branches: 34 | only: 35 | - master -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:8 2 | 3 | WORKDIR /jigg 4 | 5 | # Download dependencies 6 | COPY build.sbt /jigg/ 7 | COPY project/*.sbt project/build.properties /jigg/project/ 8 | COPY bin /jigg/bin 9 | 
RUN bin/sbt update 10 | 11 | # Build 12 | COPY src /jigg/src 13 | COPY jar /jigg/jar 14 | RUN bin/sbt assembly 15 | 16 | # Run a simple test 17 | RUN echo "テレビで自転車で走っている少女を見た" |\ 18 | java -Xms1024M -Xmx1024M -cp "target/*:jar/jigg-models.jar" \ 19 | jigg.pipeline.Pipeline -annotators ssplit,kuromoji,jaccg 20 | -------------------------------------------------------------------------------- /bin/sbt: -------------------------------------------------------------------------------- 1 | java -Dfile.encoding=UTF-8 -Xms512M -Xmx1536M -Xss1M -XX:+CMSClassUnloadingEnabled -XX:MaxPermSize=384M -jar `dirname $0`/sbt-launch.jar "$@" 2 | -------------------------------------------------------------------------------- /bin/sbt-launch.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/bin/sbt-launch.jar -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | jigg: 4 | build: . 5 | ports: 6 | - 8080:8080 7 | entrypoint: 8 | - java 9 | - -Xms1024M 10 | - -Xmx1024M 11 | - -cp 12 | - "target/*:jar/*" 13 | - jigg.pipeline.PipelineServer 14 | - -host 15 | - 0.0.0.0 16 | volumes: 17 | - ./script:/jigg/script 18 | - ./jar:/jigg/jar 19 | -------------------------------------------------------------------------------- /dockers/knp/Dockerfile: -------------------------------------------------------------------------------- 1 | # If you build a image using this file, please run the following command at a directory `jigg/`, 2 | # ``` 3 | # docker build -t {image name}:{tag} -f docker/knp/Dockerfile . 4 | # ``` 5 | FROM jigg/jigg-dockers:knp 6 | 7 | WORKDIR /jigg 8 | 9 | ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:/usr/local/bin:/usr/local/lib 10 | ENV PATH $PATH:$HOME/usr/bin 11 | 12 | COPY build.sbt /jigg/ 13 | COPY project/*.sbt project/build.properties /jigg/project/ 14 | COPY bin /jigg/bin 15 | RUN bin/sbt update 16 | 17 | # Build 18 | COPY src /jigg/src 19 | COPY jar /jigg/jar 20 | RUN bin/sbt assembly -------------------------------------------------------------------------------- /dockers/syntaxnet/Dockerfile: -------------------------------------------------------------------------------- 1 | # If you build a image using this file, please run the following command at a directory 'jigg/', 2 | # ``` 3 | # docker build -t {image name}:{tag} -f dockers/syntaxnet/Dockerfile . 
4 | # ``` 5 | 6 | FROM tensorflow/syntaxnet 7 | 8 | WORKDIR /jigg 9 | 10 | RUN apt-get update -y && apt-get install -y less wget tar bzip2 unzip sudo make gcc g++ libz-dev 11 | 12 | # install jigg 13 | COPY build.sbt /jigg/ 14 | COPY project/*.sbt project/build.properties /jigg/project/ 15 | COPY bin /jigg/bin 16 | RUN bin/sbt update 17 | 18 | # Build 19 | COPY src /jigg/src 20 | COPY jar /jigg/jar 21 | RUN bin/sbt assembly 22 | -------------------------------------------------------------------------------- /jar/easyccg.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/jar/easyccg.jar -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.0 2 | -------------------------------------------------------------------------------- /project/buildinfo.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.7.0") 2 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | 3 | // for sbt-sonatype (https://github.com/xerial/sbt-sonatype) 4 | addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.1") 5 | addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.1.0") 6 | -------------------------------------------------------------------------------- /python/pipeline_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from pyjigg import Pipeline 4 | import xml.etree.ElementTree as ET 5 | import json 6 | 7 | '''Example to use Jigg from python. 8 | 9 | Before using this, users must start the PipelineServer in a command line, e.g.: 10 | $ cd jigg-0.6.2/ 11 | $ java -Xmx4g -cp "*" jigg.pipeline.PipelineServer 12 | ''' 13 | 14 | if __name__ == '__main__': 15 | pipeline = Pipeline('http://localhost:8080') 16 | 17 | text1 = """This is the first sentence. This is the second sentence.""" 18 | 19 | text2 = """This is the third sentence. 
This is the forth sentence.""" 20 | 21 | output1 = pipeline.annotate(text1, { 22 | 'annotators': 'corenlp[tokenize,ssplit]', 23 | 'outputFormat': 'xml'}) 24 | print ET.tostring(output1) 25 | 26 | output2 = pipeline.annotate(text2, { 27 | 'annotators': 'corenlp[tokenize,ssplit]', 28 | 'outputFormat': 'json'}) 29 | print json.dumps(output2, indent=4) 30 | -------------------------------------------------------------------------------- /python/pyjigg/__init__.py: -------------------------------------------------------------------------------- 1 | from pyjigg.pipeline import Pipeline 2 | -------------------------------------------------------------------------------- /python/pyjigg/pipeline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import xml.etree.ElementTree as ET 4 | import json 5 | import requests 6 | 7 | JIGG = 'jigg-0.6.2' 8 | 9 | class Pipeline: 10 | 11 | def __init__(self, server_url): 12 | if server_url[-1] == '/': 13 | server_url = server_url[:-1] 14 | self.server_url = server_url 15 | 16 | def annotate(self, text, properties=None): 17 | assert isinstance(text, str) 18 | if properties is None: 19 | properties = {} 20 | else: 21 | assert isinstance(properties, dict) 22 | 23 | # Checks that the Jigg Pipeline server is started. 24 | try: 25 | requests.get(self.server_url) 26 | except requests.exceptions.ConnectionError: 27 | raise Exception('Check whether you have started the Jigg\'s PipelineServer e.g.\n' 28 | '$ cd %s/ \n' 29 | '$ java -Xmx4g -cp "*" jigg.pipeline.PipelineServer' % (JIGG)) 30 | 31 | url = self.server_url + '/annotate' 32 | text = text.encode() 33 | data = properties.copy() 34 | data['q'] = text 35 | r = requests.post(url, data=data) 36 | output = r.text 37 | if ('outputFormat' in properties and properties['outputFormat'] == 'json'): 38 | try: 39 | output = json.loads(output, encoding='utf-8', strict=True) 40 | except: 41 | pass 42 | else: 43 | try: 44 | output = ET.fromstring(output) 45 | except: 46 | pass 47 | 48 | return output 49 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name = "pyjigg", 5 | packages=['pyjigg'], 6 | version = "0.1.0", 7 | ) 8 | -------------------------------------------------------------------------------- /script/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Usage: ./script/release.sh (e.g., 0.7.2) 4 | 5 | version=$1 6 | corenlp_url='http://nlp.stanford.edu/software/stanford-corenlp-full-2018-02-27.zip' 7 | corenlp_model='stanford-corenlp-3.9.1-models.jar' 8 | jigg_url='git@github.com:mynlp/jigg.git' 9 | 10 | corenlp_zip=${corenlp_url##*/} 11 | corenlp_dir=${corenlp_zip%.*} 12 | 13 | if [[ ! -e jigg-${version} ]]; then mkdir jigg-${version}; fi 14 | cd jigg-${version} 15 | 16 | # get jigg, if needed 17 | if [[ ! -e jigg ]]; then 18 | git clone $jigg_url 19 | fi 20 | 21 | # add corenlp model 22 | if [[ ! -e ${corenlp_dir} ]]; then 23 | wget ${corenlp_url} -O ${corenlp_zip} 24 | unzip ${corenlp_zip} 25 | mv ${corenlp_dir}/${corenlp_model} jigg 26 | fi 27 | 28 | # add assembled jigg 29 | if [[ ! 
-e jigg/jigg-$1.jar ]]; then 30 | cd jigg 31 | ./bin/sbt assembly 32 | mv target/jigg-assembly-$1.jar jigg-$1.jar 33 | ./bin/sbt clean 34 | cd ../ 35 | fi 36 | 37 | for f in 'src/test' '.checker' '.git' 'project' 'target'; do 38 | if [[ -e jigg/$f ]]; then 39 | rm -rf jigg/$f 40 | fi 41 | done 42 | 43 | if [[ -e jigg/.git ]]; then 44 | rm -rf jigg/.git 45 | fi 46 | 47 | # if [[ -e jigg/src/test ]]; then 48 | # rm -rf jigg/src/test 49 | # fi 50 | 51 | # if [[ -e jigg/.checker ]]; then rm -rf jigg/.checker; fi 52 | 53 | # if [[ -e jigg/project ]]; then rm -rf jigg/project; fi 54 | # if [[ -e jigg/target ]]; then rm -rf jigg/target; fi 55 | 56 | # add jigg models (berkeley parser model inside) 57 | if [[ ! -e jigg/jigg-models.jar ]]; then 58 | cd jigg 59 | wget https://github.com/mynlp/jigg-models/raw/master/jigg-models.jar 60 | cd ../ 61 | fi 62 | 63 | mv jigg jigg-${version} 64 | zip -r jigg-${version}.zip jigg-${version} 65 | -------------------------------------------------------------------------------- /src/main/resources/python/bene_par.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function, unicode_literals 3 | import sys 4 | 5 | import benepar 6 | 7 | # In Python2, wrap sys.stdin and sys.stdout to work with unicode. 8 | if sys.version_info[0] < 3: 9 | import codecs 10 | import locale 11 | encoding = locale.getpreferredencoding() 12 | sys.stdin = codecs.getreader(encoding)(sys.stdin) 13 | sys.stdout = codecs.getwriter(encoding)(sys.stdout) 14 | 15 | if sys.version_info.major == 3: 16 | raw_input = input 17 | 18 | model = sys.argv[1] # maybe "benepar_en" 19 | 20 | parser = benepar.Parser(model) 21 | 22 | def parse(tokens, tags): 23 | sentence = list(zip(tokens, tags)) 24 | parse_raw, tags_raw, sentence = next(parser._batched_parsed_raw([(tokens, sentence)])) 25 | tree = parser._make_nltk_tree(sentence, tags_raw, *parse_raw) 26 | return tree 27 | 28 | while True: 29 | tokens = raw_input() 30 | tags = raw_input() 31 | 32 | tokens = tokens.split(' ') 33 | tags = tags.split(' ') 34 | 35 | tree = parse(tokens, tags) 36 | print(tree) 37 | print("END") 38 | -------------------------------------------------------------------------------- /src/main/resources/python/udpipe.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function, unicode_literals 3 | import sys 4 | 5 | from ufal.udpipe import Model, Pipeline, ProcessingError 6 | 7 | # In Python2, wrap sys.stdin and sys.stdout to work with unicode. 8 | if sys.version_info[0] < 3: 9 | import codecs 10 | import locale 11 | encoding = locale.getpreferredencoding() 12 | sys.stdin = codecs.getreader(encoding)(sys.stdin) 13 | sys.stdout = codecs.getwriter(encoding)(sys.stdout) 14 | 15 | if sys.version_info.major == 3: 16 | raw_input = input 17 | 18 | # To reduce the overhead we divide the patterns of a possible pipeline into 3 cases. 
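# Mode names are matched by substring below: a mode containing 'tok' reads raw text and tokenizes it (otherwise CoNLL-U input is expected), 'pos' enables the tagger, 'par' enables the dependency parser, and 'all' runs tokenizer, tagger and parser together.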
19 | _MODE_ = ['all', 'tok|pos', 'pos|par', 'tok', 'pos', 'par'] 20 | 21 | model = sys.argv[1] 22 | mode = sys.argv[2] # one of _MODE_ 23 | 24 | model = Model.load(model) 25 | 26 | if mode == 'all' or mode.find('tok') >= 0: input_format = 'tokenize' 27 | else: input_format = 'conllu' 28 | output_format = 'conllu' 29 | 30 | if mode == 'all' or mode.find('pos') >= 0: pos = Pipeline.DEFAULT 31 | else: pos = Pipeline.NONE 32 | 33 | if mode == 'all' or mode.find('par') >= 0: parse = Pipeline.DEFAULT 34 | else: parse = Pipeline.NONE 35 | 36 | pipeline = Pipeline( 37 | model, input_format, pos, parse, output_format) 38 | error = ProcessingError() 39 | 40 | while True: 41 | inputs = [] 42 | while True: 43 | line = raw_input() 44 | if line == '####EOD####': break 45 | inputs.append(line) 46 | 47 | result = pipeline.process('\n'.join(inputs), error) 48 | print(result) 49 | print('END') 50 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/Example.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | case class Example[L](featVec:Array[Int], label:L) 20 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/FeatureBase.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | trait FeatureBase 20 | 21 | // Unlabeld feature, but not limited to: user may want to create features always with label (e.g., in structured classification exam). In such case, please include label to this class and ignore LabeldFeature. 
22 | trait Feature extends FeatureBase { 23 | type LabelType 24 | type DictionaryType 25 | def assignLabel(label:LabelType): LabeledFeature[LabelType] 26 | def concat(items:Any*): String = items.mkString("_###_") 27 | } 28 | 29 | trait LabeledFeature[L] extends FeatureBase { 30 | def unlabeled: Feature 31 | def label: L 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/FeatureIndexer.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.{HashMap, ArrayBuffer} 20 | 21 | @SerialVersionUID(1L) 22 | trait FeatureIndexer[Feature] extends Serializable { 23 | def size: Int 24 | 25 | /** Mutable indexing method which may add a new entry into the backbone map 26 | */ 27 | def getIndex(key: Feature): Int 28 | 29 | /** Immutable indexing, -1 for unknown entry. 30 | */ 31 | def get(key: Feature) = getIndex(key) 32 | } 33 | 34 | @SerialVersionUID(1L) 35 | class ExactFeatureIndexer[Feature](val map: HashMap[Feature, Int]) extends FeatureIndexer[Feature] { 36 | 37 | def size: Int = map.size 38 | 39 | def getIndex(key: Feature) = map.getOrElseUpdate(key, map.size) 40 | 41 | override def get(key: Feature) = map.getOrElse(key, -1) 42 | } 43 | 44 | /** FeatureIndexer with hash trick. Hash value is calculated with MurmurHash3. 45 | * 46 | * Pros of this approach are: 47 | * 1) Very memory efficient; we don't have to hold a hashmap for millions of feature objects; 48 | * 2) Small loading time of model. 49 | * 50 | * The expense is a small loss of accuracy but usually this is really small... 
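 * A minimal usage sketch with the defaults (hashCode-based hasher, table size just below 2^24): * val indexer = HashedFeatureIndexer[String]() * val idx = indexer.getIndex("w=dog") // abs(hash) modulo a fixed prime, so the indexer never grows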
51 | */ 52 | @SerialVersionUID(1L) 53 | class HashedFeatureIndexer[Feature] private( 54 | val maxFeatureSize: Int, 55 | val hasher: (Feature => Int)) extends FeatureIndexer[Feature] { 56 | 57 | def size = maxFeatureSize 58 | 59 | def getIndex(key: Feature) = (math.abs(hasher(key)) % maxFeatureSize) 60 | } 61 | 62 | object HashedFeatureIndexer { 63 | def apply[Feature]( 64 | maxFeatureSize: Int = (2 << 23), 65 | hasher: (Feature => Int) = {f: Feature => f.hashCode()}) = { 66 | 67 | val biggestPrimeBelow = primes.takeWhile(maxFeatureSize > _).last 68 | new HashedFeatureIndexer[Feature](biggestPrimeBelow, hasher) 69 | } 70 | 71 | private lazy val primes = 2 #:: sieve(3) 72 | 73 | private def sieve(n: Int): Stream[Int] = 74 | if (primes.takeWhile(p => p*p <= n).exists(n % _ == 0)) sieve(n + 2) 75 | else n #:: sieve(n + 2) 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/FeatureUtil.scala: -------------------------------------------------------------------------------- 1 | // package jigg.ml 2 | 3 | // import scala.collection.mutable.{Map => mMap} 4 | // import scala.collection.mutable.AnyRefMap 5 | 6 | // trait FeatureUtil[Feature <: AnyRef] { 7 | // type FeatureIndexer = AnyRefMap[Feature, Int] 8 | 9 | // def getIndex(indexer: FeatureIndexer, key: Feature) = indexer.getOrElseUpdate(key, indexer.size) 10 | 11 | // def removeIndexes(indexer: FeatureIndexer, idxs: Seq[Int]): Unit = { 12 | // val features = indexer.toSeq.sortWith(_._2 < _._2).map(_._1) 13 | // val originalSize = indexer.size 14 | // (0 to idxs.size) foreach { i => 15 | // val idx = if (i == idxs.size) originalSize else idxs(i) 16 | // val lastIdx = if (i == 0) -1 else idxs(i - 1) 17 | // (lastIdx + 1 until idx) foreach { f => indexer(features(f)) -= i } 18 | // if (i != idxs.size) indexer -= features(idx) 19 | // } 20 | // } 21 | // def removeElemsOver(indexer: FeatureIndexer, lastIdx: Int) = indexer.toSeq.foreach { 22 | // case (feature, idx) => 23 | // indexer -= feature 24 | // } 25 | // } 26 | 27 | // // example usage: 28 | // object FeatureUtilExample { 29 | // case class MyFeature(unlabeled: String, label: Int) 30 | // object FU extends FeatureUtil[MyFeature] 31 | 32 | // def run = { 33 | // val indexer = new FU.FeatureIndexer 34 | // FU.getIndex(indexer, MyFeature("hoge", 10)) 35 | // } 36 | // } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LinearClassifier.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | trait Classifier[L] { 20 | 21 | protected val weights: WeightVector[Float] 22 | 23 | def predict(examples: Seq[Example[L]]): (L, Float) 24 | } 25 | 26 | trait LinearClassifier[L] extends Classifier[L] { 27 | 28 | override def predict(examples: Seq[Example[L]]): (L, Float) = 29 | if (examples.isEmpty) (null.asInstanceOf[L], 0F) 30 | else examples.map { e => (e.label, featureScore(e.featVec)) }.maxBy(_._2) 31 | 32 | def featureScore(feature: Array[Int]): Float = { 33 | var a = 0F 34 | var i = 0 35 | while (i < feature.size) { 36 | a += weight(feature(i)) 37 | i += 1 38 | } 39 | a 40 | } 41 | /** Control the behavior of the access to weight. 42 | * You *MUST* use this method to access weight inside the classifier, and *NEVER* call like weights(i) directly (except updating the value) 43 | * This is because in some classifiers, such as AdaGradL1, the values must be preprocessed (e.g., lazy update) before used. 44 | * You can add such a preprocessing by overriding this method in a subclass. 45 | */ 46 | protected def weight(idx: Int): Float = weights(idx) 47 | } 48 | 49 | /** A classifier in which weight vector backbone is implemented by array, hopefully faster than growable counterpart. 50 | */ 51 | class FixedClassifier[L](val array: Array[Float]) extends LinearClassifier[L] { 52 | override val weights = new FixedWeightVector(array) 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LogLinearAdaGradL1.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | abstract class LogLinearAdaGradL1[L](val lambda: Float, val eta: Float) extends OnlineLogLinearTrainer[L] { 20 | 21 | private[this] val lastUpdates = WeightVector.growable[Float]() 22 | private[this] val diagGt = WeightVector.growable[Float]() 23 | 24 | override protected def weight(idx: Int): Float = 25 | if (lastUpdates(idx) == time) weights(idx) 26 | else { 27 | val currentXti = weights(idx) 28 | if (currentXti == 0.0F) 0.0F 29 | else { 30 | val t0 = lastUpdates(idx) 31 | assert(time != 0) 32 | val ht0ii = 1.0 + Math.sqrt(diagGt(idx)) 33 | val newWeight = Math.signum(currentXti) * Math.max( 34 | 0.0, Math.abs(currentXti) - (lambda * eta / ht0ii) * (time - t0)) 35 | weights(idx) = newWeight.toFloat 36 | lastUpdates(idx) = time 37 | newWeight.toFloat 38 | } 39 | } 40 | 41 | override def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit = { 42 | // Here, we negate the gradient. This is because original formulation by Duch et al. 43 | // minimizes the objective, while we maximize the objective. 
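// With g = -derivative, the loop below applies AdaGrad with L1 regularization via soft-thresholding: // G_ii += g*g; x_i <- sign(x_i - eta*g/(1 + sqrt(G_ii))) * max(0, |x_i - eta*g/(1 + sqrt(G_ii))| - lambda*eta/(1 + sqrt(G_ii)))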
44 | val gti = -derivative 45 | val deltaDiagGti = gti * gti // these are shared by all i below, so we cache here 46 | 47 | val feats = e.featVec 48 | var j = 0 49 | while (j < feats.size) { 50 | val i = feats(j) 51 | 52 | //val xti = weight(i) // This automatically perform lazy update of the target weight 53 | val xti = weights(i) // weighs(i) must be lazy-updated at calculating label scores, so we can skip 54 | diagGt(i) += deltaDiagGti 55 | val htii = 1.0 + Math.sqrt(diagGt(i)) 56 | val etaOverHtii = eta / htii 57 | val tempXti = xti - etaOverHtii * gti 58 | 59 | weights(i) = (Math.signum(tempXti) * Math.max(0.0, Math.abs(tempXti) - lambda * etaOverHtii)).toFloat 60 | lastUpdates(i) = time + 1 61 | 62 | j += 1 63 | } 64 | } 65 | override def postProcess: Unit = { 66 | (0 until weights.size).foreach { weight(_) } 67 | } 68 | } 69 | 70 | class FixedLogLinearAdaGradL1[L](val weightArray: Array[Float], lambda: Float, eta: Float) extends LogLinearAdaGradL1(lambda, eta) { 71 | override val weights = new FixedWeightVector(weightArray) 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LogLinearClassifier.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** Augument LinearClassifier with a method to return label probabilities. 20 | * (implies loss function of log loss) 21 | */ 22 | trait LogLinearClassifier[L] extends LinearClassifier[L] { 23 | val weights: WeightVector[Float] 24 | 25 | def labelProbs(examples: Seq[Example[L]]): Array[Float] = { 26 | val unnormalized: Array[Float] = examples.map { e => 27 | val p = Math.exp(featureScore(e.featVec)).toFloat 28 | if (p < 1e-100) 1e-100F else p 29 | }.toArray 30 | val z = unnormalized.sum 31 | unnormalized.map(_ / z) 32 | } 33 | } 34 | 35 | class FixedLogLinerClassifier[L](val weightArray: Array[Float]) extends LogLinearClassifier[L] { 36 | override val weights = new FixedWeightVector(weightArray) 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/LogLinearSGD.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | abstract class LogLinearSGD[L](val a: Float) extends OnlineLogLinearTrainer[L] { 20 | 21 | def stepSize = Math.pow(time + 1, -a).toFloat // avoid the overflow 22 | def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit = { 23 | val dw = stepSize * derivative 24 | val feats = e.featVec 25 | var i = 0 26 | while (i < feats.size) { 27 | weights(feats(i)) += dw 28 | i += 1 29 | } 30 | } 31 | } 32 | 33 | class FixedLogLinearSGD[L](val weightArray: Array[Float], a: Float) extends LogLinearSGD(a) { 34 | 35 | override val weights = new FixedWeightVector(weightArray) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/OnlineLogLinearTrainer.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** This trait exploits the common procedure in trainers of log-linear models. 20 | */ 21 | trait OnlineLogLinearTrainer[L] extends OnlineTrainer[L] with LogLinearClassifier[L] { 22 | var time: Int = 0 23 | 24 | override def update(examples: Seq[Example[L]], gold:L): Unit = { 25 | val dist = labelProbs(examples) 26 | var i = 0 27 | while (i < examples.size) { 28 | val e = examples(i) 29 | val p = dist(i) 30 | val derivative = if (e.label == gold) (1 - p) else -p 31 | updateExampleWeights(e, gold, derivative) 32 | i += 1 33 | } 34 | reguralizeWeights(examples) 35 | time += 1 36 | } 37 | def updateExampleWeights(e: Example[L], gold: L, derivative: Float): Unit 38 | def reguralizeWeights(examples: Seq[Example[L]]): Unit = {} // Some algorithms reguralize weights after temporalily updating the values and this method defines that postprocessing. See LogLinearSGDCumulativeL1 for example. 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/OnlineTrainer.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** A trait which support parameter update, and the interface of Classifier. 20 | * Currently two subclasses exists: OnlineLoglinearTrainer is used for log-linear models, while Perceptron is used to train the perceptron including structured perceptron with beam-search. 
21 | */ 22 | trait OnlineTrainer[L] extends Classifier[L] { 23 | def update(examples: Seq[Example[L]], gold:L): Unit 24 | def postProcess: Unit = Unit 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/Perceptron.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | 21 | trait Perceptron[L] extends LinearClassifier[L] with OnlineTrainer[L] { 22 | 23 | def averageWeights: WeightVector[Float] 24 | 25 | var c = 1.0F 26 | 27 | override def update(examples: Seq[Example[L]], gold: L): Unit = { 28 | val pred = predict(examples)._1 29 | if (pred != gold) { 30 | var i = 0 31 | while (i < examples.size) { 32 | val label = examples(i).label 33 | if (label == pred) updateFeatureWeighs(examples(i).featVec, -1.0F) 34 | else if (label == gold) updateFeatureWeighs(examples(i).featVec, 1.0F) 35 | i += 1 36 | } 37 | } 38 | c += 1.0F 39 | } 40 | def updateFeatureWeighs(featVec: Array[Int], scale: Float): Unit = featVec.foreach { f => 41 | weights(f) += scale 42 | averageWeights(f) += scale * c 43 | } 44 | def update(predFeatVec:Array[Int], goldFeatVec:Array[Int]): Unit = { 45 | updateFeatureWeighs(predFeatVec, -1.0F) 46 | updateFeatureWeighs(goldFeatVec, 1.0F) 47 | c += 1.0F 48 | } 49 | def takeAverage: Unit = (0 until weights.size) foreach { i => 50 | weights(i) -= averageWeights(i) / c 51 | } 52 | } 53 | 54 | class FixedPerceptron[L](val weightArray: Array[Float]) extends Perceptron[L] { 55 | 56 | override val weights = new FixedWeightVector(weightArray) 57 | override val averageWeights = new FixedWeightVector(new Array[Float](weights.size)) 58 | } 59 | 60 | class GrowablePerceptron[L](val weightArray: ArrayBuffer[Float]) extends Perceptron[L] { 61 | 62 | override val weights = new GrowableWeightVector(weightArray) 63 | override val averageWeights = WeightVector.growable[Float](weights.size) 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/WeightVector.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | 21 | @SerialVersionUID(1L) 22 | trait WeightVector[@specialized(Int, Double, Float) A] extends Serializable { 23 | def apply(idx: Int): A 24 | def update(idx: Int, elem: A): Unit 25 | def size: Int 26 | 27 | def seq: IndexedSeq[A] // indexed seq from a backbone data structure 28 | } 29 | 30 | object WeightVector { 31 | def growable[A](initialSize: Int = 0)(implicit numeric: Numeric[A]) = new GrowableWeightVector[A](new ArrayBuffer[A](initialSize))(numeric) 32 | } 33 | 34 | class FixedWeightVector[@specialized(Int, Double, Float) A](val array: Array[A]) extends WeightVector[A] { 35 | def apply(idx: Int) = array(idx) 36 | def update(idx: Int, elem: A) = array(idx) = elem 37 | def size = array.size 38 | 39 | def seq = array 40 | } 41 | 42 | class GrowableWeightVector[@specialized(Int, Double, Float) A](val array: ArrayBuffer[A])(implicit numeric: Numeric[A]) extends WeightVector[A] { 43 | def apply(idx: Int) = if (idx >= size || idx < 0) numeric.zero else array(idx) 44 | def update(idx: Int, elem: A) = { 45 | if (idx >= array.size) array ++= List.fill(idx - array.size + 1)(numeric.zero) 46 | array(idx) = elem 47 | } 48 | def size = array.size 49 | 50 | def seq = array 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Dense.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.{DenseMatrix, DenseVector} 20 | import ucar.nc2.{Variable, Group} 21 | 22 | class Dense(inputDim: Int, outputDim: Int) extends Functor{ 23 | 24 | override def functorName = "Dense" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { 27 | val z = data * w 28 | for (i <- 0 until data.rows){ 29 | z(i, ::) :+= b.t 30 | } 31 | z 32 | } 33 | 34 | private val w = DenseMatrix.zeros[Float](inputDim, outputDim) 35 | private val b = DenseVector.zeros[Float](outputDim) 36 | 37 | def h5load(weight: Variable, bias: Variable): Unit = { 38 | val weightData = weight.read 39 | val weightIndex = weightData.getIndex 40 | val biasData = bias.read 41 | val biasIndex = biasData.getIndex 42 | for(y <- 0 until inputDim) 43 | for(x <- 0 until outputDim){ 44 | w(y, x) = weightData.getFloat(weightIndex.set(y, x)) 45 | if(y == 0) 46 | b(x) = biasData.getFloat(biasIndex.set(x)) 47 | } 48 | } 49 | 50 | override def toString: String = "Dense: {inputDim: " + inputDim + ", outputDim: " + outputDim + "}" 51 | 52 | def head: String = w(0 until 2, ::).toString 53 | } 54 | 55 | object Dense{ 56 | def apply(inputDim:Int, outputDim:Int) = new Dense(inputDim, outputDim) 57 | 58 | def apply(configs: Map[String, Any], weightGroups: Group): Dense = { 59 | val layerName = configs("name").toString 60 | val params = weightGroups.findGroup(layerName) 61 | val weightNames = params.findAttribute("weight_names") 62 | val weight = params.findVariable(weightNames.getStringValue(0)) 63 | val bias = params.findVariable(weightNames.getStringValue(1)) 64 | val dims = weight.getDimensions 65 | if(dims.size != 2){ 66 | throw new IllegalArgumentException("invalid dimension for Dense class") 67 | } 68 | 69 | val d = new Dense(dims.get(0).getLength, dims.get(1).getLength) 70 | d.h5load(weight, bias) 71 | d 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Embedding.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.{DenseMatrix, DenseVector} 20 | import ucar.nc2.{Variable, Group} 21 | 22 | class Embedding(vocabulary: Int, outDim: Int) extends Functor{ 23 | 24 | override def functorName = "Embedding" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { 27 | val arrayOfId = data.reshape(data.size, 1) 28 | val length = arrayOfId.size 29 | val z = DenseMatrix.zeros[Float](length, outDim) 30 | for(i <- 0 until length){ 31 | z(i, ::) := w(arrayOfId(i, 0).asInstanceOf[Int]).t 32 | } 33 | z 34 | } 35 | 36 | private val w = new Array[DenseVector[Float]](vocabulary).map(_ => DenseVector.zeros[Float](outDim)) 37 | 38 | def h5load(weight: Variable):Unit = { 39 | val weightData = weight.read 40 | val weightIndex = weightData.getIndex 41 | for(y <- 0 until vocabulary) 42 | for(x <- 0 until outDim) 43 | w(y)(x) = weightData.getFloat(weightIndex.set(y, x)) 44 | } 45 | 46 | } 47 | 48 | object Embedding{ 49 | def apply(vocabulary: Int, outDim: Int) = new Embedding(vocabulary, outDim) 50 | 51 | def apply(configs: Map[String, Any], weightGroups: Group): Embedding = { 52 | val layerName = configs("name").toString 53 | val params = weightGroups.findGroup(layerName) 54 | val weightNames = params.findAttribute("weight_names") 55 | val weight = params.findVariable(weightNames.getStringValue(0)) 56 | val dims = weight.getDimensions 57 | if(dims.size != 2){ 58 | throw new IllegalArgumentException("Invalid dimension for Embedding class") 59 | } 60 | val e = new Embedding(dims.get(0).getLength, dims.get(1).getLength) 61 | e.h5load(weight) 62 | e 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Empty.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | object Empty extends Functor{ 22 | 23 | override def functorName = "Empty" 24 | 25 | override final def convert(data: DenseMatrix[Float]):DenseMatrix[Float] = data 26 | 27 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Flatten.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | object Flatten extends Functor{ 22 | 23 | override def functorName = "Flatten" 24 | 25 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.t.toDenseVector.toDenseMatrix 26 | 27 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Functor.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | trait Functor { 22 | 23 | def functorName: String 24 | def convert(data: DenseMatrix[Float]): DenseMatrix[Float] 25 | override def toString: String = functorName 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/KerasModel.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | import jigg.util.HDF5Object 21 | import org.json4s.jackson.JsonMethods._ 22 | import org.json4s.{DefaultFormats, _} 23 | 24 | class KerasModel(model: HDF5Object) { 25 | 26 | private val kerasAttribute = model.checkAndGetAttribute("keras_version") 27 | private val modelAttribute = model.checkAndGetAttribute("model_config") 28 | 29 | private val weightGroups = model.checkAndGetGroup("model_weights") 30 | 31 | def parseConfigToSeq(config: String): Seq[Map[String, Any]] = { 32 | val jsonValue = parse(config) 33 | implicit val formats = DefaultFormats 34 | val jsonList = jsonValue.extract[Map[String, Any]] 35 | jsonList("config").asInstanceOf[Seq[Map[String, Any]]] 36 | } 37 | 38 | private val modelValues = parseConfigToSeq(modelAttribute.getValue(0).toString) 39 | 40 | def getConfigs(x: Map[String, Any]): Map[String, Any] = x("config").asInstanceOf[Map[String,Any]] 41 | 42 | def constructNetwork(values: Seq[Map[String, Any]]): Seq[Functor] = values.map{ 43 | x => { 44 | val configs = getConfigs(x) 45 | val functor = x("class_name").toString match { 46 | case "Activation" => 47 | configs("activation").toString match{ 48 | case "relu" => Relu 49 | case "softmax" => Softmax 50 | case "sigmoid" => Sigmoid 51 | case "tanh" => Tanh 52 | } 53 | case "Convolution1D" => 54 | Convolution1D(configs, weightGroups) 55 | case "Dense" => 56 | Dense(configs, weightGroups) 57 | case "Embedding" => 58 | Embedding(configs, weightGroups) 59 | case "Flatten" => Flatten 60 | case _ => Empty 61 | } 62 | functor 63 | } 64 | } 65 | 66 | private val graph:Seq[Functor] = constructNetwork(modelValues) 67 | 68 | def convert(input: DenseMatrix[Float]): DenseMatrix[Float] = callFunctors(input, graph) 69 | 70 | private def callFunctors(input: DenseMatrix[Float], unprocessed:Seq[Functor]): DenseMatrix[Float] = unprocessed match { 71 | case functor :: tail => 72 | val interOutput = functor.convert(input) 73 | callFunctors(interOutput, tail) 74 | case Nil => input 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/KerasParser.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.argmax 20 | import jigg.ml.keras._ 21 | import jigg.util.LookupTable 22 | 23 | import scala.xml.Node 24 | import scala.collection.mutable.{ArrayBuffer, ListBuffer} 25 | 26 | class KerasParser(model: KerasModel, table: LookupTable) { 27 | 28 | /* 29 | * BIO tag 30 | * B : Begin of segment. Value is 0. 31 | * I : Continuation or end of segment. Value is 1. 32 | * O : Others. Value is 2. 33 | */ 34 | private val tagset:Map[Int, String] = Map(0 -> "B", 1 -> "I", 2 -> "O") 35 | 36 | def parsing(str: String): Array[(Int, Int)] = { 37 | // For dummy input to indicate boundaries of sentence. 
38 | val s = "\n" + str + "\n\n" 39 | val inputData = table.encodeCharacter(s) 40 | val outputData = model.convert(inputData) 41 | 42 | val tags = for { 43 | i <- 1 until outputData.rows - 2 44 | maxID = argmax(outputData(i, ::)) 45 | } yield maxID 46 | 47 | getOffsets(tags.toArray) 48 | } 49 | 50 | def parsing(tokens: Node): Array[Array[String]] = { 51 | // For dummy input to indicate boundaries of sentence. 52 | val words = Array("\n").union( 53 | (tokens \\ "tokens").flatMap(x => x \\ "@lemma").toArray.map(x => x.toString)).union(Array("\n\n")) 54 | val ids = (tokens \\ "tokens").flatMap(x => x \\ "@id").toArray.map(x => x.toString) 55 | 56 | val inputData = table.encodeWords(words) 57 | val outputData = model.convert(inputData) 58 | 59 | val tags = for { 60 | i <- 1 until outputData.rows - 2 61 | maxID = argmax(outputData(i, ::)) 62 | } yield maxID 63 | 64 | val ranges = getOffsets(tags.toArray) 65 | 66 | ranges.map(x => ids.slice(x._1, x._2)) 67 | } 68 | 69 | def getOffsets(data: Array[Int]): Array[(Int, Int)]= { 70 | val ranges = ArrayBuffer[(Int, Int)]() 71 | var bpos = -1 72 | 73 | for(i <- data.indices){ 74 | tagset(data(i)) match{ 75 | case "B" => 76 | if(bpos >= 0) 77 | ranges += ((bpos, i)) 78 | bpos = i 79 | case "I" if i == 0 || bpos == -2 => 80 | bpos = i 81 | case "O" => 82 | if (bpos >= 0) 83 | ranges += ((bpos, i)) 84 | bpos = -2 85 | case _ if i == data.indices.last => 86 | ranges += ((bpos, i + 1)) 87 | case _ => 88 | } 89 | } 90 | ranges.toArray 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/README.md: -------------------------------------------------------------------------------- 1 | # KerasParser 2 | 3 | ## Abstract 4 | - Main class: jigg.ml.keras.KerasParser 5 | - KerasParser requires a model file and a lookup-table file. 6 | 7 | ## Requirements 8 | ### Model file 9 | - Model file must be generated by [keras](https://keras.io) 10 | - HDF5 is only supported 11 | - Required output class style: BIO 12 | - Tag `B` corresponds to `0`. 13 | - Tag `I` corresponds to `1`. 14 | - Tag `O` corresponds to `2`. 15 | - The following keras's functions are supported. 16 | - Layer 17 | - Dense 18 | - Embedding 19 | - Convolution1D 20 | - Flatten 21 | - Activation 22 | - Relu 23 | - Sigmoid 24 | - Softmax 25 | - Tanh 26 | 27 | ### Lookup table 28 | - Field construction 29 | - `_lookup` 30 | - `_key2id`: Convert character/word to ID 31 | - key: Target character/word 32 | - value: ID number of target character/word 33 | - `_id2key`: Convert ID to chracter/word 34 | - key: ID number of target chracter/word 35 | - value: Target character/word 36 | - Table shoud contain following elements: 37 | 38 | | ID | Value | 39 | |:---|:------| 40 | |0 | UNKNOWN | 41 | |1 | new line (`\n`) | 42 | |2 | half space (` `) | 43 | 44 | #### Example 45 | ```json 46 | {"_lookup":{ 47 | "_key2id": { 48 | "UNKNOWN": "0", 49 | "\n": "1", 50 | " " : "2", 51 | "Additional elements": "3..." 52 | }, 53 | "_id2key": { 54 | "0": "UNKNOWN", 55 | "1": "\n", 56 | "2": " ", 57 | "3..." 
: "Additional elements" 58 | } 59 | } 60 | } 61 | ``` 62 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Relu.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | 21 | object Relu extends Functor{ 22 | 23 | override def functorName = "Relu" 24 | 25 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map(x => 26 | if(x > 0.0.toFloat) x else 0.0.toFloat 27 | ) 28 | 29 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Sigmoid.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | import breeze.numerics.exp 21 | 22 | object Sigmoid extends Functor { 23 | 24 | override def functorName = "Sigmoid" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map{x => (1.0 / (1.0 + exp(-x))).toFloat} 27 | 28 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Softmax.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import breeze.linalg.{DenseVector, DenseMatrix, softmax} 20 | import breeze.numerics.exp 21 | 22 | object Softmax extends Functor{ 23 | 24 | override def functorName = "Softmax" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { 27 | for(y <- 0 until data.rows){ 28 | val v = data(y, ::) 29 | data(y, ::) := (exp(v) :/= exp(softmax(v))) 30 | } 31 | data 32 | } 33 | 34 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/ml/keras/Tanh.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import breeze.linalg.DenseMatrix 20 | import breeze.numerics.tanh 21 | 22 | object Tanh extends Functor{ 23 | 24 | override def functorName = "Tanh" 25 | 26 | override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = data.map{ x => tanh(x)} 27 | 28 | def apply(x: DenseMatrix[Float]): DenseMatrix[Float] = this.convert(x) 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/CCGBank2EnjuXML.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import lexicon._ 20 | import jigg.util.IOUtil 21 | 22 | import breeze.config.{CommandLineParser, Help} 23 | 24 | import scala.collection.mutable.ArrayBuffer 25 | import scala.sys.process.Process 26 | 27 | import java.io.{File, FileWriter} 28 | 29 | 30 | object CCGBank2EnjuXML { 31 | 32 | case class Opts( 33 | @Help(text="Path to CCGBank file") ccgBank: File = new File(""), 34 | @Help(text="Path to output (xml)") output: File = new File(""), 35 | @Help(text="Number of sentences") numSentences: Int = 50 36 | ) 37 | 38 | def main(args:Array[String]) = { 39 | val opts = CommandLineParser.readIn[Opts](args) 40 | 41 | val dict = new JapaneseDictionary(new Word2CategoryDictionary) 42 | 43 | val conv = new JapaneseParseTreeConverter(dict) 44 | 45 | val reader = new CCGBankReader 46 | 47 | val instances: Seq[(TaggedSentence, Derivation)] = 48 | reader.takeLines(IOUtil.openIterator(opts.ccgBank.getPath), opts.numSentences).toSeq.map { line => 49 | val trees = reader.readParseFragments(line).map { conv.toLabelTree(_) } 50 | (conv.toSentenceFromLabelTrees(trees), conv.toFragmentalDerivation(trees)) 51 | } 52 | 53 | val fw = new FileWriter(opts.output.getPath) 54 | 55 | instances.zipWithIndex foreach { case ((s, d), i) => fw.write(d.renderEnjuXML(s, i) + "\n") } 56 | 57 | fw.flush 58 | fw.close 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/CCGBankToCabochaFormat.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | 20 | import lexicon._ 21 | 22 | import breeze.config.{CommandLineParser, Help} 23 | 24 | import scala.sys.process.Process 25 | 26 | import java.io.{File, FileWriter, ByteArrayInputStream} 27 | 28 | /** Creates Cabocha-formatted CCGBank sentences. 29 | * 30 | * The output of this is required when evaluating bunsetsu-dependency of CCG parser. 31 | * When new CCGBank is released, currently, we have to manually run this class to get the correct data. 
32 | */ 33 | object CCGBankToCabochaFormat { 34 | 35 | case class Opts( 36 | @Help(text="Path to CCGBank file") ccgbank: File = new File(""), 37 | @Help(text="Path to output") output: File = new File(""), 38 | @Help(text="Cabocha command (path to cabocha)") cabocha: String = "cabocha" 39 | ) 40 | 41 | type Tree = ParseTree[NodeLabel] 42 | 43 | def main(args:Array[String]) = { 44 | val opts = CommandLineParser.readIn[Opts](args) 45 | 46 | val dict = new JapaneseDictionary() 47 | val extractors = TreeExtractor( 48 | new JapaneseParseTreeConverter(dict), 49 | new CCGBankReader) 50 | 51 | val trees = extractors.readTrees(opts.ccgbank, -1, true) 52 | val rawString = trees map (extractors.treeConv.toSentenceFromLabelTree) map (_.wordSeq.mkString("")) mkString ("\n") 53 | val is = new java.io.ByteArrayInputStream(rawString.getBytes("UTF-8")) 54 | val out = (Process(s"${opts.cabocha} -f1") #< is).lineStream_! 55 | 56 | val os = jigg.util.IOUtil.openOut(opts.output.getPath) 57 | out foreach { line => 58 | os.write(line + "\n") 59 | } 60 | os.flush 61 | os.close 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/GoldBunsetsuDepInCabocha.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | import jigg.util.IOUtil 21 | 22 | import breeze.config.{CommandLineParser, Help} 23 | 24 | import java.io.{File, FileWriter} 25 | 26 | /** Input: CCGBank file (e.g., train.ccgbank) from stdin. 27 | * Output: Gold bunsetsu dependencies according to the CCGBank in CoNLL format. 
28 | */ 29 | object GoldBunsetsuDepInCoNLL { 30 | 31 | case class Opts( 32 | @Help(text="Path to Cabocha file (same sentences with the CCGBank file)") cabocha: File = new File("") 33 | ) 34 | 35 | def main(args:Array[String]) = { 36 | val opts = CommandLineParser.readIn[Opts](args) 37 | 38 | val dict = new JapaneseDictionary(new Word2CategoryDictionary) 39 | 40 | val conv = new JapaneseParseTreeConverter(dict) 41 | val parseTrees = new CCGBankReader() 42 | .readParseTrees(IOUtil.openStandardIterator, -1, true) 43 | .map(conv.toLabelTree _).toSeq 44 | val goldDerivs = parseTrees.map(conv.toDerivation) 45 | val sentences = parseTrees.map(conv.toSentenceFromLabelTree) 46 | 47 | val bunsetsuSentencesWithPredHead = 48 | new CabochaReader(sentences).readSentences(opts.cabocha.getPath) 49 | 50 | val bunsetsuSentencesWithGoldHead = 51 | bunsetsuSentencesWithPredHead zip goldDerivs map { case (sentence, deriv) => 52 | BunsetsuSentence(sentence.bunsetsuSeq).parseWithCCGDerivation(deriv) 53 | } 54 | for (sentence <- bunsetsuSentencesWithGoldHead) { 55 | println(sentence.renderInCoNLL) 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/Opts.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | 21 | import jigg.ml 22 | 23 | import breeze.config.Help 24 | 25 | import java.io.File 26 | 27 | object Opts { 28 | 29 | @Help(text="About CCGBank") 30 | case class BankInfo( 31 | @Help(text="Language (ja|en)") lang: String = "ja", 32 | @Help(text="Path to CCGBank directory (if this is set, files in this dir are used as default values of train/dev and others)") dir: File = new File(""), 33 | @Help(text="# training instances, -1 for all") trainSize: Int = -1, 34 | @Help(text="# test instances, -1 for all") testSize: Int = -1, 35 | @Help(text="# dev instances, -1 for all") devSize: Int = -1 36 | ) 37 | 38 | @Help(text="About category dictionary") 39 | case class DictParams( 40 | @Help(text="How to look up category candidates? 
(for Japanese only) (surfaceOnly|surfaceAndPoS|surfaceAndSecondFineTag|surfaceAndSecondWithConj)") 41 | lookupMethod: String = "surfaceAndSecondWithConj", 42 | @Help(text="Whether using lexicon files to create word -> category mappings") 43 | useLexiconFiles: Boolean = true, 44 | @Help(text="Minimum number of occurences for registering as lexicalized entry") 45 | unkThreathold: Int = 30 46 | ) { 47 | 48 | val categoryDictinoary = lookupMethod match { 49 | case "surfaceOnly" => new Word2CategoryDictionary 50 | case "surfaceAndPoS" => new WordPoS2CategoryDictionary 51 | case "surfaceAndSecondFineTag" => new WordSecondFineTag2CategoryDictionary 52 | case "surfaceAndSecondWithConj" => new WordSecondWithConj2CategoryDictionary 53 | case _ => sys.error("unknown lookUpMethod") 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/ParserRunner.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | import parser.{ActionLabel, KBestDecoder} 21 | import jigg.ml.FixedPerceptron 22 | 23 | import breeze.config.{CommandLineParser, Help} 24 | 25 | import scala.collection.mutable.{ArraySeq} 26 | 27 | import java.io.File 28 | 29 | 30 | class ParserRunner(model: ParserModel, params: ParserRunner.Params) { 31 | 32 | val tagger = new SuperTaggerRunner(model.taggerModel, params.tagger) 33 | val perceptron = new FixedPerceptron[ActionLabel](model.weights) 34 | val decoder = model.mkDecoder(params.beam, perceptron) 35 | 36 | val preferConnected = params.preferConnected 37 | 38 | def decode[S<:TaggedSentence](sentences: Array[S]): Array[Derivation] = { 39 | 40 | val predDerivations = sentences.zipWithIndex map { 41 | case (sentence, i) => 42 | if (i % 100 == 0) 43 | System.err.print(i + "\t/" + sentences.size + " have been processed.\r") 44 | decodeOne(sentence) 45 | } 46 | System.err.println() 47 | predDerivations 48 | } 49 | 50 | def decodeOne[S<:TaggedSentence](sentence: S): Derivation = 51 | kBestDerivations(sentence, 1)(0)._1 52 | 53 | def kBestDerivations[S<:TaggedSentence](sentence: S, k: Int) 54 | : Seq[(Derivation, Double)] = { 55 | val superTaggedSentence = tagger.assignKBest(sentence) 56 | 57 | decoder match { 58 | case decoder: KBestDecoder => 59 | decoder predictKbest (k, superTaggedSentence, preferConnected) 60 | case decoder => 61 | Seq(decoder predict superTaggedSentence) 62 | } 63 | } 64 | } 65 | 66 | object ParserRunner { 67 | 68 | @Help(text="Params for testing/evaluating parser") 69 | case class Params( 70 | @Help(text="Beam size") beam: Int = 32, 71 | @Help(text="Prefer connected derivation at prediction") preferConnected: Boolean = true, 72 | tagger: SuperTaggerRunner.Params = new SuperTaggerRunner.Params() 73 | ) 74 | } 75 | -------------------------------------------------------------------------------- 
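The ParserRunner above wires a FixedPerceptron and a beam-search decoder behind a small decode/kBestDerivations API. As an editorial aside (not a file in this repository), the hedged sketch below shows how that API might be driven once a trained ParserModel is available; the deserialization step mirrors SuperTaggerModel.loadFrom in the next file and is an assumption, since ParserModel's own loading helper is not shown in this listing.

```scala
// Editor's illustrative sketch -- not part of the repository. It assumes a trained
// ParserModel was serialized with java.io.ObjectOutputStream, analogous to
// SuperTaggerModel.saveTo/loadFrom; adjust the loading step to whatever ParserModel
// actually provides.
import jigg.nlp.ccg.{ParserModel, ParserRunner}
import jigg.nlp.ccg.lexicon.Derivation

object KBestParseSketch {
  def main(args: Array[String]): Unit = {
    val in = jigg.util.IOUtil.openBinIn(args(0))      // path to a saved ParserModel (assumed)
    val model = in.readObject.asInstanceOf[ParserModel]
    in.close()

    val runner = new ParserRunner(model, ParserRunner.Params(beam = 64))
    // Given tagged sentences (e.g., produced by MecabReader in the lexicon package),
    // decode() returns one Derivation per sentence, while kBestDerivations() also
    // exposes the decoder scores:
    // val derivs: Array[Derivation] = runner.decode(sentences)
    // val top10: Seq[(Derivation, Double)] = runner.kBestDerivations(sentences(0), 10)
  }
}
```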
/src/main/scala/jigg/nlp/ccg/SuperTaggerModel.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import tagger.{LF=>Feature, MaxEntMultiTagger, MaxEntMultiTaggerTrainer, FeatureExtractors} 20 | import lexicon._ 21 | import jigg.ml._ 22 | 23 | import scala.collection.mutable.HashMap 24 | 25 | case class SuperTaggerModel( 26 | dict: Dictionary, 27 | featureMap: HashMap[Feature, Int], 28 | weights: WeightVec, 29 | extractors: FeatureExtractors) { self => 30 | 31 | def reduceFeatures(): SuperTaggerModel = { 32 | 33 | val buffer = weights.asInstanceOf[GrowableWeightVector[Float]].array // 0 1.0 2.0 0 0 1.0 ... 34 | val activeIdxs = buffer.zipWithIndex filter (_._1 != 0) map (_._2) // 1 2 5 35 | println(s"# features reduced from ${buffer.size} to ${activeIdxs.size}") 36 | val idxMap = activeIdxs.zipWithIndex.toMap // {1->0, 2->1 5->2} 37 | 38 | val newFeatureMap = featureMap collect { 39 | case (f, oldIdx) if idxMap.isDefinedAt(oldIdx) => (f, idxMap(oldIdx)) 40 | } 41 | val newWeights = new FixedWeightVector[Float](activeIdxs.map(buffer).toArray) 42 | 43 | this copy (featureMap = newFeatureMap, weights = newWeights) 44 | } 45 | 46 | def mkMultiTaggerTrainer(classifierTrainer: OnlineLogLinearTrainer[Int]) = 47 | new MaxEntMultiTaggerTrainer(mkIndexer(), extractors, classifierTrainer, dict) 48 | 49 | def mkMultiTagger() = 50 | new MaxEntMultiTagger(mkIndexer(), extractors, mkClassifier(), dict) 51 | 52 | def mkClassifier() = new LogLinearClassifier[Int] { 53 | override val weights = self.weights 54 | } 55 | 56 | private def mkIndexer() = new ExactFeatureIndexer(featureMap) 57 | } 58 | 59 | object SuperTaggerModel { 60 | 61 | def saveTo(path: String, model: SuperTaggerModel) = { 62 | System.err.println("Saving tagger model to " + path) 63 | val os = jigg.util.IOUtil.openBinOut(path) 64 | os.writeObject(model) 65 | os.close 66 | } 67 | 68 | def loadFrom(path: String): SuperTaggerModel = { 69 | jigg.util.LogUtil.track("Loading supertagger model ...") { 70 | val in = jigg.util.IOUtil.openBinIn(path) 71 | val model = in.readObject.asInstanceOf[SuperTaggerModel] 72 | in.close 73 | model 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/SuperTaggerRunner.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import lexicon._ 20 | 21 | import breeze.config.{CommandLineParser, Help} 22 | 23 | import scala.collection.mutable.{ArraySeq} 24 | 25 | import java.io.File 26 | 27 | 28 | class SuperTaggerRunner(model: SuperTaggerModel, params: SuperTaggerRunner.Params) { 29 | 30 | val tagger = model.mkMultiTagger() 31 | 32 | def assignKBests[S<:TaggedSentence](sentences: Array[S]): ArraySeq[S#AssignedSentence] = 33 | sentences map (assignKBest) 34 | 35 | def assignKBest[S<:TaggedSentence](s: S): S#AssignedSentence = 36 | s assignCands (tagger candSeq(s, params.beta, params.maxK)) 37 | } 38 | 39 | object SuperTaggerRunner { 40 | 41 | @Help(text="Params for testing/evaluating super tagger") 42 | case class Params( 43 | // @Help(text="Load model path") model: SuperTaggerModel: SuperTaggerModel, 44 | @Help(text="Beta for decising the threshold of k-best at prediction") beta: Double = 0.001, 45 | @Help(text="Maximum number of k, -1 for no limit") maxK: Int = -1 46 | ) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/TrainParser.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import breeze.config.CommandLineParser 20 | 21 | object TrainParser { 22 | 23 | import ParserTrainer.Params 24 | 25 | def main(args: Array[String]) = { 26 | 27 | val params = CommandLineParser.readIn[Params](args) 28 | val trainer = mkTrainer(params) 29 | trainer.trainAndSave() 30 | } 31 | 32 | def mkTrainer(params: Params): ParserTrainer = params.bank.lang match { 33 | case "ja" => new JapaneseParserTrainer(params) 34 | case "en" => new EnglishParserTrainer(params) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/TrainSuperTagger.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import breeze.config.CommandLineParser 20 | 21 | object TrainSuperTagger { 22 | 23 | import SuperTaggerTrainer.Params 24 | 25 | def main(args: Array[String]) = { 26 | 27 | val params = CommandLineParser.readIn[Params](args) 28 | val trainer = mkTrainer(params) 29 | trainer.trainAndSave() 30 | } 31 | 32 | def mkTrainer(params: Params): SuperTaggerTrainer = params.bank.lang match { 33 | case "ja" => new JapaneseSuperTaggerTrainer(params) 34 | case "en" => new EnglishSuperTaggerTrainer(params) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/CabochaReader.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.io.Source 20 | 21 | class CabochaReader[S<:TaggedSentence](ccgSentences: Seq[S]) { 22 | def readSentences(path: String): Seq[ParsedBunsetsuSentence] = { 23 | val bunsetsuStart = """\* (\d+) (-?\d+)[A-Z].*""".r 24 | def addBunsetsuTo(curSent: List[(String, Int)], curBunsetsu: List[String]) = curBunsetsu.reverse match { 25 | case Nil => curSent 26 | case headIdx :: tail => (tail.mkString(""), headIdx.toInt) :: curSent 27 | } 28 | 29 | val bunsetsuSegedSentences: List[List[(String, Int)]] = 30 | scala.io.Source.fromFile(path).getLines.filter(_ != "").foldLeft( 31 | (List[List[(String, Int)]](), List[(String, Int)](), List[String]())) { 32 | case ((processed, curSent, curBunsetsu), line) => line match { 33 | case bunsetsuStart(_, nextHeadIdx) => 34 | (processed, addBunsetsuTo(curSent, curBunsetsu), nextHeadIdx :: Nil) // use first elem as the head idx 35 | case "EOS" => (addBunsetsuTo(curSent, curBunsetsu).reverse :: processed, Nil, Nil) 36 | case word => (processed, curSent, word.split("\t")(0) :: curBunsetsu) 37 | } 38 | }._1.reverse 39 | 40 | ccgSentences.zip(bunsetsuSegedSentences).map { case (ccgSentence, bunsetsuSentence) => 41 | val bunsetsuSegCharIdxs: List[Int] = bunsetsuSentence.map { _._1.size }.scanLeft(0)(_+_).tail // 5 10 ... 42 | val ccgWordSegCharIdxs: List[Int] = ccgSentence.wordSeq.toList.map { _.v.size }.scanLeft(0)(_+_).tail // 2 5 7 10 ... 43 | 44 | assert(bunsetsuSegCharIdxs.last == ccgWordSegCharIdxs.last) 45 | val bunsetsuSegWordIdxs: List[Int] = ccgWordSegCharIdxs.zipWithIndex.foldLeft((List[Int](), 0)) { // 1 3 ... 
46 | case ((segWordIdxs, curBunsetsuIdx), (wordIdx, i)) => 47 | if (wordIdx >= bunsetsuSegCharIdxs(curBunsetsuIdx)) (i :: segWordIdxs, curBunsetsuIdx + 1) 48 | else (segWordIdxs, curBunsetsuIdx) // wait until wordIdx exceeds the next bunsetsu segment 49 | }._1.reverse 50 | val bunsetsuSeq = bunsetsuSegWordIdxs.zip(-1 :: bunsetsuSegWordIdxs).map { case (bunsetsuIdx, prevIdx) => 51 | val offset = prevIdx + 1 52 | Bunsetsu(offset, 53 | ccgSentence.wordSeq.slice(offset, bunsetsuIdx + 1), 54 | ccgSentence.posSeq.slice(offset, bunsetsuIdx + 1)) 55 | } 56 | ParsedBunsetsuSentence(bunsetsuSeq, bunsetsuSentence.map { _._2 }) 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Category.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import Slash._ 19 | 20 | sealed trait Category extends Numbered[Unit] { 21 | override def v:Unit = {} 22 | def toStringNoFeature: String 23 | } 24 | 25 | @SerialVersionUID(6748884927580538343L) 26 | case class AtomicCategory(override val id:Int, base:String, feature:CategoryFeature) extends Category { 27 | override def toString = feature.toString match { 28 | case "" => base 29 | case s => base + "[" + s + "]" 30 | } 31 | 32 | override def toStringNoFeature = base 33 | } 34 | @SerialVersionUID(3754315949719248198L) 35 | case class ComplexCategory(override val id:Int, 36 | left:Category, right:Category, 37 | slash:Slash) extends Category { 38 | def toStringChild(child:Category) = child match { 39 | case AtomicCategory(_,_,_) => child.toString 40 | case ComplexCategory(_,_,_,_) => "(" + child.toString + ")" 41 | } 42 | override def toString = toStringChild(left) + slash + toStringChild(right) 43 | 44 | def toStringChildNoFeature(child:Category) = child match { 45 | case AtomicCategory(_,_,_) => child.toStringNoFeature 46 | case ComplexCategory(_,_,_,_) => "(" + child.toStringNoFeature + ")" 47 | } 48 | override def toStringNoFeature = toStringChildNoFeature(left) + slash + toStringChildNoFeature(right) 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/CategoryManager.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.HashMap 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | class CategoryManager extends StringBaseNumberedManager[Category] with OptionReturner[Category] { 23 | override def createWithId(original:Category): Category = original match { 24 | case AtomicCategory(id, base, avm) => AtomicCategory(newId, base, avm) 25 | case ComplexCategory(id, left, right, slash) => 26 | val leftWithId = assignID(left) 27 | val rightWithId = assignID(right) 28 | ComplexCategory(newId, leftWithId, rightWithId, slash) 29 | } 30 | override def getOrNone(str:String): Option[Category] = str2objIndex.get(str) match { 31 | case Some(i) => Some(objects(i)) 32 | case None => canonicalMap.get(createCanonicalInstance(str)) 33 | } 34 | 35 | override def createCanonicalInstance(str:String): Category = JapaneseCategoryParser.parse(str) 36 | 37 | // This is used when candidate shift category is empty 38 | // It sometimes happen if for example, PoS not registered in the dictionary is detected. 39 | val unkCategory = getOrCreate("UNK") 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/CategoryTree.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import Slash._ 19 | 20 | case class CategoryTree(var surface:String, slash:Slash, left:CategoryTree, right:CategoryTree) { 21 | def isLeaf = left == null && right == null 22 | def setSurface:CategoryTree = { 23 | def childSurface(child:CategoryTree) = 24 | if (child.isLeaf) child.surface else '(' + child.surface + ')' 25 | 26 | if (isLeaf) assert(surface != null) 27 | else surface = childSurface(left) + slash + childSurface(right) 28 | this 29 | } 30 | def foreachLeaf(f:CategoryTree=>Any):Unit = { 31 | if (isLeaf) f(this) 32 | else List(left,right).foreach(_.foreachLeaf(f)) 33 | } 34 | } 35 | 36 | object CategoryTree { 37 | def createLeaf(surface:String) = CategoryTree(surface, null, null, null) 38 | def createInternal(slash:Slash, left:CategoryTree , right:CategoryTree) = 39 | CategoryTree(null, slash, left, right) 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Direction.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | object Direction extends Enumeration { 20 | type Direction = Value; val Left, Right = Value 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/MecabReader.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.io.Source 20 | import scala.collection.mutable.ArrayBuffer 21 | 22 | /** Read the output of mecab with -Ochasen option. 23 | */ 24 | class MecabReader(dict:Dictionary) { 25 | def toPoSTaggedSentence(lines:Seq[String]) = { 26 | val terminalSeq = lines.map { line => 27 | val splitted = line.split('\t') 28 | val word = dict.getWordOrCreate(splitted(0)) 29 | val base = dict.getWordOrCreate(splitted(2)) 30 | 31 | val conjStr = if (splitted.size > 6) splitted(5) else "_" 32 | val posStr = splitted(3) + "/" + conjStr 33 | 34 | val pos = dict.getPoSOrCreate(posStr) 35 | (word, base, pos) 36 | } 37 | new PoSTaggedSentence( 38 | terminalSeq.map(_._1), 39 | terminalSeq.map(_._2), 40 | terminalSeq.map(_._3)) 41 | } 42 | def readSentences(in:Source, n:Int): Array[PoSTaggedSentence] = { 43 | val sentences = new ArrayBuffer[PoSTaggedSentence] 44 | 45 | val sentenceLines = new ArrayBuffer[String] 46 | 47 | takeLines(in, n).foreach { _ match { 48 | case "EOS" => 49 | sentences += toPoSTaggedSentence(sentenceLines) 50 | sentenceLines.clear 51 | case line => 52 | sentenceLines += line 53 | }} 54 | sentences.toArray 55 | } 56 | def readSentences(path:String, n:Int): Array[PoSTaggedSentence] = 57 | readSentences(Source.fromFile(path), n) 58 | def takeLines(in:Source, n:Int): Iterator[String] = 59 | for (line <- in.getLines.filter(_!="") match { 60 | case lines if (n == -1) => lines 61 | case lines => lines.take(n) }) yield line 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Numbered.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | trait Numbered[T] { 20 | def id:Int 21 | def v:T 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/PoS.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | /** 20 | * Internal representation of Part-of-Speech. 21 | * The trait gives some methods to access the information, which might be used in some languages. 22 | * For example, hierar is a sequence of FineTag, which is assumed to represent the hierarchy of that PoS. 23 | * To enable using these different types of tags transparently (which is useful in, e.g., feature extraction), a Conjugation or FineTag itself is also a PoS. 24 | * WARNING: all PoSs have to have unique ids to be distinguished, so it is assumed that the surface forms of conj, hierar, and the PoS itself (full surface) are disjoint; if, for example, a FineTag has the same surface as a Conjugation, the dictionary discards the latter one. One solution to this problem is to add a symbol to each type of PoS, e.g., adding the suffix 'F' to all FineTag instances when drawing from or inserting into the dictionary. 
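 *
 * Editor's illustration (hypothetical ids and surfaces, not taken from any bundled dictionary):
 * a Japanese PoS such as "動詞-自立/基本形" could be assembled roughly as
 *
 *   val conj   = Conjugation(10, "基本形")
 *   val coarse = FineTag(11, "動詞")
 *   val fine   = FineTag(12, "動詞-自立")
 *   val pos    = JapanesePoS(13, "動詞-自立/基本形", conj,
 *                            hierar = Seq(coarse, fine),
 *                            hierarConj = Seq(FineWithConjugation(14, "動詞/基本形"),
 *                                             FineWithConjugation(15, "動詞-自立/基本形")))
 *
 * so that pos.first is the coarse tag, pos.second the fine tag, and pos.firstWithConj also
 * carries the conjugation. In practice such instances are created through the PoS manager of a
 * Dictionary, which assigns the ids.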
25 | */ 26 | sealed trait PoS extends Numbered[String] { 27 | def conj:PoS = sys.error("conj is not defined in this PoS class.") 28 | def hierar:Seq[PoS] = sys.error("hierar is not defined in this PoS class.") 29 | def hierarConj:Seq[PoS] = sys.error("hierarConj is not defined in this PoS class.") 30 | def first = hierar(0) 31 | def second = if (hierar.size < 2) first else hierar(1) 32 | def third = if (hierar.size < 3) second else hierar(2) 33 | 34 | def firstWithConj = hierarConj(0) 35 | def secondWithConj = if (hierarConj.size < 2) firstWithConj else hierarConj(1) 36 | def thirdWithConj = if (hierarConj.size < 3) secondWithConj else hierarConj(2) 37 | } 38 | trait OptionalPoS extends PoS 39 | trait MainPoS extends PoS 40 | 41 | case class Conjugation(override val id:Int, override val v:String) extends OptionalPoS { 42 | override def toString = v 43 | } 44 | case class FineTag(override val id:Int, override val v:String) extends OptionalPoS { 45 | override def toString = v 46 | } 47 | case class FineWithConjugation(override val id:Int, override val v:String) extends OptionalPoS { 48 | override def toString = v 49 | } 50 | case class SimplePoS(override val id:Int, override val v:String) extends MainPoS { 51 | override def toString = v 52 | } 53 | case class JapanesePoS(override val id:Int, 54 | override val v:String, 55 | override val conj:PoS, 56 | override val hierar:Seq[PoS], 57 | override val hierarConj:Seq[PoS]) extends MainPoS { 58 | override def toString = v 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/SimpleDictionary.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | class SimpleDictionary extends Dictionary(new Word2CategoryDictionary) { 20 | override val posManager = new PoSManager { 21 | def createWithId(original: PoS) = SimplePoS(newId, original.v) 22 | def createCanonicalInstance(str:String) = SimplePoS(0, str) 23 | } 24 | override val categoryManager = new CategoryManager { 25 | override def createCanonicalInstance(str: String): Category = EnglishCategoryParser.parse(str) 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Slash.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | object Slash extends Enumeration { 20 | type Slash = Value 21 | val Left = Value("\\") 22 | val Right = Value("/") 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/lexicon/Word.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | trait Word extends Numbered[String] { 20 | // additional information is defined in function; may or may not be overridden in val by subclasses 21 | def classId:Int = throw new RuntimeException("classId is not defined in this Word class.") 22 | def assignClass(classId:Int):Word = this // default do nothing 23 | // some morphological information extracted from the surface form might be included ? (e.g., for morphological rich languages) 24 | } 25 | 26 | case class SimpleWord(override val id:Int, override val v:String) extends Word { 27 | override def assignClass(classId:Int) = ClassedWord(id, v, classId) 28 | override def toString = v 29 | } 30 | case class ClassedWord(override val id:Int, 31 | override val v:String, 32 | override val classId:Int) extends Word { 33 | override def assignClass(classId:Int) = ClassedWord(id, v, classId) 34 | override def toString = v + "[" + classId + "]" 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/package.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package object ccg { 20 | type WeightVec = jigg.ml.WeightVector[Float] 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/Action.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import jigg.nlp.ccg.lexicon.{Category, Dictionary} 20 | import jigg.nlp.ccg.lexicon.Direction.Direction 21 | 22 | /** 23 | * action and corresponding label; for speed reason, label should not have the actual object such as category, so we convert Action object into corresponding Label object when filling feature templates 24 | */ 25 | sealed trait Action { def toLabel:ActionLabel } 26 | sealed trait ActionLabel { 27 | def mkString(dict:Dictionary):String 28 | } 29 | 30 | // shift the category with categoryId of the head of buffer 31 | case class Shift(category:Category) extends Action { override def toLabel = ShiftLabel(category.id) } 32 | 33 | @SerialVersionUID(-6619103978469031483L) 34 | case class ShiftLabel(id:Int) extends ActionLabel { 35 | override def mkString(dict:Dictionary) = "SHIFT(" + dict.getCategory(id) + ")" 36 | } 37 | 38 | // combine two top nodes on the stack into a node which has categoryId 39 | case class Combine(category:Category, headDir:Direction, ruleType:String) extends Action { override def toLabel = CombineLabel(category.id) } 40 | 41 | @SerialVersionUID(-1350486416817206332L) 42 | case class CombineLabel(id:Int) extends ActionLabel { 43 | override def mkString(dict:Dictionary) = "COMBINE(" + dict.getCategory(id) + ")" 44 | } 45 | 46 | // unary change to a node with categoryId 47 | case class Unary(category:Category, ruleType:String) extends Action { override def toLabel = UnaryLabel(category.id) } 48 | 49 | @SerialVersionUID(-3492899016953622825L) 50 | case class UnaryLabel(id:Int) extends ActionLabel { 51 | def mkString(dict:Dictionary) = "UNARY(" + dict.getCategory(id) + ")" 52 | } 53 | 54 | case class Finish() extends Action { override def toLabel = FinishLabel() } 55 | 56 | @SerialVersionUID(-6536578690403443069L) 57 | case class FinishLabel() extends ActionLabel { 58 | def mkString(dict:Dictionary) = "FINISH" 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/HeadFinder.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.HashMap 20 | import jigg.nlp.ccg.lexicon.{PoS, JapanesePoS, Category} 21 | import jigg.nlp.ccg.lexicon.Direction._ 22 | 23 | trait HeadFinder extends Serializable { 24 | type NodeInfo = HeadFinder.NodeInfo 25 | def get(left:NodeInfo, right:NodeInfo): Direction 26 | } 27 | object HeadFinder { 28 | case class NodeInfo(pos:PoS, category:Category, headCategory:Category) 29 | } 30 | 31 | case class EnglishHeadFinder(children2dir: Map[(Int, Int), Direction]) extends HeadFinder { 32 | def get(left:NodeInfo, right:NodeInfo) = 33 | children2dir.get(left.category.id, right.category.id) match { 34 | case Some(dir) => dir 35 | case _ => Left 36 | } 37 | } 38 | 39 | object EnglishHeadFinder { 40 | import jigg.nlp.ccg.lexicon.{ParseTree, NodeLabel, BinaryTree, NonterminalLabel} 41 | def createFromParseTrees(trees: Seq[ParseTree[NodeLabel]]): EnglishHeadFinder = { 42 | val map = new HashMap[(Int, Int), Direction] 43 | trees.foreach { _.foreachTree { _ match { 44 | case BinaryTree(left, right, NonterminalLabel(dir, _, _)) => 45 | map += (left.label.category.id, right.label.category.id) -> dir 46 | case _ => 47 | }}} 48 | EnglishHeadFinder(map.toMap) 49 | } 50 | } 51 | 52 | object JapaneseHeadFinder extends HeadFinder { 53 | val Symbol = "記号" 54 | def get(left:NodeInfo, right:NodeInfo) = { 55 | val leftPos = left.pos.first.v 56 | val rightPos = right.pos.first.v 57 | if (rightPos == Symbol) Left else Right 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/KBestDecoder.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import jigg.nlp.ccg.lexicon.{Derivation, CandAssignedSentence} 20 | 21 | case class WrappedAction(v: Action, isGold:Boolean, partialFeatures:LabeledFeatures = LabeledFeatures()) 22 | 23 | case class StatePath(state:State, waction: WrappedAction, prev: Option[StatePath] = None, score:Double = 0) { 24 | def actionPath = expand.map(_.waction) 25 | def expand = expandRecur(Nil) 26 | private def expandRecur(seq: List[StatePath]): List[StatePath] = prev match { 27 | case None => seq // always ignoring the initial state 28 | case Some(prev) => prev.expandRecur(this :: seq) 29 | } 30 | def lighten = this.copy(waction = waction.copy(partialFeatures = LabeledFeatures())) 31 | } 32 | 33 | trait KBestDecoder { 34 | 35 | trait ACandidate { 36 | def path: StatePath 37 | def score: Double 38 | def isConnected: Boolean = path.state.s1 == None 39 | } 40 | 41 | val comparePreferringConnected: (ACandidate, ACandidate) => Boolean = { 42 | case (a, b) if a.isConnected && !b.isConnected => true 43 | case (a, b) if !a.isConnected && b.isConnected => false 44 | case (a, b) => a.score > b.score 45 | } 46 | 47 | def search(sentence: CandAssignedSentence): Seq[ACandidate] 48 | 49 | def predict(sentence: CandAssignedSentence): (Derivation, Double) = { 50 | val c = search(sentence).sortWith(_.score > _.score)(0) 51 | (c.path.state.toDerivation, c.score) 52 | } 53 | 54 | /** If a fully connected tree is found, return the one with the maximum score; otherwise return the maximum-score unconnected tree. 55 | */ 56 | def predictConnected(sentence: CandAssignedSentence): (Derivation, Double) = { 57 | val c = search(sentence).sortWith(comparePreferringConnected)(0) 58 | (c.path.state.toDerivation, c.score) 59 | } 60 | 61 | /** Return k-best trees according to the final state score. 62 | * 63 | * @param preferConnected if true, fully connected trees are placed at the top of the results even if they are not the maximum-score trees. 64 | */ 65 | def predictKbest(k: Int, sentence: CandAssignedSentence, preferConnected: Boolean = false): Seq[(Derivation, Double)] = { 66 | val sorted = preferConnected match { 67 | case true => search(sentence).sortWith(comparePreferringConnected) 68 | case false => search(sentence).sortWith(_.score > _.score) 69 | } 70 | sorted.take(k) map { c => (c.path.state.toDerivation, c.score) } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/Rule.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import jigg.nlp.ccg.lexicon.{Category, Derivation, Point, UnaryChildPoint, BinaryChildrenPoints, AppliedRule} 20 | 21 | import scala.collection.mutable.{HashMap, HashSet} 22 | import java.io.{ObjectOutputStream, ObjectInputStream} 23 | 24 | trait Rule { 25 | def unify(left:Category, right:Category): Option[Array[(Category, String)]] 26 | def raise(child:Category): Option[Array[(Category, String)]] 27 | def headFinder:HeadFinder 28 | } 29 | 30 | // rules are restricted to CFG rules extracted from the training CCGBank 31 | case class CFGRule(val binaryRules:Map[(Int,Int), Array[(Category, String)]], // category ids -> (category, ruleType) 32 | val unaryRules:Map[Int, Array[(Category, String)]], 33 | override val headFinder:HeadFinder) extends Rule { 34 | def unify(left:Category, right:Category):Option[Array[(Category, String)]] = binaryRules.get((left.id, right.id)) 35 | def raise(child:Category):Option[Array[(Category, String)]] = unaryRules.get(child.id) 36 | } 37 | 38 | object CFGRule { 39 | def extractRulesFromDerivations(derivations: Array[Derivation], headFinder:HeadFinder): CFGRule = { 40 | val binaryRules = new HashMap[(Int, Int), HashSet[(Category, String)]] 41 | val unaryRules = new HashMap[Int, HashSet[(Category, String)]] 42 | 43 | derivations.foreach { deriv => 44 | deriv.foreachPoint({ point:Point => deriv.get(point) match { 45 | case Some(AppliedRule(UnaryChildPoint(child), ruleType)) => 46 | val parents = unaryRules.getOrElseUpdate(child.category.id, new HashSet[(Category, String)]) 47 | parents += ((point.category, ruleType)) 48 | case Some(AppliedRule(BinaryChildrenPoints(left, right), ruleType)) => 49 | val parents = binaryRules.getOrElseUpdate((left.category.id, right.category.id), new HashSet[(Category, String)]) 50 | parents += ((point.category, ruleType)) 51 | case _ => 52 | }}) 53 | } 54 | new CFGRule(binaryRules.map { case (k, v) => k -> v.toArray }.toMap, 55 | unaryRules.map { case (k, v) => k -> v.toArray }.toMap, 56 | headFinder) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/parser/package.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | package object parser { 20 | type UF = ShiftReduceUnlabeledFeature 21 | type LF = ShiftReduceFeature 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/tagger/UserDefinedFeatureExtractors.scala: -------------------------------------------------------------------------------- 1 | // package jigg.nlp.ccg.tagger 2 | 3 | // import jigg.nlp.ccg.lexicon.{Dictionary, JapaneseDictionary} 4 | 5 | // import scala.collection.mutable.ArrayBuffer 6 | 7 | // // this is the example to define new features and the extractor that extracts that features 8 | 9 | // object NewTemplate extends Enumeration { 10 | // type NewTemplate = Value 11 | // val w_p = Value 12 | // } 13 | 14 | // case class UnigramWordPoSFeature[T](word:Int, pos:Int, tmpl:T) extends FeatureOnDictionary { 15 | // override def mkString(dict:Dictionary) = concat(tmpl, dict.getWord(word)) 16 | // } 17 | 18 | // class UnigramSecondLevelFineExtractor(val windowSize:Int) extends FeatureExtractor { 19 | // def addFeatures(c:Context, features:ArrayBuffer[UF]) = { 20 | // features += UnigramWordPoSFeature(c.word(0), c.pos(0), NewTemplate.w_p) 21 | // } 22 | // } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/nlp/ccg/tagger/package.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | package object tagger { 20 | type UF = SuperTaggingUnlabeledFeature 21 | type LF = SuperTaggingFeature 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/Annotation.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | 20 | /** Currently, this trait is useful to assign unique id 21 | * for each annotation. 
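 *
 * Editor's note: a minimal usage sketch (ids shown assume fresh counters as produced by
 * jigg.util.IDGenerator; they are illustrative, not captured output):
 *
 *   Annotation.Sentence.nextId   // "s0"
 *   Annotation.Sentence.nextId   // "s1"
 *   Annotation.Token.nextId      // "t0"
 *
 * Each singleton object below keeps its own counter, so ids are unique within each
 * annotation type ("d0", "d1", ... for documents, "s0", "s1", ... for sentences, and so on).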
22 | */ 23 | abstract class Annotation(val idPrefix: String) { 24 | val idGen = jigg.util.IDGenerator(idPrefix) 25 | def nextId: String = idGen.next 26 | } 27 | 28 | object Annotation { 29 | 30 | object Document extends Annotation("d") 31 | 32 | object Sentence extends Annotation("s") 33 | 34 | object Token extends Annotation("t") 35 | 36 | object Dependency extends Annotation("dep") 37 | 38 | object CCG extends Annotation("ccg") 39 | 40 | object NE extends Annotation("ne") 41 | 42 | object Mention extends Annotation("me") 43 | 44 | object Coreference extends Annotation("cr") 45 | 46 | object PredArg extends Annotation("pa") 47 | 48 | object ParseSpan extends Annotation("sp") 49 | object CCGSpan extends Annotation("ccgsp") 50 | 51 | object Chunk extends Annotation("ch") 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/AnnotationError.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | class AnnotationError(msg: String) extends RuntimeException(msg) 20 | 21 | class ProcessError(msg: String) extends AnnotationError(msg) 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/ArgumentError.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | class ArgumentError(msg: String) extends RuntimeException(msg) 20 | 21 | class RequirementError(msg: String) extends RuntimeException(msg) 22 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/DocumentAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.xml.{Elem, Node} 20 | import jigg.util.XMLUtil.RichNode 21 | 22 | /** A trait for an annotator which modifies a document node. Use this trait if an annotator 23 | * is a document-level annotator. 24 | */ 25 | trait DocumentAnnotator extends Annotator { 26 | override def annotate(annotation: Node): Node = { 27 | 28 | annotation.replaceAll("root") { case e: Elem => 29 | val newChild = Annotator.makePar(e.child, nThreads).map { c => 30 | c match { 31 | case c if c.label == "document" => 32 | try newDocumentAnnotation(c) catch { 33 | case e: AnnotationError => 34 | System.err.println(s"Failed to annotate a document by $name.") 35 | Annotator.annotateError(c, name, e) 36 | } 37 | case c => c 38 | } 39 | }.seq 40 | e.copy(child = newChild) 41 | } 42 | } 43 | 44 | def newDocumentAnnotation(sentence: Node): Node 45 | } 46 | 47 | trait SeqDocumentAnnotator extends DocumentAnnotator { 48 | override def nThreads = 1 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/RegexDocumentAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Takafumi Sakakibara and Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | 22 | class RegexDocumentAnnotator(override val name: String, override val props: Properties) extends Annotator { 23 | 24 | @Prop(gloss = "Regular expression to segment documents") var pattern = """\n{2,}""" 25 | readProps() 26 | 27 | private[this] val documentIDGen = jigg.util.IDGenerator("d") 28 | override def annotate(annotation: Node): Node = { 29 | val raw = annotation.text 30 | 31 | var offset = 0 32 | 33 | val documents = raw.split(pattern).map { str => 34 | val n = { str } 39 | offset += str.size 40 | n 41 | } 42 | 43 | { documents } 44 | } 45 | 46 | override def requires = Set() 47 | override def requirementsSatisfied = Set(Requirement.Dsplit) 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SentencesAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.xml.{Elem, Node} 20 | import jigg.util.XMLUtil.RichNode 21 | 22 | /** A trait for an annotator which modifies a sentence node. 23 | * 24 | * If an annotator is a sentence-level annotator such as a parser or POS tagger, it should 25 | * extend this trait, and usually all you need to do is implement the 26 | * newSentenceAnnotation method, which rewrites a sentence node and returns a new one. 27 | * 28 | * This annotates the given sentences in parallel. If you want to avoid this, perhaps 29 | * because the annotator is not thread-safe, use [[jigg.pipeline.SeqSentencesAnnotator]] 30 | * instead, which annotates sequentially. 31 | */ 32 | trait SentencesAnnotator extends Annotator { 33 | def annotate(annotation: Node): Node = { 34 | 35 | annotation.replaceAll("sentences") { case e: Elem => 36 | val annotatedChild = Annotator.makePar(e.child, nThreads).map { 37 | case s if s.label == "sentence" => 38 | try newSentenceAnnotation(s) catch { 39 | case e: AnnotationError => 40 | System.err.println(s"Failed to annotate a sentence by $name.") 41 | Annotator.annotateError(s, name, e) 42 | } 43 | case s => s 44 | }.seq 45 | e.copy(child = annotatedChild) 46 | } 47 | } 48 | 49 | def newSentenceAnnotation(sentence: Node): Node 50 | } 51 | 52 | /** This trait annotates the inputs sequentially. 53 | */ 54 | trait SeqSentencesAnnotator extends SentencesAnnotator { 55 | override def nThreads = 1 56 | } 57 | 58 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SimpleKNPAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Takafumi Sakakibara and Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml._ 21 | 22 | class SimpleKNPAnnotator(override val name: String, override val props: Properties) 23 | extends KNPAnnotator with AnnotatingSentencesInParallel { self=> 24 | 25 | @Prop(gloss = "Use this command to launch KNP (-tab is automatically added. -anaphora is not compatible with this annotator. In that case, use knpDoc instead). 
Version >= 4.12 is assumed.") var command = "knp" 26 | readProps() 27 | 28 | localAnnotators // instantiate lazy val here 29 | 30 | def mkLocalAnnotator = new SimpleKNPLocalAnnotator 31 | 32 | class SimpleKNPLocalAnnotator 33 | extends SentencesAnnotator with LocalAnnotator with BaseKNPLocalAnnotator { 34 | override def defaultArgs = Seq("-tab") 35 | 36 | val knp = mkIO() 37 | 38 | override def newSentenceAnnotation(sentence: Node): Node = { 39 | val sentenceId = (sentence \ "@id").toString 40 | 41 | val knpResult = runKNP(sentence, None) 42 | annotateSentenceNode(sentence, knpResult, sentenceId, _ => sentenceId) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SpaceTokenizerAnnotator.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import scala.xml.{Node, Elem, Text, Atom} 22 | import jigg.util.XMLUtil.RichNode 23 | 24 | /** This simple annotator just segments a sentence by spaces, i.e., 25 | * assuming the input sentence is already correctly tokenized. 26 | */ 27 | class SpaceTokenizerAnnotator(override val name: String, override val props: Properties) 28 | extends SentencesAnnotator { 29 | 30 | override def newSentenceAnnotation(sentence: Node): Node = { 31 | 32 | val sindex = sentence \@ "id" 33 | val text = sentence.text 34 | val range = (0 until text.size) 35 | 36 | def isSpace(c: Char) = c == ' ' || c == '\t' 37 | 38 | val begins = 0 +: (1 until text.size).filter { i => isSpace(text(i-1)) && !isSpace(text(i)) } 39 | 40 | val ends = begins map { 41 | range indexWhere (i=>isSpace(text(i)), _) match { 42 | case -1 => text.size 43 | case e => e 44 | } 45 | } 46 | 47 | val tokenSeq = begins.zip(ends).zipWithIndex map { case ((b, e), i) => 48 | 53 | } 54 | val tokens = { tokenSeq } 55 | sentence addChild tokens 56 | } 57 | 58 | override def requires = Set(Requirement.Ssplit) 59 | override def requirementsSatisfied = Set(Requirement.Tokenize) 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/SystemDict.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | sealed trait SystemDic 20 | 21 | object SystemDic { 22 | case object ipadic extends SystemDic 23 | case object jumandic extends SystemDic 24 | case object unidic extends SystemDic 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/jigg/pipeline/UnmanagedAnnotators.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /** A singleton managing the collection of `UnmanagedAnnotator`. 4 | * 5 | * See the documentation of `UnmanagedAnnotator` for its role. `list` is the essential object, 6 | * which preserves the mapping from annotator names to `UnmanagedAnnotator`s. If you 7 | * want to support a new annotator that depends on an unmanaged library, add it to the 8 | * `list`. 9 | */ 10 | object UnmanagedAnnotators { 11 | 12 | /** Information about an annotator that wraps software which runs on the JVM but is not 13 | * included as a managed library via Maven. 14 | * 15 | * When assembling, such external unmanaged jars are not included, so a user has to 16 | * explicitly add them to the class path. Each UnmanagedAnnotator object helps to 17 | * describe how to use it. For example, its default message, implemented in 18 | * `DefaultUnmanagedAnnotator`, tells the URL of the library jar file. 19 | */ 20 | trait UnmanagedAnnotator[A] { 21 | def name: String 22 | def clazz: Class[A] 23 | 24 | def msg: String 25 | } 26 | 27 | case class DefaultUnmanagedAnnotator[A]( 28 | val name: String, val clazz: Class[A], url: String) extends UnmanagedAnnotator[A] { 29 | 30 | def msg = s"""Failed to launch $name. Maybe the necessary jar file is not included in 31 | the current class path. This might be solved by adding jar/* into your class path, 32 | e.g., call jigg like: 33 | 34 | > java -cp "jigg-xxx.jar:jar/*" jigg.pipeline.Pipeline ... 35 | 36 | If the error still remains, the necessary jar file is missing. You can download it 37 | from ${url}. Try e.g., 38 | 39 | > wget -P jar/ $url 40 | 41 | and then run the above command. 42 | """ 43 | } 44 | 45 | val list = Map( 46 | "easyccg" -> DefaultUnmanagedAnnotator( 47 | "easyccg", 48 | classOf[EasyCCGAnnotator], 49 | "https://github.com/mikelewis0/easyccg/raw/master/easyccg.jar")) 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/ArgumentsParser.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | 21 | object ArgumentsParser { 22 | def parse(args: List[String]): Properties = parseRecur(new Properties, args) 23 | 24 | private def parseRecur(props: Properties, args: List[String]): Properties = args match { 25 | case ArgKey(key) :: next => next match { 26 | case ArgKey(nextKey) :: tail => // -key1 -key2 ... => key1 is boolean value 27 | putTrue(props, key) 28 | parseRecur(props, next) 29 | case value :: tail => 30 | key match { 31 | case "props" => props.load(jigg.util.IOUtil.openIn(value)) 32 | case _ => props.put(key, value) 33 | } 34 | parseRecur(props, tail) 35 | case Nil => 36 | putTrue(props, key) 37 | parseRecur(props, next) 38 | } 39 | case _ => props 40 | } 41 | def putTrue(props: Properties, key: String) = props.put(key, "true") 42 | 43 | object ArgKey { 44 | def unapply(key: String): Option[String] = key match { 45 | case x if x.size > 1 && x(0) == '-' && x.drop(1).forall(x=>x.isDigit || x=='.') => None // -10.0, -1, etc are not key 46 | case x if x.size > 1 && x(0) == '-' && x(1) == '-' => Some(x.substring(2)) 47 | case x if x.size > 1 && x(0) == '-' => Some(x.substring(1)) // we don't catch if x.size == 1, ('-' is recognized as some value) 48 | case _ => None 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/HDF5Object.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import ucar.nc2.{Attribute, Group, NetcdfFile} 20 | 21 | class HDF5Object(rootGroup: Group) { 22 | 23 | def checkAndGetAttribute(name: String): Attribute = Option(rootGroup.findAttribute(name)) match { 24 | case Some(x) => x 25 | case None => throw new IllegalArgumentException("cannot get " + name + " attribute from input model file") 26 | } 27 | 28 | def checkAndGetGroup(name: String): Group = Option(rootGroup.findGroup(name)) match { 29 | case Some(x) => x 30 | case None => throw new IllegalArgumentException("cannot get " + name + " group from input model file") 31 | } 32 | 33 | } 34 | 35 | object HDF5Object { 36 | 37 | // Load from a path on the file system 38 | def fromFile(path: String): HDF5Object = { 39 | val file = NetcdfFile.open(path, null) 40 | mkObj(file) 41 | } 42 | 43 | // Load from class loader 44 | def fromResource(path: String): HDF5Object = { 45 | val file = 46 | NetcdfFile.openInMemory(IOUtil.findResource(path).toURI) 47 | mkObj(file) 48 | } 49 | 50 | private def mkObj(file: NetcdfFile) = { 51 | val group = file.getRootGroup 52 | new HDF5Object(group) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/IDGenerator.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | // trait IDGeneratorBase { 20 | // def next(): String 21 | // } 22 | 23 | // case class IDGenerator(prefix: String) extends IDGeneratorBase { 24 | // private[this] val stream = Stream.from(0).iterator 25 | // def next() = prefix + stream.next 26 | // } 27 | 28 | case class IDGenerator(toId: Int=>String) { 29 | private[this] var stream = Stream.from(0).iterator 30 | def next() = toId(stream.next) 31 | def reset() = stream = Stream.from(0).iterator 32 | } 33 | 34 | object IDGenerator { 35 | def apply(prefix: String): IDGenerator = IDGenerator(prefix + _) 36 | } 37 | 38 | /** Not thread-safe but little overhead 39 | */ 40 | case class LocalIDGenerator(toId: Int=>String) { 41 | var i = 0 42 | def next() = { 43 | val n = toId(i) 44 | i += 1 45 | n 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/LogUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | object LogUtil { 20 | /** A helper to measure time. 21 | * If multiple commands are nested, use multipleTrack. 22 | * 23 | * TODO: Integrate track and multipleTrack to automatically choose indent and appropriate format. 24 | * Currently track[A](beginMessage: String, ...) "manually" handles the indent level. 25 | */ 26 | def track[A](message: String)(body: => A): A = { 27 | // System.out.print(message) 28 | // val (result, time) = recordTime { body } 29 | // System.out.println("done [%.1f sec]".format(time)) 30 | // result 31 | track(message, "done", 0) { body } 32 | } 33 | 34 | def multipleTrack[A](message: String)(body: => A): A = { 35 | // System.out.println("{ " + message) 36 | // val (result, time) = recordTime { body } 37 | // System.out.println("} [%.1f sec]".format(time)) 38 | // result 39 | track(message + " {\n", "}", 0) { body } 40 | } 41 | 42 | def track[A](beginMessage: String, endMessage: String, indent: Int)(body: => A): A = { 43 | def print(raw: String) = { 44 | (0 until indent) foreach { _ => System.out.print(" ") } 45 | System.out.print(raw) 46 | } 47 | print(beginMessage) 48 | val (result, time) = recordTime { body } 49 | System.out.println(endMessage + " [%.1f sec]".format(time)) 50 | result 51 | } 52 | 53 | def recordTime[A](body: => A): (A, Double) = { 54 | val before = System.currentTimeMillis 55 | val result = body 56 | val time = (System.currentTimeMillis - before).toDouble / 1000.0 57 | (result, time) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/LookupTable.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io.Reader 20 | 21 | import breeze.linalg.DenseMatrix 22 | import org.json4s.{DefaultFormats, _} 23 | import org.json4s.jackson.JsonMethods 24 | import org.json4s.JsonAST.JValue 25 | 26 | class LookupTable(rawTable: JValue) { 27 | 28 | implicit private val formats = DefaultFormats 29 | private val tables = rawTable.extract[Map[String, Map[String, Map[String, String]]]] 30 | 31 | private val key2id = tables("_lookup")("_key2id") 32 | private val id2key = tables("_lookup")("_id2key") 33 | 34 | // For raw text 35 | def encodeCharacter(str: String): DenseMatrix[Float] = { 36 | val strArray = str.map{x => 37 | // Note: For skipping unknown character, this encoder returns dummy id. 38 | key2id.getOrElse(x.toString, "3").toFloat 39 | }.toArray 40 | new DenseMatrix[Float](1, str.length, strArray) 41 | } 42 | 43 | // For list of words 44 | def encodeWords(words: Array[String]): DenseMatrix[Float] = { 45 | val wordsArray = words.map{x => 46 | // Note: For skipping unknown words, this encoder returns dummy id. 
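      // Editor's note: "3" below is the dummy id returned for out-of-vocabulary entries, so an
      // unknown word is encoded as 3.0f instead of raising an error. For instance (hypothetical
      // table contents), encodeWords(Array("猫", "qwerty")) would yield a 1 x 2 matrix whose
      // second cell is 3.0f when "qwerty" is not in the loaded table.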
47 | key2id.getOrElse(x.toString, "3").toFloat 48 | } 49 | new DenseMatrix[Float](1, words.length, wordsArray) 50 | } 51 | 52 | def decode(data: DenseMatrix[Float]): Array[String] = 53 | data.map{x => id2key.getOrElse(x.toInt.toString, "NONE")}.toArray 54 | 55 | def getId(key: String): Int = key2id.getOrElse(key, "0").toInt 56 | def getId(key: Char): Int = getId(key.toString) 57 | 58 | def getKey(id: Int): String = id2key.getOrElse(id.toString, "UNKNOWN") 59 | } 60 | 61 | 62 | object LookupTable { 63 | 64 | // Load from a path on the file system 65 | def fromFile(path: String) = mkTable(IOUtil.openIn(path)) 66 | 67 | // Load from class loader 68 | def fromResource(path: String) = mkTable(IOUtil.openResourceAsReader(path)) 69 | 70 | private def mkTable(input: Reader) = { 71 | val j = try { JsonMethods.parse(input) } finally { input.close } 72 | new LookupTable(j) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/Normalizer.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import com.ibm.icu.text.Transliterator 20 | 21 | object Normalizer { 22 | 23 | /** Replace all halfwidth ASCII characters (< 0x7F) with their fullwidth counterparts. 24 | * 25 | * Useful for preprocessing in some Japanese software such as JUMAN and KNP. 26 | * 27 | * NOTE: We do not touch hankaku kana characters since they make alignment to the 28 | * original text more involved. 29 | */ 30 | def hanZenAscii(text: String) = text map { 31 | case c if c <= 0x7F => hanzenTrans.transliterate(c + "")(0) 32 | case c => c 33 | } 34 | private val hanzenTrans = Transliterator.getInstance("Halfwidth-Fullwidth") 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/Prop.java: -------------------------------------------------------------------------------- 1 | package jigg.util; 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.lang.annotation.*; 20 | 21 | @Retention(RetentionPolicy.RUNTIME) 22 | public @interface Prop { 23 | // String name() default ""; 24 | String gloss() default ""; 25 | boolean required() default false; 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/PropertiesUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.collection.JavaConversions._ 21 | 22 | object PropertiesUtil { 23 | def findProperty(key: String, props: Properties): Option[String] = props.getProperty(key) match { 24 | case null => None 25 | case value => Some(value) 26 | } 27 | def safeFind(key: String, props: Properties): String = findProperty(key, props).getOrElse { sys.error(s"$key property is required!" ) } 28 | 29 | def getBoolean(key: String, props: Properties): Option[Boolean] = findProperty(key, props) map { 30 | case "true" => true 31 | case "false" => false 32 | case _ => sys.error(s"Property $key should be true or false") 33 | } 34 | 35 | def filter(props: Properties)(f: (String, String)=>Boolean): Seq[(String, String)] = 36 | props.stringPropertyNames.toSeq 37 | .map { k => (k, props.getProperty(k)) } 38 | .filter { case (k, v) => f(k, v) } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/ResourceUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | import java.io.File 4 | 5 | object ResourceUtil { 6 | 7 | /** Read a python script found in `resources/python/xxx.py`. Since these files cannot 8 | * be executed directly we create a temporary file by copying the script first, and 9 | * return the resulting temp file. 10 | * 11 | * @param name script name, corresponding to `xxx.py`. 12 | */ 13 | def readPython(name: String): File = { 14 | val script = File.createTempFile("jigg", ".py") 15 | script.deleteOnExit 16 | val stream = getClass.getResourceAsStream(s"/python/${name}") 17 | IOUtil.writing(script.getPath) { o => 18 | scala.io.Source.fromInputStream(stream).getLines foreach { line => 19 | o.write(line + "\n") 20 | } 21 | } 22 | script 23 | } 24 | 25 | } 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/main/scala/jigg/util/TreesUtil.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import scala.collection.mutable.ArrayBuffer 20 | import scala.xml._ 21 | 22 | import jigg.pipeline.Annotation 23 | 24 | object TreesUtil { 25 | 26 | def streeToNode(tree: String, sentence: Node, annotator: String) = { 27 | val tokens = tree.replaceAllLiterally("(", " ( ").replaceAllLiterally(")", " ) ").trim.split("\\s+") 28 | 29 | val tokenSeq = (sentence \ "tokens").head \ "token" 30 | var tokIdx = -1 31 | def nextTokId = { tokIdx += 1; tokenSeq(tokIdx) \@ "id" } 32 | 33 | val spans = new ArrayBuffer[Node] 34 | 35 | // Fill in spans; return the id of constructed subtree, and the arrived index. 36 | def readTopdown(idx: Int): (String, Int) = { 37 | 38 | def collectChildren(curChildren: List[String], cur: Int): (Seq[String], Int) = 39 | tokens(cur) match { 40 | case ")" => 41 | (curChildren.reverse, cur) 42 | case "(" => 43 | val (nextChildId, nextIdx) = readTopdown(cur) 44 | collectChildren(nextChildId :: curChildren, nextIdx) 45 | } 46 | 47 | tokens(idx) match { 48 | case "(" => 49 | def skipParen(i: Int = 0): Int = { 50 | if (tokens(idx + i) == "(") skipParen(i + 1) 51 | else i 52 | } 53 | val parenCount = skipParen() 54 | 55 | val labelIdx = idx + parenCount 56 | val label = tokens(labelIdx) 57 | 58 | val (children, closeIdx) = tokens(labelIdx + 1) match { 59 | case "(" => collectChildren(Nil, labelIdx + 1) 60 | case word => (Nil, labelIdx + 1 + 1) 61 | } 62 | val thisId = children match { 63 | case Nil => nextTokId 64 | case children => Annotation.ParseSpan.nextId 65 | } 66 | if (!children.isEmpty) { 67 | val childStr = children mkString " " 68 | spans += 69 | } 70 | for (i <- 0 until parenCount) { assert(tokens(closeIdx + i) == ")") } 71 | (thisId, closeIdx + parenCount) 72 | } 73 | } 74 | 75 | val (rootId, _) = readTopdown(0) 76 | { spans } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/resources/data/Japanese.small.lexicon: -------------------------------------------------------------------------------- 1 | @UNK@/フィラー/_ S1/S1 NP[nc,adv]1/NP[nc,adv]1 NP[nc,nm]1/NP[nc,nm]1 2 | @UNK@/副詞-一般/_ S1/S1 NP[nc,nm]1/NP[nc,nm]1 S[nm,stem] NP[nc,adv]1/NP[nc,adv]1 3 | @UNK@/副詞-助詞類接続/_ S1/S1 NP[nc,nm]1/NP[nc,nm]1 S[nm,stem] 4 | あふれる/動詞-自立/基本形 S[nm,stem]\NP[ga,nm,ga]-base_verb_rule S[nm,stem]\NP[ga,nm,ga]-adnominal_verb_rule -------------------------------------------------------------------------------- /src/test/resources/data/Japanese.unkVerb.lexicon: -------------------------------------------------------------------------------- 1 | @UNK@/動詞-非自立/仮定形 S[nm,hyp]\S[nm,cont]sem 2 | @UNK@/動詞-非自立/体言接続特殊 S[nm,attr]\S[nm,neg]sem 3 | @UNK@/動詞-非自立/体言接続特殊2 S[adn,attr] 4 | @UNK@/動詞-非自立/基本形 S[nm,base]\S[nm,cont]sem S[adn,base]\S[nm,cont]sem S[nm,base] S[nm,base]\NP[ga,nm,ga] NP[nc,nm]1/NP[nc,nm]1 5 | @UNK@/動詞-非自立/未然ウ接続 S[nm,neg]\S[nm,cont]sem 6 | @UNK@/動詞-非自立/未然形 S[nm,neg]\S[nm,cont]sem S[nm,neg] S[nm,neg]\S[nm,r]sem 7 | @UNK@/動詞-非自立/連用タ接続 S[nm,cont]\S[nm,cont]sem S[nm,cont] 8 | @UNK@/動詞-非自立/連用形 S[nm,cont]\S[nm,cont]sem S[adv,cont]\S[nm,cont]sem S[nm,cont] S[adn,cont] S[adn,cont]\S[nm,cont]sem 
S[nm,cont]\NP[ga,nm,ga] -------------------------------------------------------------------------------- /src/test/resources/data/json/english.ssplit.test.json: -------------------------------------------------------------------------------- 1 | { 2 | ".tag" : "root", 3 | ".child" : [ { 4 | ".tag" : "document", 5 | "id" : "d0", 6 | ".child" : [ { 7 | ".tag" : "sentences", 8 | ".child" : [ { 9 | ".tag" : "sentence", 10 | "text" : "Alice asked her mother to cook a cake.", 11 | "id" : "s0", 12 | "characterOffsetBegin" : "0", 13 | "characterOffsetEnd" : "38" 14 | }, { 15 | ".tag" : "sentence", 16 | "text" : "Bob saw a girl in the garden with a telescope.", 17 | "id" : "s1", 18 | "characterOffsetBegin" : "39", 19 | "characterOffsetEnd" : "85" 20 | } ] 21 | } ] 22 | } ] 23 | } -------------------------------------------------------------------------------- /src/test/resources/data/json/japanese.ssplit.test.json: -------------------------------------------------------------------------------- 1 | { 2 | ".tag" : "root", 3 | ".child" : [ { 4 | ".tag" : "document", 5 | "id" : "d0", 6 | ".child" : [ { 7 | ".tag" : "sentences", 8 | ".child" : [ { 9 | ".tag" : "sentence", 10 | "text" : "自転車で走っている少女を見た", 11 | "id" : "s0", 12 | "characterOffsetBegin" : "0", 13 | "characterOffsetEnd" : "14" 14 | }, { 15 | ".tag" : "sentence", 16 | "text" : "テレビで走っている少女を見た", 17 | "id" : "s1", 18 | "characterOffsetBegin" : "15", 19 | "characterOffsetEnd" : "29" 20 | } ] 21 | } ] 22 | } ] 23 | } -------------------------------------------------------------------------------- /src/test/resources/data/keras/bunsetsu_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/keras/bunsetsu_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/keras/ssplit_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/keras/ssplit_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/convolution1d/convolution1d_gold.csv: -------------------------------------------------------------------------------- 1 | -0.288217455148697,0.681861579418182 2 | -0.538490712642670,0.062052655965090 3 | -0.318091481924057,-0.074813574552536 4 | -0.023546881973743,0.040708515793085 5 | -0.485583871603012,0.224703624844551 6 | -0.450441420078278,0.002716975519434 7 | -0.176823571324348,0.489799916744232 8 | -0.123186729848385,0.057490978389978 9 | -0.336253672838211,-0.084099449217319 10 | 0.059555754065514,0.000320440391079 11 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/convolution1d/convolution1d_input.csv: -------------------------------------------------------------------------------- 1 | 0.027738961009708,0.393455303479803,0.694816228560713,0.157559454348151,0.214884384043615,0.005565078182797,0.949002280200014,0.690369967699377,0.998176256773562,0.204425396011438,0.845982123544135,0.818198829832328 2 | 0.252301884057857,0.437311847167796,0.104436208603942,0.763925291392123,0.870987562303758,0.079435648160725,0.142875224317561,0.170360773159227,0.387373867227415,0.745431984723710,0.479836153327895,0.744296844299619 3 | 
0.883415945353071,0.697078201963215,0.606604317884067,0.777094318509148,0.956809131373719,0.018343700379643,0.692863164913816,0.107627736723910,0.595232367723716,0.618970512903785,0.748639111184423,0.941869156250547 4 | 0.035042201371063,0.700113249200931,0.717126347279872,0.511744032438561,0.247658441044617,0.576820124281050,0.047399750738226,0.067116874648913,0.175494795121527,0.240304085868729,0.603887921839716,0.537397181554857 5 | 0.554501767544110,0.411117180527812,0.648722795158795,0.508408218827410,0.785647318386747,0.947404977871054,0.113110476551426,0.936072327771750,0.863526769665361,0.172236633875255,0.715443984726397,0.869742300523170 6 | 0.331881976191941,0.174389983798250,0.974055309053648,0.952572967439939,0.395194463615389,0.979596804619930,0.126419143266621,0.028127155855804,0.377202820144004,0.788029009784025,0.143934466920253,0.885531232719449 7 | 0.082605263961736,0.816844068389051,0.742036051284236,0.448338330763183,0.231913187967981,0.324263082007595,0.095113194171922,0.575291246962427,0.402043739476673,0.773164202330256,0.978885567374195,0.531234497631943 8 | 0.797474806333550,0.770689995657307,0.286838584369559,0.272812118439933,0.522711445247614,0.557358959671089,0.655063150020376,0.613348870624681,0.903721040494730,0.676600535740517,0.862388024752785,0.483734729571592 9 | 0.511364975233000,0.956982804048265,0.489405080608254,0.946988783071462,0.304099907120206,0.159633845243493,0.441705350104236,0.014337837348216,0.609972921479224,0.159291332076170,0.521437544993183,0.863046123179579 10 | 0.043232549851898,0.273736339785920,0.378312369831591,0.953767858492059,0.200604482875413,0.810072095098931,0.391870443803649,0.639344286225899,0.677303032937693,0.276362747713528,0.359063987058490,0.334056036907750 11 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/convolution1d/convolution1d_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/convolution1d/convolution1d_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/dense/dense_gold.csv: -------------------------------------------------------------------------------- 1 | -0.265054643154144,0.819157660007477 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/dense/dense_input.csv: -------------------------------------------------------------------------------- 1 | 0.919222086072171,0.268580028843516,0.850487637208910,0.195140088357300,0.915650682096673,0.694448840619902,0.686364957159918,0.845189174009755,0.515407551460194,0.707307670736291 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/dense/dense_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/dense/dense_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/embedding/embedding_gold.csv: -------------------------------------------------------------------------------- 1 | -0.024064350873232,0.015874337404966 2 | -0.032138548791409,0.035715412348509 3 | -0.009305894374847,0.047007892280817 4 | 
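The Keras fixtures above and below come in triples: an `*_input.csv` matrix, an `*_gold.csv` matrix of expected outputs, and an `*_model.h5` file holding the serialized layer. Each layer spec further down in this dump (Convolution1DSpec, DenseSpec, EmbeddingSpec, FlattenSpec, KerasModelSpec) consumes its triple with the same load–convert–compare steps; a minimal sketch of that shared pattern follows. The `KerasFixtureCheck` object and its `checkLayer` helper are hypothetical (not part of the repository) and assume only the `HDF5Object`, `KerasModel`, and breeze `csvread` calls that those specs themselves use.

import java.io.File

import breeze.linalg.csvread
import breeze.numerics.abs

import jigg.ml.keras.KerasModel
import jigg.util.HDF5Object

// Hypothetical helper mirroring the pattern shared by the layer specs:
// load <name>_model.h5 from resources, read the input/gold CSVs, run
// KerasModel.convert, and require element-wise agreement within 1e-6.
object KerasFixtureCheck {

  private def findPath(p: String): String =
    getClass.getClassLoader.getResource(p).getPath

  def checkLayer(name: String): Boolean = {
    val dir = s"./data/ml/keras/$name"
    val model = new KerasModel(HDF5Object.fromResource(s"$dir/${name}_model.h5"))
    val input = csvread(new File(findPath(s"$dir/${name}_input.csv")), separator = ',').map(_.toFloat)
    val gold  = csvread(new File(findPath(s"$dir/${name}_gold.csv")), separator = ',').map(_.toFloat)
    abs(model.convert(input) - gold).forall(_ < 1e-6f)
  }
}

// e.g. KerasFixtureCheck.checkLayer("dense") and checkLayer("embedding") are
// expected to hold, which is exactly what the individual specs assert.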
-------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/embedding/embedding_input.csv: -------------------------------------------------------------------------------- 1 | 4.000000000000000,3.000000000000000,6.000000000000000 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/embedding/embedding_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/embedding/embedding_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/flatten/flatten_gold.csv: -------------------------------------------------------------------------------- 1 | 0.483355849981308,0.272490352392197,0.915887176990509,0.335418432950974,0.778468728065491,0.853674173355103 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/flatten/flatten_input.csv: -------------------------------------------------------------------------------- 1 | 0.483355847870847,0.272490343423817 2 | 0.915887187299997,0.335418421687206 3 | 0.778468739455691,0.853674144810384 4 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/flatten/flatten_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/flatten/flatten_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/kerasModel/kerasModel_gold.csv: -------------------------------------------------------------------------------- 1 | 0.066982857882977,0.864855527877808,0.068161644041538 2 | 0.036359727382660,0.940843880176544,0.022796416655183 3 | 0.000093939248472,0.024136895313859,0.975769102573395 4 | 0.000007191142231,0.037699114531279,0.962293744087219 5 | 0.859113097190857,0.130854964256287,0.010032005608082 6 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/kerasModel/kerasModel_input.csv: -------------------------------------------------------------------------------- 1 | 0.000000000000000,6.000000000000000,6.000000000000000,2.000000000000000,6.000000000000000 2 | -------------------------------------------------------------------------------- /src/test/resources/data/ml/keras/kerasModel/kerasModel_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mynlp/jigg/e1427356a43f125088aa7acd7854df2c5f9ad433/src/test/resources/data/ml/keras/kerasModel/kerasModel_model.h5 -------------------------------------------------------------------------------- /src/test/resources/data/template.small.lst: -------------------------------------------------------------------------------- 1 | NP[nc,nm]1/NP[nc,nm]1 NP[nc,nm]1/NP[nc,nm]1 2 | S[nm,stem]\NP[ga,nm,ga]-base_verb_rule S[nm,base]\NP[ga,nm,ga] 3 | S[nm,stem]\NP[ga,nm,ga]-adnominal_verb_rule S[adn,base]\NP[ga,nm,ga] 4 | S1/S1 S1/S1 5 | NP[nc,adv]1/NP[nc,adv]1 NP[nc,adv]1/NP[nc,adv]1 6 | S[nm,stem] S[nm,stem] 7 | -------------------------------------------------------------------------------- 
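For reference on the data formats just shown: each line of `Japanese.small.lexicon` / `Japanese.unkVerb.lexicon` pairs a `surface/POS/conjugation` key with the space-separated category templates it may receive, and each line of the `template.*.lst` files maps one template to its full category string (tab-separated, as written and read by `create_small_lst_from_lexicon.py` further down). A small sketch of reading the two files into maps, assuming exactly that layout, is below; the `SmallLexiconData` object and its method names are illustrative only, not part of the repository.

import scala.io.Source

// Illustrative loaders, assuming the layout visible above:
//   lexicon line : "<word>/<pos>/<conj> <template> <template> ..."  (space-separated)
//   template line: "<template>\t<full category string>"             (tab-separated)
object SmallLexiconData {

  def loadLexicon(path: String): Map[String, Seq[String]] =
    Source.fromFile(path, "UTF-8").getLines().map { line =>
      val cols = line.trim.split(" ")
      cols.head -> cols.tail.toSeq
    }.toMap

  def loadTemplates(path: String): Map[String, String] =
    Source.fromFile(path, "UTF-8").getLines().map { line =>
      val Array(tmpl, cat) = line.trim.split("\t")
      tmpl -> cat
    }.toMap
}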
/src/test/resources/data/template.unkVerb.lst: -------------------------------------------------------------------------------- 1 | S[adn,attr] S[adn,attr] 2 | S[adn,base]\S[nm,cont]sem S[adn,base]\S[nm,cont]sem 3 | S[adv,cont]\S[nm,cont]sem S[adv,cont]\S[nm,cont]sem 4 | S[adn,cont] S[adn,cont] 5 | S[adn,cont]\S[nm,cont]sem S[adn,cont]\S[nm,cont]sem 6 | S[nm,hyp]\S[nm,cont]sem S[nm,hyp]\S[nm,cont]sem 7 | S[nm,attr]\S[nm,neg]sem S[nm,attr]\S[nm,neg]sem 8 | S[nm,base] S[nm,base] 9 | S[nm,base]\S[nm,cont]sem S[nm,base]\S[nm,cont]sem 10 | S[nm,base]\NP[ga,nm,ga] S[nm,base]\NP[ga,nm,ga] 11 | S[nm,neg] S[nm,neg] 12 | S[nm,neg]\S[nm,cont]sem S[nm,neg]\S[nm,cont]sem 13 | S[nm,neg]\S[nm,r]sem S[nm,neg]\S[nm,r]sem 14 | S[nm,cont]\S[nm,cont]sem S[nm,cont]\S[nm,cont]sem 15 | S[nm,cont] S[nm,cont] 16 | S[nm,cont]\S[nm,cont]sem S[nm,cont]\S[nm,cont]sem 17 | S[nm,cont]\NP[ga,nm,ga] S[nm,cont]\NP[ga,nm,ga] 18 | NP[nc,nm]1/NP[nc,nm]1 NP[nc,nm]1/NP[nc,nm]1 19 | -------------------------------------------------------------------------------- /src/test/resources/data/xml/english.ssplit.spaceTokenize.gold.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Alice asked her mother to cook a cake. 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | Bob saw a girl in the garden with a telescope. 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /src/test/resources/data/xml/english.ssplit.test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Alice asked her mother to cook a cake. 6 | Bob saw a girl in the garden with a telescope. 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /src/test/resources/data/xml/japanese.ssplit.test.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 自転車で走っている少女を見た 6 | テレビで走っている少女を見た 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/test/resources/script/create_small_lst_from_lexicon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' This script is used for creating data used for test 4 | 5 | Output is already included in resources/data directory 6 | as `template.small.lst`, so usually this file is unnecessary. 
7 | 8 | Example usage from the project root directory is 9 | ./src/test/resources/script/create_small_lst_from_lexicon.py \ 10 | ./ccgbank/template.lst 11 | 12 | ''' 13 | 14 | import sys, os 15 | 16 | if __name__ == '__main__': 17 | if len(sys.argv) < 2: 18 | print "usage", sys.argv[0], "full_template_lst" 19 | exit() 20 | 21 | data_dir = os.path.abspath(os.path.dirname(__file__))+'/../data' 22 | small_lexicon_path = data_dir+'/Japanese.small.lexicon' 23 | output_path = data_dir+'/template.small.lst' 24 | 25 | cat_tmps = [] 26 | for line in open(small_lexicon_path): 27 | cat_tmps += line.strip().split(' ')[1:] 28 | cat_tmps = set(cat_tmps) 29 | 30 | with open(output_path, 'w') as f: 31 | for line in open(sys.argv[1]): 32 | line = line.strip().split('\t') 33 | cat_tmp = line[0] 34 | cat_str = line[1] 35 | 36 | if cat_tmp in cat_tmps: 37 | f.write("%s\t%s\n" % (cat_tmp, cat_str)) 38 | 39 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/Convolution1DSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class Convolution1DSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/convolution1d/convolution1d_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/convolution1d/convolution1d_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/convolution1d/convolution1d_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/DenseSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | 28 | class DenseSpec extends FlatSpec with Matchers{ 29 | 30 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 31 | 32 | "convert" should "load model and convert input matrix" in { 33 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/dense/dense_model.h5") 34 | val model = new KerasModel(hdf5) 35 | val inputData = csvread(new File(findPath("./data/ml/keras/dense/dense_input.csv")),separator = ',').map{x => x.toFloat} 36 | val goldData = csvread(new File(findPath("./data/ml/keras/dense/dense_gold.csv")),separator = ',').map{x => x.toFloat} 37 | 38 | val output = model.convert(inputData) 39 | 40 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 41 | 42 | diff should be (true) 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/EmbeddingSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class EmbeddingSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/embedding/embedding_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/embedding/embedding_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/embedding/embedding_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/FlattenSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class FlattenSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/flatten/flatten_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/flatten/flatten_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/flatten/flatten_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/KerasModelSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import java.io._ 20 | import org.scalatest._ 21 | 22 | import jigg.util.HDF5Object 23 | 24 | import breeze.linalg.csvread 25 | import breeze.numerics.abs 26 | 27 | class KerasModelSpec extends FlatSpec with Matchers{ 28 | 29 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 30 | 31 | "convert" should "load model and convert input matrix" in { 32 | val hdf5 = HDF5Object.fromResource("./data/ml/keras/kerasModel/kerasModel_model.h5") 33 | val model = new KerasModel(hdf5) 34 | val inputData = csvread(new File(findPath("./data/ml/keras/kerasModel/kerasModel_input.csv")),separator = ',').map{x => x.toFloat} 35 | val goldData = csvread(new File(findPath("./data/ml/keras/kerasModel/kerasModel_gold.csv")),separator = ',').map{x => x.toFloat} 36 | 37 | val output = model.convert(inputData) 38 | 39 | val diff = abs(output - goldData).forall(x => x < 1e-6.toFloat) 40 | 41 | diff should be (true) 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/jigg/ml/keras/KerasParserTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.ml.keras 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest.FunSuite 22 | import org.scalatest.Matchers._ 23 | 24 | import jigg.util.{HDF5Object, LookupTable} 25 | 26 | class KerasParserTest extends FunSuite{ 27 | 28 | val model = new KerasModel(HDF5Object.fromResource("./data/keras/ssplit_model.h5")) 29 | val table = LookupTable.fromResource("data/keras/jpnLookupCharacter.json") 30 | 31 | val parser = new KerasParser(model, table) 32 | 33 | test("get an offset list from pattern1") { 34 | val pattern = Array[Int](0,1,1,0,1,1) 35 | val ranges = parser.getOffsets(pattern) 36 | ranges should be (Array[(Int, Int)]((0,3),(3,6))) 37 | } 38 | 39 | test("get an offset list from pattern2") { 40 | val pattern = Array[Int](0,1,1,2,2,0,1,1) 41 | val ranges = parser.getOffsets(pattern) 42 | ranges should be (Array[(Int, Int)]((0,3),(5,8))) 43 | } 44 | 45 | test("get an offset list from pattern3") { 46 | val pattern = Array[Int](0,1,1,2,0,1,1,2) 47 | val ranges = parser.getOffsets(pattern) 48 | ranges should be (Array[(Int, Int)]((0,3),(4,7))) 49 | 50 | } 51 | 52 | test("get an offset list from pattern4") { 53 | val pattern = Array[Int](2,2,0,1,1,2,0,1,1,2) 54 | val ranges = parser.getOffsets(pattern) 55 | ranges should be (Array[(Int, Int)]((2,5),(6,9))) 56 | } 57 | 58 | test("get an offset list from pattern5") { 59 | val pattern = Array[Int](1,1,1,0,1,1) 60 | val ranges = parser.getOffsets(pattern) 61 | ranges should be (Array[(Int, Int)]((0,3),(3,6))) 62 | } 63 | 64 | test("get an offset list from pattern6") { 65 | val pattern = Array[Int](2,2,1,1,1,0,1,1) 66 | val ranges = parser.getOffsets(pattern) 67 | ranges should be (Array[(Int, Int)]((2,5),(5,8))) 68 | } 69 | 70 | test("get an offset list from pattern7") { 71 | val pattern = Array[Int](0,1,1,0,0,1,1) 72 | val ranges = parser.getOffsets(pattern) 73 | ranges should be (Array[(Int, Int)]((0,3),(3,4),(4,7))) 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/BunsetsuTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import org.scalatest.FunSuite 20 | import org.scalatest.Matchers._ 21 | 22 | class BunsetsuTest extends FunSuite { 23 | test("A gold derivation with cabocha bunsetsu-segments recover gold dependencies") { 24 | import jigg.nlp.ccg.parser.ParsedSentences 25 | val parsedSentences = new ParsedSentences 26 | val (sentence, derivation) = parsedSentences.simpleSentenceAndDerivation 27 | 28 | val bunsetsuSentence = BunsetsuSentence(Array( 29 | Bunsetsu(0, sentence.wordSeq.slice(0, 2), sentence.posSeq.slice(0, 2)), // 政権 に 30 | Bunsetsu(2, sentence.wordSeq.slice(2, 4), sentence.posSeq.slice(2, 4)), // 影響 を 31 | Bunsetsu(4, sentence.wordSeq.slice(4, 5), sentence.posSeq.slice(4, 5)), // 及ぼす 32 | Bunsetsu(5, sentence.wordSeq.slice(5, 6), sentence.posSeq.slice(5, 6)))) // こと 33 | 34 | val parsed = bunsetsuSentence.parseWithCCGDerivation(derivation) 35 | parsed.headSeq should equal (Seq(2, 2, 3, -1)) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/CategoryFeatureTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import org.scalatest.FunSuite 19 | import org.scalatest.Matchers._ 20 | import scala.collection.mutable.HashSet 21 | 22 | class JPCategoryFeatureTest extends FunSuite { 23 | test("equal test") { 24 | val feat1 = JPCategoryFeature.createFromValues(List("adn","attr","ga")) 25 | val feat2 = JPCategoryFeature.createFromValues(List("nm","attr","ga")) 26 | val feat3 = JPCategoryFeature.createFromValues(List("adn","attr")) 27 | val feat4 = JPCategoryFeature.createFromValues(List("adn","attr","ga")) 28 | 29 | feat1.kvs should equal (feat4.kvs) 30 | feat1.kvs should not equal (feat2.kvs) 31 | feat1.kvs should not equal (feat3.kvs) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/CategoryManagerTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | import org.scalatest.FunSuite 19 | import org.scalatest.Matchers._ 20 | 21 | class CategoryManagerTest extends FunSuite { 22 | test("the same child node should be assiged the same id") { 23 | val manager = new CategoryManager // Constructor automatically creates unknown category which is assigned id 0 24 | 25 | val cat = JapaneseCategoryParser.parse("NP[case=o,mod=nm]/NP[case=o,mod=nm]") 26 | manager.assignID(cat) match { 27 | case ComplexCategory(id, left, right, _) => { 28 | left.id should equal (1) 29 | right.id should equal (1) 30 | id should equal (2) 31 | } 32 | case _ => fail() // should not occur 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/test/scala/jigg/nlp/ccg/lexicon/CategoryParserTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.lexicon 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | import org.scalatest.FunSuite 19 | import org.scalatest.Matchers._ 20 | 21 | class CategoryParserTest extends FunSuite { 22 | test("extractCategoryFeature") { 23 | val reader = new JapaneseCategoryParser.JapaneseReader 24 | val ni_nm = reader.extractCategoryFeature("ni,nm") 25 | ni_nm.toString should equal ("mod=nm,case=ni") 26 | //assert(ni_nm.toString == "mod=nm,case=ni") 27 | } 28 | 29 | test("createAomicCategory") { 30 | val cat1Str = "NP[case=nc,mod=nm]{I1}" 31 | val cat1 = JapaneseCategoryParser.parse(cat1Str) 32 | cat1.toString should equal ("NP[mod=nm,case=nc]") 33 | 34 | val cat2Str = "(((S[mod=adn,form=base]{I1}\\NP[case=ni,mod=nm]{I2}){I1})\\NP[case=o,mod=nm]{I3}){I1}_I1(unk,I3,I2,_)" 35 | val cat2 = JapaneseCategoryParser.parse(cat2Str) 36 | cat2.toString should equal ("(S[mod=adn,form=base]\\NP[mod=nm,case=ni])\\NP[mod=nm,case=o]") 37 | 38 | 39 | val cat3Str = "(NP[case=X1,mod=X2,fin=f]{I1}/NP[case=X1,mod=X2,fin=f]{I1}){I2}_none" 40 | val cat3 = JapaneseCategoryParser.parse(cat3Str) 41 | cat3.toString should equal ("NP[fin=f]/NP[fin=f]") 42 | } 43 | 44 | // These are obsolute tests for previous version 45 | // test("createComplexCategory") { 46 | // JapaneseCategoryParser.parse("NP[nc,nm]1//NP[nc,nm]1").toString should equal("NP[mod=nm,case=nc]/NP[mod=nm,case=nc]") 47 | // JapaneseCategoryParser.parse("(S[nm,stem,nm]\NP[nc,nm])/NP[nc,nm]").toString should equal( 48 | // """(S[mod=nm,form=stem]\NP[mod=nm,case=nc])/NP[mod=nm,case=nc]""") 49 | // JapaneseCategoryParser.parse("(((S\NP)/NP[nc,nm])\(S[nm,stem]1/NP[o,nm]sem))/NP[nc,nm]1").toString should equal( 50 | // """(((S\NP)/NP[mod=nm,case=nc])\(S[mod=nm,form=stem]/NP[mod=nm,case=o]))/NP[mod=nm,case=nc]""") 51 | // JapaneseCategoryParser.parse("S1/S1").toString should equal("S/S") 52 | // JapaneseCategoryParser.parse("(S2/S2)1/(S3/S3)1").toString should equal("(S/S)/(S/S)") 53 | // } 54 | } 55 | -------------------------------------------------------------------------------- 
/src/test/scala/jigg/nlp/ccg/parser/RuleTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.nlp.ccg.parser 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import org.scalatest.FunSuite 20 | import org.scalatest.Matchers._ 21 | 22 | class RuleTest extends FunSuite { 23 | val parsedSentences = new ParsedSentences 24 | val dict = parsedSentences.dict 25 | def cat(str:String) = dict.getCategory(str).get 26 | 27 | test("extract all rules from derivations") { 28 | val (sentence, derivation) = parsedSentences.simpleSentenceAndDerivation 29 | 30 | val rule = CFGRule.extractRulesFromDerivations(Array(derivation), JapaneseHeadFinder) 31 | rule.unify(cat("(NP[case=nc,mod=X1]{I1}/NP[case=nc,mod=X1]{I1}){I2}"), cat("NP[case=nc,mod=nm]{I1}_none")).get should contain (cat("NP[case=nc,mod=nm]{I1}"), ">") 32 | rule.raise(cat("S[mod=adn,form=base]{I1}")).get should contain (cat("(NP[case=nc,mod=X1]{I1}/NP[case=nc,mod=X1]{I1}){I2}"), "ADN") 33 | rule.unify(cat("NP[case=ni,mod=nm]{I1}"), cat("(S[mod=adn,form=base]{I1}\\NP[case=ni,mod=nm]{I2}){I1}")).get should contain (cat("S[mod=adn,form=base]{I1}"), "<") 34 | 35 | rule.unify(cat("NP[case=nc,mod=nm]{I1}_none"), cat("(NP[case=o,mod=nm]{I1}\\NP[case=nc,mod=nm]{I1}){I2}_none")).get should contain (cat("NP[case=o,mod=nm]{I1}"), "<") 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/AnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | import org.scalatest._ 22 | 23 | import jigg.util.Prop 24 | 25 | class NothingAnnotator(override val name: String, override val props: Properties) extends Annotator { 26 | 27 | @Prop(gloss = "gloss of variable1", required=true) var variable1 = "" 28 | readProps() 29 | 30 | def annotate(node: Node) = node 31 | } 32 | 33 | class AnnotatorSpec extends FlatSpec with Matchers { 34 | 35 | "Opt variable" should "be customizable with property file" in { 36 | val props = new Properties 37 | props.setProperty("nothing.variable1", "hoge") 38 | 39 | val annotator = new NothingAnnotator("nothing", props) 40 | 41 | annotator.variable1 should be("hoge") 42 | } 43 | 44 | "Annotator" should "throws an exception during initProps if required variable is missed" in { 45 | val props = new Properties 46 | try { 47 | val annotator = new NothingAnnotator("nothing", props) 48 | fail() 49 | } catch { 50 | case e: ArgumentError => 51 | case _: Throwable => fail() 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/BaseAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import org.scalactic.Equality 21 | import org.scalatest._ 22 | import scala.xml._ 23 | 24 | trait BaseAnnotatorSpec extends FlatSpec with Matchers { 25 | 26 | val sameElem = new Equality[Node] { 27 | import scala.xml.Utility.trim 28 | override def areEqual(a: Node, b: Any) = b match { 29 | case n: Node => trim(a) == trim(n) 30 | case _ => false 31 | } 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/BeneParAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest._ 22 | import scala.xml._ 23 | 24 | class BeneParAnnotatorSpec extends BaseAnnotatorSpec { 25 | 26 | class AnnotatorStub(output: String) extends BeneParAnnotator("benepar", new Properties) { 27 | override def mkLocalAnnotator = new LocalBeneParAnnotator { 28 | override def mkCommunicator = new StubExternalCommunicator(output) 29 | } 30 | assert(nThreads == 1) 31 | } 32 | 33 | Annotation.ParseSpan.idGen.reset() 34 | 35 | "BeneParAnnotator" should "convert a s-tree output of benepar into a node" in { 36 | val doc = 37 | 38 | 39 | 40 | He ate pizza . 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | val output = """(S (NP (PRP He)) (VP (VBD ate) (NN pizza)) (. .)) 52 | END""" 53 | 54 | val ann = new AnnotatorStub(output) 55 | val annotation = ann.annotate(doc) 56 | 57 | val s = annotation \\ "sentence" 58 | 59 | (s \ "parse").head should equal( 60 | 61 | 62 | 63 | ) (decided by sameElem) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/BunsetsuKerasAnnotatorTest.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licencses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitation under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest.FunSuite 22 | import org.scalatest.Matchers._ 23 | 24 | import scala.xml.{NodeSeq, Node} 25 | 26 | class BunsetsuKerasAnnotatorTest extends FunSuite { 27 | 28 | def findPath(localPath: String): String = getClass.getClassLoader.getResource(localPath).getPath 29 | 30 | def segment(node: Node, properties: Properties): NodeSeq = { 31 | val bunsetsuSplitter = new IPABunsetsuKerasAnnotator("bunsetsuKeras", properties) 32 | bunsetsuSplitter.mkLocalAnnotator.newSentenceAnnotation(node) 33 | } 34 | 35 | val properties = new Properties 36 | properties.setProperty("bunsetsuKeras.model", findPath("./data/keras/bunsetsu_model.h5")) 37 | properties.setProperty("bunsetsuKeras.table", findPath("data/keras/jpnLookupWords.json")) 38 | 39 | test("do chunking") { 40 | 41 | val chunks = segment(Sentences.xml("oneSentence"),properties) \\ "chunk" 42 | 43 | chunks.length should be (2) 44 | } 45 | 46 | object Sentences { 47 | val xml = Map("oneSentence" -> 48 | 49 | 梅が咲いた。 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | ) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/KuromojiAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2015 Takafumi Sakakibara and Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | import org.scalatest._ 22 | 23 | import com.atilika.kuromoji.{TokenBase, TokenizerBase} 24 | import com.atilika.kuromoji.ipadic.{Token=>IToken, Tokenizer=>ITokenizer} 25 | 26 | class KuromojiAnnotatorSpec extends FlatSpec with Matchers { 27 | 28 | "Annotator" should "assign token id using sentence id" in { 29 | 30 | val annotator = KuromojiAnnotator.fromProps("kuromoji", new Properties) 31 | 32 | val sentence = 33 | val annotated = annotator newSentenceAnnotation sentence 34 | 35 | val tokenId = annotated \\ "token" \@ "id" 36 | tokenId should be ("a_0") 37 | } 38 | 39 | "TokenAnnotator" should "segment into tokens" in { 40 | val annotator = KuromojiAnnotator.fromProps("kuromoji[tokenize]", new Properties) 41 | 42 | val sentence = 43 | val annotated = annotator newSentenceAnnotation sentence 44 | 45 | val token = annotated \\ "token" 46 | token \@ "form" should be ("あ") 47 | token \@ "pos" should be ("") 48 | } 49 | 50 | "POSAnnotator" should "assign POS tags" in { 51 | val annotator = KuromojiAnnotator.fromProps("kuromoji[pos]", new Properties) 52 | 53 | val sentence = 54 | 55 | 56 | 57 | 58 | 59 | val annotated = annotator newSentenceAnnotation sentence 60 | 61 | val token = annotated \\ "token" 62 | token \@ "pos" should not be ("") 63 | token \@ "dummy" should be ("a") // not removed (overriden) 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/MecabAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2017 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | import scala.xml.Node 21 | import org.scalatest._ 22 | 23 | class MecabAnnotatorSpec extends BaseAnnotatorSpec { 24 | 25 | def stubCom(output: String) = new StubExternalCommunicator(output) 26 | def mapCom(responces: Map[String, String]) = new MapStubExternalCommunicator(responces) 27 | 28 | def newIPA(mkCom: ()=>IOCommunicator, threads: Int = 1, p: Properties = new Properties) = 29 | new IPAMecabAnnotator("mecab", p) { 30 | override def mkLocalAnnotator = new IPALocalMecabAnnotator { 31 | override def mkCommunicator = mkCom() 32 | } 33 | override def nThreads = threads 34 | } 35 | 36 | "Annotator with nThreads=1" should "be able to annotate one sentence" in { 37 | val s = "a" 38 | val in = a 39 | val out = """a 名詞,固有名詞,組織,*,*,*,* 40 | EOS""" 41 | val annotator = newIPA(()=>stubCom(out), threads=1) 42 | val result = annotator.annotate(in) 43 | val tokens = result \\ "token" 44 | tokens.size should be(1) 45 | (tokens(0) \@ "pos") should be("名詞") 46 | 47 | result \\ "tokens" \@ "annotators" should be("mecab") 48 | } 49 | 50 | "Annotator with nThreads=2" should "annotate in parallel" in { 51 | val responces = Map( 52 | "a" -> """a 名詞,固有名詞,*,*,*,*,* 53 | EOS""", 54 | "b" -> """b 動詞,*,*,*,*,*,* 55 | EOS""", 56 | "c" -> """c 形容詞,*,*,*,*,*,* 57 | EOS""" 58 | ) 59 | val in = 60 | 61 | 62 | a 63 | b 64 | c 65 | 66 | 67 | 68 | 69 | val annotator = newIPA(()=>mapCom(responces), threads=2) 70 | val result = annotator.annotate(in) 71 | 72 | val sentences = result \\ "sentence" 73 | sentences.size should be(3) 74 | ((sentences(0) \\ "token")(0) \@ "form") should be("a") 75 | ((sentences(1) \\ "token")(0) \@ "form") should be("b") 76 | ((sentences(2) \\ "token")(0) \@ "form") should be("c") 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/PipelineSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2018 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | */ 18 | 19 | import java.util.Properties 20 | import org.scalatest._ 21 | import scala.xml._ 22 | import jigg.util.{XMLUtil, JSONUtil} 23 | 24 | class PipelineSpec extends BaseAnnotatorSpec { 25 | 26 | class StubMecabAnnotator(n: String, p: Properties) 27 | extends IPAMecabAnnotator(n, p) { 28 | override def mkLocalAnnotator = new IPALocalMecabAnnotator { 29 | override def mkCommunicator = new StubExternalCommunicator("aaa") 30 | } 31 | } 32 | 33 | class DummyPipeline(p: Properties) extends Pipeline(p) { 34 | override def getAnnotator(name: String) = name match { 35 | case "dummy" => new StubMecabAnnotator(name, p) 36 | case _ => super.getAnnotator(name) 37 | } 38 | } 39 | 40 | "-Threads option" should "be able to customize each annotator's number of threads" in { 41 | val p = new Properties 42 | p.setProperty("annotators", "ssplit,dummy") 43 | p.setProperty("nThreads", "2") 44 | p.setProperty("dummy.nThreads", "4") 45 | 46 | val pipeline = new DummyPipeline(p) 47 | 48 | val annotators = pipeline.annotatorList 49 | annotators(0).name should equal("ssplit") 50 | annotators(0).nThreads should equal(2) 51 | annotators(1).name should equal("dummy") 52 | annotators(1).nThreads should equal(4) 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/RequirementSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | import org.scalatest._ 21 | 22 | class RequirementSpec extends FlatSpec with Matchers { 23 | 24 | "Tokenize" should "be satisfied when TokenizeWithIPA is satisfied" in { 25 | 26 | val satisfied = RequirementSet(JaRequirement.TokenizeWithIPA) 27 | val requires: Set[Requirement] = Set(Requirement.Tokenize) 28 | 29 | val lacked = satisfied.lackedIn(requires) 30 | lacked shouldBe empty 31 | } 32 | 33 | "TokenizedWithIPA" should "not be satisifed when Tokenize is satisfied" in { 34 | 35 | val satisfied = RequirementSet(Requirement.Tokenize) 36 | val requires: Set[Requirement] = Set(JaRequirement.TokenizeWithIPA) 37 | 38 | val lacked = satisfied.lackedIn(requires) 39 | lacked shouldBe Set(JaRequirement.TokenizeWithIPA) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/test/scala/jigg/pipeline/SyntaxNetAnnotatorSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.pipeline 2 | 3 | /* 4 | Copyright 2013-2016 Hiroshi Noji 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | */ 18 | 19 | import java.util.Properties 20 | 21 | import org.scalatest._ 22 | import scala.xml._ 23 | 24 | class SyntaxNetAnnotatorSpec extends BaseAnnotatorSpec { 25 | 26 | class POSAnnotatorStub(output: String) extends 27 | SyntaxNetPOSAnnotator("syntaxnetpos", new Properties) { 28 | 29 | override def run(input: String) = output.split("\n").toStream 30 | } 31 | 32 | "POSAnnotator" should "annotate all sentences across documents" in { 33 | 34 | val root = 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | val output = """1 a _ A A _ 0 A _ _ 50 | 51 | 1 b _ B B _ 0 B _ _ 52 | 2 c _ C C _ 0 C _ _ 53 | 54 | 1 c _ D D _ 0 D _ _ 55 | """ 56 | 57 | val annotator = new POSAnnotatorStub(output) 58 | val annotated = annotator.annotate(root) 59 | 60 | annotated should equal ( 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | ) (decided by sameElem) 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/jigg/util/JSONUtilSpec.scala: -------------------------------------------------------------------------------- 1 | package jigg.util 2 | 3 | package jigg.pipeline 4 | 5 | import org.scalatest.FunSuite 6 | import org.scalatest.Matchers._ 7 | 8 | class JSONUtilSpec extends FunSuite{ 9 | import org.json4s._ 10 | import org.json4s.jackson.JsonMethods._ 11 | 12 | val testNode = 13 | 14 | 15 | Test Node 16 | 17 | 18 | val goldJSON = 19 | parse( 20 | """ 21 | { 22 | ".tag" : "root", 23 | ".child" : [ { 24 | ".tag" : "document", 25 | "id" : "d0", 26 | "text" : "Test Node" 27 | } ] 28 | } 29 | """ 30 | ) 31 | 32 | /** 33 | * For handling a backslash. 34 | */ 35 | val testNodeForBackslash = 36 | 37 | 38 | Test Node 39 | 40 | 41 | 42 | val goldJSONForBackSlash = 43 | parse( 44 | """{".tag":"root",".child": 45 | [{".tag":"document","id":"d0\\N","text":"Test Node"} 46 | ] 47 | }""" 48 | ) 49 | 50 | /** 51 | * For handling escaped strings. 
52 | */ 53 | val testNodeForEscaping = 54 | 55 | "}> 56 | {"quot\" amp&"} 57 | 58 | 59 | {"new line\n \n tab\t \t carriage return\r \r backslash\\ \\"} 60 | 61 | 62 | 63 | val goldJSONForEscaping = 64 | parse( 65 | """{".tag":"root",".child": 66 | [{".tag":"document","id":"","text":"quot\" amp&"}, 67 | {".tag":"document", "id":"d1", "text": "new line\n \n tab\t \t carriage return\r \r backslash\\ \\"} 68 | ] 69 | }""" 70 | ) 71 | 72 | val testJSONForEscaping = 73 | parse( 74 | """{".tag":"root",".child": 75 | [{".tag":"document","id":"<d0>","text":"&Test Node"amp;"} 76 | ] 77 | }""" 78 | ) 79 | 80 | /** 81 | * Unit testing toJSON 82 | */ 83 | test("toJSON should generate formatted String object from scala.xml.Node"){ 84 | parse(JSONUtil.toJSON(testNode)) should be (goldJSON) 85 | parse(JSONUtil.toJSON(testNodeForBackslash)) should be (goldJSONForBackSlash) 86 | parse(JSONUtil.toJSON(testNodeForEscaping)) should be (goldJSONForEscaping) 87 | } 88 | /** 89 | * Unit testing JSON to XML 90 | */ 91 | test("toXML should generate xml.Node"){ 92 | val xmlFromJSON = JSONUtil.toXML(goldJSON) 93 | val xmlFromJSONWithBackslash = JSONUtil.toXML(goldJSONForBackSlash) 94 | val xmlFromJSONWithEscapeChar = JSONUtil.toXML(testJSONForEscaping) 95 | xmlFromJSON should be ({"Test Node"}) 96 | xmlFromJSONWithBackslash should be ({"Test Node"}) 97 | xmlFromJSONWithEscapeChar should be ("}>{"&Test Node\"amp;"}) 98 | } 99 | } 100 | --------------------------------------------------------------------------------
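JSONUtilSpec above pins down the XML↔JSON round-trip behaviour of `jigg.util.JSONUtil`. As a quick usage reference, a minimal sketch built only on the two calls the spec exercises — `JSONUtil.toJSON` (scala.xml.Node to JSON string) and `JSONUtil.toXML` (parsed json4s `JValue` back to a Node) — is given below; the `JsonRoundTrip` wrapper object is illustrative, not part of the repository.

import scala.xml.Node

import org.json4s.jackson.JsonMethods.parse

import jigg.util.JSONUtil

// Round-trip sketch: serialize a jigg-style annotation node to JSON,
// then convert the parsed JSON back into XML.
object JsonRoundTrip {
  def roundTrip(node: Node): Node = {
    val json: String = JSONUtil.toJSON(node)  // Node -> JSON string
    JSONUtil.toXML(parse(json))               // JValue -> Node
  }
}

// e.g. JsonRoundTrip.roundTrip(<root><document id="d0">Test Node</document></root>)
// is expected to give back an equivalent <root> node, which is the property
// the spec's toJSON/toXML tests check.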