├── .gitignore ├── versions └── treetagger-3.2 │ ├── options │ └── Dockerfile ├── LICENSE ├── circle.yml ├── test └── test_leodido_treetagger-3.2.bats └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | 3 | .idea 4 | 5 | /images 6 | /versions/**/treetagger.tar.gz -------------------------------------------------------------------------------- /versions/treetagger-3.2/options: -------------------------------------------------------------------------------- 1 | export RELEASE="3.2" 2 | export BUILD_OPTIONS="-r ${RELEASE}" 3 | export TAGS="leodido/treetagger:${RELEASE} leodido/treetagger:latest" 4 | export PUSH_IMAGE="true" -------------------------------------------------------------------------------- /versions/treetagger-3.2/Dockerfile: -------------------------------------------------------------------------------- 1 | # TreeTagger 2 | # 3 | # @version latest (3.2) 4 | # @author leodido (@leodido) 5 | FROM gliderlabs/alpine:latest 6 | # ADD unpacks the local tarball into /usr/local/. 7 | ADD /treetagger.tar.gz /usr/local/ 8 | # perl is presumably needed by the bundled scripts (TODO confirm); --no-cache keeps the apk index out of the image layer. 9 | RUN apk add --no-cache perl && chmod +x /usr/local/cmd/* 10 | # Expose the scripts in /usr/local/cmd on PATH. 11 | ENV PATH $PATH:/usr/local/cmd -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Leo Di Donato 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | general: 2 | artifacts: 3 | - "images" 4 | 5 | machine: 6 | services: 7 | - docker 8 | 9 | dependencies: 10 | cache_directories: 11 | - "~/docker" 12 | - "~/deps" 13 | pre: 14 | - if [[ ! -e ~/deps/bats_v0.4.0.tar.gz ]]; then mkdir -p ~/deps; curl -sSL -o ~/deps/bats_v0.4.0.tar.gz https://github.com/sstephenson/bats/archive/v0.4.0.tar.gz; fi 15 | - tar -xf ~/deps/bats_v0.4.0.tar.gz 16 | - sudo bats-0.4.0/install.sh /usr/local 17 | override: 18 | - docker info 19 | - if [[ -e ~/docker/builder.tar ]]; then docker load --input ~/docker/builder.tar; fi 20 | - ./build.bash: 21 | parallel: true 22 | files: 23 | - versions/**/options 24 | - mkdir -p ~/docker; docker save treetagger-builder > ~/docker/builder.tar 25 | - docker images 26 | 27 | test: 28 | override: 29 | - ./build.bash test: 30 | parallel: true 31 | files: 32 | - versions/**/options 33 | 34 | deployment: 35 | hub: 36 | branch: master 37 | commands: 38 | - docker login -e $DOCKER_EMAIL -u $DOCKER_USER -p $DOCKER_PASSWORD: 39 | parallel: true 40 | - ./build.bash push: 41 | parallel: true -------------------------------------------------------------------------------- /test/test_leodido_treetagger-3.2.bats: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bats 2 | # Checks for the leodido/treetagger:3.2 image; setup() probes the image with `docker history` and defines the name lists shared by the tests. 3 | setup() { 4 | docker
history "leodido/treetagger:3.2" >/dev/null 2>&1 5 | shortcuts=(bulgarian dutch english estonian finnish french galician german italian latin polish portuguese portuguese-finegrained russian slovak spanish swahili) 6 | params=("bulgarian-utf8" "dutch-utf8" "dutch2-utf8" "english-utf8" "estonian-utf8" "finnish-utf8" "french-utf8" "galician" "german-utf8" "italian-utf8" "italian2-utf8" "latin" "latinIT" "mongolian" "polish-utf8" "portuguese-utf8" "portuguese-finegrained-utf8" "russian-utf8" "slovak-utf8" "slovak2-utf8" "spanish-utf8" "swahili") 7 | cparams=("english-chunker-utf8" "french-chunker-utf8" "german-chunker-utf8") 8 | } 9 | # Each tagger parameter file must exist under /usr/local/lib. 10 | @test "parameter files are available" { 11 | for p in "${params[@]}" 12 | do 13 | run docker run "leodido/treetagger:3.2" test -e "/usr/local/lib/${p}.par" 14 | [ "${status}" -eq 0 ] 15 | done 16 | } 17 | # Each chunker parameter file must exist under /usr/local/lib. 18 | @test "chunker parameter files are available" { 19 | for cp in "${cparams[@]}" 20 | do 21 | run docker run "leodido/treetagger:3.2" test -e "/usr/local/lib/${cp}.par" 22 | [ "${status}" -eq 0 ] 23 | done 24 | } 25 | # `which tree-tagger` must print /usr/local/bin/tree-tagger; operands are quoted so empty output is a plain mismatch. 26 | @test "main executable is available" { 27 | run docker run "leodido/treetagger:3.2" which tree-tagger 28 | expected="/usr/local/bin/tree-tagger" 29 | [ "${output}" = "${expected}" ] 30 | } 31 | # ${output} is quoted: the expected text contains spaces and would otherwise word-split inside [ ]. 32 | @test "version is correct" { 33 | run docker run "leodido/treetagger:3.2" tree-tagger -version 34 | [ "${status}" -eq 0 ] 35 | [ "${output}" = "Program version is 3.2.1" ] 36 | } 37 | # `which train-tree-tagger` must print /usr/local/bin/train-tree-tagger. 38 | @test "trainer executable is available" { 39 | run docker run "leodido/treetagger:3.2" which train-tree-tagger 40 | expected="/usr/local/bin/train-tree-tagger" 41 | [ "${output}" = "${expected}" ] 42 | } 43 | # Each language shortcut must resolve under /usr/local/cmd and report mode 755 from stat. 44 | @test "shortcut executables are available and have 755 permissions" { 45 | for name in "${shortcuts[@]}" 46 | do 47 | run docker run "leodido/treetagger:3.2" which "tree-tagger-${name}" 48 | expected="/usr/local/cmd/tree-tagger-${name}" 49 | [ "${output}" = "${expected}" ] 50 | [ "${status}" -eq 0 ] 51 | run docker run "leodido/treetagger:3.2"
stat -c "%a" "${expected}" 52 | [ "${output}" = 755 ] 53 | done 54 | } 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | treetagger.docker 2 | ================= 3 | 4 | [![Build](https://img.shields.io/circleci/project/leodido/treetagger.docker/master.svg?style=flat-square)](https://circleci.com/gh/leodido/treetagger.docker) [![Docker](https://img.shields.io/badge/docker-ready-blue.svg?style=flat-square)](https://registry.hub.docker.com/u/leodido/treetagger) 5 | 6 | This repository contains docker images to build and ship ready to use [TreeTagger](http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/) instances. 7 | 8 | You will not have to manually install TreeTagger in your system again. 9 | 10 | What it is 11 | ---------- 12 | 13 | **A tool for annotating text with part-of-speech** ([POS tagging](http://en.wikipedia.org/wiki/Part-of-speech_tagging)) **and lemma information**. 14 | 15 | TreeTagger consists of two programs: 16 | 17 | 1. **train-tree-tagger** 18 | 19 | Creates a parameter file from a lexicon and a handtagged corpus. 20 | 21 | 2. **tree-tagger** 22 | 23 | Annotates the text with part-of-speech tags, given a parameter file and a text file as arguments. 24 | 25 | This image contains: 26 | 27 | - training program and tagger executables 28 | 29 | - program for tokenization (i.e., **separate-punctuation**) 30 | 31 | - shell scripts (shortcuts) which simplify tagging and chunking: 32 | 33 | e.g., **tree-tagger-italian**, **tree-tagger-german**, **tagger-chunker-english**, ...
34 | 35 | - parameter files, chunker parameter files, and abbreviation files 36 | 37 | - documentation and language tagset references 38 | 39 | See them for yourself: 40 | 41 | ```bash 42 | $ docker run -i -t leodido/treetagger ls /usr/local 43 | ``` 44 | 45 | The official page and further documentation are available at this [link](http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger). 46 | 47 | Installation 48 | ------------ 49 | 50 | Directly pull this image from the docker index. 51 | 52 | ``` 53 | $ docker pull leodido/treetagger 54 | ``` 55 | 56 | Usage 57 | ----- 58 | 59 | ### Tagging 60 | 61 | Suppose you want to (tokenize and) tag an Italian text. 62 | 63 | The script to use is **tree-tagger-italian**. 64 | 65 | It expects UTF-8 encoded input files as arguments. If no files have been specified, input from stdin is expected. 66 | 67 | ```bash 68 | $ echo 'Proviamo semplicemente a eseguire un test di prova.' | docker run --rm -i leodido/treetagger tree-tagger-italian 69 | ``` 70 | 71 | Outputs: 72 | 73 | ``` 74 | Proviamo VER:pres provare 75 | semplicemente ADV semplicemente 76 | a PRE a 77 | eseguire VER:infi eseguire 78 | un DET:indef un 79 | test NOM test 80 | di PRE di 81 | prova NOM prova 82 | . SENT . 83 | ``` 84 | 85 | Now, try with some Portuguese. 86 | 87 | ```bash 88 | $ echo 'Qual é o seu nome?' | docker run --rm -i leodido/treetagger tree-tagger-portuguese 89 | ``` 90 | 91 | Results: 92 | 93 | ``` 94 | Qual PT0 qual 95 | é VMI ser 96 | o DA0 o 97 | seu DP3 seu 98 | nome NCMS nome 99 | ? Fit ? 100 | ``` 101 | 102 | Finegrained? 103 | 104 | ```bash 105 | $ echo 'Qual é o seu nome?' | docker run --rm -i leodido/treetagger tree-tagger-portuguese-finegrained 106 | ``` 107 | 108 | Results: 109 | 110 | ``` 111 | Qual PT0CS000 qual 112 | é VMIP3S0 ser 113 | o DA0MS0 o 114 | seu DP3MSS seu 115 | nome NCMS000 nome 116 | ? Fit ? 117 | ``` 118 | 119 | And so on for other supported languages.
120 | 121 | 122 | ### Chunking 123 | 124 | Suppose you want to tokenize, tag and annotate a German text with nominal and verbal chunks. 125 | 126 | ```bash 127 | $ echo 'Das ist ein Test.' | docker run -i leodido/treetagger tagger-chunker-german 128 | ``` 129 | 130 | Outputs: 131 | 132 | ```xml 133 | <NC> 134 | Das PDS die 135 | </NC> 136 | <VC> 137 | ist VAFIN sein 138 | </VC> 139 | <NC> 140 | ein ART eine 141 | Test NN Test 142 | </NC> 143 | . $. . 144 | ``` 145 | 146 | Supported languages 147 | ------------------- 148 | 149 | **17 languages** are **supported**: bulgarian, dutch, english, estonian, finnish, french, galician, german, italian, latin, portuguese, polish, russian, slovak, spanish, swahili, mongolian (only parameter file provided, no scripts). 150 | 151 | Some of them also have alternative parameter files. 152 | 153 | Todos 154 | ----- 155 | 156 | - Add support for Chinese and Spoken French. 157 | 158 | Credits 159 | ------- 160 | 161 | - Helmut Schmid, University of Stuttgart, Germany - [TreeTagger](http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger). 162 | 163 | _Last update: 28/05/2015_ 164 | 165 | --- 166 | 167 | [![Analytics](https://ga-beacon.appspot.com/UA-49657176-1/treetagger.docker)](https://github.com/igrigorik/ga-beacon) 168 | --------------------------------------------------------------------------------