├── .gitignore ├── .travis.yml ├── LICENSE ├── README.markdown ├── build.sbt ├── core ├── .scala_dependencies └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── core │ │ ├── AsAnalysisEngine.scala │ │ ├── Converter.scala │ │ ├── SCasAnnotator_ImplBase.scala │ │ ├── SCasCollectionReader_ImplBase.scala │ │ ├── SCasConsumer_ImplBase.scala │ │ ├── SCasFlowController_ImplBase.scala │ │ ├── SCasMultiplier_ImplBase.scala │ │ ├── SimplePipeline.scala │ │ ├── XmlDescriptor.scala │ │ ├── configuration │ │ ├── ConfigurationInitialization.scala │ │ ├── Parameter.scala │ │ ├── Resource.scala │ │ └── ResourceInitialization.scala │ │ ├── package.scala │ │ ├── stream │ │ ├── annotators.scala │ │ └── package.scala │ │ └── wrapper │ │ ├── AnnotationWrapper.scala │ │ └── JCasWrapper.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── core │ ├── ConverterSpec.scala │ ├── SCasAnnotator_ImplBaseSpecs.scala │ ├── SimplePipelineSpecs.scala │ ├── configuration │ ├── ConfigurationInitalizationSpec.scala │ ├── ParameterSpec.scala │ ├── ResourceInitializationSpec.scala │ └── ResourceSpec.scala │ ├── stream │ └── annotatorsSpec.scala │ ├── util │ └── Helper.scala │ └── wrapper │ ├── AnnotationWrapperSpec.scala │ └── JCasWrapperSpec.scala ├── language-identification └── n-gram-language-identifier │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── languageidentifier │ │ └── NGramLanguageIdentifier.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── languageidentifier │ └── NGramLanguageIdentifierSpec.scala ├── lemmatizer └── mate-lemmatizer │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── lemmatizer │ │ └── MateLemmatizer.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── lemmatizer │ └── MateLemmatizerSpec.scala ├── 
name-entity-recognizer └── stanford-ner │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── ner │ │ └── StanfordNer.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── ner │ └── StanfordNerSpec.scala ├── parser ├── mate-parser │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── parser │ │ │ └── MateParser.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── parser │ │ └── MateParserSpec.scala └── stanford-parser │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── parser │ │ └── StanfordParser.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── parser │ └── StanfordParserSpec.scala ├── part-of-speech-tagger ├── ark-tweet-pos-tagger │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── pos │ │ │ └── ArkTweetPosTagger.scala │ │ └── test │ │ ├── resources │ │ └── model.20120919 │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── pos │ │ └── ArkTweetPosTaggerSpec.scala ├── mate-pos-tagger │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── pos │ │ │ └── MatePosTagger.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── pos │ │ └── MatePosTaggerSpec.scala └── stanford-pos-tagger │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── pos │ │ └── StanfordPosTagger.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── pos │ └── StanfordPosTaggerSpec.scala ├── project └── plugins.sbt ├── sbt-plugin ├── build.sbt ├── project │ └── plugin.sbt ├── src │ └── main │ │ └── scala │ │ └── com │ │ 
└── github │ │ └── jenshaase │ │ └── uimascala │ │ └── sbt │ │ ├── UimaSbtPlugin.scala │ │ ├── UimaScalaTypeTemplate.scala │ │ └── UimaScala_TypeTemplate.scala └── version.sbt ├── segmenter ├── ark-tweet-tokenizer │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── ArkTweetTokenizer.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── ArkTweetTokenizerSpec.scala ├── break-iterator-segmenter │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── BreakIteratorSegmenter.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── BreakIteratorSegmenterSpec.scala ├── lucene-tokenizer │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── LuceneTokenizer.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── LuceneTokenizerSpec.scala ├── open-nlp-segmenter │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── OpenNlpSegmenter.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── OpenNlpSegmenterSpec.scala ├── regex-tokenizer │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── RegexTokenizer.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── RegexTokenizerSpec.scala ├── stanford-segmenter │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── 
StanfordSegmenter.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── StanfordSegmenterSpec.scala └── whitespace-tokenizer │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── WhitespaceTokenizer.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── segmenter │ └── WhitespaceTokenizerSpec.scala ├── type-system └── src │ └── main │ └── resources │ ├── META-INF │ └── org.apache.uima.fit │ │ └── types.txt │ └── desc │ └── types │ └── TypeSystem.xml └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | lib_managed/ 3 | src_managed/ 4 | project/boot/ 5 | uima-scala-docs/build/ 6 | *.iml 7 | *.ipr 8 | *.iws 9 | /.idea 10 | .scala-dependencies 11 | 12 | # Eclipse 13 | *.pydevproject 14 | .project 15 | .metadata 16 | .history 17 | bin/** 18 | tmp/** 19 | tmp/**/* 20 | *.tmp 21 | *.bak 22 | *.swp 23 | *~.nib 24 | local.properties 25 | .classpath 26 | .settings/ 27 | .loadpath 28 | 29 | # CDT-specific 30 | .cproject 31 | *~ 32 | *.sublime-workspace 33 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.8 4 | jdk: 5 | - oraclejdk8 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This software is licensed under the Apache 2 license, quoted below. 2 | 3 | Copyright 2011 Jens Haase 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); you may not 6 | use this file except in compliance with the License. 
You may obtain a copy of 7 | the License at 8 | 9 | [http://www.apache.org/licenses/LICENSE-2.0] 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14 | License for the specific language governing permissions and limitations under 15 | the License. 16 | 17 | --------------- 18 | 19 | Notice: Licenses of dependency projects may be different -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # UimaScala [![Build Status](https://travis-ci.org/jenshaase/uimaScala.svg?branch=master)](https://travis-ci.org/jenshaase/uimaScala) 2 | 3 | ## About 4 | 5 | uimaScala is a toolkit to develop natural language applications in 6 | Scala. It is based mainly on 7 | [uimaFIT](https://uima.apache.org/uimafit.html), which itself is based on 8 | [Apache UIMA](http://uima.apache.org/). To develop natural language 9 | processing (NLP) applications in [Apache UIMA](http://uima.apache.org/) 10 | you need to work with lots of XML files. For nearly every Java class 11 | you will need an XML file. If your Java class changes you also need to 12 | change your XML file. [uimaFIT](http://code.google.com/p/uimafit/) 13 | tries to solve this problem with reflection and nearly removes all XML 14 | files. 15 | 16 | This project started as a wrapper for 17 | [uimaFIT](https://uima.apache.org/uimafit.html). With Scala's collection 18 | library and the functional programming stuff it is a lot easier to 19 | develop NLP applications. Also a type-safe configuration system and a 20 | nicer DSL were added. 21 | 22 | This readme provides a short introduction. More documentation will be 23 | added later. 
24 | 25 | ## Setup a project 26 | 27 | To use this project add the following configuration to your `build.sbt` 28 | file. UimaScala requires Scala version `2.11`. 29 | 30 | ~~~ 31 | scalaVersion := "2.11.1" 32 | 33 | resolvers ++= Seq( 34 | "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/", 35 | "Sonatype OSS Snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 36 | ) 37 | 38 | libraryDependencies += "com.github.jenshaase.uimascala" %% "uimascala-core" % "0.5.0-SNAPSHOT" 39 | 40 | addCompilerPlugin("org.scalamacros" % "paradise" % "2.0.1" cross CrossVersion.full) 41 | ~~~ 42 | 43 | Next you need to tell UIMA where to find the description 44 | files. Therefore add the file `types.txt` to the folder 45 | `src/main/resources/META-INF/org.apache.uima.fit`. Add the following 46 | content: 47 | 48 | ~~~ 49 | classpath*:desc/types/**/*.xml 50 | ~~~ 51 | 52 | ## A simple annotator 53 | 54 | Annotators in UIMA will process a document. Most of the time they are 55 | using annotations from previous annotators and combine them into new 56 | annotations. The following annotator is a Tokenizer. It looks at the 57 | text and identifies single words, also called tokens. We can use 58 | Java's `BreakIterator` to tokenize the text. 
You will find the class 59 | also in the toolkit with some additional processing: 60 | 61 | ~~~ 62 | package com.github.jenshaase.test 63 | 64 | import com.github.jenshaase.uimascala.core._ 65 | import com.github.jenshaase.uimascala.core.configuration._ 66 | import java.util.Locale 67 | import org.apache.uima.jcas.JCas 68 | import java.text.BreakIterator 69 | 70 | class BreakIteratorTokenizer extends SCasAnnotator_ImplBase { 71 | 72 | object locale extends Parameter[Locale](Locale.getDefault) 73 | 74 | def process(jcas: JCas) = { 75 | val bi = BreakIterator.getWordInstance(locale.is) 76 | bi.setText(jcas.getDocumentText) 77 | 78 | var last = bi.first 79 | var cur = bi.next 80 | while (cur != BreakIterator.DONE) { 81 | if (jcas.getDocumentText().substring(last, cur).trim != "") { 82 | jcas.annotate[Token](last, cur) 83 | } 84 | 85 | last = cur 86 | cur = bi.next 87 | } 88 | } 89 | } 90 | ~~~ 91 | 92 | An annotator in uimaScala extends the `SCasAnnotator_ImplBase` 93 | class. To implement this class you need to implement the `process` 94 | method. Here we use Java's `BreakIterator` to process the 95 | document. For each token we add a new `Token` type (the next part will 96 | explain how to create such a type). You can also see the `locale` 97 | configuration parameter. It has a name (`locale`) and type (`Locale`) 98 | and a default value `Locale.getDefault`. These parameters can be changed 99 | when using this component in a UIMA pipeline. 100 | 101 | 102 | ## Adding your own type system description 103 | 104 | The goal of an annotator is to add new annotations to text. With UIMA 105 | you can create your custom annotations with XML files and then generate 106 | the Java classes. uimaScala uses a Scala macro and a custom DSL to 107 | provide these features. 
In order to create your type system you need to 108 | define an object in your Scala code: 109 | 110 | ~~~ 111 | package com.github.jenshaase.test 112 | 113 | import com.github.jenshaase.uimascala.core.description._ 114 | 115 | @TypeSystemDescription 116 | object TypeSystem { 117 | 118 | val Token = Annotation { 119 | val pos = Feature[String] 120 | val lemma = Feature[String] 121 | val stem = Feature[String] 122 | } 123 | 124 | val Sentence = Annotation {} 125 | } 126 | ~~~ 127 | 128 | After running `compile` you can see the following output on your sbt console: 129 | 130 | ~~~ 131 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 132 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Token'. 133 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 134 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Token_Type'. 135 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 136 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Sentence'. 137 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 138 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Sentence_Type' 139 | ~~~ 140 | 141 | Now the necessary Java files are created. You need to run `compile` 142 | again to compile the generated Java sources. 143 | 144 | ## Running a pipeline 145 | 146 | To run a pipeline uimascala uses the 147 | [scalaz-stream](https://github.com/scalaz/scalaz-stream) library. 
To 148 | run a pipeline we need to convert documents to a CAS and process the 149 | CAS with our annotators: 150 | 151 | ~~~ 152 | package com.github.jenshaase.test 153 | 154 | import com.github.jenshaase.uimascala.core._ 155 | import com.github.jenshaase.uimascala.core.stream._ 156 | import scalaz._, Scalaz._ 157 | import scalaz.stream._ 158 | import java.util.Locale 159 | 160 | object Main extends App { 161 | 162 | val p = Process("this is a text", "and another text") |> 163 | casFromText |> 164 | annotate(new BreakIteratorTokenizer().config(_.locale := Locale.US)) |> 165 | extractCas { cas => 166 | cas.select[Token].map(_.getCoveredText).toList 167 | } 168 | 169 | println(p.toList) 170 | 171 | p.toList == List( 172 | List("this", "is", "a", "text"), 173 | List("and", "another", "text") 174 | ) 175 | } 176 | 177 | ~~~ 178 | 179 | 180 | ## TODO 181 | 182 | * Add more documentation 183 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import com.github.jenshaase.uimascala.UimaSbtPlugin._ 2 | 3 | lazy val commonSettings = Seq( 4 | organization := "com.github.jenshaase.uimascala", 5 | scalaVersion := "2.11.8", 6 | libraryDependencies ++= Seq( 7 | "org.specs2" %% "specs2-core" % "3.8.4" % "test" 8 | ) 9 | ) 10 | 11 | lazy val componentSettings = commonSettings ++ releaseSettings 12 | 13 | lazy val root = (project in file(".")). 14 | settings(releaseSettings:_*). 15 | settings( 16 | publishArtifact in Compile := false, 17 | parallelExecution in Test := false 18 | ). 
19 | aggregate( 20 | core, typeSystem, 21 | breakIteratorSegmenter, regexTokenizer, whitespaceTokenizer, stanfordSegmenter, arkTweetTokenizer, openNlpSegmenter, luceneTokenizer, 22 | stanfordPosTagger, arkTweetPosTagger, 23 | stanfordParser, 24 | stanfordNer, 25 | nGramLanguageIdentifier 26 | // Do not run these test in build environment because of too much memory consumption 27 | //mateLemmatizer, mateParser, matePosTagger 28 | ) 29 | 30 | lazy val core = (project in file("core")). 31 | settings(commonSettings: _*). 32 | settings(releaseSettings: _*). 33 | settings( 34 | libraryDependencies ++= Seq( 35 | "org.apache.uima" % "uimafit-core" % "2.2.0", 36 | "org.scala-lang.modules" %% "scala-xml" % "1.0.5", 37 | "co.fs2" %% "fs2-core" % "0.9.0-M5" 38 | ) 39 | ) 40 | 41 | lazy val typeSystem = (project in file("type-system")). 42 | settings(componentSettings: _*). 43 | settings(uimaScalaSettings: _*). 44 | dependsOn(core) 45 | 46 | // ================================================== 47 | // Segmenter 48 | 49 | lazy val breakIteratorSegmenter = (project in file("segmenter/break-iterator-segmenter")). 50 | settings(componentSettings). 51 | dependsOn(core, typeSystem) 52 | 53 | lazy val regexTokenizer = (project in file("segmenter/regex-tokenizer")). 54 | settings(componentSettings). 55 | dependsOn(core, typeSystem) 56 | 57 | lazy val whitespaceTokenizer = (project in file("segmenter/whitespace-tokenizer")). 58 | settings(componentSettings). 59 | dependsOn(core, typeSystem, regexTokenizer) 60 | 61 | lazy val stanfordSegmenter = (project in file("segmenter/stanford-segmenter")). 62 | settings(componentSettings). 63 | settings( 64 | libraryDependencies ++= Seq( 65 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" 66 | ) 67 | ). 68 | dependsOn(core, typeSystem) 69 | 70 | lazy val arkTweetTokenizer = (project in file("segmenter/ark-tweet-tokenizer")). 71 | settings(componentSettings). 
72 | settings( 73 | libraryDependencies ++= Seq( 74 | "edu.cmu.cs" % "ark-tweet-nlp" % "0.3.2" 75 | ) 76 | ). 77 | dependsOn(core, typeSystem) 78 | 79 | lazy val openNlpSegmenter = (project in file("segmenter/open-nlp-segmenter")). 80 | settings(componentSettings). 81 | settings( 82 | libraryDependencies ++= Seq( 83 | "org.apache.opennlp" % "opennlp-tools" % "1.6.0", 84 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-de-maxent" % "20120616.1" % "test", 85 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-de-maxent" % "20120616.1" % "test" 86 | ), 87 | resolvers ++= Seq( 88 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 89 | ) 90 | ). 91 | dependsOn(core, typeSystem) 92 | 93 | lazy val luceneTokenizer = (project in file("segmenter/lucene-tokenizer")). 94 | settings(componentSettings). 95 | settings( 96 | libraryDependencies ++= Seq( 97 | "org.apache.lucene" % "lucene-analyzers-common" % "6.1.0" 98 | ) 99 | ). 100 | dependsOn(core, typeSystem) 101 | 102 | // ================================================== 103 | // Lemmatizer 104 | 105 | lazy val mateLemmatizer = (project in file("lemmatizer/mate-lemmatizer")). 106 | settings(componentSettings). 107 | settings( 108 | libraryDependencies ++= Seq( 109 | "com.googlecode.mate-tools" % "anna" % "3.5", 110 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-lemmatizer-de-tiger" % "20121024.1" % "test" 111 | ), 112 | resolvers ++= Seq( 113 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 114 | ) 115 | ). 116 | dependsOn(core, typeSystem) 117 | 118 | // ================================================== 119 | // POS Tagger 120 | 121 | lazy val stanfordPosTagger = (project in file("part-of-speech-tagger/stanford-pos-tagger")). 122 | settings(componentSettings). 
123 | settings( 124 | libraryDependencies ++= Seq( 125 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 126 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german" 127 | ) 128 | ). 129 | dependsOn(core, typeSystem) 130 | 131 | lazy val matePosTagger = (project in file("part-of-speech-tagger/mate-pos-tagger")). 132 | settings(componentSettings). 133 | settings( 134 | libraryDependencies ++= Seq( 135 | "com.googlecode.mate-tools" % "anna" % "3.5", 136 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-tagger-de-tiger" % "20121024.1" % "test" 137 | ), 138 | resolvers ++= Seq( 139 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 140 | ) 141 | ). 142 | dependsOn(core, typeSystem) 143 | 144 | lazy val arkTweetPosTagger = (project in file("part-of-speech-tagger/ark-tweet-pos-tagger")). 145 | settings(componentSettings). 146 | settings( 147 | libraryDependencies ++= Seq( 148 | "edu.cmu.cs" % "ark-tweet-nlp" % "0.3.2" 149 | ) 150 | ). 151 | dependsOn(core, typeSystem) 152 | 153 | // ================================================== 154 | // Parser 155 | 156 | lazy val stanfordParser = (project in file("parser/stanford-parser")). 157 | settings(componentSettings). 158 | settings( 159 | libraryDependencies ++= Seq( 160 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 161 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german" 162 | ) 163 | ). 164 | dependsOn(core, typeSystem) 165 | 166 | lazy val mateParser = (project in file("parser/mate-parser")). 167 | settings(componentSettings). 
168 | settings( 169 | libraryDependencies ++= Seq( 170 | "com.googlecode.mate-tools" % "anna" % "3.5", 171 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-parser-de-tiger" % "20121024.1" % "test" 172 | ), 173 | resolvers ++= Seq( 174 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 175 | ) 176 | ). 177 | dependsOn(core, typeSystem) 178 | 179 | // ================================================== 180 | // Name Entity Recognizer 181 | 182 | lazy val stanfordNer = (project in file("name-entity-recognizer/stanford-ner")). 183 | settings(componentSettings). 184 | settings( 185 | libraryDependencies ++= Seq( 186 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 187 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german" 188 | ) 189 | ). 190 | dependsOn(core, typeSystem) 191 | 192 | // ================================================== 193 | // Language Identifer 194 | 195 | lazy val nGramLanguageIdentifier = (project in file("language-identification/n-gram-language-identifier")). 196 | settings(componentSettings). 197 | settings( 198 | libraryDependencies ++= Seq( 199 | "com.optimaize.languagedetector" % "language-detector" % "0.5" 200 | ) 201 | ). 
202 | dependsOn(core, typeSystem) 203 | 204 | 205 | lazy val releaseSettings = Seq( 206 | releasePublishArtifactsAction := PgpKeys.publishSigned.value, 207 | publishTo := { 208 | val nexus = "https://oss.sonatype.org/" 209 | if ( version.value.trim.endsWith( "SNAPSHOT" ) ) 210 | Some( "snapshots" at nexus + "content/repositories/snapshots" ) 211 | else 212 | Some( "releases" at nexus + "service/local/staging/deploy/maven2" ) 213 | }, 214 | publishMavenStyle := true, 215 | pomExtra := ( 216 | https://github.com/jenshaase/uimaScala 217 | 218 | git@github.com:jenshaase/uimascala.git 219 | scm:git:git@github.com:jenshaase/uimascala.git 220 | 221 | 222 | 223 | jenshaase 224 | Jens Haase 225 | 226 | 227 | 228 | 229 | Apache 2 230 | http://www.apache.org/licenses/LICENSE-2.0.txt 231 | repo 232 | 233 | 234 | ) 235 | ) 236 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/AsAnalysisEngine.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | 8 | trait AsAnalysisEngine { 9 | def asAnalysisEngine: AnalysisEngine 10 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/Converter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import PartialFunction._ 7 | import java.util.regex.Pattern 8 | import java.util.Locale 9 | import java.io.File 10 | import util.matching.Regex 11 | import com.github.jenshaase.uimascala.core.configuration._ 12 | 13 | abstract class Caster[In, Out](implicit in: Manifest[In], out: Manifest[Out]) { 14 | def convertToUimaType[X](c: X)(implicit 
m: Manifest[X]): Option[Any] = { 15 | def sameArgs = in.typeArguments.zip(m.typeArguments).forall { 16 | case (in, actual) ⇒ in >:> actual 17 | } 18 | 19 | // Special case for options 20 | val isOption = (in.erasure.toString, m.erasure.toString) match { 21 | case ("class scala.Option", "class scala.Some") ⇒ true 22 | case ("class scala.Option", "class scala.None$") ⇒ true 23 | case _ ⇒ false 24 | } 25 | 26 | if ((isOption || in >:> m) && sameArgs) Some(toUimaType(c.asInstanceOf[In])) 27 | else None 28 | } 29 | 30 | def convertFromUimaType[X](c: Any)(implicit m: Manifest[X]): Option[In] = { 31 | def sameArgs = in.typeArguments.zip(m.typeArguments).forall { 32 | case (in, actual) ⇒ in >:> actual 33 | } 34 | 35 | // Special case for options 36 | val isOption = (in.erasure.toString, m.erasure.toString) match { 37 | case ("class scala.Option", "class scala.Some") ⇒ true 38 | case ("class scala.Option", "class scala.None$") ⇒ true 39 | case _ ⇒ false 40 | } 41 | 42 | if ((isOption || in >:> m) && sameArgs) fromUimaType(c) 43 | else None 44 | } 45 | 46 | def toUimaType(in: In): Out 47 | def fromUimaType(in: Any): Option[In] 48 | } 49 | 50 | object CastFactory { 51 | 52 | import BasicCaster._ 53 | 54 | var convertSeq: Seq[Caster[_, _]] = Seq.empty 55 | 56 | register(stringCaster) 57 | register(intCaster) 58 | register(floatCaster) 59 | register(doubleCaster) 60 | register(booleanCaster) 61 | register(localeCaster) 62 | register(regexCaster) 63 | register(patternCaster) 64 | register(fileCaster) 65 | 66 | // TODO: Output error if not caster is found 67 | def toUima[A](in: A)(implicit m: Manifest[A]): Either[Failure, Option[Any]] = 68 | convertSeq.map(_.convertToUimaType(in)).find(_.isDefined) match { 69 | case Some(v) ⇒ Right(v) 70 | case None ⇒ Left(Failure("Can not find a converter for: " + m.erasure.toString)) 71 | } 72 | 73 | // TODO: Output error if not caster is found 74 | def fromUima[A](in: Any)(implicit m: Manifest[A]): Either[Failure, Option[A]] = { 75 | 
convertSeq.map(c ⇒ c.convertFromUimaType[A](in)).find(_.isDefined) match { 76 | case Some(v) ⇒ Right(v.map(_.asInstanceOf[A])) 77 | case None ⇒ Left(Failure("Can not find a converter for: " + m.erasure.toString)) 78 | } 79 | } 80 | 81 | def register[In, Out](c: Caster[In, Out])(implicit ml: Manifest[List[In]], m: Manifest[In], mo: Manifest[Out]) = 82 | convertSeq ++= Seq(c, buildListCaster(c), buildSeqCaster(c), buildOptionCaster(c)) 83 | 84 | protected def buildListCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[List[In]], m: Manifest[In], mo: Manifest[Out]) = 85 | new Caster[List[In], Array[Out]] { 86 | def toUimaType(in: List[In]) = in.map(c.toUimaType).toArray 87 | def fromUimaType(in: Any) = in match { 88 | case arr: Array[_] ⇒ sequence(arr.toList.map(c.fromUimaType)) 89 | case _ ⇒ None 90 | } 91 | } 92 | 93 | protected def buildSeqCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[Seq[In]], m: Manifest[In], mo: Manifest[Out]) = 94 | new Caster[Seq[In], Array[Out]] { 95 | def toUimaType(in: Seq[In]) = in.map(c.toUimaType).toArray 96 | def fromUimaType(in: Any) = in match { 97 | case arr: Array[_] ⇒ sequence(arr.toSeq.map(c.fromUimaType)) 98 | case _ ⇒ None 99 | } 100 | } 101 | 102 | protected def buildOptionCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[Option[In]], m: Manifest[In], mo: Manifest[Out]) = 103 | new Caster[Option[In], Out] { 104 | def toUimaType(in: Option[In]) = in.map(c.toUimaType).getOrElse(null.asInstanceOf[Out]) 105 | def fromUimaType(in: Any) = 106 | if (in != null) c.fromUimaType(in.asInstanceOf[In]) match { 107 | case Some(v) ⇒ Some(Some(v)) 108 | case None ⇒ None 109 | } 110 | else Some(None) 111 | } 112 | 113 | def sequence[A](l: List[Option[A]]) = 114 | if (l.contains(None)) None else Some(l.flatten) 115 | 116 | def sequence[A](l: Seq[Option[A]]) = 117 | if (l.contains(None)) None else Some(l.flatten) 118 | } 119 | 120 | object BasicCaster { 121 | 122 | import java.util.Locale 123 | import 
java.util.regex.Pattern 124 | import scala.util.matching.Regex 125 | 126 | val stringCaster = new Caster[String, String] { 127 | def toUimaType(in: String) = in 128 | def fromUimaType(in: Any) = in match { 129 | case s: String ⇒ Some(s) 130 | case _ ⇒ None 131 | } 132 | } 133 | 134 | val intCaster = new Caster[Int, Int] { 135 | def toUimaType(in: Int): Int = in 136 | def fromUimaType(in: Any) = in match { 137 | case i: Int ⇒ Some(i) 138 | case _ ⇒ None 139 | } 140 | } 141 | 142 | val floatCaster = new Caster[Float, Float] { 143 | def toUimaType(in: Float): Float = in 144 | def fromUimaType(in: Any) = in match { 145 | case f: Float ⇒ Some(f) 146 | case _ ⇒ None 147 | } 148 | } 149 | 150 | val doubleCaster = new Caster[Double, Float] { 151 | def toUimaType(in: Double): Float = in.toFloat 152 | def fromUimaType(in: Any) = in match { 153 | case f: Float ⇒ Some(f.toDouble) 154 | case d: Double ⇒ Some(d) 155 | case _ ⇒ None 156 | } 157 | } 158 | 159 | val booleanCaster = new Caster[Boolean, Boolean] { 160 | def toUimaType(in: Boolean): Boolean = in 161 | def fromUimaType(in: Any) = in match { 162 | case b: Boolean ⇒ Some(b) 163 | case _ ⇒ None 164 | } 165 | } 166 | 167 | val localeCaster = new Caster[Locale, String] { 168 | def toUimaType(in: Locale): String = in.getLanguage 169 | def fromUimaType(in: Any) = in match { 170 | case l: Locale ⇒ Some(l) 171 | case s: String ⇒ Some(new Locale(s)) 172 | case _ ⇒ None 173 | } 174 | } 175 | 176 | val regexCaster = new Caster[Regex, String] { 177 | def toUimaType(in: Regex): String = in.pattern.pattern 178 | def fromUimaType(in: Any) = in match { 179 | case l: Regex ⇒ Some(l) 180 | case s: String ⇒ Some(s.r) 181 | case _ ⇒ None 182 | } 183 | } 184 | 185 | val patternCaster = new Caster[Pattern, String] { 186 | def toUimaType(in: Pattern): String = in.pattern 187 | def fromUimaType(in: Any) = in match { 188 | case l: Pattern ⇒ Some(l) 189 | case s: String ⇒ Some(Pattern.compile(s)) 190 | case _ ⇒ None 191 | } 192 | } 193 | 194 | 
val fileCaster = new Caster[File, String] { 195 | def toUimaType(in: File): String = in.getAbsolutePath 196 | def fromUimaType(in: Any) = in match { 197 | case f: File ⇒ Some(f) 198 | case s: String ⇒ Some(new File(s)) 199 | case _ ⇒ None 200 | } 201 | } 202 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasAnnotator_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import configuration.Parameter 7 | import java.io.File 8 | import java.lang.reflect.Method 9 | import java.net.URL 10 | import com.github.jenshaase.uimascala.core.configuration._ 11 | import com.github.jenshaase.uimascala.core.wrapper._ 12 | import org.apache.uima.analysis_component.AnalysisComponent 13 | import org.apache.uima.analysis_component.JCasAnnotator_ImplBase 14 | import org.apache.uima.analysis_engine.AnalysisEngineDescription 15 | import org.apache.uima.jcas.JCas 16 | import org.apache.uima.jcas.tcas.Annotation 17 | import org.apache.uima.resource.ResourceInitializationException 18 | import org.apache.uima.resource.ResourceSpecifier 19 | import org.apache.uima.UimaContext 20 | import org.apache.uima.UIMAFramework 21 | import org.apache.uima.fit.factory.AnalysisEngineFactory 22 | import org.apache.uima.fit.factory.ExternalResourceFactory 23 | import scala.collection.mutable.ListBuffer 24 | import xml.Node 25 | 26 | /** 27 | * Scala Annotator. 
28 | * 29 | * Loads the parameter when initialized 30 | * 31 | * @author Jens Haase 32 | */ 33 | abstract class SCasAnnotator_ImplBase extends JCasAnnotator_ImplBase 34 | with Configurable 35 | with ConfigurationInitialization 36 | with ResourceInitialization 37 | with AsAnalysisEngine { 38 | 39 | override def initialize(context: UimaContext) = { 40 | super.initialize(context) 41 | 42 | this.loadParameter(context) 43 | this.loadResources(context) 44 | } 45 | 46 | /** 47 | * Creates a analysis engine from an Annotator instance 48 | */ 49 | def asAnalysisEngine = { 50 | val aed = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*) 51 | 52 | aed.setExternalResourceDependencies(resources.map(r ⇒ 53 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 54 | resources.foreach { r ⇒ 55 | r.createBinding(aed) 56 | } 57 | 58 | AnalysisEngineFactory.createAggregate(aed) 59 | } 60 | 61 | /** 62 | * Adds an annotation to the index 63 | * if the annotation is not empty 64 | */ 65 | def addIfNotEmpty[T <: Annotation](a: T): T = if (!a.isEmpty) { 66 | add(a) 67 | } else { 68 | a 69 | } 70 | 71 | /** 72 | * Adds a annotation to the index 73 | */ 74 | def add[T <: Annotation](a: T): T = { 75 | a.addToIndexes 76 | a 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasCollectionReader_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.cas.CAS 8 | import org.apache.uima.collection.CollectionReader_ImplBase 9 | import org.apache.uima.jcas.JCas 10 | import org.apache.uima.UimaContext 11 | import 
org.apache.uima.fit.factory.CollectionReaderFactory 12 | import org.apache.uima.fit.factory.ExternalResourceFactory 13 | 14 | abstract class SCasCollectionReader_ImplBase extends CollectionReader_ImplBase 15 | with Configurable 16 | with ConfigurationInitialization 17 | with ResourceInitialization { 18 | 19 | override def initialize = { 20 | super.initialize 21 | 22 | loadParameter(getUimaContext) 23 | loadResources(getUimaContext) 24 | initialize(getUimaContext) 25 | } 26 | 27 | def initialize(context: UimaContext) = {} 28 | 29 | def asCollectionReader = { 30 | val aed = CollectionReaderFactory.createDescription(this.niceClass, parameterKeyValues: _*) 31 | 32 | aed.setExternalResourceDependencies(resources.map(r ⇒ 33 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 34 | resources.foreach { r ⇒ 35 | r.createBinding(aed) 36 | } 37 | 38 | CollectionReaderFactory.createCollectionReader(aed) 39 | } 40 | 41 | def getNext(cas: CAS) = { 42 | getNext(cas.getJCas()) 43 | } 44 | 45 | def getNext(cas: JCas) 46 | 47 | def close() = {} 48 | } 49 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasConsumer_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import configuration._ 7 | import org.apache.uima.analysis_component.JCasAnnotator_ImplBase 8 | import org.apache.uima.UimaContext 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.factory.ExternalResourceFactory 11 | 12 | abstract class SCasConsumer_ImplBase extends JCasAnnotator_ImplBase 13 | with Configurable 14 | with ConfigurationInitialization 15 | with ResourceInitialization { 16 | 17 | override def initialize(context: UimaContext) = { 18 | super.initialize(context) 
19 | 20 | this.loadParameter(context) 21 | this.loadResources(context) 22 | } 23 | 24 | def asAnalysisEngine = { 25 | val aed = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*) 26 | 27 | aed.setExternalResourceDependencies(resources.map(r ⇒ 28 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 29 | resources.foreach { r ⇒ 30 | r.createBinding(aed) 31 | } 32 | 33 | AnalysisEngineFactory.createAggregate(aed) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasFlowController_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import configuration._ 7 | import org.apache.uima.flow.FlowControllerContext 8 | import org.apache.uima.flow.JCasFlowController_ImplBase 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.factory.FlowControllerFactory 11 | 12 | abstract class SCasFlowController_ImplBase extends JCasFlowController_ImplBase 13 | with Configurable 14 | with ConfigurationInitialization 15 | with ResourceInitialization { 16 | 17 | override def initialize(context: FlowControllerContext) = { 18 | super.initialize(context) 19 | 20 | this.loadParameter(context) 21 | } 22 | 23 | def asAnalysisEngine = { 24 | FlowControllerFactory.createFlowControllerDescription(this.niceClass, this.parameterKeyValues: _*) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasMultiplier_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 
| 6 | import configuration._ 7 | import org.apache.uima.analysis_component.JCasMultiplier_ImplBase 8 | import org.apache.uima.UimaContext 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.factory.ExternalResourceFactory 11 | 12 | abstract class SCasMultiplier_ImplBase extends JCasMultiplier_ImplBase 13 | with Configurable 14 | with ConfigurationInitialization 15 | with ResourceInitialization { 16 | 17 | override def initialize(context: UimaContext) = { 18 | super.initialize(context) 19 | 20 | this.loadParameter(context) 21 | this.loadResources(context) 22 | } 23 | 24 | def asAnalysisEngine = { 25 | val aed = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*) 26 | 27 | aed.setExternalResourceDependencies(resources.map(r ⇒ 28 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 29 | resources.foreach { r ⇒ 30 | r.createBinding(aed) 31 | } 32 | 33 | AnalysisEngineFactory.createAggregate(aed) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SimplePipeline.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.apache.uima.collection.CollectionReader 8 | 9 | @deprecated("Use org.apache.uima.fit.pipeline.SimplePipeline or uimascala-stream", "0.5.0") 10 | class SimplePipeline(reader: CollectionReader) { 11 | 12 | private var descs: Seq[AnalysisEngine] = Seq.empty 13 | 14 | def ~>(in: AsAnalysisEngine): SimplePipeline = 15 | ~>(in.asAnalysisEngine) 16 | 17 | def ~>(in: AnalysisEngine): SimplePipeline = { 18 | descs = descs :+ in 19 | this 20 | } 21 | 22 | def run() = { 23 | 
org.apache.uima.fit.pipeline.SimplePipeline.runPipeline(reader, descs: _*) 24 | } 25 | } 26 | 27 | object SimplePipeline { 28 | 29 | def apply(reader: CollectionReader) = 30 | new SimplePipeline(reader) 31 | 32 | def apply(reader: SCasCollectionReader_ImplBase) = 33 | new SimplePipeline(reader.asCollectionReader) 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/XmlDescriptor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import xml.Node 7 | 8 | trait XmlDescriptor { 9 | def xmlType: String 10 | def toXml: Node 11 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/ConfigurationInitialization.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import scala.collection.mutable.ListBuffer 7 | import org.apache.uima.analysis_component.AnalysisComponent 8 | import java.lang.reflect.Method 9 | import org.apache.uima.UimaContext 10 | import org.apache.uima.resource.ResourceInitializationException 11 | 12 | /** 13 | * Configuration Initalization trait 14 | * 15 | * This can be used whenever configuration parameters must be initalized 16 | */ 17 | trait ConfigurationInitialization { this: Configurable ⇒ 18 | 19 | private var parameterList: List[ParameterHolder] = Nil 20 | 21 | private val tArray: ListBuffer[ParameterHolder] = new ListBuffer[ParameterHolder] 22 | val methods = this.getClass.getMethods 23 | introspect(this, methods) { 24 | case (v, mf) ⇒ tArray += ParameterHolder(mf.name, v, mf) 25 | } 26 | parameterList = tArray.toList 27 | 28 | /** 29 | * Uses reflection to 
find the parameters in the class 30 | */ 31 | protected def introspect[B, V](comp: Configurable, methods: Array[Method])(f: (Method, Parameter[_]) ⇒ Any): Unit = { 32 | val potentialParams = methods.toList.filter(isParameter) 33 | 34 | val map: Map[String, List[Method]] = potentialParams.foldLeft[Map[String, List[Method]]](Map()) { 35 | case (map, method) ⇒ 36 | val name = method.getName 37 | map + (name -> (method :: map.getOrElse(name, Nil))) 38 | } 39 | 40 | val realMeth = map.values.map(_.sortWith { 41 | case (a, b) ⇒ !a.getReturnType().isAssignableFrom(b.getReturnType) 42 | }).map(_.head) 43 | 44 | for (v ← realMeth) { 45 | v.invoke(comp) match { 46 | case mf: Parameter[_] ⇒ 47 | mf.setName_!(v.getName) 48 | f(v, mf) 49 | case _ ⇒ 50 | } 51 | } 52 | } 53 | 54 | /** 55 | * Returns all parameters for the class 56 | */ 57 | def parameters = parameterList.map(_.parameter(this)) 58 | 59 | /** 60 | * Uses the uima context to set the parameter 61 | */ 62 | protected def loadParameter(context: UimaContext) = { 63 | parameters.foreach { f ⇒ 64 | val value = context.getConfigParameterValue(f.name) 65 | 66 | if (f.mandatory_? 
&& value == null) { 67 | throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, Array(f.name)) 68 | } 69 | 70 | if (value != null) { 71 | f.setFromUimaType(value) match { 72 | case Right(o) ⇒ () 73 | case Left(l) ⇒ throw new ResourceInitializationException(new ClassCastException(l.msg)) 74 | } 75 | } 76 | } 77 | } 78 | 79 | /** 80 | * Checks if a method is a subclass of Parameter 81 | */ 82 | def isParameter(m: Method) = 83 | !m.isSynthetic && classOf[Parameter[_]].isAssignableFrom(m.getReturnType) 84 | 85 | class NiceObject[T <: AnyRef](x: T) { 86 | def niceClass: Class[_ <: T] = x.getClass.asInstanceOf[Class[T]] 87 | } 88 | implicit def toNiceObject[T <: AnyRef](x: T) = new NiceObject(x) 89 | 90 | def parameterKeyValues: Array[Object] = parameters.flatMap { f ⇒ 91 | Array(f.name, f.toUimaType match { 92 | case Right(o) ⇒ o 93 | case Left(l) ⇒ throw new ResourceInitializationException(new ClassCastException(l.msg)) 94 | }) 95 | }.toArray 96 | 97 | case class ParameterHolder(name: String, method: Method, metaParameter: Parameter[_]) { 98 | def parameter(inst: Configurable): Parameter[_] = method.invoke(inst).asInstanceOf[Parameter[_]] 99 | } 100 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/Parameter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.apache.uima.analysis_component.AnalysisComponent 7 | 8 | trait Configurable {} 9 | 10 | class ConfigurationBuilder[T <: Configurable](conf: T) { 11 | def config(mutators: T ⇒ Unit*) = { 12 | for (f ← mutators) f(conf) 13 | conf 14 | } 15 | } 16 | 17 | /** 18 | * Base Parameter trait 19 | */ 20 | trait BaseParameter { 21 | 22 | // The parameter name 23 | private var fieldName: String = _ 24 | private var 
set = false 25 | 26 | protected def set_?(b: Boolean) = set = b 27 | 28 | def set_? : Boolean = set 29 | 30 | /** 31 | * Returns the parameter name 32 | */ 33 | def name: String = fieldName 34 | 35 | /** 36 | * Returns the parameter description 37 | * Default: None 38 | */ 39 | def description: Option[String] = None 40 | 41 | /** 42 | * Is this parameter mandatory? 43 | */ 44 | def mandatory_? = true 45 | 46 | /** 47 | * If the parameter can take multiple values (collections) 48 | */ 49 | def multiValued_? = false 50 | 51 | /** 52 | * Default is string. 53 | * Also possible: Integer, Float, Boolean 54 | */ 55 | def uimaType: String 56 | 57 | /** 58 | * Sets the parameter name 59 | */ 60 | private[configuration] final def setName_!(newName: String): String = { 61 | fieldName = newName 62 | fieldName 63 | } 64 | } 65 | 66 | case class Failure(msg: String, exception: Option[Exception] = None) 67 | 68 | /** 69 | * A typed parameter 70 | */ 71 | abstract class Parameter[ThisType](val defaultValue: ThisType)(implicit mf: Manifest[ThisType]) 72 | extends BaseParameter { 73 | 74 | import com.github.jenshaase.uimascala.core.CastFactory._ 75 | 76 | private var data: Option[ThisType] = None 77 | 78 | /** 79 | * Sets a new value to this parameter 80 | */ 81 | def :=(in: ThisType) = 82 | data = Some(in) 83 | 84 | /** 85 | * Set the parameter value by an object 86 | */ 87 | def setFromUimaType(in: Any): Either[Failure, ThisType] = fromUima[ThisType](in) match { 88 | case Right(Some(d)) if d.isInstanceOf[ThisType] ⇒ { 89 | :=(d.asInstanceOf[ThisType]); Right(d) 90 | } 91 | case Right(_) ⇒ Left(Failure("Value could not be casted: " + in.toString)) 92 | case Left(l) ⇒ Left(l) 93 | } 94 | 95 | /** 96 | * Coverts this parameter value to a uima type 97 | */ 98 | def toUimaType: Either[Failure, Object] = toUima(value) match { 99 | case Right(Some(s)) ⇒ Right(s.asInstanceOf[Object]) 100 | case Right(None) ⇒ Left(Failure("Value could not be casted: " + value)) 101 | case Left(l) ⇒ 
Left(l) 102 | } 103 | 104 | /** 105 | * Checks if the parameter is mutlivalued 106 | */ 107 | override def multiValued_? = mf.erasure.toString match { 108 | case "class scala.collection.immutable.List" ⇒ true 109 | case "interface scala.collection.Seq" ⇒ true 110 | case s: String if (s.startsWith("class [L")) ⇒ true 111 | case _ ⇒ false 112 | } 113 | 114 | def value: ThisType = data getOrElse defaultValue 115 | 116 | def is = value 117 | 118 | def get = value 119 | 120 | def uimaType = 121 | if (multiValued_?) 122 | _uimaType(mf.typeArguments.head.erasure.toString) 123 | else 124 | _uimaType(mf.erasure.toString) 125 | 126 | def _uimaType(s: String) = s match { 127 | case "int" | "class java.lang.Integer" ⇒ "Integer" 128 | case "float" ⇒ "Float" 129 | case "boolean" ⇒ "Boolean" 130 | case _ ⇒ "String" 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/Resource.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import java.net.URL 7 | import java.io.File 8 | import org.apache.uima.resource.ResourceSpecifier 9 | import org.apache.uima.fit.factory.ExternalResourceFactory 10 | import org.apache.uima.resource.SharedResourceObject 11 | 12 | trait BaseResource { 13 | 14 | private var resourceKey: String = _ 15 | 16 | def name: String = resourceKey 17 | 18 | def description: String = "" 19 | 20 | def interfaceName: String 21 | 22 | def mandatory_? 
= true 23 | 24 | private[configuration] final def setName_!(newName: String): String = { 25 | resourceKey = newName 26 | resourceKey 27 | } 28 | } 29 | 30 | trait TypedResource[ThisType, ParamType] extends BaseResource { 31 | 32 | private var boundResource: Option[ThisType] = None 33 | private[configuration] var parameters: Option[Map[ParamType, ParamType]] = None 34 | 35 | def params = parameters getOrElse defaultParameter 36 | 37 | def defaultParameter: Map[ParamType, ParamType] 38 | 39 | def parameterList: Seq[ParamType] = 40 | params.toSeq.flatMap(p ⇒ List(p._1, p._2)) 41 | 42 | def setFromUima(a: Any) = a match { 43 | case x: ThisType ⇒ Right(bind(x)) 44 | case Some(x: ThisType) ⇒ Right(bind(x)) 45 | case _ ⇒ Left(Failure("Can not bind resource from uima context: " + name)) 46 | } 47 | 48 | def bind(newResource: ThisType) = { 49 | boundResource = Some(newResource) 50 | boundResource 51 | } 52 | 53 | def resource = boundResource get 54 | 55 | def createBinding(aed: ResourceSpecifier) 56 | 57 | def className: Class[_ <: ThisType] 58 | 59 | def interfaceName = className.getName 60 | } 61 | 62 | case class SharedBinding[T](url: String, params: Map[Object, Object] = Map.empty) 63 | object SharedBinding { 64 | 65 | def apply[T](url: URL) = 66 | new SharedBinding[T](url.toString, Map.empty) 67 | 68 | def apply[T](url: URL, params: Map[Object, Object]) = 69 | new SharedBinding[T](url.toString, params) 70 | 71 | def apply[T](url: File) = 72 | new SharedBinding[T](url.toURI().toURL().toString, Map.empty) 73 | 74 | def apply[T](url: File, params: Map[Object, Object]) = 75 | new SharedBinding[T](url.toURI().toURL().toString, params) 76 | } 77 | 78 | abstract class SharedResource[ThisType <: SharedResourceObject]( 79 | val defaultURL: String, 80 | val defaultParams: Map[Object, Object] = Map.empty)(implicit m: Manifest[ThisType]) 81 | extends TypedResource[ThisType, Object] { 82 | 83 | private var dataUrl: Option[String] = None 84 | private var clazz: Option[Class[_ <: 
ThisType]] = None 85 | 86 | def this(defaultUrl: URL, defaultParams: Map[Object, Object])(implicit m: Manifest[ThisType]) = 87 | this(defaultUrl.toString, defaultParams) 88 | 89 | def this(defaultUrl: File, defaultParams: Map[Object, Object])(implicit m: Manifest[ThisType]) = 90 | this(defaultUrl.toURI().toURL(), defaultParams) 91 | 92 | def :=[T <: ThisType](bind: SharedBinding[T])(implicit mf: Manifest[T]) = { 93 | clazz = Some(mf.erasure.asInstanceOf[Class[T]]) 94 | dataUrl = Some(bind.url) 95 | parameters = Some(bind.params) 96 | } 97 | 98 | def url = dataUrl getOrElse defaultURL 99 | 100 | def defaultParameter = defaultParams 101 | 102 | def defaultClass = m.erasure.asInstanceOf[Class[ThisType]] 103 | 104 | def className: Class[_ <: ThisType] = clazz getOrElse defaultClass 105 | 106 | // format: OFF 107 | def createBinding(aed: ResourceSpecifier) = { 108 | ExternalResourceFactory.bindResource( 109 | aed, 110 | name, 111 | className, 112 | url, 113 | parameterList:_*) 114 | } 115 | } 116 | 117 | case class Binding[T](params: Map[String, String] = Map.empty) 118 | 119 | abstract class Resource[ThisType <: org.apache.uima.resource.Resource]( 120 | val defaultParams: Map[String, String] = Map.empty)(implicit m: Manifest[ThisType]) 121 | extends TypedResource[ThisType, String] { 122 | 123 | private var clazz: Option[Class[_ <: ThisType]] = None 124 | 125 | def defaultClass = m.erasure.asInstanceOf[Class[ThisType]] 126 | 127 | def :=[T <: ThisType](bind: Binding[T])(implicit mf: Manifest[T]) = { 128 | clazz = Some(mf.erasure.asInstanceOf[Class[T]]) 129 | parameters = Some(bind.params) 130 | } 131 | 132 | def defaultParameter = defaultParams 133 | 134 | def className: Class[_ <: ThisType] = clazz getOrElse defaultClass 135 | 136 | // format: OFF 137 | def createBinding(aed: ResourceSpecifier) = { 138 | ExternalResourceFactory.bindResource( 139 | aed, 140 | name, 141 | className, 142 | parameterList:_*) 143 | } 144 | } 145 | 
-------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/ResourceInitialization.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import java.lang.reflect.Method 7 | import org.apache.uima.resource.ResourceAccessException 8 | import org.apache.uima.resource.ResourceInitializationException 9 | import org.apache.uima.UimaContext 10 | import org.apache.uima.fit.descriptor.ExternalResourceLocator 11 | import scala.collection.mutable.ListBuffer 12 | 13 | trait ResourceInitialization { this: Configurable ⇒ 14 | 15 | private var resourceList: List[ResourceHolder] = Nil 16 | 17 | private val resTempArray: ListBuffer[ResourceHolder] = new ListBuffer[ResourceHolder] 18 | val resMethods = this.getClass.getMethods 19 | introspectResources(this, resMethods) { 20 | case (v, mf) ⇒ { 21 | resTempArray += ResourceHolder(mf.name, v, mf) 22 | } 23 | } 24 | resourceList = resTempArray.toList 25 | 26 | protected def introspectResources(comp: Configurable, methods: Array[Method])(f: (Method, TypedResource[_, _]) ⇒ Any): Unit = { 27 | val potentialResources = methods.toList.filter(isResource) 28 | 29 | val map: Map[String, List[Method]] = potentialResources.foldLeft[Map[String, List[Method]]](Map()) { 30 | case (map, method) ⇒ 31 | val name = method.getName 32 | map + (name -> (method :: map.getOrElse(name, Nil))) 33 | } 34 | 35 | val realMeth = map.values.map(_.sortWith { 36 | case (a, b) ⇒ !a.getReturnType().isAssignableFrom(b.getReturnType) 37 | }).map(_.head) 38 | 39 | for (v ← realMeth) { 40 | v.invoke(comp) match { 41 | case mf: TypedResource[_, _] ⇒ 42 | mf.setName_!(v.getName) 43 | f(v, mf) 44 | case _ ⇒ 45 | } 46 | } 47 | } 48 | 49 | def resources = resourceList.map(_.resource(this)) 50 | 51 | def loadResources(context: 
UimaContext) = { 52 | resources.foreach { r ⇒ 53 | var value: Object = null; 54 | try { 55 | value = context.getResourceObject(r.name) 56 | } catch { 57 | case e: Exception ⇒ throw new ResourceInitializationException(e) 58 | } 59 | 60 | if (value.isInstanceOf[ExternalResourceLocator]) { 61 | value = value.asInstanceOf[ExternalResourceLocator].getResource() 62 | } 63 | 64 | if (r.mandatory_? && value == null) { 65 | throw new ResourceInitializationException(new IllegalStateException("Mandatory resource '%s' is not set".format(r.name))) 66 | } 67 | 68 | if (value != null) { 69 | r.setFromUima(value) match { 70 | case Left(f: Failure) ⇒ throw f.exception.map(new ResourceInitializationException(_)).getOrElse(new ResourceInitializationException()) 71 | case _ ⇒ 72 | } 73 | } 74 | } 75 | } 76 | 77 | def isResource(m: Method) = 78 | !m.isSynthetic && classOf[TypedResource[_, _]].isAssignableFrom(m.getReturnType) 79 | 80 | case class ResourceHolder(name: String, method: Method, metaParameter: TypedResource[_, _]) { 81 | def resource(inst: Configurable): TypedResource[_, _] = method.invoke(inst).asInstanceOf[TypedResource[_, _]] 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala 5 | 6 | import org.apache.uima.jcas.tcas.Annotation 7 | import org.apache.uima.jcas.JCas 8 | import org.apache.uima.jcas.cas.FSArray 9 | import com.github.jenshaase.uimascala.core.wrapper._ 10 | import com.github.jenshaase.uimascala.core.configuration._ 11 | import org.apache.uima.collection.CollectionReader 12 | 13 | package object core { 14 | 15 | implicit def toScalaAnnotation(a: Annotation) = new AnnotationWrapper(a) 16 | 17 | implicit def toScalaCas(jcas: JCas) = new JCasWrapper(jcas) 18 | 19 | implicit 
def configBuilder[T <: Configurable](conf: T) = new ConfigurationBuilder(conf) 20 | 21 | @deprecated("See com.github.jenshaase.uimascala.core.SimplePipeline", "0.5.0") 22 | implicit def collectionReaderToPipeline(reader: SCasCollectionReader_ImplBase) = new SimplePipeline(reader.asCollectionReader) 23 | 24 | @deprecated("See com.github.jenshaase.uimascala.core.SimplePipeline", "0.5.0") 25 | implicit def collectionReaderToPipeline(reader: CollectionReader) = new SimplePipeline(reader) 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/stream/annotators.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.core.stream 2 | 3 | import scala.util.matching.Regex 4 | import org.apache.uima.jcas.tcas.Annotation 5 | import scala.reflect.ClassTag 6 | import com.github.jenshaase.uimascala.core._ 7 | import org.apache.uima.jcas.JCas 8 | 9 | trait annotators { 10 | 11 | @deprecated("Use com.github.jenshaase.uimascala.segmenter.RegexTokenizer") 12 | def regexTokenizer[F[_], T <: Annotation](pattern: Regex, allowEmptyToken: Boolean = true)(implicit cf: ClassTag[T]) = 13 | annotate[F] { cas: JCas => 14 | val txt = cas.getDocumentText 15 | 16 | val mostlyAll = pattern.findAllMatchIn(txt).foldLeft(0) { 17 | case (last, m) if ((allowEmptyToken && m.start >= last) || (!allowEmptyToken && m.start > last)) ⇒ 18 | cas.annotate[T](last, m.start) 19 | m.end 20 | case (_, m) => 21 | m.end 22 | } 23 | 24 | if (mostlyAll < txt.length) 25 | cas.annotate[T](mostlyAll, txt.length) 26 | } 27 | 28 | @deprecated("Use com.github.jenshaase.uimascala.segmenter.WhitespaceTokenizer") 29 | def whitespaceTokenizer[F[_], T <: Annotation](allowEmptyToken: Boolean = true)(implicit cf: ClassTag[T]) = 30 | regexTokenizer[F, Annotation]("\\s+".r, allowEmptyToken) 31 | 32 | def removeStopwords[F[_], T <: Annotation](isStopword: String => 
Boolean)(implicit cf: ClassTag[T]) = 33 | annotate[F] { cas: JCas => 34 | cas.select[T]. 35 | filter { token => isStopword(token.getCoveredText) }. 36 | foreach { token => token.removeFromIndexes() } 37 | } 38 | 39 | def annotateStopwords[F[_], Token <: Annotation, Stopword <: Annotation](isStopword: String => Boolean) 40 | (implicit ct: ClassTag[Token], cs: ClassTag[Stopword]) = 41 | annotate[F] { cas: JCas => 42 | cas.select[Token].foreach { token => 43 | if (isStopword(token.getCoveredText)) { 44 | cas.annotate[Stopword](token.getBegin, token.getEnd) 45 | } 46 | } 47 | } 48 | } 49 | 50 | object annotators extends annotators 51 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/stream/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.apache.uima.jcas.tcas.Annotation 7 | import org.apache.uima.analysis_engine.AnalysisEngine; 8 | import org.apache.uima.analysis_engine.AnalysisEngineDescription; 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine 10 | import org.apache.uima.jcas.JCas 11 | import org.apache.uima.util.CasCreationUtils 12 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 13 | import fs2._ 14 | 15 | package object stream { 16 | 17 | type AnnotatorProcess[F[_]] = Pipe[F, JCas, JCas] 18 | 19 | def annotate[F[_]](f: (JCas => Any)): AnnotatorProcess[F] = 20 | _.map { cas => 21 | f(cas) 22 | cas 23 | } 24 | 25 | def annotate[F[_]](a: AnalysisEngine): AnnotatorProcess[F] = 26 | _.map { cas => 27 | a.process(cas) 28 | cas 29 | } 30 | 31 | def annotate[F[_]](a: AnalysisEngineDescription): AnnotatorProcess[F] = 32 | annotate(createEngine(a)) 33 | 34 | def annotate[F[_]](a: AsAnalysisEngine): AnnotatorProcess[F] = 35 | annotate(a.asAnalysisEngine) 36 | 37 | def initCas[F[_], 
I](f: ((I, JCas) => Any)): Pipe[F, I, JCas] = 38 | _.map { something => 39 | val cas = CasCreationUtils.createCas( 40 | TypeSystemDescriptionFactory.createTypeSystemDescription, null, null).getJCas 41 | 42 | f(something, cas) 43 | cas 44 | } 45 | 46 | def casFromText[F[_]] = initCas[F, String] { (str ,cas) => 47 | cas.setDocumentText(str) 48 | } 49 | 50 | def extractCas[F[_], I](f: JCas => I): Pipe[F, JCas, I] = 51 | _.map(f) 52 | } 53 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/wrapper/AnnotationWrapper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.apache.uima.jcas.tcas.Annotation 7 | 8 | /** 9 | * A Uima Annotation wrapper for implicity. 10 | * @author Jens Haase 11 | */ 12 | class AnnotationWrapper(a: Annotation) { 13 | 14 | /** 15 | * Remove whitespace before and after the annotation 16 | * by increasing/decreasing the begin/end value 17 | */ 18 | def trim: Annotation = { 19 | var begin = a.getBegin 20 | var end = a.getEnd - 1 21 | 22 | val data = a.getCAS.getDocumentText 23 | 24 | while (begin < (data.length - 1) && trimChar(data.charAt(begin))) 25 | begin += 1 26 | 27 | while (end > 0 && trimChar(data.charAt(end))) 28 | end -= 1 29 | 30 | end += 1 31 | a.setBegin(begin) 32 | a.setEnd(end) 33 | 34 | a 35 | } 36 | 37 | /** 38 | * Add annotation to index if the covering text 39 | * of the annotation is not empty 40 | */ 41 | def addToIndexIfNotEmpty = if (!isEmpty) a.addToIndexes 42 | 43 | /** 44 | * Checks if the covering text of the annotation 45 | * is empty 46 | */ 47 | def isEmpty = a.getBegin >= a.getEnd 48 | 49 | protected def trimChar(c: Char): Boolean = c match { 50 | case '\n' ⇒ true 51 | case '\r' ⇒ true 52 | case '\t' ⇒ true 53 | case '\u200E' ⇒ true 54 | case '\u200F' ⇒ true 55 | case 
'\u2028' ⇒ true 56 | case '\u2029' ⇒ true 57 | case _ ⇒ Character.isWhitespace(c) 58 | } 59 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/wrapper/JCasWrapper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.cas.text.AnnotationFS 8 | import org.apache.uima.jcas.tcas.Annotation 9 | import org.apache.uima.jcas.cas.TOP 10 | import scala.collection.JavaConversions._ 11 | import org.apache.uima.cas.FeatureStructure 12 | import scala.collection.JavaConversions._ 13 | import collection.mutable.Buffer 14 | import org.apache.uima.fit.util.{ CasUtil, JCasUtil } 15 | import scala.reflect.ClassTag 16 | 17 | /** 18 | * A JCas wrapper for implicity 19 | * @author Jens Haase 20 | */ 21 | class JCasWrapper(cas: JCas) { 22 | 23 | def create[T <: TOP](f: (T => Unit)*)(implicit cf: ClassTag[T]): T = { 24 | val constructor = cf.runtimeClass.getConstructor(classOf[JCas]) 25 | val obj = constructor.newInstance(cas).asInstanceOf[T] 26 | f.foreach { f => f(obj) } 27 | obj.addToIndexes() 28 | obj 29 | } 30 | 31 | def annotate[T <: Annotation](begin: Int, end: Int)(implicit cf: ClassTag[T]): T = { 32 | val constructor = cf.runtimeClass.getConstructor(classOf[JCas]) 33 | val obj = constructor.newInstance(cas).asInstanceOf[T] 34 | obj.setBegin(begin) 35 | obj.setEnd(end) 36 | obj.addToIndexes() 37 | obj 38 | } 39 | 40 | /** 41 | * @see org.apache.uima.fit.uitl.JCasUtil#select 42 | */ 43 | def select[T <: TOP](implicit cf: ClassTag[T]): Iterable[T] = 44 | JCasUtil.select(cas, cf.runtimeClass.asInstanceOf[Class[T]]) 45 | 46 | /** 47 | * @see org.apache.uima.fit.uitl.JCasUtil#selectByIndex 48 | */ 49 | def selectByIndex[T <: Annotation](index: Int)(implicit cf: ClassTag[T]) = 50 | 
JCasUtil.selectByIndex(cas, cf.runtimeClass.asInstanceOf[Class[T]], index) 51 | 52 | /** 53 | * @see org.apache.uima.fit.uitl.JCasUtil#selectCovered 54 | */ 55 | def selectCovered[T <: Annotation](coveringAnnotation: Annotation)(implicit cf: ClassTag[T]) = 56 | JCasUtil.selectCovered(cas, cf.runtimeClass.asInstanceOf[Class[T]], coveringAnnotation) 57 | 58 | /** 59 | * @see org.apache.uima.fit.uitl.JCasUtil#selectCovered 60 | */ 61 | def selectCovered[T <: Annotation](begin: Int, end: Int)(implicit cf: ClassTag[T]) = 62 | JCasUtil.selectCovered(cas, cf.runtimeClass.asInstanceOf[Class[T]], begin, end) 63 | 64 | /** 65 | * @see org.apache.uima.fit.uitl.JCasUtil#selectSingle 66 | */ 67 | def selectSingle[T <: TOP](implicit cf: ClassTag[T]) = 68 | JCasUtil.selectSingle(cas, cf.runtimeClass.asInstanceOf[Class[T]]) 69 | 70 | /** 71 | * @see org.apache.uima.fit.uitl.JCasUtil#selectPreceding 72 | */ 73 | def selectPreceding[T <: Annotation](annotation: Annotation, count: Int = Int.MaxValue)(implicit cf: ClassTag[T]): Buffer[T] = { 74 | JCasUtil.selectPreceding(cas, cf.runtimeClass.asInstanceOf[Class[T]], annotation, count); 75 | } 76 | 77 | /** 78 | * @see org.apache.uima.fit.uitl.JCasUtil#selectFollowing 79 | */ 80 | def selectFollowing[T <: Annotation](annotation: Annotation, count: Int = Int.MaxValue)(implicit cf: ClassTag[T]): Buffer[T] = { 81 | JCasUtil.selectFollowing(cas, cf.runtimeClass.asInstanceOf[Class[T]], annotation, count) 82 | } 83 | 84 | /** 85 | * @see org.apache.uima.fit.uitl.JCasUtil#exists 86 | */ 87 | def exists[T <: TOP](implicit ct: ClassTag[T]) = 88 | JCasUtil.exists(cas, ct.runtimeClass.asInstanceOf[Class[T]]) 89 | 90 | /** 91 | * @see org.apache.uima.fit.uitl.JCasUtil#getView 92 | */ 93 | def getView(name: String, fallback: JCas) = 94 | JCasUtil.getView(cas, name, fallback) 95 | 96 | /** 97 | * @see org.apache.uima.fit.uitl.JCasUtil#getView 98 | */ 99 | def getView(name: String, create: Boolean) = 100 | JCasUtil.getView(cas, name, create) 101 | } 
102 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/ConverterSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.specs2._ 7 | import matcher._ 8 | import java.util.regex.Pattern 9 | import java.util.Locale 10 | import java.io.File 11 | import scala.util.matching.Regex 12 | 13 | class ConverterSpec extends Specification { 14 | import CastFactory._ 15 | 16 | // format: OFF 17 | def is = s2""" 18 | The Formatter should 19 | convert string ${convert("test", "test")} 20 | convert int ${convert(1, 1)} 21 | convert float ${convert(1.2f, 1.2f)} 22 | convert double ${convert(1.2d, 1.2d)} 23 | convert boolean ${convert(true, true)} 24 | convert locale ${convert(new Locale("en"), "en")} 25 | convert pattern ${convert(Pattern.compile("[A-Z]*"), "[A-Z]*", Some(patToString _))} 26 | convert regex ${convert("[A-Z]*".r, "[A-Z]*", Some(regToString _))} 27 | convert file ${convert(new File("/test/abc"), "/test/abc")} 28 | 29 | convert string list ${convert(List("t", "v"), Array("t", "v"))} 30 | convert int list ${convert(List(1, 2), Array(1, 2))} 31 | convert float list ${convert(List(1.2f, 2.3f), Array(1.2f, 2.3f))} 32 | convert double list ${convert(List(1.2d, 2.3d), Array(1.2d, 2.3d))} 33 | convert boolean list ${convert(List(true, false, true), Array(true, false, true))} 34 | convert pattern list ${convert(List(Pattern.compile("[A-Z]"), Pattern.compile("[1-4]")), Array("[A-Z]", "[1-4]"), Some{in: List[Pattern] => in.map(patToString)})} 35 | convert regex list ${convert(List("[A-Z]".r, "[1-4]".r), Array("[A-Z]", "[1-4]"), Some{in: List[Regex] => in.map(regToString)})} 36 | convert file list ${convert(List(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 37 | convert locale list ${convert(List(new Locale("de"), 
new Locale("en")), Array("de", "en"))} 38 | convert file list ${convert(List(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 39 | 40 | convert string Seq ${convert(Seq("t", "v"), Array("t", "v"))} 41 | convert int Seq ${convert(Seq(1, 2), Array(1, 2))} 42 | convert float Seq ${convert(Seq(1.2f, 2.3f), Array(1.2f, 2.3f))} 43 | convert double Seq ${convert(Seq(1.2d, 2.3d), Array(1.2d, 2.3d))} 44 | convert boolean Seq ${convert(Seq(true, false, true), Array(true, false, true))} 45 | convert pattern Seq ${convert(Seq(Pattern.compile("[A-Z]"), Pattern.compile("[1-4]")), Array("[A-Z]", "[1-4]"), Some{in: Seq[Pattern] => in.map(patToString)})} 46 | convert regex Seq ${convert(Seq("[A-Z]".r, "[1-4]".r), Array("[A-Z]", "[1-4]"), Some{in: Seq[Regex] => in.map(regToString)})} 47 | convert file Seq" ${convert(Seq(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 48 | convert locale Seq" ${convert(Seq(new Locale("de"), new Locale("en")), Array("de", "en"))} 49 | convert file Seq" ${convert(Seq(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 50 | 51 | convert string option ${convert(Some("test"), "test")} 52 | convert int option ${convert(Some(1), 1)} 53 | convert float option ${convert(Some(1.2f), 1.2f)} 54 | convert double option ${convert(Some(1.2d), 1.2d)} 55 | convert boolean option ${convert(Some(true), true)} 56 | convert locale option ${convert(Some(new Locale("en")), "en")} 57 | convert pattern option ${convert(Some(Pattern.compile("[A-Z]*")), "[A-Z]*", Some(optPatToString _))} 58 | convert regex option ${convert(Some("[A-Z]*".r), "[A-Z]*", Some(optRegToString _))} 59 | convert file option ${convert(Some(new File("/test/abc")), "/test/abc")} 60 | convert none option ${convert(None, null)} 61 | """ 62 | 63 | 64 | def convert[T, R](in: T, out: R, func: Option[T => _] = None)(implicit m: Manifest[T], r: Manifest[R]) = { 65 | val to = toUima(in) 66 | to must beRight 67 | to.right.get must beSome 68 | 
to.right.get.get must_== out 69 | 70 | val from = fromUima[T](out) 71 | from must beRight 72 | from.right.get must beSome 73 | func match { 74 | case Some(f) => f(from.right.get.get) must_== f(in) 75 | case None => from.right.get.get must_== in 76 | } 77 | } 78 | 79 | def patToString(in: Pattern) = in.pattern 80 | def regToString(in: Regex) = in.pattern.pattern 81 | def optPatToString(in: Option[Pattern]) = in.map(_.pattern) 82 | def optRegToString(in: Option[Regex]) = in.map(_.pattern.pattern) 83 | } 84 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/SCasAnnotator_ImplBaseSpecs.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.specs2.mutable.Specification 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | import org.apache.uima.jcas.JCas 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.resource.Resource_ImplBase 11 | import org.apache.uima.resource.SharedResourceObject 12 | import org.apache.uima.resource.DataResource 13 | 14 | class SCasAnnotator_ImplBaseSpecs extends Specification { 15 | 16 | "SCasAnnotator_ImplBase" should { 17 | 18 | "initialize one string parameter in a Annotator" in { 19 | val d = new DummyAnnotator().config( 20 | _.stringParam := "dummy").asAnalysisEngine 21 | 22 | val cas = d.newJCas 23 | d.process(cas) 24 | 25 | cas.getDocumentText must be equalTo ("dummy") 26 | } 27 | 28 | "initalize two parameters in a Annotator" in { 29 | val d = new Dummy2Annotator().config( 30 | _.stringParam := "dummy", 31 | _.intParam := 1).asAnalysisEngine 32 | 33 | val cas = d.newJCas 34 | d.process(cas) 35 | 36 | cas.getDocumentText must be equalTo ("dummy1") 37 | } 38 | 39 | "initalize list parameter in a Annotator" in { 40 | val d = new Dummy3Annotator().config( 
41 | _.listParam := List("2", "3")).asAnalysisEngine 42 | 43 | val cas = d.newJCas 44 | d.process(cas) 45 | 46 | cas.getDocumentText must be equalTo ("23") 47 | } 48 | 49 | "not require to set a optinal value" in { 50 | val d = new Dummy2Annotator().config( 51 | _.stringParam := "dummy").asAnalysisEngine 52 | 53 | val cas = d.newJCas 54 | d.process(cas) 55 | 56 | cas.getDocumentText must be equalTo ("dummy100") 57 | } 58 | 59 | "initialize a Annotator with a SharedResourceObject" in { 60 | val d = new ResourceDummyAnnotator().config( 61 | _.dict := SharedBinding[SharedDict](new java.io.File("/path/to/nowhere")), 62 | _.name := Binding[SharedName2]()).asAnalysisEngine 63 | val cas = d.newJCas 64 | d.process(cas) 65 | 66 | cas.getDocumentText() must be equalTo ("SharedDict|SharedName2") 67 | } 68 | } 69 | } 70 | 71 | class DummyAnnotator extends SCasAnnotator_ImplBase { 72 | 73 | object stringParam extends Parameter[String]("test") 74 | 75 | def process(cas: JCas) = { 76 | cas.setDocumentText(stringParam.is) 77 | } 78 | } 79 | 80 | class Dummy2Annotator extends SCasAnnotator_ImplBase { 81 | 82 | object stringParam extends Parameter[String]("test") 83 | object intParam extends Parameter[Int](100) 84 | 85 | def process(cas: JCas) = { 86 | cas.setDocumentText(stringParam.is + intParam.is) 87 | } 88 | } 89 | 90 | class Dummy3Annotator extends SCasAnnotator_ImplBase { 91 | 92 | object listParam extends Parameter[List[String]](List("a", "b")) 93 | 94 | def process(cas: JCas) = { 95 | cas.setDocumentText(listParam.is.foldLeft("")(_ + _)) 96 | } 97 | } 98 | 99 | class SharedDict extends SharedResourceObject { 100 | def load(data: DataResource) = {} 101 | 102 | def name = "SharedDict" 103 | } 104 | class SharedName extends Resource_ImplBase { def name = "SharedName" } 105 | class SharedName2 extends SharedName { override def name = "SharedName2" } 106 | 107 | class ResourceDummyAnnotator extends SCasAnnotator_ImplBase { 108 | object dict extends 
SharedResource[SharedDict]("/path/to/nowhere") 109 | object name extends Resource[SharedName] 110 | 111 | def process(cas: JCas) = { 112 | cas.setDocumentText(dict.resource.name + "|" + name.resource.name); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/SimplePipelineSpecs.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.specs2.mutable.Specification 7 | import org.apache.uima.jcas.JCas 8 | import org.apache.uima.util.ProgressImpl 9 | import org.apache.uima.jcas.tcas.Annotation 10 | 11 | class SimplePipelineSpecs extends Specification { 12 | 13 | "SimplePipeline" should { 14 | "add one reader and one annotator" in { 15 | try { 16 | new PipelineDummyReader() ~> new PipelineAnnotatorA() run () 17 | 18 | success 19 | } catch { 20 | // Catch only Exception: a bare `case e ⇒` also traps fatal Throwables 21 | case e: Exception ⇒ { 22 | e.printStackTrace() 23 | failure 24 | } 25 | } 26 | } 27 | 28 | "add one reader and two annotator" in { 29 | try { 30 | new PipelineDummyReader() ~> 31 | new PipelineAnnotatorA() ~> 32 | new PipelineAnnotatorB() run () 33 | 34 | success 35 | } catch { 36 | case _: Exception ⇒ failure 37 | } 38 | } 39 | } 40 | } 41 | 42 | class PipelineDummyReader extends SCasCollectionReader_ImplBase { 43 | val total = 2 44 | var i = total 45 | 46 | def getNext(cas: JCas) = { 47 | cas.setDocumentText("Doc" + i) 48 | i = i - 1 49 | } 50 | 51 | def getProgress = Array(new ProgressImpl(total - i, total, "test")) 52 | 53 | def hasNext = i > 0 54 | } 55 | 56 | class PipelineAnnotatorA extends SCasAnnotator_ImplBase { 57 | def process(cas: JCas) = { 58 | new Annotation(cas, 0, 1).addToIndexes 59 | } 60 | } 61 | 62 | class PipelineAnnotatorB extends SCasAnnotator_ImplBase { 63 | def process(cas: JCas) = { 64 | new Annotation(cas, 1, 2).addToIndexes 65 | } 66 | }
-------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ConfigurationInitalizationSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.specs2.Specification 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | 9 | class ConfigurationInitalizationSpec extends Specification { 10 | def is = s2""" 11 | This is a specification to check the configuration system 12 | 13 | ConfigMock should 14 | 15 | have 4 parameters ${nbParams(4)} 16 | have a parameter called 'stringParam' ${hasParam("stringParam")} 17 | have a parameter called 'optStringParam' ${hasParam("stringListParam")} 18 | have a parameter called 'intParam' ${hasParam("intParam")} 19 | have a parameter called 'optIntParam' ${hasParam("intListParam")} 20 | """ 21 | 22 | def hasParam(name: String) = 23 | new ConfigMock().parameters.map(_.name).contains(name) must beTrue 24 | 25 | def nbParams(count: Int) = 26 | new ConfigMock().parameters.size must be equalTo (count) 27 | 28 | def createKeyValues = { 29 | val config = new ConfigMock() 30 | config.stringParam := "Test" 31 | config.intParam := 100 32 | 33 | config.parameterKeyValues.toList. 
34 | sliding(2, 2).map(l ⇒ Pair(l(0).asInstanceOf[String], l(1))).toList.sortBy(_._1) must be equalTo (List( 35 | ("intListParam", Array(1, 2).asInstanceOf[Object]), 36 | ("intParam", 100.asInstanceOf[Object]), 37 | ("stringListParam", Array("ab", "cd").asInstanceOf[Object]), 38 | ("stringParam", "Test".asInstanceOf[Object]))) 39 | } 40 | } 41 | 42 | class ConfigMock extends Configurable with ConfigurationInitialization { 43 | object stringParam extends Parameter[String]("test") 44 | object stringListParam extends Parameter[List[String]](List("ab", "cd")) 45 | 46 | object intParam extends Parameter[Int](1) 47 | object intListParam extends Parameter[List[Int]](List(1, 2)) 48 | 49 | object somethingElse { 50 | def someMethod = "Anything" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ParameterSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.specs2.Specification 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | import java.util.regex.Pattern 9 | 10 | class ParameterSpec extends Specification { 11 | 12 | // format: OFF 13 | def is = s2""" 14 | A Parameter can 15 | return default values if not value is set ${defaultVal} 16 | have a new value ${newVal} 17 | be set from uima ${fromUima} 18 | be converted to uima ${toUima} 19 | be mutli valued ${multiVal} 20 | be single valued ${singleVal} 21 | have a correct uima type ${uimaType} 22 | """ 23 | 24 | def defaultVal = { 25 | object param extends Parameter[String]("a") 26 | param.is must_== "a" 27 | } 28 | 29 | def newVal = { 30 | object param extends Parameter[String]("a") 31 | param := "b" 32 | param.is must_== "b" 33 | } 34 | 35 | def fromUima = { 36 | object param extends 
Parameter[Pattern](Pattern.compile("[A-Z]")) 37 | param.setFromUimaType("[1-4]") 38 | param.is.pattern must_== "[1-4]" 39 | } 40 | 41 | def toUima = { 42 | object param extends Parameter[Pattern](Pattern.compile("[A-Z]")) 43 | param.toUimaType must beRight 44 | param.toUimaType.right.get must_== "[A-Z]" 45 | } 46 | 47 | def multiVal = { 48 | object l extends Parameter[List[String]](List("b")) 49 | object s extends Parameter[Seq[String]](Seq("b")) 50 | object a extends Parameter[Array[String]](Array("b")) 51 | 52 | l.multiValued_? must_== true 53 | s.multiValued_? must_== true 54 | a.multiValued_? must_== true 55 | } 56 | 57 | def singleVal = { 58 | object param extends Parameter[String]("b") 59 | param.multiValued_? must_== false 60 | } 61 | 62 | def uimaType = { 63 | object p1 extends Parameter[Pattern](Pattern.compile("[A-Z]")) 64 | object p2 extends Parameter[String]("a") 65 | object p3 extends Parameter[Float](1.2f) 66 | object p4 extends Parameter[Boolean](true) 67 | object p5 extends Parameter[Int](2) 68 | 69 | object p6 extends Parameter[List[Pattern]](List(Pattern.compile("[A-Z]"))) 70 | object p7 extends Parameter[List[String]](List("a")) 71 | object p8 extends Parameter[List[Float]](List(1.2f)) 72 | object p9 extends Parameter[List[Boolean]](List(true)) 73 | object p10 extends Parameter[List[Int]](List(2)) 74 | 75 | p1.uimaType must_== "String" 76 | p2.uimaType must_== "String" 77 | p3.uimaType must_== "Float" 78 | p4.uimaType must_== "Boolean" 79 | p5.uimaType must_== "Integer" 80 | 81 | p6.uimaType must_== "String" 82 | p7.uimaType must_== "String" 83 | p8.uimaType must_== "Float" 84 | p9.uimaType must_== "Boolean" 85 | p10.uimaType must_== "Integer" 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ResourceInitializationSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 
Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.specs2.Specification 7 | import org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import org.apache.uima.resource.Resource_ImplBase 10 | 11 | class ResourceInitializationSpec extends Specification { 12 | def is = s2""" 13 | This specification describes the resource initialization 14 | 15 | The ResourceMock class should 16 | have 4 resource objects ${nbResource(4)} 17 | have a resource called 'dictionary' ${hasResource("dictionary")} 18 | have a resource called 'name' ${hasResource("name")} 19 | have a resource called 'stopwords' ${hasResource("stopwords")} 20 | have a resource called 'optName' ${hasResource("optName")} 21 | return the correct dictionary resource (todo) 22 | return the correct name resource (todo) 23 | return the correct stopwords resource (todo) 24 | return the correct optName resource (todo) 25 | """ 26 | 27 | def nbResource(count: Int) = 28 | new ResourceMock().resources.size must be equalTo (count) 29 | 30 | def hasResource(name: String) = 31 | new ResourceMock().resources.map(_.name).contains(name) must beTrue 32 | 33 | } 34 | 35 | class ResourceMock extends Configurable with ResourceInitialization { 36 | 37 | object dictionary extends SharedResource[SharedDict]("/path/to/noWhere") 38 | 39 | object name extends Resource[SharedName] 40 | 41 | object stopwords extends SharedResource[SharedStopword]("/path/to/noWhere") 42 | 43 | object optName extends Resource[SharedOptName] 44 | } 45 | 46 | class SharedName extends Resource_ImplBase { 47 | def name = "myName" 48 | } 49 | 50 | class SharedOptName extends Resource_ImplBase { 51 | def name = "myOptName" 52 | } 53 | 54 | class SharedDict extends SharedResourceObject { 55 | 56 | def load(data: DataResource) = 57 | data.getUri.toString 58 | 59 | def getDict = "dict" 60 | } 61 | 62 | class SharedStopword extends SharedResourceObject { 63 | 
64 | def load(data: DataResource) = 65 | data.getUri.toString 66 | 67 | def getStopword = "stopword" 68 | } 69 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ResourceSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.configuration 5 | 6 | import java.util.regex.Pattern 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | import org.apache.uima.resource.Resource_ImplBase 9 | import org.apache.uima.resource.SharedResourceObject 10 | import org.apache.uima.resource.DataResource 11 | import org.specs2.Specification 12 | 13 | class ResourceSpec extends Specification { 14 | 15 | // format: OFF 16 | def is = s2""" 17 | A Resource should 18 | return default parameters ${defaultParams} 19 | convert parameter to a list ${paramToList} 20 | bind a resource ${bind} 21 | bind a resource from Uima ${bindUima} 22 | be set by a Binding ${setBinding} 23 | return a class name ${className} 24 | return a interface name ${interfaceName} 25 | 26 | A SharedResource should 27 | return default parameters ${sharedDefaultParams} 28 | convert parameter to a list ${sharedParamToList} 29 | bind a resource ${sharedBind} 30 | bind a resource from Uima ${sharedBindUima} 31 | be set by a Binding ${sharedSetBinding} 32 | return a class name ${sharedClassName} 33 | return a interface name ${sharedInterfaceName} 34 | """ 35 | 36 | def defaultParams = { 37 | object r extends Resource[DummyRes](Map("a" -> "b")) 38 | r.params must_== Map("a" -> "b") 39 | } 40 | 41 | def paramToList = { 42 | object r extends Resource[DummyRes](Map("a" -> "b")) 43 | r.parameterList must_== Seq("a", "b") 44 | } 45 | 46 | def bind = { 47 | object r extends Resource[DummyRes]() 48 | val o = new DummyRes() 49 | r.bind(o) 50 | 51 | r.resource must_== o 52 | } 53 | 54 | def 
bindUima = { 55 | object r extends Resource[DummyRes]() 56 | val o = new DummyRes() 57 | r.setFromUima(o) 58 | 59 | r.resource must_== o 60 | } 61 | 62 | def setBinding = { 63 | object r extends Resource[DummyRes](Map("a" -> "b")) 64 | r := Binding(Map("c" -> "d")) 65 | r.params must_== Map("c" -> "d") 66 | } 67 | 68 | def className = { 69 | object r extends Resource[DummyRes](Map("a" -> "b")) 70 | r.className.getName must_== classOf[DummyRes].getName 71 | } 72 | 73 | def interfaceName = { 74 | object r extends Resource[DummyRes](Map("a" -> "b")) 75 | r.interfaceName must_== classOf[DummyRes].getName 76 | } 77 | 78 | class DummyRes extends Resource_ImplBase { def name = "DummyRes" } 79 | 80 | 81 | // Shared Resource 82 | 83 | def sharedDefaultParams = { 84 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 85 | r.url must_== "/test/data" 86 | r.params must_== Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object]) 87 | } 88 | 89 | def sharedParamToList = { 90 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 91 | r.parameterList must_== Seq("a", "b") 92 | } 93 | 94 | def sharedBind = { 95 | object r extends SharedResource[DummyShared]("/test/data") 96 | val o = new DummyShared() 97 | r.bind(o) 98 | 99 | r.resource must_== o 100 | } 101 | 102 | def sharedBindUima = { 103 | object r extends SharedResource[DummyShared]("/test/data") 104 | val o = new DummyShared() 105 | r.setFromUima(o) 106 | 107 | r.resource must_== o 108 | } 109 | 110 | def sharedSetBinding = { 111 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 112 | r := SharedBinding("/abc/def", Map("c".asInstanceOf[Object] -> "d".asInstanceOf[Object])) 113 | r.url must_== "/abc/def" 114 | r.params must_== Map("c".asInstanceOf[Object] -> "d".asInstanceOf[Object]) 115 | } 116 | 117 | def sharedClassName = 
{ 118 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 119 | r.className.getName must_== classOf[DummyShared].getName 120 | } 121 | 122 | def sharedInterfaceName = { 123 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 124 | r.interfaceName must_== classOf[DummyShared].getName 125 | } 126 | 127 | class DummyShared extends SharedResourceObject { 128 | def load(data: DataResource) = {} 129 | def name = "SharedDict" 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/stream/annotatorsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.core.stream 2 | 3 | import org.specs2.mutable._ 4 | import fs2._ 5 | import org.apache.uima.jcas.tcas.Annotation 6 | import com.github.jenshaase.uimascala.core._ 7 | 8 | class annotateSpec extends Specification { 9 | 10 | import annotators._ 11 | 12 | "Annotators" should { 13 | 14 | def tokenizeText[F[_]] = 15 | casFromText[F] andThen whitespaceTokenizer[F, Annotation](false) 16 | 17 | "tokenize a document" in { 18 | val p = Stream.pure("this is a text", " and another text "). 19 | through(tokenizeText). 20 | through(extractCas { cas => 21 | cas.select[Annotation].drop(1).map(_.getCoveredText).toList 22 | }) 23 | 24 | p.toList must be equalTo (List( 25 | List("this", "is", "a", "text"), 26 | List("and", "another", "text") 27 | )) 28 | } 29 | 30 | "remove stopwords" in { 31 | val p = Stream.pure("this is a text", " and another text "). 32 | through(tokenizeText). 33 | through(removeStopwords[Pure, Annotation](s => Set("is", "a").contains(s))). 
34 | through(extractCas { cas => 35 | cas.select[Annotation].drop(1).map(_.getCoveredText).toList 36 | }) 37 | 38 | p.toList must be equalTo (List( 39 | List("this", "text"), 40 | List("and", "another", "text") 41 | )) 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/util/Helper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.util 5 | 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.util.CasCreationUtils 8 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 9 | 10 | /** 11 | * @author Jens Haase 12 | */ 13 | 14 | trait Helper { 15 | 16 | def newJCas: JCas = { 17 | CasCreationUtils.createCas( 18 | TypeSystemDescriptionFactory.createTypeSystemDescription, null, null).getJCas 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/wrapper/AnnotationWrapperSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.specs2.mutable.Specification 7 | import org.apache.uima.util.CasCreationUtils 8 | import org.apache.uima.fit.factory.{ TypePrioritiesFactory, TypeSystemDescriptionFactory } 9 | import org.apache.uima.jcas.JCas 10 | import org.apache.uima.jcas.tcas.Annotation 11 | import com.github.jenshaase.uimascala.core._ 12 | import util.Helper 13 | 14 | /** 15 | * @author Jens Haase 16 | */ 17 | class AnnotationWrapperSpec extends Specification with Helper { 18 | 19 | "Annotation Wrapper" should { 20 | 21 | "trim a annotation" in { 22 | val cas = newJCas 23 | cas.setDocumentText("This is text") 24 | 25 | val a = new Annotation(cas, 4, 8) 
26 | a.getCoveredText must be equalTo (" is ") 27 | a.trim.getCoveredText must be equalTo ("is") 28 | } 29 | 30 | "check if a annotation is empty" in { 31 | new Annotation(newJCas, 0, 0).isEmpty must beTrue 32 | new Annotation(newJCas, 0, 1).isEmpty must beFalse 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/wrapper/JCasWrapperSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.specs2.mutable.Specification 7 | import com.github.jenshaase.uimascala.core._ 8 | import util.Helper 9 | import org.apache.uima.jcas.tcas.Annotation 10 | import org.apache.uima.jcas.JCas 11 | 12 | /** 13 | * @author Jens Haase 14 | */ 15 | class JCasWrapperSpec extends Specification with Helper { 16 | 17 | class Token(cas: JCas, begin: Int, end: Int) extends Annotation(cas, begin, end) 18 | 19 | "JCasWrapper" should { 20 | 21 | "select annotation of same type" in { 22 | val cas = newJCas 23 | cas.setDocumentText("This is a text") 24 | 25 | new Annotation(cas, 0, 4).addToIndexes 26 | new Annotation(cas, 5, 7).addToIndexes 27 | 28 | // Note: One Annotation and one DocumentAnnotation are default 29 | // to each new JCas 30 | cas.select[Annotation].size must be equalTo (3) 31 | } 32 | 33 | "select annotation by index" in { 34 | val cas = newJCas 35 | cas.setDocumentText("This is a text") 36 | 37 | new Annotation(cas, 0, 4).addToIndexes 38 | 39 | cas.selectByIndex[Annotation](1).getCoveredText must be equalTo ("This") 40 | } 41 | 42 | "select all anntation covered by another annotation" in { 43 | val cas = newJCas 44 | cas.setDocumentText("This is a text") 45 | 46 | val a1 = new Annotation(cas, 0, 4) 47 | a1.addToIndexes 48 | val a2 = new Annotation(cas, 0, 1) 49 | a2.addToIndexes 50 | val a3 = new Annotation(cas, 1, 
2) 51 | a3.addToIndexes 52 | 53 | cas.selectCovered[Annotation](a1).size must be equalTo (2) 54 | cas.selectCovered[Annotation](a1).get(0).getCoveredText must be equalTo ("T") 55 | } 56 | 57 | "select a single annotation" in { 58 | val cas = newJCas 59 | cas.setDocumentText("This is a text") 60 | 61 | cas.selectSingle[Annotation].getCoveredText must be equalTo ("This is a text") 62 | } 63 | 64 | "select all preceding annotation" in { 65 | val cas = newJCas 66 | cas.setDocumentText("This is a text") 67 | 68 | val a1 = new Annotation(cas, 0, 4) 69 | a1.addToIndexes 70 | val a2 = new Annotation(cas, 5, 7) 71 | a2.addToIndexes 72 | val a3 = new Annotation(cas, 8, 9) 73 | a3.addToIndexes 74 | 75 | val p1 = cas.selectPreceding[Annotation](a2, 1) 76 | p1.size must be equalTo (1) 77 | p1.head.getCoveredText must be equalTo (a1.getCoveredText) 78 | } 79 | 80 | "select all following annotation" in { 81 | val cas = newJCas 82 | cas.setDocumentText("This is a text") 83 | 84 | val a1 = new Annotation(cas, 0, 4) 85 | a1.addToIndexes 86 | val a2 = new Annotation(cas, 5, 7) 87 | a2.addToIndexes 88 | val a3 = new Annotation(cas, 8, 9) 89 | a3.addToIndexes 90 | 91 | val p1 = cas.selectFollowing[Annotation](a2, 1) 92 | p1.size must be equalTo (1) 93 | p1.head.getCoveredText must be equalTo (a3.getCoveredText) 94 | } 95 | 96 | "checks if an annotation type exists" in { 97 | val cas = newJCas 98 | cas.setDocumentText("This is a text") 99 | 100 | cas.exists[Annotation] must beTrue 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /language-identification/n-gram-language-identifier/src/main/scala/com/github/jenshaase/uimascala/languageidentifier/NGramLanguageIdentifier.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.languageidentifier 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import 
com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import scala.collection.JavaConversions._ 8 | import com.optimaize.langdetect.text.CommonTextObjectFactories 9 | import com.optimaize.langdetect.ngram.NgramExtractors 10 | import com.optimaize.langdetect.profiles._ 11 | import com.optimaize.langdetect._ 12 | 13 | class NGramLanguageIdentifier extends SCasAnnotator_ImplBase { 14 | 15 | object shortText extends Parameter[Boolean](false) 16 | 17 | lazy val languageDetector = { 18 | val languageProfiles = new LanguageProfileReader().readAllBuiltIn() 19 | LanguageDetectorBuilder.create(NgramExtractors.standard()) 20 | .withProfiles(languageProfiles) 21 | .build() 22 | } 23 | 24 | def process(jcas: JCas) = { 25 | // Pick the text-object factory matching the expected input length: 26 | // the short-clean-text factory for short snippets, the large-text one otherwise. 27 | val textObjectFactory = 28 | if (shortText.is) { 29 | CommonTextObjectFactories.forDetectingShortCleanText() 30 | } else { 31 | CommonTextObjectFactories.forDetectingOnLargeText() 32 | } 33 | 34 | val text = textObjectFactory.forText(jcas.getDocumentText); 35 | val lang = languageDetector.detect(text) 36 | if (lang.isPresent()) { 37 | jcas.setDocumentLanguage(lang.get().toString) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /language-identification/n-gram-language-identifier/src/test/scala/com/github/jenshaase/uimascala/languageidentifier/NGramLanguageIdentifierSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.languageidentifier 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | 9 | class NGramLanguageIdentifierSpec extends Specification { 10 | 11 | "The ngram language idenifier" should { 12 | 
"detect the german language" in { 13 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().asAnalysisEngine 14 | 15 | val jcas = analyser.newJCas() 16 | jcas.setDocumentText("Das ist ein Text in deutscher Sprache") 17 | analyser.process(jcas) 18 | 19 | jcas.getDocumentLanguage must be equalTo("de") 20 | } 21 | 22 | "detect the english language" in { 23 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().asAnalysisEngine 24 | 25 | val jcas = analyser.newJCas() 26 | jcas.setDocumentText("This is a english text with so information.") 27 | analyser.process(jcas) 28 | 29 | jcas.getDocumentLanguage must be equalTo("en") 30 | } 31 | 32 | "detect the german language in short text snippets" in { 33 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().config(_.shortText := true).asAnalysisEngine 34 | 35 | val jcas = analyser.newJCas() 36 | jcas.setDocumentText("Das ist ein Text in deutscher Sprache") 37 | analyser.process(jcas) 38 | 39 | jcas.getDocumentLanguage must be equalTo("de") 40 | } 41 | 42 | "detect the english language in short text snippets" in { 43 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().config(_.shortText := true).asAnalysisEngine 44 | 45 | val jcas = analyser.newJCas() 46 | jcas.setDocumentText("This is a english text with so information.") 47 | analyser.process(jcas) 48 | 49 | jcas.getDocumentLanguage must be equalTo("en") 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /lemmatizer/mate-lemmatizer/src/main/scala/com/github/jenshaase/uimascala/lemmatizer/MateLemmatizer.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.lemmatizer 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import 
org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import scala.collection.JavaConversions._ 10 | import is2.data.SentenceData09 11 | import is2.io.CONLLReader09 12 | import is2.io.IOGenerals 13 | import is2.lemmatizer.Lemmatizer 14 | 15 | class MateLemmatizerResource extends SharedResourceObject { 16 | private var lemmatizer: Lemmatizer = _ 17 | 18 | def load(data: DataResource) { 19 | val uri = data.getUri.toString 20 | 21 | if (new java.io.File(uri).exists) { 22 | lemmatizer = new Lemmatizer(uri) 23 | } else { 24 | val resourceUri = if (uri.startsWith("/")) uri else "/" + uri 25 | val resource = this.getClass.getResource(resourceUri) 26 | 27 | val file = java.io.File.createTempFile("mate-lemmatizer", ".temp") 28 | file.deleteOnExit(); 29 | 30 | val source = resource.openStream(); 31 | try { 32 | java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING); 33 | } finally { 34 | source.close(); 35 | } 36 | 37 | lemmatizer = new Lemmatizer(file.getAbsolutePath) 38 | } 39 | } 40 | 41 | def getLemmatizer = lemmatizer 42 | } 43 | 44 | class MateLemmatizer extends SCasAnnotator_ImplBase { 45 | 46 | object model extends SharedResource[MateLemmatizerResource]("") 47 | 48 | def process(jcas: JCas) = { 49 | jcas.select[Sentence].foreach { sentence => 50 | val tokens = jcas.selectCovered[Token](sentence).toVector 51 | 52 | val sentenceData = new SentenceData09() 53 | sentenceData.init(Array[String](IOGenerals.ROOT) ++ tokens.map(_.getCoveredText)) 54 | 55 | model.resource.getLemmatizer.apply(sentenceData).plemmas.zipWithIndex.foreach { case (tag, idx) => 56 | val token = tokens(idx) 57 | 58 | val lemma = new Lemma(jcas, token.getBegin, token.getEnd) 59 | lemma.setValue(tag) 60 | add(lemma) 61 | 62 | token.setLemma(lemma) 63 | } 64 | } 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- 
/lemmatizer/mate-lemmatizer/src/test/scala/com/github/jenshaase/uimascala/lemmatizer/MateLemmatizerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.lemmatizer 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.util.JCasUtil 11 | 12 | class MateLemmatizerSpec extends Specification { 13 | 14 | "MateLemmatizer" should { 15 | "lemmatize each word in a sentence" in { 16 | val tagger: AnalysisEngine = new MateLemmatizer(). 17 | config( 18 | _.model := SharedBinding[MateLemmatizerResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/lemmatizer-de-tiger.model") 19 | ). 20 | asAnalysisEngine 21 | 22 | val jcas = tagger.newJCas() 23 | jcas.setDocumentText("Hallo Welt! 
Was geht?") 24 | jcas.annotate[Sentence](0, 10) 25 | jcas.annotate[Sentence](12, 20) 26 | jcas.annotate[Token](0, 5) 27 | jcas.annotate[Token](6, 10) 28 | jcas.annotate[Token](12, 15) 29 | jcas.annotate[Token](16, 20) 30 | tagger.process(jcas) 31 | 32 | jcas.select[Lemma].size must be equalTo(4) 33 | jcas.selectByIndex[Lemma](0).getCoveredText must be equalTo ("Hallo") 34 | jcas.selectByIndex[Lemma](1).getCoveredText must be equalTo ("Welt") 35 | jcas.selectByIndex[Lemma](2).getCoveredText must be equalTo ("Was") 36 | jcas.selectByIndex[Lemma](3).getCoveredText must be equalTo ("geht") 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /name-entity-recognizer/stanford-ner/src/main/scala/com/github/jenshaase/uimascala/ner/StanfordNer.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.ner 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import edu.stanford.nlp.ling.TaggedWord 10 | import edu.stanford.nlp.ie.crf.CRFClassifier 11 | import edu.stanford.nlp.util.CoreMap 12 | import edu.stanford.nlp.ling.CoreLabel 13 | import edu.stanford.nlp.ling.CoreAnnotations 14 | import scala.collection.JavaConversions._ 15 | import java.util.zip.GZIPInputStream 16 | 17 | 18 | class StanfordNerResource extends SharedResourceObject { 19 | private var tagger: CRFClassifier[CoreMap] = _ 20 | 21 | def load(data: DataResource) { 22 | val uri = data.getUri.toString 23 | 24 | if (new java.io.File(uri).exists) { 25 | tagger = CRFClassifier.getClassifier[CoreMap](new java.io.File(uri)) 26 | } else { 27 | val resourceUri = if (uri.startsWith("/")) uri else "/" + uri 28 | val resource = 
this.getClass.getResource(resourceUri) 29 | 30 | val is = if (uri.endsWith(".gz")) { 31 | new GZIPInputStream(resource.openStream) 32 | } else { 33 | resource.openStream 34 | } 35 | 36 | tagger = CRFClassifier.getClassifier[CoreMap](is) 37 | } 38 | } 39 | 40 | def getTagger = tagger 41 | } 42 | 43 | class StanfordNer extends SCasAnnotator_ImplBase { 44 | 45 | object model extends SharedResource[StanfordNerResource]("") 46 | 47 | def process(jcas: JCas) = { 48 | jcas.select[Sentence].foreach { sentence => 49 | val tokens = jcas.selectCovered[Token](sentence).toVector 50 | 51 | model.resource.getTagger. 52 | classifySentence(tokens.map(tokenToCoreLabel _)). 53 | foldLeft[(Int, Int, Option[String])](-1, -1, None) { case ((begin, end, currentType), taggedWord) => 54 | val tokenType = taggedWord.get(classOf[CoreAnnotations.AnswerAnnotation]) 55 | val tokenBegin = taggedWord.get(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation]) 56 | val tokenEnd = taggedWord.get(classOf[CoreAnnotations.CharacterOffsetEndAnnotation]) 57 | 58 | (tokenType, currentType) match { 59 | case ("O", Some(b)) => 60 | val namedEntity = new NamedEntity(jcas, begin, end) 61 | namedEntity.setValue(b) 62 | add(namedEntity) 63 | (begin, end, None) 64 | 65 | case (a, Some(b)) if (a != b) => 66 | val namedEntity = new NamedEntity(jcas, begin, end) 67 | namedEntity.setValue(b) 68 | add(namedEntity) 69 | (begin, tokenEnd, Some(tokenType)) 70 | 71 | case (a, None) if (a != "O") => 72 | (tokenBegin, tokenEnd, Some(tokenType)) 73 | 74 | case (a, Some(b)) if (a == b) => 75 | (begin, tokenEnd, Some(tokenType)) 76 | 77 | case ("O", None) => 78 | (begin, end, currentType) 79 | } 80 | } 81 | } 82 | } 83 | 84 | def tokenToCoreLabel(token: Token): CoreLabel = { 85 | val word = new CoreLabel() 86 | word.setValue(token.getCoveredText) 87 | word.setOriginalText(token.getCoveredText) 88 | word.setWord(token.getCoveredText) 89 | word.setBeginPosition(token.getBegin) 90 | word.setEndPosition(token.getEnd) 91 | 92 | 
if (token.getPos != null) { 93 | word.setTag(token.getPos.getName) 94 | } 95 | 96 | word 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /name-entity-recognizer/stanford-ner/src/test/scala/com/github/jenshaase/uimascala/ner/StanfordNerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.ner 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.util.JCasUtil 11 | 12 | class StanfordNerSpec extends Specification { 13 | 14 | "The Stanford Parser" should { 15 | "add constituents" in { 16 | val parser: AnalysisEngine = new StanfordNer(). 17 | config( 18 | _.model := SharedBinding[StanfordNerResource]("edu/stanford/nlp/models/ner/german.dewac_175m_600.crf.ser.gz") 19 | ). 
20 | asAnalysisEngine 21 | 22 | val jcas = parser.newJCas() 23 | jcas.setDocumentText("Angela Merkel fliegt nach Berlin.") 24 | jcas.annotate[Sentence](0, 33) 25 | val t1 = jcas.annotate[Token](0, 6) 26 | val p1 = jcas.annotate[POS](0, 6) 27 | p1.setName("NE") 28 | t1.setPos(p1) 29 | 30 | val t2 = jcas.annotate[Token](7, 13) 31 | val p2 = jcas.annotate[POS](7, 13) 32 | p2.setName("NE") 33 | t2.setPos(p2) 34 | 35 | val t3 = jcas.annotate[Token](14, 20) 36 | val p3 = jcas.annotate[POS](14, 20) 37 | p3.setName("VVFIN") 38 | t3.setPos(p3) 39 | 40 | val t4 = jcas.annotate[Token](21, 25) 41 | val p4 = jcas.annotate[POS](21, 25) 42 | p4.setName("APPR") 43 | t4.setPos(p4) 44 | 45 | val t5 = jcas.annotate[Token](26, 32) 46 | val p5 = jcas.annotate[POS](26, 32) 47 | p5.setName("NE") 48 | t5.setPos(p5) 49 | 50 | val t6 = jcas.annotate[Token](32, 33) 51 | val p6 = jcas.annotate[POS](32, 33) 52 | p6.setName("$.") 53 | t6.setPos(p6) 54 | 55 | parser.process(jcas) 56 | 57 | val namedEntities = jcas.select[NamedEntity].toVector 58 | namedEntities.size must be equalTo(2) 59 | namedEntities(0).getCoveredText must be equalTo("Angela Merkel") 60 | namedEntities(0).getValue must be equalTo("I-PER") 61 | namedEntities(1).getCoveredText must be equalTo("Berlin") 62 | namedEntities(1).getValue must be equalTo("I-LOC") 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /parser/mate-parser/src/main/scala/com/github/jenshaase/uimascala/parser/MateParser.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import scala.collection.JavaConversions._ 8 | import org.apache.uima.resource.DataResource 9 | import 
org.apache.uima.resource.SharedResourceObject 10 | import is2.data.SentenceData09 11 | import is2.io.CONLLReader09 12 | import is2.io.IOGenerals 13 | import is2.parser.Options 14 | import is2.parser.Parser 15 | 16 | class MateParserResource extends SharedResourceObject { 17 | private var parser: Parser = _ 18 | 19 | def load(data: DataResource) { 20 | val uri = data.getUri.toString 21 | 22 | if (new java.io.File(uri).exists) { 23 | parser = new Parser(new Options(Array("-model", uri))) 24 | } else { 25 | val resourceUri = if (uri.startsWith("/")) uri else "/" + uri 26 | val resource = this.getClass.getResource(resourceUri) 27 | 28 | val file = java.io.File.createTempFile("mate-parser", ".temp") 29 | file.deleteOnExit(); 30 | 31 | val source = resource.openStream(); 32 | try { 33 | java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING); 34 | } finally { 35 | source.close(); 36 | } 37 | 38 | parser = new Parser(new Options(Array("-model", file.getAbsolutePath))) 39 | } 40 | } 41 | 42 | def getParser = parser 43 | } 44 | 45 | class MateParser extends SCasAnnotator_ImplBase { 46 | 47 | object model extends SharedResource[MateParserResource]("") 48 | 49 | def process(jcas: JCas) = { 50 | jcas.select[Sentence].foreach { sentence => 51 | val tokens = jcas.selectCovered[Token](sentence).toVector 52 | 53 | val sentenceData = new SentenceData09() 54 | sentenceData.init(Array[String](IOGenerals.ROOT) ++ tokens.map(_.getCoveredText)) 55 | sentenceData.setLemmas(Array[String](IOGenerals.ROOT_LEMMA) ++ tokens.map { t => 56 | if (t.getLemma != null) { 57 | t.getLemma.getValue() 58 | } else { 59 | "_" 60 | } 61 | }) 62 | sentenceData.setPPos(Array[String](IOGenerals.ROOT_POS) ++ tokens.map { t => 63 | t.getPos.getName() 64 | }) 65 | 66 | val parsed = model.resource.getParser.apply(sentenceData) 67 | 68 | parsed.labels.zipWithIndex.foreach { case (label, i) => 69 | if (parsed.pheads(i) != 0) { 70 | val sourceToken = tokens(parsed.pheads(i) 
- 1) 71 | val targetToken = tokens(i) 72 | val depType = parsed.plabels(i) 73 | 74 | val dep = new Dependency(jcas) 75 | dep.setGovernor(sourceToken) 76 | dep.setDependent(targetToken) 77 | dep.setDependencyType(depType) 78 | dep.setBegin(dep.getDependent().getBegin()) 79 | dep.setEnd(dep.getDependent().getEnd()) 80 | dep.addToIndexes() 81 | } else { 82 | val rootToken = tokens(i) 83 | 84 | val dep = new DependencyRoot(jcas) 85 | dep.setGovernor(rootToken) 86 | dep.setDependent(rootToken) 87 | dep.setDependencyType(parsed.plabels(i)) 88 | dep.setBegin(dep.getDependent().getBegin()) 89 | dep.setEnd(dep.getDependent().getEnd()) 90 | dep.addToIndexes() 91 | } 92 | } 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /parser/mate-parser/src/test/scala/com/github/jenshaase/uimascala/parser/MateParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.typesystem._ 5 | import com.github.jenshaase.uimascala.core.configuration._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | import org.apache.uima.fit.factory.AnalysisEngineFactory 9 | import org.apache.uima.fit.util.JCasUtil 10 | 11 | class MateParserSpec extends Specification { 12 | 13 | "The Mate Parser" should { 14 | "add dependencies" in { 15 | val parser: AnalysisEngine = new MateParser(). 16 | config( 17 | _.model := SharedBinding[MateParserResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/parser-de-tiger.model") 18 | ). 
19 | asAnalysisEngine 20 | 21 | val jcas = parser.newJCas() 22 | jcas.setDocumentText("Wie alt bist du?") 23 | jcas.annotate[Sentence](0, 16) 24 | val t1 = jcas.annotate[Token](0, 3) 25 | val p1 = jcas.annotate[POS](0, 3) 26 | p1.setName("PWAV") 27 | t1.setPos(p1) 28 | 29 | val t2 = jcas.annotate[Token](4, 7) 30 | val p2 = jcas.annotate[POS](4, 7) 31 | p2.setName("ADJD") 32 | t2.setPos(p2) 33 | 34 | val t3 = jcas.annotate[Token](8, 12) 35 | val p3 = jcas.annotate[POS](8, 12) 36 | p3.setName("VAFIN") 37 | t3.setPos(p3) 38 | 39 | val t4 = jcas.annotate[Token](13, 15) 40 | val p4 = jcas.annotate[POS](13, 15) 41 | p4.setName("PPER") 42 | t4.setPos(p4) 43 | 44 | val t5 = jcas.annotate[Token](15, 16) 45 | val p5 = jcas.annotate[POS](15, 16) 46 | p5.setName("$.") 47 | t5.setPos(p5) 48 | 49 | parser.process(jcas) 50 | 51 | val dependencies = jcas.select[Dependency].toVector 52 | dependencies(0).getCoveredText must be equalTo ("Wie") 53 | dependencies(0).getGovernor.getCoveredText must be equalTo ("alt") 54 | dependencies(0).getDependent.getCoveredText must be equalTo ("Wie") 55 | dependencies(0).getDependencyType must be equalTo ("MO") 56 | 57 | dependencies(1).getCoveredText must be equalTo ("alt") 58 | dependencies(1).getGovernor.getCoveredText must be equalTo ("bist") 59 | dependencies(1).getDependent.getCoveredText must be equalTo ("alt") 60 | dependencies(1).getDependencyType must be equalTo ("PD") 61 | 62 | dependencies(2).getCoveredText must be equalTo ("bist") 63 | dependencies(2).getGovernor.getCoveredText must be equalTo ("bist") 64 | dependencies(2).getDependent.getCoveredText must be equalTo ("bist") 65 | dependencies(2).getDependencyType must be equalTo ("--") 66 | 67 | dependencies(3).getCoveredText must be equalTo ("du") 68 | dependencies(3).getGovernor.getCoveredText must be equalTo ("bist") 69 | dependencies(3).getDependent.getCoveredText must be equalTo ("du") 70 | dependencies(3).getDependencyType must be equalTo ("SB") 71 | 72 | 
dependencies(4).getCoveredText must be equalTo ("?") 73 | dependencies(4).getGovernor.getCoveredText must be equalTo ("du") 74 | dependencies(4).getDependent.getCoveredText must be equalTo ("?") 75 | dependencies(4).getDependencyType must be equalTo ("--") 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /parser/stanford-parser/src/main/scala/com/github/jenshaase/uimascala/parser/StanfordParser.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import scala.collection.JavaConversions._ 8 | import edu.stanford.nlp.parser.common.ParserGrammar 9 | import java.io._ 10 | import java.util.zip.GZIPInputStream 11 | import org.apache.uima.resource.DataResource 12 | import org.apache.uima.resource.SharedResourceObject 13 | import edu.stanford.nlp.ling.CoreLabel 14 | import edu.stanford.nlp.trees.Tree 15 | import org.apache.uima.jcas.tcas.Annotation 16 | import org.apache.uima.jcas.cas.FSArray 17 | import org.apache.uima.util.Level.WARNING 18 | 19 | class StanfordParserGrammerResource extends SharedResourceObject { 20 | private var parser: ParserGrammar = _ 21 | 22 | def load(data: DataResource) { 23 | parser = ParserGrammar.loadModel(data.getUri.toString) 24 | } 25 | 26 | def getParserGrammer = parser 27 | } 28 | 29 | object DependencyMode { 30 | val BASIC = "BASIC" 31 | val NON_COLLAPSED = "NON_COLLAPSED" 32 | val COLLAPSED = "COLLAPSED" 33 | val COLLAPSED_WITH_EXTRA = "COLLAPSED_WITH_EXTRA" 34 | val CC_PROPAGATED = "CC_PROPAGATED" 35 | val CC_PROPAGATED_NO_EXTRA = "CC_PROPAGATED_NO_EXTRA" 36 | val TREE = "TREE" 37 | } 38 | 39 | class StanfordParser extends SCasAnnotator_ImplBase { 40 | 41 | object model extends 
SharedResource[StanfordParserGrammerResource]("") 42 | object mode extends Parameter[String](DependencyMode.BASIC) 43 | object readPOS extends Parameter[Boolean](true) { 44 | override def mandatory_? = false 45 | } 46 | object createPOS extends Parameter[Boolean](false) { 47 | override def mandatory_? = false 48 | } 49 | 50 | def process(jcas: JCas) = { 51 | val parser = model.resource.getParserGrammer 52 | 53 | jcas.select[Sentence].foreach { sentence => 54 | val tokens = jcas.selectCovered[Token](sentence).toVector 55 | 56 | val query = parser.parserQuery() 57 | query.parse(tokens.map(tokenToCoreLabel _)) 58 | val parseTree = query.getBestParse() 59 | parseTree.setSpans() 60 | 61 | doCreateConstituentAnnotation(jcas, tokens, parseTree, None) 62 | doCreateDependencyAnnotation(jcas, parser, parseTree, tokens) 63 | } 64 | } 65 | 66 | def tokenToCoreLabel(token: Token): CoreLabel = { 67 | val word = new CoreLabel() 68 | word.setValue(token.getCoveredText) 69 | word.setOriginalText(token.getCoveredText) 70 | word.setWord(token.getCoveredText) 71 | word.setBeginPosition(token.getBegin) 72 | word.setEndPosition(token.getEnd) 73 | 74 | if (readPOS.is && token.getPos != null) { 75 | word.setTag(token.getPos.getName) 76 | } 77 | 78 | word 79 | } 80 | 81 | def doCreateConstituentAnnotation(jcas: JCas, tokens: Vector[Token], node: Tree, parent: Option[Annotation]): Annotation = { 82 | val nodeLabelValue = node.value() 83 | val source = tokens.get(node.getSpan().getSource) 84 | val target = tokens.get(node.getSpan().getTarget) 85 | 86 | if (node.isPhrasal) { 87 | val constituent = createConstituent(jcas, source.getBegin, target.getEnd, nodeLabelValue) 88 | parent.foreach { p => constituent.setParent(p) } 89 | 90 | val childAnnotations = node. 91 | getChildrenAsList(). 92 | map(doCreateConstituentAnnotation(jcas, tokens, _, Some(constituent))) 93 | 94 | val children = childAnnotations.zipWithIndex. 
95 | foldLeft(new FSArray(jcas, childAnnotations.size())) { case (fsArray, (ann, idx)) => 96 | fsArray.set(idx, ann) 97 | fsArray 98 | } 99 | 100 | constituent.setChildren(children) 101 | add(constituent) 102 | constituent 103 | } else if (node.isPreTerminal) { 104 | val pos = createPOS(jcas, source.getBegin, target.getEnd, nodeLabelValue) 105 | val coveredToken = jcas.selectCovered[Token](pos) 106 | require(coveredToken.size == 1) 107 | val token = coveredToken.get(0) 108 | 109 | if (createPOS.is) { 110 | add(pos) 111 | token.setPos(pos) 112 | } 113 | 114 | parent.foreach { p => 115 | token.setParent(p) 116 | } 117 | 118 | token 119 | } else { 120 | throw new Exception("Node must be either phrasal nor pre-terminal") 121 | } 122 | } 123 | 124 | def createConstituent(jcas: JCas, begin: Int, end: Int, constituentType: String) = { 125 | val c = new Constituent(jcas, begin, end) 126 | c.setConstituentType(constituentType) 127 | c 128 | } 129 | 130 | def createPOS(jcas: JCas, begin: Int, end: Int, name: String) = { 131 | val p = new POS(jcas, begin, end) 132 | p.setName(name) 133 | p 134 | } 135 | 136 | 137 | def doCreateDependencyAnnotation(jcas: JCas, parser: ParserGrammar, parseTree: Tree, tokens: Seq[Token]) { 138 | try { 139 | val gs = parser.getTLPParams().getGrammaticalStructure( 140 | parseTree, 141 | parser.treebankLanguagePack().punctuationWordRejectFilter(), 142 | parser.getTLPParams().typedDependencyHeadFinder() 143 | ) 144 | 145 | val dependencies = mode.is match { 146 | case DependencyMode.BASIC => gs.typedDependencies() 147 | case DependencyMode.NON_COLLAPSED => gs.allTypedDependencies() 148 | case DependencyMode.COLLAPSED => gs.typedDependenciesCollapsed(false) 149 | case DependencyMode.COLLAPSED_WITH_EXTRA => gs.typedDependenciesCollapsed(true) 150 | case DependencyMode.CC_PROPAGATED => gs.typedDependenciesCCprocessed(true) 151 | case DependencyMode.CC_PROPAGATED_NO_EXTRA => gs.typedDependenciesCCprocessed(false) 152 | case DependencyMode.TREE => 
gs.typedDependenciesCollapsedTree() 153 | case _ => throw new Exception("DependencyMode not supported: " + mode.is) 154 | } 155 | 156 | dependencies.foreach { currTypedDep => 157 | val govIndex = currTypedDep.gov().index(); 158 | val depIndex = currTypedDep.dep().index(); 159 | 160 | val dep = if (govIndex != 0) { 161 | val govToken = tokens(govIndex - 1) 162 | val depToken = tokens(depIndex - 1) 163 | 164 | val dep = new Dependency(jcas) 165 | dep.setDependencyType(currTypedDep.reln().toString()); 166 | dep.setGovernor(govToken); 167 | dep.setDependent(depToken); 168 | dep.setBegin(dep.getDependent().getBegin()); 169 | dep.setEnd(dep.getDependent().getEnd()); 170 | dep.addToIndexes(); 171 | } else { 172 | val depToken = tokens(depIndex - 1); 173 | 174 | val dep = new DependencyRoot(jcas); 175 | dep.setDependencyType(currTypedDep.reln().toString()); 176 | dep.setGovernor(depToken); 177 | dep.setDependent(depToken); 178 | dep.setBegin(dep.getDependent().getBegin()); 179 | dep.setEnd(dep.getDependent().getEnd()); 180 | dep.addToIndexes(); 181 | 182 | dep 183 | } 184 | } 185 | } catch { 186 | case e: UnsupportedOperationException => 187 | getContext().getLogger().log(WARNING, "Current model does not seem to support dependencies."); 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /parser/stanford-parser/src/test/scala/com/github/jenshaase/uimascala/parser/StanfordParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import 
org.apache.uima.fit.util.JCasUtil 11 | 12 | class StanfordParserSpec extends Specification { 13 | 14 | "The Stanford Parser" should { 15 | "add constituents" in { 16 | val parser: AnalysisEngine = new StanfordParser(). 17 | config( 18 | _.model := SharedBinding[StanfordParserGrammerResource]("edu/stanford/nlp/models/srparser/germanSR.ser.gz") 19 | ). 20 | asAnalysisEngine 21 | 22 | val jcas = parser.newJCas() 23 | jcas.setDocumentText("Wie alt bist du?") 24 | jcas.annotate[Sentence](0, 16) 25 | val t1 = jcas.annotate[Token](0, 3) 26 | val p1 = jcas.annotate[POS](0, 3) 27 | p1.setName("PWAV") 28 | t1.setPos(p1) 29 | 30 | val t2 = jcas.annotate[Token](4, 7) 31 | val p2 = jcas.annotate[POS](4, 7) 32 | p2.setName("ADJD") 33 | t2.setPos(p2) 34 | 35 | val t3 = jcas.annotate[Token](8, 12) 36 | val p3 = jcas.annotate[POS](8, 12) 37 | p3.setName("VAFIN") 38 | t3.setPos(p3) 39 | 40 | val t4 = jcas.annotate[Token](13, 15) 41 | val p4 = jcas.annotate[POS](13, 15) 42 | p4.setName("PPER") 43 | t4.setPos(p4) 44 | 45 | val t5 = jcas.annotate[Token](15, 16) 46 | val p5 = jcas.annotate[POS](15, 16) 47 | p5.setName("$.") 48 | t5.setPos(p5) 49 | 50 | parser.process(jcas) 51 | 52 | val constituents = jcas.select[Constituent].toVector 53 | constituents(0).getBegin must be equalTo (0) 54 | constituents(0).getEnd must be equalTo (16) 55 | constituents(0).getConstituentType must be equalTo ("S") 56 | constituents(0).getChildren.size must be equalTo (4) 57 | constituents(0).getParent must be equalTo(constituents(1)) 58 | 59 | constituents(1).getBegin must be equalTo (0) 60 | constituents(1).getEnd must be equalTo (16) 61 | constituents(1).getConstituentType must be equalTo ("ROOT") 62 | constituents(1).getChildren.size must be equalTo (1) 63 | constituents(1).getParent must beNull 64 | 65 | constituents(2).getBegin must be equalTo (0) 66 | constituents(2).getEnd must be equalTo (7) 67 | constituents(2).getConstituentType must be equalTo ("AP") 68 | constituents(2).getChildren.size must be 
equalTo (2) 69 | constituents(2).getParent must be equalTo(constituents(0)) 70 | 71 | val tokens = jcas.select[Token].toVector 72 | tokens(0).getParent must be equalTo(constituents(2)) 73 | tokens(1).getParent must be equalTo(constituents(2)) 74 | tokens(2).getParent must be equalTo(constituents(0)) 75 | tokens(3).getParent must be equalTo(constituents(0)) 76 | tokens(4).getParent must be equalTo(constituents(0)) 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /part-of-speech-tagger/ark-tweet-pos-tagger/src/main/scala/com/github/jenshaase/uimascala/pos/ArkTweetPosTagger.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.pos 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem.{Token, POS} 6 | import org.apache.uima.jcas.JCas 7 | import cmu.arktweetnlp.Twokenize 8 | import scala.collection.JavaConversions._ 9 | import cmu.arktweetnlp.impl.Model 10 | import cmu.arktweetnlp.impl.features.FeatureExtractor 11 | import org.apache.uima.UimaContext; 12 | import cmu.arktweetnlp.impl.ModelSentence 13 | import cmu.arktweetnlp.impl.Sentence 14 | 15 | class ArkTweetPosTagger extends SCasAnnotator_ImplBase { 16 | 17 | object modelLocation extends Parameter[String]("") 18 | 19 | private var model: Model = _ 20 | private var featureExtractor: FeatureExtractor = _ 21 | 22 | override def initialize(context: UimaContext) { 23 | super.initialize(context) 24 | 25 | model = Model.loadModelFromText(modelLocation.is) 26 | featureExtractor = new FeatureExtractor(model, false); 27 | } 28 | 29 | def process(jcas: JCas) = { 30 | val tokens = jcas.select[Token].toVector 31 | 32 | val sentence = new Sentence() 33 | sentence.tokens = tokens.map(_.getCoveredText) 34 | val ms = new ModelSentence(sentence.T()) 35 | 
featureExtractor.computeFeatures(sentence, ms) 36 | model.greedyDecode(ms, false) 37 | 38 | tokens.zipWithIndex.foreach { case (token, idx) => 39 | val tag = model.labelVocab.name( ms.labels(idx) ); 40 | 41 | val pos = new POS(jcas, token.getBegin, token.getEnd) 42 | pos.setName(tag) 43 | add(pos) 44 | 45 | token.setPos(pos) 46 | } 47 | } 48 | 49 | def createToken(cas: JCas, begin: Int, end: Int) = 50 | new Token(cas, begin, end) 51 | } 52 | -------------------------------------------------------------------------------- /part-of-speech-tagger/ark-tweet-pos-tagger/src/test/scala/com/github/jenshaase/uimascala/pos/ArkTweetPosTaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.pos 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | import org.apache.uima.fit.factory.AnalysisEngineFactory 9 | import org.apache.uima.fit.util.JCasUtil 10 | 11 | class ArkTweetPosTaggerSpec extends Specification { 12 | 13 | "Ark Tweet Pos Tagger" should { 14 | "add POS tags" in { 15 | val modelPath = new java.io.File(getClass.getResource("/model.20120919").toURI).getAbsolutePath 16 | val tagger: AnalysisEngine = new ArkTweetPosTagger(). 17 | config( 18 | _.modelLocation := modelPath 19 | ). 
20 | asAnalysisEngine 21 | 22 | val jcas = tagger.newJCas() 23 | jcas.setDocumentText("RT @DjBlack_Pearl: wat muhfuckaz wearin 4 the lingerie party?????") 24 | jcas.annotate[Token](0, 2) 25 | jcas.annotate[Token](3, 17) 26 | jcas.annotate[Token](17, 18) 27 | jcas.annotate[Token](19, 22) 28 | jcas.annotate[Token](23, 32) 29 | jcas.annotate[Token](33, 39) 30 | jcas.annotate[Token](40, 41) 31 | jcas.annotate[Token](42, 45) 32 | jcas.annotate[Token](46, 54) 33 | jcas.annotate[Token](55, 60) 34 | jcas.annotate[Token](60, 65) 35 | tagger.process(jcas) 36 | 37 | jcas.select[POS].size must be equalTo(11) 38 | jcas.selectByIndex[POS](0).getCoveredText must be equalTo ("RT") 39 | jcas.selectByIndex[POS](0).getName must be equalTo ("~") 40 | jcas.selectByIndex[POS](1).getCoveredText must be equalTo ("@DjBlack_Pearl") 41 | jcas.selectByIndex[POS](1).getName must be equalTo ("@") 42 | jcas.selectByIndex[POS](2).getCoveredText must be equalTo (":") 43 | jcas.selectByIndex[POS](2).getName must be equalTo ("~") 44 | jcas.selectByIndex[POS](3).getCoveredText must be equalTo ("wat") 45 | jcas.selectByIndex[POS](3).getName must be equalTo ("O") 46 | jcas.selectByIndex[POS](4).getCoveredText must be equalTo ("muhfuckaz") 47 | jcas.selectByIndex[POS](4).getName must be equalTo ("N") 48 | jcas.selectByIndex[POS](5).getCoveredText must be equalTo ("wearin") 49 | jcas.selectByIndex[POS](5).getName must be equalTo ("V") 50 | jcas.selectByIndex[POS](6).getCoveredText must be equalTo ("4") 51 | jcas.selectByIndex[POS](6).getName must be equalTo ("P") 52 | jcas.selectByIndex[POS](7).getCoveredText must be equalTo ("the") 53 | jcas.selectByIndex[POS](7).getName must be equalTo ("D") 54 | jcas.selectByIndex[POS](8).getCoveredText must be equalTo ("lingerie") 55 | jcas.selectByIndex[POS](8).getName must be equalTo ("N") 56 | jcas.selectByIndex[POS](9).getCoveredText must be equalTo ("party") 57 | jcas.selectByIndex[POS](9).getName must be equalTo ("N") 58 | 
class MatePosTaggerResource extends SharedResourceObject {
  // Set exactly once in load(); read back via getTagger.
  private var tagger: Tagger = _

  /**
   * Loads the Mate POS tagger model named by the resource URI.
   *
   * If the URI denotes an existing file on disk, the model is loaded from
   * there directly. Otherwise the URI is treated as a classpath resource:
   * it is copied into a temporary file first, because the Mate `Tagger`
   * only accepts a file-system path via its "-model" option.
   *
   * NOTE(review): `data.getUri.toString` may carry a scheme prefix (e.g.
   * "file:"), in which case the `File(uri).exists` check is false and the
   * classpath branch is taken — confirm against actual DataResource values.
   */
  def load(data: DataResource) {
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      tagger = new Tagger(new Options(Array("-model", uri)))
    } else {
      // Classpath lookup requires a leading slash.
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)

      val file = java.io.File.createTempFile("mate-pos-tagger", ".temp")
      file.deleteOnExit();

      val source = resource.openStream();
      try {
        java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
      } finally {
        source.close();
      }

      tagger = new Tagger(new Options(Array("-model", file.getAbsolutePath)))
    }
  }

  // Returns the tagger created by load(); null before load() has run.
  def getTagger = tagger
}
class MatePosTagger extends SCasAnnotator_ImplBase {

  // Shared model resource; the concrete binding (model path) is supplied
  // through config at pipeline construction time.
  object model extends SharedResource[MatePosTaggerResource]("")

  /**
   * Adds a POS annotation to every token of every sentence.
   *
   * For each sentence, the covered tokens are packed into a Mate
   * `SentenceData09` prefixed with the artificial ROOT entry that the Mate
   * tools expect as element 0. The predicted tags (`ppos`) are read back
   * with the ROOT entry dropped, so index i of the remainder lines up with
   * `tokens(i)`.
   */
  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val tokens = jcas.selectCovered[Token](sentence).toVector

      val sentenceData = new SentenceData09()
      sentenceData.init(Array[String](IOGenerals.ROOT) ++ tokens.map(_.getCoveredText))
      // Lemmas are optional; "_" is the CoNLL placeholder for a missing lemma.
      sentenceData.setLemmas(Array[String](IOGenerals.ROOT_LEMMA) ++ tokens.map { t =>
        if (t.getLemma != null) {
          t.getLemma.getValue()
        } else {
          "_"
        }
      })

      // drop(1) removes the ROOT tag so predictions align with `tokens`.
      model.resource.getTagger.apply(sentenceData).ppos.drop(1).zipWithIndex.foreach { case (tag, idx) =>
        val token = tokens(idx)

        val pos = new POS(jcas, token.getBegin, token.getEnd)
        pos.setName(tag)
        add(pos)

        // Back-link the token to its POS annotation.
        token.setPos(pos)
      }
    }
  }
}
class MatePosTaggerSpec extends Specification {

  "MatePosTagger" should {
    "get the correct pos values" in {
      val tagger: AnalysisEngine = new MatePosTagger().
        config(
          _.model := SharedBinding[MatePosTaggerResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/tagger-de-tiger.model")
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Wie alt bist du?")
      jcas.annotate[Sentence](0, 16)
      // Token boundaries as (begin, end) offsets.
      Seq((0, 3), (4, 7), (8, 12), (13, 15), (15, 16)).foreach { case (b, e) =>
        jcas.annotate[Token](b, e)
      }

      tagger.process(jcas)

      // Expected (POS name, covered text) per annotation, in index order.
      val expected = Seq(
        ("PWAV", "Wie"), ("ADJD", "alt"), ("VAFIN", "bist"),
        ("PPER", "du"), ("$.", "?")
      )

      jcas.select[POS].size must be equalTo(expected.size)
      expected.zipWithIndex.foreach { case ((name, text), idx) =>
        jcas.selectByIndex[POS](idx).getName must be equalTo (name)
        jcas.selectByIndex[POS](idx).getCoveredText must be equalTo (text)
      }
    }
  }
}
class StanfordPosTagger extends SCasAnnotator_ImplBase {

  // Shared tagger resource; defaults to the model path bundled inside the
  // Stanford tagger jar.
  object model extends SharedResource[MaxentTaggerResource](MaxentTagger.DEFAULT_JAR_PATH)
  // Optional cap on sentence length. Sentences with more tokens than the
  // cap are skipped entirely (no POS annotations are created for them).
  object maxTokensPerSentence extends Parameter[Option[Int]](None) {
    override def mandatory_? = false
  }

  /**
   * Adds a POS annotation for every token of every sentence.
   *
   * With maxTokensPerSentence = None all sentences are processed; with
   * Some(n), only sentences of at most n tokens are processed.
   * NOTE(review): Some(n) with n <= 0 skips every sentence — confirm that
   * this is the intended meaning of a non-positive limit.
   */
  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val tokens = jcas.selectCovered[Token](sentence)

      maxTokensPerSentence.is match {
        case None =>
          processTokens(jcas, tokens)
        case Some(n) if (n > 0 && tokens.size <= n) =>
          processTokens(jcas, tokens)
        case _ =>
        // Sentence exceeds the limit (or limit is non-positive): skip it.
      }
    }
  }

  /**
   * Runs the Maxent tagger over one sentence worth of tokens and attaches
   * a POS annotation to each token.
   */
  def processTokens(jcas: JCas, tokens: Seq[Token]) {
    // TaggedWord is constructed without a tag; the tagger fills tags in.
    val words = tokens.map { token => new TaggedWord(token.getCoveredText) }
    val taggedWords = model.resource.getTagger.tagSentence(words)

    tokens.zipWithIndex.foreach { case (token, idx) =>
      val tag = taggedWords.get(idx).tag()

      val pos = new POS(jcas, token.getBegin, token.getEnd)
      pos.setName(tag)
      add(pos)

      // Back-link the token to its POS annotation.
      token.setPos(pos)
    }
  }
}
class StanfordPosTaggerSpec extends Specification {

  "StanfordPosTagger" should {
    "tag each word in a sentence" in {
      val tagger: AnalysisEngine = new StanfordPosTagger().
        config(
          _.model := SharedBinding[MaxentTaggerResource]("edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger")
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Hallo Welt! Was geht?")
      jcas.annotate[Sentence](0, 10)
      jcas.annotate[Sentence](12, 20)
      jcas.annotate[Token](0, 5)
      jcas.annotate[Token](6, 10)
      jcas.annotate[Token](12, 15)
      jcas.annotate[Token](16, 20)
      tagger.process(jcas)

      // Every token of both sentences receives a POS annotation.
      jcas.select[POS].size must be equalTo(4)
      jcas.selectByIndex[POS](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[POS](1).getCoveredText must be equalTo ("Welt")
      jcas.selectByIndex[POS](2).getCoveredText must be equalTo ("Was")
      jcas.selectByIndex[POS](3).getCoveredText must be equalTo ("geht")
    }

    "tag each word in a sentence if the sentences is short enough" in {
      val tagger: AnalysisEngine = new StanfordPosTagger().
        config(
          _.model := SharedBinding[MaxentTaggerResource]("edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger"),
          _.maxTokensPerSentence := Some(2)
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Hallo Welt! Was geht heute?")
      jcas.annotate[Sentence](0, 10)
      jcas.annotate[Sentence](12, 26)
      jcas.annotate[Token](0, 5)
      jcas.annotate[Token](6, 10)
      jcas.annotate[Token](12, 15)
      jcas.annotate[Token](16, 20)
      jcas.annotate[Token](21, 26)
      tagger.process(jcas)

      // The second sentence has 3 tokens > limit of 2, so only the first
      // sentence's tokens are tagged.
      jcas.select[POS].size must be equalTo(2)
      jcas.selectByIndex[POS](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[POS](1).getCoveredText must be equalTo ("Welt")
    }
  }
}
object UimaSbtPlugin extends Plugin {

  // Dedicated ivy/sbt configuration under which UIMA settings are scoped.
  val uimaConfig = config("uima")

  // Manually-invokable tasks: JCas source generation and the UIMA CAS
  // Visual Debugger GUI.
  val jcasGen = TaskKey[Unit]("jcasgen")
  val visualDebugger = TaskKey[Unit]("visualDebugger")

  /**
   * Settings to add to a project: type-system XML descriptors live in
   * resources/desc/types, generated Java goes into managed sources and is
   * wired into compilation and `clean`.
   */
  def uimaScalaSettings = Seq(
    sourceDirectory in uimaConfig <<= (resourceDirectory in Compile) { _ / "desc" / "types" },
    javaSource in uimaConfig <<= (sourceManaged in Compile) { _ / "java" },
    sourceGenerators in Compile <+= generateTypeSystemSourcesTask,
    managedSourceDirectories in Compile <+= (javaSource in uimaConfig),
    cleanFiles <+= (javaSource in uimaConfig),
    jcasGen <<= jcasGenTask,
    visualDebugger <<= visualDebuggerTask
  )

  // Task wrapper around generateTypeSystemSources, bound to the uima-scoped
  // source and target directories.
  def generateTypeSystemSourcesTask =
    (sourceDirectory in uimaConfig, javaSource in uimaConfig) map { (srcDir, targetDir) =>
      generateTypeSystemSources(srcDir, targetDir)
    }

  /**
   * Runs UIMA's JCasGen over every type-system XML descriptor in srcDir,
   * emitting Java sources into targetDir using the uimaScala templates.
   * Returns all .java files found under targetDir afterwards.
   */
  def generateTypeSystemSources(srcDir: File, targetDir: File): Seq[File] = {
    (srcDir ** "*.xml").get foreach { filename =>
      val xmlIS = new XMLInputSource(filename)
      val tsd = UIMAFramework.getXMLParser.parseTypeSystemDescription(xmlIS)
      // A CAS is needed so JCasGen can resolve the merged type system.
      val cas = CasCreationUtils.createCas(tsd, null, null)
      val jg = new Jg()
      jg.mainGenerateAllTypesFromTemplates(
        null, new UimaLoggerProgressMonitor(), new LogThrowErrorImpl(),
        filename.getAbsolutePath, targetDir.getAbsolutePath, tsd.getTypes,
        cas.asInstanceOf[CASImpl], classOf[UimaScalaTypeTemplate],
        classOf[UimaScala_TypeTemplate], "", false, null
      )
    }

    (targetDir ** "*.java").get
  }

  // Launches the interactive JCasGen tool; executeTrapExit keeps a
  // System.exit inside the tool from killing sbt itself.
  def jcasGenTask =
    (streams) map { streams =>
      Run.executeTrapExit(
        (new Jg()).main0(Array[String](), null, null, new LogThrowErrorImpl()),
        streams.log
      )
      ()
    }

  // Launches the UIMA CAS Visual Debugger GUI, likewise exit-trapped.
  def visualDebuggerTask =
    (streams) map { streams =>
      Run.executeTrapExit (
        org.apache.uima.tools.cvd.CVD.main(Array[String]()),
        streams.log
      )
      ()
    }

}
stringBuffer.append(""" 25 | import org.apache.uima.jcas.JCas; 26 | import org.apache.uima.jcas.JCasRegistry; 27 | import org.apache.uima.jcas.cas.TOP_Type; 28 | 29 | """); 30 | 31 | jg.collectImports(td, false).foreach { imp => 32 | stringBuffer.append(s"""import $imp;"""); 33 | stringBuffer.append("\n"); 34 | } 35 | 36 | stringBuffer.append("\n\n"); 37 | 38 | val typeName = jg.getJavaName(td); 39 | val typeName_Type = typeName + "_Type"; 40 | val jcasTypeCasted = "((" + typeName_Type + ")jcasType)"; 41 | 42 | stringBuffer.append(s"""/** ${jg.nullBlank(td.getDescription())} */ 43 | public class ${typeName} extends ${jg.getJavaName(td.getSupertypeName())} { 44 | @SuppressWarnings ("hiding") 45 | public final static int typeIndexID = JCasRegistry.register(${typeName}.class); 46 | @SuppressWarnings ("hiding") 47 | public final static int type = typeIndexID; 48 | @Override 49 | public int getTypeIndexID() {return typeIndexID;} 50 | 51 | /** Never called. Disable default constructor */ 52 | protected ${typeName}() {/* intentionally empty block */} 53 | 54 | /** Internal - constructor used by generator 55 | * 56 | * @param addr low level Feature Structure reference 57 | * @param type the type of this Feature Structure 58 | */ 59 | public ${typeName}(int addr, TOP_Type type) { 60 | super(addr, type); 61 | readObject(); 62 | } 63 | 64 | /** 65 | * @param jcas JCas to which this Feature Structure belongs 66 | */ 67 | public ${typeName}(JCas jcas) { 68 | super(jcas); 69 | readObject(); 70 | } 71 | """); 72 | 73 | if (jg.isSubTypeOfAnnotation(td)) { 74 | stringBuffer.append(s""" 75 | /** 76 | * @param jcas JCas to which this Feature Structure belongs 77 | * @param begin offset to the begin spot in the SofA 78 | * @param end offset to the end spot in the SofA 79 | */ 80 | public ${typeName}(JCas jcas, int begin, int end) { 81 | super(jcas); 82 | setBegin(begin); 83 | setEnd(end); 84 | readObject(); 85 | } 86 | """); 87 | } 88 | 89 | stringBuffer.append(s""" 90 | /** 91 | * 92 
| * Write your own initialization here 93 | * 94 | * 95 | */ 96 | private void readObject() {/*default - does nothing empty block */} 97 | 98 | """); 99 | 100 | td.getFeatures().foreach { fd => 101 | val featName = fd.getName(); 102 | val featUName = jg.uc1(featName); // upper case first letter 103 | if (Jg.reservedFeatureNames.contains(featUName)) 104 | jg.error.newError(IError.ERROR, 105 | jg.getString("reservedNameUsed", Array.apply[Object](featName, td.getName)), 106 | null); 107 | 108 | val featDesc = jg.nullBlank(fd.getDescription()); 109 | val featDescCmt = featDesc; 110 | 111 | val rangeType = jg.getJavaRangeType(fd); 112 | val elemType = jg.getJavaRangeArrayElementType(fd); 113 | 114 | stringBuffer.append(s""" 115 | 116 | //*--------------* 117 | //* Feature: ${featName} 118 | 119 | /** getter for ${featName} - gets ${featDescCmt} 120 | * @return value of the feature 121 | */ 122 | public ${rangeType} get${featUName}() { 123 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 124 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName}"); 125 | return ${jg.getFeatureValue(fd, td)};} 126 | 127 | /** setter for ${featName} - sets ${featDescCmt} 128 | * @param v value to set into the feature 129 | */ 130 | public void set${featUName}(${rangeType} v) { 131 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 132 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}"); 133 | ${jg.setFeatureValue(fd, td)};} 134 | """); 135 | 136 | if (jg.hasArrayRange(fd)) { 137 | stringBuffer.append(s""" 138 | /** indexed getter for ${featName} - gets an indexed value - ${featDescCmt} 139 | * @param i index in the array to get 140 | * @return value of the element at index i 141 | */ 142 | public ${elemType} get${featUName}(int i) { 143 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 144 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}"); 145 | 
jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ${jcasTypeCasted}.casFeatCode_${featName}), i); 146 | return ${jg.getArrayFeatureValue(fd, td)};} 147 | 148 | /** indexed setter for ${featName} - sets an indexed value - ${featDescCmt} 149 | * @param i index in the array to set 150 | * @param v value to set into the array 151 | */ 152 | public void set${featUName}(int i, ${elemType} v) { 153 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 154 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}"); 155 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ${jcasTypeCasted}.casFeatCode_${featName}), i); 156 | ${jg.setArrayFeatureValue(fd, td)};} 157 | """); 158 | } /* of hasArray */ 159 | 160 | stringBuffer.append(""); 161 | 162 | } /* of Features iteration */ 163 | 164 | stringBuffer.append(""); 165 | 166 | if (td.getName().equals("uima.cas.Annotation")) { 167 | stringBuffer.append(" "); 168 | stringBuffer.append(""" /** Constructor with begin and end passed as arguments 169 | * @param jcas JCas this Annotation is in 170 | * @param begin the begin offset 171 | * @param end the end offset 172 | */ 173 | public Annotation(JCas jcas, int begin, int end) { 174 | this(jcas); // forward to constructor 175 | this.setBegin(begin); 176 | this.setEnd(end); 177 | } 178 | 179 | /** @see org.apache.uima.cas.text.AnnotationFS#getCoveredText() 180 | * @return the covered Text 181 | */ 182 | public String getCoveredText() { 183 | final CAS casView = this.getView(); 184 | final String text = casView.getDocumentText(); 185 | if (text == null) { 186 | return null; 187 | } 188 | return text.substring(getBegin(), getEnd()); 189 | } 190 | 191 | /** @deprecated 192 | * @return the begin offset 193 | */ 194 | public int getStart() {return getBegin();} 195 | """); 196 | stringBuffer.append(""); 197 | } /* of Annotation if-statement */ 198 | stringBuffer.append("}\n\n "); 199 | return stringBuffer.toString(); 
200 | } 201 | } 202 | -------------------------------------------------------------------------------- /sbt-plugin/src/main/scala/com/github/jenshaase/uimascala/sbt/UimaScala_TypeTemplate.scala: -------------------------------------------------------------------------------- 1 | package org.apache.uima.tools.jcasgen 2 | 3 | import org.apache.uima.resource.metadata.TypeDescription 4 | import scala.collection.JavaConversions._ 5 | 6 | class UimaScala_TypeTemplate extends Jg.IJCasTypeTemplate { 7 | 8 | def generate(argument: Any): String = { 9 | val args: Array[Any] = argument.asInstanceOf[Array[Any]] 10 | val jg = args(0).asInstanceOf[Jg] 11 | val td = args(1).asInstanceOf[TypeDescription] 12 | val stringBuffer = new StringBuffer() 13 | 14 | jg.packageName = jg.getJavaPkg(td); 15 | if (0 != jg.packageName.length()) { 16 | stringBuffer.append("package "); 17 | stringBuffer.append(jg.packageName); 18 | stringBuffer.append(";\n"); 19 | } 20 | stringBuffer.append(""" 21 | import org.apache.uima.jcas.JCas; 22 | import org.apache.uima.jcas.JCasRegistry; 23 | import org.apache.uima.cas.impl.CASImpl; 24 | import org.apache.uima.cas.impl.FSGenerator; 25 | import org.apache.uima.cas.FeatureStructure; 26 | import org.apache.uima.cas.impl.TypeImpl; 27 | import org.apache.uima.cas.Type; 28 | """); 29 | 30 | if (td.getFeatures().length > 0) { 31 | stringBuffer.append("""import org.apache.uima.cas.impl.FeatureImpl; 32 | import org.apache.uima.cas.Feature; 33 | """); 34 | } 35 | 36 | stringBuffer.append(""); 37 | 38 | jg.collectImports(td, true).foreach { imp => 39 | if (!imp.equals(jg.getJavaNameWithPkg(td.getName()+"_Type"))) { 40 | stringBuffer.append(s"""import ${imp};""") 41 | stringBuffer.append("\n") 42 | } 43 | } 44 | 45 | stringBuffer.append("\n"); 46 | val typeName = jg.getJavaName(td); 47 | val typeName_Type = typeName + "_Type"; 48 | stringBuffer.append(s"""/** ${jg.nullBlank(td.getDescription())} */ 49 | public class ${typeName_Type} extends 
${jg.getJavaName(td.getSupertypeName())}_Type { 50 | /** 51 | * @return the generator for this type 52 | */ 53 | @Override 54 | protected FSGenerator getFSGenerator() {return fsGenerator;} 55 | 56 | private final FSGenerator fsGenerator = 57 | new FSGenerator() { 58 | public FeatureStructure createFS(int addr, CASImpl cas) { 59 | if (${typeName_Type}.this.useExistingInstance) { 60 | // Return eq fs instance if already created 61 | FeatureStructure fs = ${typeName_Type}.this.jcas.getJfsFromCaddr(addr); 62 | if (null == fs) { 63 | fs = new ${typeName}(addr, ${typeName_Type}.this); 64 | ${typeName_Type}.this.jcas.putJfsFromCaddr(addr, fs); 65 | return fs; 66 | } 67 | return fs; 68 | } else return new ${typeName}(addr, ${typeName_Type}.this); 69 | } 70 | }; 71 | 72 | @SuppressWarnings ("hiding") 73 | public final static int typeIndexID = ${typeName}.typeIndexID; 74 | 75 | @SuppressWarnings ("hiding") 76 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("${td.getName()}"); 77 | """); 78 | 79 | 80 | td.getFeatures().foreach { fd => 81 | val featName = fd.getName(); 82 | val featUName = jg.uc1(featName); // upper case first letter 83 | 84 | val rangeType = jg.getJavaRangeType(fd); 85 | val getSetNamePart = jg.sc(rangeType); 86 | val returnType = if (getSetNamePart.equals("Ref")) "int" else rangeType; 87 | val getSetArrayNamePart = jg.getGetSetArrayNamePart(fd); 88 | 89 | val elemType = 90 | if (jg.sc(jg.getJavaRangeArrayElementType(fd)).equals("Ref")) { 91 | "int"; 92 | } else { 93 | jg.getJavaRangeArrayElementType(fd); 94 | } 95 | val casFeatCode = "casFeatCode_" + featName; 96 | 97 | stringBuffer.append(s""" 98 | final Feature casFeat_${featName}; 99 | final int ${casFeatCode}; 100 | /** 101 | * @param addr low level Feature Structure reference 102 | * @return the feature value 103 | */ 104 | public ${returnType} get${featUName}(int addr) { 105 | if (featOkTst && casFeat_${featName} == null) 106 | jcas.throwFeatMissing("${featName}", 
"${td.getName()}"); 107 | return ll_cas.ll_get${getSetNamePart}Value(addr, ${casFeatCode}); 108 | } 109 | /** 110 | * @param addr low level Feature Structure reference 111 | * @param v value to set 112 | */ 113 | public void set${featUName}(int addr, ${returnType} v) { 114 | if (featOkTst && casFeat_${featName} == null) 115 | jcas.throwFeatMissing("${featName}", "${td.getName()}"); 116 | ll_cas.ll_set${getSetNamePart}Value(addr, ${casFeatCode}, v);} 117 | 118 | """); 119 | 120 | if (jg.hasArrayRange(fd)) { 121 | stringBuffer.append(s""" 122 | /** 123 | * @param addr low level Feature Structure reference 124 | * @param i index of item in the array 125 | * @return value at index i in the array 126 | */ 127 | public ${elemType} get${featUName}(int addr, int i) { 128 | if (featOkTst && casFeat_${featName} == null) 129 | jcas.throwFeatMissing("${featName}", "${td.getName()}"); 130 | if (lowLevelTypeChecks) 131 | return ll_cas.ll_get${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, true); 132 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i); 133 | return ll_cas.ll_get${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i); 134 | } 135 | 136 | /** 137 | * @param addr low level Feature Structure reference 138 | * @param i index of item in the array 139 | * @param v value to set 140 | */ 141 | public void set${featUName}(int addr, int i, ${elemType} v) { 142 | if (featOkTst && casFeat_${featName} == null) 143 | jcas.throwFeatMissing("${featName}", "${td.getName}"); 144 | if (lowLevelTypeChecks) 145 | ll_cas.ll_set${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, v, true); 146 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i); 147 | ll_cas.ll_set${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, v); 148 | } 149 | """); 150 | } 151 | stringBuffer.append(" \n"); 152 | } 153 | 154 | stringBuffer.append("\n"); 155 | 156 
| if (td.getName().equals("uima.cas.Annotation")) { 157 | stringBuffer.append(" "); 158 | stringBuffer.append(s""" /** @see org.apache.uima.cas.text.AnnotationFS#getCoveredText() 159 | * @param inst the low level Feature Structure reference 160 | * @return the covered text 161 | */ 162 | public String getCoveredText(int inst) { 163 | final CASImpl casView = ll_cas.ll_getSofaCasView(inst); 164 | final String text = casView.getDocumentText(); 165 | if (text == null) { 166 | return null; 167 | } 168 | return text.substring(getBegin(inst), getEnd(inst)); 169 | } 170 | """); 171 | } /* of Annotation if-statement */ 172 | 173 | stringBuffer.append(s""" 174 | 175 | /** initialize variables to correspond with Cas Type and Features 176 | * @param jcas JCas 177 | * @param casType Type 178 | */ 179 | public ${typeName_Type}(JCas jcas, Type casType) { 180 | super(jcas, casType); 181 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 182 | 183 | """); 184 | td.getFeatures().foreach { fd => 185 | val featName = fd.getName(); 186 | 187 | stringBuffer.append(s""" 188 | casFeat_${featName} = jcas.getRequiredFeatureDE(casType, "${featName}", "${fd.getRangeTypeName()}", featOkTst); 189 | casFeatCode_${featName} = (null == casFeat_${featName}) ? 
object ArkTweetTokenizer {
  /** Applies Twokenize's tagger-oriented text normalization to a raw tweet. */
  def normalizeTweet(tweet: String): String =
    Twokenize.normalizeTextForTagger(tweet)
}

class ArkTweetTokenizer extends SCasAnnotator_ImplBase {

  /**
   * Tokenizes the document text with CMU's Twokenize and adds one Token
   * annotation per token.
   *
   * Twokenize only returns token strings, so offsets are recovered by
   * searching for each token in the original text, starting at the end of
   * the previous match.
   */
  def process(jcas: JCas) = {
    val txt = jcas.getDocumentText

    Twokenize.tokenize(txt).foldLeft(0) { (offset, token) =>
      val start = txt.indexOf(token, offset)
      if (start < 0) {
        // Fix: indexOf returns -1 when a token cannot be located verbatim
        // (e.g. because Twokenize normalized it). Previously this produced
        // a Token with begin = -1; skip such tokens instead and keep the
        // search position unchanged.
        offset
      } else {
        val end = start + token.length
        add(createToken(jcas, start, end))
        end
      }
    }
  }

  def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
class ArkTweetTokenizerSpec extends Specification {

  "Ark Tweet Tokenizer" should {
    // NOTE(review): the document-text literal below appears HTML-unescaped
    // by the extraction of this file; the expected tokens "&", "amp", ";"
    // imply the original source literal is "&amp;" — verify against the
    // repository before relying on these bytes.
    "annotate all tokens in a tweet" in {
      val tokenizer: AnalysisEngine = new ArkTweetTokenizer().asAnalysisEngine

      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("This is a test & a thing #hash #tag bit.ly/link")
      tokenizer.process(jcas)

      jcas.select[Token].size must be equalTo(12)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("This")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("is")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("a")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("test")
      jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("&")
      jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("amp")
      jcas.selectByIndex[Token](6).getCoveredText must be equalTo (";")
      jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("a")
      jcas.selectByIndex[Token](8).getCoveredText must be equalTo ("thing")
      jcas.selectByIndex[Token](9).getCoveredText must be equalTo ("#hash")
      jcas.selectByIndex[Token](10).getCoveredText must be equalTo ("#tag")
      jcas.selectByIndex[Token](11).getCoveredText must be equalTo ("bit.ly/link")
    }

    "it should normalize a tweet" in {
      ArkTweetTokenizer.normalizeTweet("This is a test & a thing #hash #tag bit.ly/link") must be equalTo (
        "This is a test & a thing #hash #tag bit.ly/link"
      )
    }
  }
}
/**
 * Sentence and token segmenter based on `java.text.BreakIterator`.
 *
 * The document language (when set on the CAS) selects the locale used for
 * the break rules; otherwise the configured `locale` parameter applies.
 *
 * @author Jens Haase
 */
class BreakIteratorSegmenter extends SCasAnnotator_ImplBase {

  // Fallback locale used when the CAS carries no usable document language.
  object locale extends Parameter[Locale](Locale.getDefault)

  def process(jcas: JCas) = {
    val sentences = BreakIterator.getSentenceInstance(getLocale(jcas))
    sentences.setText(jcas.getDocumentText)

    forEachBoundary(sentences) { (begin, end) =>
      val sentence = addIfNotEmpty(createSentence(jcas, begin, end).trim)
      // Tokenize relative to the untrimmed sentence start; empty tokens
      // produced by trimming are dropped by addIfNotEmpty.
      processSentence(jcas, sentence.getCoveredText, begin)
    }
  }

  /** Adds `Token` annotations for one sentence; `offset` is the sentence's
   *  begin position in the document text. */
  def processSentence(jcas: JCas, sentence: String, offset: Int) = {
    val words = BreakIterator.getWordInstance(getLocale(jcas))
    words.setText(sentence)

    forEachBoundary(words) { (begin, end) =>
      addIfNotEmpty(createToken(jcas, begin + offset, end + offset).trim)
    }
  }

  // Walks all [begin, end) spans produced by a BreakIterator.
  private def forEachBoundary(bi: BreakIterator)(f: (Int, Int) => Unit): Unit = {
    var begin = bi.first
    var end = bi.next
    while (end != BreakIterator.DONE) {
      f(begin, end)
      begin = end
      end = bi.next
    }
  }

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  /** Locale from the CAS document language when present and specified,
   *  otherwise the configured fallback. */
  protected def getLocale(jcas: JCas): Locale = {
    val lang = jcas.getDocumentLanguage()
    if (lang != null && lang != "x-unspecified") new Locale(lang)
    else locale.is
  }
}
/** Segments text into sentences with `java.text.BreakIterator` and into
 *  tokens with Lucene's `StandardTokenizer`. */
class LuceneTokenizer extends SCasAnnotator_ImplBase {

  def process(jcas: JCas) = {
    val bi = BreakIterator.getSentenceInstance()
    bi.setText(jcas.getDocumentText)

    var last = bi.first
    var cur = bi.next
    while (cur != BreakIterator.DONE) {
      val sentence = addIfNotEmpty(createSentence(jcas, last, cur).trim)
      // Token offsets are computed relative to the untrimmed sentence start.
      processSentence(jcas, sentence.getCoveredText, last)

      last = cur
      cur = bi.next
    }
  }

  /** Adds `Token` annotations for one sentence; `offset` is the sentence's
   *  begin position in the document text. */
  def processSentence(jcas: JCas, sentence: String, offset: Int) = {
    val tokenizer = new StandardTokenizer()
    tokenizer.setReader(new java.io.StringReader(sentence))
    tokenizer.reset()
    try {
      while (tokenizer.incrementToken()) {
        val tokenOffset = tokenizer.getAttribute(classOf[OffsetAttribute])
        add(createToken(jcas, offset + tokenOffset.startOffset, offset + tokenOffset.endOffset))
      }
      tokenizer.end()
    } finally {
      // Previously end()/close() were skipped when incrementToken() threw,
      // leaking the analyzer resources. Always release the TokenStream.
      tokenizer.close()
    }
  }

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
/** Shared helper for loading OpenNLP model streams from the classpath.
 *  Extracted because the sentence and token resources previously
 *  duplicated this logic verbatim. */
private[segmenter] object OpenNlpModelStreams {

  /** Opens `uri` as a classpath resource, decompressing `.gz` files on the
   *  fly. The caller is responsible for closing the returned stream.
   *  @throws java.io.FileNotFoundException if the resource does not exist
   *          (previously this failed with a NullPointerException). */
  def openClasspathStream(uri: String): java.io.InputStream = {
    val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
    val resource = getClass.getResource(resourceUri)
    if (resource == null)
      throw new java.io.FileNotFoundException("Model resource not found: " + resourceUri)

    if (uri.endsWith(".gz")) new GZIPInputStream(resource.openStream)
    else resource.openStream
  }
}

/** UIMA shared resource holding an OpenNLP sentence detector model. */
class OpenNlpSentenceSegmenterResource extends SharedResourceObject {
  private var model: SentenceDetectorME = _

  /** Loads the model from the file system when the URI points to an
   *  existing file, otherwise from the classpath. */
  def load(data: DataResource) {
    val uri = data.getUri.toString
    val file = new java.io.File(uri)

    if (file.exists) {
      model = new SentenceDetectorME(new SentenceModel(file))
    } else {
      val is = OpenNlpModelStreams.openClasspathStream(uri)
      try {
        model = new SentenceDetectorME(new SentenceModel(is))
      } finally {
        is.close() // the model constructor reads the stream fully; do not leak it
      }
    }
  }

  def getModel = model
}

/** UIMA shared resource holding an OpenNLP tokenizer model. */
class OpenNlpTokenSegmenterResource extends SharedResourceObject {
  private var model: TokenizerME = _

  /** Loads the model from the file system when the URI points to an
   *  existing file, otherwise from the classpath. */
  def load(data: DataResource) {
    val uri = data.getUri.toString
    val file = new java.io.File(uri)

    if (file.exists) {
      model = new TokenizerME(new TokenizerModel(file))
    } else {
      val is = OpenNlpModelStreams.openClasspathStream(uri)
      try {
        model = new TokenizerME(new TokenizerModel(is))
      } finally {
        is.close() // the model constructor reads the stream fully; do not leak it
      }
    }
  }

  def getModel = model
}

/** Annotates sentences and tokens using OpenNLP maxent models supplied via
 *  the two shared resources above. */
class OpenNlpSegmenter extends SCasAnnotator_ImplBase {

  object sentenceModel extends SharedResource[OpenNlpSentenceSegmenterResource]("")
  object tokenModel extends SharedResource[OpenNlpTokenSegmenterResource]("")

  def process(jcas: JCas) = {
    sentenceModel.resource.getModel.sentPosDetect(jcas.getDocumentText).foreach { span =>
      add(createSentence(jcas, span.getStart, span.getEnd))
    }

    // Tokenize each detected sentence; token spans are relative to the
    // sentence, so shift them by the sentence's begin offset.
    jcas.select[Sentence].foreach { sentence =>
      tokenModel.resource.getModel.tokenizePos(sentence.getCoveredText).foreach { span =>
        add(createToken(jcas, span.getStart + sentence.getStart, span.getEnd + sentence.getStart))
      }
    }
  }

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)
}
-------------------------------------------------------------------------------- /segmenter/open-nlp-segmenter/src/test/scala/com/github/jenshaase/uimascala/segmenter/OpenNlpSegmenterSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.segmenter 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.typesystem._ 5 | import com.github.jenshaase.uimascala.core.configuration._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | 9 | class OpenNlpSegmenterSpec extends Specification { 10 | 11 | "Open Nlp Segmenter" should { 12 | "add sentence and token annotations" in { 13 | val segmenter: AnalysisEngine = new OpenNlpSegmenter(). 14 | config( 15 | _.sentenceModel := SharedBinding[OpenNlpSentenceSegmenterResource]("/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/sentence-de-maxent.bin"), 16 | _.tokenModel := SharedBinding[OpenNlpTokenSegmenterResource]("/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/token-de-maxent.bin") 17 | ). 
/**
 * Splits the document text into `Token` annotations at every match of a
 * configurable separator regex. With `allowEmptyToken` enabled, adjacent
 * separators produce zero-length tokens.
 *
 * @author Jens Haase
 */
class RegexTokenizer extends SCasAnnotator_ImplBase {

  // Separator pattern; defaults to runs of whitespace.
  object regex extends Parameter[Regex]("""\s+""".r)
  // When true, two adjacent separators yield an empty token between them.
  object allowEmptyToken extends Parameter[Boolean](false)

  def process(jcas: JCas) = {
    val text = jcas.getDocumentText

    // Start offset of the token currently being accumulated.
    var tokenStart = 0
    for (m <- getRegex.findAllMatchIn(text)) {
      val emit =
        if (allowEmptyToken.is) m.start >= tokenStart
        else m.start > tokenStart
      if (emit) add(createToken(jcas, tokenStart, m.start))
      tokenStart = m.end
    }

    // Trailing text after the last separator forms the final token.
    if (tokenStart < text.length)
      add(createToken(jcas, tokenStart, text.length))
  }

  protected def getRegex =
    regex.is

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
41 | asAnalysisEngine 42 | 43 | val jcas = tokenizer.newJCas() 44 | jcas.setDocumentText("HalloxWeltxlosxxgehts") 45 | tokenizer.process(jcas) 46 | 47 | jcas.select[Token].size must be equalTo(5) 48 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo") 49 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("Welt") 50 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("los") 51 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("") 52 | jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("gehts") 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /segmenter/stanford-segmenter/src/main/scala/com/github/jenshaase/uimascala/segmenter/StanfordSegmenter.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.segmenter 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import edu.stanford.nlp.ling.TaggedWord 10 | import scala.collection.JavaConversions._ 11 | import java.io.StringReader 12 | import java.util.Properties 13 | import edu.stanford.nlp.ling.{CoreLabel, Word} 14 | import edu.stanford.nlp.international.spanish.process.SpanishTokenizer 15 | import edu.stanford.nlp.international.arabic.process.ArabicTokenizer 16 | import edu.stanford.nlp.international.french.process.FrenchTokenizer 17 | import edu.stanford.nlp.trees.international.pennchinese.CHTBTokenizer 18 | import edu.stanford.nlp.process.{WordToSentenceProcessor, Tokenizer, PTBTokenizer, CoreLabelTokenFactory} 19 | import edu.stanford.nlp.ling.CoreAnnotations.{CharacterOffsetBeginAnnotation, CharacterOffsetEndAnnotation} 20 | import 
/** Tokenizes and sentence-splits text with the language-specific Stanford
 *  CoreNLP tokenizers. The tokenizer is chosen from the CAS document
 *  language, falling back to `fallbackLanguage` when configured. */
class StanfordSegmenter extends SCasAnnotator_ImplBase {

  object annotateToken extends Parameter[Boolean](true)
  object annotateSentence extends Parameter[Boolean](true)
  object fallbackLanguage extends Parameter[Option[String]](None) {
    override def mandatory_? = false
  }

  def process(jcas: JCas) = {
    if (annotateToken.is) annotateTokens(jcas)
    if (annotateSentence.is) annotateSentences(jcas)
  }

  /** Adds `Token` annotations. CoreLabel tokens carry their own offsets;
   *  plain String/Word tokens are re-located in the document text. */
  def annotateTokens(jcas: JCas) {
    val text = jcas.getDocumentText
    val tokenizer = getTokenizer(jcas.getDocumentLanguage, text)

    var offsetInSentence = 0
    tokenizer.tokenize().foreach {
      case token: String =>
        offsetInSentence = addTokenByText(jcas, text, token, offsetInSentence)

      case label: CoreLabel =>
        add(createToken(jcas, label.beginPosition, label.endPosition))
        offsetInSentence = label.endPosition

      case word: Word =>
        offsetInSentence = addTokenByText(jcas, text, word.word, offsetInSentence)

      case other =>
        // Previously an unexpected element type raised an opaque MatchError.
        throw new Exception("Unsupported token type from tokenizer: " +
          (if (other == null) "null" else other.getClass.getName))
    }
  }

  /** Locates `token` in `text` at or after `offset` (skipping whitespace),
   *  adds a Token annotation and returns the offset after it. Shared by the
   *  String and Word cases, which previously duplicated this logic. */
  private def addTokenByText(jcas: JCas, text: String, token: String, offset: Int): Int = {
    val start = skipWhitespace(text, offset)
    if (!text.startsWith(token, start)) {
      throw new Exception("Text mismatch in Tokenizer: " + token + " not found")
    }
    add(createToken(jcas, start, start + token.size))
    start + token.size
  }

  /** Groups the already-annotated tokens into `Sentence` annotations using
   *  Stanford's WordToSentenceProcessor. */
  def annotateSentences(jcas: JCas) {
    val tokens = jcas.select[Token].map { token =>
      val label = new CoreLabel()
      label.setBeginPosition(token.getBegin)
      label.setEndPosition(token.getEnd)
      label.setWord(token.getCoveredText)
      label
    }.toList

    val proc = new WordToSentenceProcessor[CoreLabel]()
    proc.process(tokens).foreach { sentence =>
      add(createSentence(jcas, sentence.head.beginPosition, sentence.last.endPosition))
    }
  }

  /** Returns the first non-whitespace position at or after `offset`. */
  protected def skipWhitespace(text: String, offset: Int): Int = {
    var newOffset = offset
    while (newOffset < text.size && Character.isWhitespace(text.charAt(newOffset))) {
      newOffset = newOffset + 1
    }
    newOffset
  }

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  /** Tokenizer for `lang`, falling back to `fallbackLanguage` when the
   *  primary language is unsupported. */
  protected def getTokenizer(lang: String, text: String): Tokenizer[_] = {
    getTokenizerFromLanguage(lang, text) match {
      case Some(tokenizer) => tokenizer
      case None =>
        fallbackLanguage.is.flatMap { lang =>
          getTokenizerFromLanguage(lang, text)
        }.getOrElse(
          throw new Exception("can not create tokenizer for language: " + lang)
        )
    }
  }

  private def getTokenizerFromLanguage(lang: String, text: String): Option[Tokenizer[_]] =
    lang match {
      case "ar" => Some(ArabicTokenizer.newArabicTokenizer(new StringReader(text), new Properties()))
      case "en" => Some(new PTBTokenizer[CoreLabel](new StringReader(text), new CoreLabelTokenFactory(), "invertible"))
      case "es" => Some(SpanishTokenizer.factory(new CoreLabelTokenFactory(), null).getTokenizer(new StringReader(text)))
      case "fr" => Some(FrenchTokenizer.factory().getTokenizer(new StringReader(text), "tokenizeNLs=false"))
      case "de" => Some(new NegraPennTokenizer(new StringReader(text)))
      case "zh" => Some(new CHTBTokenizer(new StringReader(text)))
      case _ => None
    }
}
/segmenter/stanford-segmenter/src/test/scala/com/github/jenshaase/uimascala/segmenter/StanfordSegmenterSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.segmenter 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.util.JCasUtil 11 | 12 | class StanfordSegmenterSpec extends Specification { 13 | 14 | "StanfordSegmenter" should { 15 | "segment english sentences and tokens" in { 16 | val segmenter: AnalysisEngine = new StanfordSegmenter(). 17 | asAnalysisEngine 18 | 19 | val jcas = segmenter.newJCas() 20 | jcas.setDocumentText("This is a text. Here we are! ") 21 | jcas.setDocumentLanguage("en") 22 | segmenter.process(jcas) 23 | 24 | jcas.select[Token].size must be equalTo(9) 25 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("This") 26 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("is") 27 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("a") 28 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("text") 29 | jcas.selectByIndex[Token](4).getCoveredText must be equalTo (".") 30 | jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("Here") 31 | jcas.selectByIndex[Token](6).getCoveredText must be equalTo ("we") 32 | jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("are") 33 | jcas.selectByIndex[Token](8).getCoveredText must be equalTo ("!") 34 | 35 | jcas.select[Sentence].size must be equalTo(2) 36 | jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("This is a text.") 37 | jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("Here we are!") 38 | } 39 
| 40 | "segment french sentences and tokens" in { 41 | val segmenter: AnalysisEngine = new StanfordSegmenter(). 42 | asAnalysisEngine 43 | 44 | val jcas = segmenter.newJCas() 45 | jcas.setDocumentText("Bonjour à tous. C'est parti!") 46 | jcas.setDocumentLanguage("fr") 47 | segmenter.process(jcas) 48 | 49 | jcas.select[Token].size must be equalTo(8) 50 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Bonjour") 51 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("à") 52 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("tous") 53 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo (".") 54 | jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("C'") 55 | jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("est") 56 | jcas.selectByIndex[Token](6).getCoveredText must be equalTo ("parti") 57 | jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("!") 58 | 59 | jcas.select[Sentence].size must be equalTo(2) 60 | jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("Bonjour à tous.") 61 | jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("C'est parti!") 62 | } 63 | 64 | "segment english text without a point" in { 65 | val segmenter: AnalysisEngine = new StanfordSegmenter(). 
/** A `RegexTokenizer` that always splits on runs of whitespace,
 *  regardless of the configured `regex` parameter. */
class WhitespaceTokenizer extends RegexTokenizer {

  // Separator: one or more whitespace characters.
  private val whitespaceSeparator: Regex = """\s+""".r

  override def getRegex = whitespaceSeparator
}
jcas.select[Token].size must be equalTo(4) 25 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo") 26 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("Welt") 27 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("los") 28 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("gehts") 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /type-system/src/main/resources/META-INF/org.apache.uima.fit/types.txt: -------------------------------------------------------------------------------- 1 | classpath*:desc/types/**/*.xml 2 | -------------------------------------------------------------------------------- /type-system/src/main/resources/desc/types/TypeSystem.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | TypeSystem 4 | 5 | 6 | com.github.jenshaase.uimascala.typesystem.Token 7 | 8 | uima.tcas.Annotation 9 | 10 | 11 | pos 12 | com.github.jenshaase.uimascala.typesystem.POS 13 | 14 | 15 | lemma 16 | com.github.jenshaase.uimascala.typesystem.Lemma 17 | 18 | 19 | parent 20 | uima.tcas.Annotation 21 | 22 | 23 | 24 | 25 | 26 | com.github.jenshaase.uimascala.typesystem.Sentence 27 | 28 | uima.tcas.Annotation 29 | 30 | 31 | 32 | 33 | 34 | com.github.jenshaase.uimascala.typesystem.POS 35 | 36 | uima.tcas.Annotation 37 | 38 | 39 | name 40 | uima.cas.String 41 | 42 | 43 | 44 | 45 | 46 | com.github.jenshaase.uimascala.typesystem.Lemma 47 | 48 | uima.tcas.Annotation 49 | 50 | 51 | value 52 | uima.cas.String 53 | 54 | 55 | 56 | 57 | 58 | com.github.jenshaase.uimascala.typesystem.Dependency 59 | 60 | uima.tcas.Annotation 61 | 62 | 63 | governor 64 | com.github.jenshaase.uimascala.typesystem.Token 65 | 66 | 67 | dependent 68 | com.github.jenshaase.uimascala.typesystem.Token 69 | 70 | 71 | dependencyType 72 | uima.cas.String 73 | 74 | 75 | 76 | 77 | 78 | com.github.jenshaase.uimascala.typesystem.DependencyRoot 79 | 80 | 
com.github.jenshaase.uimascala.typesystem.Dependency 81 | 82 | 83 | 84 | com.github.jenshaase.uimascala.typesystem.Constituent 85 | 86 | uima.tcas.Annotation 87 | 88 | 89 | constituentType 90 | uima.cas.String 91 | 92 | 93 | parent 94 | uima.tcas.Annotation 95 | 96 | 97 | children 98 | uima.cas.FSArray 99 | uima.tcas.Annotation 100 | 101 | 102 | syntacticFunction 103 | uima.cas.String 104 | 105 | 106 | 107 | 108 | 109 | com.github.jenshaase.uimascala.typesystem.NamedEntity 110 | 111 | uima.tcas.Annotation 112 | 113 | 114 | value 115 | uima.cas.String 116 | 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.6.2-SNAPSHOT" 2 | --------------------------------------------------------------------------------