├── .gitignore ├── .travis.yml ├── LICENSE ├── README.markdown ├── build.sbt ├── core ├── .scala_dependencies └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── core │ │ ├── AsAnalysisEngine.scala │ │ ├── Converter.scala │ │ ├── SCasAnnotator_ImplBase.scala │ │ ├── SCasCollectionReader_ImplBase.scala │ │ ├── SCasConsumer_ImplBase.scala │ │ ├── SCasFlowController_ImplBase.scala │ │ ├── SCasMultiplier_ImplBase.scala │ │ ├── SimplePipeline.scala │ │ ├── XmlDescriptor.scala │ │ ├── configuration │ │ ├── ConfigurationInitialization.scala │ │ ├── Parameter.scala │ │ ├── Resource.scala │ │ └── ResourceInitialization.scala │ │ ├── package.scala │ │ ├── stream │ │ ├── annotators.scala │ │ └── package.scala │ │ └── wrapper │ │ ├── AnnotationWrapper.scala │ │ └── JCasWrapper.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── core │ ├── ConverterSpec.scala │ ├── SCasAnnotator_ImplBaseSpecs.scala │ ├── SimplePipelineSpecs.scala │ ├── configuration │ ├── ConfigurationInitalizationSpec.scala │ ├── ParameterSpec.scala │ ├── ResourceInitializationSpec.scala │ └── ResourceSpec.scala │ ├── stream │ └── annotatorsSpec.scala │ ├── util │ └── Helper.scala │ └── wrapper │ ├── AnnotationWrapperSpec.scala │ └── JCasWrapperSpec.scala ├── language-identification └── n-gram-language-identifier │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── languageidentifier │ │ └── NGramLanguageIdentifier.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── languageidentifier │ └── NGramLanguageIdentifierSpec.scala ├── lemmatizer └── mate-lemmatizer │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── lemmatizer │ │ └── MateLemmatizer.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── lemmatizer │ └── MateLemmatizerSpec.scala ├── 
name-entity-recognizer └── stanford-ner │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── ner │ │ └── StanfordNer.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── ner │ └── StanfordNerSpec.scala ├── parser ├── mate-parser │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── parser │ │ │ └── MateParser.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── parser │ │ └── MateParserSpec.scala └── stanford-parser │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── parser │ │ └── StanfordParser.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── parser │ └── StanfordParserSpec.scala ├── part-of-speech-tagger ├── ark-tweet-pos-tagger │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── pos │ │ │ └── ArkTweetPosTagger.scala │ │ └── test │ │ ├── resources │ │ └── model.20120919 │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── pos │ │ └── ArkTweetPosTaggerSpec.scala ├── mate-pos-tagger │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── pos │ │ │ └── MatePosTagger.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── pos │ │ └── MatePosTaggerSpec.scala └── stanford-pos-tagger │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── pos │ │ └── StanfordPosTagger.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── pos │ └── StanfordPosTaggerSpec.scala ├── project └── plugins.sbt ├── sbt-plugin ├── build.sbt ├── project │ └── plugin.sbt ├── src │ └── main │ │ └── scala │ │ └── com │ │ 
└── github │ │ └── jenshaase │ │ └── uimascala │ │ └── sbt │ │ ├── UimaSbtPlugin.scala │ │ ├── UimaScalaTypeTemplate.scala │ │ └── UimaScala_TypeTemplate.scala └── version.sbt ├── segmenter ├── ark-tweet-tokenizer │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── ArkTweetTokenizer.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── ArkTweetTokenizerSpec.scala ├── break-iterator-segmenter │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── BreakIteratorSegmenter.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── BreakIteratorSegmenterSpec.scala ├── lucene-tokenizer │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── LuceneTokenizer.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── LuceneTokenizerSpec.scala ├── open-nlp-segmenter │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── OpenNlpSegmenter.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── OpenNlpSegmenterSpec.scala ├── regex-tokenizer │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── RegexTokenizer.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── RegexTokenizerSpec.scala ├── stanford-segmenter │ └── src │ │ ├── main │ │ └── scala │ │ │ └── com │ │ │ └── github │ │ │ └── jenshaase │ │ │ └── uimascala │ │ │ └── segmenter │ │ │ └── 
StanfordSegmenter.scala │ │ └── test │ │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── StanfordSegmenterSpec.scala └── whitespace-tokenizer │ └── src │ ├── main │ └── scala │ │ └── com │ │ └── github │ │ └── jenshaase │ │ └── uimascala │ │ └── segmenter │ │ └── WhitespaceTokenizer.scala │ └── test │ └── scala │ └── com │ └── github │ └── jenshaase │ └── uimascala │ └── segmenter │ └── WhitespaceTokenizerSpec.scala ├── type-system └── src │ └── main │ └── resources │ ├── META-INF │ └── org.apache.uima.fit │ │ └── types.txt │ └── desc │ └── types │ └── TypeSystem.xml └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | lib_managed/ 3 | src_managed/ 4 | project/boot/ 5 | uima-scala-docs/build/ 6 | *.iml 7 | *.ipr 8 | *.iws 9 | /.idea 10 | .scala-dependencies 11 | 12 | # Eclipse 13 | *.pydevproject 14 | .project 15 | .metadata 16 | .history 17 | bin/** 18 | tmp/** 19 | tmp/**/* 20 | *.tmp 21 | *.bak 22 | *.swp 23 | *~.nib 24 | local.properties 25 | .classpath 26 | .settings/ 27 | .loadpath 28 | 29 | # CDT-specific 30 | .cproject 31 | *~ 32 | *.sublime-workspace 33 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.8 4 | jdk: 5 | - oraclejdk8 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This software is licensed under the Apache 2 license, quoted below. 2 | 3 | Copyright 2011 Jens Haase 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); you may not 6 | use this file except in compliance with the License. 
You may obtain a copy of 7 | the License at 8 | 9 | [http://www.apache.org/licenses/LICENSE-2.0] 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 13 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 14 | License for the specific language governing permissions and limitations under 15 | the License. 16 | 17 | --------------- 18 | 19 | Notice: Licenses of dependency projects may be different -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # UimaScala [![Build Status](https://travis-ci.org/jenshaase/uimaScala.svg?branch=master)](https://travis-ci.org/jenshaase/uimaScala) 2 | 3 | ## About 4 | 5 | uimaScala is a toolkit to develop natural language applications in 6 | Scala. It is based mainly on 7 | [uimaFIT](https://uima.apache.org/uimafit.html), which itself is based on 8 | [Apache UIMA](http://uima.apache.org/). To develop natural language 9 | processing (NLP) applications in [Apache UIMA](http://uima.apache.org/) 10 | you need to work with lots of XML files. For nearly every Java class 11 | you will need an XML file. If your Java class changes you also need to 12 | change your XML file. [uimaFIT](http://code.google.com/p/uimafit/) 13 | tries to solve this problem with reflection and nearly removes all XML 14 | files. 15 | 16 | This project started as a wrapper for 17 | [uimaFIT](https://uima.apache.org/uimafit.html). With Scala's collection 18 | library and the functional programming stuff it is a lot easier to 19 | develop NLP applications. Also a type-safe configuration system and a 20 | nicer DSL were added. 21 | 22 | This readme provides a short introduction. More documentation will be 23 | added later. 
24 | 25 | ## Setup a project 26 | 27 | To use this project add the following configuration to your `build.sbt` 28 | file. UimaScala requires Scala version `2.11`. 29 | 30 | ~~~ 31 | scalaVersion := "2.11.1" 32 | 33 | resolvers ++= Seq( 34 | "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/", 35 | "Sonatype OSS Snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/" 36 | ) 37 | 38 | libraryDependencies += "com.github.jenshaase.uimascala" %% "uimascala-core" % "0.5.0-SNAPSHOT" 39 | 40 | addCompilerPlugin("org.scalamacros" % "paradise" % "2.0.1" cross CrossVersion.full) 41 | ~~~ 42 | 43 | Next you need to tell UIMA where to find the description 44 | files. Therefore add the file `types.txt` to the folder 45 | `src/main/resources/META-INF/org.apache.uima.fit`. Add the following 46 | content: 47 | 48 | ~~~ 49 | classpath*:desc/types/**/*.xml 50 | ~~~ 51 | 52 | ## A simple annotator 53 | 54 | Annotators in UIMA will process a document. Most of the time they are 55 | using annotations from previous annotators and combine them into new 56 | annotations. The following annotator is a Tokenizer. It looks at the 57 | text and identifies single words, also called tokens. We can use 58 | Java's `BreakIterator` to tokenize the text. 
You will find the class 59 | also in the toolkit with some additional processing: 60 | 61 | ~~~ 62 | package com.github.jenshaase.test 63 | 64 | import com.github.jenshaase.uimascala.core._ 65 | import com.github.jenshaase.uimascala.core.configuration._ 66 | import java.util.Locale 67 | import org.apache.uima.jcas.JCas 68 | import java.text.BreakIterator 69 | 70 | class BreakIteratorTokenizer extends SCasAnnotator_ImplBase { 71 | 72 | object locale extends Parameter[Locale](Locale.getDefault) 73 | 74 | def process(jcas: JCas) = { 75 | val bi = BreakIterator.getWordInstance(locale.is) 76 | bi.setText(jcas.getDocumentText) 77 | 78 | var last = bi.first 79 | var cur = bi.next 80 | while (cur != BreakIterator.DONE) { 81 | if (jcas.getDocumentText().substring(last, cur).trim != "") { 82 | jcas.annotate[Token](last, cur) 83 | } 84 | 85 | last = cur 86 | cur = bi.next 87 | } 88 | } 89 | } 90 | ~~~ 91 | 92 | An annotator in uimaScala extends the `SCasAnnotator_ImplBase` 93 | class. To implement this class you need to implement the `process` 94 | method. Here we use Java's `BreakIterator` to process the 95 | document. For each token we add a new `Token` type (the next part will 96 | explain how to create such a type). You can also see the `locale` 97 | configuration parameter. It has a name (`locale`) and type (`Locale`) 98 | and a default value `Locale.getDefault`. These parameters can be changed 99 | when using this component in a UIMA pipeline. 100 | 101 | 102 | ## Adding your own type system description 103 | 104 | The goal of an annotator is to add new annotations to text. With UIMA 105 | you can create your custom annotations with XML files and then generate 106 | the Java classes. uimaScala uses a Scala macro and a custom DSL to 107 | provide these features. 
In order to create your type system you need to 108 | define an object in your Scala code: 109 | 110 | ~~~ 111 | package com.github.jenshaase.test 112 | 113 | import com.github.jenshaase.uimascala.core.description._ 114 | 115 | @TypeSystemDescription 116 | object TypeSystem { 117 | 118 | val Token = Annotation { 119 | val pos = Feature[String] 120 | val lemma = Feature[String] 121 | val stem = Feature[String] 122 | } 123 | 124 | val Sentence = Annotation {} 125 | } 126 | ~~~ 127 | 128 | After running `compile` you can see the following output on your sbt console: 129 | 130 | ~~~ 131 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 132 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Token'. 133 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 134 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Token_Type'. 135 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 136 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Sentence'. 137 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35) 138 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Sentence_Type' 139 | ~~~ 140 | 141 | Now the necessary Java files are created. You need to run `compile` 142 | again to compile the generated Java sources. 143 | 144 | ## Running a pipeline 145 | 146 | To run a pipeline uimascala uses the 147 | [scalaz-stream](https://github.com/scalaz/scalaz-stream) library. 
To 148 | run a pipeline we need to convert documents to a CAS and process the 149 | CAS with our annotators: 150 | 151 | ~~~ 152 | package com.github.jenshaase.test 153 | 154 | import com.github.jenshaase.uimascala.core._ 155 | import com.github.jenshaase.uimascala.core.stream._ 156 | import scalaz._, Scalaz._ 157 | import scalaz.stream._ 158 | import java.util.Locale 159 | 160 | object Main extends App { 161 | 162 | val p = Process("this is a text", "and another text") |> 163 | casFromText |> 164 | annotate(new BreakIteratorTokenizer().config(_.locale := Locale.US)) |> 165 | extractCas { cas => 166 | cas.select[Token].map(_.getCoveredText).toList 167 | } 168 | 169 | println(p.toList) 170 | 171 | p.toList == List( 172 | List("this", "is", "a", "text"), 173 | List("and", "another", "text") 174 | ) 175 | } 176 | 177 | ~~~ 178 | 179 | 180 | ## TODO 181 | 182 | * Add more documentation 183 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | import com.github.jenshaase.uimascala.UimaSbtPlugin._ 2 | 3 | lazy val commonSettings = Seq( 4 | organization := "com.github.jenshaase.uimascala", 5 | scalaVersion := "2.11.8", 6 | libraryDependencies ++= Seq( 7 | "org.specs2" %% "specs2-core" % "3.8.4" % "test" 8 | ) 9 | ) 10 | 11 | lazy val componentSettings = commonSettings ++ releaseSettings 12 | 13 | lazy val root = (project in file(".")). 14 | settings(releaseSettings:_*). 15 | settings( 16 | publishArtifact in Compile := false, 17 | parallelExecution in Test := false 18 | ). 
19 | aggregate( 20 | core, typeSystem, 21 | breakIteratorSegmenter, regexTokenizer, whitespaceTokenizer, stanfordSegmenter, arkTweetTokenizer, openNlpSegmenter, luceneTokenizer, 22 | stanfordPosTagger, arkTweetPosTagger, 23 | stanfordParser, 24 | stanfordNer, 25 | nGramLanguageIdentifier 26 | // Do not run these test in build environment because of too much memory consumption 27 | //mateLemmatizer, mateParser, matePosTagger 28 | ) 29 | 30 | lazy val core = (project in file("core")). 31 | settings(commonSettings: _*). 32 | settings(releaseSettings: _*). 33 | settings( 34 | libraryDependencies ++= Seq( 35 | "org.apache.uima" % "uimafit-core" % "2.2.0", 36 | "org.scala-lang.modules" %% "scala-xml" % "1.0.5", 37 | "co.fs2" %% "fs2-core" % "0.9.0-M5" 38 | ) 39 | ) 40 | 41 | lazy val typeSystem = (project in file("type-system")). 42 | settings(componentSettings: _*). 43 | settings(uimaScalaSettings: _*). 44 | dependsOn(core) 45 | 46 | // ================================================== 47 | // Segmenter 48 | 49 | lazy val breakIteratorSegmenter = (project in file("segmenter/break-iterator-segmenter")). 50 | settings(componentSettings). 51 | dependsOn(core, typeSystem) 52 | 53 | lazy val regexTokenizer = (project in file("segmenter/regex-tokenizer")). 54 | settings(componentSettings). 55 | dependsOn(core, typeSystem) 56 | 57 | lazy val whitespaceTokenizer = (project in file("segmenter/whitespace-tokenizer")). 58 | settings(componentSettings). 59 | dependsOn(core, typeSystem, regexTokenizer) 60 | 61 | lazy val stanfordSegmenter = (project in file("segmenter/stanford-segmenter")). 62 | settings(componentSettings). 63 | settings( 64 | libraryDependencies ++= Seq( 65 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" 66 | ) 67 | ). 68 | dependsOn(core, typeSystem) 69 | 70 | lazy val arkTweetTokenizer = (project in file("segmenter/ark-tweet-tokenizer")). 71 | settings(componentSettings). 
72 | settings( 73 | libraryDependencies ++= Seq( 74 | "edu.cmu.cs" % "ark-tweet-nlp" % "0.3.2" 75 | ) 76 | ). 77 | dependsOn(core, typeSystem) 78 | 79 | lazy val openNlpSegmenter = (project in file("segmenter/open-nlp-segmenter")). 80 | settings(componentSettings). 81 | settings( 82 | libraryDependencies ++= Seq( 83 | "org.apache.opennlp" % "opennlp-tools" % "1.6.0", 84 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-de-maxent" % "20120616.1" % "test", 85 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-de-maxent" % "20120616.1" % "test" 86 | ), 87 | resolvers ++= Seq( 88 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 89 | ) 90 | ). 91 | dependsOn(core, typeSystem) 92 | 93 | lazy val luceneTokenizer = (project in file("segmenter/lucene-tokenizer")). 94 | settings(componentSettings). 95 | settings( 96 | libraryDependencies ++= Seq( 97 | "org.apache.lucene" % "lucene-analyzers-common" % "6.1.0" 98 | ) 99 | ). 100 | dependsOn(core, typeSystem) 101 | 102 | // ================================================== 103 | // Lemmatizer 104 | 105 | lazy val mateLemmatizer = (project in file("lemmatizer/mate-lemmatizer")). 106 | settings(componentSettings). 107 | settings( 108 | libraryDependencies ++= Seq( 109 | "com.googlecode.mate-tools" % "anna" % "3.5", 110 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-lemmatizer-de-tiger" % "20121024.1" % "test" 111 | ), 112 | resolvers ++= Seq( 113 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 114 | ) 115 | ). 116 | dependsOn(core, typeSystem) 117 | 118 | // ================================================== 119 | // POS Tagger 120 | 121 | lazy val stanfordPosTagger = (project in file("part-of-speech-tagger/stanford-pos-tagger")). 122 | settings(componentSettings). 
123 | settings( 124 | libraryDependencies ++= Seq( 125 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 126 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german" 127 | ) 128 | ). 129 | dependsOn(core, typeSystem) 130 | 131 | lazy val matePosTagger = (project in file("part-of-speech-tagger/mate-pos-tagger")). 132 | settings(componentSettings). 133 | settings( 134 | libraryDependencies ++= Seq( 135 | "com.googlecode.mate-tools" % "anna" % "3.5", 136 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-tagger-de-tiger" % "20121024.1" % "test" 137 | ), 138 | resolvers ++= Seq( 139 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 140 | ) 141 | ). 142 | dependsOn(core, typeSystem) 143 | 144 | lazy val arkTweetPosTagger = (project in file("part-of-speech-tagger/ark-tweet-pos-tagger")). 145 | settings(componentSettings). 146 | settings( 147 | libraryDependencies ++= Seq( 148 | "edu.cmu.cs" % "ark-tweet-nlp" % "0.3.2" 149 | ) 150 | ). 151 | dependsOn(core, typeSystem) 152 | 153 | // ================================================== 154 | // Parser 155 | 156 | lazy val stanfordParser = (project in file("parser/stanford-parser")). 157 | settings(componentSettings). 158 | settings( 159 | libraryDependencies ++= Seq( 160 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 161 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german" 162 | ) 163 | ). 164 | dependsOn(core, typeSystem) 165 | 166 | lazy val mateParser = (project in file("parser/mate-parser")). 167 | settings(componentSettings). 
168 | settings( 169 | libraryDependencies ++= Seq( 170 | "com.googlecode.mate-tools" % "anna" % "3.5", 171 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-parser-de-tiger" % "20121024.1" % "test" 172 | ), 173 | resolvers ++= Seq( 174 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local" 175 | ) 176 | ). 177 | dependsOn(core, typeSystem) 178 | 179 | // ================================================== 180 | // Name Entity Recognizer 181 | 182 | lazy val stanfordNer = (project in file("name-entity-recognizer/stanford-ner")). 183 | settings(componentSettings). 184 | settings( 185 | libraryDependencies ++= Seq( 186 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0", 187 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german" 188 | ) 189 | ). 190 | dependsOn(core, typeSystem) 191 | 192 | // ================================================== 193 | // Language Identifer 194 | 195 | lazy val nGramLanguageIdentifier = (project in file("language-identification/n-gram-language-identifier")). 196 | settings(componentSettings). 197 | settings( 198 | libraryDependencies ++= Seq( 199 | "com.optimaize.languagedetector" % "language-detector" % "0.5" 200 | ) 201 | ). 
202 | dependsOn(core, typeSystem) 203 | 204 | 205 | lazy val releaseSettings = Seq( 206 | releasePublishArtifactsAction := PgpKeys.publishSigned.value, 207 | publishTo := { 208 | val nexus = "https://oss.sonatype.org/" 209 | if ( version.value.trim.endsWith( "SNAPSHOT" ) ) 210 | Some( "snapshots" at nexus + "content/repositories/snapshots" ) 211 | else 212 | Some( "releases" at nexus + "service/local/staging/deploy/maven2" ) 213 | }, 214 | publishMavenStyle := true, 215 | pomExtra := ( 216 | https://github.com/jenshaase/uimaScala 217 | 218 | git@github.com:jenshaase/uimascala.git 219 | scm:git:git@github.com:jenshaase/uimascala.git 220 | 221 | 222 | 223 | jenshaase 224 | Jens Haase 225 | 226 | 227 | 228 | 229 | Apache 2 230 | http://www.apache.org/licenses/LICENSE-2.0.txt 231 | repo 232 | 233 | 234 | ) 235 | ) 236 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/AsAnalysisEngine.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | 8 | trait AsAnalysisEngine { 9 | def asAnalysisEngine: AnalysisEngine 10 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/Converter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import PartialFunction._ 7 | import java.util.regex.Pattern 8 | import java.util.Locale 9 | import java.io.File 10 | import util.matching.Regex 11 | import com.github.jenshaase.uimascala.core.configuration._ 12 | 13 | abstract class Caster[In, Out](implicit in: Manifest[In], out: Manifest[Out]) { 14 | def convertToUimaType[X](c: X)(implicit 
m: Manifest[X]): Option[Any] = { 15 | def sameArgs = in.typeArguments.zip(m.typeArguments).forall { 16 | case (in, actual) ⇒ in >:> actual 17 | } 18 | 19 | // Special case for options 20 | val isOption = (in.erasure.toString, m.erasure.toString) match { 21 | case ("class scala.Option", "class scala.Some") ⇒ true 22 | case ("class scala.Option", "class scala.None$") ⇒ true 23 | case _ ⇒ false 24 | } 25 | 26 | if ((isOption || in >:> m) && sameArgs) Some(toUimaType(c.asInstanceOf[In])) 27 | else None 28 | } 29 | 30 | def convertFromUimaType[X](c: Any)(implicit m: Manifest[X]): Option[In] = { 31 | def sameArgs = in.typeArguments.zip(m.typeArguments).forall { 32 | case (in, actual) ⇒ in >:> actual 33 | } 34 | 35 | // Special case for options 36 | val isOption = (in.erasure.toString, m.erasure.toString) match { 37 | case ("class scala.Option", "class scala.Some") ⇒ true 38 | case ("class scala.Option", "class scala.None$") ⇒ true 39 | case _ ⇒ false 40 | } 41 | 42 | if ((isOption || in >:> m) && sameArgs) fromUimaType(c) 43 | else None 44 | } 45 | 46 | def toUimaType(in: In): Out 47 | def fromUimaType(in: Any): Option[In] 48 | } 49 | 50 | object CastFactory { 51 | 52 | import BasicCaster._ 53 | 54 | var convertSeq: Seq[Caster[_, _]] = Seq.empty 55 | 56 | register(stringCaster) 57 | register(intCaster) 58 | register(floatCaster) 59 | register(doubleCaster) 60 | register(booleanCaster) 61 | register(localeCaster) 62 | register(regexCaster) 63 | register(patternCaster) 64 | register(fileCaster) 65 | 66 | // TODO: Output error if not caster is found 67 | def toUima[A](in: A)(implicit m: Manifest[A]): Either[Failure, Option[Any]] = 68 | convertSeq.map(_.convertToUimaType(in)).find(_.isDefined) match { 69 | case Some(v) ⇒ Right(v) 70 | case None ⇒ Left(Failure("Can not find a converter for: " + m.erasure.toString)) 71 | } 72 | 73 | // TODO: Output error if not caster is found 74 | def fromUima[A](in: Any)(implicit m: Manifest[A]): Either[Failure, Option[A]] = { 75 | 
convertSeq.map(c ⇒ c.convertFromUimaType[A](in)).find(_.isDefined) match { 76 | case Some(v) ⇒ Right(v.map(_.asInstanceOf[A])) 77 | case None ⇒ Left(Failure("Can not find a converter for: " + m.erasure.toString)) 78 | } 79 | } 80 | 81 | def register[In, Out](c: Caster[In, Out])(implicit ml: Manifest[List[In]], m: Manifest[In], mo: Manifest[Out]) = 82 | convertSeq ++= Seq(c, buildListCaster(c), buildSeqCaster(c), buildOptionCaster(c)) 83 | 84 | protected def buildListCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[List[In]], m: Manifest[In], mo: Manifest[Out]) = 85 | new Caster[List[In], Array[Out]] { 86 | def toUimaType(in: List[In]) = in.map(c.toUimaType).toArray 87 | def fromUimaType(in: Any) = in match { 88 | case arr: Array[_] ⇒ sequence(arr.toList.map(c.fromUimaType)) 89 | case _ ⇒ None 90 | } 91 | } 92 | 93 | protected def buildSeqCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[Seq[In]], m: Manifest[In], mo: Manifest[Out]) = 94 | new Caster[Seq[In], Array[Out]] { 95 | def toUimaType(in: Seq[In]) = in.map(c.toUimaType).toArray 96 | def fromUimaType(in: Any) = in match { 97 | case arr: Array[_] ⇒ sequence(arr.toSeq.map(c.fromUimaType)) 98 | case _ ⇒ None 99 | } 100 | } 101 | 102 | protected def buildOptionCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[Option[In]], m: Manifest[In], mo: Manifest[Out]) = 103 | new Caster[Option[In], Out] { 104 | def toUimaType(in: Option[In]) = in.map(c.toUimaType).getOrElse(null.asInstanceOf[Out]) 105 | def fromUimaType(in: Any) = 106 | if (in != null) c.fromUimaType(in.asInstanceOf[In]) match { 107 | case Some(v) ⇒ Some(Some(v)) 108 | case None ⇒ None 109 | } 110 | else Some(None) 111 | } 112 | 113 | def sequence[A](l: List[Option[A]]) = 114 | if (l.contains(None)) None else Some(l.flatten) 115 | 116 | def sequence[A](l: Seq[Option[A]]) = 117 | if (l.contains(None)) None else Some(l.flatten) 118 | } 119 | 120 | object BasicCaster { 121 | 122 | import java.util.Locale 123 | import 
java.util.regex.Pattern 124 | import scala.util.matching.Regex 125 | 126 | val stringCaster = new Caster[String, String] { 127 | def toUimaType(in: String) = in 128 | def fromUimaType(in: Any) = in match { 129 | case s: String ⇒ Some(s) 130 | case _ ⇒ None 131 | } 132 | } 133 | 134 | val intCaster = new Caster[Int, Int] { 135 | def toUimaType(in: Int): Int = in 136 | def fromUimaType(in: Any) = in match { 137 | case i: Int ⇒ Some(i) 138 | case _ ⇒ None 139 | } 140 | } 141 | 142 | val floatCaster = new Caster[Float, Float] { 143 | def toUimaType(in: Float): Float = in 144 | def fromUimaType(in: Any) = in match { 145 | case f: Float ⇒ Some(f) 146 | case _ ⇒ None 147 | } 148 | } 149 | 150 | val doubleCaster = new Caster[Double, Float] { 151 | def toUimaType(in: Double): Float = in.toFloat 152 | def fromUimaType(in: Any) = in match { 153 | case f: Float ⇒ Some(f.toDouble) 154 | case d: Double ⇒ Some(d) 155 | case _ ⇒ None 156 | } 157 | } 158 | 159 | val booleanCaster = new Caster[Boolean, Boolean] { 160 | def toUimaType(in: Boolean): Boolean = in 161 | def fromUimaType(in: Any) = in match { 162 | case b: Boolean ⇒ Some(b) 163 | case _ ⇒ None 164 | } 165 | } 166 | 167 | val localeCaster = new Caster[Locale, String] { 168 | def toUimaType(in: Locale): String = in.getLanguage 169 | def fromUimaType(in: Any) = in match { 170 | case l: Locale ⇒ Some(l) 171 | case s: String ⇒ Some(new Locale(s)) 172 | case _ ⇒ None 173 | } 174 | } 175 | 176 | val regexCaster = new Caster[Regex, String] { 177 | def toUimaType(in: Regex): String = in.pattern.pattern 178 | def fromUimaType(in: Any) = in match { 179 | case l: Regex ⇒ Some(l) 180 | case s: String ⇒ Some(s.r) 181 | case _ ⇒ None 182 | } 183 | } 184 | 185 | val patternCaster = new Caster[Pattern, String] { 186 | def toUimaType(in: Pattern): String = in.pattern 187 | def fromUimaType(in: Any) = in match { 188 | case l: Pattern ⇒ Some(l) 189 | case s: String ⇒ Some(Pattern.compile(s)) 190 | case _ ⇒ None 191 | } 192 | } 193 | 194 | 
val fileCaster = new Caster[File, String] { 195 | def toUimaType(in: File): String = in.getAbsolutePath 196 | def fromUimaType(in: Any) = in match { 197 | case f: File ⇒ Some(f) 198 | case s: String ⇒ Some(new File(s)) 199 | case _ ⇒ None 200 | } 201 | } 202 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasAnnotator_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import configuration.Parameter 7 | import java.io.File 8 | import java.lang.reflect.Method 9 | import java.net.URL 10 | import com.github.jenshaase.uimascala.core.configuration._ 11 | import com.github.jenshaase.uimascala.core.wrapper._ 12 | import org.apache.uima.analysis_component.AnalysisComponent 13 | import org.apache.uima.analysis_component.JCasAnnotator_ImplBase 14 | import org.apache.uima.analysis_engine.AnalysisEngineDescription 15 | import org.apache.uima.jcas.JCas 16 | import org.apache.uima.jcas.tcas.Annotation 17 | import org.apache.uima.resource.ResourceInitializationException 18 | import org.apache.uima.resource.ResourceSpecifier 19 | import org.apache.uima.UimaContext 20 | import org.apache.uima.UIMAFramework 21 | import org.apache.uima.fit.factory.AnalysisEngineFactory 22 | import org.apache.uima.fit.factory.ExternalResourceFactory 23 | import scala.collection.mutable.ListBuffer 24 | import xml.Node 25 | 26 | /** 27 | * Scala Annotator. 
28 | * 29 | * Loads the parameter when initialized 30 | * 31 | * @author Jens Haase 32 | */ 33 | abstract class SCasAnnotator_ImplBase extends JCasAnnotator_ImplBase 34 | with Configurable 35 | with ConfigurationInitialization 36 | with ResourceInitialization 37 | with AsAnalysisEngine { 38 | 39 | override def initialize(context: UimaContext) = { 40 | super.initialize(context) 41 | 42 | this.loadParameter(context) 43 | this.loadResources(context) 44 | } 45 | 46 | /** 47 | * Creates a analysis engine from an Annotator instance 48 | */ 49 | def asAnalysisEngine = { 50 | val aed = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*) 51 | 52 | aed.setExternalResourceDependencies(resources.map(r ⇒ 53 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 54 | resources.foreach { r ⇒ 55 | r.createBinding(aed) 56 | } 57 | 58 | AnalysisEngineFactory.createAggregate(aed) 59 | } 60 | 61 | /** 62 | * Adds an annotation to the index 63 | * if the annotation is not empty 64 | */ 65 | def addIfNotEmpty[T <: Annotation](a: T): T = if (!a.isEmpty) { 66 | add(a) 67 | } else { 68 | a 69 | } 70 | 71 | /** 72 | * Adds a annotation to the index 73 | */ 74 | def add[T <: Annotation](a: T): T = { 75 | a.addToIndexes 76 | a 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasCollectionReader_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.cas.CAS 8 | import org.apache.uima.collection.CollectionReader_ImplBase 9 | import org.apache.uima.jcas.JCas 10 | import org.apache.uima.UimaContext 11 | import 
org.apache.uima.fit.factory.CollectionReaderFactory 12 | import org.apache.uima.fit.factory.ExternalResourceFactory 13 | 14 | abstract class SCasCollectionReader_ImplBase extends CollectionReader_ImplBase 15 | with Configurable 16 | with ConfigurationInitialization 17 | with ResourceInitialization { 18 | 19 | override def initialize = { 20 | super.initialize 21 | 22 | loadParameter(getUimaContext) 23 | loadResources(getUimaContext) 24 | initialize(getUimaContext) 25 | } 26 | 27 | def initialize(context: UimaContext) = {} 28 | 29 | def asCollectionReader = { 30 | val aed = CollectionReaderFactory.createDescription(this.niceClass, parameterKeyValues: _*) 31 | 32 | aed.setExternalResourceDependencies(resources.map(r ⇒ 33 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 34 | resources.foreach { r ⇒ 35 | r.createBinding(aed) 36 | } 37 | 38 | CollectionReaderFactory.createCollectionReader(aed) 39 | } 40 | 41 | def getNext(cas: CAS) = { 42 | getNext(cas.getJCas()) 43 | } 44 | 45 | def getNext(cas: JCas) 46 | 47 | def close() = {} 48 | } 49 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasConsumer_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import configuration._ 7 | import org.apache.uima.analysis_component.JCasAnnotator_ImplBase 8 | import org.apache.uima.UimaContext 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.factory.ExternalResourceFactory 11 | 12 | abstract class SCasConsumer_ImplBase extends JCasAnnotator_ImplBase 13 | with Configurable 14 | with ConfigurationInitialization 15 | with ResourceInitialization { 16 | 17 | override def initialize(context: UimaContext) = { 18 | super.initialize(context) 
19 | 20 | this.loadParameter(context) 21 | this.loadResources(context) 22 | } 23 | 24 | def asAnalysisEngine = { 25 | val aed = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*) 26 | 27 | aed.setExternalResourceDependencies(resources.map(r ⇒ 28 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 29 | resources.foreach { r ⇒ 30 | r.createBinding(aed) 31 | } 32 | 33 | AnalysisEngineFactory.createAggregate(aed) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasFlowController_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import configuration._ 7 | import org.apache.uima.flow.FlowControllerContext 8 | import org.apache.uima.flow.JCasFlowController_ImplBase 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.factory.FlowControllerFactory 11 | 12 | abstract class SCasFlowController_ImplBase extends JCasFlowController_ImplBase 13 | with Configurable 14 | with ConfigurationInitialization 15 | with ResourceInitialization { 16 | 17 | override def initialize(context: FlowControllerContext) = { 18 | super.initialize(context) 19 | 20 | this.loadParameter(context) 21 | } 22 | 23 | def asAnalysisEngine = { 24 | FlowControllerFactory.createFlowControllerDescription(this.niceClass, this.parameterKeyValues: _*) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SCasMultiplier_ImplBase.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 
| 6 | import configuration._ 7 | import org.apache.uima.analysis_component.JCasMultiplier_ImplBase 8 | import org.apache.uima.UimaContext 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.factory.ExternalResourceFactory 11 | 12 | abstract class SCasMultiplier_ImplBase extends JCasMultiplier_ImplBase 13 | with Configurable 14 | with ConfigurationInitialization 15 | with ResourceInitialization { 16 | 17 | override def initialize(context: UimaContext) = { 18 | super.initialize(context) 19 | 20 | this.loadParameter(context) 21 | this.loadResources(context) 22 | } 23 | 24 | def asAnalysisEngine = { 25 | val aed = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*) 26 | 27 | aed.setExternalResourceDependencies(resources.map(r ⇒ 28 | ExternalResourceFactory.createExternalResourceDependency(r.name, r.className, !r.mandatory_?, r.description)).toArray) 29 | resources.foreach { r ⇒ 30 | r.createBinding(aed) 31 | } 32 | 33 | AnalysisEngineFactory.createAggregate(aed) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/SimplePipeline.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.apache.uima.collection.CollectionReader 8 | 9 | @deprecated("Use org.apache.uima.fit.pipeline.SimplePipeline or uimascala-stream", "0.5.0") 10 | class SimplePipeline(reader: CollectionReader) { 11 | 12 | private var descs: Seq[AnalysisEngine] = Seq.empty 13 | 14 | def ~>(in: AsAnalysisEngine): SimplePipeline = 15 | ~>(in.asAnalysisEngine) 16 | 17 | def ~>(in: AnalysisEngine): SimplePipeline = { 18 | descs = descs :+ in 19 | this 20 | } 21 | 22 | def run() = { 23 | 
org.apache.uima.fit.pipeline.SimplePipeline.runPipeline(reader, descs: _*) 24 | } 25 | } 26 | 27 | object SimplePipeline { 28 | 29 | def apply(reader: CollectionReader) = 30 | new SimplePipeline(reader) 31 | 32 | def apply(reader: SCasCollectionReader_ImplBase) = 33 | new SimplePipeline(reader.asCollectionReader) 34 | } 35 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/XmlDescriptor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import xml.Node 7 | 8 | trait XmlDescriptor { 9 | def xmlType: String 10 | def toXml: Node 11 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/ConfigurationInitialization.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import scala.collection.mutable.ListBuffer 7 | import org.apache.uima.analysis_component.AnalysisComponent 8 | import java.lang.reflect.Method 9 | import org.apache.uima.UimaContext 10 | import org.apache.uima.resource.ResourceInitializationException 11 | 12 | /** 13 | * Configuration Initalization trait 14 | * 15 | * This can be used whenever configuration parameters must be initalized 16 | */ 17 | trait ConfigurationInitialization { this: Configurable ⇒ 18 | 19 | private var parameterList: List[ParameterHolder] = Nil 20 | 21 | private val tArray: ListBuffer[ParameterHolder] = new ListBuffer[ParameterHolder] 22 | val methods = this.getClass.getMethods 23 | introspect(this, methods) { 24 | case (v, mf) ⇒ tArray += ParameterHolder(mf.name, v, mf) 25 | } 26 | parameterList = tArray.toList 27 | 28 | /** 29 | * Uses reflection to 
find the parameters in the class 30 | */ 31 | protected def introspect[B, V](comp: Configurable, methods: Array[Method])(f: (Method, Parameter[_]) ⇒ Any): Unit = { 32 | val potentialParams = methods.toList.filter(isParameter) 33 | 34 | val map: Map[String, List[Method]] = potentialParams.foldLeft[Map[String, List[Method]]](Map()) { 35 | case (map, method) ⇒ 36 | val name = method.getName 37 | map + (name -> (method :: map.getOrElse(name, Nil))) 38 | } 39 | 40 | val realMeth = map.values.map(_.sortWith { 41 | case (a, b) ⇒ !a.getReturnType().isAssignableFrom(b.getReturnType) 42 | }).map(_.head) 43 | 44 | for (v ← realMeth) { 45 | v.invoke(comp) match { 46 | case mf: Parameter[_] ⇒ 47 | mf.setName_!(v.getName) 48 | f(v, mf) 49 | case _ ⇒ 50 | } 51 | } 52 | } 53 | 54 | /** 55 | * Returns all parameters for the class 56 | */ 57 | def parameters = parameterList.map(_.parameter(this)) 58 | 59 | /** 60 | * Uses the uima context to set the parameter 61 | */ 62 | protected def loadParameter(context: UimaContext) = { 63 | parameters.foreach { f ⇒ 64 | val value = context.getConfigParameterValue(f.name) 65 | 66 | if (f.mandatory_? 
&& value == null) { 67 | throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, Array(f.name)) 68 | } 69 | 70 | if (value != null) { 71 | f.setFromUimaType(value) match { 72 | case Right(o) ⇒ () 73 | case Left(l) ⇒ throw new ResourceInitializationException(new ClassCastException(l.msg)) 74 | } 75 | } 76 | } 77 | } 78 | 79 | /** 80 | * Checks if a method is a subclass of Parameter 81 | */ 82 | def isParameter(m: Method) = 83 | !m.isSynthetic && classOf[Parameter[_]].isAssignableFrom(m.getReturnType) 84 | 85 | class NiceObject[T <: AnyRef](x: T) { 86 | def niceClass: Class[_ <: T] = x.getClass.asInstanceOf[Class[T]] 87 | } 88 | implicit def toNiceObject[T <: AnyRef](x: T) = new NiceObject(x) 89 | 90 | def parameterKeyValues: Array[Object] = parameters.flatMap { f ⇒ 91 | Array(f.name, f.toUimaType match { 92 | case Right(o) ⇒ o 93 | case Left(l) ⇒ throw new ResourceInitializationException(new ClassCastException(l.msg)) 94 | }) 95 | }.toArray 96 | 97 | case class ParameterHolder(name: String, method: Method, metaParameter: Parameter[_]) { 98 | def parameter(inst: Configurable): Parameter[_] = method.invoke(inst).asInstanceOf[Parameter[_]] 99 | } 100 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/Parameter.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.apache.uima.analysis_component.AnalysisComponent 7 | 8 | trait Configurable {} 9 | 10 | class ConfigurationBuilder[T <: Configurable](conf: T) { 11 | def config(mutators: T ⇒ Unit*) = { 12 | for (f ← mutators) f(conf) 13 | conf 14 | } 15 | } 16 | 17 | /** 18 | * Base Parameter trait 19 | */ 20 | trait BaseParameter { 21 | 22 | // The parameter name 23 | private var fieldName: String = _ 24 | private var 
set = false 25 | 26 | protected def set_?(b: Boolean) = set = b 27 | 28 | def set_? : Boolean = set 29 | 30 | /** 31 | * Returns the parameter name 32 | */ 33 | def name: String = fieldName 34 | 35 | /** 36 | * Returns the parameter description 37 | * Default: None 38 | */ 39 | def description: Option[String] = None 40 | 41 | /** 42 | * Is this parameter mandatory? 43 | */ 44 | def mandatory_? = true 45 | 46 | /** 47 | * If the parameter can take multiple values (collections) 48 | */ 49 | def multiValued_? = false 50 | 51 | /** 52 | * Default is string. 53 | * Also possible: Integer, Float, Boolean 54 | */ 55 | def uimaType: String 56 | 57 | /** 58 | * Sets the parameter name 59 | */ 60 | private[configuration] final def setName_!(newName: String): String = { 61 | fieldName = newName 62 | fieldName 63 | } 64 | } 65 | 66 | case class Failure(msg: String, exception: Option[Exception] = None) 67 | 68 | /** 69 | * A typed parameter 70 | */ 71 | abstract class Parameter[ThisType](val defaultValue: ThisType)(implicit mf: Manifest[ThisType]) 72 | extends BaseParameter { 73 | 74 | import com.github.jenshaase.uimascala.core.CastFactory._ 75 | 76 | private var data: Option[ThisType] = None 77 | 78 | /** 79 | * Sets a new value to this parameter 80 | */ 81 | def :=(in: ThisType) = 82 | data = Some(in) 83 | 84 | /** 85 | * Set the parameter value by an object 86 | */ 87 | def setFromUimaType(in: Any): Either[Failure, ThisType] = fromUima[ThisType](in) match { 88 | case Right(Some(d)) if d.isInstanceOf[ThisType] ⇒ { 89 | :=(d.asInstanceOf[ThisType]); Right(d) 90 | } 91 | case Right(_) ⇒ Left(Failure("Value could not be casted: " + in.toString)) 92 | case Left(l) ⇒ Left(l) 93 | } 94 | 95 | /** 96 | * Coverts this parameter value to a uima type 97 | */ 98 | def toUimaType: Either[Failure, Object] = toUima(value) match { 99 | case Right(Some(s)) ⇒ Right(s.asInstanceOf[Object]) 100 | case Right(None) ⇒ Left(Failure("Value could not be casted: " + value)) 101 | case Left(l) ⇒ 
Left(l) 102 | } 103 | 104 | /** 105 | * Checks if the parameter is mutlivalued 106 | */ 107 | override def multiValued_? = mf.erasure.toString match { 108 | case "class scala.collection.immutable.List" ⇒ true 109 | case "interface scala.collection.Seq" ⇒ true 110 | case s: String if (s.startsWith("class [L")) ⇒ true 111 | case _ ⇒ false 112 | } 113 | 114 | def value: ThisType = data getOrElse defaultValue 115 | 116 | def is = value 117 | 118 | def get = value 119 | 120 | def uimaType = 121 | if (multiValued_?) 122 | _uimaType(mf.typeArguments.head.erasure.toString) 123 | else 124 | _uimaType(mf.erasure.toString) 125 | 126 | def _uimaType(s: String) = s match { 127 | case "int" | "class java.lang.Integer" ⇒ "Integer" 128 | case "float" ⇒ "Float" 129 | case "boolean" ⇒ "Boolean" 130 | case _ ⇒ "String" 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/Resource.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import java.net.URL 7 | import java.io.File 8 | import org.apache.uima.resource.ResourceSpecifier 9 | import org.apache.uima.fit.factory.ExternalResourceFactory 10 | import org.apache.uima.resource.SharedResourceObject 11 | 12 | trait BaseResource { 13 | 14 | private var resourceKey: String = _ 15 | 16 | def name: String = resourceKey 17 | 18 | def description: String = "" 19 | 20 | def interfaceName: String 21 | 22 | def mandatory_? 
= true 23 | 24 | private[configuration] final def setName_!(newName: String): String = { 25 | resourceKey = newName 26 | resourceKey 27 | } 28 | } 29 | 30 | trait TypedResource[ThisType, ParamType] extends BaseResource { 31 | 32 | private var boundResource: Option[ThisType] = None 33 | private[configuration] var parameters: Option[Map[ParamType, ParamType]] = None 34 | 35 | def params = parameters getOrElse defaultParameter 36 | 37 | def defaultParameter: Map[ParamType, ParamType] 38 | 39 | def parameterList: Seq[ParamType] = 40 | params.toSeq.flatMap(p ⇒ List(p._1, p._2)) 41 | 42 | def setFromUima(a: Any) = a match { 43 | case x: ThisType ⇒ Right(bind(x)) 44 | case Some(x: ThisType) ⇒ Right(bind(x)) 45 | case _ ⇒ Left(Failure("Can not bind resource from uima context: " + name)) 46 | } 47 | 48 | def bind(newResource: ThisType) = { 49 | boundResource = Some(newResource) 50 | boundResource 51 | } 52 | 53 | def resource = boundResource get 54 | 55 | def createBinding(aed: ResourceSpecifier) 56 | 57 | def className: Class[_ <: ThisType] 58 | 59 | def interfaceName = className.getName 60 | } 61 | 62 | case class SharedBinding[T](url: String, params: Map[Object, Object] = Map.empty) 63 | object SharedBinding { 64 | 65 | def apply[T](url: URL) = 66 | new SharedBinding[T](url.toString, Map.empty) 67 | 68 | def apply[T](url: URL, params: Map[Object, Object]) = 69 | new SharedBinding[T](url.toString, params) 70 | 71 | def apply[T](url: File) = 72 | new SharedBinding[T](url.toURI().toURL().toString, Map.empty) 73 | 74 | def apply[T](url: File, params: Map[Object, Object]) = 75 | new SharedBinding[T](url.toURI().toURL().toString, params) 76 | } 77 | 78 | abstract class SharedResource[ThisType <: SharedResourceObject]( 79 | val defaultURL: String, 80 | val defaultParams: Map[Object, Object] = Map.empty)(implicit m: Manifest[ThisType]) 81 | extends TypedResource[ThisType, Object] { 82 | 83 | private var dataUrl: Option[String] = None 84 | private var clazz: Option[Class[_ <: 
ThisType]] = None 85 | 86 | def this(defaultUrl: URL, defaultParams: Map[Object, Object])(implicit m: Manifest[ThisType]) = 87 | this(defaultUrl.toString, defaultParams) 88 | 89 | def this(defaultUrl: File, defaultParams: Map[Object, Object])(implicit m: Manifest[ThisType]) = 90 | this(defaultUrl.toURI().toURL(), defaultParams) 91 | 92 | def :=[T <: ThisType](bind: SharedBinding[T])(implicit mf: Manifest[T]) = { 93 | clazz = Some(mf.erasure.asInstanceOf[Class[T]]) 94 | dataUrl = Some(bind.url) 95 | parameters = Some(bind.params) 96 | } 97 | 98 | def url = dataUrl getOrElse defaultURL 99 | 100 | def defaultParameter = defaultParams 101 | 102 | def defaultClass = m.erasure.asInstanceOf[Class[ThisType]] 103 | 104 | def className: Class[_ <: ThisType] = clazz getOrElse defaultClass 105 | 106 | // format: OFF 107 | def createBinding(aed: ResourceSpecifier) = { 108 | ExternalResourceFactory.bindResource( 109 | aed, 110 | name, 111 | className, 112 | url, 113 | parameterList:_*) 114 | } 115 | } 116 | 117 | case class Binding[T](params: Map[String, String] = Map.empty) 118 | 119 | abstract class Resource[ThisType <: org.apache.uima.resource.Resource]( 120 | val defaultParams: Map[String, String] = Map.empty)(implicit m: Manifest[ThisType]) 121 | extends TypedResource[ThisType, String] { 122 | 123 | private var clazz: Option[Class[_ <: ThisType]] = None 124 | 125 | def defaultClass = m.erasure.asInstanceOf[Class[ThisType]] 126 | 127 | def :=[T <: ThisType](bind: Binding[T])(implicit mf: Manifest[T]) = { 128 | clazz = Some(mf.erasure.asInstanceOf[Class[T]]) 129 | parameters = Some(bind.params) 130 | } 131 | 132 | def defaultParameter = defaultParams 133 | 134 | def className: Class[_ <: ThisType] = clazz getOrElse defaultClass 135 | 136 | // format: OFF 137 | def createBinding(aed: ResourceSpecifier) = { 138 | ExternalResourceFactory.bindResource( 139 | aed, 140 | name, 141 | className, 142 | parameterList:_*) 143 | } 144 | } 145 | 
-------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/ResourceInitialization.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import java.lang.reflect.Method 7 | import org.apache.uima.resource.ResourceAccessException 8 | import org.apache.uima.resource.ResourceInitializationException 9 | import org.apache.uima.UimaContext 10 | import org.apache.uima.fit.descriptor.ExternalResourceLocator 11 | import scala.collection.mutable.ListBuffer 12 | 13 | trait ResourceInitialization { this: Configurable ⇒ 14 | 15 | private var resourceList: List[ResourceHolder] = Nil 16 | 17 | private val resTempArray: ListBuffer[ResourceHolder] = new ListBuffer[ResourceHolder] 18 | val resMethods = this.getClass.getMethods 19 | introspectResources(this, resMethods) { 20 | case (v, mf) ⇒ { 21 | resTempArray += ResourceHolder(mf.name, v, mf) 22 | } 23 | } 24 | resourceList = resTempArray.toList 25 | 26 | protected def introspectResources(comp: Configurable, methods: Array[Method])(f: (Method, TypedResource[_, _]) ⇒ Any): Unit = { 27 | val potentialResources = methods.toList.filter(isResource) 28 | 29 | val map: Map[String, List[Method]] = potentialResources.foldLeft[Map[String, List[Method]]](Map()) { 30 | case (map, method) ⇒ 31 | val name = method.getName 32 | map + (name -> (method :: map.getOrElse(name, Nil))) 33 | } 34 | 35 | val realMeth = map.values.map(_.sortWith { 36 | case (a, b) ⇒ !a.getReturnType().isAssignableFrom(b.getReturnType) 37 | }).map(_.head) 38 | 39 | for (v ← realMeth) { 40 | v.invoke(comp) match { 41 | case mf: TypedResource[_, _] ⇒ 42 | mf.setName_!(v.getName) 43 | f(v, mf) 44 | case _ ⇒ 45 | } 46 | } 47 | } 48 | 49 | def resources = resourceList.map(_.resource(this)) 50 | 51 | def loadResources(context: 
UimaContext) = { 52 | resources.foreach { r ⇒ 53 | var value: Object = null; 54 | try { 55 | value = context.getResourceObject(r.name) 56 | } catch { 57 | case e: Exception ⇒ throw new ResourceInitializationException(e) 58 | } 59 | 60 | if (value.isInstanceOf[ExternalResourceLocator]) { 61 | value = value.asInstanceOf[ExternalResourceLocator].getResource() 62 | } 63 | 64 | if (r.mandatory_? && value == null) { 65 | throw new ResourceInitializationException(new IllegalStateException("Mandatory resource '%s' is not set".format(r.name))) 66 | } 67 | 68 | if (value != null) { 69 | r.setFromUima(value) match { 70 | case Left(f: Failure) ⇒ throw f.exception.map(new ResourceInitializationException(_)).getOrElse(new ResourceInitializationException()) 71 | case _ ⇒ 72 | } 73 | } 74 | } 75 | } 76 | 77 | def isResource(m: Method) = 78 | !m.isSynthetic && classOf[TypedResource[_, _]].isAssignableFrom(m.getReturnType) 79 | 80 | case class ResourceHolder(name: String, method: Method, metaParameter: TypedResource[_, _]) { 81 | def resource(inst: Configurable): TypedResource[_, _] = method.invoke(inst).asInstanceOf[TypedResource[_, _]] 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala 5 | 6 | import org.apache.uima.jcas.tcas.Annotation 7 | import org.apache.uima.jcas.JCas 8 | import org.apache.uima.jcas.cas.FSArray 9 | import com.github.jenshaase.uimascala.core.wrapper._ 10 | import com.github.jenshaase.uimascala.core.configuration._ 11 | import org.apache.uima.collection.CollectionReader 12 | 13 | package object core { 14 | 15 | implicit def toScalaAnnotation(a: Annotation) = new AnnotationWrapper(a) 16 | 17 | implicit def toScalaCas(jcas: JCas) = new JCasWrapper(jcas) 18 | 19 | implicit 
def configBuilder[T <: Configurable](conf: T) = new ConfigurationBuilder(conf) 20 | 21 | @deprecated("See com.github.jenshaase.uimascala.core.SimplePipeline", "0.5.0") 22 | implicit def collectionReaderToPipeline(reader: SCasCollectionReader_ImplBase) = new SimplePipeline(reader.asCollectionReader) 23 | 24 | @deprecated("See com.github.jenshaase.uimascala.core.SimplePipeline", "0.5.0") 25 | implicit def collectionReaderToPipeline(reader: CollectionReader) = new SimplePipeline(reader) 26 | } 27 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/stream/annotators.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.core.stream 2 | 3 | import scala.util.matching.Regex 4 | import org.apache.uima.jcas.tcas.Annotation 5 | import scala.reflect.ClassTag 6 | import com.github.jenshaase.uimascala.core._ 7 | import org.apache.uima.jcas.JCas 8 | 9 | trait annotators { 10 | 11 | @deprecated("Use com.github.jenshaase.uimascala.segmenter.RegexTokenizer") 12 | def regexTokenizer[F[_], T <: Annotation](pattern: Regex, allowEmptyToken: Boolean = true)(implicit cf: ClassTag[T]) = 13 | annotate[F] { cas: JCas => 14 | val txt = cas.getDocumentText 15 | 16 | val mostlyAll = pattern.findAllMatchIn(txt).foldLeft(0) { 17 | case (last, m) if ((allowEmptyToken && m.start >= last) || (!allowEmptyToken && m.start > last)) ⇒ 18 | cas.annotate[T](last, m.start) 19 | m.end 20 | case (_, m) => 21 | m.end 22 | } 23 | 24 | if (mostlyAll < txt.length) 25 | cas.annotate[T](mostlyAll, txt.length) 26 | } 27 | 28 | @deprecated("Use com.github.jenshaase.uimascala.segmenter.WhitespaceTokenizer") 29 | def whitespaceTokenizer[F[_], T <: Annotation](allowEmptyToken: Boolean = true)(implicit cf: ClassTag[T]) = 30 | regexTokenizer[F, Annotation]("\\s+".r, allowEmptyToken) 31 | 32 | def removeStopwords[F[_], T <: Annotation](isStopword: String => 
Boolean)(implicit cf: ClassTag[T]) = 33 | annotate[F] { cas: JCas => 34 | cas.select[T]. 35 | filter { token => isStopword(token.getCoveredText) }. 36 | foreach { token => token.removeFromIndexes() } 37 | } 38 | 39 | def annotateStopwords[F[_], Token <: Annotation, Stopword <: Annotation](isStopword: String => Boolean) 40 | (implicit ct: ClassTag[Token], cs: ClassTag[Stopword]) = 41 | annotate[F] { cas: JCas => 42 | cas.select[Token].foreach { token => 43 | if (isStopword(token.getCoveredText)) { 44 | cas.annotate[Stopword](token.getBegin, token.getEnd) 45 | } 46 | } 47 | } 48 | } 49 | 50 | object annotators extends annotators 51 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/stream/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.apache.uima.jcas.tcas.Annotation 7 | import org.apache.uima.analysis_engine.AnalysisEngine; 8 | import org.apache.uima.analysis_engine.AnalysisEngineDescription; 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine 10 | import org.apache.uima.jcas.JCas 11 | import org.apache.uima.util.CasCreationUtils 12 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 13 | import fs2._ 14 | 15 | package object stream { 16 | 17 | type AnnotatorProcess[F[_]] = Pipe[F, JCas, JCas] 18 | 19 | def annotate[F[_]](f: (JCas => Any)): AnnotatorProcess[F] = 20 | _.map { cas => 21 | f(cas) 22 | cas 23 | } 24 | 25 | def annotate[F[_]](a: AnalysisEngine): AnnotatorProcess[F] = 26 | _.map { cas => 27 | a.process(cas) 28 | cas 29 | } 30 | 31 | def annotate[F[_]](a: AnalysisEngineDescription): AnnotatorProcess[F] = 32 | annotate(createEngine(a)) 33 | 34 | def annotate[F[_]](a: AsAnalysisEngine): AnnotatorProcess[F] = 35 | annotate(a.asAnalysisEngine) 36 | 37 | def initCas[F[_], 
I](f: ((I, JCas) => Any)): Pipe[F, I, JCas] = 38 | _.map { something => 39 | val cas = CasCreationUtils.createCas( 40 | TypeSystemDescriptionFactory.createTypeSystemDescription, null, null).getJCas 41 | 42 | f(something, cas) 43 | cas 44 | } 45 | 46 | def casFromText[F[_]] = initCas[F, String] { (str ,cas) => 47 | cas.setDocumentText(str) 48 | } 49 | 50 | def extractCas[F[_], I](f: JCas => I): Pipe[F, JCas, I] = 51 | _.map(f) 52 | } 53 | -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/wrapper/AnnotationWrapper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.apache.uima.jcas.tcas.Annotation 7 | 8 | /** 9 | * A Uima Annotation wrapper for implicity. 10 | * @author Jens Haase 11 | */ 12 | class AnnotationWrapper(a: Annotation) { 13 | 14 | /** 15 | * Remove whitespace before and after the annotation 16 | * by increasing/decreasing the begin/end value 17 | */ 18 | def trim: Annotation = { 19 | var begin = a.getBegin 20 | var end = a.getEnd - 1 21 | 22 | val data = a.getCAS.getDocumentText 23 | 24 | while (begin < (data.length - 1) && trimChar(data.charAt(begin))) 25 | begin += 1 26 | 27 | while (end > 0 && trimChar(data.charAt(end))) 28 | end -= 1 29 | 30 | end += 1 31 | a.setBegin(begin) 32 | a.setEnd(end) 33 | 34 | a 35 | } 36 | 37 | /** 38 | * Add annotation to index if the covering text 39 | * of the annotation is not empty 40 | */ 41 | def addToIndexIfNotEmpty = if (!isEmpty) a.addToIndexes 42 | 43 | /** 44 | * Checks if the covering text of the annotation 45 | * is empty 46 | */ 47 | def isEmpty = a.getBegin >= a.getEnd 48 | 49 | protected def trimChar(c: Char): Boolean = c match { 50 | case '\n' ⇒ true 51 | case '\r' ⇒ true 52 | case '\t' ⇒ true 53 | case '\u200E' ⇒ true 54 | case '\u200F' ⇒ true 55 | case 
'\u2028' ⇒ true 56 | case '\u2029' ⇒ true 57 | case _ ⇒ Character.isWhitespace(c) 58 | } 59 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/github/jenshaase/uimascala/core/wrapper/JCasWrapper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.cas.text.AnnotationFS 8 | import org.apache.uima.jcas.tcas.Annotation 9 | import org.apache.uima.jcas.cas.TOP 10 | import scala.collection.JavaConversions._ 11 | import org.apache.uima.cas.FeatureStructure 12 | import scala.collection.JavaConversions._ 13 | import collection.mutable.Buffer 14 | import org.apache.uima.fit.util.{ CasUtil, JCasUtil } 15 | import scala.reflect.ClassTag 16 | 17 | /** 18 | * A JCas wrapper for implicity 19 | * @author Jens Haase 20 | */ 21 | class JCasWrapper(cas: JCas) { 22 | 23 | def create[T <: TOP](f: (T => Unit)*)(implicit cf: ClassTag[T]): T = { 24 | val constructor = cf.runtimeClass.getConstructor(classOf[JCas]) 25 | val obj = constructor.newInstance(cas).asInstanceOf[T] 26 | f.foreach { f => f(obj) } 27 | obj.addToIndexes() 28 | obj 29 | } 30 | 31 | def annotate[T <: Annotation](begin: Int, end: Int)(implicit cf: ClassTag[T]): T = { 32 | val constructor = cf.runtimeClass.getConstructor(classOf[JCas]) 33 | val obj = constructor.newInstance(cas).asInstanceOf[T] 34 | obj.setBegin(begin) 35 | obj.setEnd(end) 36 | obj.addToIndexes() 37 | obj 38 | } 39 | 40 | /** 41 | * @see org.apache.uima.fit.uitl.JCasUtil#select 42 | */ 43 | def select[T <: TOP](implicit cf: ClassTag[T]): Iterable[T] = 44 | JCasUtil.select(cas, cf.runtimeClass.asInstanceOf[Class[T]]) 45 | 46 | /** 47 | * @see org.apache.uima.fit.uitl.JCasUtil#selectByIndex 48 | */ 49 | def selectByIndex[T <: Annotation](index: Int)(implicit cf: ClassTag[T]) = 50 | 
JCasUtil.selectByIndex(cas, cf.runtimeClass.asInstanceOf[Class[T]], index) 51 | 52 | /** 53 | * @see org.apache.uima.fit.uitl.JCasUtil#selectCovered 54 | */ 55 | def selectCovered[T <: Annotation](coveringAnnotation: Annotation)(implicit cf: ClassTag[T]) = 56 | JCasUtil.selectCovered(cas, cf.runtimeClass.asInstanceOf[Class[T]], coveringAnnotation) 57 | 58 | /** 59 | * @see org.apache.uima.fit.uitl.JCasUtil#selectCovered 60 | */ 61 | def selectCovered[T <: Annotation](begin: Int, end: Int)(implicit cf: ClassTag[T]) = 62 | JCasUtil.selectCovered(cas, cf.runtimeClass.asInstanceOf[Class[T]], begin, end) 63 | 64 | /** 65 | * @see org.apache.uima.fit.uitl.JCasUtil#selectSingle 66 | */ 67 | def selectSingle[T <: TOP](implicit cf: ClassTag[T]) = 68 | JCasUtil.selectSingle(cas, cf.runtimeClass.asInstanceOf[Class[T]]) 69 | 70 | /** 71 | * @see org.apache.uima.fit.uitl.JCasUtil#selectPreceding 72 | */ 73 | def selectPreceding[T <: Annotation](annotation: Annotation, count: Int = Int.MaxValue)(implicit cf: ClassTag[T]): Buffer[T] = { 74 | JCasUtil.selectPreceding(cas, cf.runtimeClass.asInstanceOf[Class[T]], annotation, count); 75 | } 76 | 77 | /** 78 | * @see org.apache.uima.fit.uitl.JCasUtil#selectFollowing 79 | */ 80 | def selectFollowing[T <: Annotation](annotation: Annotation, count: Int = Int.MaxValue)(implicit cf: ClassTag[T]): Buffer[T] = { 81 | JCasUtil.selectFollowing(cas, cf.runtimeClass.asInstanceOf[Class[T]], annotation, count) 82 | } 83 | 84 | /** 85 | * @see org.apache.uima.fit.uitl.JCasUtil#exists 86 | */ 87 | def exists[T <: TOP](implicit ct: ClassTag[T]) = 88 | JCasUtil.exists(cas, ct.runtimeClass.asInstanceOf[Class[T]]) 89 | 90 | /** 91 | * @see org.apache.uima.fit.uitl.JCasUtil#getView 92 | */ 93 | def getView(name: String, fallback: JCas) = 94 | JCasUtil.getView(cas, name, fallback) 95 | 96 | /** 97 | * @see org.apache.uima.fit.uitl.JCasUtil#getView 98 | */ 99 | def getView(name: String, create: Boolean) = 100 | JCasUtil.getView(cas, name, create) 101 | } 
102 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/ConverterSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.specs2._ 7 | import matcher._ 8 | import java.util.regex.Pattern 9 | import java.util.Locale 10 | import java.io.File 11 | import scala.util.matching.Regex 12 | 13 | class ConverterSpec extends Specification { 14 | import CastFactory._ 15 | 16 | // format: OFF 17 | def is = s2""" 18 | The Formatter should 19 | convert string ${convert("test", "test")} 20 | convert int ${convert(1, 1)} 21 | convert float ${convert(1.2f, 1.2f)} 22 | convert double ${convert(1.2d, 1.2d)} 23 | convert boolean ${convert(true, true)} 24 | convert locale ${convert(new Locale("en"), "en")} 25 | convert pattern ${convert(Pattern.compile("[A-Z]*"), "[A-Z]*", Some(patToString _))} 26 | convert regex ${convert("[A-Z]*".r, "[A-Z]*", Some(regToString _))} 27 | convert file ${convert(new File("/test/abc"), "/test/abc")} 28 | 29 | convert string list ${convert(List("t", "v"), Array("t", "v"))} 30 | convert int list ${convert(List(1, 2), Array(1, 2))} 31 | convert float list ${convert(List(1.2f, 2.3f), Array(1.2f, 2.3f))} 32 | convert double list ${convert(List(1.2d, 2.3d), Array(1.2d, 2.3d))} 33 | convert boolean list ${convert(List(true, false, true), Array(true, false, true))} 34 | convert pattern list ${convert(List(Pattern.compile("[A-Z]"), Pattern.compile("[1-4]")), Array("[A-Z]", "[1-4]"), Some{in: List[Pattern] => in.map(patToString)})} 35 | convert regex list ${convert(List("[A-Z]".r, "[1-4]".r), Array("[A-Z]", "[1-4]"), Some{in: List[Regex] => in.map(regToString)})} 36 | convert file list ${convert(List(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 37 | convert locale list ${convert(List(new Locale("de"), 
new Locale("en")), Array("de", "en"))} 38 | convert file list ${convert(List(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 39 | 40 | convert string Seq ${convert(Seq("t", "v"), Array("t", "v"))} 41 | convert int Seq ${convert(Seq(1, 2), Array(1, 2))} 42 | convert float Seq ${convert(Seq(1.2f, 2.3f), Array(1.2f, 2.3f))} 43 | convert double Seq ${convert(Seq(1.2d, 2.3d), Array(1.2d, 2.3d))} 44 | convert boolean Seq ${convert(Seq(true, false, true), Array(true, false, true))} 45 | convert pattern Seq ${convert(Seq(Pattern.compile("[A-Z]"), Pattern.compile("[1-4]")), Array("[A-Z]", "[1-4]"), Some{in: Seq[Pattern] => in.map(patToString)})} 46 | convert regex Seq ${convert(Seq("[A-Z]".r, "[1-4]".r), Array("[A-Z]", "[1-4]"), Some{in: Seq[Regex] => in.map(regToString)})} 47 | convert file Seq" ${convert(Seq(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 48 | convert locale Seq" ${convert(Seq(new Locale("de"), new Locale("en")), Array("de", "en"))} 49 | convert file Seq" ${convert(Seq(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))} 50 | 51 | convert string option ${convert(Some("test"), "test")} 52 | convert int option ${convert(Some(1), 1)} 53 | convert float option ${convert(Some(1.2f), 1.2f)} 54 | convert double option ${convert(Some(1.2d), 1.2d)} 55 | convert boolean option ${convert(Some(true), true)} 56 | convert locale option ${convert(Some(new Locale("en")), "en")} 57 | convert pattern option ${convert(Some(Pattern.compile("[A-Z]*")), "[A-Z]*", Some(optPatToString _))} 58 | convert regex option ${convert(Some("[A-Z]*".r), "[A-Z]*", Some(optRegToString _))} 59 | convert file option ${convert(Some(new File("/test/abc")), "/test/abc")} 60 | convert none option ${convert(None, null)} 61 | """ 62 | 63 | 64 | def convert[T, R](in: T, out: R, func: Option[T => _] = None)(implicit m: Manifest[T], r: Manifest[R]) = { 65 | val to = toUima(in) 66 | to must beRight 67 | to.right.get must beSome 68 | 
to.right.get.get must_== out 69 | 70 | val from = fromUima[T](out) 71 | from must beRight 72 | from.right.get must beSome 73 | func match { 74 | case Some(f) => f(from.right.get.get) must_== f(in) 75 | case None => from.right.get.get must_== in 76 | } 77 | } 78 | 79 | def patToString(in: Pattern) = in.pattern 80 | def regToString(in: Regex) = in.pattern.pattern 81 | def optPatToString(in: Option[Pattern]) = in.map(_.pattern) 82 | def optRegToString(in: Option[Regex]) = in.map(_.pattern.pattern) 83 | } 84 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/SCasAnnotator_ImplBaseSpecs.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.specs2.mutable.Specification 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | import org.apache.uima.jcas.JCas 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.resource.Resource_ImplBase 11 | import org.apache.uima.resource.SharedResourceObject 12 | import org.apache.uima.resource.DataResource 13 | 14 | class SCasAnnotator_ImplBaseSpecs extends Specification { 15 | 16 | "SCasAnnotator_ImplBase" should { 17 | 18 | "initialize one string parameter in a Annotator" in { 19 | val d = new DummyAnnotator().config( 20 | _.stringParam := "dummy").asAnalysisEngine 21 | 22 | val cas = d.newJCas 23 | d.process(cas) 24 | 25 | cas.getDocumentText must be equalTo ("dummy") 26 | } 27 | 28 | "initalize two parameters in a Annotator" in { 29 | val d = new Dummy2Annotator().config( 30 | _.stringParam := "dummy", 31 | _.intParam := 1).asAnalysisEngine 32 | 33 | val cas = d.newJCas 34 | d.process(cas) 35 | 36 | cas.getDocumentText must be equalTo ("dummy1") 37 | } 38 | 39 | "initalize list parameter in a Annotator" in { 40 | val d = new Dummy3Annotator().config( 
41 | _.listParam := List("2", "3")).asAnalysisEngine 42 | 43 | val cas = d.newJCas 44 | d.process(cas) 45 | 46 | cas.getDocumentText must be equalTo ("23") 47 | } 48 | 49 | "not require to set a optinal value" in { 50 | val d = new Dummy2Annotator().config( 51 | _.stringParam := "dummy").asAnalysisEngine 52 | 53 | val cas = d.newJCas 54 | d.process(cas) 55 | 56 | cas.getDocumentText must be equalTo ("dummy100") 57 | } 58 | 59 | "initialize a Annotator with a SharedResourceObject" in { 60 | val d = new ResourceDummyAnnotator().config( 61 | _.dict := SharedBinding[SharedDict](new java.io.File("/path/to/nowhere")), 62 | _.name := Binding[SharedName2]()).asAnalysisEngine 63 | val cas = d.newJCas 64 | d.process(cas) 65 | 66 | cas.getDocumentText() must be equalTo ("SharedDict|SharedName2") 67 | } 68 | } 69 | } 70 | 71 | class DummyAnnotator extends SCasAnnotator_ImplBase { 72 | 73 | object stringParam extends Parameter[String]("test") 74 | 75 | def process(cas: JCas) = { 76 | cas.setDocumentText(stringParam.is) 77 | } 78 | } 79 | 80 | class Dummy2Annotator extends SCasAnnotator_ImplBase { 81 | 82 | object stringParam extends Parameter[String]("test") 83 | object intParam extends Parameter[Int](100) 84 | 85 | def process(cas: JCas) = { 86 | cas.setDocumentText(stringParam.is + intParam.is) 87 | } 88 | } 89 | 90 | class Dummy3Annotator extends SCasAnnotator_ImplBase { 91 | 92 | object listParam extends Parameter[List[String]](List("a", "b")) 93 | 94 | def process(cas: JCas) = { 95 | cas.setDocumentText(listParam.is.foldLeft("")(_ + _)) 96 | } 97 | } 98 | 99 | class SharedDict extends SharedResourceObject { 100 | def load(data: DataResource) = {} 101 | 102 | def name = "SharedDict" 103 | } 104 | class SharedName extends Resource_ImplBase { def name = "SharedName" } 105 | class SharedName2 extends SharedName { override def name = "SharedName2" } 106 | 107 | class ResourceDummyAnnotator extends SCasAnnotator_ImplBase { 108 | object dict extends 
SharedResource[SharedDict]("/path/to/nowhere") 109 | object name extends Resource[SharedName] 110 | 111 | def process(cas: JCas) = { 112 | cas.setDocumentText(dict.resource.name + "|" + name.resource.name); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/SimplePipelineSpecs.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core 5 | 6 | import org.specs2.mutable.Specification 7 | import org.apache.uima.jcas.JCas 8 | import org.apache.uima.util.ProgressImpl 9 | import org.apache.uima.jcas.tcas.Annotation 10 | 11 | class SimplePipelineSpecs extends Specification { 12 | 13 | "SimplePipeline" should { 14 | "add one reader and one annotator" in { 15 | try { 16 | new PipelineDummyReader() ~> new PipelineAnnotatorA() run () 17 | 18 | success 19 | } catch { 20 | // Catch only Exception: a bare `case e ⇒` also traps fatal Throwables 21 | case e: Exception ⇒ { 22 | e.printStackTrace() 23 | failure 24 | } 25 | } 26 | } 27 | 28 | "add one reader and two annotator" in { 29 | try { 30 | new PipelineDummyReader() ~> 31 | new PipelineAnnotatorA() ~> 32 | new PipelineAnnotatorB() run () 33 | 34 | success 35 | } catch { 36 | case _: Exception ⇒ failure 37 | } 38 | } 39 | } 40 | } 41 | 42 | class PipelineDummyReader extends SCasCollectionReader_ImplBase { 43 | val total = 2 44 | var i = total 45 | 46 | def getNext(cas: JCas) = { 47 | cas.setDocumentText("Doc" + i) 48 | i = i - 1 49 | } 50 | 51 | def getProgress = Array(new ProgressImpl(total - i, total, "test")) 52 | 53 | def hasNext = i > 0 54 | } 55 | 56 | class PipelineAnnotatorA extends SCasAnnotator_ImplBase { 57 | def process(cas: JCas) = { 58 | new Annotation(cas, 0, 1).addToIndexes 59 | } 60 | } 61 | 62 | class PipelineAnnotatorB extends SCasAnnotator_ImplBase { 63 | def process(cas: JCas) = { 64 | new Annotation(cas, 1, 2).addToIndexes 65 | } 66 | }
-------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ConfigurationInitalizationSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.specs2.Specification 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | 9 | class ConfigurationInitalizationSpec extends Specification { 10 | def is = s2""" 11 | This is a specification to check the configuration system 12 | 13 | ConfigMock should 14 | 15 | have 4 parameters ${nbParams(4)} 16 | have a parameter called 'stringParam' ${hasParam("stringParam")} 17 | have a parameter called 'optStringParam' ${hasParam("stringListParam")} 18 | have a parameter called 'intParam' ${hasParam("intParam")} 19 | have a parameter called 'optIntParam' ${hasParam("intListParam")} 20 | """ 21 | 22 | def hasParam(name: String) = 23 | new ConfigMock().parameters.map(_.name).contains(name) must beTrue 24 | 25 | def nbParams(count: Int) = 26 | new ConfigMock().parameters.size must be equalTo (count) 27 | 28 | def createKeyValues = { 29 | val config = new ConfigMock() 30 | config.stringParam := "Test" 31 | config.intParam := 100 32 | 33 | config.parameterKeyValues.toList. 
34 | sliding(2, 2).map(l ⇒ Pair(l(0).asInstanceOf[String], l(1))).toList.sortBy(_._1) must be equalTo (List( 35 | ("intListParam", Array(1, 2).asInstanceOf[Object]), 36 | ("intParam", 100.asInstanceOf[Object]), 37 | ("stringListParam", Array("ab", "cd").asInstanceOf[Object]), 38 | ("stringParam", "Test".asInstanceOf[Object]))) 39 | } 40 | } 41 | 42 | class ConfigMock extends Configurable with ConfigurationInitialization { 43 | object stringParam extends Parameter[String]("test") 44 | object stringListParam extends Parameter[List[String]](List("ab", "cd")) 45 | 46 | object intParam extends Parameter[Int](1) 47 | object intListParam extends Parameter[List[Int]](List(1, 2)) 48 | 49 | object somethingElse { 50 | def someMethod = "Anything" 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ParameterSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.specs2.Specification 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | import java.util.regex.Pattern 9 | 10 | class ParameterSpec extends Specification { 11 | 12 | // format: OFF 13 | def is = s2""" 14 | A Parameter can 15 | return default values if not value is set ${defaultVal} 16 | have a new value ${newVal} 17 | be set from uima ${fromUima} 18 | be converted to uima ${toUima} 19 | be mutli valued ${multiVal} 20 | be single valued ${singleVal} 21 | have a correct uima type ${uimaType} 22 | """ 23 | 24 | def defaultVal = { 25 | object param extends Parameter[String]("a") 26 | param.is must_== "a" 27 | } 28 | 29 | def newVal = { 30 | object param extends Parameter[String]("a") 31 | param := "b" 32 | param.is must_== "b" 33 | } 34 | 35 | def fromUima = { 36 | object param extends 
Parameter[Pattern](Pattern.compile("[A-Z]")) 37 | param.setFromUimaType("[1-4]") 38 | param.is.pattern must_== "[1-4]" 39 | } 40 | 41 | def toUima = { 42 | object param extends Parameter[Pattern](Pattern.compile("[A-Z]")) 43 | param.toUimaType must beRight 44 | param.toUimaType.right.get must_== "[A-Z]" 45 | } 46 | 47 | def multiVal = { 48 | object l extends Parameter[List[String]](List("b")) 49 | object s extends Parameter[Seq[String]](Seq("b")) 50 | object a extends Parameter[Array[String]](Array("b")) 51 | 52 | l.multiValued_? must_== true 53 | s.multiValued_? must_== true 54 | a.multiValued_? must_== true 55 | } 56 | 57 | def singleVal = { 58 | object param extends Parameter[String]("b") 59 | param.multiValued_? must_== false 60 | } 61 | 62 | def uimaType = { 63 | object p1 extends Parameter[Pattern](Pattern.compile("[A-Z]")) 64 | object p2 extends Parameter[String]("a") 65 | object p3 extends Parameter[Float](1.2f) 66 | object p4 extends Parameter[Boolean](true) 67 | object p5 extends Parameter[Int](2) 68 | 69 | object p6 extends Parameter[List[Pattern]](List(Pattern.compile("[A-Z]"))) 70 | object p7 extends Parameter[List[String]](List("a")) 71 | object p8 extends Parameter[List[Float]](List(1.2f)) 72 | object p9 extends Parameter[List[Boolean]](List(true)) 73 | object p10 extends Parameter[List[Int]](List(2)) 74 | 75 | p1.uimaType must_== "String" 76 | p2.uimaType must_== "String" 77 | p3.uimaType must_== "Float" 78 | p4.uimaType must_== "Boolean" 79 | p5.uimaType must_== "Integer" 80 | 81 | p6.uimaType must_== "String" 82 | p7.uimaType must_== "String" 83 | p8.uimaType must_== "Float" 84 | p9.uimaType must_== "Boolean" 85 | p10.uimaType must_== "Integer" 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ResourceInitializationSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * 
Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.configuration 5 | 6 | import org.specs2.Specification 7 | import org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import org.apache.uima.resource.Resource_ImplBase 10 | 11 | class ResourceInitializationSpec extends Specification { 12 | def is = s2""" 13 | This specification describes the resource initialization 14 | 15 | The ResourceMock class should 16 | have 4 resource objects ${nbResource(4)} 17 | have a resource called 'dictionary' ${hasResource("dictionary")} 18 | have a resource called 'name' ${hasResource("name")} 19 | have a resource called 'stopwords' ${hasResource("stopwords")} 20 | have a resource called 'optName' ${hasResource("optName")} 21 | return the correct dictionary resource (todo) 22 | return the correct name resource (todo) 23 | return the correct stopwords resource (todo) 24 | return the correct optName resource (todo) 25 | """ 26 | 27 | def nbResource(count: Int) = 28 | new ResourceMock().resources.size must be equalTo (count) 29 | 30 | def hasResource(name: String) = 31 | new ResourceMock().resources.map(_.name).contains(name) must beTrue 32 | 33 | } 34 | 35 | class ResourceMock extends Configurable with ResourceInitialization { 36 | 37 | object dictionary extends SharedResource[SharedDict]("/path/to/noWhere") 38 | 39 | object name extends Resource[SharedName] 40 | 41 | object stopwords extends SharedResource[SharedStopword]("/path/to/noWhere") 42 | 43 | object optName extends Resource[SharedOptName] 44 | } 45 | 46 | class SharedName extends Resource_ImplBase { 47 | def name = "myName" 48 | } 49 | 50 | class SharedOptName extends Resource_ImplBase { 51 | def name = "myOptName" 52 | } 53 | 54 | class SharedDict extends SharedResourceObject { 55 | 56 | def load(data: DataResource) = 57 | data.getUri.toString 58 | 59 | def getDict = "dict" 60 | } 61 | 62 | class SharedStopword extends SharedResourceObject { 63 | 
64 | def load(data: DataResource) = 65 | data.getUri.toString 66 | 67 | def getStopword = "stopword" 68 | } 69 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ResourceSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.configuration 5 | 6 | import java.util.regex.Pattern 7 | import com.github.jenshaase.uimascala.core.configuration._ 8 | import org.apache.uima.resource.Resource_ImplBase 9 | import org.apache.uima.resource.SharedResourceObject 10 | import org.apache.uima.resource.DataResource 11 | import org.specs2.Specification 12 | 13 | class ResourceSpec extends Specification { 14 | 15 | // format: OFF 16 | def is = s2""" 17 | A Resource should 18 | return default parameters ${defaultParams} 19 | convert parameter to a list ${paramToList} 20 | bind a resource ${bind} 21 | bind a resource from Uima ${bindUima} 22 | be set by a Binding ${setBinding} 23 | return a class name ${className} 24 | return a interface name ${interfaceName} 25 | 26 | A SharedResource should 27 | return default parameters ${sharedDefaultParams} 28 | convert parameter to a list ${sharedParamToList} 29 | bind a resource ${sharedBind} 30 | bind a resource from Uima ${sharedBindUima} 31 | be set by a Binding ${sharedSetBinding} 32 | return a class name ${sharedClassName} 33 | return a interface name ${sharedInterfaceName} 34 | """ 35 | 36 | def defaultParams = { 37 | object r extends Resource[DummyRes](Map("a" -> "b")) 38 | r.params must_== Map("a" -> "b") 39 | } 40 | 41 | def paramToList = { 42 | object r extends Resource[DummyRes](Map("a" -> "b")) 43 | r.parameterList must_== Seq("a", "b") 44 | } 45 | 46 | def bind = { 47 | object r extends Resource[DummyRes]() 48 | val o = new DummyRes() 49 | r.bind(o) 50 | 51 | r.resource must_== o 52 | } 53 | 54 | def 
bindUima = { 55 | object r extends Resource[DummyRes]() 56 | val o = new DummyRes() 57 | r.setFromUima(o) 58 | 59 | r.resource must_== o 60 | } 61 | 62 | def setBinding = { 63 | object r extends Resource[DummyRes](Map("a" -> "b")) 64 | r := Binding(Map("c" -> "d")) 65 | r.params must_== Map("c" -> "d") 66 | } 67 | 68 | def className = { 69 | object r extends Resource[DummyRes](Map("a" -> "b")) 70 | r.className.getName must_== classOf[DummyRes].getName 71 | } 72 | 73 | def interfaceName = { 74 | object r extends Resource[DummyRes](Map("a" -> "b")) 75 | r.interfaceName must_== classOf[DummyRes].getName 76 | } 77 | 78 | class DummyRes extends Resource_ImplBase { def name = "DummyRes" } 79 | 80 | 81 | // Shared Resource 82 | 83 | def sharedDefaultParams = { 84 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 85 | r.url must_== "/test/data" 86 | r.params must_== Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object]) 87 | } 88 | 89 | def sharedParamToList = { 90 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 91 | r.parameterList must_== Seq("a", "b") 92 | } 93 | 94 | def sharedBind = { 95 | object r extends SharedResource[DummyShared]("/test/data") 96 | val o = new DummyShared() 97 | r.bind(o) 98 | 99 | r.resource must_== o 100 | } 101 | 102 | def sharedBindUima = { 103 | object r extends SharedResource[DummyShared]("/test/data") 104 | val o = new DummyShared() 105 | r.setFromUima(o) 106 | 107 | r.resource must_== o 108 | } 109 | 110 | def sharedSetBinding = { 111 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 112 | r := SharedBinding("/abc/def", Map("c".asInstanceOf[Object] -> "d".asInstanceOf[Object])) 113 | r.url must_== "/abc/def" 114 | r.params must_== Map("c".asInstanceOf[Object] -> "d".asInstanceOf[Object]) 115 | } 116 | 117 | def sharedClassName = 
{ 118 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 119 | r.className.getName must_== classOf[DummyShared].getName 120 | } 121 | 122 | def sharedInterfaceName = { 123 | object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])) 124 | r.interfaceName must_== classOf[DummyShared].getName 125 | } 126 | 127 | class DummyShared extends SharedResourceObject { 128 | def load(data: DataResource) = {} 129 | def name = "SharedDict" 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/stream/annotatorsSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.core.stream 2 | 3 | import org.specs2.mutable._ 4 | import fs2._ 5 | import org.apache.uima.jcas.tcas.Annotation 6 | import com.github.jenshaase.uimascala.core._ 7 | 8 | class annotateSpec extends Specification { 9 | 10 | import annotators._ 11 | 12 | "Annotators" should { 13 | 14 | def tokenizeText[F[_]] = 15 | casFromText[F] andThen whitespaceTokenizer[F, Annotation](false) 16 | 17 | "tokenize a document" in { 18 | val p = Stream.pure("this is a text", " and another text "). 19 | through(tokenizeText). 20 | through(extractCas { cas => 21 | cas.select[Annotation].drop(1).map(_.getCoveredText).toList 22 | }) 23 | 24 | p.toList must be equalTo (List( 25 | List("this", "is", "a", "text"), 26 | List("and", "another", "text") 27 | )) 28 | } 29 | 30 | "remove stopwords" in { 31 | val p = Stream.pure("this is a text", " and another text "). 32 | through(tokenizeText). 33 | through(removeStopwords[Pure, Annotation](s => Set("is", "a").contains(s))). 
34 | through(extractCas { cas => 35 | cas.select[Annotation].drop(1).map(_.getCoveredText).toList 36 | }) 37 | 38 | p.toList must be equalTo (List( 39 | List("this", "text"), 40 | List("and", "another", "text") 41 | )) 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/util/Helper.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.util 5 | 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.util.CasCreationUtils 8 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 9 | 10 | /** 11 | * @author Jens Haase 12 | */ 13 | 14 | trait Helper { 15 | 16 | def newJCas: JCas = { 17 | CasCreationUtils.createCas( 18 | TypeSystemDescriptionFactory.createTypeSystemDescription, null, null).getJCas 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/wrapper/AnnotationWrapperSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.specs2.mutable.Specification 7 | import org.apache.uima.util.CasCreationUtils 8 | import org.apache.uima.fit.factory.{ TypePrioritiesFactory, TypeSystemDescriptionFactory } 9 | import org.apache.uima.jcas.JCas 10 | import org.apache.uima.jcas.tcas.Annotation 11 | import com.github.jenshaase.uimascala.core._ 12 | import util.Helper 13 | 14 | /** 15 | * @author Jens Haase 16 | */ 17 | class AnnotationWrapperSpec extends Specification with Helper { 18 | 19 | "Annotation Wrapper" should { 20 | 21 | "trim a annotation" in { 22 | val cas = newJCas 23 | cas.setDocumentText("This is text") 24 | 25 | val a = new Annotation(cas, 4, 8) 
26 | a.getCoveredText must be equalTo (" is ") 27 | a.trim.getCoveredText must be equalTo ("is") 28 | } 29 | 30 | "check if a annotation is empty" in { 31 | new Annotation(newJCas, 0, 0).isEmpty must beTrue 32 | new Annotation(newJCas, 0, 1).isEmpty must beFalse 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /core/src/test/scala/com/github/jenshaase/uimascala/core/wrapper/JCasWrapperSpec.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2011 Jens Haase 3 | */ 4 | package com.github.jenshaase.uimascala.core.wrapper 5 | 6 | import org.specs2.mutable.Specification 7 | import com.github.jenshaase.uimascala.core._ 8 | import util.Helper 9 | import org.apache.uima.jcas.tcas.Annotation 10 | import org.apache.uima.jcas.JCas 11 | 12 | /** 13 | * @author Jens Haase 14 | */ 15 | class JCasWrapperSpec extends Specification with Helper { 16 | 17 | class Token(cas: JCas, begin: Int, end: Int) extends Annotation(cas, begin, end) 18 | 19 | "JCasWrapper" should { 20 | 21 | "select annotation of same type" in { 22 | val cas = newJCas 23 | cas.setDocumentText("This is a text") 24 | 25 | new Annotation(cas, 0, 4).addToIndexes 26 | new Annotation(cas, 5, 7).addToIndexes 27 | 28 | // Note: One Annotation and one DocumentAnnotation are default 29 | // to each new JCas 30 | cas.select[Annotation].size must be equalTo (3) 31 | } 32 | 33 | "select annotation by index" in { 34 | val cas = newJCas 35 | cas.setDocumentText("This is a text") 36 | 37 | new Annotation(cas, 0, 4).addToIndexes 38 | 39 | cas.selectByIndex[Annotation](1).getCoveredText must be equalTo ("This") 40 | } 41 | 42 | "select all anntation covered by another annotation" in { 43 | val cas = newJCas 44 | cas.setDocumentText("This is a text") 45 | 46 | val a1 = new Annotation(cas, 0, 4) 47 | a1.addToIndexes 48 | val a2 = new Annotation(cas, 0, 1) 49 | a2.addToIndexes 50 | val a3 = new Annotation(cas, 1, 
2) 51 | a3.addToIndexes 52 | 53 | cas.selectCovered[Annotation](a1).size must be equalTo (2) 54 | cas.selectCovered[Annotation](a1).get(0).getCoveredText must be equalTo ("T") 55 | } 56 | 57 | "select a single annotation" in { 58 | val cas = newJCas 59 | cas.setDocumentText("This is a text") 60 | 61 | cas.selectSingle[Annotation].getCoveredText must be equalTo ("This is a text") 62 | } 63 | 64 | "select all preceding annotation" in { 65 | val cas = newJCas 66 | cas.setDocumentText("This is a text") 67 | 68 | val a1 = new Annotation(cas, 0, 4) 69 | a1.addToIndexes 70 | val a2 = new Annotation(cas, 5, 7) 71 | a2.addToIndexes 72 | val a3 = new Annotation(cas, 8, 9) 73 | a3.addToIndexes 74 | 75 | val p1 = cas.selectPreceding[Annotation](a2, 1) 76 | p1.size must be equalTo (1) 77 | p1.head.getCoveredText must be equalTo (a1.getCoveredText) 78 | } 79 | 80 | "select all following annotation" in { 81 | val cas = newJCas 82 | cas.setDocumentText("This is a text") 83 | 84 | val a1 = new Annotation(cas, 0, 4) 85 | a1.addToIndexes 86 | val a2 = new Annotation(cas, 5, 7) 87 | a2.addToIndexes 88 | val a3 = new Annotation(cas, 8, 9) 89 | a3.addToIndexes 90 | 91 | val p1 = cas.selectFollowing[Annotation](a2, 1) 92 | p1.size must be equalTo (1) 93 | p1.head.getCoveredText must be equalTo (a3.getCoveredText) 94 | } 95 | 96 | "checks if an annotation type exists" in { 97 | val cas = newJCas 98 | cas.setDocumentText("This is a text") 99 | 100 | cas.exists[Annotation] must beTrue 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /language-identification/n-gram-language-identifier/src/main/scala/com/github/jenshaase/uimascala/languageidentifier/NGramLanguageIdentifier.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.languageidentifier 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import 
com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import scala.collection.JavaConversions._ 8 | import com.optimaize.langdetect.text.CommonTextObjectFactories 9 | import com.optimaize.langdetect.ngram.NgramExtractors 10 | import com.optimaize.langdetect.profiles._ 11 | import com.optimaize.langdetect._ 12 | 13 | class NGramLanguageIdentifier extends SCasAnnotator_ImplBase { 14 | 15 | object shortText extends Parameter[Boolean](false) 16 | 17 | lazy val languageDetector = { 18 | val languageProfiles = new LanguageProfileReader().readAllBuiltIn() 19 | LanguageDetectorBuilder.create(NgramExtractors.standard()) 20 | .withProfiles(languageProfiles) 21 | .build() 22 | } 23 | 24 | def process(jcas: JCas) = { 25 | // Pick the text-object factory matching the expected input length: 26 | // the short-clean-text factory for short snippets, the large-text one otherwise. 27 | val textObjectFactory = 28 | if (shortText.is) { 29 | CommonTextObjectFactories.forDetectingShortCleanText() 30 | } else { 31 | CommonTextObjectFactories.forDetectingOnLargeText() 32 | } 33 | 34 | val text = textObjectFactory.forText(jcas.getDocumentText); 35 | val lang = languageDetector.detect(text) 36 | if (lang.isPresent()) { 37 | jcas.setDocumentLanguage(lang.get().toString) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /language-identification/n-gram-language-identifier/src/test/scala/com/github/jenshaase/uimascala/languageidentifier/NGramLanguageIdentifierSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.languageidentifier 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | 9 | class NGramLanguageIdentifierSpec extends Specification { 10 | 11 | "The ngram language idenifier" should { 12 | 
"detect the german language" in { 13 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().asAnalysisEngine 14 | 15 | val jcas = analyser.newJCas() 16 | jcas.setDocumentText("Das ist ein Text in deutscher Sprache") 17 | analyser.process(jcas) 18 | 19 | jcas.getDocumentLanguage must be equalTo("de") 20 | } 21 | 22 | "detect the english language" in { 23 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().asAnalysisEngine 24 | 25 | val jcas = analyser.newJCas() 26 | jcas.setDocumentText("This is a english text with so information.") 27 | analyser.process(jcas) 28 | 29 | jcas.getDocumentLanguage must be equalTo("en") 30 | } 31 | 32 | "detect the german language in short text snippets" in { 33 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().config(_.shortText := true).asAnalysisEngine 34 | 35 | val jcas = analyser.newJCas() 36 | jcas.setDocumentText("Das ist ein Text in deutscher Sprache") 37 | analyser.process(jcas) 38 | 39 | jcas.getDocumentLanguage must be equalTo("de") 40 | } 41 | 42 | "detect the english language in short text snippets" in { 43 | val analyser: AnalysisEngine = new NGramLanguageIdentifier().config(_.shortText := true).asAnalysisEngine 44 | 45 | val jcas = analyser.newJCas() 46 | jcas.setDocumentText("This is a english text with so information.") 47 | analyser.process(jcas) 48 | 49 | jcas.getDocumentLanguage must be equalTo("en") 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /lemmatizer/mate-lemmatizer/src/main/scala/com/github/jenshaase/uimascala/lemmatizer/MateLemmatizer.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.lemmatizer 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import 
org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import scala.collection.JavaConversions._ 10 | import is2.data.SentenceData09 11 | import is2.io.CONLLReader09 12 | import is2.io.IOGenerals 13 | import is2.lemmatizer.Lemmatizer 14 | 15 | class MateLemmatizerResource extends SharedResourceObject { 16 | private var lemmatizer: Lemmatizer = _ 17 | 18 | def load(data: DataResource) { 19 | val uri = data.getUri.toString 20 | 21 | if (new java.io.File(uri).exists) { 22 | lemmatizer = new Lemmatizer(uri) 23 | } else { 24 | val resourceUri = if (uri.startsWith("/")) uri else "/" + uri 25 | val resource = this.getClass.getResource(resourceUri) 26 | 27 | val file = java.io.File.createTempFile("mate-lemmatizer", ".temp") 28 | file.deleteOnExit(); 29 | 30 | val source = resource.openStream(); 31 | try { 32 | java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING); 33 | } finally { 34 | source.close(); 35 | } 36 | 37 | lemmatizer = new Lemmatizer(file.getAbsolutePath) 38 | } 39 | } 40 | 41 | def getLemmatizer = lemmatizer 42 | } 43 | 44 | class MateLemmatizer extends SCasAnnotator_ImplBase { 45 | 46 | object model extends SharedResource[MateLemmatizerResource]("") 47 | 48 | def process(jcas: JCas) = { 49 | jcas.select[Sentence].foreach { sentence => 50 | val tokens = jcas.selectCovered[Token](sentence).toVector 51 | 52 | val sentenceData = new SentenceData09() 53 | sentenceData.init(Array[String](IOGenerals.ROOT) ++ tokens.map(_.getCoveredText)) 54 | 55 | model.resource.getLemmatizer.apply(sentenceData).plemmas.zipWithIndex.foreach { case (tag, idx) => 56 | val token = tokens(idx) 57 | 58 | val lemma = new Lemma(jcas, token.getBegin, token.getEnd) 59 | lemma.setValue(tag) 60 | add(lemma) 61 | 62 | token.setLemma(lemma) 63 | } 64 | } 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- 
/lemmatizer/mate-lemmatizer/src/test/scala/com/github/jenshaase/uimascala/lemmatizer/MateLemmatizerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.lemmatizer 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.util.JCasUtil 11 | 12 | class MateLemmatizerSpec extends Specification { 13 | 14 | "MateLemmatizer" should { 15 | "lemmatize each word in a sentence" in { 16 | val tagger: AnalysisEngine = new MateLemmatizer(). 17 | config( 18 | _.model := SharedBinding[MateLemmatizerResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/lemmatizer-de-tiger.model") 19 | ). 20 | asAnalysisEngine 21 | 22 | val jcas = tagger.newJCas() 23 | jcas.setDocumentText("Hallo Welt! 
Was geht?") 24 | jcas.annotate[Sentence](0, 10) 25 | jcas.annotate[Sentence](12, 20) 26 | jcas.annotate[Token](0, 5) 27 | jcas.annotate[Token](6, 10) 28 | jcas.annotate[Token](12, 15) 29 | jcas.annotate[Token](16, 20) 30 | tagger.process(jcas) 31 | 32 | jcas.select[Lemma].size must be equalTo(4) 33 | jcas.selectByIndex[Lemma](0).getCoveredText must be equalTo ("Hallo") 34 | jcas.selectByIndex[Lemma](1).getCoveredText must be equalTo ("Welt") 35 | jcas.selectByIndex[Lemma](2).getCoveredText must be equalTo ("Was") 36 | jcas.selectByIndex[Lemma](3).getCoveredText must be equalTo ("geht") 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /name-entity-recognizer/stanford-ner/src/main/scala/com/github/jenshaase/uimascala/ner/StanfordNer.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.ner 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import edu.stanford.nlp.ling.TaggedWord 10 | import edu.stanford.nlp.ie.crf.CRFClassifier 11 | import edu.stanford.nlp.util.CoreMap 12 | import edu.stanford.nlp.ling.CoreLabel 13 | import edu.stanford.nlp.ling.CoreAnnotations 14 | import scala.collection.JavaConversions._ 15 | import java.util.zip.GZIPInputStream 16 | 17 | 18 | class StanfordNerResource extends SharedResourceObject { 19 | private var tagger: CRFClassifier[CoreMap] = _ 20 | 21 | def load(data: DataResource) { 22 | val uri = data.getUri.toString 23 | 24 | if (new java.io.File(uri).exists) { 25 | tagger = CRFClassifier.getClassifier[CoreMap](new java.io.File(uri)) 26 | } else { 27 | val resourceUri = if (uri.startsWith("/")) uri else "/" + uri 28 | val resource = 
this.getClass.getResource(resourceUri) 29 | 30 | val is = if (uri.endsWith(".gz")) { 31 | new GZIPInputStream(resource.openStream) 32 | } else { 33 | resource.openStream 34 | } 35 | 36 | tagger = CRFClassifier.getClassifier[CoreMap](is) 37 | } 38 | } 39 | 40 | def getTagger = tagger 41 | } 42 | 43 | class StanfordNer extends SCasAnnotator_ImplBase { 44 | 45 | object model extends SharedResource[StanfordNerResource]("") 46 | 47 | def process(jcas: JCas) = { 48 | jcas.select[Sentence].foreach { sentence => 49 | val tokens = jcas.selectCovered[Token](sentence).toVector 50 | 51 | model.resource.getTagger. 52 | classifySentence(tokens.map(tokenToCoreLabel _)). 53 | foldLeft[(Int, Int, Option[String])](-1, -1, None) { case ((begin, end, currentType), taggedWord) => 54 | val tokenType = taggedWord.get(classOf[CoreAnnotations.AnswerAnnotation]) 55 | val tokenBegin = taggedWord.get(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation]) 56 | val tokenEnd = taggedWord.get(classOf[CoreAnnotations.CharacterOffsetEndAnnotation]) 57 | 58 | (tokenType, currentType) match { 59 | case ("O", Some(b)) => 60 | val namedEntity = new NamedEntity(jcas, begin, end) 61 | namedEntity.setValue(b) 62 | add(namedEntity) 63 | (begin, end, None) 64 | 65 | case (a, Some(b)) if (a != b) => 66 | val namedEntity = new NamedEntity(jcas, begin, end) 67 | namedEntity.setValue(b) 68 | add(namedEntity) 69 | (begin, tokenEnd, Some(tokenType)) 70 | 71 | case (a, None) if (a != "O") => 72 | (tokenBegin, tokenEnd, Some(tokenType)) 73 | 74 | case (a, Some(b)) if (a == b) => 75 | (begin, tokenEnd, Some(tokenType)) 76 | 77 | case ("O", None) => 78 | (begin, end, currentType) 79 | } 80 | } 81 | } 82 | } 83 | 84 | def tokenToCoreLabel(token: Token): CoreLabel = { 85 | val word = new CoreLabel() 86 | word.setValue(token.getCoveredText) 87 | word.setOriginalText(token.getCoveredText) 88 | word.setWord(token.getCoveredText) 89 | word.setBeginPosition(token.getBegin) 90 | word.setEndPosition(token.getEnd) 91 | 92 | 
if (token.getPos != null) { 93 | word.setTag(token.getPos.getName) 94 | } 95 | 96 | word 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /name-entity-recognizer/stanford-ner/src/test/scala/com/github/jenshaase/uimascala/ner/StanfordNerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.ner 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.util.JCasUtil 11 | 12 | class StanfordNerSpec extends Specification { 13 | 14 | "The Stanford Parser" should { 15 | "add constituents" in { 16 | val parser: AnalysisEngine = new StanfordNer(). 17 | config( 18 | _.model := SharedBinding[StanfordNerResource]("edu/stanford/nlp/models/ner/german.dewac_175m_600.crf.ser.gz") 19 | ). 
20 | asAnalysisEngine 21 | 22 | val jcas = parser.newJCas() 23 | jcas.setDocumentText("Angela Merkel fliegt nach Berlin.") 24 | jcas.annotate[Sentence](0, 33) 25 | val t1 = jcas.annotate[Token](0, 6) 26 | val p1 = jcas.annotate[POS](0, 6) 27 | p1.setName("NE") 28 | t1.setPos(p1) 29 | 30 | val t2 = jcas.annotate[Token](7, 13) 31 | val p2 = jcas.annotate[POS](7, 13) 32 | p2.setName("NE") 33 | t2.setPos(p2) 34 | 35 | val t3 = jcas.annotate[Token](14, 20) 36 | val p3 = jcas.annotate[POS](14, 20) 37 | p3.setName("VVFIN") 38 | t3.setPos(p3) 39 | 40 | val t4 = jcas.annotate[Token](21, 25) 41 | val p4 = jcas.annotate[POS](21, 25) 42 | p4.setName("APPR") 43 | t4.setPos(p4) 44 | 45 | val t5 = jcas.annotate[Token](26, 32) 46 | val p5 = jcas.annotate[POS](26, 32) 47 | p5.setName("NE") 48 | t5.setPos(p5) 49 | 50 | val t6 = jcas.annotate[Token](32, 33) 51 | val p6 = jcas.annotate[POS](32, 33) 52 | p6.setName("$.") 53 | t6.setPos(p6) 54 | 55 | parser.process(jcas) 56 | 57 | val namedEntities = jcas.select[NamedEntity].toVector 58 | namedEntities.size must be equalTo(2) 59 | namedEntities(0).getCoveredText must be equalTo("Angela Merkel") 60 | namedEntities(0).getValue must be equalTo("I-PER") 61 | namedEntities(1).getCoveredText must be equalTo("Berlin") 62 | namedEntities(1).getValue must be equalTo("I-LOC") 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /parser/mate-parser/src/main/scala/com/github/jenshaase/uimascala/parser/MateParser.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import scala.collection.JavaConversions._ 8 | import org.apache.uima.resource.DataResource 9 | import 
org.apache.uima.resource.SharedResourceObject 10 | import is2.data.SentenceData09 11 | import is2.io.CONLLReader09 12 | import is2.io.IOGenerals 13 | import is2.parser.Options 14 | import is2.parser.Parser 15 | 16 | class MateParserResource extends SharedResourceObject { 17 | private var parser: Parser = _ 18 | 19 | def load(data: DataResource) { 20 | val uri = data.getUri.toString 21 | 22 | if (new java.io.File(uri).exists) { 23 | parser = new Parser(new Options(Array("-model", uri))) 24 | } else { 25 | val resourceUri = if (uri.startsWith("/")) uri else "/" + uri 26 | val resource = this.getClass.getResource(resourceUri) 27 | 28 | val file = java.io.File.createTempFile("mate-parser", ".temp") 29 | file.deleteOnExit(); 30 | 31 | val source = resource.openStream(); 32 | try { 33 | java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING); 34 | } finally { 35 | source.close(); 36 | } 37 | 38 | parser = new Parser(new Options(Array("-model", file.getAbsolutePath))) 39 | } 40 | } 41 | 42 | def getParser = parser 43 | } 44 | 45 | class MateParser extends SCasAnnotator_ImplBase { 46 | 47 | object model extends SharedResource[MateParserResource]("") 48 | 49 | def process(jcas: JCas) = { 50 | jcas.select[Sentence].foreach { sentence => 51 | val tokens = jcas.selectCovered[Token](sentence).toVector 52 | 53 | val sentenceData = new SentenceData09() 54 | sentenceData.init(Array[String](IOGenerals.ROOT) ++ tokens.map(_.getCoveredText)) 55 | sentenceData.setLemmas(Array[String](IOGenerals.ROOT_LEMMA) ++ tokens.map { t => 56 | if (t.getLemma != null) { 57 | t.getLemma.getValue() 58 | } else { 59 | "_" 60 | } 61 | }) 62 | sentenceData.setPPos(Array[String](IOGenerals.ROOT_POS) ++ tokens.map { t => 63 | t.getPos.getName() 64 | }) 65 | 66 | val parsed = model.resource.getParser.apply(sentenceData) 67 | 68 | parsed.labels.zipWithIndex.foreach { case (label, i) => 69 | if (parsed.pheads(i) != 0) { 70 | val sourceToken = tokens(parsed.pheads(i) 
- 1) 71 | val targetToken = tokens(i) 72 | val depType = parsed.plabels(i) 73 | 74 | val dep = new Dependency(jcas) 75 | dep.setGovernor(sourceToken) 76 | dep.setDependent(targetToken) 77 | dep.setDependencyType(depType) 78 | dep.setBegin(dep.getDependent().getBegin()) 79 | dep.setEnd(dep.getDependent().getEnd()) 80 | dep.addToIndexes() 81 | } else { 82 | val rootToken = tokens(i) 83 | 84 | val dep = new DependencyRoot(jcas) 85 | dep.setGovernor(rootToken) 86 | dep.setDependent(rootToken) 87 | dep.setDependencyType(parsed.plabels(i)) 88 | dep.setBegin(dep.getDependent().getBegin()) 89 | dep.setEnd(dep.getDependent().getEnd()) 90 | dep.addToIndexes() 91 | } 92 | } 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /parser/mate-parser/src/test/scala/com/github/jenshaase/uimascala/parser/MateParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.typesystem._ 5 | import com.github.jenshaase.uimascala.core.configuration._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | import org.apache.uima.fit.factory.AnalysisEngineFactory 9 | import org.apache.uima.fit.util.JCasUtil 10 | 11 | class MateParserSpec extends Specification { 12 | 13 | "The Mate Parser" should { 14 | "add dependencies" in { 15 | val parser: AnalysisEngine = new MateParser(). 16 | config( 17 | _.model := SharedBinding[MateParserResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/parser-de-tiger.model") 18 | ). 
19 | asAnalysisEngine 20 | 21 | val jcas = parser.newJCas() 22 | jcas.setDocumentText("Wie alt bist du?") 23 | jcas.annotate[Sentence](0, 16) 24 | val t1 = jcas.annotate[Token](0, 3) 25 | val p1 = jcas.annotate[POS](0, 3) 26 | p1.setName("PWAV") 27 | t1.setPos(p1) 28 | 29 | val t2 = jcas.annotate[Token](4, 7) 30 | val p2 = jcas.annotate[POS](4, 7) 31 | p2.setName("ADJD") 32 | t2.setPos(p2) 33 | 34 | val t3 = jcas.annotate[Token](8, 12) 35 | val p3 = jcas.annotate[POS](8, 12) 36 | p3.setName("VAFIN") 37 | t3.setPos(p3) 38 | 39 | val t4 = jcas.annotate[Token](13, 15) 40 | val p4 = jcas.annotate[POS](13, 15) 41 | p4.setName("PPER") 42 | t4.setPos(p4) 43 | 44 | val t5 = jcas.annotate[Token](15, 16) 45 | val p5 = jcas.annotate[POS](15, 16) 46 | p5.setName("$.") 47 | t5.setPos(p5) 48 | 49 | parser.process(jcas) 50 | 51 | val dependencies = jcas.select[Dependency].toVector 52 | dependencies(0).getCoveredText must be equalTo ("Wie") 53 | dependencies(0).getGovernor.getCoveredText must be equalTo ("alt") 54 | dependencies(0).getDependent.getCoveredText must be equalTo ("Wie") 55 | dependencies(0).getDependencyType must be equalTo ("MO") 56 | 57 | dependencies(1).getCoveredText must be equalTo ("alt") 58 | dependencies(1).getGovernor.getCoveredText must be equalTo ("bist") 59 | dependencies(1).getDependent.getCoveredText must be equalTo ("alt") 60 | dependencies(1).getDependencyType must be equalTo ("PD") 61 | 62 | dependencies(2).getCoveredText must be equalTo ("bist") 63 | dependencies(2).getGovernor.getCoveredText must be equalTo ("bist") 64 | dependencies(2).getDependent.getCoveredText must be equalTo ("bist") 65 | dependencies(2).getDependencyType must be equalTo ("--") 66 | 67 | dependencies(3).getCoveredText must be equalTo ("du") 68 | dependencies(3).getGovernor.getCoveredText must be equalTo ("bist") 69 | dependencies(3).getDependent.getCoveredText must be equalTo ("du") 70 | dependencies(3).getDependencyType must be equalTo ("SB") 71 | 72 | 
dependencies(4).getCoveredText must be equalTo ("?") 73 | dependencies(4).getGovernor.getCoveredText must be equalTo ("du") 74 | dependencies(4).getDependent.getCoveredText must be equalTo ("?") 75 | dependencies(4).getDependencyType must be equalTo ("--") 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /parser/stanford-parser/src/main/scala/com/github/jenshaase/uimascala/parser/StanfordParser.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import scala.collection.JavaConversions._ 8 | import edu.stanford.nlp.parser.common.ParserGrammar 9 | import java.io._ 10 | import java.util.zip.GZIPInputStream 11 | import org.apache.uima.resource.DataResource 12 | import org.apache.uima.resource.SharedResourceObject 13 | import edu.stanford.nlp.ling.CoreLabel 14 | import edu.stanford.nlp.trees.Tree 15 | import org.apache.uima.jcas.tcas.Annotation 16 | import org.apache.uima.jcas.cas.FSArray 17 | import org.apache.uima.util.Level.WARNING 18 | 19 | class StanfordParserGrammerResource extends SharedResourceObject { 20 | private var parser: ParserGrammar = _ 21 | 22 | def load(data: DataResource) { 23 | parser = ParserGrammar.loadModel(data.getUri.toString) 24 | } 25 | 26 | def getParserGrammer = parser 27 | } 28 | 29 | object DependencyMode { 30 | val BASIC = "BASIC" 31 | val NON_COLLAPSED = "NON_COLLAPSED" 32 | val COLLAPSED = "COLLAPSED" 33 | val COLLAPSED_WITH_EXTRA = "COLLAPSED_WITH_EXTRA" 34 | val CC_PROPAGATED = "CC_PROPAGATED" 35 | val CC_PROPAGATED_NO_EXTRA = "CC_PROPAGATED_NO_EXTRA" 36 | val TREE = "TREE" 37 | } 38 | 39 | class StanfordParser extends SCasAnnotator_ImplBase { 40 | 41 | object model extends 
SharedResource[StanfordParserGrammerResource]("") 42 | object mode extends Parameter[String](DependencyMode.BASIC) 43 | object readPOS extends Parameter[Boolean](true) { 44 | override def mandatory_? = false 45 | } 46 | object createPOS extends Parameter[Boolean](false) { 47 | override def mandatory_? = false 48 | } 49 | 50 | def process(jcas: JCas) = { 51 | val parser = model.resource.getParserGrammer 52 | 53 | jcas.select[Sentence].foreach { sentence => 54 | val tokens = jcas.selectCovered[Token](sentence).toVector 55 | 56 | val query = parser.parserQuery() 57 | query.parse(tokens.map(tokenToCoreLabel _)) 58 | val parseTree = query.getBestParse() 59 | parseTree.setSpans() 60 | 61 | doCreateConstituentAnnotation(jcas, tokens, parseTree, None) 62 | doCreateDependencyAnnotation(jcas, parser, parseTree, tokens) 63 | } 64 | } 65 | 66 | def tokenToCoreLabel(token: Token): CoreLabel = { 67 | val word = new CoreLabel() 68 | word.setValue(token.getCoveredText) 69 | word.setOriginalText(token.getCoveredText) 70 | word.setWord(token.getCoveredText) 71 | word.setBeginPosition(token.getBegin) 72 | word.setEndPosition(token.getEnd) 73 | 74 | if (readPOS.is && token.getPos != null) { 75 | word.setTag(token.getPos.getName) 76 | } 77 | 78 | word 79 | } 80 | 81 | def doCreateConstituentAnnotation(jcas: JCas, tokens: Vector[Token], node: Tree, parent: Option[Annotation]): Annotation = { 82 | val nodeLabelValue = node.value() 83 | val source = tokens.get(node.getSpan().getSource) 84 | val target = tokens.get(node.getSpan().getTarget) 85 | 86 | if (node.isPhrasal) { 87 | val constituent = createConstituent(jcas, source.getBegin, target.getEnd, nodeLabelValue) 88 | parent.foreach { p => constituent.setParent(p) } 89 | 90 | val childAnnotations = node. 91 | getChildrenAsList(). 92 | map(doCreateConstituentAnnotation(jcas, tokens, _, Some(constituent))) 93 | 94 | val children = childAnnotations.zipWithIndex. 
95 | foldLeft(new FSArray(jcas, childAnnotations.size())) { case (fsArray, (ann, idx)) => 96 | fsArray.set(idx, ann) 97 | fsArray 98 | } 99 | 100 | constituent.setChildren(children) 101 | add(constituent) 102 | constituent 103 | } else if (node.isPreTerminal) { 104 | val pos = createPOS(jcas, source.getBegin, target.getEnd, nodeLabelValue) 105 | val coveredToken = jcas.selectCovered[Token](pos) 106 | require(coveredToken.size == 1) 107 | val token = coveredToken.get(0) 108 | 109 | if (createPOS.is) { 110 | add(pos) 111 | token.setPos(pos) 112 | } 113 | 114 | parent.foreach { p => 115 | token.setParent(p) 116 | } 117 | 118 | token 119 | } else { 120 | throw new Exception("Node must be either phrasal nor pre-terminal") 121 | } 122 | } 123 | 124 | def createConstituent(jcas: JCas, begin: Int, end: Int, constituentType: String) = { 125 | val c = new Constituent(jcas, begin, end) 126 | c.setConstituentType(constituentType) 127 | c 128 | } 129 | 130 | def createPOS(jcas: JCas, begin: Int, end: Int, name: String) = { 131 | val p = new POS(jcas, begin, end) 132 | p.setName(name) 133 | p 134 | } 135 | 136 | 137 | def doCreateDependencyAnnotation(jcas: JCas, parser: ParserGrammar, parseTree: Tree, tokens: Seq[Token]) { 138 | try { 139 | val gs = parser.getTLPParams().getGrammaticalStructure( 140 | parseTree, 141 | parser.treebankLanguagePack().punctuationWordRejectFilter(), 142 | parser.getTLPParams().typedDependencyHeadFinder() 143 | ) 144 | 145 | val dependencies = mode.is match { 146 | case DependencyMode.BASIC => gs.typedDependencies() 147 | case DependencyMode.NON_COLLAPSED => gs.allTypedDependencies() 148 | case DependencyMode.COLLAPSED => gs.typedDependenciesCollapsed(false) 149 | case DependencyMode.COLLAPSED_WITH_EXTRA => gs.typedDependenciesCollapsed(true) 150 | case DependencyMode.CC_PROPAGATED => gs.typedDependenciesCCprocessed(true) 151 | case DependencyMode.CC_PROPAGATED_NO_EXTRA => gs.typedDependenciesCCprocessed(false) 152 | case DependencyMode.TREE => 
gs.typedDependenciesCollapsedTree() 153 | case _ => throw new Exception("DependencyMode not supported: " + mode.is) 154 | } 155 | 156 | dependencies.foreach { currTypedDep => 157 | val govIndex = currTypedDep.gov().index(); 158 | val depIndex = currTypedDep.dep().index(); 159 | 160 | val dep = if (govIndex != 0) { 161 | val govToken = tokens(govIndex - 1) 162 | val depToken = tokens(depIndex - 1) 163 | 164 | val dep = new Dependency(jcas) 165 | dep.setDependencyType(currTypedDep.reln().toString()); 166 | dep.setGovernor(govToken); 167 | dep.setDependent(depToken); 168 | dep.setBegin(dep.getDependent().getBegin()); 169 | dep.setEnd(dep.getDependent().getEnd()); 170 | dep.addToIndexes(); 171 | } else { 172 | val depToken = tokens(depIndex - 1); 173 | 174 | val dep = new DependencyRoot(jcas); 175 | dep.setDependencyType(currTypedDep.reln().toString()); 176 | dep.setGovernor(depToken); 177 | dep.setDependent(depToken); 178 | dep.setBegin(dep.getDependent().getBegin()); 179 | dep.setEnd(dep.getDependent().getEnd()); 180 | dep.addToIndexes(); 181 | 182 | dep 183 | } 184 | } 185 | } catch { 186 | case e: UnsupportedOperationException => 187 | getContext().getLogger().log(WARNING, "Current model does not seem to support dependencies."); 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /parser/stanford-parser/src/test/scala/com/github/jenshaase/uimascala/parser/StanfordParserSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.parser 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import 
org.apache.uima.fit.util.JCasUtil 11 | 12 | class StanfordParserSpec extends Specification { 13 | 14 | "The Stanford Parser" should { 15 | "add constituents" in { 16 | val parser: AnalysisEngine = new StanfordParser(). 17 | config( 18 | _.model := SharedBinding[StanfordParserGrammerResource]("edu/stanford/nlp/models/srparser/germanSR.ser.gz") 19 | ). 20 | asAnalysisEngine 21 | 22 | val jcas = parser.newJCas() 23 | jcas.setDocumentText("Wie alt bist du?") 24 | jcas.annotate[Sentence](0, 16) 25 | val t1 = jcas.annotate[Token](0, 3) 26 | val p1 = jcas.annotate[POS](0, 3) 27 | p1.setName("PWAV") 28 | t1.setPos(p1) 29 | 30 | val t2 = jcas.annotate[Token](4, 7) 31 | val p2 = jcas.annotate[POS](4, 7) 32 | p2.setName("ADJD") 33 | t2.setPos(p2) 34 | 35 | val t3 = jcas.annotate[Token](8, 12) 36 | val p3 = jcas.annotate[POS](8, 12) 37 | p3.setName("VAFIN") 38 | t3.setPos(p3) 39 | 40 | val t4 = jcas.annotate[Token](13, 15) 41 | val p4 = jcas.annotate[POS](13, 15) 42 | p4.setName("PPER") 43 | t4.setPos(p4) 44 | 45 | val t5 = jcas.annotate[Token](15, 16) 46 | val p5 = jcas.annotate[POS](15, 16) 47 | p5.setName("$.") 48 | t5.setPos(p5) 49 | 50 | parser.process(jcas) 51 | 52 | val constituents = jcas.select[Constituent].toVector 53 | constituents(0).getBegin must be equalTo (0) 54 | constituents(0).getEnd must be equalTo (16) 55 | constituents(0).getConstituentType must be equalTo ("S") 56 | constituents(0).getChildren.size must be equalTo (4) 57 | constituents(0).getParent must be equalTo(constituents(1)) 58 | 59 | constituents(1).getBegin must be equalTo (0) 60 | constituents(1).getEnd must be equalTo (16) 61 | constituents(1).getConstituentType must be equalTo ("ROOT") 62 | constituents(1).getChildren.size must be equalTo (1) 63 | constituents(1).getParent must beNull 64 | 65 | constituents(2).getBegin must be equalTo (0) 66 | constituents(2).getEnd must be equalTo (7) 67 | constituents(2).getConstituentType must be equalTo ("AP") 68 | constituents(2).getChildren.size must be 
equalTo (2) 69 | constituents(2).getParent must be equalTo(constituents(0)) 70 | 71 | val tokens = jcas.select[Token].toVector 72 | tokens(0).getParent must be equalTo(constituents(2)) 73 | tokens(1).getParent must be equalTo(constituents(2)) 74 | tokens(2).getParent must be equalTo(constituents(0)) 75 | tokens(3).getParent must be equalTo(constituents(0)) 76 | tokens(4).getParent must be equalTo(constituents(0)) 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /part-of-speech-tagger/ark-tweet-pos-tagger/src/main/scala/com/github/jenshaase/uimascala/pos/ArkTweetPosTagger.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.pos 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem.{Token, POS} 6 | import org.apache.uima.jcas.JCas 7 | import cmu.arktweetnlp.Twokenize 8 | import scala.collection.JavaConversions._ 9 | import cmu.arktweetnlp.impl.Model 10 | import cmu.arktweetnlp.impl.features.FeatureExtractor 11 | import org.apache.uima.UimaContext; 12 | import cmu.arktweetnlp.impl.ModelSentence 13 | import cmu.arktweetnlp.impl.Sentence 14 | 15 | class ArkTweetPosTagger extends SCasAnnotator_ImplBase { 16 | 17 | object modelLocation extends Parameter[String]("") 18 | 19 | private var model: Model = _ 20 | private var featureExtractor: FeatureExtractor = _ 21 | 22 | override def initialize(context: UimaContext) { 23 | super.initialize(context) 24 | 25 | model = Model.loadModelFromText(modelLocation.is) 26 | featureExtractor = new FeatureExtractor(model, false); 27 | } 28 | 29 | def process(jcas: JCas) = { 30 | val tokens = jcas.select[Token].toVector 31 | 32 | val sentence = new Sentence() 33 | sentence.tokens = tokens.map(_.getCoveredText) 34 | val ms = new ModelSentence(sentence.T()) 35 | 
featureExtractor.computeFeatures(sentence, ms) 36 | model.greedyDecode(ms, false) 37 | 38 | tokens.zipWithIndex.foreach { case (token, idx) => 39 | val tag = model.labelVocab.name( ms.labels(idx) ); 40 | 41 | val pos = new POS(jcas, token.getBegin, token.getEnd) 42 | pos.setName(tag) 43 | add(pos) 44 | 45 | token.setPos(pos) 46 | } 47 | } 48 | 49 | def createToken(cas: JCas, begin: Int, end: Int) = 50 | new Token(cas, begin, end) 51 | } 52 | -------------------------------------------------------------------------------- /part-of-speech-tagger/ark-tweet-pos-tagger/src/test/scala/com/github/jenshaase/uimascala/pos/ArkTweetPosTaggerSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.pos 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | import org.apache.uima.fit.factory.AnalysisEngineFactory 9 | import org.apache.uima.fit.util.JCasUtil 10 | 11 | class ArkTweetPosTaggerSpec extends Specification { 12 | 13 | "Ark Tweet Pos Tagger" should { 14 | "add POS tags" in { 15 | val modelPath = new java.io.File(getClass.getResource("/model.20120919").toURI).getAbsolutePath 16 | val tagger: AnalysisEngine = new ArkTweetPosTagger(). 17 | config( 18 | _.modelLocation := modelPath 19 | ). 
20 | asAnalysisEngine 21 | 22 | val jcas = tagger.newJCas() 23 | jcas.setDocumentText("RT @DjBlack_Pearl: wat muhfuckaz wearin 4 the lingerie party?????") 24 | jcas.annotate[Token](0, 2) 25 | jcas.annotate[Token](3, 17) 26 | jcas.annotate[Token](17, 18) 27 | jcas.annotate[Token](19, 22) 28 | jcas.annotate[Token](23, 32) 29 | jcas.annotate[Token](33, 39) 30 | jcas.annotate[Token](40, 41) 31 | jcas.annotate[Token](42, 45) 32 | jcas.annotate[Token](46, 54) 33 | jcas.annotate[Token](55, 60) 34 | jcas.annotate[Token](60, 65) 35 | tagger.process(jcas) 36 | 37 | jcas.select[POS].size must be equalTo(11) 38 | jcas.selectByIndex[POS](0).getCoveredText must be equalTo ("RT") 39 | jcas.selectByIndex[POS](0).getName must be equalTo ("~") 40 | jcas.selectByIndex[POS](1).getCoveredText must be equalTo ("@DjBlack_Pearl") 41 | jcas.selectByIndex[POS](1).getName must be equalTo ("@") 42 | jcas.selectByIndex[POS](2).getCoveredText must be equalTo (":") 43 | jcas.selectByIndex[POS](2).getName must be equalTo ("~") 44 | jcas.selectByIndex[POS](3).getCoveredText must be equalTo ("wat") 45 | jcas.selectByIndex[POS](3).getName must be equalTo ("O") 46 | jcas.selectByIndex[POS](4).getCoveredText must be equalTo ("muhfuckaz") 47 | jcas.selectByIndex[POS](4).getName must be equalTo ("N") 48 | jcas.selectByIndex[POS](5).getCoveredText must be equalTo ("wearin") 49 | jcas.selectByIndex[POS](5).getName must be equalTo ("V") 50 | jcas.selectByIndex[POS](6).getCoveredText must be equalTo ("4") 51 | jcas.selectByIndex[POS](6).getName must be equalTo ("P") 52 | jcas.selectByIndex[POS](7).getCoveredText must be equalTo ("the") 53 | jcas.selectByIndex[POS](7).getName must be equalTo ("D") 54 | jcas.selectByIndex[POS](8).getCoveredText must be equalTo ("lingerie") 55 | jcas.selectByIndex[POS](8).getName must be equalTo ("N") 56 | jcas.selectByIndex[POS](9).getCoveredText must be equalTo ("party") 57 | jcas.selectByIndex[POS](9).getName must be equalTo ("N") 58 | 
class MatePosTaggerResource extends SharedResourceObject {
  // Set exactly once in load(); read back via getTagger.
  private var tagger: Tagger = _

  /**
   * Loads the Mate POS tagger model named by the resource URI.
   *
   * If the URI denotes an existing file on disk, the model is loaded from
   * there directly. Otherwise the URI is treated as a classpath resource:
   * it is copied into a temporary file first, because the Mate `Tagger`
   * only accepts a file-system path via its "-model" option.
   *
   * NOTE(review): `data.getUri.toString` may carry a scheme prefix (e.g.
   * "file:"), in which case the `File(uri).exists` check is false and the
   * classpath branch is taken — confirm against actual DataResource values.
   */
  def load(data: DataResource) {
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      tagger = new Tagger(new Options(Array("-model", uri)))
    } else {
      // Classpath lookup requires a leading slash.
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)

      val file = java.io.File.createTempFile("mate-pos-tagger", ".temp")
      file.deleteOnExit();

      val source = resource.openStream();
      try {
        java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
      } finally {
        source.close();
      }

      tagger = new Tagger(new Options(Array("-model", file.getAbsolutePath)))
    }
  }

  // Returns the tagger created by load(); null before load() has run.
  def getTagger = tagger
}
class MatePosTagger extends SCasAnnotator_ImplBase {

  // Shared model resource; the concrete binding (model path) is supplied
  // through config at pipeline construction time.
  object model extends SharedResource[MatePosTaggerResource]("")

  /**
   * Adds a POS annotation to every token of every sentence.
   *
   * For each sentence, the covered tokens are packed into a Mate
   * `SentenceData09` prefixed with the artificial ROOT entry that the Mate
   * tools expect as element 0. The predicted tags (`ppos`) are read back
   * with the ROOT entry dropped, so index i of the remainder lines up with
   * `tokens(i)`.
   */
  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val tokens = jcas.selectCovered[Token](sentence).toVector

      val sentenceData = new SentenceData09()
      sentenceData.init(Array[String](IOGenerals.ROOT) ++ tokens.map(_.getCoveredText))
      // Lemmas are optional; "_" is the CoNLL placeholder for a missing lemma.
      sentenceData.setLemmas(Array[String](IOGenerals.ROOT_LEMMA) ++ tokens.map { t =>
        if (t.getLemma != null) {
          t.getLemma.getValue()
        } else {
          "_"
        }
      })

      // drop(1) removes the ROOT tag so predictions align with `tokens`.
      model.resource.getTagger.apply(sentenceData).ppos.drop(1).zipWithIndex.foreach { case (tag, idx) =>
        val token = tokens(idx)

        val pos = new POS(jcas, token.getBegin, token.getEnd)
        pos.setName(tag)
        add(pos)

        // Back-link the token to its POS annotation.
        token.setPos(pos)
      }
    }
  }
}
class MatePosTaggerSpec extends Specification {

  "MatePosTagger" should {
    "get the correct pos values" in {
      val tagger: AnalysisEngine = new MatePosTagger().
        config(
          _.model := SharedBinding[MatePosTaggerResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/tagger-de-tiger.model")
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Wie alt bist du?")
      jcas.annotate[Sentence](0, 16)
      // Token boundaries as (begin, end) offsets.
      Seq((0, 3), (4, 7), (8, 12), (13, 15), (15, 16)).foreach { case (b, e) =>
        jcas.annotate[Token](b, e)
      }

      tagger.process(jcas)

      // Expected (POS name, covered text) per annotation, in index order.
      val expected = Seq(
        ("PWAV", "Wie"), ("ADJD", "alt"), ("VAFIN", "bist"),
        ("PPER", "du"), ("$.", "?")
      )

      jcas.select[POS].size must be equalTo(expected.size)
      expected.zipWithIndex.foreach { case ((name, text), idx) =>
        jcas.selectByIndex[POS](idx).getName must be equalTo (name)
        jcas.selectByIndex[POS](idx).getCoveredText must be equalTo (text)
      }
    }
  }
}
class StanfordPosTagger extends SCasAnnotator_ImplBase {

  // Shared tagger resource; defaults to the model path bundled inside the
  // Stanford tagger jar.
  object model extends SharedResource[MaxentTaggerResource](MaxentTagger.DEFAULT_JAR_PATH)
  // Optional cap on sentence length. Sentences with more tokens than the
  // cap are skipped entirely (no POS annotations are created for them).
  object maxTokensPerSentence extends Parameter[Option[Int]](None) {
    override def mandatory_? = false
  }

  /**
   * Adds a POS annotation for every token of every sentence.
   *
   * With maxTokensPerSentence = None all sentences are processed; with
   * Some(n), only sentences of at most n tokens are processed.
   * NOTE(review): Some(n) with n <= 0 skips every sentence — confirm that
   * this is the intended meaning of a non-positive limit.
   */
  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val tokens = jcas.selectCovered[Token](sentence)

      maxTokensPerSentence.is match {
        case None =>
          processTokens(jcas, tokens)
        case Some(n) if (n > 0 && tokens.size <= n) =>
          processTokens(jcas, tokens)
        case _ =>
        // Sentence exceeds the limit (or limit is non-positive): skip it.
      }
    }
  }

  /**
   * Runs the Maxent tagger over one sentence worth of tokens and attaches
   * a POS annotation to each token.
   */
  def processTokens(jcas: JCas, tokens: Seq[Token]) {
    // TaggedWord is constructed without a tag; the tagger fills tags in.
    val words = tokens.map { token => new TaggedWord(token.getCoveredText) }
    val taggedWords = model.resource.getTagger.tagSentence(words)

    tokens.zipWithIndex.foreach { case (token, idx) =>
      val tag = taggedWords.get(idx).tag()

      val pos = new POS(jcas, token.getBegin, token.getEnd)
      pos.setName(tag)
      add(pos)

      // Back-link the token to its POS annotation.
      token.setPos(pos)
    }
  }
}
class StanfordPosTaggerSpec extends Specification {

  "StanfordPosTagger" should {
    "tag each word in a sentence" in {
      val tagger: AnalysisEngine = new StanfordPosTagger().
        config(
          _.model := SharedBinding[MaxentTaggerResource]("edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger")
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Hallo Welt! Was geht?")
      jcas.annotate[Sentence](0, 10)
      jcas.annotate[Sentence](12, 20)
      jcas.annotate[Token](0, 5)
      jcas.annotate[Token](6, 10)
      jcas.annotate[Token](12, 15)
      jcas.annotate[Token](16, 20)
      tagger.process(jcas)

      // Every token of both sentences receives a POS annotation.
      jcas.select[POS].size must be equalTo(4)
      jcas.selectByIndex[POS](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[POS](1).getCoveredText must be equalTo ("Welt")
      jcas.selectByIndex[POS](2).getCoveredText must be equalTo ("Was")
      jcas.selectByIndex[POS](3).getCoveredText must be equalTo ("geht")
    }

    "tag each word in a sentence if the sentences is short enough" in {
      val tagger: AnalysisEngine = new StanfordPosTagger().
        config(
          _.model := SharedBinding[MaxentTaggerResource]("edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger"),
          _.maxTokensPerSentence := Some(2)
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Hallo Welt! Was geht heute?")
      jcas.annotate[Sentence](0, 10)
      jcas.annotate[Sentence](12, 26)
      jcas.annotate[Token](0, 5)
      jcas.annotate[Token](6, 10)
      jcas.annotate[Token](12, 15)
      jcas.annotate[Token](16, 20)
      jcas.annotate[Token](21, 26)
      tagger.process(jcas)

      // The second sentence has 3 tokens > limit of 2, so only the first
      // sentence's tokens are tagged.
      jcas.select[POS].size must be equalTo(2)
      jcas.selectByIndex[POS](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[POS](1).getCoveredText must be equalTo ("Welt")
    }
  }
}
object UimaSbtPlugin extends Plugin {

  // Dedicated ivy/sbt configuration under which UIMA settings are scoped.
  val uimaConfig = config("uima")

  // Manually-invokable tasks: JCas source generation and the UIMA CAS
  // Visual Debugger GUI.
  val jcasGen = TaskKey[Unit]("jcasgen")
  val visualDebugger = TaskKey[Unit]("visualDebugger")

  /**
   * Settings to add to a project: type-system XML descriptors live in
   * resources/desc/types, generated Java goes into managed sources and is
   * wired into compilation and `clean`.
   */
  def uimaScalaSettings = Seq(
    sourceDirectory in uimaConfig <<= (resourceDirectory in Compile) { _ / "desc" / "types" },
    javaSource in uimaConfig <<= (sourceManaged in Compile) { _ / "java" },
    sourceGenerators in Compile <+= generateTypeSystemSourcesTask,
    managedSourceDirectories in Compile <+= (javaSource in uimaConfig),
    cleanFiles <+= (javaSource in uimaConfig),
    jcasGen <<= jcasGenTask,
    visualDebugger <<= visualDebuggerTask
  )

  // Task wrapper around generateTypeSystemSources, bound to the uima-scoped
  // source and target directories.
  def generateTypeSystemSourcesTask =
    (sourceDirectory in uimaConfig, javaSource in uimaConfig) map { (srcDir, targetDir) =>
      generateTypeSystemSources(srcDir, targetDir)
    }

  /**
   * Runs UIMA's JCasGen over every type-system XML descriptor in srcDir,
   * emitting Java sources into targetDir using the uimaScala templates.
   * Returns all .java files found under targetDir afterwards.
   */
  def generateTypeSystemSources(srcDir: File, targetDir: File): Seq[File] = {
    (srcDir ** "*.xml").get foreach { filename =>
      val xmlIS = new XMLInputSource(filename)
      val tsd = UIMAFramework.getXMLParser.parseTypeSystemDescription(xmlIS)
      // A CAS is needed so JCasGen can resolve the merged type system.
      val cas = CasCreationUtils.createCas(tsd, null, null)
      val jg = new Jg()
      jg.mainGenerateAllTypesFromTemplates(
        null, new UimaLoggerProgressMonitor(), new LogThrowErrorImpl(),
        filename.getAbsolutePath, targetDir.getAbsolutePath, tsd.getTypes,
        cas.asInstanceOf[CASImpl], classOf[UimaScalaTypeTemplate],
        classOf[UimaScala_TypeTemplate], "", false, null
      )
    }

    (targetDir ** "*.java").get
  }

  // Launches the interactive JCasGen tool; executeTrapExit keeps a
  // System.exit inside the tool from killing sbt itself.
  def jcasGenTask =
    (streams) map { streams =>
      Run.executeTrapExit(
        (new Jg()).main0(Array[String](), null, null, new LogThrowErrorImpl()),
        streams.log
      )
      ()
    }

  // Launches the UIMA CAS Visual Debugger GUI, likewise exit-trapped.
  def visualDebuggerTask =
    (streams) map { streams =>
      Run.executeTrapExit (
        org.apache.uima.tools.cvd.CVD.main(Array[String]()),
        streams.log
      )
      ()
    }

}
stringBuffer.append(""" 25 | import org.apache.uima.jcas.JCas; 26 | import org.apache.uima.jcas.JCasRegistry; 27 | import org.apache.uima.jcas.cas.TOP_Type; 28 | 29 | """); 30 | 31 | jg.collectImports(td, false).foreach { imp => 32 | stringBuffer.append(s"""import $imp;"""); 33 | stringBuffer.append("\n"); 34 | } 35 | 36 | stringBuffer.append("\n\n"); 37 | 38 | val typeName = jg.getJavaName(td); 39 | val typeName_Type = typeName + "_Type"; 40 | val jcasTypeCasted = "((" + typeName_Type + ")jcasType)"; 41 | 42 | stringBuffer.append(s"""/** ${jg.nullBlank(td.getDescription())} */ 43 | public class ${typeName} extends ${jg.getJavaName(td.getSupertypeName())} { 44 | @SuppressWarnings ("hiding") 45 | public final static int typeIndexID = JCasRegistry.register(${typeName}.class); 46 | @SuppressWarnings ("hiding") 47 | public final static int type = typeIndexID; 48 | @Override 49 | public int getTypeIndexID() {return typeIndexID;} 50 | 51 | /** Never called. Disable default constructor */ 52 | protected ${typeName}() {/* intentionally empty block */} 53 | 54 | /** Internal - constructor used by generator 55 | * 56 | * @param addr low level Feature Structure reference 57 | * @param type the type of this Feature Structure 58 | */ 59 | public ${typeName}(int addr, TOP_Type type) { 60 | super(addr, type); 61 | readObject(); 62 | } 63 | 64 | /** 65 | * @param jcas JCas to which this Feature Structure belongs 66 | */ 67 | public ${typeName}(JCas jcas) { 68 | super(jcas); 69 | readObject(); 70 | } 71 | """); 72 | 73 | if (jg.isSubTypeOfAnnotation(td)) { 74 | stringBuffer.append(s""" 75 | /** 76 | * @param jcas JCas to which this Feature Structure belongs 77 | * @param begin offset to the begin spot in the SofA 78 | * @param end offset to the end spot in the SofA 79 | */ 80 | public ${typeName}(JCas jcas, int begin, int end) { 81 | super(jcas); 82 | setBegin(begin); 83 | setEnd(end); 84 | readObject(); 85 | } 86 | """); 87 | } 88 | 89 | stringBuffer.append(s""" 90 | /** 91 | * 92 
| * Write your own initialization here 93 | * 94 | * 95 | */ 96 | private void readObject() {/*default - does nothing empty block */} 97 | 98 | """); 99 | 100 | td.getFeatures().foreach { fd => 101 | val featName = fd.getName(); 102 | val featUName = jg.uc1(featName); // upper case first letter 103 | if (Jg.reservedFeatureNames.contains(featUName)) 104 | jg.error.newError(IError.ERROR, 105 | jg.getString("reservedNameUsed", Array.apply[Object](featName, td.getName)), 106 | null); 107 | 108 | val featDesc = jg.nullBlank(fd.getDescription()); 109 | val featDescCmt = featDesc; 110 | 111 | val rangeType = jg.getJavaRangeType(fd); 112 | val elemType = jg.getJavaRangeArrayElementType(fd); 113 | 114 | stringBuffer.append(s""" 115 | 116 | //*--------------* 117 | //* Feature: ${featName} 118 | 119 | /** getter for ${featName} - gets ${featDescCmt} 120 | * @return value of the feature 121 | */ 122 | public ${rangeType} get${featUName}() { 123 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 124 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName}"); 125 | return ${jg.getFeatureValue(fd, td)};} 126 | 127 | /** setter for ${featName} - sets ${featDescCmt} 128 | * @param v value to set into the feature 129 | */ 130 | public void set${featUName}(${rangeType} v) { 131 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 132 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}"); 133 | ${jg.setFeatureValue(fd, td)};} 134 | """); 135 | 136 | if (jg.hasArrayRange(fd)) { 137 | stringBuffer.append(s""" 138 | /** indexed getter for ${featName} - gets an indexed value - ${featDescCmt} 139 | * @param i index in the array to get 140 | * @return value of the element at index i 141 | */ 142 | public ${elemType} get${featUName}(int i) { 143 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 144 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}"); 145 | 
jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ${jcasTypeCasted}.casFeatCode_${featName}), i); 146 | return ${jg.getArrayFeatureValue(fd, td)};} 147 | 148 | /** indexed setter for ${featName} - sets an indexed value - ${featDescCmt} 149 | * @param i index in the array to set 150 | * @param v value to set into the array 151 | */ 152 | public void set${featUName}(int i, ${elemType} v) { 153 | if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null) 154 | jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}"); 155 | jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ${jcasTypeCasted}.casFeatCode_${featName}), i); 156 | ${jg.setArrayFeatureValue(fd, td)};} 157 | """); 158 | } /* of hasArray */ 159 | 160 | stringBuffer.append(""); 161 | 162 | } /* of Features iteration */ 163 | 164 | stringBuffer.append(""); 165 | 166 | if (td.getName().equals("uima.cas.Annotation")) { 167 | stringBuffer.append(" "); 168 | stringBuffer.append(""" /** Constructor with begin and end passed as arguments 169 | * @param jcas JCas this Annotation is in 170 | * @param begin the begin offset 171 | * @param end the end offset 172 | */ 173 | public Annotation(JCas jcas, int begin, int end) { 174 | this(jcas); // forward to constructor 175 | this.setBegin(begin); 176 | this.setEnd(end); 177 | } 178 | 179 | /** @see org.apache.uima.cas.text.AnnotationFS#getCoveredText() 180 | * @return the covered Text 181 | */ 182 | public String getCoveredText() { 183 | final CAS casView = this.getView(); 184 | final String text = casView.getDocumentText(); 185 | if (text == null) { 186 | return null; 187 | } 188 | return text.substring(getBegin(), getEnd()); 189 | } 190 | 191 | /** @deprecated 192 | * @return the begin offset 193 | */ 194 | public int getStart() {return getBegin();} 195 | """); 196 | stringBuffer.append(""); 197 | } /* of Annotation if-statement */ 198 | stringBuffer.append("}\n\n "); 199 | return stringBuffer.toString(); 
200 | } 201 | } 202 | -------------------------------------------------------------------------------- /sbt-plugin/src/main/scala/com/github/jenshaase/uimascala/sbt/UimaScala_TypeTemplate.scala: -------------------------------------------------------------------------------- 1 | package org.apache.uima.tools.jcasgen 2 | 3 | import org.apache.uima.resource.metadata.TypeDescription 4 | import scala.collection.JavaConversions._ 5 | 6 | class UimaScala_TypeTemplate extends Jg.IJCasTypeTemplate { 7 | 8 | def generate(argument: Any): String = { 9 | val args: Array[Any] = argument.asInstanceOf[Array[Any]] 10 | val jg = args(0).asInstanceOf[Jg] 11 | val td = args(1).asInstanceOf[TypeDescription] 12 | val stringBuffer = new StringBuffer() 13 | 14 | jg.packageName = jg.getJavaPkg(td); 15 | if (0 != jg.packageName.length()) { 16 | stringBuffer.append("package "); 17 | stringBuffer.append(jg.packageName); 18 | stringBuffer.append(";\n"); 19 | } 20 | stringBuffer.append(""" 21 | import org.apache.uima.jcas.JCas; 22 | import org.apache.uima.jcas.JCasRegistry; 23 | import org.apache.uima.cas.impl.CASImpl; 24 | import org.apache.uima.cas.impl.FSGenerator; 25 | import org.apache.uima.cas.FeatureStructure; 26 | import org.apache.uima.cas.impl.TypeImpl; 27 | import org.apache.uima.cas.Type; 28 | """); 29 | 30 | if (td.getFeatures().length > 0) { 31 | stringBuffer.append("""import org.apache.uima.cas.impl.FeatureImpl; 32 | import org.apache.uima.cas.Feature; 33 | """); 34 | } 35 | 36 | stringBuffer.append(""); 37 | 38 | jg.collectImports(td, true).foreach { imp => 39 | if (!imp.equals(jg.getJavaNameWithPkg(td.getName()+"_Type"))) { 40 | stringBuffer.append(s"""import ${imp};""") 41 | stringBuffer.append("\n") 42 | } 43 | } 44 | 45 | stringBuffer.append("\n"); 46 | val typeName = jg.getJavaName(td); 47 | val typeName_Type = typeName + "_Type"; 48 | stringBuffer.append(s"""/** ${jg.nullBlank(td.getDescription())} */ 49 | public class ${typeName_Type} extends 
${jg.getJavaName(td.getSupertypeName())}_Type { 50 | /** 51 | * @return the generator for this type 52 | */ 53 | @Override 54 | protected FSGenerator getFSGenerator() {return fsGenerator;} 55 | 56 | private final FSGenerator fsGenerator = 57 | new FSGenerator() { 58 | public FeatureStructure createFS(int addr, CASImpl cas) { 59 | if (${typeName_Type}.this.useExistingInstance) { 60 | // Return eq fs instance if already created 61 | FeatureStructure fs = ${typeName_Type}.this.jcas.getJfsFromCaddr(addr); 62 | if (null == fs) { 63 | fs = new ${typeName}(addr, ${typeName_Type}.this); 64 | ${typeName_Type}.this.jcas.putJfsFromCaddr(addr, fs); 65 | return fs; 66 | } 67 | return fs; 68 | } else return new ${typeName}(addr, ${typeName_Type}.this); 69 | } 70 | }; 71 | 72 | @SuppressWarnings ("hiding") 73 | public final static int typeIndexID = ${typeName}.typeIndexID; 74 | 75 | @SuppressWarnings ("hiding") 76 | public final static boolean featOkTst = JCasRegistry.getFeatOkTst("${td.getName()}"); 77 | """); 78 | 79 | 80 | td.getFeatures().foreach { fd => 81 | val featName = fd.getName(); 82 | val featUName = jg.uc1(featName); // upper case first letter 83 | 84 | val rangeType = jg.getJavaRangeType(fd); 85 | val getSetNamePart = jg.sc(rangeType); 86 | val returnType = if (getSetNamePart.equals("Ref")) "int" else rangeType; 87 | val getSetArrayNamePart = jg.getGetSetArrayNamePart(fd); 88 | 89 | val elemType = 90 | if (jg.sc(jg.getJavaRangeArrayElementType(fd)).equals("Ref")) { 91 | "int"; 92 | } else { 93 | jg.getJavaRangeArrayElementType(fd); 94 | } 95 | val casFeatCode = "casFeatCode_" + featName; 96 | 97 | stringBuffer.append(s""" 98 | final Feature casFeat_${featName}; 99 | final int ${casFeatCode}; 100 | /** 101 | * @param addr low level Feature Structure reference 102 | * @return the feature value 103 | */ 104 | public ${returnType} get${featUName}(int addr) { 105 | if (featOkTst && casFeat_${featName} == null) 106 | jcas.throwFeatMissing("${featName}", 
"${td.getName()}"); 107 | return ll_cas.ll_get${getSetNamePart}Value(addr, ${casFeatCode}); 108 | } 109 | /** 110 | * @param addr low level Feature Structure reference 111 | * @param v value to set 112 | */ 113 | public void set${featUName}(int addr, ${returnType} v) { 114 | if (featOkTst && casFeat_${featName} == null) 115 | jcas.throwFeatMissing("${featName}", "${td.getName()}"); 116 | ll_cas.ll_set${getSetNamePart}Value(addr, ${casFeatCode}, v);} 117 | 118 | """); 119 | 120 | if (jg.hasArrayRange(fd)) { 121 | stringBuffer.append(s""" 122 | /** 123 | * @param addr low level Feature Structure reference 124 | * @param i index of item in the array 125 | * @return value at index i in the array 126 | */ 127 | public ${elemType} get${featUName}(int addr, int i) { 128 | if (featOkTst && casFeat_${featName} == null) 129 | jcas.throwFeatMissing("${featName}", "${td.getName()}"); 130 | if (lowLevelTypeChecks) 131 | return ll_cas.ll_get${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, true); 132 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i); 133 | return ll_cas.ll_get${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i); 134 | } 135 | 136 | /** 137 | * @param addr low level Feature Structure reference 138 | * @param i index of item in the array 139 | * @param v value to set 140 | */ 141 | public void set${featUName}(int addr, int i, ${elemType} v) { 142 | if (featOkTst && casFeat_${featName} == null) 143 | jcas.throwFeatMissing("${featName}", "${td.getName}"); 144 | if (lowLevelTypeChecks) 145 | ll_cas.ll_set${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, v, true); 146 | jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i); 147 | ll_cas.ll_set${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, v); 148 | } 149 | """); 150 | } 151 | stringBuffer.append(" \n"); 152 | } 153 | 154 | stringBuffer.append("\n"); 155 | 156 
| if (td.getName().equals("uima.cas.Annotation")) { 157 | stringBuffer.append(" "); 158 | stringBuffer.append(s""" /** @see org.apache.uima.cas.text.AnnotationFS#getCoveredText() 159 | * @param inst the low level Feature Structure reference 160 | * @return the covered text 161 | */ 162 | public String getCoveredText(int inst) { 163 | final CASImpl casView = ll_cas.ll_getSofaCasView(inst); 164 | final String text = casView.getDocumentText(); 165 | if (text == null) { 166 | return null; 167 | } 168 | return text.substring(getBegin(inst), getEnd(inst)); 169 | } 170 | """); 171 | } /* of Annotation if-statement */ 172 | 173 | stringBuffer.append(s""" 174 | 175 | /** initialize variables to correspond with Cas Type and Features 176 | * @param jcas JCas 177 | * @param casType Type 178 | */ 179 | public ${typeName_Type}(JCas jcas, Type casType) { 180 | super(jcas, casType); 181 | casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); 182 | 183 | """); 184 | td.getFeatures().foreach { fd => 185 | val featName = fd.getName(); 186 | 187 | stringBuffer.append(s""" 188 | casFeat_${featName} = jcas.getRequiredFeatureDE(casType, "${featName}", "${fd.getRangeTypeName()}", featOkTst); 189 | casFeatCode_${featName} = (null == casFeat_${featName}) ? 
object ArkTweetTokenizer {
  /** Applies Twokenize's tagger-oriented text normalization to a raw tweet. */
  def normalizeTweet(tweet: String): String =
    Twokenize.normalizeTextForTagger(tweet)
}

class ArkTweetTokenizer extends SCasAnnotator_ImplBase {

  /**
   * Tokenizes the document text with CMU's Twokenize and adds one Token
   * annotation per token.
   *
   * Twokenize only returns token strings, so offsets are recovered by
   * searching for each token in the original text, starting at the end of
   * the previous match.
   */
  def process(jcas: JCas) = {
    val txt = jcas.getDocumentText

    Twokenize.tokenize(txt).foldLeft(0) { (offset, token) =>
      val start = txt.indexOf(token, offset)
      if (start < 0) {
        // Fix: indexOf returns -1 when a token cannot be located verbatim
        // (e.g. because Twokenize normalized it). Previously this produced
        // a Token with begin = -1; skip such tokens instead and keep the
        // search position unchanged.
        offset
      } else {
        val end = start + token.length
        add(createToken(jcas, start, end))
        end
      }
    }
  }

  def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
class ArkTweetTokenizerSpec extends Specification {

  "Ark Tweet Tokenizer" should {
    // NOTE(review): the document-text literal below appears HTML-unescaped
    // by the extraction of this file; the expected tokens "&", "amp", ";"
    // imply the original source literal is "&amp;" — verify against the
    // repository before relying on these bytes.
    "annotate all tokens in a tweet" in {
      val tokenizer: AnalysisEngine = new ArkTweetTokenizer().asAnalysisEngine

      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("This is a test & a thing #hash #tag bit.ly/link")
      tokenizer.process(jcas)

      jcas.select[Token].size must be equalTo(12)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("This")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("is")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("a")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("test")
      jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("&")
      jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("amp")
      jcas.selectByIndex[Token](6).getCoveredText must be equalTo (";")
      jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("a")
      jcas.selectByIndex[Token](8).getCoveredText must be equalTo ("thing")
      jcas.selectByIndex[Token](9).getCoveredText must be equalTo ("#hash")
      jcas.selectByIndex[Token](10).getCoveredText must be equalTo ("#tag")
      jcas.selectByIndex[Token](11).getCoveredText must be equalTo ("bit.ly/link")
    }

    "it should normalize a tweet" in {
      ArkTweetTokenizer.normalizeTweet("This is a test & a thing #hash #tag bit.ly/link") must be equalTo (
        "This is a test & a thing #hash #tag bit.ly/link"
      )
    }
  }
}
/**
 * Sentence and token segmenter based on `java.text.BreakIterator`.
 *
 * The document language (when set on the CAS) selects the locale used for
 * the break rules; otherwise the configured `locale` parameter applies.
 *
 * @author Jens Haase
 */
class BreakIteratorSegmenter extends SCasAnnotator_ImplBase {

  // Fallback locale used when the CAS carries no usable document language.
  object locale extends Parameter[Locale](Locale.getDefault)

  def process(jcas: JCas) = {
    val sentences = BreakIterator.getSentenceInstance(getLocale(jcas))
    sentences.setText(jcas.getDocumentText)

    forEachBoundary(sentences) { (begin, end) =>
      val sentence = addIfNotEmpty(createSentence(jcas, begin, end).trim)
      // Tokenize relative to the untrimmed sentence start; empty tokens
      // produced by trimming are dropped by addIfNotEmpty.
      processSentence(jcas, sentence.getCoveredText, begin)
    }
  }

  /** Adds `Token` annotations for one sentence; `offset` is the sentence's
   *  begin position in the document text. */
  def processSentence(jcas: JCas, sentence: String, offset: Int) = {
    val words = BreakIterator.getWordInstance(getLocale(jcas))
    words.setText(sentence)

    forEachBoundary(words) { (begin, end) =>
      addIfNotEmpty(createToken(jcas, begin + offset, end + offset).trim)
    }
  }

  // Walks all [begin, end) spans produced by a BreakIterator.
  private def forEachBoundary(bi: BreakIterator)(f: (Int, Int) => Unit): Unit = {
    var begin = bi.first
    var end = bi.next
    while (end != BreakIterator.DONE) {
      f(begin, end)
      begin = end
      end = bi.next
    }
  }

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  /** Locale from the CAS document language when present and specified,
   *  otherwise the configured fallback. */
  protected def getLocale(jcas: JCas): Locale = {
    val lang = jcas.getDocumentLanguage()
    if (lang != null && lang != "x-unspecified") new Locale(lang)
    else locale.is
  }
}
/** Segments text into sentences with `java.text.BreakIterator` and into
 *  tokens with Lucene's `StandardTokenizer`. */
class LuceneTokenizer extends SCasAnnotator_ImplBase {

  def process(jcas: JCas) = {
    val bi = BreakIterator.getSentenceInstance()
    bi.setText(jcas.getDocumentText)

    var last = bi.first
    var cur = bi.next
    while (cur != BreakIterator.DONE) {
      val sentence = addIfNotEmpty(createSentence(jcas, last, cur).trim)
      // Token offsets are computed relative to the untrimmed sentence start.
      processSentence(jcas, sentence.getCoveredText, last)

      last = cur
      cur = bi.next
    }
  }

  /** Adds `Token` annotations for one sentence; `offset` is the sentence's
   *  begin position in the document text. */
  def processSentence(jcas: JCas, sentence: String, offset: Int) = {
    val tokenizer = new StandardTokenizer()
    tokenizer.setReader(new java.io.StringReader(sentence))
    tokenizer.reset()
    try {
      while (tokenizer.incrementToken()) {
        val tokenOffset = tokenizer.getAttribute(classOf[OffsetAttribute])
        add(createToken(jcas, offset + tokenOffset.startOffset, offset + tokenOffset.endOffset))
      }
      tokenizer.end()
    } finally {
      // Previously end()/close() were skipped when incrementToken() threw,
      // leaking the analyzer resources. Always release the TokenStream.
      tokenizer.close()
    }
  }

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
/** Shared helper for loading OpenNLP model streams from the classpath.
 *  Extracted because the sentence and token resources previously
 *  duplicated this logic verbatim. */
private[segmenter] object OpenNlpModelStreams {

  /** Opens `uri` as a classpath resource, decompressing `.gz` files on the
   *  fly. The caller is responsible for closing the returned stream.
   *  @throws java.io.FileNotFoundException if the resource does not exist
   *          (previously this failed with a NullPointerException). */
  def openClasspathStream(uri: String): java.io.InputStream = {
    val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
    val resource = getClass.getResource(resourceUri)
    if (resource == null)
      throw new java.io.FileNotFoundException("Model resource not found: " + resourceUri)

    if (uri.endsWith(".gz")) new GZIPInputStream(resource.openStream)
    else resource.openStream
  }
}

/** UIMA shared resource holding an OpenNLP sentence detector model. */
class OpenNlpSentenceSegmenterResource extends SharedResourceObject {
  private var model: SentenceDetectorME = _

  /** Loads the model from the file system when the URI points to an
   *  existing file, otherwise from the classpath. */
  def load(data: DataResource) {
    val uri = data.getUri.toString
    val file = new java.io.File(uri)

    if (file.exists) {
      model = new SentenceDetectorME(new SentenceModel(file))
    } else {
      val is = OpenNlpModelStreams.openClasspathStream(uri)
      try {
        model = new SentenceDetectorME(new SentenceModel(is))
      } finally {
        is.close() // the model constructor reads the stream fully; do not leak it
      }
    }
  }

  def getModel = model
}

/** UIMA shared resource holding an OpenNLP tokenizer model. */
class OpenNlpTokenSegmenterResource extends SharedResourceObject {
  private var model: TokenizerME = _

  /** Loads the model from the file system when the URI points to an
   *  existing file, otherwise from the classpath. */
  def load(data: DataResource) {
    val uri = data.getUri.toString
    val file = new java.io.File(uri)

    if (file.exists) {
      model = new TokenizerME(new TokenizerModel(file))
    } else {
      val is = OpenNlpModelStreams.openClasspathStream(uri)
      try {
        model = new TokenizerME(new TokenizerModel(is))
      } finally {
        is.close() // the model constructor reads the stream fully; do not leak it
      }
    }
  }

  def getModel = model
}

/** Annotates sentences and tokens using OpenNLP maxent models supplied via
 *  the two shared resources above. */
class OpenNlpSegmenter extends SCasAnnotator_ImplBase {

  object sentenceModel extends SharedResource[OpenNlpSentenceSegmenterResource]("")
  object tokenModel extends SharedResource[OpenNlpTokenSegmenterResource]("")

  def process(jcas: JCas) = {
    sentenceModel.resource.getModel.sentPosDetect(jcas.getDocumentText).foreach { span =>
      add(createSentence(jcas, span.getStart, span.getEnd))
    }

    // Tokenize each detected sentence; token spans are relative to the
    // sentence, so shift them by the sentence's begin offset.
    jcas.select[Sentence].foreach { sentence =>
      tokenModel.resource.getModel.tokenizePos(sentence.getCoveredText).foreach { span =>
        add(createToken(jcas, span.getStart + sentence.getStart, span.getEnd + sentence.getStart))
      }
    }
  }

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)
}
-------------------------------------------------------------------------------- /segmenter/open-nlp-segmenter/src/test/scala/com/github/jenshaase/uimascala/segmenter/OpenNlpSegmenterSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.segmenter 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.typesystem._ 5 | import com.github.jenshaase.uimascala.core.configuration._ 6 | import org.apache.uima.analysis_engine.AnalysisEngine 7 | import org.specs2.mutable.Specification 8 | 9 | class OpenNlpSegmenterSpec extends Specification { 10 | 11 | "Open Nlp Segmenter" should { 12 | "add sentence and token annotations" in { 13 | val segmenter: AnalysisEngine = new OpenNlpSegmenter(). 14 | config( 15 | _.sentenceModel := SharedBinding[OpenNlpSentenceSegmenterResource]("/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/sentence-de-maxent.bin"), 16 | _.tokenModel := SharedBinding[OpenNlpTokenSegmenterResource]("/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/token-de-maxent.bin") 17 | ). 
/**
 * Splits the document text into `Token` annotations at every match of a
 * configurable separator regex. With `allowEmptyToken` enabled, adjacent
 * separators produce zero-length tokens.
 *
 * @author Jens Haase
 */
class RegexTokenizer extends SCasAnnotator_ImplBase {

  // Separator pattern; defaults to runs of whitespace.
  object regex extends Parameter[Regex]("""\s+""".r)
  // When true, two adjacent separators yield an empty token between them.
  object allowEmptyToken extends Parameter[Boolean](false)

  def process(jcas: JCas) = {
    val text = jcas.getDocumentText

    // Start offset of the token currently being accumulated.
    var tokenStart = 0
    for (m <- getRegex.findAllMatchIn(text)) {
      val emit =
        if (allowEmptyToken.is) m.start >= tokenStart
        else m.start > tokenStart
      if (emit) add(createToken(jcas, tokenStart, m.start))
      tokenStart = m.end
    }

    // Trailing text after the last separator forms the final token.
    if (tokenStart < text.length)
      add(createToken(jcas, tokenStart, text.length))
  }

  protected def getRegex =
    regex.is

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
41 | asAnalysisEngine 42 | 43 | val jcas = tokenizer.newJCas() 44 | jcas.setDocumentText("HalloxWeltxlosxxgehts") 45 | tokenizer.process(jcas) 46 | 47 | jcas.select[Token].size must be equalTo(5) 48 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo") 49 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("Welt") 50 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("los") 51 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("") 52 | jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("gehts") 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /segmenter/stanford-segmenter/src/main/scala/com/github/jenshaase/uimascala/segmenter/StanfordSegmenter.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.segmenter 2 | 3 | import com.github.jenshaase.uimascala.core._ 4 | import com.github.jenshaase.uimascala.core.configuration._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import org.apache.uima.jcas.JCas 7 | import org.apache.uima.resource.SharedResourceObject 8 | import org.apache.uima.resource.DataResource 9 | import edu.stanford.nlp.ling.TaggedWord 10 | import scala.collection.JavaConversions._ 11 | import java.io.StringReader 12 | import java.util.Properties 13 | import edu.stanford.nlp.ling.{CoreLabel, Word} 14 | import edu.stanford.nlp.international.spanish.process.SpanishTokenizer 15 | import edu.stanford.nlp.international.arabic.process.ArabicTokenizer 16 | import edu.stanford.nlp.international.french.process.FrenchTokenizer 17 | import edu.stanford.nlp.trees.international.pennchinese.CHTBTokenizer 18 | import edu.stanford.nlp.process.{WordToSentenceProcessor, Tokenizer, PTBTokenizer, CoreLabelTokenFactory} 19 | import edu.stanford.nlp.ling.CoreAnnotations.{CharacterOffsetBeginAnnotation, CharacterOffsetEndAnnotation} 20 | import 
/** Tokenizes and sentence-splits text with the language-specific Stanford
 *  CoreNLP tokenizers. The tokenizer is chosen from the CAS document
 *  language, falling back to `fallbackLanguage` when configured. */
class StanfordSegmenter extends SCasAnnotator_ImplBase {

  object annotateToken extends Parameter[Boolean](true)
  object annotateSentence extends Parameter[Boolean](true)
  object fallbackLanguage extends Parameter[Option[String]](None) {
    override def mandatory_? = false
  }

  def process(jcas: JCas) = {
    if (annotateToken.is) annotateTokens(jcas)
    if (annotateSentence.is) annotateSentences(jcas)
  }

  /** Adds `Token` annotations. CoreLabel tokens carry their own offsets;
   *  plain String/Word tokens are re-located in the document text. */
  def annotateTokens(jcas: JCas) {
    val text = jcas.getDocumentText
    val tokenizer = getTokenizer(jcas.getDocumentLanguage, text)

    var offsetInSentence = 0
    tokenizer.tokenize().foreach {
      case token: String =>
        offsetInSentence = addTokenByText(jcas, text, token, offsetInSentence)

      case label: CoreLabel =>
        add(createToken(jcas, label.beginPosition, label.endPosition))
        offsetInSentence = label.endPosition

      case word: Word =>
        offsetInSentence = addTokenByText(jcas, text, word.word, offsetInSentence)

      case other =>
        // Previously an unexpected element type raised an opaque MatchError.
        throw new Exception("Unsupported token type from tokenizer: " +
          (if (other == null) "null" else other.getClass.getName))
    }
  }

  /** Locates `token` in `text` at or after `offset` (skipping whitespace),
   *  adds a Token annotation and returns the offset after it. Shared by the
   *  String and Word cases, which previously duplicated this logic. */
  private def addTokenByText(jcas: JCas, text: String, token: String, offset: Int): Int = {
    val start = skipWhitespace(text, offset)
    if (!text.startsWith(token, start)) {
      throw new Exception("Text mismatch in Tokenizer: " + token + " not found")
    }
    add(createToken(jcas, start, start + token.size))
    start + token.size
  }

  /** Groups the already-annotated tokens into `Sentence` annotations using
   *  Stanford's WordToSentenceProcessor. */
  def annotateSentences(jcas: JCas) {
    val tokens = jcas.select[Token].map { token =>
      val label = new CoreLabel()
      label.setBeginPosition(token.getBegin)
      label.setEndPosition(token.getEnd)
      label.setWord(token.getCoveredText)
      label
    }.toList

    val proc = new WordToSentenceProcessor[CoreLabel]()
    proc.process(tokens).foreach { sentence =>
      add(createSentence(jcas, sentence.head.beginPosition, sentence.last.endPosition))
    }
  }

  /** Returns the first non-whitespace position at or after `offset`. */
  protected def skipWhitespace(text: String, offset: Int): Int = {
    var newOffset = offset
    while (newOffset < text.size && Character.isWhitespace(text.charAt(newOffset))) {
      newOffset = newOffset + 1
    }
    newOffset
  }

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  /** Tokenizer for `lang`, falling back to `fallbackLanguage` when the
   *  primary language is unsupported. */
  protected def getTokenizer(lang: String, text: String): Tokenizer[_] = {
    getTokenizerFromLanguage(lang, text) match {
      case Some(tokenizer) => tokenizer
      case None =>
        fallbackLanguage.is.flatMap { lang =>
          getTokenizerFromLanguage(lang, text)
        }.getOrElse(
          throw new Exception("can not create tokenizer for language: " + lang)
        )
    }
  }

  private def getTokenizerFromLanguage(lang: String, text: String): Option[Tokenizer[_]] =
    lang match {
      case "ar" => Some(ArabicTokenizer.newArabicTokenizer(new StringReader(text), new Properties()))
      case "en" => Some(new PTBTokenizer[CoreLabel](new StringReader(text), new CoreLabelTokenFactory(), "invertible"))
      case "es" => Some(SpanishTokenizer.factory(new CoreLabelTokenFactory(), null).getTokenizer(new StringReader(text)))
      case "fr" => Some(FrenchTokenizer.factory().getTokenizer(new StringReader(text), "tokenizeNLs=false"))
      case "de" => Some(new NegraPennTokenizer(new StringReader(text)))
      case "zh" => Some(new CHTBTokenizer(new StringReader(text)))
      case _ => None
    }
}
/segmenter/stanford-segmenter/src/test/scala/com/github/jenshaase/uimascala/segmenter/StanfordSegmenterSpec.scala: -------------------------------------------------------------------------------- 1 | package com.github.jenshaase.uimascala.segmenter 2 | 3 | import java.util.Locale 4 | import com.github.jenshaase.uimascala.core._ 5 | import com.github.jenshaase.uimascala.typesystem._ 6 | import com.github.jenshaase.uimascala.core.configuration._ 7 | import org.apache.uima.analysis_engine.AnalysisEngine 8 | import org.specs2.mutable.Specification 9 | import org.apache.uima.fit.factory.AnalysisEngineFactory 10 | import org.apache.uima.fit.util.JCasUtil 11 | 12 | class StanfordSegmenterSpec extends Specification { 13 | 14 | "StanfordSegmenter" should { 15 | "segment english sentences and tokens" in { 16 | val segmenter: AnalysisEngine = new StanfordSegmenter(). 17 | asAnalysisEngine 18 | 19 | val jcas = segmenter.newJCas() 20 | jcas.setDocumentText("This is a text. Here we are! ") 21 | jcas.setDocumentLanguage("en") 22 | segmenter.process(jcas) 23 | 24 | jcas.select[Token].size must be equalTo(9) 25 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("This") 26 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("is") 27 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("a") 28 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("text") 29 | jcas.selectByIndex[Token](4).getCoveredText must be equalTo (".") 30 | jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("Here") 31 | jcas.selectByIndex[Token](6).getCoveredText must be equalTo ("we") 32 | jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("are") 33 | jcas.selectByIndex[Token](8).getCoveredText must be equalTo ("!") 34 | 35 | jcas.select[Sentence].size must be equalTo(2) 36 | jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("This is a text.") 37 | jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("Here we are!") 38 | } 39 
| 40 | "segment french sentences and tokens" in { 41 | val segmenter: AnalysisEngine = new StanfordSegmenter(). 42 | asAnalysisEngine 43 | 44 | val jcas = segmenter.newJCas() 45 | jcas.setDocumentText("Bonjour à tous. C'est parti!") 46 | jcas.setDocumentLanguage("fr") 47 | segmenter.process(jcas) 48 | 49 | jcas.select[Token].size must be equalTo(8) 50 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Bonjour") 51 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("à") 52 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("tous") 53 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo (".") 54 | jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("C'") 55 | jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("est") 56 | jcas.selectByIndex[Token](6).getCoveredText must be equalTo ("parti") 57 | jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("!") 58 | 59 | jcas.select[Sentence].size must be equalTo(2) 60 | jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("Bonjour à tous.") 61 | jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("C'est parti!") 62 | } 63 | 64 | "segment english text without a point" in { 65 | val segmenter: AnalysisEngine = new StanfordSegmenter(). 
/** A `RegexTokenizer` that always splits on runs of whitespace,
 *  regardless of the configured `regex` parameter. */
class WhitespaceTokenizer extends RegexTokenizer {

  // Separator: one or more whitespace characters.
  private val whitespaceSeparator: Regex = """\s+""".r

  override def getRegex = whitespaceSeparator
}
jcas.select[Token].size must be equalTo(4) 25 | jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo") 26 | jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("Welt") 27 | jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("los") 28 | jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("gehts") 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /type-system/src/main/resources/META-INF/org.apache.uima.fit/types.txt: -------------------------------------------------------------------------------- 1 | classpath*:desc/types/**/*.xml 2 | -------------------------------------------------------------------------------- /type-system/src/main/resources/desc/types/TypeSystem.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | TypeSystem 4 | 5 | 6 | com.github.jenshaase.uimascala.typesystem.Token 7 | 8 | uima.tcas.Annotation 9 | 10 | 11 | pos 12 | com.github.jenshaase.uimascala.typesystem.POS 13 | 14 | 15 | lemma 16 | com.github.jenshaase.uimascala.typesystem.Lemma 17 | 18 | 19 | parent 20 | uima.tcas.Annotation 21 | 22 | 23 | 24 | 25 | 26 | com.github.jenshaase.uimascala.typesystem.Sentence 27 | 28 | uima.tcas.Annotation 29 | 30 | 31 | 32 | 33 | 34 | com.github.jenshaase.uimascala.typesystem.POS 35 | 36 | uima.tcas.Annotation 37 | 38 | 39 | name 40 | uima.cas.String 41 | 42 | 43 | 44 | 45 | 46 | com.github.jenshaase.uimascala.typesystem.Lemma 47 | 48 | uima.tcas.Annotation 49 | 50 | 51 | value 52 | uima.cas.String 53 | 54 | 55 | 56 | 57 | 58 | com.github.jenshaase.uimascala.typesystem.Dependency 59 | 60 | uima.tcas.Annotation 61 | 62 | 63 | governor 64 | com.github.jenshaase.uimascala.typesystem.Token 65 | 66 | 67 | dependent 68 | com.github.jenshaase.uimascala.typesystem.Token 69 | 70 | 71 | dependencyType 72 | uima.cas.String 73 | 74 | 75 | 76 | 77 | 78 | com.github.jenshaase.uimascala.typesystem.DependencyRoot 79 | 80 | 
com.github.jenshaase.uimascala.typesystem.Dependency 81 | 82 | 83 | 84 | com.github.jenshaase.uimascala.typesystem.Constituent 85 | 86 | uima.tcas.Annotation 87 | 88 | 89 | constituentType 90 | uima.cas.String 91 | 92 | 93 | parent 94 | uima.tcas.Annotation 95 | 96 | 97 | children 98 | uima.cas.FSArray 99 | uima.tcas.Annotation 100 | 101 | 102 | syntacticFunction 103 | uima.cas.String 104 | 105 | 106 | 107 | 108 | 109 | com.github.jenshaase.uimascala.typesystem.NamedEntity 110 | 111 | uima.tcas.Annotation 112 | 113 | 114 | value 115 | uima.cas.String 116 | 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.6.2-SNAPSHOT" 2 | --------------------------------------------------------------------------------