├── .gitignore
├── .travis.yml
├── LICENSE
├── README.markdown
├── build.sbt
├── core
├── .scala_dependencies
└── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── core
│ │ ├── AsAnalysisEngine.scala
│ │ ├── Converter.scala
│ │ ├── SCasAnnotator_ImplBase.scala
│ │ ├── SCasCollectionReader_ImplBase.scala
│ │ ├── SCasConsumer_ImplBase.scala
│ │ ├── SCasFlowController_ImplBase.scala
│ │ ├── SCasMultiplier_ImplBase.scala
│ │ ├── SimplePipeline.scala
│ │ ├── XmlDescriptor.scala
│ │ ├── configuration
│ │ ├── ConfigurationInitialization.scala
│ │ ├── Parameter.scala
│ │ ├── Resource.scala
│ │ └── ResourceInitialization.scala
│ │ ├── package.scala
│ │ ├── stream
│ │ ├── annotators.scala
│ │ └── package.scala
│ │ └── wrapper
│ │ ├── AnnotationWrapper.scala
│ │ └── JCasWrapper.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── jenshaase
│ └── uimascala
│ └── core
│ ├── ConverterSpec.scala
│ ├── SCasAnnotator_ImplBaseSpecs.scala
│ ├── SimplePipelineSpecs.scala
│ ├── configuration
│ ├── ConfigurationInitalizationSpec.scala
│ ├── ParameterSpec.scala
│ ├── ResourceInitializationSpec.scala
│ └── ResourceSpec.scala
│ ├── stream
│ └── annotatorsSpec.scala
│ ├── util
│ └── Helper.scala
│ └── wrapper
│ ├── AnnotationWrapperSpec.scala
│ └── JCasWrapperSpec.scala
├── language-identification
└── n-gram-language-identifier
│ └── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── languageidentifier
│ │ └── NGramLanguageIdentifier.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── jenshaase
│ └── uimascala
│ └── languageidentifier
│ └── NGramLanguageIdentifierSpec.scala
├── lemmatizer
└── mate-lemmatizer
│ └── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── lemmatizer
│ │ └── MateLemmatizer.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── jenshaase
│ └── uimascala
│ └── lemmatizer
│ └── MateLemmatizerSpec.scala
├── name-entity-recognizer
└── stanford-ner
│ └── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── ner
│ │ └── StanfordNer.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── jenshaase
│ └── uimascala
│ └── ner
│ └── StanfordNerSpec.scala
├── parser
├── mate-parser
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── parser
│ │ │ └── MateParser.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── parser
│ │ └── MateParserSpec.scala
└── stanford-parser
│ └── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── parser
│ │ └── StanfordParser.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── jenshaase
│ └── uimascala
│ └── parser
│ └── StanfordParserSpec.scala
├── part-of-speech-tagger
├── ark-tweet-pos-tagger
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── pos
│ │ │ └── ArkTweetPosTagger.scala
│ │ └── test
│ │ ├── resources
│ │ └── model.20120919
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── pos
│ │ └── ArkTweetPosTaggerSpec.scala
├── mate-pos-tagger
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── pos
│ │ │ └── MatePosTagger.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── pos
│ │ └── MatePosTaggerSpec.scala
└── stanford-pos-tagger
│ └── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── pos
│ │ └── StanfordPosTagger.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── jenshaase
│ └── uimascala
│ └── pos
│ └── StanfordPosTaggerSpec.scala
├── project
└── plugins.sbt
├── sbt-plugin
├── build.sbt
├── project
│ └── plugin.sbt
├── src
│ └── main
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── sbt
│ │ ├── UimaSbtPlugin.scala
│ │ ├── UimaScalaTypeTemplate.scala
│ │ └── UimaScala_TypeTemplate.scala
└── version.sbt
├── segmenter
├── ark-tweet-tokenizer
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── segmenter
│ │ │ └── ArkTweetTokenizer.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── segmenter
│ │ └── ArkTweetTokenizerSpec.scala
├── break-iterator-segmenter
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── segmenter
│ │ │ └── BreakIteratorSegmenter.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── segmenter
│ │ └── BreakIteratorSegmenterSpec.scala
├── lucene-tokenizer
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── segmenter
│ │ │ └── LuceneTokenizer.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── segmenter
│ │ └── LuceneTokenizerSpec.scala
├── open-nlp-segmenter
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── segmenter
│ │ │ └── OpenNlpSegmenter.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── segmenter
│ │ └── OpenNlpSegmenterSpec.scala
├── regex-tokenizer
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── segmenter
│ │ │ └── RegexTokenizer.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── segmenter
│ │ └── RegexTokenizerSpec.scala
├── stanford-segmenter
│ └── src
│ │ ├── main
│ │ └── scala
│ │ │ └── com
│ │ │ └── github
│ │ │ └── jenshaase
│ │ │ └── uimascala
│ │ │ └── segmenter
│ │ │ └── StanfordSegmenter.scala
│ │ └── test
│ │ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── segmenter
│ │ └── StanfordSegmenterSpec.scala
└── whitespace-tokenizer
│ └── src
│ ├── main
│ └── scala
│ │ └── com
│ │ └── github
│ │ └── jenshaase
│ │ └── uimascala
│ │ └── segmenter
│ │ └── WhitespaceTokenizer.scala
│ └── test
│ └── scala
│ └── com
│ └── github
│ └── jenshaase
│ └── uimascala
│ └── segmenter
│ └── WhitespaceTokenizerSpec.scala
├── type-system
└── src
│ └── main
│ └── resources
│ ├── META-INF
│ └── org.apache.uima.fit
│ │ └── types.txt
│ └── desc
│ └── types
│ └── TypeSystem.xml
└── version.sbt
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | lib_managed/
3 | src_managed/
4 | project/boot/
5 | uima-scala-docs/build/
6 | *.iml
7 | *.ipr
8 | *.iws
9 | /.idea
10 | .scala_dependencies
11 |
12 | # Eclipse
13 | *.pydevproject
14 | .project
15 | .metadata
16 | .history
17 | bin/**
18 | tmp/**
19 | tmp/**/*
20 | *.tmp
21 | *.bak
22 | *.swp
23 | *~.nib
24 | local.properties
25 | .classpath
26 | .settings/
27 | .loadpath
28 |
29 | # CDT-specific
30 | .cproject
31 | *~
32 | *.sublime-workspace
33 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.11.8
4 | jdk:
5 | - oraclejdk8
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | This software is licensed under the Apache 2 license, quoted below.
2 |
3 | Copyright 2011 Jens Haase
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 | use this file except in compliance with the License. You may obtain a copy of
7 | the License at
8 |
9 | [http://www.apache.org/licenses/LICENSE-2.0]
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 | License for the specific language governing permissions and limitations under
15 | the License.
16 |
17 | ---------------
18 |
19 | Notice: Licenses of dependency projects may be different.
--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
1 | # UimaScala [](https://travis-ci.org/jenshaase/uimaScala)
2 |
3 | ## About
4 |
5 | uimaScala is a toolkit to develop natural language applications in
6 | Scala. It is based mainly on
7 | [uimaFIT](https://uima.apache.org/uimafit.html), which itself is based on
8 | [Apache UIMA](http://uima.apache.org/). To develop natural language
9 | processing (NLP) application in [Apache UIMA](http://uima.apache.org/)
10 | you need to work with lots of XML files. For nearly every Java class
11 | you will need an XML File. If your Java class changes you also need to
12 | change your XML file. [uimaFIT](http://code.google.com/p/uimafit/)
13 | tries to solve this problem with reflection and nearly removes all XML
14 | files.
15 |
16 | This project started as a wrapper for
17 | [uimaFIT](https://uima.apache.org/uimafit.html). With Scala's collection
18 | library and the functional programming stuff it is a lot easier to
19 | develop NLP Application. Also a type safe configuration system and a
20 | nicer DSL was added.
21 |
22 | This readme provides a short introduction. More documentation will be
23 | added later.
24 |
25 | ## Setup a project
26 |
27 | To use this project add the following configuration to your `build.sbt`
28 | file. UimaScala requires Scala version `2.11`.
29 |
30 | ~~~
31 | scalaVersion := "2.11.1"
32 |
33 | resolvers ++= Seq(
34 | "Sonatype OSS Releases" at "http://oss.sonatype.org/content/repositories/releases/",
35 | "Sonatype OSS Snapshots" at "http://oss.sonatype.org/content/repositories/snapshots/"
36 | )
37 |
38 | libraryDependencies += "com.github.jenshaase.uimascala" %% "uimascala-core" % "0.5.0-SNAPSHOT"
39 |
40 | addCompilerPlugin("org.scalamacros" % "paradise" % "2.0.1" cross CrossVersion.full)
41 | ~~~
42 |
43 | Next you need to tell UIMA where to find the description
44 | files. Therefore add the file `types.txt` to the folder
45 | `src/main/resources/META-INF/org.apache.uima.fit`. Add following
46 | content:
47 |
48 | ~~~
49 | classpath*:desc/types/**/*.xml
50 | ~~~
51 |
52 | ## A simple annotator
53 |
54 | Annotators in UIMA will process a document. Most of the time they are
55 | using annotations from previous annotators and combine them to new
56 | annotations. The following annotator is a Tokenizer. It looks at the
57 | text and identifies single words, also called tokens. We can use
58 | Java's `BreakIterator` to tokenize the text. You will find the class
59 | also in the toolkit with some additional processing:
60 |
61 | ~~~
62 | package com.github.jenshaase.test
63 |
64 | import com.github.jenshaase.uimascala.core._
65 | import com.github.jenshaase.uimascala.core.configuration._
66 | import java.util.Locale
67 | import org.apache.uima.jcas.JCas
68 | import java.text.BreakIterator
69 |
70 | class BreakIteratorTokenizer extends SCasAnnotator_ImplBase {
71 |
72 | object locale extends Parameter[Locale](Locale.getDefault)
73 |
74 | def process(jcas: JCas) = {
75 | val bi = BreakIterator.getWordInstance(locale.is)
76 | bi.setText(jcas.getDocumentText)
77 |
78 | var last = bi.first
79 | var cur = bi.next
80 | while (cur != BreakIterator.DONE) {
81 | if (jcas.getDocumentText().substring(last, cur).trim != "") {
82 | jcas.annotate[Token](last, cur)
83 | }
84 |
85 | last = cur
86 | cur = bi.next
87 | }
88 | }
89 | }
90 | ~~~
91 |
92 | An annotator in uimaScala extends the `SCasAnnotator_ImplBase`
93 | class. To implement this class you need to implement the `process`
94 | method. Here we use Java's `BreakIterator` to process the
95 | document. For each token we add a new `Token` type (the next part will
96 | explain how to create such type). You can also see the `locale`
97 | configuration parameter. It has a name (`locale`) and type (`Locale`)
98 | and a default value `Locale.getDefault`. These parameters can be changed
99 | when using this component in a UIMA pipeline.
100 |
101 |
102 | ## Adding your own type system description
103 |
104 | The goal of an annotator is to add new annotations to text. With UIMA
105 | you can create your custom annotations with XML files and then generate
106 | the Java classes. uimaScala uses a Scala macro and custom DSL to
107 | provide these features. In order to create your type system you need to
108 | define an object in your scala code:
109 |
110 | ~~~
111 | package com.github.jenshaase.test
112 |
113 | import com.github.jenshaase.uimascala.core.description._
114 |
115 | @TypeSystemDescription
116 | object TypeSystem {
117 |
118 | val Token = Annotation {
119 | val pos = Feature[String]
120 | val lemma = Feature[String]
121 | val stem = Feature[String]
122 | }
123 |
124 | val Sentence = Annotation {}
125 | }
126 | ~~~
127 |
128 | After running `compile` you can see the following output on your sbt console:
129 |
130 | ~~~
131 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35)
132 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Token'.
133 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35)
134 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Token_Type'.
135 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35)
136 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Sentence'.
137 | Jul 03, 2014 8:28:37 AM org.apache.uima.tools.jcasgen.UimaLoggerProgressMonitor subTask(35)
138 | INFORMATION: >>JCasGen Creating: 'com.github.jenshaase.test.Sentence_Type'
139 | ~~~
140 |
141 | Now the necessary Java files are created. You need to run `compile`
142 | again to compile the generated Java sources.
143 |
144 | ## Running a pipeline
145 |
146 | To run a pipeline uimaScala uses the
147 | [scalaz-stream](https://github.com/scalaz/scalaz-stream) library. To
148 | run a pipeline we need to convert documents to a CAS and process the
149 | CAS with our annotators:
150 |
151 | ~~~
152 | package com.github.jenshaase.test
153 |
154 | import com.github.jenshaase.uimascala.core._
155 | import com.github.jenshaase.uimascala.core.stream._
156 | import scalaz._, Scalaz._
157 | import scalaz.stream._
158 | import java.util.Locale
159 |
160 | object Main extends App {
161 |
162 | val p = Process("this is a text", "and another text") |>
163 | casFromText |>
164 | annotate(new BreakIteratorTokenizer().config(_.locale := Locale.US)) |>
165 | extractCas { cas =>
166 | cas.select[Token].map(_.getCoveredText).toList
167 | }
168 |
169 | println(p.toList)
170 |
171 | p.toList == List(
172 | List("this", "is", "a", "text"),
173 | List("and", "another", "text")
174 | )
175 | }
176 |
177 | ~~~
178 |
179 |
180 | ## TODO
181 |
182 | * Add more documentation
183 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | import com.github.jenshaase.uimascala.UimaSbtPlugin._
2 |
3 | lazy val commonSettings = Seq(
4 | organization := "com.github.jenshaase.uimascala",
5 | scalaVersion := "2.11.8",
6 | libraryDependencies ++= Seq(
7 | "org.specs2" %% "specs2-core" % "3.8.4" % "test"
8 | )
9 | )
10 |
11 | lazy val componentSettings = commonSettings ++ releaseSettings
12 |
13 | lazy val root = (project in file(".")).
14 | settings(releaseSettings:_*).
15 | settings(
16 | publishArtifact in Compile := false,
17 | parallelExecution in Test := false
18 | ).
19 | aggregate(
20 | core, typeSystem,
21 | breakIteratorSegmenter, regexTokenizer, whitespaceTokenizer, stanfordSegmenter, arkTweetTokenizer, openNlpSegmenter, luceneTokenizer,
22 | stanfordPosTagger, arkTweetPosTagger,
23 | stanfordParser,
24 | stanfordNer,
25 | nGramLanguageIdentifier
26 | // Do not run these test in build environment because of too much memory consumption
27 | //mateLemmatizer, mateParser, matePosTagger
28 | )
29 |
30 | lazy val core = (project in file("core")).
31 | settings(commonSettings: _*).
32 | settings(releaseSettings: _*).
33 | settings(
34 | libraryDependencies ++= Seq(
35 | "org.apache.uima" % "uimafit-core" % "2.2.0",
36 | "org.scala-lang.modules" %% "scala-xml" % "1.0.5",
37 | "co.fs2" %% "fs2-core" % "0.9.0-M5"
38 | )
39 | )
40 |
41 | lazy val typeSystem = (project in file("type-system")).
42 | settings(componentSettings: _*).
43 | settings(uimaScalaSettings: _*).
44 | dependsOn(core)
45 |
46 | // ==================================================
47 | // Segmenter
48 |
49 | lazy val breakIteratorSegmenter = (project in file("segmenter/break-iterator-segmenter")).
50 | settings(componentSettings).
51 | dependsOn(core, typeSystem)
52 |
53 | lazy val regexTokenizer = (project in file("segmenter/regex-tokenizer")).
54 | settings(componentSettings).
55 | dependsOn(core, typeSystem)
56 |
57 | lazy val whitespaceTokenizer = (project in file("segmenter/whitespace-tokenizer")).
58 | settings(componentSettings).
59 | dependsOn(core, typeSystem, regexTokenizer)
60 |
61 | lazy val stanfordSegmenter = (project in file("segmenter/stanford-segmenter")).
62 | settings(componentSettings).
63 | settings(
64 | libraryDependencies ++= Seq(
65 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0"
66 | )
67 | ).
68 | dependsOn(core, typeSystem)
69 |
70 | lazy val arkTweetTokenizer = (project in file("segmenter/ark-tweet-tokenizer")).
71 | settings(componentSettings).
72 | settings(
73 | libraryDependencies ++= Seq(
74 | "edu.cmu.cs" % "ark-tweet-nlp" % "0.3.2"
75 | )
76 | ).
77 | dependsOn(core, typeSystem)
78 |
79 | lazy val openNlpSegmenter = (project in file("segmenter/open-nlp-segmenter")).
80 | settings(componentSettings).
81 | settings(
82 | libraryDependencies ++= Seq(
83 | "org.apache.opennlp" % "opennlp-tools" % "1.6.0",
84 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.opennlp-model-sentence-de-maxent" % "20120616.1" % "test",
85 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.opennlp-model-token-de-maxent" % "20120616.1" % "test"
86 | ),
87 | resolvers ++= Seq(
88 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local"
89 | )
90 | ).
91 | dependsOn(core, typeSystem)
92 |
93 | lazy val luceneTokenizer = (project in file("segmenter/lucene-tokenizer")).
94 | settings(componentSettings).
95 | settings(
96 | libraryDependencies ++= Seq(
97 | "org.apache.lucene" % "lucene-analyzers-common" % "6.1.0"
98 | )
99 | ).
100 | dependsOn(core, typeSystem)
101 |
102 | // ==================================================
103 | // Lemmatizer
104 |
105 | lazy val mateLemmatizer = (project in file("lemmatizer/mate-lemmatizer")).
106 | settings(componentSettings).
107 | settings(
108 | libraryDependencies ++= Seq(
109 | "com.googlecode.mate-tools" % "anna" % "3.5",
110 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-lemmatizer-de-tiger" % "20121024.1" % "test"
111 | ),
112 | resolvers ++= Seq(
113 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local"
114 | )
115 | ).
116 | dependsOn(core, typeSystem)
117 |
118 | // ==================================================
119 | // POS Tagger
120 |
121 | lazy val stanfordPosTagger = (project in file("part-of-speech-tagger/stanford-pos-tagger")).
122 | settings(componentSettings).
123 | settings(
124 | libraryDependencies ++= Seq(
125 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0",
126 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german"
127 | )
128 | ).
129 | dependsOn(core, typeSystem)
130 |
131 | lazy val matePosTagger = (project in file("part-of-speech-tagger/mate-pos-tagger")).
132 | settings(componentSettings).
133 | settings(
134 | libraryDependencies ++= Seq(
135 | "com.googlecode.mate-tools" % "anna" % "3.5",
136 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-tagger-de-tiger" % "20121024.1" % "test"
137 | ),
138 | resolvers ++= Seq(
139 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local"
140 | )
141 | ).
142 | dependsOn(core, typeSystem)
143 |
144 | lazy val arkTweetPosTagger = (project in file("part-of-speech-tagger/ark-tweet-pos-tagger")).
145 | settings(componentSettings).
146 | settings(
147 | libraryDependencies ++= Seq(
148 | "edu.cmu.cs" % "ark-tweet-nlp" % "0.3.2"
149 | )
150 | ).
151 | dependsOn(core, typeSystem)
152 |
153 | // ==================================================
154 | // Parser
155 |
156 | lazy val stanfordParser = (project in file("parser/stanford-parser")).
157 | settings(componentSettings).
158 | settings(
159 | libraryDependencies ++= Seq(
160 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0",
161 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german"
162 | )
163 | ).
164 | dependsOn(core, typeSystem)
165 |
166 | lazy val mateParser = (project in file("parser/mate-parser")).
167 | settings(componentSettings).
168 | settings(
169 | libraryDependencies ++= Seq(
170 | "com.googlecode.mate-tools" % "anna" % "3.5",
171 | "de.tudarmstadt.ukp.dkpro.core" % "de.tudarmstadt.ukp.dkpro.core.matetools-model-parser-de-tiger" % "20121024.1" % "test"
172 | ),
173 | resolvers ++= Seq(
174 | "ukp-oss-model-releases" at "http://zoidberg.ukp.informatik.tu-darmstadt.de/artifactory/public-model-releases-local"
175 | )
176 | ).
177 | dependsOn(core, typeSystem)
178 |
179 | // ==================================================
180 | // Name Entity Recognizer
181 |
182 | lazy val stanfordNer = (project in file("name-entity-recognizer/stanford-ner")).
183 | settings(componentSettings).
184 | settings(
185 | libraryDependencies ++= Seq(
186 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0",
187 | "edu.stanford.nlp" % "stanford-corenlp" % "3.6.0" % "test" classifier "models-german"
188 | )
189 | ).
190 | dependsOn(core, typeSystem)
191 |
192 | // ==================================================
193 | // Language Identifer
194 |
195 | lazy val nGramLanguageIdentifier = (project in file("language-identification/n-gram-language-identifier")).
196 | settings(componentSettings).
197 | settings(
198 | libraryDependencies ++= Seq(
199 | "com.optimaize.languagedetector" % "language-detector" % "0.5"
200 | )
201 | ).
202 | dependsOn(core, typeSystem)
203 |
204 |
205 | lazy val releaseSettings = Seq(
206 |   releasePublishArtifactsAction := PgpKeys.publishSigned.value,
207 |   publishTo := {
208 |     val nexus = "https://oss.sonatype.org/"
209 |     if ( version.value.trim.endsWith( "SNAPSHOT" ) )
210 |       Some( "snapshots" at nexus + "content/repositories/snapshots" )
211 |     else
212 |       Some( "releases" at nexus + "service/local/staging/deploy/maven2" )
213 |   },
214 |   publishMavenStyle := true,
215 |   // POM metadata required for publishing to Sonatype OSS / Maven Central.
216 |   // NOTE(review): the XML literals were stripped by the text export
217 |   // (only the text nodes survived); reconstructed in the conventional form.
218 |   pomExtra := (
219 |     <url>https://github.com/jenshaase/uimaScala</url>
220 |     <scm>
221 |       <url>git@github.com:jenshaase/uimascala.git</url>
222 |       <connection>scm:git:git@github.com:jenshaase/uimascala.git</connection>
223 |     </scm>
224 |     <developers>
225 |       <developer>
226 |         <id>jenshaase</id>
227 |         <name>Jens Haase</name>
228 |       </developer>
229 |     </developers>
230 |     <licenses>
231 |       <license>
232 |         <name>Apache 2</name>
233 |         <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
234 |         <distribution>repo</distribution>
235 |       </license>
236 |     </licenses>
237 |   )
238 | )
236 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/AsAnalysisEngine.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 |
8 | trait AsAnalysisEngine { // mixed in by components that can be turned into a runnable UIMA engine
9 | def asAnalysisEngine: AnalysisEngine // build a UIMA AnalysisEngine from this component
10 | }
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/Converter.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import PartialFunction._
7 | import java.util.regex.Pattern
8 | import java.util.Locale
9 | import java.io.File
10 | import util.matching.Regex
11 | import com.github.jenshaase.uimascala.core.configuration._
12 |
13 | abstract class Caster[In, Out](implicit in: Manifest[In], out: Manifest[Out]) { // converts between a Scala type In and its UIMA parameter representation Out
14 | def convertToUimaType[X](c: X)(implicit m: Manifest[X]): Option[Any] = { // Some(converted) if this caster handles X, else None
15 | def sameArgs = in.typeArguments.zip(m.typeArguments).forall {
16 | case (in, actual) ⇒ in >:> actual
17 | }
18 |
19 | // Special case for options: a Some/None argument must still match an Option[In] caster
20 | val isOption = (in.erasure.toString, m.erasure.toString) match {
21 | case ("class scala.Option", "class scala.Some") ⇒ true
22 | case ("class scala.Option", "class scala.None$") ⇒ true
23 | case _ ⇒ false
24 | }
25 |
26 | if ((isOption || in >:> m) && sameArgs) Some(toUimaType(c.asInstanceOf[In]))
27 | else None
28 | }
29 |
30 | def convertFromUimaType[X](c: Any)(implicit m: Manifest[X]): Option[In] = { // Some(value) if this caster targets X and can parse c, else None
31 | def sameArgs = in.typeArguments.zip(m.typeArguments).forall {
32 | case (in, actual) ⇒ in >:> actual
33 | }
34 |
35 | // Special case for options: a Some/None target must still match an Option[In] caster
36 | val isOption = (in.erasure.toString, m.erasure.toString) match {
37 | case ("class scala.Option", "class scala.Some") ⇒ true
38 | case ("class scala.Option", "class scala.None$") ⇒ true
39 | case _ ⇒ false
40 | }
41 |
42 | if ((isOption || in >:> m) && sameArgs) fromUimaType(c)
43 | else None
44 | }
45 |
46 | def toUimaType(in: In): Out // concrete conversion In -> UIMA value
47 | def fromUimaType(in: Any): Option[In] // concrete conversion UIMA value -> In; None if the runtime value does not fit
48 | }
49 |
50 | object CastFactory {
51 |
52 | import BasicCaster._
53 |
54 | var convertSeq: Seq[Caster[_, _]] = Seq.empty // mutable registry of casters; queried in registration order
55 |
56 | register(stringCaster)
57 | register(intCaster)
58 | register(floatCaster)
59 | register(doubleCaster)
60 | register(booleanCaster)
61 | register(localeCaster)
62 | register(regexCaster)
63 | register(patternCaster)
64 | register(fileCaster)
65 |
66 | // Converts a Scala value to its UIMA representation using the first matching caster.
67 | def toUima[A](in: A)(implicit m: Manifest[A]): Either[Failure, Option[Any]] =
68 | convertSeq.map(_.convertToUimaType(in)).find(_.isDefined) match {
69 | case Some(v) ⇒ Right(v)
70 | case None ⇒ Left(Failure("Can not find a converter for: " + m.erasure.toString))
71 | }
72 |
73 | // Converts a raw UIMA value back to the Scala type A using the first matching caster.
74 | def fromUima[A](in: Any)(implicit m: Manifest[A]): Either[Failure, Option[A]] = {
75 | convertSeq.map(c ⇒ c.convertFromUimaType[A](in)).find(_.isDefined) match {
76 | case Some(v) ⇒ Right(v.map(_.asInstanceOf[A]))
77 | case None ⇒ Left(Failure("Can not find a converter for: " + m.erasure.toString))
78 | }
79 | }
80 |
81 | def register[In, Out](c: Caster[In, Out])(implicit ml: Manifest[List[In]], m: Manifest[In], mo: Manifest[Out]) = // also derives List/Seq/Option variants of the caster
82 | convertSeq ++= Seq(c, buildListCaster(c), buildSeqCaster(c), buildOptionCaster(c))
83 |
84 | protected def buildListCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[List[In]], m: Manifest[In], mo: Manifest[Out]) =
85 | new Caster[List[In], Array[Out]] {
86 | def toUimaType(in: List[In]) = in.map(c.toUimaType).toArray
87 | def fromUimaType(in: Any) = in match {
88 | case arr: Array[_] ⇒ sequence(arr.toList.map(c.fromUimaType))
89 | case _ ⇒ None
90 | }
91 | }
92 |
93 | protected def buildSeqCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[Seq[In]], m: Manifest[In], mo: Manifest[Out]) =
94 | new Caster[Seq[In], Array[Out]] {
95 | def toUimaType(in: Seq[In]) = in.map(c.toUimaType).toArray
96 | def fromUimaType(in: Any) = in match {
97 | case arr: Array[_] ⇒ sequence(arr.toSeq.map(c.fromUimaType))
98 | case _ ⇒ None
99 | }
100 | }
101 |
102 | protected def buildOptionCaster[In, Out](c: Caster[In, Out])(implicit ml: Manifest[Option[In]], m: Manifest[In], mo: Manifest[Out]) =
103 | new Caster[Option[In], Out] {
104 | def toUimaType(in: Option[In]) = in.map(c.toUimaType).getOrElse(null.asInstanceOf[Out]) // None is stored as null on the UIMA side
105 | def fromUimaType(in: Any) =
106 | if (in != null) c.fromUimaType(in.asInstanceOf[In]) match {
107 | case Some(v) ⇒ Some(Some(v))
108 | case None ⇒ None
109 | }
110 | else Some(None)
111 | }
112 |
113 | def sequence[A](l: List[Option[A]]) = // List[Option[A]] -> Option[List[A]]; None if any element failed
114 | if (l.contains(None)) None else Some(l.flatten)
115 |
116 | def sequence[A](l: Seq[Option[A]]) = // Seq[Option[A]] -> Option[Seq[A]]; None if any element failed
117 | if (l.contains(None)) None else Some(l.flatten)
118 | }
119 |
120 | object BasicCaster { // casters for the common primitive-like parameter types
121 |
122 | import java.util.Locale
123 | import java.util.regex.Pattern
124 | import scala.util.matching.Regex
125 |
126 | val stringCaster = new Caster[String, String] {
127 | def toUimaType(in: String) = in
128 | def fromUimaType(in: Any) = in match {
129 | case s: String ⇒ Some(s)
130 | case _ ⇒ None
131 | }
132 | }
133 |
134 | val intCaster = new Caster[Int, Int] {
135 | def toUimaType(in: Int): Int = in
136 | def fromUimaType(in: Any) = in match {
137 | case i: Int ⇒ Some(i)
138 | case _ ⇒ None
139 | }
140 | }
141 |
142 | val floatCaster = new Caster[Float, Float] {
143 | def toUimaType(in: Float): Float = in
144 | def fromUimaType(in: Any) = in match {
145 | case f: Float ⇒ Some(f)
146 | case _ ⇒ None
147 | }
148 | }
149 |
150 | val doubleCaster = new Caster[Double, Float] { // narrowed to Float on the UIMA side: precision is lost in the round-trip
151 | def toUimaType(in: Double): Float = in.toFloat
152 | def fromUimaType(in: Any) = in match {
153 | case f: Float ⇒ Some(f.toDouble)
154 | case d: Double ⇒ Some(d)
155 | case _ ⇒ None
156 | }
157 | }
158 |
159 | val booleanCaster = new Caster[Boolean, Boolean] {
160 | def toUimaType(in: Boolean): Boolean = in
161 | def fromUimaType(in: Any) = in match {
162 | case b: Boolean ⇒ Some(b)
163 | case _ ⇒ None
164 | }
165 | }
166 |
167 | val localeCaster = new Caster[Locale, String] { // only the language code survives the round-trip (country/variant are dropped)
168 | def toUimaType(in: Locale): String = in.getLanguage
169 | def fromUimaType(in: Any) = in match {
170 | case l: Locale ⇒ Some(l)
171 | case s: String ⇒ Some(new Locale(s))
172 | case _ ⇒ None
173 | }
174 | }
175 |
176 | val regexCaster = new Caster[Regex, String] {
177 | def toUimaType(in: Regex): String = in.pattern.pattern
178 | def fromUimaType(in: Any) = in match {
179 | case l: Regex ⇒ Some(l)
180 | case s: String ⇒ Some(s.r)
181 | case _ ⇒ None
182 | }
183 | }
184 |
185 | val patternCaster = new Caster[Pattern, String] {
186 | def toUimaType(in: Pattern): String = in.pattern
187 | def fromUimaType(in: Any) = in match {
188 | case l: Pattern ⇒ Some(l)
189 | case s: String ⇒ Some(Pattern.compile(s))
190 | case _ ⇒ None
191 | }
192 | }
193 |
194 | val fileCaster = new Caster[File, String] { // files are exchanged as absolute path strings
195 | def toUimaType(in: File): String = in.getAbsolutePath
196 | def fromUimaType(in: Any) = in match {
197 | case f: File ⇒ Some(f)
198 | case s: String ⇒ Some(new File(s))
199 | case _ ⇒ None
200 | }
201 | }
202 | }
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/SCasAnnotator_ImplBase.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import configuration.Parameter
7 | import java.io.File
8 | import java.lang.reflect.Method
9 | import java.net.URL
10 | import com.github.jenshaase.uimascala.core.configuration._
11 | import com.github.jenshaase.uimascala.core.wrapper._
12 | import org.apache.uima.analysis_component.AnalysisComponent
13 | import org.apache.uima.analysis_component.JCasAnnotator_ImplBase
14 | import org.apache.uima.analysis_engine.AnalysisEngineDescription
15 | import org.apache.uima.jcas.JCas
16 | import org.apache.uima.jcas.tcas.Annotation
17 | import org.apache.uima.resource.ResourceInitializationException
18 | import org.apache.uima.resource.ResourceSpecifier
19 | import org.apache.uima.UimaContext
20 | import org.apache.uima.UIMAFramework
21 | import org.apache.uima.fit.factory.AnalysisEngineFactory
22 | import org.apache.uima.fit.factory.ExternalResourceFactory
23 | import scala.collection.mutable.ListBuffer
24 | import xml.Node
25 |
26 | /**
27 | * Scala Annotator.
28 | *
29 | * Loads the parameter when initialized
30 | *
31 | * @author Jens Haase
32 | */
/**
 * Base class for annotators written in Scala.
 *
 * On initialization the configuration parameters and external
 * resources declared on the component are loaded from the
 * UIMA context.
 *
 * @author Jens Haase
 */
abstract class SCasAnnotator_ImplBase extends JCasAnnotator_ImplBase
  with Configurable
  with ConfigurationInitialization
  with ResourceInitialization
  with AsAnalysisEngine {

  override def initialize(context: UimaContext) = {
    super.initialize(context)
    loadParameter(context)
    loadResources(context)
  }

  /**
   * Builds an analysis engine from this annotator instance, declaring
   * and binding all external resources on the descriptor.
   */
  def asAnalysisEngine = {
    val description = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*)

    val dependencies = resources.map { res ⇒
      ExternalResourceFactory.createExternalResourceDependency(res.name, res.className, !res.mandatory_?, res.description)
    }
    description.setExternalResourceDependencies(dependencies.toArray)
    resources foreach (_.createBinding(description))

    AnalysisEngineFactory.createAggregate(description)
  }

  /**
   * Adds the annotation to the index unless its span is empty;
   * returns the annotation either way.
   */
  def addIfNotEmpty[T <: Annotation](a: T): T =
    if (a.isEmpty) a else add(a)

  /**
   * Adds the annotation to the index and returns it.
   */
  def add[T <: Annotation](a: T): T = {
    a.addToIndexes
    a
  }
}
79 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/SCasCollectionReader_ImplBase.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import com.github.jenshaase.uimascala.core.configuration._
7 | import org.apache.uima.cas.CAS
8 | import org.apache.uima.collection.CollectionReader_ImplBase
9 | import org.apache.uima.jcas.JCas
10 | import org.apache.uima.UimaContext
11 | import org.apache.uima.fit.factory.CollectionReaderFactory
12 | import org.apache.uima.fit.factory.ExternalResourceFactory
13 |
/**
 * Base class for collection readers written in Scala.
 *
 * Loads configuration parameters and external resources from the
 * UIMA context before delegating to the user supplied initialize hook.
 */
abstract class SCasCollectionReader_ImplBase extends CollectionReader_ImplBase
  with Configurable
  with ConfigurationInitialization
  with ResourceInitialization {

  override def initialize = {
    super.initialize
    loadParameter(getUimaContext)
    loadResources(getUimaContext)
    initialize(getUimaContext)
  }

  /** Hook for subclasses; called after parameters and resources are loaded. */
  def initialize(context: UimaContext) = {}

  /**
   * Builds a collection reader from this instance, declaring and
   * binding all external resources on the descriptor.
   */
  def asCollectionReader = {
    val description = CollectionReaderFactory.createDescription(this.niceClass, parameterKeyValues: _*)

    val dependencies = resources.map { res ⇒
      ExternalResourceFactory.createExternalResourceDependency(res.name, res.className, !res.mandatory_?, res.description)
    }
    description.setExternalResourceDependencies(dependencies.toArray)
    resources foreach (_.createBinding(description))

    CollectionReaderFactory.createCollectionReader(description)
  }

  /** Delegates CAS processing to the JCas variant. */
  def getNext(cas: CAS) = {
    getNext(cas.getJCas())
  }

  /** Subclasses fill the next document into this JCas. */
  def getNext(cas: JCas)

  def close() = {}
}
49 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/SCasConsumer_ImplBase.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import configuration._
7 | import org.apache.uima.analysis_component.JCasAnnotator_ImplBase
8 | import org.apache.uima.UimaContext
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.factory.ExternalResourceFactory
11 |
/**
 * Base class for CAS consumers written in Scala.
 *
 * On initialization the configuration parameters and external
 * resources declared on the component are loaded from the
 * UIMA context.
 */
abstract class SCasConsumer_ImplBase extends JCasAnnotator_ImplBase
  with Configurable
  with ConfigurationInitialization
  with ResourceInitialization {

  override def initialize(context: UimaContext) = {
    super.initialize(context)
    loadParameter(context)
    loadResources(context)
  }

  /**
   * Builds an analysis engine from this consumer instance, declaring
   * and binding all external resources on the descriptor.
   */
  def asAnalysisEngine = {
    val description = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*)

    val dependencies = resources.map { res ⇒
      ExternalResourceFactory.createExternalResourceDependency(res.name, res.className, !res.mandatory_?, res.description)
    }
    description.setExternalResourceDependencies(dependencies.toArray)
    resources foreach (_.createBinding(description))

    AnalysisEngineFactory.createAggregate(description)
  }
}
36 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/SCasFlowController_ImplBase.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import configuration._
7 | import org.apache.uima.flow.FlowControllerContext
8 | import org.apache.uima.flow.JCasFlowController_ImplBase
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.factory.FlowControllerFactory
11 |
/**
 * Base class for flow controllers written in Scala.
 *
 * On initialization the configuration parameters and external
 * resources declared on the component are loaded from the flow
 * controller context (a FlowControllerContext is a UimaContext).
 */
abstract class SCasFlowController_ImplBase extends JCasFlowController_ImplBase
  with Configurable
  with ConfigurationInitialization
  with ResourceInitialization {

  override def initialize(context: FlowControllerContext) = {
    super.initialize(context)

    this.loadParameter(context)
    // Fix: resources were declared via the mixed-in ResourceInitialization
    // but never loaded here, unlike every sibling component
    // (annotator, consumer, multiplier) which calls loadResources.
    this.loadResources(context)
  }

  /** Builds a flow controller description from this instance. */
  def asAnalysisEngine = {
    FlowControllerFactory.createFlowControllerDescription(this.niceClass, this.parameterKeyValues: _*)
  }
}
27 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/SCasMultiplier_ImplBase.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import configuration._
7 | import org.apache.uima.analysis_component.JCasMultiplier_ImplBase
8 | import org.apache.uima.UimaContext
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.factory.ExternalResourceFactory
11 |
/**
 * Base class for CAS multipliers written in Scala.
 *
 * On initialization the configuration parameters and external
 * resources declared on the component are loaded from the
 * UIMA context.
 */
abstract class SCasMultiplier_ImplBase extends JCasMultiplier_ImplBase
  with Configurable
  with ConfigurationInitialization
  with ResourceInitialization {

  override def initialize(context: UimaContext) = {
    super.initialize(context)
    loadParameter(context)
    loadResources(context)
  }

  /**
   * Builds an analysis engine from this multiplier instance, declaring
   * and binding all external resources on the descriptor.
   */
  def asAnalysisEngine = {
    val description = AnalysisEngineFactory.createPrimitiveDescription(this.niceClass, parameterKeyValues: _*)

    val dependencies = resources.map { res ⇒
      ExternalResourceFactory.createExternalResourceDependency(res.name, res.className, !res.mandatory_?, res.description)
    }
    description.setExternalResourceDependencies(dependencies.toArray)
    resources foreach (_.createBinding(description))

    AnalysisEngineFactory.createAggregate(description)
  }
}
36 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/SimplePipeline.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 | import org.apache.uima.collection.CollectionReader
8 |
9 | @deprecated("Use org.apache.uima.fit.pipeline.SimplePipeline or uimascala-stream", "0.5.0")
/**
 * Minimal fluent pipeline: a collection reader followed by analysis
 * engines chained with the ~> operator.
 */
@deprecated("Use org.apache.uima.fit.pipeline.SimplePipeline or uimascala-stream", "0.5.0")
class SimplePipeline(reader: CollectionReader) {

  // Engines run in the order they were appended.
  private var engines: Seq[AnalysisEngine] = Seq.empty

  /** Converts the component to an analysis engine and appends it. */
  def ~>(in: AsAnalysisEngine): SimplePipeline =
    this.~>(in.asAnalysisEngine)

  /** Appends an analysis engine to the pipeline. */
  def ~>(in: AnalysisEngine): SimplePipeline = {
    engines :+= in
    this
  }

  /** Executes the pipeline with uimaFIT's SimplePipeline runner. */
  def run() =
    org.apache.uima.fit.pipeline.SimplePipeline.runPipeline(reader, engines: _*)
}
26 |
object SimplePipeline {

  /** Wraps a plain UIMA collection reader. */
  def apply(reader: CollectionReader) = new SimplePipeline(reader)

  /** Wraps a Scala collection reader by converting it first. */
  def apply(reader: SCasCollectionReader_ImplBase) = new SimplePipeline(reader.asCollectionReader)
}
35 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/XmlDescriptor.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import xml.Node
7 |
/**
 * Implemented by components that can render themselves as an XML descriptor.
 */
trait XmlDescriptor {
  /** Identifier of the descriptor kind; semantics defined by implementors. */
  def xmlType: String
  /** Builds the XML representation of this component. */
  def toXml: Node
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/ConfigurationInitialization.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.configuration
5 |
6 | import scala.collection.mutable.ListBuffer
7 | import org.apache.uima.analysis_component.AnalysisComponent
8 | import java.lang.reflect.Method
9 | import org.apache.uima.UimaContext
10 | import org.apache.uima.resource.ResourceInitializationException
11 |
12 | /**
13 | * Configuration Initalization trait
14 | *
15 | * This can be used whenever configuration parameters must be initalized
16 | */
/**
 * Configuration initialization trait.
 *
 * Mix this in whenever Parameter-typed members of a component must be
 * discovered via reflection and initialized from a UimaContext.
 */
trait ConfigurationInitialization { this: Configurable ⇒

  // All Parameter members found on this instance, captured at construction.
  private var parameterList: List[ParameterHolder] = Nil

  // Runs during construction: reflectively scan this object's methods for
  // Parameter-typed accessors and remember each one with its Method handle.
  private val tArray: ListBuffer[ParameterHolder] = new ListBuffer[ParameterHolder]
  val methods = this.getClass.getMethods
  introspect(this, methods) {
    case (v, mf) ⇒ tArray += ParameterHolder(mf.name, v, mf)
  }
  parameterList = tArray.toList

  /**
   * Uses reflection to find the parameters in the class.
   *
   * When several accessors share a name (e.g. bridge methods), the one with
   * the most specific return type is kept. Each discovered parameter gets
   * its name assigned from the accessor before `f` is invoked.
   */
  protected def introspect[B, V](comp: Configurable, methods: Array[Method])(f: (Method, Parameter[_]) ⇒ Any): Unit = {
    val potentialParams = methods.toList.filter(isParameter)

    // Group candidate accessor methods by name.
    val map: Map[String, List[Method]] = potentialParams.foldLeft[Map[String, List[Method]]](Map()) {
      case (map, method) ⇒
        val name = method.getName
        map + (name -> (method :: map.getOrElse(name, Nil)))
    }

    // Per name, keep the method with the most specific return type.
    val realMeth = map.values.map(_.sortWith {
      case (a, b) ⇒ !a.getReturnType().isAssignableFrom(b.getReturnType)
    }).map(_.head)

    for (v ← realMeth) {
      v.invoke(comp) match {
        case mf: Parameter[_] ⇒
          // Name the parameter after its accessor method.
          mf.setName_!(v.getName)
          f(v, mf)
        case _ ⇒
      }
    }
  }

  /**
   * Returns all parameters for the class
   */
  def parameters = parameterList.map(_.parameter(this))

  /**
   * Uses the uima context to set the parameter values.
   *
   * @throws ResourceInitializationException when a mandatory parameter is
   *         missing or a value cannot be converted to the parameter type
   */
  protected def loadParameter(context: UimaContext) = {
    parameters.foreach { f ⇒
      val value = context.getConfigParameterValue(f.name)

      if (f.mandatory_? && value == null) {
        throw new ResourceInitializationException(ResourceInitializationException.CONFIG_SETTING_ABSENT, Array(f.name))
      }

      if (value != null) {
        f.setFromUimaType(value) match {
          case Right(o) ⇒ ()
          case Left(l) ⇒ throw new ResourceInitializationException(new ClassCastException(l.msg))
        }
      }
    }
  }

  /**
   * Checks if a method's return type is a subclass of Parameter.
   */
  def isParameter(m: Method) =
    !m.isSynthetic && classOf[Parameter[_]].isAssignableFrom(m.getReturnType)

  // Small helper exposing the runtime class with a more precise static type.
  class NiceObject[T <: AnyRef](x: T) {
    def niceClass: Class[_ <: T] = x.getClass.asInstanceOf[Class[T]]
  }
  implicit def toNiceObject[T <: AnyRef](x: T) = new NiceObject(x)

  /**
   * Flattens all parameters into the alternating name/value pairs
   * expected by the uimaFIT factory methods.
   */
  def parameterKeyValues: Array[Object] = parameters.flatMap { f ⇒
    Array(f.name, f.toUimaType match {
      case Right(o) ⇒ o
      case Left(l) ⇒ throw new ResourceInitializationException(new ClassCastException(l.msg))
    })
  }.toArray

  // Couples a parameter's name and accessor so the live Parameter
  // instance can be re-read from any Configurable instance.
  case class ParameterHolder(name: String, method: Method, metaParameter: Parameter[_]) {
    def parameter(inst: Configurable): Parameter[_] = method.invoke(inst).asInstanceOf[Parameter[_]]
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/Parameter.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.configuration
5 |
6 | import org.apache.uima.analysis_component.AnalysisComponent
7 |
8 | trait Configurable {}
9 |
/**
 * Applies a sequence of mutators to a configurable component and
 * returns the component itself, enabling fluent configuration.
 */
class ConfigurationBuilder[T <: Configurable](conf: T) {
  def config(mutators: T ⇒ Unit*) = {
    mutators foreach (_(conf))
    conf
  }
}
16 |
17 | /**
18 | * Base Parameter trait
19 | */
/**
 * Base Parameter trait
 */
trait BaseParameter {

  // The parameter name; assigned via setName_! during introspection.
  private var fieldName: String = _
  // Tracks whether a value was explicitly assigned.
  private var set = false

  // Records whether a value has been assigned.
  protected def set_?(b: Boolean) = set = b

  /** True when a value has been explicitly assigned. */
  def set_? : Boolean = set

  /**
   * Returns the parameter name
   */
  def name: String = fieldName

  /**
   * Returns the parameter description
   * Default: None
   */
  def description: Option[String] = None

  /**
   * Is this parameter mandatory?
   */
  def mandatory_? = true

  /**
   * If the parameter can take multiple values (collections)
   */
  def multiValued_? = false

  /**
   * The UIMA type name of this parameter.
   * Default is string.
   * Also possible: Integer, Float, Boolean
   */
  def uimaType: String

  /**
   * Sets the parameter name; called by the introspection code in
   * ConfigurationInitialization.
   */
  private[configuration] final def setName_!(newName: String): String = {
    fieldName = newName
    fieldName
  }
}
65 |
66 | case class Failure(msg: String, exception: Option[Exception] = None)
67 |
68 | /**
69 | * A typed parameter
70 | */
/**
 * A typed parameter with a default value.
 *
 * The Manifest is used to derive the UIMA type name and the
 * multi-valuedness of the parameter at runtime.
 */
abstract class Parameter[ThisType](val defaultValue: ThisType)(implicit mf: Manifest[ThisType])
  extends BaseParameter {

  import com.github.jenshaase.uimascala.core.CastFactory._

  // Explicitly assigned value; defaultValue is used while this is None.
  private var data: Option[ThisType] = None

  /**
   * Sets a new value to this parameter
   */
  def :=(in: ThisType) =
    data = Some(in)

  /**
   * Set the parameter value by an object coming from the UIMA context.
   * Returns the converted value or a Failure.
   */
  def setFromUimaType(in: Any): Either[Failure, ThisType] = fromUima[ThisType](in) match {
    // NOTE(review): the isInstanceOf check is unchecked due to type
    // erasure and cannot reject wrong generic element types at runtime.
    case Right(Some(d)) if d.isInstanceOf[ThisType] ⇒ {
      :=(d.asInstanceOf[ThisType]); Right(d)
    }
    case Right(_) ⇒ Left(Failure("Value could not be casted: " + in.toString))
    case Left(l) ⇒ Left(l)
  }

  /**
   * Converts this parameter value to a uima type
   */
  def toUimaType: Either[Failure, Object] = toUima(value) match {
    case Right(Some(s)) ⇒ Right(s.asInstanceOf[Object])
    case Right(None) ⇒ Left(Failure("Value could not be casted: " + value))
    case Left(l) ⇒ Left(l)
  }

  /**
   * Checks if the parameter is multi valued, based on the erased
   * runtime class of ThisType (List, Seq or reference arrays).
   * NOTE(review): primitive arrays ("[I" etc.) are not detected here.
   */
  override def multiValued_? = mf.erasure.toString match {
    case "class scala.collection.immutable.List" ⇒ true
    case "interface scala.collection.Seq" ⇒ true
    case s: String if (s.startsWith("class [L")) ⇒ true
    case _ ⇒ false
  }

  /** The current value or the default when none was set. */
  def value: ThisType = data getOrElse defaultValue

  def is = value

  def get = value

  // For collections the element type decides the UIMA type name.
  def uimaType =
    if (multiValued_?)
      _uimaType(mf.typeArguments.head.erasure.toString)
    else
      _uimaType(mf.erasure.toString)

  // Maps an erased class name to the UIMA parameter type name.
  def _uimaType(s: String) = s match {
    case "int" | "class java.lang.Integer" ⇒ "Integer"
    case "float" ⇒ "Float"
    case "boolean" ⇒ "Boolean"
    case _ ⇒ "String"
  }
}
133 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/Resource.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.configuration
5 |
6 | import java.net.URL
7 | import java.io.File
8 | import org.apache.uima.resource.ResourceSpecifier
9 | import org.apache.uima.fit.factory.ExternalResourceFactory
10 | import org.apache.uima.resource.SharedResourceObject
11 |
/**
 * Base trait for external resource declarations.
 */
trait BaseResource {

  // The resource key; assigned via setName_! during introspection.
  private var resourceKey: String = _

  /** The resource key/name. */
  def name: String = resourceKey

  /** Human readable description of the resource. */
  def description: String = ""

  /** Fully qualified name of the resource interface. */
  def interfaceName: String

  /** Is this resource mandatory? */
  def mandatory_? = true

  // Called by the introspection code in ResourceInitialization.
  private[configuration] final def setName_!(newName: String): String = {
    resourceKey = newName
    resourceKey
  }
}
29 |
/**
 * A resource declaration typed by implementation and parameter type.
 */
trait TypedResource[ThisType, ParamType] extends BaseResource {

  // The instance bound from the UIMA context, once available.
  private var boundResource: Option[ThisType] = None
  private[configuration] var parameters: Option[Map[ParamType, ParamType]] = None

  /** Effective parameters: the explicitly assigned ones or the defaults. */
  def params = parameters getOrElse defaultParameter

  def defaultParameter: Map[ParamType, ParamType]

  /** Parameters flattened into an alternating key/value sequence. */
  def parameterList: Seq[ParamType] =
    params.toSeq.flatMap(p ⇒ List(p._1, p._2))

  /**
   * Binds the resource from a raw UIMA object.
   * NOTE(review): the ThisType matches are unchecked due to erasure, so
   * a value of the wrong runtime type would be bound without complaint.
   */
  def setFromUima(a: Any) = a match {
    case x: ThisType ⇒ Right(bind(x))
    case Some(x: ThisType) ⇒ Right(bind(x))
    case _ ⇒ Left(Failure("Can not bind resource from uima context: " + name))
  }

  /** Stores the bound resource instance. */
  def bind(newResource: ThisType) = {
    boundResource = Some(newResource)
    boundResource
  }

  /** The bound resource; throws if nothing was bound yet. */
  def resource = boundResource get

  /** Creates the uimaFIT resource binding on the given descriptor. */
  def createBinding(aed: ResourceSpecifier)

  /** Implementation class of the resource. */
  def className: Class[_ <: ThisType]

  def interfaceName = className.getName
}
61 |
/** Binds a shared resource implementation to a data URL plus parameters. */
case class SharedBinding[T](url: String, params: Map[Object, Object] = Map.empty)

object SharedBinding {

  /** Binds a URL without parameters. */
  def apply[T](url: URL): SharedBinding[T] =
    apply[T](url, Map.empty)

  /** Binds a URL with parameters. */
  def apply[T](url: URL, params: Map[Object, Object]): SharedBinding[T] =
    new SharedBinding[T](url.toString, params)

  /** Binds a file without parameters. */
  def apply[T](url: File): SharedBinding[T] =
    apply[T](url.toURI().toURL())

  /** Binds a file with parameters. */
  def apply[T](url: File, params: Map[Object, Object]): SharedBinding[T] =
    apply[T](url.toURI().toURL(), params)
}
77 |
/**
 * Declares a shared resource (SharedResourceObject) loaded from a URL.
 *
 * @param defaultURL the data URL used when no binding is assigned
 * @param defaultParams the parameters used when no binding is assigned
 */
abstract class SharedResource[ThisType <: SharedResourceObject](
  val defaultURL: String,
  val defaultParams: Map[Object, Object] = Map.empty)(implicit m: Manifest[ThisType])
  extends TypedResource[ThisType, Object] {

  // URL and implementation class from an explicit binding, if any.
  private var dataUrl: Option[String] = None
  private var clazz: Option[Class[_ <: ThisType]] = None

  def this(defaultUrl: URL, defaultParams: Map[Object, Object])(implicit m: Manifest[ThisType]) =
    this(defaultUrl.toString, defaultParams)

  def this(defaultUrl: File, defaultParams: Map[Object, Object])(implicit m: Manifest[ThisType]) =
    this(defaultUrl.toURI().toURL(), defaultParams)

  /** Assigns an explicit binding: implementation class, URL and parameters. */
  def :=[T <: ThisType](bind: SharedBinding[T])(implicit mf: Manifest[T]) = {
    clazz = Some(mf.erasure.asInstanceOf[Class[T]])
    dataUrl = Some(bind.url)
    parameters = Some(bind.params)
  }

  /** The bound URL or the default. */
  def url = dataUrl getOrElse defaultURL

  def defaultParameter = defaultParams

  /** The implementation class derived from the manifest. */
  def defaultClass = m.erasure.asInstanceOf[Class[ThisType]]

  def className: Class[_ <: ThisType] = clazz getOrElse defaultClass

  // format: OFF
  def createBinding(aed: ResourceSpecifier) = {
    ExternalResourceFactory.bindResource(
      aed,
      name,
      className,
      url,
      parameterList:_*)
  }
}
116 |
117 | case class Binding[T](params: Map[String, String] = Map.empty)
118 |
/**
 * Declares an external resource (org.apache.uima.resource.Resource).
 *
 * @param defaultParams the parameters used when no binding is assigned
 */
abstract class Resource[ThisType <: org.apache.uima.resource.Resource](
  val defaultParams: Map[String, String] = Map.empty)(implicit m: Manifest[ThisType])
  extends TypedResource[ThisType, String] {

  // Implementation class from an explicit binding, if any.
  private var clazz: Option[Class[_ <: ThisType]] = None

  /** The implementation class derived from the manifest. */
  def defaultClass = m.erasure.asInstanceOf[Class[ThisType]]

  /** Assigns an explicit binding: implementation class and parameters. */
  def :=[T <: ThisType](bind: Binding[T])(implicit mf: Manifest[T]) = {
    clazz = Some(mf.erasure.asInstanceOf[Class[T]])
    parameters = Some(bind.params)
  }

  def defaultParameter = defaultParams

  def className: Class[_ <: ThisType] = clazz getOrElse defaultClass

  // format: OFF
  def createBinding(aed: ResourceSpecifier) = {
    ExternalResourceFactory.bindResource(
      aed,
      name,
      className,
      parameterList:_*)
  }
}
145 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/configuration/ResourceInitialization.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.configuration
5 |
6 | import java.lang.reflect.Method
7 | import org.apache.uima.resource.ResourceAccessException
8 | import org.apache.uima.resource.ResourceInitializationException
9 | import org.apache.uima.UimaContext
10 | import org.apache.uima.fit.descriptor.ExternalResourceLocator
11 | import scala.collection.mutable.ListBuffer
12 |
/**
 * Discovers TypedResource members on a component via reflection and
 * loads their bound instances from the UimaContext.
 */
trait ResourceInitialization { this: Configurable ⇒

  // All TypedResource members found on this instance, captured at construction.
  private var resourceList: List[ResourceHolder] = Nil

  // Runs during construction: reflectively scan this object's methods for
  // TypedResource accessors and remember each one with its Method handle.
  private val resTempArray: ListBuffer[ResourceHolder] = new ListBuffer[ResourceHolder]
  val resMethods = this.getClass.getMethods
  introspectResources(this, resMethods) {
    case (v, mf) ⇒ {
      resTempArray += ResourceHolder(mf.name, v, mf)
    }
  }
  resourceList = resTempArray.toList

  /**
   * Uses reflection to find the resources declared on the component.
   * When several accessors share a name, the one with the most specific
   * return type is kept; each resource is named after its accessor.
   */
  protected def introspectResources(comp: Configurable, methods: Array[Method])(f: (Method, TypedResource[_, _]) ⇒ Any): Unit = {
    val potentialResources = methods.toList.filter(isResource)

    // Group candidate accessor methods by name.
    val map: Map[String, List[Method]] = potentialResources.foldLeft[Map[String, List[Method]]](Map()) {
      case (map, method) ⇒
        val name = method.getName
        map + (name -> (method :: map.getOrElse(name, Nil)))
    }

    // Per name, keep the method with the most specific return type.
    val realMeth = map.values.map(_.sortWith {
      case (a, b) ⇒ !a.getReturnType().isAssignableFrom(b.getReturnType)
    }).map(_.head)

    for (v ← realMeth) {
      v.invoke(comp) match {
        case mf: TypedResource[_, _] ⇒
          mf.setName_!(v.getName)
          f(v, mf)
        case _ ⇒
      }
    }
  }

  /** Returns all resources declared on this component. */
  def resources = resourceList.map(_.resource(this))

  /**
   * Resolves every declared resource from the UIMA context and binds it.
   *
   * @throws ResourceInitializationException when lookup fails, a mandatory
   *         resource is missing, or binding fails
   */
  def loadResources(context: UimaContext) = {
    resources.foreach { r ⇒
      var value: Object = null;
      try {
        value = context.getResourceObject(r.name)
      } catch {
        case e: Exception ⇒ throw new ResourceInitializationException(e)
      }

      // Locators are unwrapped to the actual resource they point to.
      if (value.isInstanceOf[ExternalResourceLocator]) {
        value = value.asInstanceOf[ExternalResourceLocator].getResource()
      }

      if (r.mandatory_? && value == null) {
        throw new ResourceInitializationException(new IllegalStateException("Mandatory resource '%s' is not set".format(r.name)))
      }

      if (value != null) {
        r.setFromUima(value) match {
          case Left(f: Failure) ⇒ throw f.exception.map(new ResourceInitializationException(_)).getOrElse(new ResourceInitializationException())
          case _ ⇒
        }
      }
    }
  }

  /** Checks if a method's return type is a subclass of TypedResource. */
  def isResource(m: Method) =
    !m.isSynthetic && classOf[TypedResource[_, _]].isAssignableFrom(m.getReturnType)

  // Couples a resource's name and accessor so the live TypedResource
  // instance can be re-read from any Configurable instance.
  case class ResourceHolder(name: String, method: Method, metaParameter: TypedResource[_, _]) {
    def resource(inst: Configurable): TypedResource[_, _] = method.invoke(inst).asInstanceOf[TypedResource[_, _]]
  }
}
84 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/package.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala
5 |
6 | import org.apache.uima.jcas.tcas.Annotation
7 | import org.apache.uima.jcas.JCas
8 | import org.apache.uima.jcas.cas.FSArray
9 | import com.github.jenshaase.uimascala.core.wrapper._
10 | import com.github.jenshaase.uimascala.core.configuration._
11 | import org.apache.uima.collection.CollectionReader
12 |
/**
 * Core package object: implicit conversions used across the library.
 */
package object core {

  /** Enriches UIMA annotations with trim/isEmpty helpers. */
  implicit def toScalaAnnotation(a: Annotation) = new AnnotationWrapper(a)

  /** Enriches a JCas with typed create/select helpers. */
  implicit def toScalaCas(jcas: JCas) = new JCasWrapper(jcas)

  /** Enables the fluent `config(...)` syntax on configurable components. */
  implicit def configBuilder[T <: Configurable](conf: T) = new ConfigurationBuilder(conf)

  @deprecated("See com.github.jenshaase.uimascala.core.SimplePipeline", "0.5.0")
  implicit def collectionReaderToPipeline(reader: SCasCollectionReader_ImplBase) = new SimplePipeline(reader.asCollectionReader)

  @deprecated("See com.github.jenshaase.uimascala.core.SimplePipeline", "0.5.0")
  implicit def collectionReaderToPipeline(reader: CollectionReader) = new SimplePipeline(reader)
}
27 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/stream/annotators.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.core.stream
2 |
3 | import scala.util.matching.Regex
4 | import org.apache.uima.jcas.tcas.Annotation
5 | import scala.reflect.ClassTag
6 | import com.github.jenshaase.uimascala.core._
7 | import org.apache.uima.jcas.JCas
8 |
trait annotators {

  /**
   * Splits the document text at every match of the given separator pattern
   * and annotates the text between matches as tokens of type T.
   *
   * @param pattern the separator pattern
   * @param allowEmptyToken when true, zero length tokens are annotated
   *        between adjacent separators
   */
  @deprecated("Use com.github.jenshaase.uimascala.segmenter.RegexTokenizer")
  def regexTokenizer[F[_], T <: Annotation](pattern: Regex, allowEmptyToken: Boolean = true)(implicit cf: ClassTag[T]) =
    annotate[F] { cas: JCas =>
      val txt = cas.getDocumentText

      // Fold over all separator matches; `last` is the end of the previous
      // separator, i.e. the start offset of the next token.
      val mostlyAll = pattern.findAllMatchIn(txt).foldLeft(0) {
        case (last, m) if ((allowEmptyToken && m.start >= last) || (!allowEmptyToken && m.start > last)) ⇒
          cas.annotate[T](last, m.start)
          m.end
        case (_, m) =>
          m.end
      }

      // Trailing text after the last separator becomes the final token.
      if (mostlyAll < txt.length)
        cas.annotate[T](mostlyAll, txt.length)
    }

  /**
   * Tokenizes on whitespace runs.
   *
   * Bug fix: this previously delegated to regexTokenizer[F, Annotation],
   * silently discarding the requested token type T (its ClassTag was
   * unused) and annotating plain Annotations instead of T.
   */
  @deprecated("Use com.github.jenshaase.uimascala.segmenter.WhitespaceTokenizer")
  def whitespaceTokenizer[F[_], T <: Annotation](allowEmptyToken: Boolean = true)(implicit cf: ClassTag[T]) =
    regexTokenizer[F, T]("\\s+".r, allowEmptyToken)

  /** Removes every T annotation whose covered text is a stopword. */
  def removeStopwords[F[_], T <: Annotation](isStopword: String => Boolean)(implicit cf: ClassTag[T]) =
    annotate[F] { cas: JCas =>
      cas.select[T].
        filter { token => isStopword(token.getCoveredText) }.
        foreach { token => token.removeFromIndexes() }
    }

  /** Adds a Stopword annotation over every Token whose text is a stopword. */
  def annotateStopwords[F[_], Token <: Annotation, Stopword <: Annotation](isStopword: String => Boolean)
    (implicit ct: ClassTag[Token], cs: ClassTag[Stopword]) =
    annotate[F] { cas: JCas =>
      cas.select[Token].foreach { token =>
        if (isStopword(token.getCoveredText)) {
          cas.annotate[Stopword](token.getBegin, token.getEnd)
        }
      }
    }
}
49 |
50 | object annotators extends annotators
51 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/stream/package.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import org.apache.uima.jcas.tcas.Annotation
7 | import org.apache.uima.analysis_engine.AnalysisEngine;
8 | import org.apache.uima.analysis_engine.AnalysisEngineDescription;
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine
10 | import org.apache.uima.jcas.JCas
11 | import org.apache.uima.util.CasCreationUtils
12 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
13 | import fs2._
14 |
package object stream {

  /** A stream transformation over JCas documents. */
  type AnnotatorProcess[F[_]] = Pipe[F, JCas, JCas]

  /** Lifts a side effecting JCas function into a pipe; each CAS is passed on. */
  def annotate[F[_]](f: (JCas => Any)): AnnotatorProcess[F] =
    _.map { jcas =>
      f(jcas)
      jcas
    }

  /** Runs an analysis engine on every CAS flowing through the pipe. */
  def annotate[F[_]](a: AnalysisEngine): AnnotatorProcess[F] =
    _.map { jcas =>
      a.process(jcas)
      jcas
    }

  /** Instantiates the engine described by `a` and runs it on every CAS. */
  def annotate[F[_]](a: AnalysisEngineDescription): AnnotatorProcess[F] =
    annotate(createEngine(a))

  /** Converts the component to an engine and runs it on every CAS. */
  def annotate[F[_]](a: AsAnalysisEngine): AnnotatorProcess[F] =
    annotate(a.asAnalysisEngine)

  /** Builds a pipe that creates a fresh JCas for every input element. */
  def initCas[F[_], I](f: ((I, JCas) => Any)): Pipe[F, I, JCas] =
    _.map { input =>
      val jcas = CasCreationUtils.createCas(
        TypeSystemDescriptionFactory.createTypeSystemDescription, null, null).getJCas

      f(input, jcas)
      jcas
    }

  /** Creates a JCas per input string, using the string as document text. */
  def casFromText[F[_]] = initCas[F, String] { (text, jcas) =>
    jcas.setDocumentText(text)
  }

  /** Maps every JCas to a value extracted from it. */
  def extractCas[F[_], I](f: JCas => I): Pipe[F, JCas, I] =
    _.map(f)
}
53 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/wrapper/AnnotationWrapper.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.wrapper
5 |
6 | import org.apache.uima.jcas.tcas.Annotation
7 |
8 | /**
9 | * A Uima Annotation wrapper for implicity.
10 | * @author Jens Haase
11 | */
/**
 * A Uima Annotation wrapper for implicity.
 * @author Jens Haase
 */
class AnnotationWrapper(a: Annotation) {

  /**
   * Remove whitespace before and after the annotation
   * by increasing/decreasing the begin/end value.
   *
   * NOTE(review): begin is bounded by the document length and end by 0,
   * not by each other, so trimming an all-whitespace span can move begin
   * past end — confirm this is the intended behavior.
   */
  def trim: Annotation = {
    var begin = a.getBegin
    var end = a.getEnd - 1 // inclusive index of the last character

    val data = a.getCAS.getDocumentText

    while (begin < (data.length - 1) && trimChar(data.charAt(begin)))
      begin += 1

    while (end > 0 && trimChar(data.charAt(end)))
      end -= 1

    end += 1 // back to the exclusive end offset
    a.setBegin(begin)
    a.setEnd(end)

    a
  }

  /**
   * Add annotation to index if the covering text
   * of the annotation is not empty
   */
  def addToIndexIfNotEmpty = if (!isEmpty) a.addToIndexes

  /**
   * Checks if the covering text of the annotation
   * is empty
   */
  def isEmpty = a.getBegin >= a.getEnd

  /**
   * True for characters stripped by trim: common whitespace plus
   * directional marks and line/paragraph separators.
   */
  protected def trimChar(c: Char): Boolean = c match {
    case '\n' ⇒ true
    case '\r' ⇒ true
    case '\t' ⇒ true
    case '\u200E' ⇒ true // left-to-right mark
    case '\u200F' ⇒ true // right-to-left mark
    case '\u2028' ⇒ true // line separator
    case '\u2029' ⇒ true // paragraph separator
    case _ ⇒ Character.isWhitespace(c)
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/com/github/jenshaase/uimascala/core/wrapper/JCasWrapper.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.wrapper
5 |
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.cas.text.AnnotationFS
8 | import org.apache.uima.jcas.tcas.Annotation
9 | import org.apache.uima.jcas.cas.TOP
10 | import scala.collection.JavaConversions._
11 | import org.apache.uima.cas.FeatureStructure
12 | import scala.collection.JavaConversions._
13 | import collection.mutable.Buffer
14 | import org.apache.uima.fit.util.{ CasUtil, JCasUtil }
15 | import scala.reflect.ClassTag
16 |
/**
 * A JCas wrapper backing the implicit conversion in the core package,
 * providing Scala-friendly, ClassTag-driven variants of the uimaFIT
 * JCasUtil helpers.
 * @author Jens Haase
 */
class JCasWrapper(cas: JCas) {

  /** Reflectively builds a T via its single-argument (JCas) constructor. */
  private def newInstance[T](implicit cf: ClassTag[T]): T =
    cf.runtimeClass.getConstructor(classOf[JCas]).newInstance(cas).asInstanceOf[T]

  /**
   * Creates a feature structure of type T, applies the given setter
   * functions to it and adds it to the CAS indexes.
   * @return the newly created, indexed feature structure
   */
  def create[T <: TOP](f: (T => Unit)*)(implicit cf: ClassTag[T]): T = {
    val obj = newInstance[T]
    f.foreach { f => f(obj) }
    obj.addToIndexes()
    obj
  }

  /**
   * Creates an annotation of type T covering [begin, end) and adds it
   * to the CAS indexes.
   * @return the newly created, indexed annotation
   */
  def annotate[T <: Annotation](begin: Int, end: Int)(implicit cf: ClassTag[T]): T = {
    val obj = newInstance[T]
    obj.setBegin(begin)
    obj.setEnd(end)
    obj.addToIndexes()
    obj
  }

  /**
   * @see org.apache.uima.fit.util.JCasUtil#select
   */
  def select[T <: TOP](implicit cf: ClassTag[T]): Iterable[T] =
    JCasUtil.select(cas, cf.runtimeClass.asInstanceOf[Class[T]])

  /**
   * @see org.apache.uima.fit.util.JCasUtil#selectByIndex
   */
  def selectByIndex[T <: Annotation](index: Int)(implicit cf: ClassTag[T]) =
    JCasUtil.selectByIndex(cas, cf.runtimeClass.asInstanceOf[Class[T]], index)

  /**
   * @see org.apache.uima.fit.util.JCasUtil#selectCovered
   */
  def selectCovered[T <: Annotation](coveringAnnotation: Annotation)(implicit cf: ClassTag[T]) =
    JCasUtil.selectCovered(cas, cf.runtimeClass.asInstanceOf[Class[T]], coveringAnnotation)

  /**
   * @see org.apache.uima.fit.util.JCasUtil#selectCovered
   */
  def selectCovered[T <: Annotation](begin: Int, end: Int)(implicit cf: ClassTag[T]) =
    JCasUtil.selectCovered(cas, cf.runtimeClass.asInstanceOf[Class[T]], begin, end)

  /**
   * @see org.apache.uima.fit.util.JCasUtil#selectSingle
   */
  def selectSingle[T <: TOP](implicit cf: ClassTag[T]) =
    JCasUtil.selectSingle(cas, cf.runtimeClass.asInstanceOf[Class[T]])

  /**
   * @see org.apache.uima.fit.util.JCasUtil#selectPreceding
   */
  def selectPreceding[T <: Annotation](annotation: Annotation, count: Int = Int.MaxValue)(implicit cf: ClassTag[T]): Buffer[T] = {
    // java.util.List is converted to Buffer via the JavaConversions import
    JCasUtil.selectPreceding(cas, cf.runtimeClass.asInstanceOf[Class[T]], annotation, count)
  }

  /**
   * @see org.apache.uima.fit.util.JCasUtil#selectFollowing
   */
  def selectFollowing[T <: Annotation](annotation: Annotation, count: Int = Int.MaxValue)(implicit cf: ClassTag[T]): Buffer[T] = {
    JCasUtil.selectFollowing(cas, cf.runtimeClass.asInstanceOf[Class[T]], annotation, count)
  }

  /**
   * @see org.apache.uima.fit.util.JCasUtil#exists
   */
  def exists[T <: TOP](implicit ct: ClassTag[T]) =
    JCasUtil.exists(cas, ct.runtimeClass.asInstanceOf[Class[T]])

  /**
   * @see org.apache.uima.fit.util.JCasUtil#getView
   */
  def getView(name: String, fallback: JCas) =
    JCasUtil.getView(cas, name, fallback)

  /**
   * @see org.apache.uima.fit.util.JCasUtil#getView
   */
  def getView(name: String, create: Boolean) =
    JCasUtil.getView(cas, name, create)
}
102 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/ConverterSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import org.specs2._
7 | import matcher._
8 | import java.util.regex.Pattern
9 | import java.util.Locale
10 | import java.io.File
11 | import scala.util.matching.Regex
12 |
class ConverterSpec extends Specification {
  import CastFactory._

  // NOTE(review): the "convert file list" and "convert file Seq" rows each
  // appear twice — presumably copy-paste leftovers; verify before removing.
  // Fixed: stray '"' characters in the file/locale Seq descriptions.
  // format: OFF
  def is = s2"""
  The Formatter should
   convert string            ${convert("test", "test")}
   convert int               ${convert(1, 1)}
   convert float             ${convert(1.2f, 1.2f)}
   convert double            ${convert(1.2d, 1.2d)}
   convert boolean           ${convert(true, true)}
   convert locale            ${convert(new Locale("en"), "en")}
   convert pattern           ${convert(Pattern.compile("[A-Z]*"), "[A-Z]*", Some(patToString _))}
   convert regex             ${convert("[A-Z]*".r, "[A-Z]*", Some(regToString _))}
   convert file              ${convert(new File("/test/abc"), "/test/abc")}

   convert string list       ${convert(List("t", "v"), Array("t", "v"))}
   convert int list          ${convert(List(1, 2), Array(1, 2))}
   convert float list        ${convert(List(1.2f, 2.3f), Array(1.2f, 2.3f))}
   convert double list       ${convert(List(1.2d, 2.3d), Array(1.2d, 2.3d))}
   convert boolean list      ${convert(List(true, false, true), Array(true, false, true))}
   convert pattern list      ${convert(List(Pattern.compile("[A-Z]"), Pattern.compile("[1-4]")), Array("[A-Z]", "[1-4]"), Some{in: List[Pattern] => in.map(patToString)})}
   convert regex list        ${convert(List("[A-Z]".r, "[1-4]".r), Array("[A-Z]", "[1-4]"), Some{in: List[Regex] => in.map(regToString)})}
   convert file list         ${convert(List(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))}
   convert locale list       ${convert(List(new Locale("de"), new Locale("en")), Array("de", "en"))}
   convert file list         ${convert(List(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))}

   convert string Seq        ${convert(Seq("t", "v"), Array("t", "v"))}
   convert int Seq           ${convert(Seq(1, 2), Array(1, 2))}
   convert float Seq         ${convert(Seq(1.2f, 2.3f), Array(1.2f, 2.3f))}
   convert double Seq        ${convert(Seq(1.2d, 2.3d), Array(1.2d, 2.3d))}
   convert boolean Seq       ${convert(Seq(true, false, true), Array(true, false, true))}
   convert pattern Seq       ${convert(Seq(Pattern.compile("[A-Z]"), Pattern.compile("[1-4]")), Array("[A-Z]", "[1-4]"), Some{in: Seq[Pattern] => in.map(patToString)})}
   convert regex Seq         ${convert(Seq("[A-Z]".r, "[1-4]".r), Array("[A-Z]", "[1-4]"), Some{in: Seq[Regex] => in.map(regToString)})}
   convert file Seq          ${convert(Seq(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))}
   convert locale Seq        ${convert(Seq(new Locale("de"), new Locale("en")), Array("de", "en"))}
   convert file Seq          ${convert(Seq(new File("/test/a"), new File("/test/b")), Array("/test/a", "/test/b"))}

   convert string option     ${convert(Some("test"), "test")}
   convert int option        ${convert(Some(1), 1)}
   convert float option      ${convert(Some(1.2f), 1.2f)}
   convert double option     ${convert(Some(1.2d), 1.2d)}
   convert boolean option    ${convert(Some(true), true)}
   convert locale option     ${convert(Some(new Locale("en")), "en")}
   convert pattern option    ${convert(Some(Pattern.compile("[A-Z]*")), "[A-Z]*", Some(optPatToString _))}
   convert regex option      ${convert(Some("[A-Z]*".r), "[A-Z]*", Some(optRegToString _))}
   convert file option       ${convert(Some(new File("/test/abc")), "/test/abc")}
   convert none option       ${convert(None, null)}
  """


  /**
   * Round-trips a value through the converter: `in` must convert to the
   * uima representation `out`, and `out` must convert back to `in`.
   * `func` normalizes values with no usable equality (Pattern, Regex)
   * before comparing.
   */
  def convert[T, R](in: T, out: R, func: Option[T => _] = None)(implicit m: Manifest[T], r: Manifest[R]) = {
    val to = toUima(in)
    to must beRight
    to.right.get must beSome
    to.right.get.get must_== out

    val from = fromUima[T](out)
    from must beRight
    from.right.get must beSome
    func match {
      case Some(f) => f(from.right.get.get) must_== f(in)
      case None => from.right.get.get must_== in
    }
  }

  def patToString(in: Pattern) = in.pattern
  def regToString(in: Regex) = in.pattern.pattern
  def optPatToString(in: Option[Pattern]) = in.map(_.pattern)
  def optRegToString(in: Option[Regex]) = in.map(_.pattern.pattern)
}
84 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/SCasAnnotator_ImplBaseSpecs.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import org.specs2.mutable.Specification
7 | import com.github.jenshaase.uimascala.core.configuration._
8 | import org.apache.uima.jcas.JCas
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.resource.Resource_ImplBase
11 | import org.apache.uima.resource.SharedResourceObject
12 | import org.apache.uima.resource.DataResource
13 |
class SCasAnnotator_ImplBaseSpecs extends Specification {

  "SCasAnnotator_ImplBase" should {

    "initialize one string parameter in an Annotator" in {
      val d = new DummyAnnotator().config(
        _.stringParam := "dummy").asAnalysisEngine

      val cas = d.newJCas
      d.process(cas)

      cas.getDocumentText must be equalTo ("dummy")
    }

    "initialize two parameters in an Annotator" in {
      val d = new Dummy2Annotator().config(
        _.stringParam := "dummy",
        _.intParam := 1).asAnalysisEngine

      val cas = d.newJCas
      d.process(cas)

      cas.getDocumentText must be equalTo ("dummy1")
    }

    "initialize list parameter in an Annotator" in {
      val d = new Dummy3Annotator().config(
        _.listParam := List("2", "3")).asAnalysisEngine

      val cas = d.newJCas
      d.process(cas)

      cas.getDocumentText must be equalTo ("23")
    }

    "not require to set an optional value" in {
      val d = new Dummy2Annotator().config(
        _.stringParam := "dummy").asAnalysisEngine

      val cas = d.newJCas
      d.process(cas)

      // intParam falls back to its declared default (100)
      cas.getDocumentText must be equalTo ("dummy100")
    }

    "initialize an Annotator with a SharedResourceObject" in {
      val d = new ResourceDummyAnnotator().config(
        _.dict := SharedBinding[SharedDict](new java.io.File("/path/to/nowhere")),
        _.name := Binding[SharedName2]()).asAnalysisEngine
      val cas = d.newJCas
      d.process(cas)

      cas.getDocumentText() must be equalTo ("SharedDict|SharedName2")
    }
  }
}
70 |
// Fixture: annotator with a single string parameter; writes the
// parameter's value as the document text.
class DummyAnnotator extends SCasAnnotator_ImplBase {

  object stringParam extends Parameter[String]("test")

  def process(cas: JCas) =
    cas.setDocumentText(stringParam.is)
}
79 |
// Fixture: annotator with a string and an int parameter; writes their
// concatenation as the document text.
class Dummy2Annotator extends SCasAnnotator_ImplBase {

  object stringParam extends Parameter[String]("test")
  object intParam extends Parameter[Int](100)

  def process(cas: JCas) =
    cas.setDocumentText(s"${stringParam.is}${intParam.is}")
}
89 |
// Fixture: annotator with a list parameter; joins all entries into the
// document text.
class Dummy3Annotator extends SCasAnnotator_ImplBase {

  object listParam extends Parameter[List[String]](List("a", "b"))

  def process(cas: JCas) =
    cas.setDocumentText(listParam.is.mkString)
}
98 |
// Minimal SharedResourceObject fixture: load is a no-op, name identifies it
// in the spec's expected output.
class SharedDict extends SharedResourceObject {
  def load(data: DataResource) = {}

  def name = "SharedDict"
}
// Resource fixtures: SharedName2 overrides name so the spec can verify that
// the Binding's concrete class (not the declared one) is instantiated.
class SharedName extends Resource_ImplBase { def name = "SharedName" }
class SharedName2 extends SharedName { override def name = "SharedName2" }
106 |
// Fixture: annotator backed by a SharedResource and a Resource; writes
// "<dict name>|<resource name>" as the document text.
class ResourceDummyAnnotator extends SCasAnnotator_ImplBase {
  object dict extends SharedResource[SharedDict]("/path/to/nowhere")
  object name extends Resource[SharedName]

  def process(cas: JCas) =
    cas.setDocumentText(s"${dict.resource.name}|${name.resource.name}")
}
115 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/SimplePipelineSpecs.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core
5 |
6 | import org.specs2.mutable.Specification
7 | import org.apache.uima.jcas.JCas
8 | import org.apache.uima.util.ProgressImpl
9 | import org.apache.uima.jcas.tcas.Annotation
10 |
class SimplePipelineSpecs extends Specification {

  import scala.util.control.NonFatal

  "SimplePipeline" should {
    "add one reader and one annotator" in {
      try {
        new PipelineDummyReader() ~> new PipelineAnnotatorA() run ()

        success
      } catch {
        // Catch only non-fatal throwables: a bare `case e ⇒` in a catch
        // block also swallows fatal VM errors (OutOfMemoryError etc.).
        case NonFatal(e) ⇒
          e.printStackTrace()
          failure
      }
    }

    "add one reader and two annotator" in {
      try {
        new PipelineDummyReader() ~>
          new PipelineAnnotatorA() ~>
          new PipelineAnnotatorB() run ()

        success
      } catch {
        // Print the cause instead of silently discarding it, so a failing
        // pipeline is diagnosable from the test output.
        case NonFatal(e) ⇒
          e.printStackTrace()
          failure
      }
    }
  }
}
40 |
// Fixture reader: emits `total` documents with texts "Doc2", "Doc1", …
class PipelineDummyReader extends SCasCollectionReader_ImplBase {
  val total = 2
  var i = total

  def hasNext = i > 0

  def getNext(cas: JCas) = {
    cas.setDocumentText("Doc" + i)
    i -= 1
  }

  def getProgress = Array(new ProgressImpl(total - i, total, "test"))
}
54 |
// Fixture annotator: indexes a single annotation covering [0, 1).
class PipelineAnnotatorA extends SCasAnnotator_ImplBase {
  def process(cas: JCas) = {
    val ann = new Annotation(cas, 0, 1)
    ann.addToIndexes
  }
}
60 |
// Fixture annotator: indexes a single annotation covering [1, 2).
class PipelineAnnotatorB extends SCasAnnotator_ImplBase {
  def process(cas: JCas) = {
    val ann = new Annotation(cas, 1, 2)
    ann.addToIndexes
  }
}
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ConfigurationInitalizationSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.configuration
5 |
6 | import org.specs2.Specification
7 | import com.github.jenshaase.uimascala.core.configuration._
8 |
class ConfigurationInitalizationSpec extends Specification {
  // Fixed: the descriptions named 'optStringParam'/'optIntParam' while the
  // checks actually test 'stringListParam'/'intListParam'.
  def is = s2"""
  This is a specification to check the configuration system

  ConfigMock should

  have 4 parameters                            ${nbParams(4)}
  have a parameter called 'stringParam'        ${hasParam("stringParam")}
  have a parameter called 'stringListParam'    ${hasParam("stringListParam")}
  have a parameter called 'intParam'           ${hasParam("intParam")}
  have a parameter called 'intListParam'       ${hasParam("intListParam")}
  """

  /** True when ConfigMock's discovered parameters contain `name`. */
  def hasParam(name: String) =
    new ConfigMock().parameters.map(_.name).contains(name) must beTrue

  /** Checks the number of discovered parameters. */
  def nbParams(count: Int) =
    new ConfigMock().parameters.size must be equalTo (count)

  // TODO(review): this example is not referenced from `is` above, so it
  // never runs — wire it in or remove it.
  def createKeyValues = {
    val config = new ConfigMock()
    config.stringParam := "Test"
    config.intParam := 100

    // parameterKeyValues is a flat key/value sequence; pair it up and sort
    // by key for a stable comparison. (Deprecated `Pair` replaced by a tuple.)
    config.parameterKeyValues.toList.
      sliding(2, 2).map(l ⇒ (l(0).asInstanceOf[String], l(1))).toList.sortBy(_._1) must be equalTo (List(
        ("intListParam", Array(1, 2).asInstanceOf[Object]),
        ("intParam", 100.asInstanceOf[Object]),
        ("stringListParam", Array("ab", "cd").asInstanceOf[Object]),
        ("stringParam", "Test".asInstanceOf[Object])))
  }
}
41 |
// Fixture: a Configurable with four Parameter members plus one unrelated
// object, used to verify that parameter discovery finds exactly the
// Parameter instances.
class ConfigMock extends Configurable with ConfigurationInitialization {
  object stringParam extends Parameter[String]("test")
  object stringListParam extends Parameter[List[String]](List("ab", "cd"))

  object intParam extends Parameter[Int](1)
  object intListParam extends Parameter[List[Int]](List(1, 2))

  // Deliberately NOT a Parameter: must not be picked up by discovery.
  object somethingElse {
    def someMethod = "Anything"
  }
}
53 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ParameterSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.configuration
5 |
6 | import org.specs2.Specification
7 | import com.github.jenshaase.uimascala.core.configuration._
8 | import java.util.regex.Pattern
9 |
class ParameterSpec extends Specification {

  // Fixed typos in the spec text: "if not value" -> "if no value",
  // "mutli" -> "multi".
  // format: OFF
  def is = s2"""
  A Parameter can
   return default values if no value is set  ${defaultVal}
   have a new value                          ${newVal}
   be set from uima                          ${fromUima}
   be converted to uima                      ${toUima}
   be multi valued                           ${multiVal}
   be single valued                          ${singleVal}
   have a correct uima type                  ${uimaType}
  """

  /** A parameter without an assignment reports its default. */
  def defaultVal = {
    object param extends Parameter[String]("a")
    param.is must_== "a"
  }

  /** `:=` replaces the default value. */
  def newVal = {
    object param extends Parameter[String]("a")
    param := "b"
    param.is must_== "b"
  }

  /** A uima string value is converted back into the parameter's type. */
  def fromUima = {
    object param extends Parameter[Pattern](Pattern.compile("[A-Z]"))
    param.setFromUimaType("[1-4]")
    param.is.pattern must_== "[1-4]"
  }

  /** The parameter value converts to its uima representation. */
  def toUima = {
    object param extends Parameter[Pattern](Pattern.compile("[A-Z]"))
    param.toUimaType must beRight
    param.toUimaType.right.get must_== "[A-Z]"
  }

  /** List, Seq and Array parameters are all multi valued. */
  def multiVal = {
    object l extends Parameter[List[String]](List("b"))
    object s extends Parameter[Seq[String]](Seq("b"))
    object a extends Parameter[Array[String]](Array("b"))

    l.multiValued_? must_== true
    s.multiValued_? must_== true
    a.multiValued_? must_== true
  }

  /** A scalar parameter is not multi valued. */
  def singleVal = {
    object param extends Parameter[String]("b")
    param.multiValued_? must_== false
  }

  /** Element type (scalar or of the collection) maps to the uima type name. */
  def uimaType = {
    object p1 extends Parameter[Pattern](Pattern.compile("[A-Z]"))
    object p2 extends Parameter[String]("a")
    object p3 extends Parameter[Float](1.2f)
    object p4 extends Parameter[Boolean](true)
    object p5 extends Parameter[Int](2)

    object p6 extends Parameter[List[Pattern]](List(Pattern.compile("[A-Z]")))
    object p7 extends Parameter[List[String]](List("a"))
    object p8 extends Parameter[List[Float]](List(1.2f))
    object p9 extends Parameter[List[Boolean]](List(true))
    object p10 extends Parameter[List[Int]](List(2))

    p1.uimaType must_== "String"
    p2.uimaType must_== "String"
    p3.uimaType must_== "Float"
    p4.uimaType must_== "Boolean"
    p5.uimaType must_== "Integer"

    p6.uimaType must_== "String"
    p7.uimaType must_== "String"
    p8.uimaType must_== "Float"
    p9.uimaType must_== "Boolean"
    p10.uimaType must_== "Integer"
  }
}
88 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ResourceInitializationSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.configuration
5 |
6 | import org.specs2.Specification
7 | import org.apache.uima.resource.SharedResourceObject
8 | import org.apache.uima.resource.DataResource
9 | import org.apache.uima.resource.Resource_ImplBase
10 |
// Verifies that ResourceInitialization discovers all Resource/SharedResource
// members declared on a Configurable (see ResourceMock below). The four
// "(todo)" examples are pending and intentionally have no body.
class ResourceInitializationSpec extends Specification {
  def is = s2"""
  This specification describes the resource initialization

  The ResourceMock class should
   have 4 resource objects                ${nbResource(4)}
   have a resource called 'dictionary'    ${hasResource("dictionary")}
   have a resource called 'name'          ${hasResource("name")}
   have a resource called 'stopwords'     ${hasResource("stopwords")}
   have a resource called 'optName'       ${hasResource("optName")}
   return the correct dictionary resource (todo)
   return the correct name resource (todo)
   return the correct stopwords resource (todo)
   return the correct optName resource (todo)
  """

  // Checks the number of discovered resource members.
  def nbResource(count: Int) =
    new ResourceMock().resources.size must be equalTo (count)

  // True when a discovered resource member has the given name.
  def hasResource(name: String) =
    new ResourceMock().resources.map(_.name).contains(name) must beTrue

}
34 |
// Fixture: a Configurable declaring two SharedResource and two Resource
// members; resource discovery should find all four by member name.
class ResourceMock extends Configurable with ResourceInitialization {

  object dictionary extends SharedResource[SharedDict]("/path/to/noWhere")

  object name extends Resource[SharedName]

  object stopwords extends SharedResource[SharedStopword]("/path/to/noWhere")

  object optName extends Resource[SharedOptName]
}
45 |
// Minimal Resource_ImplBase fixtures used by ResourceMock above.
class SharedName extends Resource_ImplBase {
  def name = "myName"
}

class SharedOptName extends Resource_ImplBase {
  def name = "myOptName"
}
53 |
// SharedResourceObject fixtures used by ResourceMock above.
// NOTE(review): `load` evaluates data.getUri.toString and discards nothing
// explicitly — the string becomes the method's result. Verify this matches
// the (void) signature SharedResourceObject.load expects.
class SharedDict extends SharedResourceObject {

  def load(data: DataResource) =
    data.getUri.toString

  def getDict = "dict"
}

class SharedStopword extends SharedResourceObject {

  def load(data: DataResource) =
    data.getUri.toString

  def getStopword = "stopword"
}
69 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/configuration/ResourceSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.configuration
5 |
6 | import java.util.regex.Pattern
7 | import com.github.jenshaase.uimascala.core.configuration._
8 | import org.apache.uima.resource.Resource_ImplBase
9 | import org.apache.uima.resource.SharedResourceObject
10 | import org.apache.uima.resource.DataResource
11 | import org.specs2.Specification
12 |
// NOTE(review): this file's package is com.github.jenshaase.uimascala.configuration
// while its path suggests ...uimascala.core.configuration — confirm intended.
class ResourceSpec extends Specification {

  // Fixed: "a interface name" -> "an interface name" in both spec texts.
  // format: OFF
  def is = s2"""
  A Resource should
   return default parameters    ${defaultParams}
   convert parameter to a list  ${paramToList}
   bind a resource              ${bind}
   bind a resource from Uima    ${bindUima}
   be set by a Binding          ${setBinding}
   return a class name          ${className}
   return an interface name     ${interfaceName}

  A SharedResource should
   return default parameters    ${sharedDefaultParams}
   convert parameter to a list  ${sharedParamToList}
   bind a resource              ${sharedBind}
   bind a resource from Uima    ${sharedBindUima}
   be set by a Binding          ${sharedSetBinding}
   return a class name          ${sharedClassName}
   return an interface name     ${sharedInterfaceName}
  """

  def defaultParams = {
    object r extends Resource[DummyRes](Map("a" -> "b"))
    r.params must_== Map("a" -> "b")
  }

  // parameterList flattens the params map into alternating key/value entries
  def paramToList = {
    object r extends Resource[DummyRes](Map("a" -> "b"))
    r.parameterList must_== Seq("a", "b")
  }

  def bind = {
    object r extends Resource[DummyRes]()
    val o = new DummyRes()
    r.bind(o)

    r.resource must_== o
  }

  def bindUima = {
    object r extends Resource[DummyRes]()
    val o = new DummyRes()
    r.setFromUima(o)

    r.resource must_== o
  }

  def setBinding = {
    object r extends Resource[DummyRes](Map("a" -> "b"))
    r := Binding(Map("c" -> "d"))
    r.params must_== Map("c" -> "d")
  }

  def className = {
    object r extends Resource[DummyRes](Map("a" -> "b"))
    r.className.getName must_== classOf[DummyRes].getName
  }

  def interfaceName = {
    object r extends Resource[DummyRes](Map("a" -> "b"))
    r.interfaceName must_== classOf[DummyRes].getName
  }

  class DummyRes extends Resource_ImplBase { def name = "DummyRes" }


  // Shared Resource

  def sharedDefaultParams = {
    object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object]))
    r.url must_== "/test/data"
    r.params must_== Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object])
  }

  def sharedParamToList = {
    object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object]))
    r.parameterList must_== Seq("a", "b")
  }

  def sharedBind = {
    object r extends SharedResource[DummyShared]("/test/data")
    val o = new DummyShared()
    r.bind(o)

    r.resource must_== o
  }

  def sharedBindUima = {
    object r extends SharedResource[DummyShared]("/test/data")
    val o = new DummyShared()
    r.setFromUima(o)

    r.resource must_== o
  }

  // a SharedBinding replaces both the url and the params
  def sharedSetBinding = {
    object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object]))
    r := SharedBinding("/abc/def", Map("c".asInstanceOf[Object] -> "d".asInstanceOf[Object]))
    r.url must_== "/abc/def"
    r.params must_== Map("c".asInstanceOf[Object] -> "d".asInstanceOf[Object])
  }

  def sharedClassName = {
    object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object]))
    r.className.getName must_== classOf[DummyShared].getName
  }

  def sharedInterfaceName = {
    object r extends SharedResource[DummyShared]("/test/data", Map("a".asInstanceOf[Object] -> "b".asInstanceOf[Object]))
    r.interfaceName must_== classOf[DummyShared].getName
  }

  class DummyShared extends SharedResourceObject {
    def load(data: DataResource) = {}
    def name = "SharedDict"
  }
}
132 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/stream/annotatorsSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.core.stream
2 |
3 | import org.specs2.mutable._
4 | import fs2._
5 | import org.apache.uima.jcas.tcas.Annotation
6 | import com.github.jenshaase.uimascala.core._
7 |
// Exercises the fs2-based stream annotators: text is lifted into a CAS,
// whitespace-tokenized, and results are extracted per document.
class annotateSpec extends Specification {

  import annotators._

  "Annotators" should {

    // pipeline: raw text -> CAS -> whitespace token annotations
    def tokenizeText[F[_]] =
      casFromText[F] andThen whitespaceTokenizer[F, Annotation](false)

    "tokenize a document" in {
      val p = Stream.pure("this is a text", " and another text ").
        through(tokenizeText).
        through(extractCas { cas =>
          // drop(1) presumably skips the document-spanning annotation that
          // every CAS carries — TODO confirm against JCasWrapperSpec
          cas.select[Annotation].drop(1).map(_.getCoveredText).toList
        })

      p.toList must be equalTo (List(
        List("this", "is", "a", "text"),
        List("and", "another", "text")
      ))
    }

    "remove stopwords" in {
      val p = Stream.pure("this is a text", " and another text ").
        through(tokenizeText).
        through(removeStopwords[Pure, Annotation](s => Set("is", "a").contains(s))).
        through(extractCas { cas =>
          cas.select[Annotation].drop(1).map(_.getCoveredText).toList
        })

      p.toList must be equalTo (List(
        List("this", "text"),
        List("and", "another", "text")
      ))
    }
  }
}
45 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/util/Helper.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.util
5 |
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.util.CasCreationUtils
8 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
9 |
10 | /**
11 | * @author Jens Haase
12 | */
13 |
trait Helper {

  /** Builds a fresh JCas backed by the auto-detected type system. */
  def newJCas: JCas = {
    val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription
    CasCreationUtils.createCas(typeSystem, null, null).getJCas
  }
}
21 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/wrapper/AnnotationWrapperSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.wrapper
5 |
6 | import org.specs2.mutable.Specification
7 | import org.apache.uima.util.CasCreationUtils
8 | import org.apache.uima.fit.factory.{ TypePrioritiesFactory, TypeSystemDescriptionFactory }
9 | import org.apache.uima.jcas.JCas
10 | import org.apache.uima.jcas.tcas.Annotation
11 | import com.github.jenshaase.uimascala.core._
12 | import util.Helper
13 |
/**
 * Exercises the implicit AnnotationWrapper helpers (trim, isEmpty).
 * @author Jens Haase
 */
class AnnotationWrapperSpec extends Specification with Helper {

  "Annotation Wrapper" should {

    "trim an annotation" in {
      val cas = newJCas
      cas.setDocumentText("This is text")

      val a = new Annotation(cas, 4, 8)
      a.getCoveredText must be equalTo (" is ")
      a.trim.getCoveredText must be equalTo ("is")
    }

    "check if an annotation is empty" in {
      new Annotation(newJCas, 0, 0).isEmpty must beTrue
      new Annotation(newJCas, 0, 1).isEmpty must beFalse
    }
  }
}
36 |
--------------------------------------------------------------------------------
/core/src/test/scala/com/github/jenshaase/uimascala/core/wrapper/JCasWrapperSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.core.wrapper
5 |
6 | import org.specs2.mutable.Specification
7 | import com.github.jenshaase.uimascala.core._
8 | import util.Helper
9 | import org.apache.uima.jcas.tcas.Annotation
10 | import org.apache.uima.jcas.JCas
11 |
/**
 * Exercises the implicit JCasWrapper select/annotate helpers.
 * @author Jens Haase
 */
class JCasWrapperSpec extends Specification with Helper {

  class Token(cas: JCas, begin: Int, end: Int) extends Annotation(cas, begin, end)

  "JCasWrapper" should {

    "select annotation of same type" in {
      val cas = newJCas
      cas.setDocumentText("This is a text")

      new Annotation(cas, 0, 4).addToIndexes
      new Annotation(cas, 5, 7).addToIndexes

      // Note: One Annotation and one DocumentAnnotation are default
      // to each new JCas
      cas.select[Annotation].size must be equalTo (3)
    }

    "select annotation by index" in {
      val cas = newJCas
      cas.setDocumentText("This is a text")

      new Annotation(cas, 0, 4).addToIndexes

      cas.selectByIndex[Annotation](1).getCoveredText must be equalTo ("This")
    }

    "select all annotations covered by another annotation" in {
      val cas = newJCas
      cas.setDocumentText("This is a text")

      val a1 = new Annotation(cas, 0, 4)
      a1.addToIndexes
      val a2 = new Annotation(cas, 0, 1)
      a2.addToIndexes
      val a3 = new Annotation(cas, 1, 2)
      a3.addToIndexes

      cas.selectCovered[Annotation](a1).size must be equalTo (2)
      cas.selectCovered[Annotation](a1).get(0).getCoveredText must be equalTo ("T")
    }

    "select a single annotation" in {
      val cas = newJCas
      cas.setDocumentText("This is a text")

      cas.selectSingle[Annotation].getCoveredText must be equalTo ("This is a text")
    }

    "select all preceding annotations" in {
      val cas = newJCas
      cas.setDocumentText("This is a text")

      val a1 = new Annotation(cas, 0, 4)
      a1.addToIndexes
      val a2 = new Annotation(cas, 5, 7)
      a2.addToIndexes
      val a3 = new Annotation(cas, 8, 9)
      a3.addToIndexes

      val p1 = cas.selectPreceding[Annotation](a2, 1)
      p1.size must be equalTo (1)
      p1.head.getCoveredText must be equalTo (a1.getCoveredText)
    }

    "select all following annotations" in {
      val cas = newJCas
      cas.setDocumentText("This is a text")

      val a1 = new Annotation(cas, 0, 4)
      a1.addToIndexes
      val a2 = new Annotation(cas, 5, 7)
      a2.addToIndexes
      val a3 = new Annotation(cas, 8, 9)
      a3.addToIndexes

      val p1 = cas.selectFollowing[Annotation](a2, 1)
      p1.size must be equalTo (1)
      p1.head.getCoveredText must be equalTo (a3.getCoveredText)
    }

    "checks if an annotation type exists" in {
      val cas = newJCas
      cas.setDocumentText("This is a text")

      cas.exists[Annotation] must beTrue
    }
  }
}
104 |
--------------------------------------------------------------------------------
/language-identification/n-gram-language-identifier/src/main/scala/com/github/jenshaase/uimascala/languageidentifier/NGramLanguageIdentifier.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.languageidentifier
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import scala.collection.JavaConversions._
8 | import com.optimaize.langdetect.text.CommonTextObjectFactories
9 | import com.optimaize.langdetect.ngram.NgramExtractors
10 | import com.optimaize.langdetect.profiles._
11 | import com.optimaize.langdetect._
12 |
class NGramLanguageIdentifier extends SCasAnnotator_ImplBase {

  // When true, use the text-object factory tuned for short snippets
  // (queries, tweets); when false (default), the one for larger documents.
  object shortText extends Parameter[Boolean](false)

  // Built lazily and only once: reading all built-in profiles is expensive.
  lazy val languageDetector = {
    val languageProfiles = new LanguageProfileReader().readAllBuiltIn()
    LanguageDetectorBuilder.create(NgramExtractors.standard())
      .withProfiles(languageProfiles)
      .build()
  }

  /**
   * Detects the document language and stores it via setDocumentLanguage.
   * The language is left untouched when detection yields no confident result.
   */
  def process(jcas: JCas) = {
    // Bug fix: the branches were inverted — `shortText` previously selected
    // the large-text factory and the default selected the short-text one.
    val textObjectFactory =
      if (shortText.is) {
        CommonTextObjectFactories.forDetectingShortCleanText()
      } else {
        CommonTextObjectFactories.forDetectingOnLargeText()
      }

    val text = textObjectFactory.forText(jcas.getDocumentText);
    val lang = languageDetector.detect(text)
    if (lang.isPresent()) {
      jcas.setDocumentLanguage(lang.get().toString)
    }
  }
}
39 |
--------------------------------------------------------------------------------
/language-identification/n-gram-language-identifier/src/test/scala/com/github/jenshaase/uimascala/languageidentifier/NGramLanguageIdentifierSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.languageidentifier
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 | import org.specs2.mutable.Specification
8 |
class NGramLanguageIdentifierSpec extends Specification {

  /** Builds an engine, processes `text` and returns the detected language. */
  private def detect(text: String, shortText: Boolean = false): String = {
    val identifier = new NGramLanguageIdentifier()
    val analyser: AnalysisEngine =
      if (shortText) identifier.config(_.shortText := true).asAnalysisEngine
      else identifier.asAnalysisEngine

    val jcas = analyser.newJCas()
    jcas.setDocumentText(text)
    analyser.process(jcas)
    jcas.getDocumentLanguage
  }

  "The ngram language idenifier" should {
    "detect the german language" in {
      detect("Das ist ein Text in deutscher Sprache") must be equalTo ("de")
    }

    "detect the english language" in {
      detect("This is a english text with so information.") must be equalTo ("en")
    }

    "detect the german language in short text snippets" in {
      detect("Das ist ein Text in deutscher Sprache", shortText = true) must be equalTo ("de")
    }

    "detect the english language in short text snippets" in {
      detect("This is a english text with so information.", shortText = true) must be equalTo ("en")
    }
  }
}
53 |
--------------------------------------------------------------------------------
/lemmatizer/mate-lemmatizer/src/main/scala/com/github/jenshaase/uimascala/lemmatizer/MateLemmatizer.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.lemmatizer
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.resource.SharedResourceObject
8 | import org.apache.uima.resource.DataResource
9 | import scala.collection.JavaConversions._
10 | import is2.data.SentenceData09
11 | import is2.io.CONLLReader09
12 | import is2.io.IOGenerals
13 | import is2.lemmatizer.Lemmatizer
14 |
class MateLemmatizerResource extends SharedResourceObject {
  private var lemmatizer: Lemmatizer = _

  /**
   * Loads the mate-tools lemmatizer model. The URI is first tried as a file
   * system path; otherwise it is resolved from the classpath and copied to a
   * temporary file, because the Lemmatizer constructor only accepts paths.
   */
  def load(data: DataResource) {
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      lemmatizer = new Lemmatizer(uri)
    } else {
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)
      // Bug fix: fail fast with a clear message instead of an opaque
      // NullPointerException when the model is on neither disk nor classpath.
      if (resource == null) {
        throw new java.io.FileNotFoundException(
          "Lemmatizer model not found on file system or classpath: " + uri)
      }

      val file = java.io.File.createTempFile("mate-lemmatizer", ".temp")
      file.deleteOnExit();

      val source = resource.openStream();
      try {
        java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
      } finally {
        source.close();
      }

      lemmatizer = new Lemmatizer(file.getAbsolutePath)
    }
  }

  def getLemmatizer = lemmatizer
}
43 |
class MateLemmatizer extends SCasAnnotator_ImplBase {

  // Shared lemmatizer model; bound via the engine configuration.
  object model extends SharedResource[MateLemmatizerResource]("")

  /** Adds one Lemma annotation per token of every sentence. */
  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val sentenceTokens = jcas.selectCovered[Token](sentence).toVector

      // mate-tools expects the artificial ROOT entry in front of the words.
      val data = new SentenceData09()
      data.init((IOGenerals.ROOT +: sentenceTokens.map(_.getCoveredText)).toArray)

      val predicted = model.resource.getLemmatizer.apply(data).plemmas
      for ((predictedLemma, idx) <- predicted.zipWithIndex) {
        val token = sentenceTokens(idx)

        val lemma = new Lemma(jcas, token.getBegin, token.getEnd)
        lemma.setValue(predictedLemma)
        add(lemma)

        token.setLemma(lemma)
      }
    }
  }
}
67 |
68 |
--------------------------------------------------------------------------------
/lemmatizer/mate-lemmatizer/src/test/scala/com/github/jenshaase/uimascala/lemmatizer/MateLemmatizerSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.lemmatizer
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import com.github.jenshaase.uimascala.core.configuration._
7 | import org.apache.uima.analysis_engine.AnalysisEngine
8 | import org.specs2.mutable.Specification
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.util.JCasUtil
11 |
class MateLemmatizerSpec extends Specification {

  "MateLemmatizer" should {
    "lemmatize each word in a sentence" in {
      // Bind the shared model resource from the classpath and build an engine.
      val tagger: AnalysisEngine = new MateLemmatizer().
        config(
          _.model := SharedBinding[MateLemmatizerResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/lemmatizer-de-tiger.model")
        ).
        asAnalysisEngine

      // Two sentences with two tokens each; offsets refer to the text below
      // (punctuation is left outside the sentence/token spans).
      val jcas = tagger.newJCas()
      jcas.setDocumentText("Hallo Welt! Was geht?")
      jcas.annotate[Sentence](0, 10)
      jcas.annotate[Sentence](12, 20)
      jcas.annotate[Token](0, 5)
      jcas.annotate[Token](6, 10)
      jcas.annotate[Token](12, 15)
      jcas.annotate[Token](16, 20)
      tagger.process(jcas)

      // Expect one Lemma per token, spanning exactly the token's text.
      jcas.select[Lemma].size must be equalTo(4)
      jcas.selectByIndex[Lemma](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[Lemma](1).getCoveredText must be equalTo ("Welt")
      jcas.selectByIndex[Lemma](2).getCoveredText must be equalTo ("Was")
      jcas.selectByIndex[Lemma](3).getCoveredText must be equalTo ("geht")
    }
  }
}
40 |
--------------------------------------------------------------------------------
/name-entity-recognizer/stanford-ner/src/main/scala/com/github/jenshaase/uimascala/ner/StanfordNer.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.ner
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.resource.SharedResourceObject
8 | import org.apache.uima.resource.DataResource
9 | import edu.stanford.nlp.ling.TaggedWord
10 | import edu.stanford.nlp.ie.crf.CRFClassifier
11 | import edu.stanford.nlp.util.CoreMap
12 | import edu.stanford.nlp.ling.CoreLabel
13 | import edu.stanford.nlp.ling.CoreAnnotations
14 | import scala.collection.JavaConversions._
15 | import java.util.zip.GZIPInputStream
16 |
17 |
class StanfordNerResource extends SharedResourceObject {
  private var tagger: CRFClassifier[CoreMap] = _

  /**
   * Loads the CRF classifier model. The URI is first tried as a file system
   * path; otherwise it is resolved from the classpath, transparently
   * decompressing models whose URI ends in ".gz".
   */
  def load(data: DataResource) {
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      tagger = CRFClassifier.getClassifier[CoreMap](new java.io.File(uri))
    } else {
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)
      // Bug fix: fail fast with a clear message instead of an opaque
      // NullPointerException when the model is on neither disk nor classpath.
      if (resource == null) {
        throw new java.io.FileNotFoundException(
          "NER model not found on file system or classpath: " + uri)
      }

      val is = if (uri.endsWith(".gz")) {
        new GZIPInputStream(resource.openStream)
      } else {
        resource.openStream
      }

      // Bug fix: the stream was previously leaked; the classifier reads the
      // model eagerly, so the stream can be closed right after loading.
      try {
        tagger = CRFClassifier.getClassifier[CoreMap](is)
      } finally {
        is.close()
      }
    }
  }

  def getTagger = tagger
}
42 |
class StanfordNer extends SCasAnnotator_ImplBase {

  object model extends SharedResource[StanfordNerResource]("")

  /**
   * Runs the Stanford CRF classifier over every sentence and creates one
   * NamedEntity annotation per maximal run of equally-tagged tokens.
   *
   * The fold state is (begin, end, currentType): the character span and tag
   * of the entity currently being collected, or None while outside an entity.
   */
  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val tokens = jcas.selectCovered[Token](sentence).toVector

      val finalState = model.resource.getTagger.
        classifySentence(tokens.map(tokenToCoreLabel _)).
        foldLeft[(Int, Int, Option[String])]((-1, -1, None)) { case ((begin, end, currentType), taggedWord) =>
          val tokenType = taggedWord.get(classOf[CoreAnnotations.AnswerAnnotation])
          val tokenBegin = taggedWord.get(classOf[CoreAnnotations.CharacterOffsetBeginAnnotation])
          val tokenEnd = taggedWord.get(classOf[CoreAnnotations.CharacterOffsetEndAnnotation])

          (tokenType, currentType) match {
            // An untagged token closes the current entity.
            case ("O", Some(b)) =>
              addNamedEntity(jcas, begin, end, b)
              (begin, end, None)

            // The tag switches directly from one entity type to another:
            // close the old entity and start the new one at this token.
            // (Bug fix: the new span previously kept the old `begin`.)
            case (a, Some(b)) if (a != b) =>
              addNamedEntity(jcas, begin, end, b)
              (tokenBegin, tokenEnd, Some(tokenType))

            // First token of a new entity.
            case (a, None) if (a != "O") =>
              (tokenBegin, tokenEnd, Some(tokenType))

            // Entity continues: extend the current span.
            case (a, Some(b)) if (a == b) =>
              (begin, tokenEnd, Some(tokenType))

            // Outside any entity; nothing to do.
            case ("O", None) =>
              (begin, end, currentType)
          }
        }

      // Bug fix: an entity reaching the end of the sentence was silently
      // dropped because no following token ever closed it.
      finalState match {
        case (begin, end, Some(entityType)) =>
          addNamedEntity(jcas, begin, end, entityType)
        case _ => ()
      }
    }
  }

  /** Creates, fills and indexes a NamedEntity annotation. */
  private def addNamedEntity(jcas: JCas, begin: Int, end: Int, entityType: String) {
    val namedEntity = new NamedEntity(jcas, begin, end)
    namedEntity.setValue(entityType)
    add(namedEntity)
  }

  /** Converts a Token annotation into a CoreLabel for the classifier. */
  def tokenToCoreLabel(token: Token): CoreLabel = {
    val word = new CoreLabel()
    word.setValue(token.getCoveredText)
    word.setOriginalText(token.getCoveredText)
    word.setWord(token.getCoveredText)
    word.setBeginPosition(token.getBegin)
    word.setEndPosition(token.getEnd)

    if (token.getPos != null) {
      word.setTag(token.getPos.getName)
    }

    word
  }
}
99 |
--------------------------------------------------------------------------------
/name-entity-recognizer/stanford-ner/src/test/scala/com/github/jenshaase/uimascala/ner/StanfordNerSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.ner
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import com.github.jenshaase.uimascala.core.configuration._
7 | import org.apache.uima.analysis_engine.AnalysisEngine
8 | import org.specs2.mutable.Specification
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.util.JCasUtil
11 |
class StanfordNerSpec extends Specification {

  // Fixed: the descriptions previously read "The Stanford Parser" /
  // "add constituents", copied from the parser spec by mistake.
  "The Stanford NER tagger" should {
    "add named entities" in {
      val parser: AnalysisEngine = new StanfordNer().
        config(
          _.model := SharedBinding[StanfordNerResource]("edu/stanford/nlp/models/ner/german.dewac_175m_600.crf.ser.gz")
        ).
        asAnalysisEngine

      // One sentence; every token carries a gold POS tag.
      val jcas = parser.newJCas()
      jcas.setDocumentText("Angela Merkel fliegt nach Berlin.")
      jcas.annotate[Sentence](0, 33)
      val t1 = jcas.annotate[Token](0, 6)
      val p1 = jcas.annotate[POS](0, 6)
      p1.setName("NE")
      t1.setPos(p1)

      val t2 = jcas.annotate[Token](7, 13)
      val p2 = jcas.annotate[POS](7, 13)
      p2.setName("NE")
      t2.setPos(p2)

      val t3 = jcas.annotate[Token](14, 20)
      val p3 = jcas.annotate[POS](14, 20)
      p3.setName("VVFIN")
      t3.setPos(p3)

      val t4 = jcas.annotate[Token](21, 25)
      val p4 = jcas.annotate[POS](21, 25)
      p4.setName("APPR")
      t4.setPos(p4)

      val t5 = jcas.annotate[Token](26, 32)
      val p5 = jcas.annotate[POS](26, 32)
      p5.setName("NE")
      t5.setPos(p5)

      val t6 = jcas.annotate[Token](32, 33)
      val p6 = jcas.annotate[POS](32, 33)
      p6.setName("$.")
      t6.setPos(p6)

      parser.process(jcas)

      // Adjacent person tokens merge into one entity; "Berlin" is separate.
      val namedEntities = jcas.select[NamedEntity].toVector
      namedEntities.size must be equalTo(2)
      namedEntities(0).getCoveredText must be equalTo("Angela Merkel")
      namedEntities(0).getValue must be equalTo("I-PER")
      namedEntities(1).getCoveredText must be equalTo("Berlin")
      namedEntities(1).getValue must be equalTo("I-LOC")
    }
  }
}
66 |
--------------------------------------------------------------------------------
/parser/mate-parser/src/main/scala/com/github/jenshaase/uimascala/parser/MateParser.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.parser
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import scala.collection.JavaConversions._
8 | import org.apache.uima.resource.DataResource
9 | import org.apache.uima.resource.SharedResourceObject
10 | import is2.data.SentenceData09
11 | import is2.io.CONLLReader09
12 | import is2.io.IOGenerals
13 | import is2.parser.Options
14 | import is2.parser.Parser
15 |
class MateParserResource extends SharedResourceObject {
  private var parser: Parser = _

  /**
   * Loads the mate-tools parser model. The URI is first tried as a file
   * system path; otherwise it is resolved from the classpath and copied to a
   * temporary file, because the Parser options only accept file paths.
   */
  def load(data: DataResource) {
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      parser = new Parser(new Options(Array("-model", uri)))
    } else {
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)
      // Bug fix: fail fast with a clear message instead of an opaque
      // NullPointerException when the model is on neither disk nor classpath.
      if (resource == null) {
        throw new java.io.FileNotFoundException(
          "Parser model not found on file system or classpath: " + uri)
      }

      val file = java.io.File.createTempFile("mate-parser", ".temp")
      file.deleteOnExit();

      val source = resource.openStream();
      try {
        java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
      } finally {
        source.close();
      }

      parser = new Parser(new Options(Array("-model", file.getAbsolutePath)))
    }
  }

  def getParser = parser
}
44 |
class MateParser extends SCasAnnotator_ImplBase {

  // Shared parser model; bound via the engine configuration.
  object model extends SharedResource[MateParserResource]("")

  /**
   * Parses every sentence and creates one Dependency annotation per token;
   * tokens whose predicted head is the artificial root get a DependencyRoot.
   */
  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val sentenceTokens = jcas.selectCovered[Token](sentence).toVector

      // mate-tools expects the artificial ROOT entry in front of each column.
      val data = new SentenceData09()
      data.init((IOGenerals.ROOT +: sentenceTokens.map(_.getCoveredText)).toArray)
      data.setLemmas((IOGenerals.ROOT_LEMMA +: sentenceTokens.map { t =>
        if (t.getLemma != null) t.getLemma.getValue() else "_"
      }).toArray)
      data.setPPos((IOGenerals.ROOT_POS +: sentenceTokens.map(_.getPos.getName())).toArray)

      val parsed = model.resource.getParser.apply(data)

      for (i <- parsed.labels.indices) {
        // Predicted head indices are 1-based; 0 denotes the artificial root.
        if (parsed.pheads(i) != 0) {
          val governor = sentenceTokens(parsed.pheads(i) - 1)
          val dependent = sentenceTokens(i)

          val dep = new Dependency(jcas)
          dep.setGovernor(governor)
          dep.setDependent(dependent)
          dep.setDependencyType(parsed.plabels(i))
          dep.setBegin(dep.getDependent().getBegin())
          dep.setEnd(dep.getDependent().getEnd())
          dep.addToIndexes()
        } else {
          val rootToken = sentenceTokens(i)

          val dep = new DependencyRoot(jcas)
          dep.setGovernor(rootToken)
          dep.setDependent(rootToken)
          dep.setDependencyType(parsed.plabels(i))
          dep.setBegin(dep.getDependent().getBegin())
          dep.setEnd(dep.getDependent().getEnd())
          dep.addToIndexes()
        }
      }
    }
  }
}
96 |
--------------------------------------------------------------------------------
/parser/mate-parser/src/test/scala/com/github/jenshaase/uimascala/parser/MateParserSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.parser
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.typesystem._
5 | import com.github.jenshaase.uimascala.core.configuration._
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 | import org.specs2.mutable.Specification
8 | import org.apache.uima.fit.factory.AnalysisEngineFactory
9 | import org.apache.uima.fit.util.JCasUtil
10 |
class MateParserSpec extends Specification {

  "The Mate Parser" should {
    "add dependencies" in {
      // Bind the shared model resource from the classpath and build an engine.
      val parser: AnalysisEngine = new MateParser().
        config(
          _.model := SharedBinding[MateParserResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/parser-de-tiger.model")
        ).
        asAnalysisEngine

      // One sentence; every token carries a gold POS tag as parser input.
      val jcas = parser.newJCas()
      jcas.setDocumentText("Wie alt bist du?")
      jcas.annotate[Sentence](0, 16)
      val t1 = jcas.annotate[Token](0, 3)
      val p1 = jcas.annotate[POS](0, 3)
      p1.setName("PWAV")
      t1.setPos(p1)

      val t2 = jcas.annotate[Token](4, 7)
      val p2 = jcas.annotate[POS](4, 7)
      p2.setName("ADJD")
      t2.setPos(p2)

      val t3 = jcas.annotate[Token](8, 12)
      val p3 = jcas.annotate[POS](8, 12)
      p3.setName("VAFIN")
      t3.setPos(p3)

      val t4 = jcas.annotate[Token](13, 15)
      val p4 = jcas.annotate[POS](13, 15)
      p4.setName("PPER")
      t4.setPos(p4)

      val t5 = jcas.annotate[Token](15, 16)
      val p5 = jcas.annotate[POS](15, 16)
      p5.setName("$.")
      t5.setPos(p5)

      parser.process(jcas)

      // One dependency per token; each spans its dependent token's text.
      val dependencies = jcas.select[Dependency].toVector
      dependencies(0).getCoveredText must be equalTo ("Wie")
      dependencies(0).getGovernor.getCoveredText must be equalTo ("alt")
      dependencies(0).getDependent.getCoveredText must be equalTo ("Wie")
      dependencies(0).getDependencyType must be equalTo ("MO")

      dependencies(1).getCoveredText must be equalTo ("alt")
      dependencies(1).getGovernor.getCoveredText must be equalTo ("bist")
      dependencies(1).getDependent.getCoveredText must be equalTo ("alt")
      dependencies(1).getDependencyType must be equalTo ("PD")

      // The root token governs itself (DependencyRoot).
      dependencies(2).getCoveredText must be equalTo ("bist")
      dependencies(2).getGovernor.getCoveredText must be equalTo ("bist")
      dependencies(2).getDependent.getCoveredText must be equalTo ("bist")
      dependencies(2).getDependencyType must be equalTo ("--")

      dependencies(3).getCoveredText must be equalTo ("du")
      dependencies(3).getGovernor.getCoveredText must be equalTo ("bist")
      dependencies(3).getDependent.getCoveredText must be equalTo ("du")
      dependencies(3).getDependencyType must be equalTo ("SB")

      dependencies(4).getCoveredText must be equalTo ("?")
      dependencies(4).getGovernor.getCoveredText must be equalTo ("du")
      dependencies(4).getDependent.getCoveredText must be equalTo ("?")
      dependencies(4).getDependencyType must be equalTo ("--")
    }
  }
}
79 |
--------------------------------------------------------------------------------
/parser/stanford-parser/src/main/scala/com/github/jenshaase/uimascala/parser/StanfordParser.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.parser
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import scala.collection.JavaConversions._
8 | import edu.stanford.nlp.parser.common.ParserGrammar
9 | import java.io._
10 | import java.util.zip.GZIPInputStream
11 | import org.apache.uima.resource.DataResource
12 | import org.apache.uima.resource.SharedResourceObject
13 | import edu.stanford.nlp.ling.CoreLabel
14 | import edu.stanford.nlp.trees.Tree
15 | import org.apache.uima.jcas.tcas.Annotation
16 | import org.apache.uima.jcas.cas.FSArray
17 | import org.apache.uima.util.Level.WARNING
18 |
// Shared resource holding a Stanford ParserGrammar, loaded once from the
// configured model URI and shared between engine instances.
class StanfordParserGrammerResource extends SharedResourceObject {
  private var parser: ParserGrammar = _

  // Loads the grammar from the resource URI. NOTE(review): loadModel resolves
  // the path itself; unlike the other resources in this project there is no
  // explicit classpath/temp-file fallback here — confirm this is intended.
  def load(data: DataResource) {
    parser = ParserGrammar.loadModel(data.getUri.toString)
  }

  def getParserGrammer = parser
}
28 |
// Supported values for the StanfordParser `mode` parameter: which typed
// dependency representation is extracted from the parse tree.
object DependencyMode {
  val BASIC = "BASIC"
  val NON_COLLAPSED = "NON_COLLAPSED"
  val COLLAPSED = "COLLAPSED"
  val COLLAPSED_WITH_EXTRA = "COLLAPSED_WITH_EXTRA"
  val CC_PROPAGATED = "CC_PROPAGATED"
  val CC_PROPAGATED_NO_EXTRA = "CC_PROPAGATED_NO_EXTRA"
  val TREE = "TREE"
}
38 |
class StanfordParser extends SCasAnnotator_ImplBase {

  object model extends SharedResource[StanfordParserGrammerResource]("")
  // How dependencies are extracted; one of the DependencyMode values.
  object mode extends Parameter[String](DependencyMode.BASIC)
  // If true, existing POS annotations are passed to the parser as tags.
  object readPOS extends Parameter[Boolean](true) {
    override def mandatory_? = false
  }
  // If true, POS annotations are created from the parser's pre-terminals.
  // NOTE(review): this object shares its name with the `createPOS` method
  // below — confirm each use site resolves to the intended member.
  object createPOS extends Parameter[Boolean](false) {
    override def mandatory_? = false
  }

  /** Parses each sentence and adds constituent and dependency annotations. */
  def process(jcas: JCas) = {
    val parser = model.resource.getParserGrammer

    jcas.select[Sentence].foreach { sentence =>
      val tokens = jcas.selectCovered[Token](sentence).toVector

      val query = parser.parserQuery()
      query.parse(tokens.map(tokenToCoreLabel _))
      val parseTree = query.getBestParse()
      parseTree.setSpans()

      doCreateConstituentAnnotation(jcas, tokens, parseTree, None)
      doCreateDependencyAnnotation(jcas, parser, parseTree, tokens)
    }
  }

  /** Converts a Token annotation into a CoreLabel for the parser. */
  def tokenToCoreLabel(token: Token): CoreLabel = {
    val word = new CoreLabel()
    word.setValue(token.getCoveredText)
    word.setOriginalText(token.getCoveredText)
    word.setWord(token.getCoveredText)
    word.setBeginPosition(token.getBegin)
    word.setEndPosition(token.getEnd)

    if (readPOS.is && token.getPos != null) {
      word.setTag(token.getPos.getName)
    }

    word
  }

  /**
   * Recursively mirrors the parse tree as annotations: phrasal nodes become
   * Constituent annotations, pre-terminals become POS tags attached to their
   * covering token. Returns the annotation created for `node`.
   */
  def doCreateConstituentAnnotation(jcas: JCas, tokens: Vector[Token], node: Tree, parent: Option[Annotation]): Annotation = {
    val nodeLabelValue = node.value()
    val source = tokens.get(node.getSpan().getSource)
    val target = tokens.get(node.getSpan().getTarget)

    if (node.isPhrasal) {
      val constituent = createConstituent(jcas, source.getBegin, target.getEnd, nodeLabelValue)
      parent.foreach { p => constituent.setParent(p) }

      val childAnnotations = node.
        getChildrenAsList().
        map(doCreateConstituentAnnotation(jcas, tokens, _, Some(constituent)))

      // Collect the recursively created children into an FSArray.
      val children = childAnnotations.zipWithIndex.
        foldLeft(new FSArray(jcas, childAnnotations.size())) { case (fsArray, (ann, idx)) =>
          fsArray.set(idx, ann)
          fsArray
        }

      constituent.setChildren(children)
      add(constituent)
      constituent
    } else if (node.isPreTerminal) {
      val pos = createPOS(jcas, source.getBegin, target.getEnd, nodeLabelValue)
      val coveredToken = jcas.selectCovered[Token](pos)
      require(coveredToken.size == 1)
      val token = coveredToken.get(0)

      if (createPOS.is) {
        add(pos)
        token.setPos(pos)
      }

      parent.foreach { p =>
        token.setParent(p)
      }

      token
    } else {
      // Bug fix: the previous message ("must be either phrasal nor
      // pre-terminal") stated the opposite of the failed invariant.
      throw new Exception("Node is neither phrasal nor pre-terminal: " + nodeLabelValue)
    }
  }

  /** Creates (but does not index) a Constituent annotation. */
  def createConstituent(jcas: JCas, begin: Int, end: Int, constituentType: String) = {
    val c = new Constituent(jcas, begin, end)
    c.setConstituentType(constituentType)
    c
  }

  /** Creates (but does not index) a POS annotation. */
  def createPOS(jcas: JCas, begin: Int, end: Int, name: String) = {
    val p = new POS(jcas, begin, end)
    p.setName(name)
    p
  }

  /**
   * Extracts typed dependencies from the parse tree according to `mode` and
   * adds Dependency/DependencyRoot annotations. Models without dependency
   * support only trigger a warning in the log.
   */
  def doCreateDependencyAnnotation(jcas: JCas, parser: ParserGrammar, parseTree: Tree, tokens: Seq[Token]) {
    try {
      val gs = parser.getTLPParams().getGrammaticalStructure(
        parseTree,
        parser.treebankLanguagePack().punctuationWordRejectFilter(),
        parser.getTLPParams().typedDependencyHeadFinder()
      )

      val dependencies = mode.is match {
        case DependencyMode.BASIC => gs.typedDependencies()
        case DependencyMode.NON_COLLAPSED => gs.allTypedDependencies()
        case DependencyMode.COLLAPSED => gs.typedDependenciesCollapsed(false)
        case DependencyMode.COLLAPSED_WITH_EXTRA => gs.typedDependenciesCollapsed(true)
        case DependencyMode.CC_PROPAGATED => gs.typedDependenciesCCprocessed(true)
        case DependencyMode.CC_PROPAGATED_NO_EXTRA => gs.typedDependenciesCCprocessed(false)
        case DependencyMode.TREE => gs.typedDependenciesCollapsedTree()
        case _ => throw new Exception("DependencyMode not supported: " + mode.is)
      }

      dependencies.foreach { currTypedDep =>
        // Indices are 1-based; 0 denotes the artificial root node.
        val govIndex = currTypedDep.gov().index();
        val depIndex = currTypedDep.dep().index();

        // Cleanup: the result of this if/else was previously bound to an
        // unused `val dep` with asymmetric branch values.
        if (govIndex != 0) {
          val govToken = tokens(govIndex - 1)
          val depToken = tokens(depIndex - 1)

          val dep = new Dependency(jcas)
          dep.setDependencyType(currTypedDep.reln().toString());
          dep.setGovernor(govToken);
          dep.setDependent(depToken);
          dep.setBegin(dep.getDependent().getBegin());
          dep.setEnd(dep.getDependent().getEnd());
          dep.addToIndexes();
        } else {
          val depToken = tokens(depIndex - 1);

          val dep = new DependencyRoot(jcas);
          dep.setDependencyType(currTypedDep.reln().toString());
          dep.setGovernor(depToken);
          dep.setDependent(depToken);
          dep.setBegin(dep.getDependent().getBegin());
          dep.setEnd(dep.getDependent().getEnd());
          dep.addToIndexes();
        }
      }
    } catch {
      case e: UnsupportedOperationException =>
        getContext().getLogger().log(WARNING, "Current model does not seem to support dependencies.");
    }
  }
}
191 |
--------------------------------------------------------------------------------
/parser/stanford-parser/src/test/scala/com/github/jenshaase/uimascala/parser/StanfordParserSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.parser
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import com.github.jenshaase.uimascala.core.configuration._
7 | import org.apache.uima.analysis_engine.AnalysisEngine
8 | import org.specs2.mutable.Specification
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.util.JCasUtil
11 |
class StanfordParserSpec extends Specification {

  "The Stanford Parser" should {
    "add constituents" in {
      // Bind the shared grammar resource from the classpath and build an engine.
      val parser: AnalysisEngine = new StanfordParser().
        config(
          _.model := SharedBinding[StanfordParserGrammerResource]("edu/stanford/nlp/models/srparser/germanSR.ser.gz")
        ).
        asAnalysisEngine

      // One sentence; every token carries a gold POS tag as parser input.
      val jcas = parser.newJCas()
      jcas.setDocumentText("Wie alt bist du?")
      jcas.annotate[Sentence](0, 16)
      val t1 = jcas.annotate[Token](0, 3)
      val p1 = jcas.annotate[POS](0, 3)
      p1.setName("PWAV")
      t1.setPos(p1)

      val t2 = jcas.annotate[Token](4, 7)
      val p2 = jcas.annotate[POS](4, 7)
      p2.setName("ADJD")
      t2.setPos(p2)

      val t3 = jcas.annotate[Token](8, 12)
      val p3 = jcas.annotate[POS](8, 12)
      p3.setName("VAFIN")
      t3.setPos(p3)

      val t4 = jcas.annotate[Token](13, 15)
      val p4 = jcas.annotate[POS](13, 15)
      p4.setName("PPER")
      t4.setPos(p4)

      val t5 = jcas.annotate[Token](15, 16)
      val p5 = jcas.annotate[POS](15, 16)
      p5.setName("$.")
      t5.setPos(p5)

      parser.process(jcas)

      // Constituents mirror the parse tree; ROOT has a null parent and the
      // parent/child links form a consistent hierarchy.
      val constituents = jcas.select[Constituent].toVector
      constituents(0).getBegin must be equalTo (0)
      constituents(0).getEnd must be equalTo (16)
      constituents(0).getConstituentType must be equalTo ("S")
      constituents(0).getChildren.size must be equalTo (4)
      constituents(0).getParent must be equalTo(constituents(1))

      constituents(1).getBegin must be equalTo (0)
      constituents(1).getEnd must be equalTo (16)
      constituents(1).getConstituentType must be equalTo ("ROOT")
      constituents(1).getChildren.size must be equalTo (1)
      constituents(1).getParent must beNull

      constituents(2).getBegin must be equalTo (0)
      constituents(2).getEnd must be equalTo (7)
      constituents(2).getConstituentType must be equalTo ("AP")
      constituents(2).getChildren.size must be equalTo (2)
      constituents(2).getParent must be equalTo(constituents(0))

      // Each token points to its immediate parent constituent.
      val tokens = jcas.select[Token].toVector
      tokens(0).getParent must be equalTo(constituents(2))
      tokens(1).getParent must be equalTo(constituents(2))
      tokens(2).getParent must be equalTo(constituents(0))
      tokens(3).getParent must be equalTo(constituents(0))
      tokens(4).getParent must be equalTo(constituents(0))
    }
  }
}
80 |
--------------------------------------------------------------------------------
/part-of-speech-tagger/ark-tweet-pos-tagger/src/main/scala/com/github/jenshaase/uimascala/pos/ArkTweetPosTagger.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.pos
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem.{Token, POS}
6 | import org.apache.uima.jcas.JCas
7 | import cmu.arktweetnlp.Twokenize
8 | import scala.collection.JavaConversions._
9 | import cmu.arktweetnlp.impl.Model
10 | import cmu.arktweetnlp.impl.features.FeatureExtractor
11 | import org.apache.uima.UimaContext;
12 | import cmu.arktweetnlp.impl.ModelSentence
13 | import cmu.arktweetnlp.impl.Sentence
14 |
class ArkTweetPosTagger extends SCasAnnotator_ImplBase {

  // File system path of the ark-tweet-nlp text model to load.
  object modelLocation extends Parameter[String]("")

  private var model: Model = _
  private var featureExtractor: FeatureExtractor = _

  /** Loads the model and feature extractor once per engine instance. */
  override def initialize(context: UimaContext) {
    super.initialize(context)

    model = Model.loadModelFromText(modelLocation.is)
    featureExtractor = new FeatureExtractor(model, false);
  }

  /** Tags every Token in the CAS with a POS annotation. */
  def process(jcas: JCas) = {
    val annotatedTokens = jcas.select[Token].toVector

    // Build an ark-tweet-nlp sentence from the token texts and decode it.
    val sentence = new Sentence()
    sentence.tokens = annotatedTokens.map(_.getCoveredText)
    val modelSentence = new ModelSentence(sentence.T())
    featureExtractor.computeFeatures(sentence, modelSentence)
    model.greedyDecode(modelSentence, false)

    // Attach one POS annotation per token, in token order.
    for ((token, idx) <- annotatedTokens.zipWithIndex) {
      val tagName = model.labelVocab.name( modelSentence.labels(idx) );

      val pos = new POS(jcas, token.getBegin, token.getEnd)
      pos.setName(tagName)
      add(pos)

      token.setPos(pos)
    }
  }

  def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
52 |
--------------------------------------------------------------------------------
/part-of-speech-tagger/ark-tweet-pos-tagger/src/test/scala/com/github/jenshaase/uimascala/pos/ArkTweetPosTaggerSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.pos
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 | import org.specs2.mutable.Specification
8 | import org.apache.uima.fit.factory.AnalysisEngineFactory
9 | import org.apache.uima.fit.util.JCasUtil
10 |
class ArkTweetPosTaggerSpec extends Specification {

  "Ark Tweet Pos Tagger" should {
    "add POS tags" in {
      val modelPath = new java.io.File(getClass.getResource("/model.20120919").toURI).getAbsolutePath
      val tagger: AnalysisEngine = new ArkTweetPosTagger().
        config(
          _.modelLocation := modelPath
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("RT @DjBlack_Pearl: wat muhfuckaz wearin 4 the lingerie party?????")

      // Pre-annotate the eleven token spans the tagger should label.
      val tokenSpans = Seq(
        (0, 2), (3, 17), (17, 18), (19, 22), (23, 32), (33, 39),
        (40, 41), (42, 45), (46, 54), (55, 60), (60, 65)
      )
      tokenSpans.foreach { case (b, e) => jcas.annotate[Token](b, e) }
      tagger.process(jcas)

      // Shorthand for the i-th POS annotation in index order.
      def posAt(i: Int) = jcas.selectByIndex[POS](i)

      jcas.select[POS].size must be equalTo(11)
      posAt(0).getCoveredText must be equalTo ("RT")
      posAt(0).getName must be equalTo ("~")
      posAt(1).getCoveredText must be equalTo ("@DjBlack_Pearl")
      posAt(1).getName must be equalTo ("@")
      posAt(2).getCoveredText must be equalTo (":")
      posAt(2).getName must be equalTo ("~")
      posAt(3).getCoveredText must be equalTo ("wat")
      posAt(3).getName must be equalTo ("O")
      posAt(4).getCoveredText must be equalTo ("muhfuckaz")
      posAt(4).getName must be equalTo ("N")
      posAt(5).getCoveredText must be equalTo ("wearin")
      posAt(5).getName must be equalTo ("V")
      posAt(6).getCoveredText must be equalTo ("4")
      posAt(6).getName must be equalTo ("P")
      posAt(7).getCoveredText must be equalTo ("the")
      posAt(7).getName must be equalTo ("D")
      posAt(8).getCoveredText must be equalTo ("lingerie")
      posAt(8).getName must be equalTo ("N")
      posAt(9).getCoveredText must be equalTo ("party")
      posAt(9).getName must be equalTo ("N")
      posAt(10).getCoveredText must be equalTo ("?????")
      posAt(10).getName must be equalTo (",")
    }
  }
}
63 |
--------------------------------------------------------------------------------
/part-of-speech-tagger/mate-pos-tagger/src/main/scala/com/github/jenshaase/uimascala/pos/MatePosTagger.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.pos
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.resource.SharedResourceObject
8 | import org.apache.uima.resource.DataResource
9 | import scala.collection.JavaConversions._
10 | import is2.data.SentenceData09
11 | import is2.io.CONLLReader09
12 | import is2.io.IOGenerals
13 | import is2.tag.Options
14 | import is2.tag.Tagger
15 |
/**
 * Shared resource that loads and holds a Mate Tools POS [[Tagger]].
 *
 * The model is resolved in two steps: if the resource URI denotes an existing
 * file it is loaded directly; otherwise the URI is looked up on the classpath
 * and copied to a temporary file (the Mate `Options` API only accepts file
 * paths).
 */
class MatePosTaggerResource extends SharedResourceObject {
  private var tagger: Tagger = _

  /**
   * Loads the tagger model described by the data resource.
   *
   * @param data the UIMA data resource whose URI names the model
   */
  def load(data: DataResource) {
    // NOTE(review): this treats the URI string as a plain file path first;
    // confirm behavior for "file:"-scheme URIs.
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      tagger = new Tagger(new Options(Array("-model", uri)))
    } else {
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)

      // Fail fast with a clear message instead of a NullPointerException
      // when the model is neither an existing file nor on the classpath.
      require(resource != null,
        "Mate POS tagger model not found as file or classpath resource: " + uri)

      // The Mate tagger can only read from a file path, so stage the
      // classpath resource into a temporary file.
      val file = java.io.File.createTempFile("mate-pos-tagger", ".temp")
      file.deleteOnExit();

      val source = resource.openStream();
      try {
        java.nio.file.Files.copy(source, file.toPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
      } finally {
        source.close();
      }

      tagger = new Tagger(new Options(Array("-model", file.getAbsolutePath)))
    }
  }

  /** The loaded tagger; only valid after [[load]] has been called. */
  def getTagger = tagger
}
44 |
/**
 * Part-of-speech tagger based on Mate Tools.
 *
 * Processes each [[Sentence]] separately, tags the covered [[Token]]s and
 * attaches one [[POS]] annotation per token.
 */
class MatePosTagger extends SCasAnnotator_ImplBase {

  /** Shared resource providing the Mate Tools tagger model. */
  object model extends SharedResource[MatePosTaggerResource]("")

  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val tokens = jcas.selectCovered[Token](sentence).toVector

      // Mate expects an artificial ROOT element in front of the word forms
      // and lemmas; missing lemmas are encoded as "_".
      val sentenceData = new SentenceData09()
      sentenceData.init(Array[String](IOGenerals.ROOT) ++ tokens.map(_.getCoveredText))
      sentenceData.setLemmas(Array[String](IOGenerals.ROOT_LEMMA) ++ tokens.map { t =>
        Option(t.getLemma).map(_.getValue()).getOrElse("_")
      })

      // Drop the ROOT tag again and attach the predicted tags to the tokens.
      val predictedTags = model.resource.getTagger.apply(sentenceData).ppos.drop(1)
      predictedTags.zipWithIndex.foreach { case (tagName, i) =>
        val token = tokens(i)

        val posAnnotation = new POS(jcas, token.getBegin, token.getEnd)
        posAnnotation.setName(tagName)
        add(posAnnotation)

        token.setPos(posAnnotation)
      }
    }
  }
}
75 |
--------------------------------------------------------------------------------
/part-of-speech-tagger/mate-pos-tagger/src/test/scala/com/github/jenshaase/uimascala/pos/MatePosTaggerSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.pos
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import com.github.jenshaase.uimascala.core.configuration._
7 | import org.apache.uima.analysis_engine.AnalysisEngine
8 | import org.specs2.mutable.Specification
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.util.JCasUtil
11 |
class MatePosTaggerSpec extends Specification {

  "MatePosTagger" should {
    "get the correct pos values" in {
      val tagger: AnalysisEngine = new MatePosTagger().
        config(
          _.model := SharedBinding[MatePosTaggerResource]("de/tudarmstadt/ukp/dkpro/core/matetools/lib/tagger-de-tiger.model")
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Wie alt bist du?")
      jcas.annotate[Sentence](0, 16)

      // One token per word plus the final question mark.
      Seq((0, 3), (4, 7), (8, 12), (13, 15), (15, 16)).foreach { case (b, e) =>
        jcas.annotate[Token](b, e)
      }

      tagger.process(jcas)

      // Shorthand for the i-th POS annotation in index order.
      def posAt(i: Int) = jcas.selectByIndex[POS](i)

      jcas.select[POS].size must be equalTo(5)
      posAt(0).getName must be equalTo ("PWAV")
      posAt(1).getName must be equalTo ("ADJD")
      posAt(2).getName must be equalTo ("VAFIN")
      posAt(3).getName must be equalTo ("PPER")
      posAt(4).getName must be equalTo ("$.")

      posAt(0).getCoveredText must be equalTo ("Wie")
      posAt(1).getCoveredText must be equalTo ("alt")
      posAt(2).getCoveredText must be equalTo ("bist")
      posAt(3).getCoveredText must be equalTo ("du")
      posAt(4).getCoveredText must be equalTo ("?")
    }
  }
}
48 |
--------------------------------------------------------------------------------
/part-of-speech-tagger/stanford-pos-tagger/src/main/scala/com/github/jenshaase/uimascala/pos/StanfordPosTagger.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.pos
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.resource.SharedResourceObject
8 | import org.apache.uima.resource.DataResource
9 | import edu.stanford.nlp.ling.TaggedWord
10 | import scala.collection.JavaConversions._
11 | import edu.stanford.nlp.tagger.maxent.MaxentTagger
12 |
/** Shared resource that loads and holds a Stanford [[MaxentTagger]] model. */
class MaxentTaggerResource extends SharedResourceObject {
  private var tagger: MaxentTagger = _

  /** Loads the tagger from the resource URI.
   *  NOTE(review): passes the raw URI string to the MaxentTagger constructor;
   *  assumed to be a file path or classpath location it can resolve — confirm
   *  for other URI schemes. */
  def load(data: DataResource) {
    tagger = new MaxentTagger(data.getUri.toString)
  }

  /** The loaded tagger; only valid after [[load]] has been called. */
  def getTagger = tagger
}
22 |
/**
 * Part-of-speech tagger based on the Stanford maxent tagger.
 *
 * Tags the [[Token]]s of each [[Sentence]] and attaches one [[POS]]
 * annotation per token.
 */
class StanfordPosTagger extends SCasAnnotator_ImplBase {

  /** Shared resource holding the maxent tagger model. */
  object model extends SharedResource[MaxentTaggerResource](MaxentTagger.DEFAULT_JAR_PATH)

  /** Optional upper bound on tokens per sentence; longer sentences are
   *  skipped. `None` (the default) disables the limit. */
  object maxTokensPerSentence extends Parameter[Option[Int]](None) {
    override def mandatory_? = false
  }

  def process(jcas: JCas) = {
    jcas.select[Sentence].foreach { sentence =>
      val sentenceTokens = jcas.selectCovered[Token](sentence)

      // Tag the sentence unless a positive token limit is configured and
      // the sentence exceeds it (non-positive limits skip every sentence,
      // matching the original case analysis).
      val shouldTag = maxTokensPerSentence.is match {
        case None    => true
        case Some(n) => n > 0 && sentenceTokens.size <= n
      }
      if (shouldTag) processTokens(jcas, sentenceTokens)
    }
  }

  /** Tags the given tokens and adds a POS annotation for each of them. */
  def processTokens(jcas: JCas, tokens: Seq[Token]) {
    val taggedWords = model.resource.getTagger.tagSentence(
      tokens.map(t => new TaggedWord(t.getCoveredText))
    )

    tokens.zipWithIndex.foreach { case (token, i) =>
      val posAnnotation = new POS(jcas, token.getBegin, token.getEnd)
      posAnnotation.setName(taggedWords.get(i).tag())
      add(posAnnotation)

      token.setPos(posAnnotation)
    }
  }
}
59 |
--------------------------------------------------------------------------------
/part-of-speech-tagger/stanford-pos-tagger/src/test/scala/com/github/jenshaase/uimascala/pos/StanfordPosTaggerSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.pos
5 |
6 | import java.util.Locale
7 | import com.github.jenshaase.uimascala.core._
8 | import com.github.jenshaase.uimascala.typesystem._
9 | import com.github.jenshaase.uimascala.core.configuration._
10 | import org.apache.uima.analysis_engine.AnalysisEngine
11 | import org.specs2.mutable.Specification
12 | import org.apache.uima.fit.factory.AnalysisEngineFactory
13 | import org.apache.uima.fit.util.JCasUtil
14 |
class StanfordPosTaggerSpec extends Specification {

  "StanfordPosTagger" should {
    "tag each word in a sentence" in {
      val tagger: AnalysisEngine = new StanfordPosTagger().
        config(
          _.model := SharedBinding[MaxentTaggerResource]("edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger")
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Hallo Welt! Was geht?")
      jcas.annotate[Sentence](0, 10)
      jcas.annotate[Sentence](12, 20)
      Seq((0, 5), (6, 10), (12, 15), (16, 20)).foreach { case (b, e) =>
        jcas.annotate[Token](b, e)
      }
      tagger.process(jcas)

      // Shorthand for the i-th POS annotation in index order.
      def posAt(i: Int) = jcas.selectByIndex[POS](i)

      jcas.select[POS].size must be equalTo(4)
      posAt(0).getCoveredText must be equalTo ("Hallo")
      posAt(1).getCoveredText must be equalTo ("Welt")
      posAt(2).getCoveredText must be equalTo ("Was")
      posAt(3).getCoveredText must be equalTo ("geht")
    }

    "tag each word in a sentence if the sentences is short enough" in {
      val tagger: AnalysisEngine = new StanfordPosTagger().
        config(
          _.model := SharedBinding[MaxentTaggerResource]("edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger"),
          _.maxTokensPerSentence := Some(2)
        ).
        asAnalysisEngine

      val jcas = tagger.newJCas()
      jcas.setDocumentText("Hallo Welt! Was geht heute?")
      jcas.annotate[Sentence](0, 10)
      jcas.annotate[Sentence](12, 26)
      // The second sentence has three tokens and must be skipped.
      Seq((0, 5), (6, 10), (12, 15), (16, 20), (21, 26)).foreach { case (b, e) =>
        jcas.annotate[Token](b, e)
      }
      tagger.process(jcas)

      jcas.select[POS].size must be equalTo(2)
      jcas.selectByIndex[POS](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[POS](1).getCoveredText must be equalTo ("Welt")
    }
  }
}
67 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
// Release automation for the build.
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.3")

// PGP signing of published artifacts.
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0")

// Make the in-repo uimaScala sbt plugin available to this build.
lazy val plugins = project in file(".") dependsOn(file("../sbt-plugin"))
6 |
--------------------------------------------------------------------------------
/sbt-plugin/build.sbt:
--------------------------------------------------------------------------------
sbtPlugin := true

organization := "com.github.jenshaase.uimascala"

libraryDependencies ++= Seq(
  "org.apache.uima" % "uimaj-tools" % "2.8.1"
)

// Sign artifacts when releasing.
releasePublishArtifactsAction := PgpKeys.publishSigned.value

// Publish snapshots and releases to the matching Sonatype OSS repository.
publishTo := {
  val nexus = "https://oss.sonatype.org/"
  if ( version.value.trim.endsWith( "SNAPSHOT" ) )
    Some( "snapshots" at nexus + "content/repositories/snapshots" )
  else
    Some( "releases" at nexus + "service/local/staging/deploy/maven2" )
}

publishMavenStyle := true

// POM metadata required by Sonatype (url, scm, developers, licenses).
// NOTE(review): the XML element tags were missing here, leaving bare text
// (which is not valid sbt/Scala); reconstructed from the surviving content.
pomExtra := (
  <url>https://github.com/jenshaase/uimaScala</url>
  <scm>
    <url>git@github.com:jenshaase/uimascala.git</url>
    <connection>scm:git:git@github.com:jenshaase/uimascala.git</connection>
  </scm>
  <developers>
    <developer>
      <id>jenshaase</id>
      <name>Jens Haase</name>
    </developer>
  </developers>
  <licenses>
    <license>
      <name>Apache 2</name>
      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
      <distribution>repo</distribution>
    </license>
  </licenses>
)
41 |
42 |
--------------------------------------------------------------------------------
/sbt-plugin/project/plugin.sbt:
--------------------------------------------------------------------------------
// Release automation for the sbt-plugin sub-build.
addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.3")

// PGP signing of published artifacts.
addSbtPlugin("com.jsuereth" % "sbt-pgp" % "1.0.0")
4 |
--------------------------------------------------------------------------------
/sbt-plugin/src/main/scala/com/github/jenshaase/uimascala/sbt/UimaSbtPlugin.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala
2 |
3 | import sbt._
4 | import Keys._
5 | import plugins._
6 |
7 | import org.apache.uima.UIMAFramework
8 | import org.apache.uima.util.{ CasCreationUtils, XMLInputSource }
9 | import org.apache.uima.cas.impl.CASImpl
10 | import org.apache.uima.tools.jcasgen._
11 |
/**
 * sbt plugin that integrates UIMA's JCasGen into a build: generates JCas
 * cover classes from XML type system descriptors and exposes tasks for the
 * interactive JCasGen tool and the CAS Visual Debugger.
 *
 * Uses the deprecated sbt 0.13 `<<=`/`<+=` settings DSL.
 */
object UimaSbtPlugin extends Plugin {

  /** Configuration scope for the UIMA-related settings. */
  val uimaConfig = config("uima")

  // Manually runnable tasks.
  val jcasGen = TaskKey[Unit]("jcasgen")
  val visualDebugger = TaskKey[Unit]("visualDebugger")

  /** Settings to add to a project: type descriptors are read from
   *  `<resources>/desc/types` and Java sources are generated into the
   *  managed source directory (and removed on `clean`). */
  def uimaScalaSettings = Seq(
    sourceDirectory in uimaConfig <<= (resourceDirectory in Compile) { _ / "desc" / "types" },
    javaSource in uimaConfig <<= (sourceManaged in Compile) { _ / "java" },
    sourceGenerators in Compile <+= generateTypeSystemSourcesTask,
    managedSourceDirectories in Compile <+= (javaSource in uimaConfig),
    cleanFiles <+= (javaSource in uimaConfig),
    jcasGen <<= jcasGenTask,
    visualDebugger <<= visualDebuggerTask
  )

  /** Task wrapper around [[generateTypeSystemSources]]. */
  def generateTypeSystemSourcesTask =
    (sourceDirectory in uimaConfig, javaSource in uimaConfig) map { (srcDir, targetDir) =>
      generateTypeSystemSources(srcDir, targetDir)
    }

  /** Runs JCasGen for every type system XML below `srcDir` and returns the
   *  generated Java files below `targetDir`. */
  def generateTypeSystemSources(srcDir: File, targetDir: File): Seq[File] = {
    (srcDir ** "*.xml").get foreach { filename =>
      val xmlIS = new XMLInputSource(filename)
      val tsd = UIMAFramework.getXMLParser.parseTypeSystemDescription(xmlIS)
      val cas = CasCreationUtils.createCas(tsd, null, null)
      val jg = new Jg()
      // Use the uimaScala templates instead of JCasGen's built-in ones.
      jg.mainGenerateAllTypesFromTemplates(
        null, new UimaLoggerProgressMonitor(), new LogThrowErrorImpl(),
        filename.getAbsolutePath, targetDir.getAbsolutePath, tsd.getTypes,
        cas.asInstanceOf[CASImpl], classOf[UimaScalaTypeTemplate],
        classOf[UimaScala_TypeTemplate], "", false, null
      )
    }

    (targetDir ** "*.java").get
  }

  /** Launches the interactive JCasGen tool (blocks until it exits). */
  def jcasGenTask =
    (streams) map { streams =>
      Run.executeTrapExit(
        (new Jg()).main0(Array[String](), null, null, new LogThrowErrorImpl()),
        streams.log
      )
      ()
    }

  /** Launches UIMA's CAS Visual Debugger (CVD). */
  def visualDebuggerTask =
    (streams) map { streams =>
      Run.executeTrapExit (
        org.apache.uima.tools.cvd.CVD.main(Array[String]()),
        streams.log
      )
      ()
    }

}
70 |
--------------------------------------------------------------------------------
/sbt-plugin/src/main/scala/com/github/jenshaase/uimascala/sbt/UimaScalaTypeTemplate.scala:
--------------------------------------------------------------------------------
1 | package org.apache.uima.tools.jcasgen
2 |
3 | import org.apache.uima.resource.metadata.TypeDescription
4 | import scala.collection.JavaConversions._
5 |
/**
 * JCasGen template producing the Java source of a type's JCas cover class
 * (`<TypeName>.java`).
 *
 * Scala port of the stock JCasGen type template; `generate` is invoked by
 * [[Jg]] with a `(Jg, TypeDescription)` pair packed into an array and must
 * return the complete Java source text. The appended string literals ARE the
 * generated output and must not be altered.
 */
class UimaScalaTypeTemplate extends Jg.IJCasTypeTemplate {

  /**
   * Renders the Java cover class for one type description.
   *
   * @param argument an `Array[Object]` holding `(Jg, TypeDescription)`,
   *                 supplied by the JCasGen driver
   * @return the generated Java source code
   */
  def generate(argument: Any): String = {
    val stringBuffer = new StringBuffer();
    stringBuffer.append("\n\n");

    val args: Array[Object] = argument.asInstanceOf[Array[Object]]
    val jg = args(0).asInstanceOf[Jg]
    val td = args(1).asInstanceOf[TypeDescription]
    jg.packageName = jg.getJavaPkg(td);

    // Emit the package declaration; a missing package is reported as a
    // JCasGen warning rather than an error.
    if (0 != jg.packageName.length()) {
      stringBuffer.append(s"""package ${jg.packageName};""");
      stringBuffer.append("\n");
    }
    else
      jg.error.newError(IError.WARN,
        jg.getString("pkgMissing", Array.apply[Object](td.getName)), null);
    stringBuffer.append("""
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JCasRegistry;
import org.apache.uima.jcas.cas.TOP_Type;

""");

    // Additional imports derived from the type's feature range types.
    jg.collectImports(td, false).foreach { imp =>
      stringBuffer.append(s"""import $imp;""");
      stringBuffer.append("\n");
    }

    stringBuffer.append("\n\n");

    val typeName = jg.getJavaName(td);
    val typeName_Type = typeName + "_Type";
    val jcasTypeCasted = "((" + typeName_Type + ")jcasType)";

    // Class header, type registration and the standard constructors.
    stringBuffer.append(s"""/** ${jg.nullBlank(td.getDescription())} */
public class ${typeName} extends ${jg.getJavaName(td.getSupertypeName())} {
  @SuppressWarnings ("hiding")
  public final static int typeIndexID = JCasRegistry.register(${typeName}.class);
  @SuppressWarnings ("hiding")
  public final static int type = typeIndexID;
  @Override
  public int getTypeIndexID() {return typeIndexID;}

  /** Never called. Disable default constructor */
  protected ${typeName}() {/* intentionally empty block */}

  /** Internal - constructor used by generator
   *
   * @param addr low level Feature Structure reference
   * @param type the type of this Feature Structure
   */
  public ${typeName}(int addr, TOP_Type type) {
    super(addr, type);
    readObject();
  }

  /**
   * @param jcas JCas to which this Feature Structure belongs
   */
  public ${typeName}(JCas jcas) {
    super(jcas);
    readObject();
  }
""");

    // Annotation subtypes additionally get a (jcas, begin, end) constructor.
    if (jg.isSubTypeOfAnnotation(td)) {
      stringBuffer.append(s"""
  /**
   * @param jcas JCas to which this Feature Structure belongs
   * @param begin offset to the begin spot in the SofA
   * @param end offset to the end spot in the SofA
  */
  public ${typeName}(JCas jcas, int begin, int end) {
    super(jcas);
    setBegin(begin);
    setEnd(end);
    readObject();
  }
""");
    }

    stringBuffer.append(s"""
  /**
   *
   * Write your own initialization here
   *
   *
  */
  private void readObject() {/*default - does nothing empty block */}

""");

    // One getter/setter pair per feature (plus indexed accessors for arrays).
    td.getFeatures().foreach { fd =>
      val featName = fd.getName();
      val featUName = jg.uc1(featName); // upper case first letter
      if (Jg.reservedFeatureNames.contains(featUName))
        jg.error.newError(IError.ERROR,
          jg.getString("reservedNameUsed", Array.apply[Object](featName, td.getName)),
          null);

      val featDesc = jg.nullBlank(fd.getDescription());
      val featDescCmt = featDesc;

      val rangeType = jg.getJavaRangeType(fd);
      val elemType = jg.getJavaRangeArrayElementType(fd);

      stringBuffer.append(s"""

  //*--------------*
  //* Feature: ${featName}

  /** getter for ${featName} - gets ${featDescCmt}
   * @return value of the feature
   */
  public ${rangeType} get${featUName}() {
    if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null)
      jcasType.jcas.throwFeatMissing("${featName}", "${td.getName}");
    return ${jg.getFeatureValue(fd, td)};}

  /** setter for ${featName} - sets ${featDescCmt}
   * @param v value to set into the feature
   */
  public void set${featUName}(${rangeType} v) {
    if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null)
      jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}");
    ${jg.setFeatureValue(fd, td)};}
""");

      if (jg.hasArrayRange(fd)) {
        stringBuffer.append(s"""
  /** indexed getter for ${featName} - gets an indexed value - ${featDescCmt}
   * @param i index in the array to get
   * @return value of the element at index i
   */
  public ${elemType} get${featUName}(int i) {
    if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null)
      jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}");
    jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ${jcasTypeCasted}.casFeatCode_${featName}), i);
    return ${jg.getArrayFeatureValue(fd, td)};}

  /** indexed setter for ${featName} - sets an indexed value - ${featDescCmt}
   * @param i index in the array to set
   * @param v value to set into the array
   */
  public void set${featUName}(int i, ${elemType} v) {
    if (${typeName_Type}.featOkTst && ${jcasTypeCasted}.casFeat_${featName} == null)
      jcasType.jcas.throwFeatMissing("${featName}", "${td.getName()}");
    jcasType.jcas.checkArrayBounds(jcasType.ll_cas.ll_getRefValue(addr, ${jcasTypeCasted}.casFeatCode_${featName}), i);
    ${jg.setArrayFeatureValue(fd, td)};}
""");
      } /* of hasArray */

      stringBuffer.append("");

    } /* of Features iteration */

    stringBuffer.append("");

    // Special-case the root annotation type: extra constructor and helpers.
    if (td.getName().equals("uima.cas.Annotation")) {
      stringBuffer.append(" ");
      stringBuffer.append(""" /** Constructor with begin and end passed as arguments
    * @param jcas JCas this Annotation is in
    * @param begin the begin offset
    * @param end the end offset
  */
  public Annotation(JCas jcas, int begin, int end) {
    this(jcas); // forward to constructor
    this.setBegin(begin);
    this.setEnd(end);
  }

  /** @see org.apache.uima.cas.text.AnnotationFS#getCoveredText()
   * @return the covered Text
   */
  public String getCoveredText() {
    final CAS casView = this.getView();
    final String text = casView.getDocumentText();
    if (text == null) {
      return null;
    }
    return text.substring(getBegin(), getEnd());
  }

  /** @deprecated
   * @return the begin offset
   */
  public int getStart() {return getBegin();}
""");
      stringBuffer.append("");
    } /* of Annotation if-statement */
    stringBuffer.append("}\n\n ");
    return stringBuffer.toString();
  }
}
202 |
--------------------------------------------------------------------------------
/sbt-plugin/src/main/scala/com/github/jenshaase/uimascala/sbt/UimaScala_TypeTemplate.scala:
--------------------------------------------------------------------------------
1 | package org.apache.uima.tools.jcasgen
2 |
3 | import org.apache.uima.resource.metadata.TypeDescription
4 | import scala.collection.JavaConversions._
5 |
/**
 * JCasGen template producing the Java source of a type's `_Type` companion
 * class (`<TypeName>_Type.java`), the low-level accessor class UIMA pairs
 * with each JCas cover class.
 *
 * Scala port of the stock JCasGen `_Type` template; invoked by [[Jg]] with a
 * `(Jg, TypeDescription)` pair. The appended string literals ARE the
 * generated output and must not be altered.
 */
class UimaScala_TypeTemplate extends Jg.IJCasTypeTemplate {

  /**
   * Renders the Java `_Type` class for one type description.
   *
   * @param argument an array holding `(Jg, TypeDescription)`, supplied by
   *                 the JCasGen driver
   * @return the generated Java source code
   */
  def generate(argument: Any): String = {
    val args: Array[Any] = argument.asInstanceOf[Array[Any]]
    val jg = args(0).asInstanceOf[Jg]
    val td = args(1).asInstanceOf[TypeDescription]
    val stringBuffer = new StringBuffer()

    jg.packageName = jg.getJavaPkg(td);
    if (0 != jg.packageName.length()) {
      stringBuffer.append("package ");
      stringBuffer.append(jg.packageName);
      stringBuffer.append(";\n");
    }
    stringBuffer.append("""
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JCasRegistry;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.FSGenerator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.impl.TypeImpl;
import org.apache.uima.cas.Type;
""");

    // Feature-related imports are only needed when the type has features.
    if (td.getFeatures().length > 0) {
      stringBuffer.append("""import org.apache.uima.cas.impl.FeatureImpl;
import org.apache.uima.cas.Feature;
""");
    }

    stringBuffer.append("");

    // Additional imports, skipping a self-import of this _Type class.
    jg.collectImports(td, true).foreach { imp =>
      if (!imp.equals(jg.getJavaNameWithPkg(td.getName()+"_Type"))) {
        stringBuffer.append(s"""import ${imp};""")
        stringBuffer.append("\n")
      }
    }

    stringBuffer.append("\n");
    val typeName = jg.getJavaName(td);
    val typeName_Type = typeName + "_Type";
    // Class header, FS generator and type registration boilerplate.
    stringBuffer.append(s"""/** ${jg.nullBlank(td.getDescription())} */
public class ${typeName_Type} extends ${jg.getJavaName(td.getSupertypeName())}_Type {
  /**
   * @return the generator for this type
   */
  @Override
  protected FSGenerator getFSGenerator() {return fsGenerator;}

  private final FSGenerator fsGenerator =
    new FSGenerator() {
      public FeatureStructure createFS(int addr, CASImpl cas) {
        if (${typeName_Type}.this.useExistingInstance) {
          // Return eq fs instance if already created
          FeatureStructure fs = ${typeName_Type}.this.jcas.getJfsFromCaddr(addr);
          if (null == fs) {
            fs = new ${typeName}(addr, ${typeName_Type}.this);
            ${typeName_Type}.this.jcas.putJfsFromCaddr(addr, fs);
            return fs;
          }
          return fs;
        } else return new ${typeName}(addr, ${typeName_Type}.this);
      }
    };

  @SuppressWarnings ("hiding")
  public final static int typeIndexID = ${typeName}.typeIndexID;

  @SuppressWarnings ("hiding")
  public final static boolean featOkTst = JCasRegistry.getFeatOkTst("${td.getName()}");
""");


    // Low-level accessors (addr-based) for every feature.
    td.getFeatures().foreach { fd =>
      val featName = fd.getName();
      val featUName = jg.uc1(featName); // upper case first letter

      val rangeType = jg.getJavaRangeType(fd);
      val getSetNamePart = jg.sc(rangeType);
      // Reference-valued features are accessed as raw int addresses.
      val returnType = if (getSetNamePart.equals("Ref")) "int" else rangeType;
      val getSetArrayNamePart = jg.getGetSetArrayNamePart(fd);

      val elemType =
        if (jg.sc(jg.getJavaRangeArrayElementType(fd)).equals("Ref")) {
          "int";
        } else {
          jg.getJavaRangeArrayElementType(fd);
        }
      val casFeatCode = "casFeatCode_" + featName;

      stringBuffer.append(s"""
  final Feature casFeat_${featName};
  final int ${casFeatCode};
  /**
   * @param addr low level Feature Structure reference
   * @return the feature value
   */
  public ${returnType} get${featUName}(int addr) {
    if (featOkTst && casFeat_${featName} == null)
      jcas.throwFeatMissing("${featName}", "${td.getName()}");
    return ll_cas.ll_get${getSetNamePart}Value(addr, ${casFeatCode});
  }
  /**
   * @param addr low level Feature Structure reference
   * @param v value to set
   */
  public void set${featUName}(int addr, ${returnType} v) {
    if (featOkTst && casFeat_${featName} == null)
      jcas.throwFeatMissing("${featName}", "${td.getName()}");
    ll_cas.ll_set${getSetNamePart}Value(addr, ${casFeatCode}, v);}

""");

      if (jg.hasArrayRange(fd)) {
        stringBuffer.append(s"""
  /**
   * @param addr low level Feature Structure reference
   * @param i index of item in the array
   * @return value at index i in the array
   */
  public ${elemType} get${featUName}(int addr, int i) {
    if (featOkTst && casFeat_${featName} == null)
      jcas.throwFeatMissing("${featName}", "${td.getName()}");
    if (lowLevelTypeChecks)
      return ll_cas.ll_get${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, true);
    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i);
    return ll_cas.ll_get${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i);
  }

  /**
   * @param addr low level Feature Structure reference
   * @param i index of item in the array
   * @param v value to set
   */
  public void set${featUName}(int addr, int i, ${elemType} v) {
    if (featOkTst && casFeat_${featName} == null)
      jcas.throwFeatMissing("${featName}", "${td.getName}");
    if (lowLevelTypeChecks)
      ll_cas.ll_set${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, v, true);
    jcas.checkArrayBounds(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i);
    ll_cas.ll_set${getSetArrayNamePart}ArrayValue(ll_cas.ll_getRefValue(addr, ${casFeatCode}), i, v);
  }
""");
      }
      stringBuffer.append(" \n");
    }

    stringBuffer.append("\n");

    // Special-case the root annotation type: low-level getCoveredText helper.
    if (td.getName().equals("uima.cas.Annotation")) {
      stringBuffer.append(" ");
      stringBuffer.append(s""" /** @see org.apache.uima.cas.text.AnnotationFS#getCoveredText()
    * @param inst the low level Feature Structure reference
    * @return the covered text
   */
  public String getCoveredText(int inst) {
    final CASImpl casView = ll_cas.ll_getSofaCasView(inst);
    final String text = casView.getDocumentText();
    if (text == null) {
      return null;
    }
    return text.substring(getBegin(inst), getEnd(inst));
  }
""");
    } /* of Annotation if-statement */

    // Constructor: registers the FS generator and resolves feature handles.
    stringBuffer.append(s"""

  /** initialize variables to correspond with Cas Type and Features
   * @param jcas JCas
   * @param casType Type
   */
  public ${typeName_Type}(JCas jcas, Type casType) {
    super(jcas, casType);
    casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator());

""");
    td.getFeatures().foreach { fd =>
      val featName = fd.getName();

      stringBuffer.append(s"""
    casFeat_${featName} = jcas.getRequiredFeatureDE(casType, "${featName}", "${fd.getRangeTypeName()}", featOkTst);
    casFeatCode_${featName} = (null == casFeat_${featName}) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_${featName}).getCode();

""");
    }
    stringBuffer.append(" }\n}\n\n\n\n ");
    return stringBuffer.toString();
  }
}
197 |
--------------------------------------------------------------------------------
/sbt-plugin/version.sbt:
--------------------------------------------------------------------------------
// Version shared by every module in this build (managed by sbt-release).
version in ThisBuild := "0.6.2-SNAPSHOT"
2 |
--------------------------------------------------------------------------------
/segmenter/ark-tweet-tokenizer/src/main/scala/com/github/jenshaase/uimascala/segmenter/ArkTweetTokenizer.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import cmu.arktweetnlp.Twokenize
8 | import scala.collection.JavaConversions._
9 |
/** Companion helpers for the Ark Tweet tokenizer. */
object ArkTweetTokenizer {

  /** Normalizes tweet text for tagging by delegating to
   * [[cmu.arktweetnlp.Twokenize.normalizeTextForTagger]]. */
  def normalizeTweet(tweet: String): String = {
    val normalized = Twokenize.normalizeTextForTagger(tweet)
    normalized
  }
}
14 |
/** Annotates every token of a tweet using CMU's Twokenize.
 *
 * Tokens are located by searching for each emitted token string in the
 * original document text, starting from the end of the previous token. */
class ArkTweetTokenizer extends SCasAnnotator_ImplBase {

  def process(jcas: JCas) = {
    val txt = jcas.getDocumentText

    Twokenize.tokenize(txt).foldLeft(0) { (offset, token) =>
      val start = txt.indexOf(token, offset);
      if (start < 0) {
        // Twokenize can emit tokens that are not literal substrings of the
        // input (e.g. normalized text). Without this guard a -1 start would
        // silently create an annotation with a negative begin offset.
        throw new IllegalStateException(
          "Text mismatch in Tokenizer: " + token + " not found after offset " + offset)
      }
      val end = start + token.length
      add(createToken(jcas, start, end))
      end
    }
  }

  /** Factory hook so subclasses can create a different annotation type. */
  def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
31 |
--------------------------------------------------------------------------------
/segmenter/ark-tweet-tokenizer/src/test/scala/com/github/jenshaase/uimascala/segmenter/ArkTweetTokenizerSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 | import org.specs2.mutable.Specification
8 | import org.apache.uima.fit.factory.AnalysisEngineFactory
9 | import org.apache.uima.fit.util.JCasUtil
10 |
/** Specs for [[ArkTweetTokenizer]]: verifies token boundaries on a sample
 * tweet (note that "&amp;" is split into three tokens: "&", "amp", ";")
 * and the static tweet-normalization helper. */
class ArkTweetTokenizerSpec extends Specification {

  "Ark Tweet Tokenizer" should {
    "annotate all tokens in a tweet" in {
      val tokenizer: AnalysisEngine = new ArkTweetTokenizer().asAnalysisEngine

      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("This is a test &amp; a thing #hash #tag bit.ly/link")
      tokenizer.process(jcas)

      // 12 tokens: the HTML entity contributes "&", "amp" and ";".
      jcas.select[Token].size must be equalTo(12)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("This")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("is")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("a")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("test")
      jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("&")
      jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("amp")
      jcas.selectByIndex[Token](6).getCoveredText must be equalTo (";")
      jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("a")
      jcas.selectByIndex[Token](8).getCoveredText must be equalTo ("thing")
      jcas.selectByIndex[Token](9).getCoveredText must be equalTo ("#hash")
      jcas.selectByIndex[Token](10).getCoveredText must be equalTo ("#tag")
      jcas.selectByIndex[Token](11).getCoveredText must be equalTo ("bit.ly/link")
    }

    "it should normalize a tweet" in {
      // Normalization replaces the "&amp;" entity with a literal "&".
      ArkTweetTokenizer.normalizeTweet("This is a test &amp; a thing #hash #tag bit.ly/link") must be equalTo (
        "This is a test & a thing #hash #tag bit.ly/link"
      )
    }
  }
}
43 |
--------------------------------------------------------------------------------
/segmenter/break-iterator-segmenter/src/main/scala/com/github/jenshaase/uimascala/segmenter/BreakIteratorSegmenter.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.segmenter
5 |
6 | import java.text.BreakIterator
7 | import java.util.Locale
8 | import com.github.jenshaase.uimascala.core._
9 | import com.github.jenshaase.uimascala.typesystem._
10 | import com.github.jenshaase.uimascala.core.configuration._
11 | import org.apache.uima.jcas.JCas
12 | import org.apache.uima.fit.descriptor.ConfigurationParameter
13 | import org.apache.uima.fit.factory.AnalysisEngineFactory
14 |
15 | /**
16 | * @author Jens Haase
17 | */
/**
 * @author Jens Haase
 *
 * Segments a document into sentences and tokens using
 * [[java.text.BreakIterator]]. The iterator locale is taken from the
 * document language when set, otherwise from the `locale` parameter.
 */
class BreakIteratorSegmenter extends SCasAnnotator_ImplBase {

  object locale extends Parameter[Locale](Locale.getDefault)

  def process(jcas: JCas) = {
    val sentenceIterator = BreakIterator.getSentenceInstance(getLocale(jcas))
    sentenceIterator.setText(jcas.getDocumentText)

    foreachSpan(sentenceIterator) { (begin, end) =>
      val sentence = addIfNotEmpty(createSentence(jcas, begin, end).trim)
      processSentence(jcas, sentence.getCoveredText, begin)
    }
  }

  /** Tokenizes a single sentence; `offset` maps sentence-relative word
   * boundaries back to document offsets. */
  def processSentence(jcas: JCas, sentence: String, offset: Int) = {
    val wordIterator = BreakIterator.getWordInstance(getLocale(jcas))
    wordIterator.setText(sentence)

    foreachSpan(wordIterator) { (begin, end) =>
      addIfNotEmpty(createToken(jcas, begin + offset, end + offset).trim)
    }
  }

  /** Invokes f with each consecutive (begin, end) boundary pair of bi. */
  private def foreachSpan(bi: BreakIterator)(f: (Int, Int) => Unit): Unit = {
    var begin = bi.first
    var end = bi.next
    while (end != BreakIterator.DONE) {
      f(begin, end)
      begin = end
      end = bi.next
    }
  }

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  /** Locale used for boundary analysis: the document language when it is
   * set and not "x-unspecified", otherwise the configured default. */
  protected def getLocale(jcas: JCas): Locale = {
    val lang = jcas.getDocumentLanguage()
    if (lang == null || lang == "x-unspecified") locale.is
    else new Locale(lang)
  }
}
66 |
--------------------------------------------------------------------------------
/segmenter/break-iterator-segmenter/src/test/scala/com/github/jenshaase/uimascala/segmenter/BreakIteratorSegmenterSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.segmenter
5 |
6 | import java.util.Locale
7 | import com.github.jenshaase.uimascala.core._
8 | import com.github.jenshaase.uimascala.typesystem._
9 | import org.apache.uima.analysis_engine.AnalysisEngine
10 | import org.specs2.mutable.Specification
11 | import org.apache.uima.fit.factory.AnalysisEngineFactory
12 | import org.apache.uima.fit.util.JCasUtil
13 |
14 | /**
15 | * @author Jens Haase
16 | */
/**
 * @author Jens Haase
 *
 * Specs for [[BreakIteratorSegmenter]]: sentence and word splitting with a
 * German default locale, and locale override via the document language.
 */
class BreakIteratorSegmenterSpec extends Specification {

  "Break Iterator" should {
    // Segmenter configured with German as the fallback locale parameter.
    val germanTokenizer: AnalysisEngine = new BreakIteratorSegmenter().config(
      _.locale := Locale.GERMAN).asAnalysisEngine

    "split german sentences" in {
      val jcas = germanTokenizer.newJCas()
      jcas.setDocumentText("Hallo, alle zusammen. Wie geht es euch?")
      germanTokenizer.process(jcas)

      jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("Hallo, alle zusammen.")
      jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("Wie geht es euch?")
    }

    "split german words" in {
      val jcas = germanTokenizer.newJCas()
      jcas.setDocumentText("Hallo, alle zusammen. Wie geht es euch?")
      germanTokenizer.process(jcas)

      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo (",")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("alle")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("zusammen")
    }

    "split english words when document language is set" in {
      val jcas = germanTokenizer.newJCas()
      jcas.setDocumentText("What's up? Once again")
      // Document language takes precedence over the configured locale,
      // so "What's" is kept as one English token.
      jcas.setDocumentLanguage("en");
      germanTokenizer.process(jcas)

      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("What's")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("up")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("?")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("Once")
      jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("again")
    }
  }
}
57 |
--------------------------------------------------------------------------------
/segmenter/lucene-tokenizer/src/main/scala/com/github/jenshaase/uimascala/segmenter/LuceneTokenizer.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import java.text.BreakIterator
8 | import org.apache.lucene.analysis.standard.StandardTokenizer
9 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute
10 |
/** Splits documents into sentences with a default-locale
 * [[java.text.BreakIterator]] and tokenizes each sentence with Lucene's
 * StandardTokenizer. */
class LuceneTokenizer extends SCasAnnotator_ImplBase {

  def process(jcas: JCas) = {
    val bi = BreakIterator.getSentenceInstance()
    bi.setText(jcas.getDocumentText)

    var last = bi.first
    var cur = bi.next
    while (cur != BreakIterator.DONE) {
      val sentence = addIfNotEmpty(createSentence(jcas, last, cur).trim)
      processSentence(jcas, sentence.getCoveredText, last)

      last = cur
      cur = bi.next
    }
  }

  /** Adds one Token per term emitted by StandardTokenizer for this
   * sentence; `offset` converts sentence-relative offsets to document
   * offsets.
   *
   * The token stream is closed in a finally block so the underlying
   * reader is released even when incrementToken throws. */
  def processSentence(jcas: JCas, sentence: String, offset: Int) = {
    val tokenizer = new StandardTokenizer()
    tokenizer.setReader(new java.io.StringReader(sentence))
    tokenizer.reset()
    try {
      while (tokenizer.incrementToken()) {
        val tokenOffset = tokenizer.getAttribute(classOf[OffsetAttribute])
        add(createToken(jcas, offset + tokenOffset.startOffset, offset + tokenOffset.endOffset))
      }
      tokenizer.end()
    } finally {
      tokenizer.close()
    }
  }

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
46 |
--------------------------------------------------------------------------------
/segmenter/lucene-tokenizer/src/test/scala/com/github/jenshaase/uimascala/segmenter/LuceneTokenizerSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 | import org.specs2.mutable.Specification
8 | import org.apache.uima.fit.factory.AnalysisEngineFactory
9 | import org.apache.uima.fit.util.JCasUtil
10 |
/** Specs for [[LuceneTokenizer]]: sentence segmentation plus Lucene
 * word tokenization (StandardTokenizer drops punctuation like ","). */
class LuceneTokenizerSpec extends Specification {

  "Lucene Tokenizer" should {
    val tokenizer: AnalysisEngine = new LuceneTokenizer().asAnalysisEngine

    "split in sentences" in {
      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("Hallo, alle zusammen. Wie geht es euch?")
      tokenizer.process(jcas)

      jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("Hallo, alle zusammen.")
      jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("Wie geht es euch?")
    }

    "split words" in {
      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("Hallo, alle zusammen. Wie geht es euch?")
      tokenizer.process(jcas)

      // Unlike BreakIterator, StandardTokenizer does not emit the comma.
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("alle")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("zusammen")
    }
  }
}
36 |
--------------------------------------------------------------------------------
/segmenter/open-nlp-segmenter/src/main/scala/com/github/jenshaase/uimascala/segmenter/OpenNlpSegmenter.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.resource.SharedResourceObject
8 | import org.apache.uima.resource.DataResource
9 | import java.util.zip.GZIPInputStream
10 | import scala.collection.JavaConversions._
11 | import opennlp.tools.sentdetect.SentenceDetectorME
12 | import opennlp.tools.sentdetect.SentenceModel
13 | import opennlp.tools.tokenize.TokenizerME
14 | import opennlp.tools.tokenize.TokenizerModel
15 |
/** Shared UIMA resource holding an OpenNLP sentence-detector model.
 *
 * The model is loaded either from a filesystem path or, if no such file
 * exists, from the classpath (with optional gzip decompression). */
class OpenNlpSentenceSegmenterResource extends SharedResourceObject {
  private var model: SentenceDetectorME = _

  def load(data: DataResource) {
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      model = new SentenceDetectorME(new SentenceModel(new java.io.File(uri)))
    } else {
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)
      if (resource == null) {
        // Fail with a descriptive error instead of an NPE on openStream.
        throw new java.io.FileNotFoundException("Sentence model not found on classpath: " + uri)
      }

      val is = if (uri.endsWith(".gz")) {
        new GZIPInputStream(resource.openStream)
      } else {
        resource.openStream
      }

      try {
        model = new SentenceDetectorME(new SentenceModel(is))
      } finally {
        // SentenceModel does not close the stream it reads from.
        is.close()
      }
    }
  }

  def getModel = model
}
40 |
/** Shared UIMA resource holding an OpenNLP tokenizer model.
 *
 * The model is loaded either from a filesystem path or, if no such file
 * exists, from the classpath (with optional gzip decompression). */
class OpenNlpTokenSegmenterResource extends SharedResourceObject {
  private var model: TokenizerME = _

  def load(data: DataResource) {
    val uri = data.getUri.toString

    if (new java.io.File(uri).exists) {
      model = new TokenizerME(new TokenizerModel(new java.io.File(uri)))
    } else {
      val resourceUri = if (uri.startsWith("/")) uri else "/" + uri
      val resource = this.getClass.getResource(resourceUri)
      if (resource == null) {
        // Fail with a descriptive error instead of an NPE on openStream.
        throw new java.io.FileNotFoundException("Token model not found on classpath: " + uri)
      }

      val is = if (uri.endsWith(".gz")) {
        new GZIPInputStream(resource.openStream)
      } else {
        resource.openStream
      }

      try {
        model = new TokenizerME(new TokenizerModel(is))
      } finally {
        // TokenizerModel does not close the stream it reads from.
        is.close()
      }
    }
  }

  def getModel = model
}
65 |
/** Annotator that segments text into sentences and tokens with OpenNLP.
 * Both model resources must be bound at configuration time (the empty
 * default path is a placeholder). */
class OpenNlpSegmenter extends SCasAnnotator_ImplBase {

  object sentenceModel extends SharedResource[OpenNlpSentenceSegmenterResource]("")
  object tokenModel extends SharedResource[OpenNlpTokenSegmenterResource]("")

  /** First detects sentence spans over the whole document text, then
   * tokenizes each sentence's covered text and shifts the sentence-relative
   * token spans back to document offsets. */
  def process(jcas: JCas) = {
    sentenceModel.resource.getModel.sentPosDetect(jcas.getDocumentText).foreach { span =>
      add(createSentence(jcas, span.getStart, span.getEnd))
    }

    jcas.select[Sentence].foreach { sentence =>
      // NOTE(review): `sentence.getStart` — a plain uima.tcas.Annotation
      // exposes getBegin; presumably the generated Sentence type (or a
      // wrapper) provides getStart as an alias — confirm against the
      // type-system code generator.
      tokenModel.resource.getModel.tokenizePos(sentence.getCoveredText).foreach { span =>
        add(createToken(jcas, span.getStart + sentence.getStart, span.getEnd + sentence.getStart))
      }
    }
  }

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)
}
89 |
--------------------------------------------------------------------------------
/segmenter/open-nlp-segmenter/src/test/scala/com/github/jenshaase/uimascala/segmenter/OpenNlpSegmenterSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.typesystem._
5 | import com.github.jenshaase.uimascala.core.configuration._
6 | import org.apache.uima.analysis_engine.AnalysisEngine
7 | import org.specs2.mutable.Specification
8 |
/** Specs for [[OpenNlpSegmenter]] using German maxent models shipped on
 * the test classpath. */
class OpenNlpSegmenterSpec extends Specification {

  "Open Nlp Segmenter" should {
    "add sentence and token annotations" in {
      // Bind both shared model resources to classpath model files.
      val segmenter: AnalysisEngine = new OpenNlpSegmenter().
        config(
          _.sentenceModel := SharedBinding[OpenNlpSentenceSegmenterResource]("/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/sentence-de-maxent.bin"),
          _.tokenModel := SharedBinding[OpenNlpTokenSegmenterResource]("/de/tudarmstadt/ukp/dkpro/core/opennlp/lib/token-de-maxent.bin")
        ).
        asAnalysisEngine

      val jcas = segmenter.newJCas()
      jcas.setDocumentText("Wie alt bist du?")
      segmenter.process(jcas)

      val sentences = jcas.select[Sentence].toVector
      sentences.size must be equalTo(1)
      sentences(0).getCoveredText must be equalTo(jcas.getDocumentText)

      val tokens = jcas.select[Token].toVector
      tokens.size must be equalTo(5)
      tokens(0).getCoveredText must be equalTo("Wie")
      tokens(1).getCoveredText must be equalTo("alt")
      tokens(2).getCoveredText must be equalTo("bist")
      tokens(3).getCoveredText must be equalTo("du")
      tokens(4).getCoveredText must be equalTo("?")
    }
  }
}
38 |
--------------------------------------------------------------------------------
/segmenter/regex-tokenizer/src/main/scala/com/github/jenshaase/uimascala/segmenter/RegexTokenizer.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import scala.util.matching.Regex
8 |
9 | /**
10 | * @author Jens Haase
11 | */
/**
 * @author Jens Haase
 *
 * Annotates as tokens the text spans between matches of a configurable
 * separator regex. When `allowEmptyToken` is set, adjacent separators
 * produce zero-length tokens.
 */
class RegexTokenizer extends SCasAnnotator_ImplBase {

  object regex extends Parameter[Regex]("""\s+""".r)
  object allowEmptyToken extends Parameter[Boolean](false)

  def process(jcas: JCas) = {
    val text = jcas.getDocumentText

    // Start of the token currently being accumulated.
    var tokenStart = 0
    getRegex.findAllMatchIn(text).foreach { m =>
      val emit =
        if (allowEmptyToken.is) m.start >= tokenStart
        else m.start > tokenStart
      if (emit) {
        add(createToken(jcas, tokenStart, m.start))
      }
      tokenStart = m.end
    }

    // Trailing text after the last separator becomes the final token.
    if (tokenStart < text.length)
      add(createToken(jcas, tokenStart, text.length))
  }

  protected def getRegex =
    regex.is

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)
}
38 |
--------------------------------------------------------------------------------
/segmenter/regex-tokenizer/src/test/scala/com/github/jenshaase/uimascala/segmenter/RegexTokenizerSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.segmenter
5 |
6 | import java.util.Locale
7 | import com.github.jenshaase.uimascala.core._
8 | import com.github.jenshaase.uimascala.typesystem._
9 | import org.apache.uima.analysis_engine.AnalysisEngine
10 | import org.specs2.mutable.Specification
11 | import org.apache.uima.fit.factory.AnalysisEngineFactory
12 | import org.apache.uima.fit.util.JCasUtil
13 |
/** Specs for [[RegexTokenizer]]: splitting on a custom separator regex,
 * with and without empty-token emission. */
class RegexTokenizerSpec extends Specification {

  "Regex Tokenizer" should {
    "split by x" in {
      val tokenizer: AnalysisEngine = new RegexTokenizer().
        config(
          _.regex := "x".r
        ).
        asAnalysisEngine

      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("HalloxWeltxlosxgehtsx")
      tokenizer.process(jcas)

      // No trailing token after the final separator by default.
      jcas.select[Token].size must be equalTo(4)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("Welt")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("los")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("gehts")
    }

    "allow empty token" in {
      val tokenizer: AnalysisEngine = new RegexTokenizer().
        config(
          _.regex := "x".r,
          _.allowEmptyToken := true
        ).
        asAnalysisEngine

      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("HalloxWeltxlosxxgehts")
      tokenizer.process(jcas)

      // The doubled "xx" yields a zero-length token between "los" and "gehts".
      jcas.select[Token].size must be equalTo(5)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("Welt")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("los")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("")
      jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("gehts")
    }
  }
}
56 |
--------------------------------------------------------------------------------
/segmenter/stanford-segmenter/src/main/scala/com/github/jenshaase/uimascala/segmenter/StanfordSegmenter.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import com.github.jenshaase.uimascala.core._
4 | import com.github.jenshaase.uimascala.core.configuration._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import org.apache.uima.jcas.JCas
7 | import org.apache.uima.resource.SharedResourceObject
8 | import org.apache.uima.resource.DataResource
9 | import edu.stanford.nlp.ling.TaggedWord
10 | import scala.collection.JavaConversions._
11 | import java.io.StringReader
12 | import java.util.Properties
13 | import edu.stanford.nlp.ling.{CoreLabel, Word}
14 | import edu.stanford.nlp.international.spanish.process.SpanishTokenizer
15 | import edu.stanford.nlp.international.arabic.process.ArabicTokenizer
16 | import edu.stanford.nlp.international.french.process.FrenchTokenizer
17 | import edu.stanford.nlp.trees.international.pennchinese.CHTBTokenizer
18 | import edu.stanford.nlp.process.{WordToSentenceProcessor, Tokenizer, PTBTokenizer, CoreLabelTokenFactory}
19 | import edu.stanford.nlp.ling.CoreAnnotations.{CharacterOffsetBeginAnnotation, CharacterOffsetEndAnnotation}
20 | import edu.stanford.nlp.trees.international.negra.NegraPennTokenizer
21 |
/** Annotator wrapping Stanford CoreNLP tokenizers for several languages.
 *
 * Tokens are created first (by a language-specific tokenizer chosen from
 * the document language), then sentences are derived from the token
 * sequence via WordToSentenceProcessor. */
class StanfordSegmenter extends SCasAnnotator_ImplBase {

  // Toggles for which annotation layers to produce.
  object annotateToken extends Parameter[Boolean](true)
  object annotateSentence extends Parameter[Boolean](true)
  // Language to fall back to when no tokenizer exists for the document
  // language; optional by design.
  object fallbackLanguage extends Parameter[Option[String]](None) {
    override def mandatory_? = false
  }

  def process(jcas: JCas) = {
    if (annotateToken.is) annotateTokens(jcas)
    if (annotateSentence.is) annotateSentences(jcas)
  }

  /** Creates Token annotations by matching tokenizer output back to the
   * document text. Offsets are tracked differently per result type:
   * CoreLabels carry their own offsets; plain Strings and Words are
   * located by scanning forward from the previous match. */
  def annotateTokens(jcas: JCas) {
    val text = jcas.getDocumentText
    val tokenizer = getTokenizer(jcas.getDocumentLanguage, text)

    var offsetInSentence = 0
    tokenizer.tokenize().foreach { token =>
      token match {
        case token: String =>
          // Skip whitespace the tokenizer swallowed, then the token must
          // appear verbatim at the current position.
          offsetInSentence = skipWhitespace(text, offsetInSentence)

          if (text.startsWith(token, offsetInSentence)) {
            add(createToken(jcas, offsetInSentence, offsetInSentence + token.size))
            offsetInSentence = offsetInSentence + token.size
          } else {
            throw new Exception("Text mismatch in Tokenizer: " + token + " not found")
          }

        case label: CoreLabel =>
          // CoreLabel knows its own character offsets; trust them.
          val begin = label.beginPosition
          val end = label.endPosition
          add(createToken(jcas, begin, end))
          offsetInSentence = end

        case word: Word =>
          val token = word.word
          offsetInSentence = skipWhitespace(text, offsetInSentence)

          if (text.startsWith(token, offsetInSentence)) {
            add(createToken(jcas, offsetInSentence, offsetInSentence + token.size))
            offsetInSentence = offsetInSentence + token.size
          } else {
            throw new Exception("Text mismatch in Tokenizer: " + token + " not found")
          }
      }
    }
  }

  /** Groups the existing Token annotations into sentences. Each sentence
   * spans from its first token's begin to its last token's end. */
  def annotateSentences(jcas: JCas) {
    val tokens = jcas.select[Token].map { token =>
      val label = new CoreLabel()
      label.setBeginPosition(token.getBegin)
      label.setEndPosition(token.getEnd)
      label.setWord(token.getCoveredText)
      label
    }.toList

    val proc = new WordToSentenceProcessor[CoreLabel]()
    proc.process(tokens).foreach { sentence =>
      add(createSentence(jcas, sentence.head.beginPosition, sentence.last.endPosition))
    }
  }

  /** Returns the first index >= offset that is not whitespace (or
   * text.size if only whitespace remains). */
  protected def skipWhitespace(text: String, offset: Int): Int = {
    var newOffset = offset
    while (newOffset < text.size && Character.isWhitespace(text.charAt(newOffset))) {
      newOffset = newOffset + 1
    }
    newOffset
  }

  protected def createToken(cas: JCas, begin: Int, end: Int) =
    new Token(cas, begin, end)

  protected def createSentence(cas: JCas, begin: Int, end: Int) =
    new Sentence(cas, begin, end)

  /** Chooses a tokenizer for the document language, falling back to the
   * configured fallbackLanguage; throws if neither is supported. */
  protected def getTokenizer(lang: String, text: String): Tokenizer[_] = {
    getTokenizerFromLanguage(lang, text) match {
      case Some(tokenizer) => tokenizer
      case None =>
        fallbackLanguage.is.flatMap { lang =>
          getTokenizerFromLanguage(lang, text)
        }.getOrElse(
          throw new Exception("can not create tokenizer for language: " + lang)
        )
    }
  }

  // Supported languages: Arabic, English, Spanish, French, German, Chinese.
  private def getTokenizerFromLanguage(lang: String, text: String): Option[Tokenizer[_]] =
    lang match {
      case "ar" => Some(ArabicTokenizer.newArabicTokenizer(new StringReader(text), new Properties()))
      case "en" => Some(new PTBTokenizer[CoreLabel](new StringReader(text), new CoreLabelTokenFactory(), "invertible"))
      case "es" => Some(SpanishTokenizer.factory(new CoreLabelTokenFactory(), null).getTokenizer(new StringReader(text)))
      case "fr" => Some(FrenchTokenizer.factory().getTokenizer(new StringReader(text), "tokenizeNLs=false"))
      case "de" => Some(new NegraPennTokenizer(new StringReader(text)))
      case "zh" => Some(new CHTBTokenizer(new StringReader(text)))
      case _ => None
    }
}
124 |
125 |
--------------------------------------------------------------------------------
/segmenter/stanford-segmenter/src/test/scala/com/github/jenshaase/uimascala/segmenter/StanfordSegmenterSpec.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import java.util.Locale
4 | import com.github.jenshaase.uimascala.core._
5 | import com.github.jenshaase.uimascala.typesystem._
6 | import com.github.jenshaase.uimascala.core.configuration._
7 | import org.apache.uima.analysis_engine.AnalysisEngine
8 | import org.specs2.mutable.Specification
9 | import org.apache.uima.fit.factory.AnalysisEngineFactory
10 | import org.apache.uima.fit.util.JCasUtil
11 |
/** Specs for [[StanfordSegmenter]]: English and French segmentation, and
 * sentence detection when the text has no terminating punctuation. */
class StanfordSegmenterSpec extends Specification {

  "StanfordSegmenter" should {
    "segment english sentences and tokens" in {
      val segmenter: AnalysisEngine = new StanfordSegmenter().
        asAnalysisEngine

      val jcas = segmenter.newJCas()
      jcas.setDocumentText("This is a text. Here we are! ")
      jcas.setDocumentLanguage("en")
      segmenter.process(jcas)

      jcas.select[Token].size must be equalTo(9)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("This")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("is")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("a")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("text")
      jcas.selectByIndex[Token](4).getCoveredText must be equalTo (".")
      jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("Here")
      jcas.selectByIndex[Token](6).getCoveredText must be equalTo ("we")
      jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("are")
      jcas.selectByIndex[Token](8).getCoveredText must be equalTo ("!")

      jcas.select[Sentence].size must be equalTo(2)
      jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("This is a text.")
      jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("Here we are!")
    }

    "segment french sentences and tokens" in {
      val segmenter: AnalysisEngine = new StanfordSegmenter().
        asAnalysisEngine

      val jcas = segmenter.newJCas()
      jcas.setDocumentText("Bonjour à tous. C'est parti!")
      jcas.setDocumentLanguage("fr")
      segmenter.process(jcas)

      // The French tokenizer splits the clitic: "C'est" -> "C'" + "est".
      jcas.select[Token].size must be equalTo(8)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Bonjour")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("à")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("tous")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo (".")
      jcas.selectByIndex[Token](4).getCoveredText must be equalTo ("C'")
      jcas.selectByIndex[Token](5).getCoveredText must be equalTo ("est")
      jcas.selectByIndex[Token](6).getCoveredText must be equalTo ("parti")
      jcas.selectByIndex[Token](7).getCoveredText must be equalTo ("!")

      jcas.select[Sentence].size must be equalTo(2)
      jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("Bonjour à tous.")
      jcas.selectByIndex[Sentence](1).getCoveredText must be equalTo ("C'est parti!")
    }

    "segment english text without a point" in {
      val segmenter: AnalysisEngine = new StanfordSegmenter().
        asAnalysisEngine

      val jcas = segmenter.newJCas()
      jcas.setDocumentText("This is a sentence")
      jcas.setDocumentLanguage("en")
      segmenter.process(jcas)

      // Unterminated text still forms exactly one sentence.
      jcas.select[Sentence].size must be equalTo(1)
      jcas.selectByIndex[Sentence](0).getCoveredText must be equalTo ("This is a sentence")
    }
  }
}
78 |
--------------------------------------------------------------------------------
/segmenter/whitespace-tokenizer/src/main/scala/com/github/jenshaase/uimascala/segmenter/WhitespaceTokenizer.scala:
--------------------------------------------------------------------------------
1 | package com.github.jenshaase.uimascala.segmenter
2 |
3 | import com.github.jenshaase.uimascala.core.configuration._
4 | import scala.util.matching.Regex
5 |
/** A [[RegexTokenizer]] preconfigured to split on runs of whitespace. */
class WhitespaceTokenizer extends RegexTokenizer {

  // Fixed separator pattern: one or more whitespace characters. The
  // configurable regex parameter of the superclass is ignored.
  override def getRegex: Regex = "\\s+".r
}
10 |
--------------------------------------------------------------------------------
/segmenter/whitespace-tokenizer/src/test/scala/com/github/jenshaase/uimascala/segmenter/WhitespaceTokenizerSpec.scala:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (C) 2011 Jens Haase
3 | */
4 | package com.github.jenshaase.uimascala.segmenter
5 |
6 | import java.util.Locale
7 | import com.github.jenshaase.uimascala.core._
8 | import com.github.jenshaase.uimascala.typesystem._
9 | import org.apache.uima.analysis_engine.AnalysisEngine
10 | import org.specs2.mutable.Specification
11 | import org.apache.uima.fit.factory.AnalysisEngineFactory
12 | import org.apache.uima.fit.util.JCasUtil
13 |
/** Specs for [[WhitespaceTokenizer]]: spaces and newlines both separate
 * tokens, and trailing whitespace produces no extra token. */
class WhitespaceTokenizerSpec extends Specification {

  "Whitespace Tokenizer" should {
    "split by whitespace" in {
      val tokenizer: AnalysisEngine = new WhitespaceTokenizer().asAnalysisEngine

      val jcas = tokenizer.newJCas()
      jcas.setDocumentText("Hallo Welt  los\ngehts ")
      tokenizer.process(jcas)

      jcas.select[Token].size must be equalTo(4)
      jcas.selectByIndex[Token](0).getCoveredText must be equalTo ("Hallo")
      jcas.selectByIndex[Token](1).getCoveredText must be equalTo ("Welt")
      jcas.selectByIndex[Token](2).getCoveredText must be equalTo ("los")
      jcas.selectByIndex[Token](3).getCoveredText must be equalTo ("gehts")
    }
  }
}
32 |
--------------------------------------------------------------------------------
/type-system/src/main/resources/META-INF/org.apache.uima.fit/types.txt:
--------------------------------------------------------------------------------
1 | classpath*:desc/types/**/*.xml
2 |
--------------------------------------------------------------------------------
/type-system/src/main/resources/desc/types/TypeSystem.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | TypeSystem
4 |
5 |
6 | com.github.jenshaase.uimascala.typesystem.Token
7 |
8 | uima.tcas.Annotation
9 |
10 |
11 | pos
12 | com.github.jenshaase.uimascala.typesystem.POS
13 |
14 |
15 | lemma
16 | com.github.jenshaase.uimascala.typesystem.Lemma
17 |
18 |
19 | parent
20 | uima.tcas.Annotation
21 |
22 |
23 |
24 |
25 |
26 | com.github.jenshaase.uimascala.typesystem.Sentence
27 |
28 | uima.tcas.Annotation
29 |
30 |
31 |
32 |
33 |
34 | com.github.jenshaase.uimascala.typesystem.POS
35 |
36 | uima.tcas.Annotation
37 |
38 |
39 | name
40 | uima.cas.String
41 |
42 |
43 |
44 |
45 |
46 | com.github.jenshaase.uimascala.typesystem.Lemma
47 |
48 | uima.tcas.Annotation
49 |
50 |
51 | value
52 | uima.cas.String
53 |
54 |
55 |
56 |
57 |
58 | com.github.jenshaase.uimascala.typesystem.Dependency
59 |
60 | uima.tcas.Annotation
61 |
62 |
63 | governor
64 | com.github.jenshaase.uimascala.typesystem.Token
65 |
66 |
67 | dependent
68 | com.github.jenshaase.uimascala.typesystem.Token
69 |
70 |
71 | dependencyType
72 | uima.cas.String
73 |
74 |
75 |
76 |
77 |
78 | com.github.jenshaase.uimascala.typesystem.DependencyRoot
79 |
80 | com.github.jenshaase.uimascala.typesystem.Dependency
81 |
82 |
83 |
84 | com.github.jenshaase.uimascala.typesystem.Constituent
85 |
86 | uima.tcas.Annotation
87 |
88 |
89 | constituentType
90 | uima.cas.String
91 |
92 |
93 | parent
94 | uima.tcas.Annotation
95 |
96 |
97 | children
98 | uima.cas.FSArray
99 | uima.tcas.Annotation
100 |
101 |
102 | syntacticFunction
103 | uima.cas.String
104 |
105 |
106 |
107 |
108 |
109 | com.github.jenshaase.uimascala.typesystem.NamedEntity
110 |
111 | uima.tcas.Annotation
112 |
113 |
114 | value
115 | uima.cas.String
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "0.6.2-SNAPSHOT"
2 |
--------------------------------------------------------------------------------