├── .gitignore ├── .travis.yml ├── LICENSE ├── README.markdown ├── pom.xml ├── suim-examples ├── data │ ├── Apache_UIMA.txt │ ├── IBM_LifeSciences.txt │ ├── New_IBM_Fellows.txt │ ├── SeminarChallengesInSpeechRecognition.txt │ ├── TrainableInformationExtractionSystems.txt │ ├── UIMASummerSchool2003.txt │ ├── UIMA_Seminars.txt │ ├── WatsonConferenceRooms.txt │ └── xml │ │ ├── IBM_LifeSciences.xml │ │ ├── New_IBM_Fellows.xml │ │ ├── SeminarChallengesInSpeechRecognition.xml │ │ ├── TrainableInformationExtractionSystems.xml │ │ ├── UIMASummerSchool2003.xml │ │ ├── UIMA_Seminars.xml │ │ └── WatsonConferenceRooms.xml ├── pom.xml └── src │ ├── main │ ├── resources │ │ ├── META-INF │ │ │ └── org.apache.uima.fit │ │ │ │ └── types.txt │ │ ├── ex │ │ │ ├── RoomNumberAndDateTime.xml │ │ │ └── TutorialTypeSystem.xml │ │ └── org │ │ │ └── apache │ │ │ └── uima │ │ │ └── tutorial │ │ │ └── ex6 │ │ │ └── uimaAcronyms.txt │ └── scala │ │ └── edu │ │ └── cmu │ │ └── lti │ │ └── suim │ │ └── examples │ │ ├── Annotators.scala │ │ ├── App.scala │ │ ├── AppWithHDFS.scala │ │ └── SparkPipelineExample.scala │ └── test │ └── scala │ └── spark-uima-tools │ └── AppSpec.scala ├── suim-java ├── pom.xml └── src │ └── main │ └── java │ └── edu │ └── cmu │ └── lti │ └── suim │ └── JavaSparkUima.java └── suim-scala ├── pom.xml └── src ├── main └── scala │ └── edu │ └── cmu │ └── lti │ └── suim │ ├── SCAS.scala │ └── SparkUimaUtils.scala └── test └── scala └── spark-uima-tools └── AppSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | target/ 3 | *.DS_Store 4 | *.releaseBackup 5 | release.properties 6 | *.iml 7 | *.iws 8 | *.ipr 9 | .idea/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.9.2 4 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2012 Twitter Inc 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # SUIM 2 | 3 | Spark for Unstructured Information, provides a thin abstraction layer for [UIMA](http://uima.apache.org/) 4 | on top of [Spark](http://spark.apache.org/). 5 | SUIM leverages Spark's resilient distributed datasets (RDDs) to run UIMA pipelines using uimaFIT; SUIM pipelines are 6 | distributed across the nodes on a cluster and can be operated on in parallel [1]. 7 | 8 | SUIM allows you to run analytical pipelines on the resulting (or intermediate) `CAS` to execute further text analytics or 9 | machine learning algorithms. 10 | 11 | ## Examples 12 | 13 | #### Count buildings from the UIMA tutorial. 
14 | 15 | Using the `RoomNumberAnnotator` from the UIMA tutorial: 16 | 17 | 18 | ```scala 19 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 20 | val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data") 21 | val rdd = makeRDD(createCollectionReader(classOf[FileSystemCollectionReader], params: _*), sc) 22 | val rnum = createEngineDescription(classOf[RoomNumberAnnotator]) 23 | val rooms = rdd.map(process(_, rnum)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber])) 24 | val counts = rooms.map(room => room.getBuilding()).map((_,1)).reduceByKey(_ + _) 25 | counts.foreach(println(_)) 26 | ``` 27 | 28 | If the collection is too large to fit in memory, or you already have a collection of `SCAS`es, use an HDFS RDD: 29 | 30 | ```scala 31 | val rdd = sequenceFile(createCollectionReader(classOf[FileSystemCollectionReader], params: _*), 32 | "hdfs://localhost:9000/documents", sc) 33 | ``` 34 | 35 | #### Tokenize and count words with DKPro Core 36 | 37 | Use DKPro Core [2] to tokenize and Spark to do token level analytics. 
38 | 39 | ```scala 40 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 41 | val rdd = makeRDD(createCollectionReader(classOf[TextReader], 42 | ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "data", 43 | ResourceCollectionReaderBase.PARAM_LANGUAGE, "en", 44 | ResourceCollectionReaderBase.PARAM_PATTERNS, Array("[+]*.txt")), sc) 45 | val seg = createPrimitiveDescription(classOf[BreakIteratorSegmenter]) 46 | val tokens = rdd.map(process(_, seg)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[Token])) 47 | val counts = tokens.map(token => token.getCoveredText()) 48 | .filter(filter(_)) 49 | .map((_,1)).reduceByKey(_ + _) 50 | .map(pair => (pair._2, pair._1)).sortByKey(true) 51 | counts.foreach(println(_)) 52 | ``` 53 | 54 | ### Common Tasks 55 | 56 | To build: 57 | 58 | mvn compile 59 | 60 | To run: 61 | 62 | mvn scala:run 63 | 64 | To test: 65 | 66 | mvn test 67 | 68 | To create standalone with dependencies: 69 | 70 | mvn package 71 | java -jar target/spark-uima-tools-0.0.1-SNAPSHOT-jar-with-dependencies.jar 72 | 73 | ## References 74 | * [1] http://spark.incubator.apache.org/docs/latest/scala-programming-guide.html 75 | * [2] https://code.google.com/p/dkpro-core-asl/ 76 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 5 | 4.0.0 6 | 7 | edu.cmu.lti 8 | suim 9 | 0.0.1-SNAPSHOT 10 | SUIM 11 | pom 12 | 13 | 2013 14 | https://github.com/oaqa/suim 15 | 16 | 17 | org.sonatype.oss 18 | oss-parent 19 | 7 20 | 21 | 22 | 23 | github.com 24 | https://github.com/oaqa/suim/issues 25 | 26 | 27 | 28 | 29 | The Apache Software License, Version 2.0 30 | http://www.apache.org/licenses/LICENSE-2.0.txt 31 | repo 32 | 33 | 34 | 35 | 36 | git@github.com:oaqa/suim.git 37 | scm:git:git@github.com:oaqa/suim.git 38 | scm:git:git@github.com:oaqa/suim.git 39 | 40 | 41 | 42 | 1.6 43 | 1.6 44 | UTF-8 45 | 46 | 47 | 48 | 49 | twttr 50 
| twttr 51 | http://maven.twttr.com 52 | 53 | 54 | 55 | 56 | 57 | scala-tools.org 58 | Scala-Tools Maven2 Repository 59 | http://scala-tools.org/repo-releases 60 | 61 | 62 | 63 | 64 | suim-java 65 | suim-scala 66 | suim-examples 67 | 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 76 | 1.6 77 | 1.6 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | junit 87 | junit 88 | 3.8.1 89 | test 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /suim-examples/data/Apache_UIMA.txt: -------------------------------------------------------------------------------- 1 | Welcome to Apache UIMA (Unstructured Information Management Architecture), a incubator project of the Apache Software Foundation (ASF). 2 | Our goal is a thriving community of users and developers of UIMA frameworks, supporting components for analysing unstructured content such as text, audio and video. 3 | 4 | What is UIMA? 5 | 6 | Unstructured Information Management applications are software systems that analyze large volumes of unstructured information in order to discover knowledge that is relevant to an end user. 7 | UIMA is a framework and SDK for developing such applications. An example UIM application might ingest plain text and identify entities, such as persons, places, organizations; or relations, such as works-for or located-at. 8 | UIMA enables such an application to be decomposed into components, for example "language identification" -> "language specific segmentation" -> "sentence boundary detection" -> "entity detection (person/place names etc.)". 9 | Each component must implement interfaces defined by the framework and must provide self-describing metadata via XML descriptor files. The framework manages these components and the data flow between them. Components are written in Java or C++; the data that flows between components is designed for efficient mapping between these languages. 
10 | UIMA additionally provides capabilities to wrap components as network services, and can scale to very large volumes by replicating processing pipelines over a cluster of networked nodes. 11 | 12 | Apache UIMA is an Apache-licensed open source implementation of the UIMA specification (that specification is, in turn, being developed concurrently by a technical committee within OASIS , a standards organization). 13 | We invite and encourage you to participate in both the implementation and specification efforts. 14 | 15 | UIMA is a component framework for analysing unstructured content such as text, audio and video. 16 | It comprises an SDK and tooling for composing and running analytic components written in Java and C++, with some support for Perl, Python and TCL. 17 | 18 | 19 | Apache UIMA mailing lists: 20 | 21 | Users - uima-user@incubator.apache.org 22 | Developers - uima-dev@incubator.apache.org 23 | Commits - uima-commits@incubator.apache.org 24 | 25 | 26 | Apache UIMA project committers: 27 | 28 | Michael Baessler 29 | Edward Epstein 30 | Thilo Goetz 31 | Adam Lally 32 | Marshall Schor 33 | 34 | 35 | Apache UIMA project Mentors: 36 | 37 | Ken Coar (ASF member and Vice President) 38 | Sam Ruby (ASF member) -------------------------------------------------------------------------------- /suim-examples/data/IBM_LifeSciences.txt: -------------------------------------------------------------------------------- 1 | "Life sciences is one of the emerging markets at the heart of IBM's growth strategy," said John M. Thompson, IBM senior vice president & group executive, Software. "This investment is the first of a number of steps we will be taking to advance IBM's life sciences initiatives." In his role as newly appointed IBM Corporation vice chairman, effective September 1, Mr. Thompson will be responsible for integrating and accelerating IBM's efforts to exploit life sciences and other emerging growth areas. 
2 | 3 | IBM estimates the market for IT solutions for life sciences will skyrocket from $3.5 billion today to more than $9 billion by 2003. Driving demand is the explosive growth in genomic, proteomic and pharmaceutical research. For example, the Human Genome Database is approximately three terabytes of data, or the equivalent of 150 million pages of information. The volume of life sciences data is doubling every six months. 4 | 5 | "All of this genetic data is worthless without the information technology that can help scientists manage and analyze it to unlock the pathways that will lead to new cures for many of today's diseases," said Dr. Caroline Kovac, vice president of IBM's new Life Sciences unit. "IBM can help speed this process by enabling more efficient interpretation of data and sharing of knowledge. The potential for change based on innovation in life sciences is bigger than the change caused by the digital circuit." 6 | 7 | Among the life sciences initiatives already underway at IBM are: 8 | - DiscoveryLink* -- For the first time, researchers using this combination of innovative middleware and integration services can join together information from many sources to solve complex medical research problems. DiscoveryLink creates a "virtual database" that permits data to be accessed and extracted from multiple data sources used in research and development projects. This IT solution can dramatically improve product cycle time and lower development costs for pharmaceutical, biotechnology and agri-science companies. 9 | 10 | - Blue Gene* - IBM is building a supercomputer 100 times faster than any available today designed to advance understanding of the mechanisms behind protein folding through large-scale biomolecular simulation. In December, IBM committed $100 million to this five-year research project to advance the state-of-the-art in supercomputing for biological applications. 
11 | - Bio-Dictionary* -- IBM has compiled a protein dictionary containing some 30 million protein "words" designed to accelerate the understanding of protein shapes and functions.Bio-Dictionaries for selected genomes, as well as bioinformatics algorithms for pattern discovery and other relevant applications, are available to scientists and researchers for noncommercial use through a website dedicated to life sciences content at http://www.research.ibm.com/compsci/compbio/. 12 | 13 | * Indicates trademark or registered trademark of IBM Corporation. -------------------------------------------------------------------------------- /suim-examples/data/New_IBM_Fellows.txt: -------------------------------------------------------------------------------- 1 | IBM today elevated five employees to the title of IBM Fellow -- its most prestigious technical honor. The company also presented more than $2.8 million in cash awards to employees whose technical innovation have yielded exceptional value to the company and its customers. 2 | 3 | IBM conferred the accolades and awards at its 2003 Corporate Technical Recognition Event (CTRE) in Scottsdale, Ariz. CTRE is a 40-year tradition at IBM, established to recognize exceptional technical employees and reward them for extraordinary achievements and contributions to the company's technology leadership. 4 | 5 | "Our technical employees are among the best and brightest innovators in the world. They share a passion for excellence that defines their work and permeates the products and services IBM delivers to its customers," said Nick Donofrio, senior vice president, technology and manufacturing for IBM. "CTRE provides the means for us to honor those who have distinguished themselves as exceptional leaders among their peers." 6 | 7 | Among the special honorees at the 2003 CTRE are five employees who earned the coveted distinction of IBM Fellow: 8 | 9 | 10 | - Grady Booch, chief scientist of Rational Software, IBM Software Group. 
Recognized internationally for his innovative work on software architecture, modeling, and software engineering process. Mr. Booch is one of the original authors of the Unified Modeling Language (UML), the industry-standard language of blueprints for software-intensive systems. 11 | 12 | - Dr. Donald Chamberlin, researcher, IBM Almaden Research Center. An expert in relational database languages, Dr. Chamberlin is co- inventor of SQL, the language that energized the relational database market. He has also influenced the creation of XQuery, one of a new generation of database query languages covering structured, semi-structured and unstructured data. 13 | 14 | - Dr. George Galambos, chief technology officer, IBM Global Services (IGS) in Canada; the first Fellow from Canada. Dr. Galambos specializes in high-performance, high availability designs, operational effectiveness, and risk assessment/mitigation, focusing on systems engineering and architecture reuse that enhances efficiency and stability. He is a principal driver of and contributor to the widely acclaimed "Patterns for e-business" and the Enterprise Solution Structure Reference Architectures, widely used by IGS in customer engagements. 15 | 16 | - Rod Smith, vice president of Internet emerging technologies, IBM Software Group. A leader in the areas of object-oriented programming, visual development tools, Java, XML, and Web Services. Rod also was the chief technical strategist for focusing the Java platform for use in middleware solutions, in particular initiating contributions to the development of the J2EE. 17 | 18 | - Charles Webb, eServer processor design, IBM Systems Group. Charles Webb has led the reinvention of IBM's eServer zSeries microprocessor designs and roadmap, including the z900 server, where he provided the bridge among architecture, hardware, compilers and system software, defining major portions of the 64- bit architecture and beyond. 
19 | 20 | 21 | The title of IBM Fellow is the company's most preeminent technical distinction and is granted in recognition of outstanding and sustained technical achievements in engineering, programming, science and technology. Only 175 individuals have earned this designation in the company's history and, including the newly named Fellows, 56 are active employees. IBM Fellows are encouraged to further enhance their potential for creative achievements and typically work on special projects or research initiatives that lead the company in exciting new directions. 22 | 23 | -------------------------------------------------------------------------------- /suim-examples/data/SeminarChallengesInSpeechRecognition.txt: -------------------------------------------------------------------------------- 1 | UIT Seminar: Challenges in Speech Recognition 2 | August 8, 2003 10:30 AM - 11:30 AM 3 | Lawrence Rabiner , Associate Director CAIP, Rutgers 4 | University, Professor Univ. of Santa Barbara 5 | Yorktown 20-043 6 | Availability: Open 7 | 8 | Speech recognition has matured to the point where it 9 | is now being widely applied in a range of applications 10 | including desktop dictation, cell phone name dialing, 11 | agent technology, automated operator services, 12 | telematics, call center automation and help desks. 13 | 14 | Although the technology is often good enough for many 15 | of these applications, there remain key challenges in 16 | virtually every aspect of speech recognition that 17 | prevent the technology from being used ubiquitously in 18 | any environment, for any speaker, and for an even 19 | broader range of applications. 
This talk will analyze 20 | the ‘Speech Circle’ that enables a person to maintain 21 | a dialog with a machine using speech recognition, 22 | spoken language understanding, dialog management and 23 | spoken language generation, and finally text-to-speech 24 | synthesis, and show where significant progress has 25 | been made, and where there remain critical problems 26 | that need to be addressed and solved. 27 | 28 | The talk will include several audio and video examples 29 | of speech recognition and speech understanding systems 30 | that have been studied in the laboratory to illustrate 31 | the challenges that remain to be solved before speech 32 | recognition is considered a solved problem. 33 | 34 | 35 | -------------------------------------------------------------------------------- /suim-examples/data/TrainableInformationExtractionSystems.txt: -------------------------------------------------------------------------------- 1 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems 2 | 3 | August 19, 2003 02:00 PM - 03:30 PM 4 | David Johnson, Frank Oles, Tong Zhang(IBM Research) 5 | Hawthorne GN-F15 6 | Availability: Open 7 | 8 | The technical objective of the TIES project is to build customizable systems that can identify named entities in text, such as persons, organizations, and locations, as well as identifying relations between those entities. The technical approach is to develop new statistical and symbolic machine learning algorithms in service of the technical objective. Also, we are working on combining statistical with symbolic techniques. The first part of this talk, given by David E. Johnson, will provide a general overview of the goals of the TIES project. The second part, given by Tong Zhang, will provide background on applying statistical machine learning to this problem domain. Tong will also describe the particular statistical approach taken, which is termed Robust Risk Minimization (RMM). 
The final part will be given by Frank J. Oles. Frank will introduce his theory of precedence-inclusion patterns. Precedence-inclusion patterns are mathematical structures possessing multiple interacting strict partial orders that satisfy axioms generalizing the familiar properties of irreflexivity and transitivity. This very general theory provides a radically new approach to symbolic, as opposed to statistical, pattern generalization that can be applied to relational learning in a number of settings, including learning based on text, on images, or on videos. 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /suim-examples/data/UIMASummerSchool2003.txt: -------------------------------------------------------------------------------- 1 | UIMA Summer School 2 | 3 | August 26, 2003 4 | UIMA 101 - The New UIMA Introduction 5 | (Hands-on Tutorial) 6 | 9:00AM-5:00PM in HAW GN-K35 7 | 8 | August 28, 2003 9 | FROST Tutorial 10 | 9:00AM-5:00PM in HAW GN-K35 11 | 12 | September 15, 2003 13 | UIMA 201: UIMA Advanced Topics 14 | (Hands-on Tutorial) 15 | 9:00AM-5:00PM in HAW 1S-F53 16 | 17 | September 17, 2003 18 | The UIMA System Integration Test and Hardening Service 19 | The "SITH" 20 | 3:00PM-4:30PM in HAW GN-K35 21 | 22 | 23 | 24 | UIMA Summer School Tutorial and Presentation Details 25 | UIMA 101: The new UIMA tutorial 26 | Tuesday August 26 9:00AM - 4:30PM in GN-K35 27 | 28 | UIMA 101 is a hands-on programming tutorial. 29 | 30 | UIMA 101 is intended for people who want a first introductory course to UIMA or for people who would like a refresher. 
31 | 32 | The tutorial covers the same concepts in the first UIMA tutorial given in 3Q 2002 except for some key updates: 33 | 34 | 1) It uses a new interface to the CAS that makes it more natural to access and update CAS feature structures using ordinary Java objects (i.e., the JCAS) and 35 | 2) It uses updated TAE interfaces that give the application developer more control over managing multiple CASs. 36 | 37 | Please NOTE expert users of UIMA can skip this one and should consider attending the Advanced Topics tutorial. 38 | 39 | Prerequisites for the UIMA 101 Tutorial 40 | 1) Java Programming 41 | 2) Some experience with Eclipse IDE helpful 42 | 43 | FROST Tutorial 44 | August 28 9:00AM - 5:00PM in GN-K35 45 | 46 | Visitors from the FROST team will be here to talk to us about FROST. 47 | 48 | UIMA 201: The UIMA Advanced Topics Tutorial 49 | September 15: 9:00AM - 5:30PM in Hawthorne 1S-F53 50 | 51 | UIMA 201 will introduce some new UIMA concepts and walk the student through hands-on examples. 52 | 53 | The advanced topics tutorial is designed for people who have some experience with UIMA and want 54 | to use new capabilities of UIMA 1.0 to address one or more of the following 55 | Advanced Topics: 56 | 57 | 1) Collection Processing and Collection Processing Engines (CPEs) 58 | 2) Multi-Threading and CAS Pooling 59 | 3) Using the UIMA adapter framework to integrate network TAEs with Java TAEs 60 | 4) A Semantic Search Application that brings it all together 61 | 62 | Prerequisites for UIMA 201 63 | 1) UIMA 101 Tutorial OR Extensive UIMA Experience 64 | 65 | The UIMA Integration Test bed Service (The "SITH") 66 | September 17 3:00PM - 4:30PM in HAW GN-K35 67 | 68 | We have developed the first version of the UIMA Integration Test bed service. 69 | 70 | This service is being developed to help test, evaluate, certify and publish UIMA compliant components. 71 | 72 | In this talk we will explain the service and what it is intended to provide the UIMA community. 
We will address the following topics: 73 | 74 | 1. SITH Services 75 | 2. How to submit components and what to expect in return 76 | 3. Overview of the test bed implementation using Collection Processing UIMA and Juru. 77 | 4. Next Steps for the SITH 78 | 79 | 80 | -------------------------------------------------------------------------------- /suim-examples/data/UIMA_Seminars.txt: -------------------------------------------------------------------------------- 1 | Upcoming UIMA Seminars 2 | 3 | April 7, 2004 Distillery Lunch Seminar 4 | UIMA and its Metadata 5 | 12:00PM-1:00PM in HAW GN-K35. 6 | 7 | Dave Ferrucci will give a UIMA overview and discuss the types of component metadata that UIMA components provide. Jon Lenchner will give a demo of the Text Analysis Engine configurator tool. 8 | 9 | 10 | April 16, 2004 KM & I Department Tea 11 | Title: An Eclipse-based TAE Configurator Tool 12 | 3:00PM-4:30PM in HAW GN-K35 . 13 | 14 | Jon Lenchner will demo an Eclipse plugin for configuring TAE descriptors, which will be available soon for you to use. No more editing XML descriptors by hand! 15 | 16 | 17 | May 11, 2004 UIMA Tutorial 18 | 9:00AM-5:00PM in HAW GN-K35. 19 | 20 | This is a full-day, hands-on tutorial on UIMA, covering the development of Text Analysis Engines and Collection Processing Engines, as well as how to include these components in your own applications. 21 | -------------------------------------------------------------------------------- /suim-examples/data/WatsonConferenceRooms.txt: -------------------------------------------------------------------------------- 1 | Conference Rooms at Watson: 2 | Location Capacity Wall Phone Ext. 
3 | 4 | Classroom Style 5 | HAW J2-B34 Seats 12 tieline 863-3130 6 | HAW J2-N07 Seats 24 tieline 863-3210 7 | YKT 20-001 Seats 36 tieline 862-4304 8 | YKT 20-051 Seats 18 tieline 862-4307 9 | 10 | Conference Style 11 | HAW 2N-F28 Seats 20 tieline 863-7583 12 | HAW 4N-B15 Seats 14 tieline 863-7126 13 | HAW 4N-B17 Seats 10 tieline 863-7089 14 | HAW 4S-K21 Seats 16 tieline 863-6386 15 | HAW GN-F14 Seats 12 tieline 863-6770 16 | HAW GN-K30 Seats 12 tieline 863-7335 17 | HAW GN-K36 Seats 10 tieline 863-6098 18 | HAW J1-N14 Seats 24 tieline 863-3629 19 | HAW J2-A16 Seats 12 tieline 863-3240 20 | HAW J2-G27 Seats 15 tieline 863-3150 21 | HAW J2-M24 Seats 8 tieline 863-3160 22 | YKT 03-135 Seats 8 tieline 862-1696 23 | YKT 03-235 Seats 8 tieline 862-4278 24 | YKT 05-135 Seats 8 tieline 862-3477 25 | YKT 05-235 Seats 8 tieline 862-4279 26 | YKT 20-006 Seats 8 tieline 862-4301 27 | YKT 20-059 Seats 20 tieline 862-4308 28 | YKT 35-132 Seats 8 tieline 862-2873 29 | YKT 35-232 Seats 8 tieline 862-2860 30 | YKT 38-023 Seats 8 tieline 862-3299 31 | YKT 39-132 Seats 8 tieline 862-3486 32 | YKT 40-100 Seats 20 tieline 862-4199 33 | YKT 40-200 Seats 20 tieline 862-1379 34 | 35 | Other 36 | HAW GN-K35 Seats 24 tieline 863-6104 37 | 38 | Theater Style 39 | HAW 1S-F40 Seats 30 tieline 863-6396 40 | YKT 20-043 Seats 50 tieline 862-4306 41 | 42 | Video Conference Room 43 | YKT 32-026 Seats 25 tieline 862-3917 44 | -------------------------------------------------------------------------------- /suim-examples/data/xml/IBM_LifeSciences.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | IBM announces $100 Million investment in Life Sciences 26 | 16 August 2000 27 | "Life sciences is one of the emerging markets at the heart of IBM's growth strategy," said John M. Thompson, IBM senior vice president & group executive, Software. 
"This investment is the first of a number of steps we will be taking to advance IBM's life sciences initiatives." In his role as newly appointed IBM Corporation vice chairman, effective September 1, Mr. Thompson will be responsible for integrating and accelerating IBM's efforts to exploit life sciences and other emerging growth areas. 28 | 29 | IBM estimates the market for IT solutions for life sciences will skyrocket from $3.5 billion today to more than $9 billion by 2003. Driving demand is the explosive growth in genomic, proteomic and pharmaceutical research. For example, the Human Genome Database is approximately three terabytes of data, or the equivalent of 150 million pages of information. The volume of life sciences data is doubling every six months. 30 | 31 | "All of this genetic data is worthless without the information technology that can help scientists manage and analyze it to unlock the pathways that will lead to new cures for many of today's diseases," said Dr. Caroline Kovac, vice president of IBM's new Life Sciences unit. "IBM can help speed this process by enabling more efficient interpretation of data and sharing of knowledge. The potential for change based on innovation in life sciences is bigger than the change caused by the digital circuit." 32 | 33 | Among the life sciences initiatives already underway at IBM are: 34 | - DiscoveryLink* -- For the first time, researchers using this combination of innovative middleware and integration services can join together information from many sources to solve complex medical research problems. DiscoveryLink creates a "virtual database" that permits data to be accessed and extracted from multiple data sources used in research and development projects. This IT solution can dramatically improve product cycle time and lower development costs for pharmaceutical, biotechnology and agri-science companies. 
35 | 36 | - Blue Gene* - IBM is building a supercomputer 100 times faster than any available today designed to advance understanding of the mechanisms behind protein folding through large-scale biomolecular simulation. In December, IBM committed $100 million to this five-year research project to advance the state-of-the-art in supercomputing for biological applications. 37 | - Bio-Dictionary* -- IBM has compiled a protein dictionary containing some 30 million protein "words" designed to accelerate the understanding of protein shapes and functions.Bio-Dictionaries for selected genomes, as well as bioinformatics algorithms for pattern discovery and other relevant applications, are available to scientists and researchers for noncommercial use through a website dedicated to life sciences content at http://www.research.ibm.com/compsci/compbio/. 38 | 39 | 40 | -------------------------------------------------------------------------------- /suim-examples/data/xml/New_IBM_Fellows.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | IBM Names Five Fellows, Company's Highest Techinical Honor 26 | 05 June 2002 27 | 28 | IBM today elevated five employees to the title of IBM Fellow -- its most prestigious technical honor. The company also presented more than $2.8 million in cash awards to employees whose technical innovation have yielded exceptional value to the company and its customers. 29 | 30 | IBM conferred the accolades and awards at its 2003 Corporate Technical Recognition Event (CTRE) in Scottsdale, Ariz. CTRE is a 40-year tradition at IBM, established to recognize exceptional technical employees and reward them for extraordinary achievements and contributions to the company's technology leadership. 31 | 32 | "Our technical employees are among the best and brightest innovators in the world. 
They share a passion for excellence that defines their work and permeates the products and services IBM delivers to its customers," said Nick Donofrio, senior vice president, technology and manufacturing for IBM. "CTRE provides the means for us to honor those who have distinguished themselves as exceptional leaders among their peers." 33 | 34 | Among the special honorees at the 2003 CTRE are five employees who earned the coveted distinction of IBM Fellow: 35 | 36 | 37 | - Grady Booch, chief scientist of Rational Software, IBM Software Group. Recognized internationally for his innovative work on software architecture, modeling, and software engineering process. Mr. Booch is one of the original authors of the Unified Modeling Language (UML), the industry-standard language of blueprints for software-intensive systems. 38 | 39 | - Dr. Donald Chamberlin, researcher, IBM Almaden Research Center. An expert in relational database languages, Dr. Chamberlin is co- inventor of SQL, the language that energized the relational database market. He has also influenced the creation of XQuery, one of a new generation of database query languages covering structured, semi-structured and unstructured data. 40 | 41 | - Dr. George Galambos, chief technology officer, IBM Global Services (IGS) in Canada; the first Fellow from Canada. Dr. Galambos specializes in high-performance, high availability designs, operational effectiveness, and risk assessment/mitigation, focusing on systems engineering and architecture reuse that enhances efficiency and stability. He is a principal driver of and contributor to the widely acclaimed "Patterns for e-business" and the Enterprise Solution Structure Reference Architectures, widely used by IGS in customer engagements. 42 | 43 | - Rod Smith, vice president of Internet emerging technologies, IBM Software Group. A leader in the areas of object-oriented programming, visual development tools, Java, XML, and Web Services. 
Rod also was the chief technical strategist for focusing the Java platform for use in middleware solutions, in particular initiating contributions to the development of the J2EE. 44 | 45 | - Charles Webb, eServer processor design, IBM Systems Group. Charles Webb has led the reinvention of IBM's eServer zSeries microprocessor designs and roadmap, including the z900 server, where he provided the bridge among architecture, hardware, compilers and system software, defining major portions of the 64- bit architecture and beyond. 46 | 47 | 48 | The title of IBM Fellow is the company's most preeminent technical distinction and is granted in recognition of outstanding and sustained technical achievements in engineering, programming, science and technology. Only 175 individuals have earned this designation in the company's history and, including the newly named Fellows, 56 are active employees. IBM Fellows are encouraged to further enhance their potential for creative achievements and typically work on special projects or research initiatives that lead the company in exciting new directions. 49 | 50 | -------------------------------------------------------------------------------- /suim-examples/data/xml/SeminarChallengesInSpeechRecognition.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | UIT Seminar: Challenges in Speech Recognition 27 | 8 August 2003 28 | 29 | UIT Seminar: Challenges in Speech Recognition 30 | August 8, 2003 10:30 AM - 11:30 AM 31 | Lawrence Rabiner , Associate Director CAIP, Rutgers 32 | University, Professor Univ. of Santa Barbara 33 | Yorktown 20-043 34 | Availability: Open 35 | 36 | Speech recognition has matured to the point where it 37 | is now being widely applied in a range of applications 38 | including desktop dictation, cell phone name dialing, 39 | agent technology, automated operator services, 40 | telematics, call center automation and help desks. 
41 | 42 | Although the technology is often good enough for many 43 | of these applications, there remain key challenges in 44 | virtually every aspect of speech recognition that 45 | prevent the technology from being used ubiquitously in 46 | any environment, for any speaker, and for an even 47 | broader range of applications. This talk will analyze 48 | the ‘Speech Circle’ that enables a person to maintain 49 | a dialog with a machine using speech recognition, 50 | spoken language understanding, dialog management and 51 | spoken language generation, and finally text-to-speech 52 | synthesis, and show where significant progress has 53 | been made, and where there remain critical problems 54 | that need to be addressed and solved. 55 | 56 | The talk will include several audio and video examples 57 | of speech recognition and speech understanding systems 58 | that have been studied in the laboratory to illustrate 59 | the challenges that remain to be solved before speech 60 | recognition is considered a solved problem. 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /suim-examples/data/xml/TrainableInformationExtractionSystems.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems 26 | 19 August 2003 27 | 28 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems 29 | 30 | August 19, 2003 02:00 PM - 03:30 PM 31 | David Johnson, Frank Oles, Tong Zhang(IBM Research) 32 | Hawthorne GN-F15 33 | Availability: Open 34 | 35 | The technical objective of the TIES project is to build customizable systems that can identify named entities in text, such as persons, organizations, and locations, as well as identifying relations between those entities. 
The technical approach is to develop new statistical and symbolic machine learning algorithms in service of the technical objective. Also, we are working on combining statistical with symbolic techniques. The first part of this talk, given by David E. Johnson, will provide a general overview of the goals of the TIES project. The second part, given by Tong Zhang, will provide background on applying statistical machine learning to this problem domain. Tong will also describe the particular statistical approach taken, which is termed Robust Risk Minimization (RMM). The final part will be given by Frank J. Oles. Frank will introduce his theory of precedence-inclusion patterns. Precedence-inclusion patterns are mathematical structures possessing multiple interacting strict partial orders that satisfy axioms generalizing the familiar properties of irreflexivity and transitivity. This very general theory provides a radically new approach to symbolic, as opposed to statistical, pattern generalization that can be applied to relational learning in a number of settings, including learning based on text, on images, or on videos. 
36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /suim-examples/data/xml/UIMASummerSchool2003.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | UIMA Summer School 25 | 1 August 2003 26 | 27 | August 26, 2003 28 | UIMA 101 - The New UIMA Introduction 29 | (Hands-on Tutorial) 30 | 9:00AM-5:00PM in HAW GN-K35 31 | 32 | August 28, 2003 33 | FROST Tutorial 34 | 9:00AM-5:00PM in HAW GN-K35 35 | 36 | September 15, 2003 37 | UIMA 201: UIMA Advanced Topics 38 | (Hands-on Tutorial) 39 | 9:00AM-5:00PM in HAW 1S-F53 40 | 41 | September 17, 2003 42 | The UIMA System Integration Test and Hardening Service 43 | The "SITH" 44 | 3:00PM-4:30PM in HAW GN-K35 45 | 46 | 47 | 48 | UIMA Summer School Tutorial and Presentation Details 49 | UIMA 101: The new UIMA tutorial 50 | Tuesday August 26 9:00AM - 4:30PM in GN-K35 51 | 52 | UIMA 101 is a hands-on programming tutorial. 53 | 54 | UIMA 101 is intended for people who want a first introductory course to UIMA or for people who would like a refresher. 55 | 56 | The tutorial covers the same concepts in the first UIMA tutorial given in 3Q 2002 except for some key updates: 57 | 58 | 1) It uses a new interface to the CAS that makes it more natural to access and update CAS feature structures using ordinary Java objects (i.e., the JCAS) and 59 | 2) It uses updated TAE interfaces that give the application developer more control over managing multiple CASs. 60 | 61 | Please NOTE expert users of UIMA can skip this one and should consider attending the Advanced Topics tutorial. 62 | 63 | Prerequisites for the UIMA 101 Tutorial 64 | 1) Java Programming 65 | 2) Some experience with Eclipse IDE helpful 66 | 67 | FROST Tutorial 68 | August 28 9:00AM - 5:00PM in GN-K35 69 | 70 | Visitors from the FROST team will be here to talk to us about FROST. 
71 | 72 | UIMA 201: The UIMA Advanced Topics Tutorial 73 | September 15: 9:00AM - 5:30PM in Hawthorne 1S-F53 74 | 75 | UIMA 201 will introduce some new UIMA concepts and walk the student through hands-on examples. 76 | 77 | The advanced topics tutorial is designed for people who have some experience with UIMA and want 78 | to use new capabilities of UIMA 1.0 to address one or more of the following 79 | Advanced Topics: 80 | 81 | 1) Collection Processing and Collection Processing Engines (CPEs) 82 | 2) Multi-Threading and CAS Pooling 83 | 3) Using the UIMA adapter framework to integrate network TAEs with Java TAEs 84 | 4) A Semantic Search Application that brings it all together 85 | 86 | Prerequisites for UIMA 201 87 | 1) UIMA 101 Tutorial OR Extensive UIMA Experience 88 | 89 | The UIMA Integration Test bed Service (The "SITH") 90 | September 17 3:00PM - 4:30PM in HAW GN-K35 91 | 92 | We have developed the first version of the UIMA Integration Test bed service. 93 | 94 | This service is being developed to help test, evaluate, certify and publish UIMA compliant components. 95 | 96 | In this talk we will explain the service and what it is intended to provide the UIMA community. We will address the following topics: 97 | 98 | 1. SITH Services 99 | 2. How to submit components and what to expect in return 100 | 3. Overview of the test bed implementation using Collection Processing UIMA and Juru. 101 | 4. Next Steps for the SITH 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /suim-examples/data/xml/UIMA_Seminars.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | Upcoming UIMA Seminars 26 | 15 March 2004 27 | 28 | April 7, 2004 Distillery Lunch Seminar 29 | UIMA and its Metadata 30 | 12:00PM-1:00PM in HAW GN-K35. 31 | 32 | Dave Ferrucci will give a UIMA overview and discuss the types of component metadata that UIMA components provide. 
Jon Lenchner will give a demo of the Text Analysis Engine configurator tool. 33 | 34 | 35 | April 16, 2004 KM & I Department Tea 36 | Title: An Eclipse-based TAE Configurator Tool 37 | 3:00PM-4:30PM in HAW GN-K35 . 38 | 39 | Jon Lenchner will demo an Eclipse plugin for configuring TAE descriptors, which will be available soon for you to use. No more editing XML descriptors by hand! 40 | 41 | 42 | May 11, 2004 UIMA Tutorial 43 | 9:00AM-5:00PM in HAW GN-K35. 44 | 45 | This is a full-day, hands-on tutorial on UIMA, covering the development of Text Analysis Engines and Collection Processing Engines, as well as how to include these components in your own applications. 46 | 47 | -------------------------------------------------------------------------------- /suim-examples/data/xml/WatsonConferenceRooms.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | Conference Rooms at Watson 26 | 01 January 2000 27 | 28 | Conference Rooms at Watson: 29 | Location Capacity Wall Phone Ext. 
30 | 31 | Classroom Style 32 | HAW J2-B34 Seats 12 tieline 863-3130 33 | HAW J2-N07 Seats 24 tieline 863-3210 34 | YKT 20-001 Seats 36 tieline 862-4304 35 | YKT 20-051 Seats 18 tieline 862-4307 36 | 37 | Conference Style 38 | HAW 2N-F28 Seats 20 tieline 863-7583 39 | HAW 4N-B15 Seats 14 tieline 863-7126 40 | HAW 4N-B17 Seats 10 tieline 863-7089 41 | HAW 4S-K21 Seats 16 tieline 863-6386 42 | HAW GN-F14 Seats 12 tieline 863-6770 43 | HAW GN-K30 Seats 12 tieline 863-7335 44 | HAW GN-K36 Seats 10 tieline 863-6098 45 | HAW J1-N14 Seats 24 tieline 863-3629 46 | HAW J2-A16 Seats 12 tieline 863-3240 47 | HAW J2-G27 Seats 15 tieline 863-3150 48 | HAW J2-M24 Seats 8 tieline 863-3160 49 | YKT 03-135 Seats 8 tieline 862-1696 50 | YKT 03-235 Seats 8 tieline 862-4278 51 | YKT 05-135 Seats 8 tieline 862-3477 52 | YKT 05-235 Seats 8 tieline 862-4279 53 | YKT 20-006 Seats 8 tieline 862-4301 54 | YKT 20-059 Seats 20 tieline 862-4308 55 | YKT 35-132 Seats 8 tieline 862-2873 56 | YKT 35-232 Seats 8 tieline 862-2860 57 | YKT 38-023 Seats 8 tieline 862-3299 58 | YKT 39-132 Seats 8 tieline 862-3486 59 | YKT 40-100 Seats 20 tieline 862-4199 60 | YKT 40-200 Seats 20 tieline 862-1379 61 | 62 | Other 63 | HAW GN-K35 Seats 24 tieline 863-6104 64 | 65 | Theater Style 66 | HAW 1S-F40 Seats 30 tieline 863-6396 67 | YKT 20-043 Seats 50 tieline 862-4306 68 | 69 | Video Conference Room 70 | YKT 32-026 Seats 25 tieline 862-3917 71 | 72 | -------------------------------------------------------------------------------- /suim-examples/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | edu.cmu.lti 5 | suim-examples 6 | 0.0.1-SNAPSHOT 7 | SUIM Examples 8 | 9 | 10 | 2.9.3 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | edu.cmu.lti 20 | suim-scala 21 | 0.0.1-SNAPSHOT 22 | 23 | 24 | de.tudarmstadt.ukp.dkpro.core 25 | de.tudarmstadt.ukp.dkpro.core.io.text-asl 26 | 1.5.0 27 | 28 | 29 | de.tudarmstadt.ukp.dkpro.core 30 | 
de.tudarmstadt.ukp.dkpro.core.tokit-asl 31 | 1.5.0 32 | 33 | 34 | de.tudarmstadt.ukp.dkpro.core 35 | de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl 36 | 1.5.0 37 | 38 | 39 | org.scalatest 40 | scalatest_2.9.2 41 | 1.7.2 42 | test 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | net.alchim31.maven 53 | scala-maven-plugin 54 | 3.1.0 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | src/main/scala 64 | src/test/scala 65 | 66 | 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-compiler-plugin 72 | 73 | 1.6 74 | 1.6 75 | 76 | 77 | 78 | 79 | net.alchim31.maven 80 | scala-maven-plugin 81 | 3.1.0 82 | 83 | incremental 84 | 85 | -unchecked 86 | -deprecation 87 | -explaintypes 88 | 89 | 90 | 91 | main 92 | spark-uima-tools.App 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | compile 101 | testCompile 102 | 103 | 104 | 105 | -make:transitive 106 | -dependencyfile 107 | ${project.build.directory}/.scala_dependencies 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | org.apache.maven.plugins 116 | maven-assembly-plugin 117 | 2.2-beta-5 118 | 119 | 120 | jar-with-dependencies 121 | 122 | 123 | 124 | spark-uima-tools.App 125 | 126 | 127 | 128 | 129 | 130 | package 131 | 132 | single 133 | 134 | 135 | 136 | 137 | 138 | 139 | org.apache.maven.plugins 140 | maven-surefire-plugin 141 | 2.12 142 | 143 | true 144 | false 145 | -Xmx1024m 146 | 147 | **/*Spec.scala 148 | 149 | 150 | **/*Test.scala 151 | 152 | 153 | 154 | 155 | 156 | org.scalatest 157 | scalatest-maven-plugin 158 | 1.0-M2 159 | 160 | ${project.build.directory}/surefire-reports 161 | . 
162 | WDF TestSuite.txt 163 | 164 | 165 | 166 | test 167 | 168 | test 169 | 170 | 171 | 172 | 173 | 174 | 175 | org.apache.maven.plugins 176 | maven-source-plugin 177 | 2.1.2 178 | 179 | 180 | attach-sources 181 | 182 | jar 183 | 184 | 185 | 186 | 187 | 188 | 189 | org.apache.maven.plugins 190 | maven-resources-plugin 191 | 2.5 192 | 193 | UTF-8 194 | 195 | 196 | 197 | 198 | org.apache.maven.plugins 199 | maven-release-plugin 200 | 2.3.2 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | org.apache.maven.wagon 210 | wagon-ssh-external 211 | 1.0-beta-7 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | org.eclipse.m2e 221 | lifecycle-mapping 222 | 1.0.0 223 | 224 | 225 | 226 | 227 | 228 | 229 | net.alchim31.maven 230 | 231 | 232 | scala-maven-plugin 233 | 234 | 235 | [3.1.0,) 236 | 237 | 238 | compile 239 | testCompile 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /suim-examples/src/main/resources/META-INF/org.apache.uima.fit/types.txt: -------------------------------------------------------------------------------- 1 | classpath*:ex/TutorialTypeSystem.xml 2 | classpath*:org/apache/uima/examples/SourceDocumentInformation.xml 3 | classpath*:desc/type/LexicalUnits.xml 4 | classpath*:desc/type/metadata.xml 5 | 6 | -------------------------------------------------------------------------------- /suim-examples/src/main/resources/ex/RoomNumberAndDateTime.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | org.apache.uima.java 26 | false 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | Aggregate TAE - Room Number and DateTime Annotators 40 | Detects Room Numbers, Dates, and Times 41 | 42 | 43 | 44 | RoomNumber 45 | DateTime 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | org.apache.uima.tutorial.RoomNumber 55 | 56 | org.apache.uima.tutorial.DateAnnot 57 | 58 | 
org.apache.uima.tutorial.TimeAnnot 59 | 60 | 61 | en 62 | 63 | 64 | 65 | 66 | true 67 | true 68 | false 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /suim-examples/src/main/resources/ex/TutorialTypeSystem.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | TutorialTypeSystem 26 | Type System Definition for the tutorial examples - as of Exercise 6 27 | 1.0 28 | The Apache Software Foundation 29 | 30 | 31 | org.apache.uima.tutorial.RoomNumber 32 | 33 | uima.tcas.Annotation 34 | 35 | 36 | building 37 | Building containing this room 38 | uima.cas.String 39 | 40 | 41 | 42 | 43 | org.apache.uima.tutorial.DateTimeAnnot 44 | 45 | uima.tcas.Annotation 46 | 47 | 48 | shortDateString 49 | 50 | uima.cas.String 51 | 52 | 53 | 54 | 55 | org.apache.uima.tutorial.TimeAnnot 56 | 57 | org.apache.uima.tutorial.DateTimeAnnot 58 | 59 | 60 | 61 | org.apache.uima.tutorial.DateAnnot 62 | 63 | org.apache.uima.tutorial.DateTimeAnnot 64 | 65 | 66 | 67 | org.apache.uima.tutorial.Meeting 68 | 69 | uima.tcas.Annotation 70 | 71 | 72 | room 73 | 74 | org.apache.uima.tutorial.RoomNumber 75 | 76 | 77 | date 78 | 79 | org.apache.uima.tutorial.DateAnnot 80 | 81 | 82 | startTime 83 | 84 | org.apache.uima.tutorial.TimeAnnot 85 | 86 | 87 | endTime 88 | 89 | org.apache.uima.tutorial.TimeAnnot 90 | 91 | 92 | 93 | 94 | org.apache.uima.tutorial.UimaAcronym 95 | 96 | uima.tcas.Annotation 97 | 98 | 99 | expandedForm 100 | 101 | uima.cas.String 102 | 103 | 104 | 105 | 106 | org.apache.uima.tutorial.UimaMeeting 107 | 108 | org.apache.uima.tutorial.Meeting 109 | 110 | 111 | org.apache.uima.examples.tokenizer.Token 112 | 113 | uima.tcas.Annotation 114 | 115 | 116 | org.apache.uima.examples.tokenizer.Sentence 117 | 118 | uima.tcas.Annotation 119 | 120 | 121 | -------------------------------------------------------------------------------- 
/suim-examples/src/main/resources/org/apache/uima/tutorial/ex6/uimaAcronyms.txt: -------------------------------------------------------------------------------- 1 | UIMA Unstructured Information Management Architecture 2 | SITH System Integration Testing and Hardening 3 | CPE Collection Processing Engine 4 | CPM Collection Processing Manager 5 | AE Analysis Engine 6 | CAS Common Analysis Structure 7 | JCAS Java Common Analysis Structure -------------------------------------------------------------------------------- /suim-examples/src/main/scala/edu/cmu/lti/suim/examples/Annotators.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package edu.cmu.lti.suim.examples 18 | 19 | import scala.collection.JavaConversions.collectionAsScalaIterable 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.SparkContext.rddToOrderedRDDFunctions 23 | import org.apache.spark.SparkContext.rddToPairRDDFunctions 24 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription 25 | import org.apache.uima.fit.factory.CollectionReaderFactory.createReader 26 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 27 | import org.apache.uima.fit.util.JCasUtil 28 | 29 | import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase 30 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.`type`.Token 31 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader 32 | import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter 33 | import edu.cmu.lti.suim.SparkUimaUtils.makeRDD 34 | import edu.cmu.lti.suim.SparkUimaUtils.process 35 | 36 | 37 | object Annotators { 38 | 39 | def main(args: Array[String]) = { 40 | val sc = new SparkContext(args(0), "App", 41 | System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":")) 42 | 43 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 44 | val rdd = makeRDD(createReader(classOf[TextReader], 45 | ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "data/*.txt", 46 | ResourceCollectionReaderBase.PARAM_LANGUAGE, "en"), sc) 47 | val seg = createEngineDescription(classOf[BreakIteratorSegmenter]) 48 | val tokens = rdd.map(process(_, seg)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[Token])) 49 | val counts = tokens.map(token => token.getCoveredText()).filter(filter(_)).map((_,1)).reduceByKey(_ + _).map(pair => (pair._2, pair._1)).sortByKey(false) 50 | counts.take(20).foreach(println(_)) 51 | } 52 | 53 | def filter(input: String): Boolean = !input.forall(_.isDigit) && input.matches("""\w*""") 54 | } 55 | 
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/App.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2013 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.cmu.lti.suim.examples

import scala.collection.JavaConversions.collectionAsScalaIterable

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext.rddToPairRDDFunctions
import org.apache.uima.examples.cpe.FileSystemCollectionReader
import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription
import org.apache.uima.fit.factory.CollectionReaderFactory.createReader
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
import org.apache.uima.fit.util.JCasUtil
import org.apache.uima.tutorial.RoomNumber
import org.apache.uima.tutorial.ex1.RoomNumberAnnotator

import edu.cmu.lti.suim.SparkUimaUtils.makeRDD
import edu.cmu.lti.suim.SparkUimaUtils.process

/**
 * Example pipeline: reads the documents under data/, runs the UIMA tutorial
 * RoomNumberAnnotator on Spark, and prints how many RoomNumber annotations
 * were found per building.
 */
object App {

  /** Entry point; args(0) is the Spark master URL. */
  def main(args: Array[String]) = {
    val context = new SparkContext(args(0), "App",
      System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":"))

    // Initializes uimaFIT's type-system detection (types.txt on the classpath).
    val ts = TypeSystemDescriptionFactory.createTypeSystemDescription()
    val readerParams = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data")
    // One CAS per document produced by the file-system collection reader.
    val documents = makeRDD(createReader(classOf[FileSystemCollectionReader], readerParams: _*), context)
    val annotator = createEngineDescription(classOf[RoomNumberAnnotator])
    // Annotate each document, then flatten out every RoomNumber annotation.
    val rooms = documents
      .map(process(_, annotator))
      .flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber]))
    // Word-count style aggregation keyed on the building name.
    val perBuilding = rooms
      .map(room => room.getBuilding())
      .map(building => (building, 1))
      .reduceByKey(_ + _)
    perBuilding.foreach(println(_))
  }
}
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/AppWithHDFS.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2013 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | package edu.cmu.lti.suim.examples 18 | 19 | import scala.collection.JavaConversions.collectionAsScalaIterable 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.uima.examples.cpe.FileSystemCollectionReader 23 | import org.apache.uima.fit.factory.AnalysisEngineFactory 24 | import org.apache.uima.fit.factory.CollectionReaderFactory 25 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 26 | import org.apache.uima.fit.util.JCasUtil 27 | import org.apache.uima.tutorial.RoomNumber 28 | import org.apache.uima.tutorial.ex1.RoomNumberAnnotator 29 | 30 | import edu.cmu.lti.suim.SparkUimaUtils.process 31 | import edu.cmu.lti.suim.SparkUimaUtils.sequenceFile 32 | 33 | 34 | object AppWithHDFS { 35 | 36 | def main(args: Array[String]) = { 37 | val sc = new SparkContext(args(0), "App", 38 | System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":")) 39 | 40 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 41 | val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data") 42 | val rdd = sequenceFile(CollectionReaderFactory.createCollectionReader(classOf[FileSystemCollectionReader], params: _*), 43 | "hdfs://localhost:9000/file.txt",sc) 44 | val rnum = AnalysisEngineFactory.createEngineDescription(classOf[RoomNumberAnnotator]) 45 | val rooms = rdd.map(process(_, rnum)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber])) 46 | val counts = rooms.map(room => room.getBuilding()).countByValue() 47 | println(counts) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /suim-examples/src/main/scala/edu/cmu/lti/suim/examples/SparkPipelineExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package edu.cmu.lti.suim.examples 21 | 22 | import java.util.StringTokenizer 23 | 24 | import scala.collection.JavaConversions.bufferAsJavaList 25 | import scala.collection.JavaConversions.collectionAsScalaIterable 26 | import scala.io.Source 27 | 28 | import org.apache.spark.SparkContext 29 | import org.apache.uima.examples.cpe.FileSystemCollectionReader 30 | import org.apache.uima.fit.component.JCasAnnotator_ImplBase 31 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription 32 | import org.apache.uima.fit.factory.CollectionReaderFactory 33 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 34 | import org.apache.uima.fit.util.JCasUtil.select 35 | import org.apache.uima.jcas.JCas 36 | import org.apache.uima.tutorial.Meeting 37 | import org.apache.uima.tutorial.UimaAcronym 38 | import org.apache.uima.tutorial.UimaMeeting 39 | 40 | import edu.cmu.lti.suim.SparkUimaUtils.makeRDD 41 | import edu.cmu.lti.suim.SparkUimaUtils.process 42 | 43 | object SparkPipelineExample { 44 | 45 | def readMap(file: String) = { 46 | val s = Source.fromFile(file) 47 | s.getLines.map(line => { 48 | val pair = line.split("\t") 49 | (pair(0), pair(1)) 50 | }).toMap 51 | } 52 | 53 | def main(args: 
Array[String]) = { 54 | val sc = new SparkContext(args(0), "App", 55 | System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":")) 56 | 57 | // Share variable 58 | val mMap = sc.broadcast(readMap("src/main/resources/org/apache/uima/tutorial/ex6/uimaAcronyms.txt")) 59 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 60 | val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data") 61 | val rdd = makeRDD(CollectionReaderFactory.createReader( 62 | classOf[FileSystemCollectionReader], params: _*), sc) 63 | val result = rdd.map(process(_, createEngineDescription( 64 | createEngineDescription(classOf[UimaAcronymAnnotator]), 65 | createEngineDescription(classOf[UimaMeetingAnnotator])))).cache 66 | result.flatMap(scas => select(scas.jcas, classOf[UimaAcronym])).foreach(println(_)) 67 | result.flatMap(scas => select(scas.jcas, classOf[UimaMeeting])).foreach(println(_)) 68 | } 69 | } 70 | 71 | class UimaAcronymAnnotator extends JCasAnnotator_ImplBase { 72 | 73 | val mMap = org.apache.spark.SparkEnv.get.blockManager.getSingle("broadcast_0").get.asInstanceOf[Map[String, String]] 74 | 75 | override def process(jcas: JCas) { 76 | // go through document word-by-word 77 | val text = jcas.getDocumentText(); 78 | var pos = 0; 79 | val tokenizer = new StringTokenizer(text, """ \t\n\r.<.>/?";:[{]}\|=+()!""", true); 80 | while (tokenizer.hasMoreTokens()) { 81 | val token = tokenizer.nextToken(); 82 | // look up token in map to see if it is an acronym 83 | val expandedForm = mMap.get(token); 84 | if (expandedForm.isDefined) { 85 | // create annotation 86 | val annot = new UimaAcronym(jcas, pos, pos + token.length()); 87 | annot.setExpandedForm(expandedForm.get); 88 | annot.addToIndexes(); 89 | } 90 | // incrememnt pos and go to next token 91 | pos += token.length(); 92 | } 93 | } 94 | } 95 | 96 | 97 | class UimaMeetingAnnotator extends JCasAnnotator_ImplBase { 98 | 99 | val mMap = 
org.apache.spark.SparkEnv.get.blockManager.getSingle("broadcast_0").get.asInstanceOf[Map[String, String]] 100 | 101 | override def process(jcas: JCas) { 102 | // get document text 103 | val text = jcas.getDocumentText(); 104 | 105 | // We iterate over all Meeting annotations, and if we determine that 106 | // the topic of a meeting is UIMA-related, we create a UimaMeeting 107 | // annotation. We add each UimaMeeting annotation to a list, and then 108 | // later go back and add these to the CAS indexes. We need to do this 109 | // because it's not allowed to add to an index that you're currently 110 | // iterating over. 111 | val uimaMeetings = scala.collection.mutable.Buffer[UimaMeeting]() 112 | 113 | select(jcas, classOf[Meeting]).foreach(meeting => { 114 | // get span of text within 50 chars on either side of meeting 115 | // (window size should probably be a config. param) 116 | var begin = meeting.getBegin() - 50 117 | var end = meeting.getEnd() + 50 118 | if (begin < 0) { 119 | begin = 0 120 | } 121 | if (end > text.length()) { 122 | end = text.length() 123 | } 124 | val window = text.substring(begin, end) 125 | 126 | // look for UIMA acronyms within this window 127 | val tokenizer = new StringTokenizer(window, """ \t\n\r.<.>/?";:[{]}\|=+()!"""); 128 | var continue = true 129 | while (tokenizer.hasMoreTokens() && continue) { 130 | val token = tokenizer.nextToken(); 131 | // look up token in map to see if it is an acronym 132 | if (mMap.get(token) != null) { 133 | // create annotation 134 | val annot = new UimaMeeting(jcas, meeting.getBegin(), meeting.getEnd()); 135 | annot.setRoom(meeting.getRoom()); 136 | annot.setDate(meeting.getDate()); 137 | annot.setStartTime(meeting.getStartTime()); 138 | annot.setEndTime(meeting.getEndTime()); 139 | // Add annotation to a list, to be later added to the 140 | // indexes. 141 | // We need to do this because it's not allowed to add to an 142 | // index that you're currently iterating over. 
143 | uimaMeetings.add(annot); 144 | continue = false 145 | } 146 | } 147 | }) 148 | uimaMeetings.foreach(meeting => meeting.addToIndexes()) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /suim-examples/src/test/scala/spark-uima-tools/AppSpec.scala: -------------------------------------------------------------------------------- 1 | package cmu.edu.lti.suim 2 | 3 | import org.scalatest.FlatSpec 4 | import org.scalatest.matchers.ShouldMatchers 5 | 6 | class AppSpec extends FlatSpec with ShouldMatchers { 7 | "An App" should "pass" in { 8 | (1) should equal(1) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /suim-java/pom.xml: -------------------------------------------------------------------------------- 1 | 5 | 4.0.0 6 | 7 | edu.cmu.lti 8 | suim-java 9 | 0.0.1-SNAPSHOT 10 | SUIM Java 11 | 12 | 13 | 14 | 15 | 16 | org.apache.maven.plugins 17 | maven-compiler-plugin 18 | 19 | 1.6 20 | 1.6 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | edu.cmu.lti 30 | suim-scala 31 | 0.0.1-SNAPSHOT 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /suim-java/src/main/java/edu/cmu/lti/suim/JavaSparkUima.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package edu.cmu.lti.suim; 18 | 19 | import java.util.List; 20 | 21 | import org.apache.hadoop.io.NullWritable; 22 | import org.apache.spark.api.java.JavaRDD; 23 | import org.apache.spark.api.java.JavaSparkContext; 24 | import org.apache.spark.api.java.function.Function; 25 | import org.apache.uima.analysis_engine.AnalysisEngineDescription; 26 | import org.apache.uima.collection.CollectionReader; 27 | import org.apache.uima.fit.factory.AnalysisEngineFactory; 28 | import org.apache.uima.resource.ResourceInitializationException; 29 | 30 | public final class JavaSparkUima { 31 | 32 | public static JavaRDD sequenceFile(CollectionReader reader, String uri, JavaSparkContext sc) throws Exception { 33 | SparkUimaUtils.createSequenceFile(reader, uri); 34 | return sc.sequenceFile(uri, NullWritable.class, SCAS.class).values(); 35 | } 36 | 37 | public static JavaRDD makeRDD(CollectionReader reader, JavaSparkContext sc) throws Exception { 38 | List buffer = SparkUimaUtils.readFrom(reader); 39 | return sc.parallelize(buffer); 40 | } 41 | 42 | public final static class PipelineFunction extends Function { 43 | 44 | private static final long serialVersionUID = -6881223764488277676L; 45 | 46 | private final AnalysisEngineDescription description; 47 | 48 | public PipelineFunction(AnalysisEngineDescription... 
descs) throws ResourceInitializationException { 49 | this.description = AnalysisEngineFactory.createEngineDescription(descs); 50 | } 51 | 52 | public PipelineFunction(AnalysisEngineDescription desc) { 53 | this.description = desc; 54 | } 55 | 56 | public SCAS call(SCAS scas) { 57 | return SparkUimaUtils.process(scas, description); 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /suim-scala/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | edu.cmu.lti 5 | suim-scala 6 | 0.0.1-SNAPSHOT 7 | SUIM Scala 8 | 9 | 10 | 2.9.3 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | org.apache.uima 20 | uimaj-examples 21 | 2.4.2 22 | 23 | 24 | org.apache.uima 25 | uimaj-core 26 | 2.4.2 27 | 28 | 29 | org.apache.uima 30 | uimafit-core 31 | 2.0.0 32 | 33 | 34 | org.apache.spark 35 | spark-core_2.9.3 36 | 0.8.0-incubating 37 | 38 | 39 | org.scala-lang 40 | scala-library 41 | ${scala.version} 42 | 43 | 44 | 45 | org.scalatest 46 | scalatest_2.9.2 47 | 1.7.2 48 | test 49 | 50 | 51 | 52 | org.scalamock 53 | scalamock-scalatest-support_2.9.2 54 | 2.4 55 | test 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | net.alchim31.maven 67 | scala-maven-plugin 68 | 3.1.0 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | src/main/scala 78 | src/test/scala 79 | 80 | 81 | 82 | 83 | 84 | org.apache.maven.plugins 85 | maven-compiler-plugin 86 | 87 | 1.6 88 | 1.6 89 | 90 | 91 | 92 | 93 | net.alchim31.maven 94 | scala-maven-plugin 95 | 3.1.0 96 | 97 | incremental 98 | 99 | -unchecked 100 | -deprecation 101 | -explaintypes 102 | 103 | 104 | 105 | main 106 | spark-uima-tools.App 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | compile 115 | testCompile 116 | 117 | 118 | 119 | -make:transitive 120 | -dependencyfile 121 | ${project.build.directory}/.scala_dependencies 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | org.apache.maven.plugins 130 | maven-surefire-plugin 131 | 2.12 132 | 133 | true 
134 | false 135 | -Xmx1024m 136 | 137 | **/*Spec.scala 138 | 139 | 140 | **/*Test.scala 141 | 142 | 143 | 144 | 145 | 146 | org.scalatest 147 | scalatest-maven-plugin 148 | 1.0-M2 149 | 150 | ${project.build.directory}/surefire-reports 151 | . 152 | WDF TestSuite.txt 153 | 154 | 155 | 156 | test 157 | 158 | test 159 | 160 | 161 | 162 | 163 | 164 | 165 | org.apache.maven.plugins 166 | maven-source-plugin 167 | 2.1.2 168 | 169 | 170 | attach-sources 171 | 172 | jar 173 | 174 | 175 | 176 | 177 | 178 | 179 | org.apache.maven.plugins 180 | maven-resources-plugin 181 | 2.5 182 | 183 | UTF-8 184 | 185 | 186 | 187 | 188 | org.apache.maven.plugins 189 | maven-release-plugin 190 | 2.3.2 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | org.apache.maven.wagon 200 | wagon-ssh-external 201 | 1.0-beta-7 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | org.eclipse.m2e 211 | lifecycle-mapping 212 | 1.0.0 213 | 214 | 215 | 216 | 217 | 218 | 219 | net.alchim31.maven 220 | 221 | 222 | scala-maven-plugin 223 | 224 | 225 | [3.1.0,) 226 | 227 | 228 | compile 229 | testCompile 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /suim-scala/src/main/scala/edu/cmu/lti/suim/SCAS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package edu.cmu.lti.suim 18 | 19 | import java.io.ByteArrayInputStream 20 | import java.io.ByteArrayOutputStream 21 | import java.io.DataInput 22 | import java.io.DataOutput 23 | import java.io.Externalizable 24 | import java.io.ObjectInput 25 | import java.io.ObjectOutput 26 | 27 | import org.apache.hadoop.io.Writable 28 | import org.apache.uima.cas.CAS 29 | import org.apache.uima.cas.impl.Serialization 30 | import org.apache.uima.fit.factory.JCasFactory 31 | 32 | object SCAS { 33 | 34 | def read(in: DataInput) = { 35 | val scas = new SCAS(); 36 | scas.readFields(in); 37 | scas 38 | } 39 | } 40 | 41 | class SCAS(val cas: CAS) extends Externalizable with Writable { 42 | 43 | def this() { 44 | this(JCasFactory.createJCas().getCas()) 45 | } 46 | 47 | override def readExternal(in: ObjectInput) { 48 | readFields(in) 49 | } 50 | 51 | override def writeExternal(out: ObjectOutput) { 52 | write(out) 53 | } 54 | 55 | def jcas = cas.getJCas() 56 | 57 | override def write(out: DataOutput) { 58 | val baos = new ByteArrayOutputStream(); 59 | Serialization.serializeWithCompression(cas, baos) 60 | out.writeInt(baos.size) 61 | out.write(baos.toByteArray) 62 | } 63 | 64 | override def readFields(in: DataInput) { 65 | val size = in.readInt(); 66 | val bytes = new Array[Byte](size) 67 | in.readFully(bytes); 68 | val bais = new ByteArrayInputStream(bytes) 69 | Serialization.deserializeCAS(cas, bais); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /suim-scala/src/main/scala/edu/cmu/lti/suim/SparkUimaUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package edu.cmu.lti.suim 18 | 19 | import java.net.URI 20 | 21 | import scala.collection.JavaConversions.asScalaBuffer 22 | import scala.collection.JavaConversions.bufferAsJavaList 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.FileSystem 26 | import org.apache.hadoop.fs.Path 27 | import org.apache.hadoop.io.IOUtils 28 | import org.apache.hadoop.io.NullWritable 29 | import org.apache.hadoop.io.SequenceFile 30 | import org.apache.spark.SparkContext 31 | import org.apache.spark.SparkContext.rddToPairRDDFunctions 32 | import org.apache.spark.SparkContext.writableWritableConverter 33 | import org.apache.uima.analysis_engine.AnalysisEngineDescription 34 | import org.apache.uima.collection.CollectionReader 35 | import org.apache.uima.fit.factory.AnalysisEngineFactory 36 | import org.apache.uima.fit.factory.JCasFactory 37 | 38 | object SparkUimaUtils { 39 | 40 | def createSequenceFile(reader: CollectionReader, uri: String) { 41 | val conf = new Configuration() 42 | val fs = FileSystem.get(URI.create(uri), conf) 43 | val path = new Path(uri) 44 | val nw = NullWritable.get 45 | val writer = SequenceFile.createWriter(fs, conf, path, nw.getClass(), classOf[SCAS]) 46 | while (reader.hasNext()) { 47 | val jcas = JCasFactory.createJCas(); 48 | val cas = jcas.getCas() 49 | reader.getNext(cas) 50 | val scas = new SCAS(cas) 51 | writer.append(nw, scas) 52 | } 53 | IOUtils.closeStream(writer) 54 | } 55 | 56 | def sequenceFile(reader: CollectionReader, uri: String, sc: SparkContext) = { 
57 | createSequenceFile(reader, uri) 58 | sc.sequenceFile[NullWritable, SCAS](uri).values 59 | } 60 | 61 | def readFrom(reader: CollectionReader): java.util.List[SCAS] = { 62 | val buffer = collection.mutable.ArrayBuffer[SCAS]() 63 | while (reader.hasNext()) { 64 | val jcas = JCasFactory.createJCas(); 65 | val cas = jcas.getCas() 66 | reader.getNext(cas) 67 | buffer += new SCAS(cas) 68 | } 69 | buffer 70 | } 71 | 72 | def makeRDD(reader: CollectionReader, sc: SparkContext) = { 73 | val buffer = readFrom(reader) 74 | sc.parallelize(buffer) 75 | } 76 | 77 | def process(scas: SCAS, description: AnalysisEngineDescription) = { 78 | val ae = AnalysisEngineFactory.createEngine(description) 79 | ae.process(scas.jcas) 80 | scas 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /suim-scala/src/test/scala/spark-uima-tools/AppSpec.scala: -------------------------------------------------------------------------------- 1 | package cmu.edu.lti.suim 2 | 3 | import org.scalatest.FlatSpec 4 | import org.scalatest.matchers.ShouldMatchers 5 | 6 | class AppSpec extends FlatSpec with ShouldMatchers { 7 | "An App" should "pass" in { 8 | (1) should equal(1) 9 | } 10 | } 11 | --------------------------------------------------------------------------------