├── .gitignore
├── .travis.yml
├── LICENSE
├── README.markdown
├── pom.xml
├── suim-examples
├── data
│ ├── Apache_UIMA.txt
│ ├── IBM_LifeSciences.txt
│ ├── New_IBM_Fellows.txt
│ ├── SeminarChallengesInSpeechRecognition.txt
│ ├── TrainableInformationExtractionSystems.txt
│ ├── UIMASummerSchool2003.txt
│ ├── UIMA_Seminars.txt
│ ├── WatsonConferenceRooms.txt
│ └── xml
│ │ ├── IBM_LifeSciences.xml
│ │ ├── New_IBM_Fellows.xml
│ │ ├── SeminarChallengesInSpeechRecognition.xml
│ │ ├── TrainableInformationExtractionSystems.xml
│ │ ├── UIMASummerSchool2003.xml
│ │ ├── UIMA_Seminars.xml
│ │ └── WatsonConferenceRooms.xml
├── pom.xml
└── src
│ ├── main
│ ├── resources
│ │ ├── META-INF
│ │ │ └── org.apache.uima.fit
│ │ │ │ └── types.txt
│ │ ├── ex
│ │ │ ├── RoomNumberAndDateTime.xml
│ │ │ └── TutorialTypeSystem.xml
│ │ └── org
│ │ │ └── apache
│ │ │ └── uima
│ │ │ └── tutorial
│ │ │ └── ex6
│ │ │ └── uimaAcronyms.txt
│ └── scala
│ │ └── edu
│ │ └── cmu
│ │ └── lti
│ │ └── suim
│ │ └── examples
│ │ ├── Annotators.scala
│ │ ├── App.scala
│ │ ├── AppWithHDFS.scala
│ │ └── SparkPipelineExample.scala
│ └── test
│ └── scala
│ └── spark-uima-tools
│ └── AppSpec.scala
├── suim-java
├── pom.xml
└── src
│ └── main
│ └── java
│ └── edu
│ └── cmu
│ └── lti
│ └── suim
│ └── JavaSparkUima.java
└── suim-scala
├── pom.xml
└── src
├── main
└── scala
│ └── edu
│ └── cmu
│ └── lti
│ └── suim
│ ├── SCAS.scala
│ └── SparkUimaUtils.scala
└── test
└── scala
└── spark-uima-tools
└── AppSpec.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | logs/
2 | target/
3 | *.DS_Store
4 | *.releaseBackup
5 | release.properties
6 | *.iml
7 | *.iws
8 | *.ipr
9 | .idea/
10 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - 2.9.2
4 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright 2012 Twitter Inc
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
1 | # SUIM
2 |
3 | Spark for Unstructured Information, provides a thin abstraction layer for [UIMA](http://uima.apache.org/)
4 | on top of [Spark](http://spark.apache.org/).
5 | SUIM leverages Spark's resilient distributed datasets (RDDs) to run UIMA pipelines using uimaFIT; SUIM pipelines are
6 | distributed across the nodes of a cluster and can be operated on in parallel [1].
7 |
8 | SUIM allows you to run analytical pipelines on the resulting (or intermediate) `CAS` to execute further text analytics or
9 | machine learning algorithms.
10 |
11 | ## Examples
12 |
13 | #### Count buildings from the UIMA tutorial.
14 |
15 | Using the `RoomAnnotator` from the UIMA tutorial:
16 |
17 |
18 | ```scala
19 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription()
20 | val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data")
21 | val rdd = makeRDD(createCollectionReader(classOf[FileSystemCollectionReader], params: _*), sc)
22 | val rnum = createEngineDescription(classOf[RoomNumberAnnotator])
23 | val rooms = rdd.map(process(_, rnum)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber]))
24 | val counts = rooms.map(room => room.getBuilding()).map((_,1)).reduceByKey(_ + _)
25 | counts.foreach(println(_))
26 | ```
27 |
28 | If the collection is too large to fit in memory, or you already have a collection of `SCAS`es, use an HDFS RDD:
29 |
30 | ```scala
31 | val rdd = sequenceFile(createCollectionReader(classOf[FileSystemCollectionReader], params: _*),
32 | "hdfs://localhost:9000/documents", sc)
33 | ```
34 |
35 | #### Tokenize and count words with DKPro Core
36 |
37 | Use DKPro Core [2] to tokenize and Spark to do token level analytics.
38 |
39 | ```scala
40 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription()
41 | val rdd = makeRDD(createCollectionReader(classOf[TextReader],
42 | ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "data",
43 | ResourceCollectionReaderBase.PARAM_LANGUAGE, "en",
44 | ResourceCollectionReaderBase.PARAM_PATTERNS, Array("[+]*.txt")), sc)
45 | val seg = createPrimitiveDescription(classOf[BreakIteratorSegmenter])
46 | val tokens = rdd.map(process(_, seg)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[Token]))
47 | val counts = tokens.map(token => token.getCoveredText())
48 | .filter(filter(_))
49 | .map((_,1)).reduceByKey(_ + _)
50 | .map(pair => (pair._2, pair._1)).sortByKey(true)
51 | counts.foreach(println(_))
52 | ```
53 |
54 | ### Common Tasks
55 |
56 | To build:
57 |
58 | mvn compile
59 |
60 | To run:
61 |
62 | mvn scala:run
63 |
64 | To test:
65 |
66 | mvn test
67 |
68 | To create standalone with dependencies:
69 |
70 | mvn package
71 | java -jar target/spark-uima-tools-0.0.1-SNAPSHOT-jar-with-dependencies.jar
72 |
73 | ## References
74 | * [1] http://spark.incubator.apache.org/docs/latest/scala-programming-guide.html
75 | * [2] https://code.google.com/p/dkpro-core-asl/
76 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
5 | 4.0.0
6 |
7 | edu.cmu.lti
8 | suim
9 | 0.0.1-SNAPSHOT
10 | SUIM
11 | pom
12 |
13 | 2013
14 | https://github.com/oaqa/suim
15 |
16 |
17 | org.sonatype.oss
18 | oss-parent
19 | 7
20 |
21 |
22 |
23 | github.com
24 | https://github.com/oaqa/suim/issues
25 |
26 |
27 |
28 |
29 | The Apache Software License, Version 2.0
30 | http://www.apache.org/licenses/LICENSE-2.0.txt
31 | repo
32 |
33 |
34 |
35 |
36 | git@github.com:oaqa/suim.git
37 | scm:git:git@github.com:oaqa/suim.git
38 | scm:git:git@github.com:oaqa/suim.git
39 |
40 |
41 |
42 | 1.6
43 | 1.6
44 | UTF-8
45 |
46 |
47 |
48 |
49 | twttr
50 | twttr
51 | http://maven.twttr.com
52 |
53 |
54 |
55 |
56 |
57 | scala-tools.org
58 | Scala-Tools Maven2 Repository
59 | http://scala-tools.org/repo-releases
60 |
61 |
62 |
63 |
64 | suim-java
65 | suim-scala
66 | suim-examples
67 |
68 |
69 |
70 |
71 |
72 |
73 | org.apache.maven.plugins
74 | maven-compiler-plugin
75 |
76 | 1.6
77 | 1.6
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 | junit
87 | junit
88 | 3.8.1
89 | test
90 |
91 |
92 |
93 |
--------------------------------------------------------------------------------
/suim-examples/data/Apache_UIMA.txt:
--------------------------------------------------------------------------------
1 | Welcome to Apache UIMA (Unstructured Information Management Architecture), a incubator project of the Apache Software Foundation (ASF).
2 | Our goal is a thriving community of users and developers of UIMA frameworks, supporting components for analysing unstructured content such as text, audio and video.
3 |
4 | What is UIMA?
5 |
6 | Unstructured Information Management applications are software systems that analyze large volumes of unstructured information in order to discover knowledge that is relevant to an end user.
7 | UIMA is a framework and SDK for developing such applications. An example UIM application might ingest plain text and identify entities, such as persons, places, organizations; or relations, such as works-for or located-at.
8 | UIMA enables such an application to be decomposed into components, for example "language identification" -> "language specific segmentation" -> "sentence boundary detection" -> "entity detection (person/place names etc.)".
9 | Each component must implement interfaces defined by the framework and must provide self-describing metadata via XML descriptor files. The framework manages these components and the data flow between them. Components are written in Java or C++; the data that flows between components is designed for efficient mapping between these languages.
10 | UIMA additionally provides capabilities to wrap components as network services, and can scale to very large volumes by replicating processing pipelines over a cluster of networked nodes.
11 |
12 | Apache UIMA is an Apache-licensed open source implementation of the UIMA specification (that specification is, in turn, being developed concurrently by a technical committee within OASIS , a standards organization).
13 | We invite and encourage you to participate in both the implementation and specification efforts.
14 |
15 | UIMA is a component framework for analysing unstructured content such as text, audio and video.
16 | It comprises an SDK and tooling for composing and running analytic components written in Java and C++, with some support for Perl, Python and TCL.
17 |
18 |
19 | Apache UIMA mailing lists:
20 |
21 | Users - uima-user@incubator.apache.org
22 | Developers - uima-dev@incubator.apache.org
23 | Commits - uima-commits@incubator.apache.org
24 |
25 |
26 | Apache UIMA project committers:
27 |
28 | Michael Baessler
29 | Edward Epstein
30 | Thilo Goetz
31 | Adam Lally
32 | Marshall Schor
33 |
34 |
35 | Apache UIMA project Mentors:
36 |
37 | Ken Coar (ASF member and Vice President)
38 | Sam Ruby (ASF member)
--------------------------------------------------------------------------------
/suim-examples/data/IBM_LifeSciences.txt:
--------------------------------------------------------------------------------
1 | "Life sciences is one of the emerging markets at the heart of IBM's growth strategy," said John M. Thompson, IBM senior vice president & group executive, Software. "This investment is the first of a number of steps we will be taking to advance IBM's life sciences initiatives." In his role as newly appointed IBM Corporation vice chairman, effective September 1, Mr. Thompson will be responsible for integrating and accelerating IBM's efforts to exploit life sciences and other emerging growth areas.
2 |
3 | IBM estimates the market for IT solutions for life sciences will skyrocket from $3.5 billion today to more than $9 billion by 2003. Driving demand is the explosive growth in genomic, proteomic and pharmaceutical research. For example, the Human Genome Database is approximately three terabytes of data, or the equivalent of 150 million pages of information. The volume of life sciences data is doubling every six months.
4 |
5 | "All of this genetic data is worthless without the information technology that can help scientists manage and analyze it to unlock the pathways that will lead to new cures for many of today's diseases," said Dr. Caroline Kovac, vice president of IBM's new Life Sciences unit. "IBM can help speed this process by enabling more efficient interpretation of data and sharing of knowledge. The potential for change based on innovation in life sciences is bigger than the change caused by the digital circuit."
6 |
7 | Among the life sciences initiatives already underway at IBM are:
8 | - DiscoveryLink* -- For the first time, researchers using this combination of innovative middleware and integration services can join together information from many sources to solve complex medical research problems. DiscoveryLink creates a "virtual database" that permits data to be accessed and extracted from multiple data sources used in research and development projects. This IT solution can dramatically improve product cycle time and lower development costs for pharmaceutical, biotechnology and agri-science companies.
9 |
10 | - Blue Gene* - IBM is building a supercomputer 100 times faster than any available today designed to advance understanding of the mechanisms behind protein folding through large-scale biomolecular simulation. In December, IBM committed $100 million to this five-year research project to advance the state-of-the-art in supercomputing for biological applications.
11 | - Bio-Dictionary* -- IBM has compiled a protein dictionary containing some 30 million protein "words" designed to accelerate the understanding of protein shapes and functions.Bio-Dictionaries for selected genomes, as well as bioinformatics algorithms for pattern discovery and other relevant applications, are available to scientists and researchers for noncommercial use through a website dedicated to life sciences content at http://www.research.ibm.com/compsci/compbio/.
12 |
13 | * Indicates trademark or registered trademark of IBM Corporation.
--------------------------------------------------------------------------------
/suim-examples/data/New_IBM_Fellows.txt:
--------------------------------------------------------------------------------
1 | IBM today elevated five employees to the title of IBM Fellow -- its most prestigious technical honor. The company also presented more than $2.8 million in cash awards to employees whose technical innovation have yielded exceptional value to the company and its customers.
2 |
3 | IBM conferred the accolades and awards at its 2003 Corporate Technical Recognition Event (CTRE) in Scottsdale, Ariz. CTRE is a 40-year tradition at IBM, established to recognize exceptional technical employees and reward them for extraordinary achievements and contributions to the company's technology leadership.
4 |
5 | "Our technical employees are among the best and brightest innovators in the world. They share a passion for excellence that defines their work and permeates the products and services IBM delivers to its customers," said Nick Donofrio, senior vice president, technology and manufacturing for IBM. "CTRE provides the means for us to honor those who have distinguished themselves as exceptional leaders among their peers."
6 |
7 | Among the special honorees at the 2003 CTRE are five employees who earned the coveted distinction of IBM Fellow:
8 |
9 |
10 | - Grady Booch, chief scientist of Rational Software, IBM Software Group. Recognized internationally for his innovative work on software architecture, modeling, and software engineering process. Mr. Booch is one of the original authors of the Unified Modeling Language (UML), the industry-standard language of blueprints for software-intensive systems.
11 |
12 | - Dr. Donald Chamberlin, researcher, IBM Almaden Research Center. An expert in relational database languages, Dr. Chamberlin is co- inventor of SQL, the language that energized the relational database market. He has also influenced the creation of XQuery, one of a new generation of database query languages covering structured, semi-structured and unstructured data.
13 |
14 | - Dr. George Galambos, chief technology officer, IBM Global Services (IGS) in Canada; the first Fellow from Canada. Dr. Galambos specializes in high-performance, high availability designs, operational effectiveness, and risk assessment/mitigation, focusing on systems engineering and architecture reuse that enhances efficiency and stability. He is a principal driver of and contributor to the widely acclaimed "Patterns for e-business" and the Enterprise Solution Structure Reference Architectures, widely used by IGS in customer engagements.
15 |
16 | - Rod Smith, vice president of Internet emerging technologies, IBM Software Group. A leader in the areas of object-oriented programming, visual development tools, Java, XML, and Web Services. Rod also was the chief technical strategist for focusing the Java platform for use in middleware solutions, in particular initiating contributions to the development of the J2EE.
17 |
18 | - Charles Webb, eServer processor design, IBM Systems Group. Charles Webb has led the reinvention of IBM's eServer zSeries microprocessor designs and roadmap, including the z900 server, where he provided the bridge among architecture, hardware, compilers and system software, defining major portions of the 64- bit architecture and beyond.
19 |
20 |
21 | The title of IBM Fellow is the company's most preeminent technical distinction and is granted in recognition of outstanding and sustained technical achievements in engineering, programming, science and technology. Only 175 individuals have earned this designation in the company's history and, including the newly named Fellows, 56 are active employees. IBM Fellows are encouraged to further enhance their potential for creative achievements and typically work on special projects or research initiatives that lead the company in exciting new directions.
22 |
23 |
--------------------------------------------------------------------------------
/suim-examples/data/SeminarChallengesInSpeechRecognition.txt:
--------------------------------------------------------------------------------
1 | UIT Seminar: Challenges in Speech Recognition
2 | August 8, 2003 10:30 AM - 11:30 AM
3 | Lawrence Rabiner , Associate Director CAIP, Rutgers
4 | University, Professor Univ. of Santa Barbara
5 | Yorktown 20-043
6 | Availability: Open
7 |
8 | Speech recognition has matured to the point where it
9 | is now being widely applied in a range of applications
10 | including desktop dictation, cell phone name dialing,
11 | agent technology, automated operator services,
12 | telematics, call center automation and help desks.
13 |
14 | Although the technology is often good enough for many
15 | of these applications, there remain key challenges in
16 | virtually every aspect of speech recognition that
17 | prevent the technology from being used ubiquitously in
18 | any environment, for any speaker, and for an even
19 | broader range of applications. This talk will analyze
20 | the ‘Speech Circle’ that enables a person to maintain
21 | a dialog with a machine using speech recognition,
22 | spoken language understanding, dialog management and
23 | spoken language generation, and finally text-to-speech
24 | synthesis, and show where significant progress has
25 | been made, and where there remain critical problems
26 | that need to be addressed and solved.
27 |
28 | The talk will include several audio and video examples
29 | of speech recognition and speech understanding systems
30 | that have been studied in the laboratory to illustrate
31 | the challenges that remain to be solved before speech
32 | recognition is considered a solved problem.
33 |
34 |
35 |
--------------------------------------------------------------------------------
/suim-examples/data/TrainableInformationExtractionSystems.txt:
--------------------------------------------------------------------------------
1 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems
2 |
3 | August 19, 2003 02:00 PM - 03:30 PM
4 | David Johnson, Frank Oles, Tong Zhang(IBM Research)
5 | Hawthorne GN-F15
6 | Availability: Open
7 |
8 | The technical objective of the TIES project is to build customizable systems that can identify named entities in text, such as persons, organizations, and locations, as well as identifying relations between those entities. The technical approach is to develop new statistical and symbolic machine learning algorithms in service of the technical objective. Also, we are working on combining statistical with symbolic techniques. The first part of this talk, given by David E. Johnson, will provide a general overview of the goals of the TIES project. The second part, given by Tong Zhang, will provide background on applying statistical machine learning to this problem domain. Tong will also describe the particular statistical approach taken, which is termed Robust Risk Minimization (RMM). The final part will be given by Frank J. Oles. Frank will introduce his theory of precedence-inclusion patterns. Precedence-inclusion patterns are mathematical structures possessing multiple interacting strict partial orders that satisfy axioms generalizing the familiar properties of irreflexivity and transitivity. This very general theory provides a radically new approach to symbolic, as opposed to statistical, pattern generalization that can be applied to relational learning in a number of settings, including learning based on text, on images, or on videos.
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/suim-examples/data/UIMASummerSchool2003.txt:
--------------------------------------------------------------------------------
1 | UIMA Summer School
2 |
3 | August 26, 2003
4 | UIMA 101 - The New UIMA Introduction
5 | (Hands-on Tutorial)
6 | 9:00AM-5:00PM in HAW GN-K35
7 |
8 | August 28, 2003
9 | FROST Tutorial
10 | 9:00AM-5:00PM in HAW GN-K35
11 |
12 | September 15, 2003
13 | UIMA 201: UIMA Advanced Topics
14 | (Hands-on Tutorial)
15 | 9:00AM-5:00PM in HAW 1S-F53
16 |
17 | September 17, 2003
18 | The UIMA System Integration Test and Hardening Service
19 | The "SITH"
20 | 3:00PM-4:30PM in HAW GN-K35
21 |
22 |
23 |
24 | UIMA Summer School Tutorial and Presentation Details
25 | UIMA 101: The new UIMA tutorial
26 | Tuesday August 26 9:00AM - 4:30PM in GN-K35
27 |
28 | UIMA 101 is a hands-on programming tutorial.
29 |
30 | UIMA 101 is intended for people who want a first introductory course to UIMA or for people who would like a refresher.
31 |
32 | The tutorial covers the same concepts in the first UIMA tutorial given in 3Q 2002 except for some key updates:
33 |
34 | 1) It uses a new interface to the CAS that makes it more natural to access and update CAS feature structures using ordinary Java objects (i.e., the JCAS) and
35 | 2) It uses updated TAE interfaces that give the application developer more control over managing multiple CASs.
36 |
37 | Please NOTE expert users of UIMA can skip this one and should consider attending the Advanced Topics tutorial.
38 |
39 | Prerequisites for the UIMA 101 Tutorial
40 | 1) Java Programming
41 | 2) Some experience with Eclipse IDE helpful
42 |
43 | FROST Tutorial
44 | August 28 9:00AM - 5:00PM in GN-K35
45 |
46 | Visitors from the FROST team will be here to talk to us about FROST.
47 |
48 | UIMA 201: The UIMA Advanced Topics Tutorial
49 | September 15: 9:00AM - 5:30PM in Hawthorne 1S-F53
50 |
51 | UIMA 201 will introduce some new UIMA concepts and walk the student through hands-on examples.
52 |
53 | The advanced topics tutorial is designed for people who have some experience with UIMA and want
54 | to use new capabilities of UIMA 1.0 to address one or more of the following
55 | Advanced Topics:
56 |
57 | 1) Collection Processing and Collection Processing Engines (CPEs)
58 | 2) Multi-Threading and CAS Pooling
59 | 3) Using the UIMA adapter framework to integrate network TAEs with Java TAEs
60 | 4) A Semantic Search Application that brings it all together
61 |
62 | Prerequisites for UIMA 201
63 | 1) UIMA 101 Tutorial OR Extensive UIMA Experience
64 |
65 | The UIMA Integration Test bed Service (The "SITH")
66 | September 17 3:00PM - 4:30PM in HAW GN-K35
67 |
68 | We have developed the first version of the UIMA Integration Test bed service.
69 |
70 | This service is being developed to help test, evaluate, certify and publish UIMA compliant components.
71 |
72 | In this talk we will explain the service and what it is intended to provide the UIMA community. We will address the following topics:
73 |
74 | 1. SITH Services
75 | 2. How to submit components and what to expect in return
76 | 3. Overview of the test bed implementation using Collection Processing UIMA and Juru.
77 | 4. Next Steps for the SITH
78 |
79 |
80 |
--------------------------------------------------------------------------------
/suim-examples/data/UIMA_Seminars.txt:
--------------------------------------------------------------------------------
1 | Upcoming UIMA Seminars
2 |
3 | April 7, 2004 Distillery Lunch Seminar
4 | UIMA and its Metadata
5 | 12:00PM-1:00PM in HAW GN-K35.
6 |
7 | Dave Ferrucci will give a UIMA overview and discuss the types of component metadata that UIMA components provide. Jon Lenchner will give a demo of the Text Analysis Engine configurator tool.
8 |
9 |
10 | April 16, 2004 KM & I Department Tea
11 | Title: An Eclipse-based TAE Configurator Tool
12 | 3:00PM-4:30PM in HAW GN-K35 .
13 |
14 | Jon Lenchner will demo an Eclipse plugin for configuring TAE descriptors, which will be available soon for you to use. No more editing XML descriptors by hand!
15 |
16 |
17 | May 11, 2004 UIMA Tutorial
18 | 9:00AM-5:00PM in HAW GN-K35.
19 |
20 | This is a full-day, hands-on tutorial on UIMA, covering the development of Text Analysis Engines and Collection Processing Engines, as well as how to include these components in your own applications.
21 |
--------------------------------------------------------------------------------
/suim-examples/data/WatsonConferenceRooms.txt:
--------------------------------------------------------------------------------
1 | Conference Rooms at Watson:
2 | Location Capacity Wall Phone Ext.
3 |
4 | Classroom Style
5 | HAW J2-B34 Seats 12 tieline 863-3130
6 | HAW J2-N07 Seats 24 tieline 863-3210
7 | YKT 20-001 Seats 36 tieline 862-4304
8 | YKT 20-051 Seats 18 tieline 862-4307
9 |
10 | Conference Style
11 | HAW 2N-F28 Seats 20 tieline 863-7583
12 | HAW 4N-B15 Seats 14 tieline 863-7126
13 | HAW 4N-B17 Seats 10 tieline 863-7089
14 | HAW 4S-K21 Seats 16 tieline 863-6386
15 | HAW GN-F14 Seats 12 tieline 863-6770
16 | HAW GN-K30 Seats 12 tieline 863-7335
17 | HAW GN-K36 Seats 10 tieline 863-6098
18 | HAW J1-N14 Seats 24 tieline 863-3629
19 | HAW J2-A16 Seats 12 tieline 863-3240
20 | HAW J2-G27 Seats 15 tieline 863-3150
21 | HAW J2-M24 Seats 8 tieline 863-3160
22 | YKT 03-135 Seats 8 tieline 862-1696
23 | YKT 03-235 Seats 8 tieline 862-4278
24 | YKT 05-135 Seats 8 tieline 862-3477
25 | YKT 05-235 Seats 8 tieline 862-4279
26 | YKT 20-006 Seats 8 tieline 862-4301
27 | YKT 20-059 Seats 20 tieline 862-4308
28 | YKT 35-132 Seats 8 tieline 862-2873
29 | YKT 35-232 Seats 8 tieline 862-2860
30 | YKT 38-023 Seats 8 tieline 862-3299
31 | YKT 39-132 Seats 8 tieline 862-3486
32 | YKT 40-100 Seats 20 tieline 862-4199
33 | YKT 40-200 Seats 20 tieline 862-1379
34 |
35 | Other
36 | HAW GN-K35 Seats 24 tieline 863-6104
37 |
38 | Theater Style
39 | HAW 1S-F40 Seats 30 tieline 863-6396
40 | YKT 20-043 Seats 50 tieline 862-4306
41 |
42 | Video Conference Room
43 | YKT 32-026 Seats 25 tieline 862-3917
44 |
--------------------------------------------------------------------------------
/suim-examples/data/xml/IBM_LifeSciences.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 | IBM announces $100 Million investment in Life Sciences
26 | 16 August 2000
27 | "Life sciences is one of the emerging markets at the heart of IBM's growth strategy," said John M. Thompson, IBM senior vice president & group executive, Software. "This investment is the first of a number of steps we will be taking to advance IBM's life sciences initiatives." In his role as newly appointed IBM Corporation vice chairman, effective September 1, Mr. Thompson will be responsible for integrating and accelerating IBM's efforts to exploit life sciences and other emerging growth areas.
28 |
29 | IBM estimates the market for IT solutions for life sciences will skyrocket from $3.5 billion today to more than $9 billion by 2003. Driving demand is the explosive growth in genomic, proteomic and pharmaceutical research. For example, the Human Genome Database is approximately three terabytes of data, or the equivalent of 150 million pages of information. The volume of life sciences data is doubling every six months.
30 |
31 | "All of this genetic data is worthless without the information technology that can help scientists manage and analyze it to unlock the pathways that will lead to new cures for many of today's diseases," said Dr. Caroline Kovac, vice president of IBM's new Life Sciences unit. "IBM can help speed this process by enabling more efficient interpretation of data and sharing of knowledge. The potential for change based on innovation in life sciences is bigger than the change caused by the digital circuit."
32 |
33 | Among the life sciences initiatives already underway at IBM are:
34 | - DiscoveryLink* -- For the first time, researchers using this combination of innovative middleware and integration services can join together information from many sources to solve complex medical research problems. DiscoveryLink creates a "virtual database" that permits data to be accessed and extracted from multiple data sources used in research and development projects. This IT solution can dramatically improve product cycle time and lower development costs for pharmaceutical, biotechnology and agri-science companies.
35 |
36 | - Blue Gene* - IBM is building a supercomputer 100 times faster than any available today designed to advance understanding of the mechanisms behind protein folding through large-scale biomolecular simulation. In December, IBM committed $100 million to this five-year research project to advance the state-of-the-art in supercomputing for biological applications.
37 | - Bio-Dictionary* -- IBM has compiled a protein dictionary containing some 30 million protein "words" designed to accelerate the understanding of protein shapes and functions.Bio-Dictionaries for selected genomes, as well as bioinformatics algorithms for pattern discovery and other relevant applications, are available to scientists and researchers for noncommercial use through a website dedicated to life sciences content at http://www.research.ibm.com/compsci/compbio/.
38 |
39 |
40 |
--------------------------------------------------------------------------------
/suim-examples/data/xml/New_IBM_Fellows.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 | IBM Names Five Fellows, Company's Highest Technical Honor
26 | 05 June 2002
27 |
28 | IBM today elevated five employees to the title of IBM Fellow -- its most prestigious technical honor. The company also presented more than $2.8 million in cash awards to employees whose technical innovation have yielded exceptional value to the company and its customers.
29 |
30 | IBM conferred the accolades and awards at its 2003 Corporate Technical Recognition Event (CTRE) in Scottsdale, Ariz. CTRE is a 40-year tradition at IBM, established to recognize exceptional technical employees and reward them for extraordinary achievements and contributions to the company's technology leadership.
31 |
32 | "Our technical employees are among the best and brightest innovators in the world. They share a passion for excellence that defines their work and permeates the products and services IBM delivers to its customers," said Nick Donofrio, senior vice president, technology and manufacturing for IBM. "CTRE provides the means for us to honor those who have distinguished themselves as exceptional leaders among their peers."
33 |
34 | Among the special honorees at the 2003 CTRE are five employees who earned the coveted distinction of IBM Fellow:
35 |
36 |
37 | - Grady Booch, chief scientist of Rational Software, IBM Software Group. Recognized internationally for his innovative work on software architecture, modeling, and software engineering process. Mr. Booch is one of the original authors of the Unified Modeling Language (UML), the industry-standard language of blueprints for software-intensive systems.
38 |
39 | - Dr. Donald Chamberlin, researcher, IBM Almaden Research Center. An expert in relational database languages, Dr. Chamberlin is co- inventor of SQL, the language that energized the relational database market. He has also influenced the creation of XQuery, one of a new generation of database query languages covering structured, semi-structured and unstructured data.
40 |
41 | - Dr. George Galambos, chief technology officer, IBM Global Services (IGS) in Canada; the first Fellow from Canada. Dr. Galambos specializes in high-performance, high availability designs, operational effectiveness, and risk assessment/mitigation, focusing on systems engineering and architecture reuse that enhances efficiency and stability. He is a principal driver of and contributor to the widely acclaimed "Patterns for e-business" and the Enterprise Solution Structure Reference Architectures, widely used by IGS in customer engagements.
42 |
43 | - Rod Smith, vice president of Internet emerging technologies, IBM Software Group. A leader in the areas of object-oriented programming, visual development tools, Java, XML, and Web Services. Rod also was the chief technical strategist for focusing the Java platform for use in middleware solutions, in particular initiating contributions to the development of the J2EE.
44 |
45 | - Charles Webb, eServer processor design, IBM Systems Group. Charles Webb has led the reinvention of IBM's eServer zSeries microprocessor designs and roadmap, including the z900 server, where he provided the bridge among architecture, hardware, compilers and system software, defining major portions of the 64- bit architecture and beyond.
46 |
47 |
48 | The title of IBM Fellow is the company's most preeminent technical distinction and is granted in recognition of outstanding and sustained technical achievements in engineering, programming, science and technology. Only 175 individuals have earned this designation in the company's history and, including the newly named Fellows, 56 are active employees. IBM Fellows are encouraged to further enhance their potential for creative achievements and typically work on special projects or research initiatives that lead the company in exciting new directions.
49 |
50 |
--------------------------------------------------------------------------------
/suim-examples/data/xml/SeminarChallengesInSpeechRecognition.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
24 |
25 |
26 | UIT Seminar: Challenges in Speech Recognition
27 | 8 August 2003
28 |
29 | UIT Seminar: Challenges in Speech Recognition
30 | August 8, 2003 10:30 AM - 11:30 AM
31 | Lawrence Rabiner , Associate Director CAIP, Rutgers
32 | University, Professor Univ. of Santa Barbara
33 | Yorktown 20-043
34 | Availability: Open
35 |
36 | Speech recognition has matured to the point where it
37 | is now being widely applied in a range of applications
38 | including desktop dictation, cell phone name dialing,
39 | agent technology, automated operator services,
40 | telematics, call center automation and help desks.
41 |
42 | Although the technology is often good enough for many
43 | of these applications, there remain key challenges in
44 | virtually every aspect of speech recognition that
45 | prevent the technology from being used ubiquitously in
46 | any environment, for any speaker, and for an even
47 | broader range of applications. This talk will analyze
48 | the ‘Speech Circle’ that enables a person to maintain
49 | a dialog with a machine using speech recognition,
50 | spoken language understanding, dialog management and
51 | spoken language generation, and finally text-to-speech
52 | synthesis, and show where significant progress has
53 | been made, and where there remain critical problems
54 | that need to be addressed and solved.
55 |
56 | The talk will include several audio and video examples
57 | of speech recognition and speech understanding systems
58 | that have been studied in the laboratory to illustrate
59 | the challenges that remain to be solved before speech
60 | recognition is considered a solved problem.
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/suim-examples/data/xml/TrainableInformationExtractionSystems.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems
26 | 19 August 2003
27 |
28 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems
29 |
30 | August 19, 2003 02:00 PM - 03:30 PM
31 | David Johnson, Frank Oles, Tong Zhang(IBM Research)
32 | Hawthorne GN-F15
33 | Availability: Open
34 |
35 | The technical objective of the TIES project is to build customizable systems that can identify named entities in text, such as persons, organizations, and locations, as well as identifying relations between those entities. The technical approach is to develop new statistical and symbolic machine learning algorithms in service of the technical objective. Also, we are working on combining statistical with symbolic techniques. The first part of this talk, given by David E. Johnson, will provide a general overview of the goals of the TIES project. The second part, given by Tong Zhang, will provide background on applying statistical machine learning to this problem domain. Tong will also describe the particular statistical approach taken, which is termed Robust Risk Minimization (RMM). The final part will be given by Frank J. Oles. Frank will introduce his theory of precedence-inclusion patterns. Precedence-inclusion patterns are mathematical structures possessing multiple interacting strict partial orders that satisfy axioms generalizing the familiar properties of irreflexivity and transitivity. This very general theory provides a radically new approach to symbolic, as opposed to statistical, pattern generalization that can be applied to relational learning in a number of settings, including learning based on text, on images, or on videos.
36 |
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/suim-examples/data/xml/UIMASummerSchool2003.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 | UIMA Summer School
25 | 1 August 2003
26 |
27 | August 26, 2003
28 | UIMA 101 - The New UIMA Introduction
29 | (Hands-on Tutorial)
30 | 9:00AM-5:00PM in HAW GN-K35
31 |
32 | August 28, 2003
33 | FROST Tutorial
34 | 9:00AM-5:00PM in HAW GN-K35
35 |
36 | September 15, 2003
37 | UIMA 201: UIMA Advanced Topics
38 | (Hands-on Tutorial)
39 | 9:00AM-5:00PM in HAW 1S-F53
40 |
41 | September 17, 2003
42 | The UIMA System Integration Test and Hardening Service
43 | The "SITH"
44 | 3:00PM-4:30PM in HAW GN-K35
45 |
46 |
47 |
48 | UIMA Summer School Tutorial and Presentation Details
49 | UIMA 101: The new UIMA tutorial
50 | Tuesday August 26 9:00AM - 4:30PM in GN-K35
51 |
52 | UIMA 101 is a hands-on programming tutorial.
53 |
54 | UIMA 101 is intended for people who want a first introductory course to UIMA or for people who would like a refresher.
55 |
56 | The tutorial covers the same concepts in the first UIMA tutorial given in 3Q 2002 except for some key updates:
57 |
58 | 1) It uses a new interface to the CAS that makes it more natural to access and update CAS feature structures using ordinary Java objects (i.e., the JCAS) and
59 | 2) It uses updated TAE interfaces that give the application developer more control over managing multiple CASs.
60 |
61 | Please NOTE expert users of UIMA can skip this one and should consider attending the Advanced Topics tutorial.
62 |
63 | Prerequisites for the UIMA 101 Tutorial
64 | 1) Java Programming
65 | 2) Some experience with Eclipse IDE helpful
66 |
67 | FROST Tutorial
68 | August 28 9:00AM - 5:00PM in GN-K35
69 |
70 | Visitors from the FROST team will be here to talk to us about FROST.
71 |
72 | UIMA 201: The UIMA Advanced Topics Tutorial
73 | September 15: 9:00AM - 5:30PM in Hawthorne 1S-F53
74 |
75 | UIMA 201 will introduce some new UIMA concepts and walk the student through hands-on examples.
76 |
77 | The advanced topics tutorial is designed for people who have some experience with UIMA and want
78 | to use new capabilities of UIMA 1.0 to address one or more of the following
79 | Advanced Topics:
80 |
81 | 1) Collection Processing and Collection Processing Engines (CPEs)
82 | 2) Multi-Threading and CAS Pooling
83 | 3) Using the UIMA adapter framework to integrate network TAEs with Java TAEs
84 | 4) A Semantic Search Application that brings it all together
85 |
86 | Prerequisites for UIMA 201
87 | 1) UIMA 101 Tutorial OR Extensive UIMA Experience
88 |
89 | The UIMA Integration Test bed Service (The "SITH")
90 | September 17 3:00PM - 4:30PM in HAW GN-K35
91 |
92 | We have developed the first version of the UIMA Integration Test bed service.
93 |
94 | This service is being developed to help test, evaluate, certify and publish UIMA compliant components.
95 |
96 | In this talk we will explain the service and what it is intended to provide the UIMA community. We will address the following topics:
97 |
98 | 1. SITH Services
99 | 2. How to submit components and what to expect in return
100 | 3. Overview of the test bed implementation using Collection Processing UIMA and Juru.
101 | 4. Next Steps for the SITH
102 |
103 |
104 |
105 |
--------------------------------------------------------------------------------
/suim-examples/data/xml/UIMA_Seminars.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 | Upcoming UIMA Seminars
26 | 15 March 2004
27 |
28 | April 7, 2004 Distillery Lunch Seminar
29 | UIMA and its Metadata
30 | 12:00PM-1:00PM in HAW GN-K35.
31 |
32 | Dave Ferrucci will give a UIMA overview and discuss the types of component metadata that UIMA components provide. Jon Lenchner will give a demo of the Text Analysis Engine configurator tool.
33 |
34 |
35 | April 16, 2004 KM & I Department Tea
36 | Title: An Eclipse-based TAE Configurator Tool
37 | 3:00PM-4:30PM in HAW GN-K35 .
38 |
39 | Jon Lenchner will demo an Eclipse plugin for configuring TAE descriptors, which will be available soon for you to use. No more editing XML descriptors by hand!
40 |
41 |
42 | May 11, 2004 UIMA Tutorial
43 | 9:00AM-5:00PM in HAW GN-K35.
44 |
45 | This is a full-day, hands-on tutorial on UIMA, covering the development of Text Analysis Engines and Collection Processing Engines, as well as how to include these components in your own applications.
46 |
47 |
--------------------------------------------------------------------------------
/suim-examples/data/xml/WatsonConferenceRooms.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 | Conference Rooms at Watson
26 | 01 January 2000
27 |
28 | Conference Rooms at Watson:
29 | Location Capacity Wall Phone Ext.
30 |
31 | Classroom Style
32 | HAW J2-B34 Seats 12 tieline 863-3130
33 | HAW J2-N07 Seats 24 tieline 863-3210
34 | YKT 20-001 Seats 36 tieline 862-4304
35 | YKT 20-051 Seats 18 tieline 862-4307
36 |
37 | Conference Style
38 | HAW 2N-F28 Seats 20 tieline 863-7583
39 | HAW 4N-B15 Seats 14 tieline 863-7126
40 | HAW 4N-B17 Seats 10 tieline 863-7089
41 | HAW 4S-K21 Seats 16 tieline 863-6386
42 | HAW GN-F14 Seats 12 tieline 863-6770
43 | HAW GN-K30 Seats 12 tieline 863-7335
44 | HAW GN-K36 Seats 10 tieline 863-6098
45 | HAW J1-N14 Seats 24 tieline 863-3629
46 | HAW J2-A16 Seats 12 tieline 863-3240
47 | HAW J2-G27 Seats 15 tieline 863-3150
48 | HAW J2-M24 Seats 8 tieline 863-3160
49 | YKT 03-135 Seats 8 tieline 862-1696
50 | YKT 03-235 Seats 8 tieline 862-4278
51 | YKT 05-135 Seats 8 tieline 862-3477
52 | YKT 05-235 Seats 8 tieline 862-4279
53 | YKT 20-006 Seats 8 tieline 862-4301
54 | YKT 20-059 Seats 20 tieline 862-4308
55 | YKT 35-132 Seats 8 tieline 862-2873
56 | YKT 35-232 Seats 8 tieline 862-2860
57 | YKT 38-023 Seats 8 tieline 862-3299
58 | YKT 39-132 Seats 8 tieline 862-3486
59 | YKT 40-100 Seats 20 tieline 862-4199
60 | YKT 40-200 Seats 20 tieline 862-1379
61 |
62 | Other
63 | HAW GN-K35 Seats 24 tieline 863-6104
64 |
65 | Theater Style
66 | HAW 1S-F40 Seats 30 tieline 863-6396
67 | YKT 20-043 Seats 50 tieline 862-4306
68 |
69 | Video Conference Room
70 | YKT 32-026 Seats 25 tieline 862-3917
71 |
72 |
--------------------------------------------------------------------------------
/suim-examples/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 | edu.cmu.lti
5 | suim-examples
6 | 0.0.1-SNAPSHOT
7 | SUIM Examples
8 |
9 |
10 | 2.9.3
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | edu.cmu.lti
20 | suim-scala
21 | 0.0.1-SNAPSHOT
22 |
23 |
24 | de.tudarmstadt.ukp.dkpro.core
25 | de.tudarmstadt.ukp.dkpro.core.io.text-asl
26 | 1.5.0
27 |
28 |
29 | de.tudarmstadt.ukp.dkpro.core
30 | de.tudarmstadt.ukp.dkpro.core.tokit-asl
31 | 1.5.0
32 |
33 |
34 | de.tudarmstadt.ukp.dkpro.core
35 | de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl
36 | 1.5.0
37 |
38 |
39 | org.scalatest
40 | scalatest_2.9.2
41 | 1.7.2
42 | test
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 | net.alchim31.maven
53 | scala-maven-plugin
54 | 3.1.0
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 | src/main/scala
64 | src/test/scala
65 |
66 |
67 |
68 |
69 |
70 | org.apache.maven.plugins
71 | maven-compiler-plugin
72 |
73 | 1.6
74 | 1.6
75 |
76 |
77 |
78 |
79 | net.alchim31.maven
80 | scala-maven-plugin
81 | 3.1.0
82 |
83 | incremental
84 |
85 | -unchecked
86 | -deprecation
87 | -explaintypes
88 |
89 |
90 |
91 | main
92 | spark-uima-tools.App
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 | compile
101 | testCompile
102 |
103 |
104 |
105 | -make:transitive
106 | -dependencyfile
107 | ${project.build.directory}/.scala_dependencies
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 | org.apache.maven.plugins
116 | maven-assembly-plugin
117 | 2.2-beta-5
118 |
119 |
120 | jar-with-dependencies
121 |
122 |
123 |
124 | spark-uima-tools.App
125 |
126 |
127 |
128 |
129 |
130 | package
131 |
132 | single
133 |
134 |
135 |
136 |
137 |
138 |
139 | org.apache.maven.plugins
140 | maven-surefire-plugin
141 | 2.12
142 |
143 | true
144 | false
145 | -Xmx1024m
146 |
147 | **/*Spec.scala
148 |
149 |
150 | **/*Test.scala
151 |
152 |
153 |
154 |
155 |
156 | org.scalatest
157 | scalatest-maven-plugin
158 | 1.0-M2
159 |
160 | ${project.build.directory}/surefire-reports
161 | .
162 | WDF TestSuite.txt
163 |
164 |
165 |
166 | test
167 |
168 | test
169 |
170 |
171 |
172 |
173 |
174 |
175 | org.apache.maven.plugins
176 | maven-source-plugin
177 | 2.1.2
178 |
179 |
180 | attach-sources
181 |
182 | jar
183 |
184 |
185 |
186 |
187 |
188 |
189 | org.apache.maven.plugins
190 | maven-resources-plugin
191 | 2.5
192 |
193 | UTF-8
194 |
195 |
196 |
197 |
198 | org.apache.maven.plugins
199 | maven-release-plugin
200 | 2.3.2
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 | org.apache.maven.wagon
210 | wagon-ssh-external
211 | 1.0-beta-7
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 | org.eclipse.m2e
221 | lifecycle-mapping
222 | 1.0.0
223 |
224 |
225 |
226 |
227 |
228 |
229 | net.alchim31.maven
230 |
231 |
232 | scala-maven-plugin
233 |
234 |
235 | [3.1.0,)
236 |
237 |
238 | compile
239 | testCompile
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
--------------------------------------------------------------------------------
/suim-examples/src/main/resources/META-INF/org.apache.uima.fit/types.txt:
--------------------------------------------------------------------------------
1 | classpath*:ex/TutorialTypeSystem.xml
2 | classpath*:org/apache/uima/examples/SourceDocumentInformation.xml
3 | classpath*:desc/type/LexicalUnits.xml
4 | classpath*:desc/type/metadata.xml
5 |
6 |
--------------------------------------------------------------------------------
/suim-examples/src/main/resources/ex/RoomNumberAndDateTime.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 | org.apache.uima.java
26 | false
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 | Aggregate TAE - Room Number and DateTime Annotators
40 | Detects Room Numbers, Dates, and Times
41 |
42 |
43 |
44 | RoomNumber
45 | DateTime
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 | org.apache.uima.tutorial.RoomNumber
55 |
56 | org.apache.uima.tutorial.DateAnnot
57 |
58 | org.apache.uima.tutorial.TimeAnnot
59 |
60 |
61 | en
62 |
63 |
64 |
65 |
66 | true
67 | true
68 | false
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/suim-examples/src/main/resources/ex/TutorialTypeSystem.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
25 | TutorialTypeSystem
26 | Type System Definition for the tutorial examples - as of Exercise 6
27 | 1.0
28 | The Apache Software Foundation
29 |
30 |
31 | org.apache.uima.tutorial.RoomNumber
32 |
33 | uima.tcas.Annotation
34 |
35 |
36 | building
37 | Building containing this room
38 | uima.cas.String
39 |
40 |
41 |
42 |
43 | org.apache.uima.tutorial.DateTimeAnnot
44 |
45 | uima.tcas.Annotation
46 |
47 |
48 | shortDateString
49 |
50 | uima.cas.String
51 |
52 |
53 |
54 |
55 | org.apache.uima.tutorial.TimeAnnot
56 |
57 | org.apache.uima.tutorial.DateTimeAnnot
58 |
59 |
60 |
61 | org.apache.uima.tutorial.DateAnnot
62 |
63 | org.apache.uima.tutorial.DateTimeAnnot
64 |
65 |
66 |
67 | org.apache.uima.tutorial.Meeting
68 |
69 | uima.tcas.Annotation
70 |
71 |
72 | room
73 |
74 | org.apache.uima.tutorial.RoomNumber
75 |
76 |
77 | date
78 |
79 | org.apache.uima.tutorial.DateAnnot
80 |
81 |
82 | startTime
83 |
84 | org.apache.uima.tutorial.TimeAnnot
85 |
86 |
87 | endTime
88 |
89 | org.apache.uima.tutorial.TimeAnnot
90 |
91 |
92 |
93 |
94 | org.apache.uima.tutorial.UimaAcronym
95 |
96 | uima.tcas.Annotation
97 |
98 |
99 | expandedForm
100 |
101 | uima.cas.String
102 |
103 |
104 |
105 |
106 | org.apache.uima.tutorial.UimaMeeting
107 |
108 | org.apache.uima.tutorial.Meeting
109 |
110 |
111 | org.apache.uima.examples.tokenizer.Token
112 |
113 | uima.tcas.Annotation
114 |
115 |
116 | org.apache.uima.examples.tokenizer.Sentence
117 |
118 | uima.tcas.Annotation
119 |
120 |
121 |
--------------------------------------------------------------------------------
/suim-examples/src/main/resources/org/apache/uima/tutorial/ex6/uimaAcronyms.txt:
--------------------------------------------------------------------------------
1 | UIMA Unstructured Information Management Architecture
2 | SITH System Integration Testing and Hardening
3 | CPE Collection Processing Engine
4 | CPM Collection Processing Manager
5 | AE Analysis Engine
6 | CAS Common Analysis Structure
7 | JCAS Java Common Analysis Structure
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/Annotators.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Carnegie Mellon University
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package edu.cmu.lti.suim.examples
18 |
19 | import scala.collection.JavaConversions.collectionAsScalaIterable
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext.rddToOrderedRDDFunctions
23 | import org.apache.spark.SparkContext.rddToPairRDDFunctions
24 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription
25 | import org.apache.uima.fit.factory.CollectionReaderFactory.createReader
26 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
27 | import org.apache.uima.fit.util.JCasUtil
28 |
29 | import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase
30 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.`type`.Token
31 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader
32 | import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter
33 | import edu.cmu.lti.suim.SparkUimaUtils.makeRDD
34 | import edu.cmu.lti.suim.SparkUimaUtils.process
35 |
36 |
37 | object Annotators {
38 | 
39 |   /**
40 |    * Word-count example over a UIMA pipeline running on Spark.
41 |    *
42 |    * args(0) is the Spark master URL. Reads all data/*.txt files with the
43 |    * DKPro TextReader, segments each document with BreakIteratorSegmenter,
44 |    * then counts word-like token strings and prints the 20 most frequent.
45 |    */
46 |   def main(args: Array[String]) = {
47 |     // SPARK_CLASSPATH may be unset, in which case System.getenv returns null
48 |     // and calling .split(":") on it would throw a NullPointerException.
49 |     val sparkJars = Option(System.getenv("SPARK_CLASSPATH"))
50 |       .map(_.split(":"))
51 |       .getOrElse(Array.empty[String])
52 |     val sc = new SparkContext(args(0), "App",
53 |       System.getenv("SPARK_HOME"), sparkJars)
54 | 
55 |     // NOTE(review): the original also assigned an unused TypeSystemDescription
56 |     // here; the value was never referenced, so the dead local was removed.
57 |     val rdd = makeRDD(createReader(classOf[TextReader],
58 |       ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "data/*.txt",
59 |       ResourceCollectionReaderBase.PARAM_LANGUAGE, "en"), sc)
60 |     val seg = createEngineDescription(classOf[BreakIteratorSegmenter])
61 |     // Run the segmenter over every CAS, then flatten out its Token annotations.
62 |     val tokens = rdd.map(process(_, seg)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[Token]))
63 |     // Count each surviving token string, then flip (token, count) to
64 |     // (count, token) so sortByKey(false) orders by descending frequency.
65 |     val counts = tokens.map(token => token.getCoveredText())
66 |       .filter(filter(_))
67 |       .map((_, 1))
68 |       .reduceByKey(_ + _)
69 |       .map(pair => (pair._2, pair._1))
70 |       .sortByKey(false)
71 |     counts.take(20).foreach(println(_))
72 |   }
73 | 
74 |   /** Keeps strings made only of word characters (\w) that are not all digits. */
75 |   def filter(input: String): Boolean = !input.forall(_.isDigit) && input.matches("""\w*""")
76 | }
55 |
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/App.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Carnegie Mellon University
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package edu.cmu.lti.suim.examples
18 |
19 | import scala.collection.JavaConversions.collectionAsScalaIterable
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext.rddToPairRDDFunctions
23 | import org.apache.uima.examples.cpe.FileSystemCollectionReader
24 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription
25 | import org.apache.uima.fit.factory.CollectionReaderFactory.createReader
26 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
27 | import org.apache.uima.fit.util.JCasUtil
28 | import org.apache.uima.tutorial.RoomNumber
29 | import org.apache.uima.tutorial.ex1.RoomNumberAnnotator
30 |
31 | import edu.cmu.lti.suim.SparkUimaUtils.makeRDD
32 | import edu.cmu.lti.suim.SparkUimaUtils.process
33 |
/**
 * Tutorial example: runs the UIMA RoomNumberAnnotator over the documents in
 * ./data on Spark and prints how many room numbers were found per building.
 */
object App {

  def main(args: Array[String]) = {
    // args(0) is the Spark master URL; jars are taken from SPARK_CLASSPATH.
    val sc = new SparkContext(args(0), "App",
      System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":"))

    val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription()
    val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data")
    val reader = createReader(classOf[FileSystemCollectionReader], params: _*)
    val rdd = makeRDD(reader, sc)
    val rnum = createEngineDescription(classOf[RoomNumberAnnotator])
    // Annotate each document, then collect every RoomNumber annotation.
    val rooms = rdd.map(scas => process(scas, rnum))
                   .flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber]))
    // Count annotations per building and print each (building, count) pair.
    val counts = rooms.map(_.getBuilding()).map(building => (building, 1)).reduceByKey(_ + _)
    counts.foreach(println)
  }
}
49 |
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/AppWithHDFS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Carnegie Mellon University
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package edu.cmu.lti.suim.examples
18 |
19 | import scala.collection.JavaConversions.collectionAsScalaIterable
20 |
21 | import org.apache.spark.SparkContext
22 | import org.apache.uima.examples.cpe.FileSystemCollectionReader
23 | import org.apache.uima.fit.factory.AnalysisEngineFactory
24 | import org.apache.uima.fit.factory.CollectionReaderFactory
25 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
26 | import org.apache.uima.fit.util.JCasUtil
27 | import org.apache.uima.tutorial.RoomNumber
28 | import org.apache.uima.tutorial.ex1.RoomNumberAnnotator
29 |
30 | import edu.cmu.lti.suim.SparkUimaUtils.process
31 | import edu.cmu.lti.suim.SparkUimaUtils.sequenceFile
32 |
33 |
/**
 * Variant of [[App]] that stages the UIMA collection through an HDFS
 * SequenceFile before processing it on Spark, then counts detected room
 * numbers per building.
 */
object AppWithHDFS {

  def main(args: Array[String]) = {
    // args(0) is the Spark master URL; jars are taken from SPARK_CLASSPATH.
    val sc = new SparkContext(args(0), "App",
      System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":"))

    val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription()
    val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data")
    // Consistency fix: use createReader (as App.scala does) instead of the
    // deprecated uimaFIT 2.x alias createCollectionReader — same behavior.
    val rdd = sequenceFile(CollectionReaderFactory.createReader(classOf[FileSystemCollectionReader], params: _*),
      "hdfs://localhost:9000/file.txt", sc)
    val rnum = AnalysisEngineFactory.createEngineDescription(classOf[RoomNumberAnnotator])
    // Annotate each staged document, then collect every RoomNumber annotation.
    val rooms = rdd.map(process(_, rnum)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber]))
    // countByValue brings the (building -> count) map back to the driver.
    val counts = rooms.map(room => room.getBuilding()).countByValue()
    println(counts)
  }
}
50 |
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/SparkPipelineExample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 |
20 | package edu.cmu.lti.suim.examples
21 |
22 | import java.util.StringTokenizer
23 |
24 | import scala.collection.JavaConversions.bufferAsJavaList
25 | import scala.collection.JavaConversions.collectionAsScalaIterable
26 | import scala.io.Source
27 |
28 | import org.apache.spark.SparkContext
29 | import org.apache.uima.examples.cpe.FileSystemCollectionReader
30 | import org.apache.uima.fit.component.JCasAnnotator_ImplBase
31 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription
32 | import org.apache.uima.fit.factory.CollectionReaderFactory
33 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
34 | import org.apache.uima.fit.util.JCasUtil.select
35 | import org.apache.uima.jcas.JCas
36 | import org.apache.uima.tutorial.Meeting
37 | import org.apache.uima.tutorial.UimaAcronym
38 | import org.apache.uima.tutorial.UimaMeeting
39 |
40 | import edu.cmu.lti.suim.SparkUimaUtils.makeRDD
41 | import edu.cmu.lti.suim.SparkUimaUtils.process
42 |
/**
 * End-to-end pipeline example: broadcasts an acronym dictionary, runs the
 * UimaAcronymAnnotator and UimaMeetingAnnotator as one aggregate engine over
 * the documents in ./data, and prints the resulting annotations.
 */
object SparkPipelineExample {

  /**
   * Loads a tab-separated file of (acronym, expanded form) pairs into a Map.
   * Bug fix: the Source is now closed (previously it leaked a file handle).
   */
  def readMap(file: String) = {
    val s = Source.fromFile(file)
    try {
      // toMap forces the lazy line iterator before the source is closed
      s.getLines.map(line => {
        val pair = line.split("\t")
        (pair(0), pair(1))
      }).toMap
    } finally {
      s.close()
    }
  }

  def main(args: Array[String]) = {
    // args(0) is the Spark master URL; jars are taken from SPARK_CLASSPATH.
    val sc = new SparkContext(args(0), "App",
      System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":"))

    // Share the acronym map with executors as a broadcast variable; the
    // annotator classes fetch it back through the block manager.
    val mMap = sc.broadcast(readMap("src/main/resources/org/apache/uima/tutorial/ex6/uimaAcronyms.txt"))
    val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription()
    val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data")
    val rdd = makeRDD(CollectionReaderFactory.createReader(
      classOf[FileSystemCollectionReader], params: _*), sc)
    // Run both annotators as a single aggregate engine; cache the processed
    // CASes so both selects below reuse them instead of re-annotating.
    val result = rdd.map(process(_, createEngineDescription(
      createEngineDescription(classOf[UimaAcronymAnnotator]),
      createEngineDescription(classOf[UimaMeetingAnnotator])))).cache
    result.flatMap(scas => select(scas.jcas, classOf[UimaAcronym])).foreach(println(_))
    result.flatMap(scas => select(scas.jcas, classOf[UimaMeeting])).foreach(println(_))
  }
}
70 |
/**
 * Annotates UIMA acronyms: scans the document token by token and creates a
 * UimaAcronym annotation (with its expanded form) for every token found in
 * the shared acronym map. Port of the UIMA tutorial ex6 annotator.
 */
class UimaAcronymAnnotator extends JCasAnnotator_ImplBase {

  // NOTE(review): relies on the broadcast variable being registered under the
  // hard-coded block id "broadcast_0" on every executor — fragile; this breaks
  // if any other broadcast is created first. TODO: pass the map via a UIMA
  // configuration parameter instead.
  val mMap = org.apache.spark.SparkEnv.get.blockManager.getSingle("broadcast_0").get.asInstanceOf[Map[String, String]]

  override def process(jcas: JCas) {
    // go through document word-by-word
    val text = jcas.getDocumentText();
    var pos = 0;
    // Bug fix: the delimiter set was a triple-quoted (raw) string, so "\t",
    // "\n" and "\r" were the literal characters '\', 't', 'n', 'r' — making
    // the letters t, n and r token delimiters. Use an escaped string so the
    // delimiters are whitespace and punctuation, as in the UIMA tutorial.
    val tokenizer = new StringTokenizer(text, " \t\n\r.<.>/?\";:[{]}\\|=+()!", true);
    while (tokenizer.hasMoreTokens()) {
      val token = tokenizer.nextToken();
      // look up token in map to see if it is an acronym
      val expandedForm = mMap.get(token);
      if (expandedForm.isDefined) {
        // annotate the acronym's span and attach its expanded form
        val annot = new UimaAcronym(jcas, pos, pos + token.length());
        annot.setExpandedForm(expandedForm.get);
        annot.addToIndexes();
      }
      // increment pos and go to next token (delimiters are returned as tokens
      // too — returnDelims = true — so position tracking stays exact)
      pos += token.length();
    }
  }
}
95 |
96 |
/**
 * Creates a UimaMeeting annotation for every Meeting whose surrounding text
 * (+/- 50 chars) mentions a known UIMA acronym. Port of the UIMA tutorial
 * ex6 annotator.
 */
class UimaMeetingAnnotator extends JCasAnnotator_ImplBase {

  // NOTE(review): relies on the broadcast variable being registered under the
  // hard-coded block id "broadcast_0" on every executor — fragile; this breaks
  // if any other broadcast is created first. TODO: pass the map via a UIMA
  // configuration parameter instead.
  val mMap = org.apache.spark.SparkEnv.get.blockManager.getSingle("broadcast_0").get.asInstanceOf[Map[String, String]]

  override def process(jcas: JCas) {
    // get document text
    val text = jcas.getDocumentText();

    // We iterate over all Meeting annotations, and if we determine that
    // the topic of a meeting is UIMA-related, we create a UimaMeeting
    // annotation. We add each UimaMeeting annotation to a list, and then
    // later go back and add these to the CAS indexes. We need to do this
    // because it's not allowed to add to an index that you're currently
    // iterating over.
    val uimaMeetings = scala.collection.mutable.Buffer[UimaMeeting]()

    select(jcas, classOf[Meeting]).foreach(meeting => {
      // get span of text within 50 chars on either side of meeting
      // (window size should probably be a config. param)
      var begin = meeting.getBegin() - 50
      var end = meeting.getEnd() + 50
      if (begin < 0) {
        begin = 0
      }
      if (end > text.length()) {
        end = text.length()
      }
      val window = text.substring(begin, end)

      // look for UIMA acronyms within this window.
      // Bug fix: the delimiter set was a triple-quoted (raw) string, so "\t",
      // "\n" and "\r" were the literal characters '\', 't', 'n', 'r'; use an
      // escaped string as in the UIMA tutorial.
      val tokenizer = new StringTokenizer(window, " \t\n\r.<.>/?\";:[{]}\\|=+()!");
      var continue = true
      while (tokenizer.hasMoreTokens() && continue) {
        val token = tokenizer.nextToken();
        // Bug fix: Map.get returns an Option, never null, so the previous
        // check "mMap.get(token) != null" was always true and turned EVERY
        // meeting into a UimaMeeting. Test for membership instead.
        if (mMap.contains(token)) {
          // create annotation
          val annot = new UimaMeeting(jcas, meeting.getBegin(), meeting.getEnd());
          annot.setRoom(meeting.getRoom());
          annot.setDate(meeting.getDate());
          annot.setStartTime(meeting.getStartTime());
          annot.setEndTime(meeting.getEndTime());
          // Add annotation to a list, to be later added to the
          // indexes.
          // We need to do this because it's not allowed to add to an
          // index that you're currently iterating over.
          uimaMeetings.add(annot);
          continue = false
        }
      }
    })
    uimaMeetings.foreach(meeting => meeting.addToIndexes())
  }
}
151 |
--------------------------------------------------------------------------------
/suim-examples/src/test/scala/spark-uima-tools/AppSpec.scala:
--------------------------------------------------------------------------------
1 | package cmu.edu.lti.suim
2 |
3 | import org.scalatest.FlatSpec
4 | import org.scalatest.matchers.ShouldMatchers
5 |
/** Placeholder spec that always passes; keeps the ScalaTest harness wired up
 *  until real tests exist. */
// NOTE(review): the enclosing package is "cmu.edu.lti.suim" while production
// code lives in "edu.cmu.lti.suim" — presumably transposed; confirm before
// relying on package-private access from tests.
class AppSpec extends FlatSpec with ShouldMatchers {
  "An App" should "pass" in {
    (1) should equal(1)
  }
}
11 |
--------------------------------------------------------------------------------
/suim-java/pom.xml:
--------------------------------------------------------------------------------
1 |
5 | 4.0.0
6 |
7 | edu.cmu.lti
8 | suim-java
9 | 0.0.1-SNAPSHOT
10 | SUIM Java
11 |
12 |
13 |
14 |
15 |
16 | org.apache.maven.plugins
17 | maven-compiler-plugin
18 |
19 | 1.6
20 | 1.6
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 | edu.cmu.lti
30 | suim-scala
31 | 0.0.1-SNAPSHOT
32 |
33 |
34 |
35 |
--------------------------------------------------------------------------------
/suim-java/src/main/java/edu/cmu/lti/suim/JavaSparkUima.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Carnegie Mellon University
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package edu.cmu.lti.suim;
18 |
19 | import java.util.List;
20 |
21 | import org.apache.hadoop.io.NullWritable;
22 | import org.apache.spark.api.java.JavaRDD;
23 | import org.apache.spark.api.java.JavaSparkContext;
24 | import org.apache.spark.api.java.function.Function;
25 | import org.apache.uima.analysis_engine.AnalysisEngineDescription;
26 | import org.apache.uima.collection.CollectionReader;
27 | import org.apache.uima.fit.factory.AnalysisEngineFactory;
28 | import org.apache.uima.resource.ResourceInitializationException;
29 |
30 | public final class JavaSparkUima {
31 |
32 | public static JavaRDD sequenceFile(CollectionReader reader, String uri, JavaSparkContext sc) throws Exception {
33 | SparkUimaUtils.createSequenceFile(reader, uri);
34 | return sc.sequenceFile(uri, NullWritable.class, SCAS.class).values();
35 | }
36 |
37 | public static JavaRDD makeRDD(CollectionReader reader, JavaSparkContext sc) throws Exception {
38 | List buffer = SparkUimaUtils.readFrom(reader);
39 | return sc.parallelize(buffer);
40 | }
41 |
42 | public final static class PipelineFunction extends Function {
43 |
44 | private static final long serialVersionUID = -6881223764488277676L;
45 |
46 | private final AnalysisEngineDescription description;
47 |
48 | public PipelineFunction(AnalysisEngineDescription... descs) throws ResourceInitializationException {
49 | this.description = AnalysisEngineFactory.createEngineDescription(descs);
50 | }
51 |
52 | public PipelineFunction(AnalysisEngineDescription desc) {
53 | this.description = desc;
54 | }
55 |
56 | public SCAS call(SCAS scas) {
57 | return SparkUimaUtils.process(scas, description);
58 | }
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/suim-scala/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 | edu.cmu.lti
5 | suim-scala
6 | 0.0.1-SNAPSHOT
7 | SUIM Scala
8 |
9 |
10 | 2.9.3
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | org.apache.uima
20 | uimaj-examples
21 | 2.4.2
22 |
23 |
24 | org.apache.uima
25 | uimaj-core
26 | 2.4.2
27 |
28 |
29 | org.apache.uima
30 | uimafit-core
31 | 2.0.0
32 |
33 |
34 | org.apache.spark
35 | spark-core_2.9.3
36 | 0.8.0-incubating
37 |
38 |
39 | org.scala-lang
40 | scala-library
41 | ${scala.version}
42 |
43 |
44 |
45 | org.scalatest
46 | scalatest_2.9.2
47 | 1.7.2
48 | test
49 |
50 |
51 |
52 | org.scalamock
53 | scalamock-scalatest-support_2.9.2
54 | 2.4
55 | test
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 | net.alchim31.maven
67 | scala-maven-plugin
68 | 3.1.0
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 | src/main/scala
78 | src/test/scala
79 |
80 |
81 |
82 |
83 |
84 | org.apache.maven.plugins
85 | maven-compiler-plugin
86 |
87 | 1.6
88 | 1.6
89 |
90 |
91 |
92 |
93 | net.alchim31.maven
94 | scala-maven-plugin
95 | 3.1.0
96 |
97 | incremental
98 |
99 | -unchecked
100 | -deprecation
101 | -explaintypes
102 |
103 |
104 |
105 | main
106 | spark-uima-tools.App
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 | compile
115 | testCompile
116 |
117 |
118 |
119 | -make:transitive
120 | -dependencyfile
121 | ${project.build.directory}/.scala_dependencies
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 | org.apache.maven.plugins
130 | maven-surefire-plugin
131 | 2.12
132 |
133 | true
134 | false
135 | -Xmx1024m
136 |
137 | **/*Spec.scala
138 |
139 |
140 | **/*Test.scala
141 |
142 |
143 |
144 |
145 |
146 | org.scalatest
147 | scalatest-maven-plugin
148 | 1.0-M2
149 |
150 | ${project.build.directory}/surefire-reports
151 | .
152 | WDF TestSuite.txt
153 |
154 |
155 |
156 | test
157 |
158 | test
159 |
160 |
161 |
162 |
163 |
164 |
165 | org.apache.maven.plugins
166 | maven-source-plugin
167 | 2.1.2
168 |
169 |
170 | attach-sources
171 |
172 | jar
173 |
174 |
175 |
176 |
177 |
178 |
179 | org.apache.maven.plugins
180 | maven-resources-plugin
181 | 2.5
182 |
183 | UTF-8
184 |
185 |
186 |
187 |
188 | org.apache.maven.plugins
189 | maven-release-plugin
190 | 2.3.2
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 | org.apache.maven.wagon
200 | wagon-ssh-external
201 | 1.0-beta-7
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 | org.eclipse.m2e
211 | lifecycle-mapping
212 | 1.0.0
213 |
214 |
215 |
216 |
217 |
218 |
219 | net.alchim31.maven
220 |
221 |
222 | scala-maven-plugin
223 |
224 |
225 | [3.1.0,)
226 |
227 |
228 | compile
229 | testCompile
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
--------------------------------------------------------------------------------
/suim-scala/src/main/scala/edu/cmu/lti/suim/SCAS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Carnegie Mellon University
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package edu.cmu.lti.suim
18 |
19 | import java.io.ByteArrayInputStream
20 | import java.io.ByteArrayOutputStream
21 | import java.io.DataInput
22 | import java.io.DataOutput
23 | import java.io.Externalizable
24 | import java.io.ObjectInput
25 | import java.io.ObjectOutput
26 |
27 | import org.apache.hadoop.io.Writable
28 | import org.apache.uima.cas.CAS
29 | import org.apache.uima.cas.impl.Serialization
30 | import org.apache.uima.fit.factory.JCasFactory
31 |
/** Factory helpers for [[SCAS]]. */
object SCAS {

  /** Deserializes a fresh SCAS from the given DataInput (Writable framing). */
  def read(in: DataInput) = {
    val result = new SCAS()
    result.readFields(in)
    result
  }
}
40 |
/**
 * Serializable wrapper around a UIMA CAS.
 *
 * Implements both java.io.Externalizable (for JVM/Spark serialization) and
 * Hadoop's Writable (for SequenceFile storage); both paths share the same
 * wire format below: a 4-byte length prefix followed by the UIMA compressed
 * binary CAS serialization. Changing this format breaks previously written
 * SequenceFiles.
 */
class SCAS(val cas: CAS) extends Externalizable with Writable {

  // No-arg constructor required by the Externalizable/Writable contracts:
  // starts from a fresh, empty CAS that readFields/readExternal populates.
  def this() {
    this(JCasFactory.createJCas().getCas())
  }

  // Externalizable delegates to the Writable implementation.
  override def readExternal(in: ObjectInput) {
    readFields(in)
  }

  override def writeExternal(out: ObjectOutput) {
    write(out)
  }

  /** JCas view of the wrapped CAS (looked up by UIMA on each call). */
  def jcas = cas.getJCas()

  // Serializes the CAS into a buffer first so the exact byte length can be
  // written before the payload.
  override def write(out: DataOutput) {
    val baos = new ByteArrayOutputStream();
    Serialization.serializeWithCompression(cas, baos)
    out.writeInt(baos.size)
    out.write(baos.toByteArray)
  }

  // Reads the length-prefixed payload and deserializes it into this
  // instance's existing CAS (replacing its current contents).
  override def readFields(in: DataInput) {
    val size = in.readInt();
    val bytes = new Array[Byte](size)
    in.readFully(bytes);
    val bais = new ByteArrayInputStream(bytes)
    Serialization.deserializeCAS(cas, bais);
  }
}
72 |
--------------------------------------------------------------------------------
/suim-scala/src/main/scala/edu/cmu/lti/suim/SparkUimaUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2013 Carnegie Mellon University
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package edu.cmu.lti.suim
18 |
19 | import java.net.URI
20 |
21 | import scala.collection.JavaConversions.asScalaBuffer
22 | import scala.collection.JavaConversions.bufferAsJavaList
23 |
24 | import org.apache.hadoop.conf.Configuration
25 | import org.apache.hadoop.fs.FileSystem
26 | import org.apache.hadoop.fs.Path
27 | import org.apache.hadoop.io.IOUtils
28 | import org.apache.hadoop.io.NullWritable
29 | import org.apache.hadoop.io.SequenceFile
30 | import org.apache.spark.SparkContext
31 | import org.apache.spark.SparkContext.rddToPairRDDFunctions
32 | import org.apache.spark.SparkContext.writableWritableConverter
33 | import org.apache.uima.analysis_engine.AnalysisEngineDescription
34 | import org.apache.uima.collection.CollectionReader
35 | import org.apache.uima.fit.factory.AnalysisEngineFactory
36 | import org.apache.uima.fit.factory.JCasFactory
37 |
/**
 * Helpers for bridging UIMA collection readers / analysis engines with Spark:
 * turn a CollectionReader into an RDD[SCAS] (in memory or via HDFS) and run
 * analysis engines over serialized CASes.
 */
object SparkUimaUtils {

  /**
   * Drains the collection reader and writes each document as one
   * (NullWritable, SCAS) record into a Hadoop SequenceFile at `uri`.
   */
  def createSequenceFile(reader: CollectionReader, uri: String) {
    val conf = new Configuration()
    val fs = FileSystem.get(URI.create(uri), conf)
    val path = new Path(uri)
    val nw = NullWritable.get
    val writer = SequenceFile.createWriter(fs, conf, path, nw.getClass(), classOf[SCAS])
    try {
      while (reader.hasNext()) {
        // fresh CAS per document so every record is self-contained
        val jcas = JCasFactory.createJCas()
        val cas = jcas.getCas()
        reader.getNext(cas)
        val scas = new SCAS(cas)
        writer.append(nw, scas)
      }
    } finally {
      // bug fix: close the writer even when reading or appending throws;
      // previously the stream (and a partially written file) leaked on error
      IOUtils.closeStream(writer)
    }
  }

  /** Stages the collection in a SequenceFile, then loads it as an RDD[SCAS]. */
  def sequenceFile(reader: CollectionReader, uri: String, sc: SparkContext) = {
    createSequenceFile(reader, uri)
    sc.sequenceFile[NullWritable, SCAS](uri).values
  }

  /** Reads every document from the reader into an in-memory list of SCAS. */
  def readFrom(reader: CollectionReader): java.util.List[SCAS] = {
    val buffer = collection.mutable.ArrayBuffer[SCAS]()
    while (reader.hasNext()) {
      val jcas = JCasFactory.createJCas()
      val cas = jcas.getCas()
      reader.getNext(cas)
      buffer += new SCAS(cas)
    }
    buffer
  }

  /** Reads the whole collection into memory and parallelizes it as an RDD. */
  def makeRDD(reader: CollectionReader, sc: SparkContext) = {
    val buffer = readFrom(reader)
    sc.parallelize(buffer)
  }

  /**
   * Runs the analysis engine described by `description` over the SCAS in
   * place and returns the same SCAS.
   * NOTE(review): instantiates a new engine on every call — expensive when
   * mapped over a large RDD; consider reusing one engine per partition.
   */
  def process(scas: SCAS, description: AnalysisEngineDescription) = {
    val ae = AnalysisEngineFactory.createEngine(description)
    ae.process(scas.jcas)
    scas
  }
}
83 |
--------------------------------------------------------------------------------
/suim-scala/src/test/scala/spark-uima-tools/AppSpec.scala:
--------------------------------------------------------------------------------
1 | package cmu.edu.lti.suim
2 |
3 | import org.scalatest.FlatSpec
4 | import org.scalatest.matchers.ShouldMatchers
5 |
/** Placeholder spec that always passes; keeps the ScalaTest harness wired up
 *  until real tests exist. */
// NOTE(review): the enclosing package is "cmu.edu.lti.suim" while production
// code lives in "edu.cmu.lti.suim" — presumably transposed; confirm before
// relying on package-private access from tests.
class AppSpec extends FlatSpec with ShouldMatchers {
  "An App" should "pass" in {
    (1) should equal(1)
  }
}
11 |
--------------------------------------------------------------------------------