├── .gitignore ├── .travis.yml ├── LICENSE ├── README.markdown ├── pom.xml ├── suim-examples ├── data │ ├── Apache_UIMA.txt │ ├── IBM_LifeSciences.txt │ ├── New_IBM_Fellows.txt │ ├── SeminarChallengesInSpeechRecognition.txt │ ├── TrainableInformationExtractionSystems.txt │ ├── UIMASummerSchool2003.txt │ ├── UIMA_Seminars.txt │ ├── WatsonConferenceRooms.txt │ └── xml │ │ ├── IBM_LifeSciences.xml │ │ ├── New_IBM_Fellows.xml │ │ ├── SeminarChallengesInSpeechRecognition.xml │ │ ├── TrainableInformationExtractionSystems.xml │ │ ├── UIMASummerSchool2003.xml │ │ ├── UIMA_Seminars.xml │ │ └── WatsonConferenceRooms.xml ├── pom.xml └── src │ ├── main │ ├── resources │ │ ├── META-INF │ │ │ └── org.apache.uima.fit │ │ │ │ └── types.txt │ │ ├── ex │ │ │ ├── RoomNumberAndDateTime.xml │ │ │ └── TutorialTypeSystem.xml │ │ └── org │ │ │ └── apache │ │ │ └── uima │ │ │ └── tutorial │ │ │ └── ex6 │ │ │ └── uimaAcronyms.txt │ └── scala │ │ └── edu │ │ └── cmu │ │ └── lti │ │ └── suim │ │ └── examples │ │ ├── Annotators.scala │ │ ├── App.scala │ │ ├── AppWithHDFS.scala │ │ └── SparkPipelineExample.scala │ └── test │ └── scala │ └── spark-uima-tools │ └── AppSpec.scala ├── suim-java ├── pom.xml └── src │ └── main │ └── java │ └── edu │ └── cmu │ └── lti │ └── suim │ └── JavaSparkUima.java └── suim-scala ├── pom.xml └── src ├── main └── scala │ └── edu │ └── cmu │ └── lti │ └── suim │ ├── SCAS.scala │ └── SparkUimaUtils.scala └── test └── scala └── spark-uima-tools └── AppSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | target/ 3 | *.DS_Store 4 | *.releaseBackup 5 | release.properties 6 | *.iml 7 | *.iws 8 | *.ipr 9 | .idea/ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.9.2 4 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright 2012 Twitter Inc 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # SUIM 2 | 3 | Spark for Unstructured Information, provides a thin abstraction layer for [UIMA](http://uima.apache.org/) 4 | on top of [Spark](http://spark.apache.org/). 5 | SUIM leverages Spark's resilient distributed datasets (RDDs) to run UIMA pipelines using uimaFIT; SUIM pipelines are 6 | distributed across the nodes on a cluster and can be operated on in parallel [1]. 7 | 8 | SUIM allows you to run analytical pipelines on the resulting (or intermediate) `CAS` to execute further text analytics or 9 | machine learning algorithms. 10 | 11 | ## Examples 12 | 13 | #### Count buildings from the UIMA tutorial. 
14 | 15 | Using the `RoomNumberAnnotator` from the UIMA tutorial: 16 | 17 | 18 | ```scala 19 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 20 | val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data") 21 | val rdd = makeRDD(createCollectionReader(classOf[FileSystemCollectionReader], params: _*), sc) 22 | val rnum = createEngineDescription(classOf[RoomNumberAnnotator]) 23 | val rooms = rdd.map(process(_, rnum)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber])) 24 | val counts = rooms.map(room => room.getBuilding()).map((_,1)).reduceByKey(_ + _) 25 | counts.foreach(println(_)) 26 | ``` 27 | 28 | If the collection is too large to fit in memory, or you already have a collection of `SCAS`es, use an HDFS RDD: 29 | 30 | ```scala 31 | val rdd = sequenceFile(createCollectionReader(classOf[FileSystemCollectionReader], params: _*), 32 | "hdfs://localhost:9000/documents", sc) 33 | ``` 34 | 35 | #### Tokenize and count words with DKPro Core 36 | 37 | Use DKPro Core [2] to tokenize and Spark to do token level analytics. 
38 | 39 | ```scala 40 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 41 | val rdd = makeRDD(createCollectionReader(classOf[TextReader], 42 | ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "data", 43 | ResourceCollectionReaderBase.PARAM_LANGUAGE, "en", 44 | ResourceCollectionReaderBase.PARAM_PATTERNS, Array("[+]*.txt")), sc) 45 | val seg = createPrimitiveDescription(classOf[BreakIteratorSegmenter]) 46 | val tokens = rdd.map(process(_, seg)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[Token])) 47 | val counts = tokens.map(token => token.getCoveredText()) 48 | .filter(filter(_)) 49 | .map((_,1)).reduceByKey(_ + _) 50 | .map(pair => (pair._2, pair._1)).sortByKey(true) 51 | counts.foreach(println(_)) 52 | ``` 53 | 54 | ### Common Tasks 55 | 56 | To build: 57 | 58 | mvn compile 59 | 60 | To run: 61 | 62 | mvn scala:run 63 | 64 | To test: 65 | 66 | mvn test 67 | 68 | To create standalone with dependencies: 69 | 70 | mvn package 71 | java -jar target/spark-uima-tools-0.0.1-SNAPSHOT-jar-with-dependencies.jar 72 | 73 | ## References 74 | * [1] http://spark.incubator.apache.org/docs/latest/scala-programming-guide.html 75 | * [2] https://code.google.com/p/dkpro-core-asl/ 76 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 5 | 4.0.0 6 | 7 | edu.cmu.lti 8 | suim 9 | 0.0.1-SNAPSHOT 10 | SUIM 11 | pom 12 | 13 | 2013 14 | https://github.com/oaqa/suim 15 | 16 | 17 | org.sonatype.oss 18 | oss-parent 19 | 7 20 | 21 | 22 | 23 | github.com 24 | https://github.com/oaqa/suim/issues 25 | 26 | 27 | 28 | 29 | The Apache Software License, Version 2.0 30 | http://www.apache.org/licenses/LICENSE-2.0.txt 31 | repo 32 | 33 | 34 | 35 | 36 | git@github.com:oaqa/suim.git 37 | scm:git:git@github.com:oaqa/suim.git 38 | scm:git:git@github.com:oaqa/suim.git 39 | 40 | 41 | 42 | 1.6 43 | 1.6 44 | UTF-8 45 | 46 | 47 | 48 | 49 | twttr 50 
| twttr 51 | http://maven.twttr.com 52 | 53 | 54 | 55 | 56 | 57 | scala-tools.org 58 | Scala-Tools Maven2 Repository 59 | http://scala-tools.org/repo-releases 60 | 61 | 62 | 63 | 64 | suim-java 65 | suim-scala 66 | suim-examples 67 | 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 76 | 1.6 77 | 1.6 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | junit 87 | junit 88 | 3.8.1 89 | test 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /suim-examples/data/Apache_UIMA.txt: -------------------------------------------------------------------------------- 1 | Welcome to Apache UIMA (Unstructured Information Management Architecture), a incubator project of the Apache Software Foundation (ASF). 2 | Our goal is a thriving community of users and developers of UIMA frameworks, supporting components for analysing unstructured content such as text, audio and video. 3 | 4 | What is UIMA? 5 | 6 | Unstructured Information Management applications are software systems that analyze large volumes of unstructured information in order to discover knowledge that is relevant to an end user. 7 | UIMA is a framework and SDK for developing such applications. An example UIM application might ingest plain text and identify entities, such as persons, places, organizations; or relations, such as works-for or located-at. 8 | UIMA enables such an application to be decomposed into components, for example "language identification" -> "language specific segmentation" -> "sentence boundary detection" -> "entity detection (person/place names etc.)". 9 | Each component must implement interfaces defined by the framework and must provide self-describing metadata via XML descriptor files. The framework manages these components and the data flow between them. Components are written in Java or C++; the data that flows between components is designed for efficient mapping between these languages. 
10 | UIMA additionally provides capabilities to wrap components as network services, and can scale to very large volumes by replicating processing pipelines over a cluster of networked nodes. 11 | 12 | Apache UIMA is an Apache-licensed open source implementation of the UIMA specification (that specification is, in turn, being developed concurrently by a technical committee within OASIS , a standards organization). 13 | We invite and encourage you to participate in both the implementation and specification efforts. 14 | 15 | UIMA is a component framework for analysing unstructured content such as text, audio and video. 16 | It comprises an SDK and tooling for composing and running analytic components written in Java and C++, with some support for Perl, Python and TCL. 17 | 18 | 19 | Apache UIMA mailing lists: 20 | 21 | Users - uima-user@incubator.apache.org 22 | Developers - uima-dev@incubator.apache.org 23 | Commits - uima-commits@incubator.apache.org 24 | 25 | 26 | Apache UIMA project committers: 27 | 28 | Michael Baessler 29 | Edward Epstein 30 | Thilo Goetz 31 | Adam Lally 32 | Marshall Schor 33 | 34 | 35 | Apache UIMA project Mentors: 36 | 37 | Ken Coar (ASF member and Vice President) 38 | Sam Ruby (ASF member) -------------------------------------------------------------------------------- /suim-examples/data/IBM_LifeSciences.txt: -------------------------------------------------------------------------------- 1 | "Life sciences is one of the emerging markets at the heart of IBM's growth strategy," said John M. Thompson, IBM senior vice president & group executive, Software. "This investment is the first of a number of steps we will be taking to advance IBM's life sciences initiatives." In his role as newly appointed IBM Corporation vice chairman, effective September 1, Mr. Thompson will be responsible for integrating and accelerating IBM's efforts to exploit life sciences and other emerging growth areas. 
2 | 3 | IBM estimates the market for IT solutions for life sciences will skyrocket from $3.5 billion today to more than $9 billion by 2003. Driving demand is the explosive growth in genomic, proteomic and pharmaceutical research. For example, the Human Genome Database is approximately three terabytes of data, or the equivalent of 150 million pages of information. The volume of life sciences data is doubling every six months. 4 | 5 | "All of this genetic data is worthless without the information technology that can help scientists manage and analyze it to unlock the pathways that will lead to new cures for many of today's diseases," said Dr. Caroline Kovac, vice president of IBM's new Life Sciences unit. "IBM can help speed this process by enabling more efficient interpretation of data and sharing of knowledge. The potential for change based on innovation in life sciences is bigger than the change caused by the digital circuit." 6 | 7 | Among the life sciences initiatives already underway at IBM are: 8 | - DiscoveryLink* -- For the first time, researchers using this combination of innovative middleware and integration services can join together information from many sources to solve complex medical research problems. DiscoveryLink creates a "virtual database" that permits data to be accessed and extracted from multiple data sources used in research and development projects. This IT solution can dramatically improve product cycle time and lower development costs for pharmaceutical, biotechnology and agri-science companies. 9 | 10 | - Blue Gene* - IBM is building a supercomputer 100 times faster than any available today designed to advance understanding of the mechanisms behind protein folding through large-scale biomolecular simulation. In December, IBM committed $100 million to this five-year research project to advance the state-of-the-art in supercomputing for biological applications. 
11 | - Bio-Dictionary* -- IBM has compiled a protein dictionary containing some 30 million protein "words" designed to accelerate the understanding of protein shapes and functions.Bio-Dictionaries for selected genomes, as well as bioinformatics algorithms for pattern discovery and other relevant applications, are available to scientists and researchers for noncommercial use through a website dedicated to life sciences content at http://www.research.ibm.com/compsci/compbio/. 12 | 13 | * Indicates trademark or registered trademark of IBM Corporation. -------------------------------------------------------------------------------- /suim-examples/data/New_IBM_Fellows.txt: -------------------------------------------------------------------------------- 1 | IBM today elevated five employees to the title of IBM Fellow -- its most prestigious technical honor. The company also presented more than $2.8 million in cash awards to employees whose technical innovation have yielded exceptional value to the company and its customers. 2 | 3 | IBM conferred the accolades and awards at its 2003 Corporate Technical Recognition Event (CTRE) in Scottsdale, Ariz. CTRE is a 40-year tradition at IBM, established to recognize exceptional technical employees and reward them for extraordinary achievements and contributions to the company's technology leadership. 4 | 5 | "Our technical employees are among the best and brightest innovators in the world. They share a passion for excellence that defines their work and permeates the products and services IBM delivers to its customers," said Nick Donofrio, senior vice president, technology and manufacturing for IBM. "CTRE provides the means for us to honor those who have distinguished themselves as exceptional leaders among their peers." 6 | 7 | Among the special honorees at the 2003 CTRE are five employees who earned the coveted distinction of IBM Fellow: 8 | 9 | 10 | - Grady Booch, chief scientist of Rational Software, IBM Software Group. 
Recognized internationally for his innovative work on software architecture, modeling, and software engineering process. Mr. Booch is one of the original authors of the Unified Modeling Language (UML), the industry-standard language of blueprints for software-intensive systems. 11 | 12 | - Dr. Donald Chamberlin, researcher, IBM Almaden Research Center. An expert in relational database languages, Dr. Chamberlin is co- inventor of SQL, the language that energized the relational database market. He has also influenced the creation of XQuery, one of a new generation of database query languages covering structured, semi-structured and unstructured data. 13 | 14 | - Dr. George Galambos, chief technology officer, IBM Global Services (IGS) in Canada; the first Fellow from Canada. Dr. Galambos specializes in high-performance, high availability designs, operational effectiveness, and risk assessment/mitigation, focusing on systems engineering and architecture reuse that enhances efficiency and stability. He is a principal driver of and contributor to the widely acclaimed "Patterns for e-business" and the Enterprise Solution Structure Reference Architectures, widely used by IGS in customer engagements. 15 | 16 | - Rod Smith, vice president of Internet emerging technologies, IBM Software Group. A leader in the areas of object-oriented programming, visual development tools, Java, XML, and Web Services. Rod also was the chief technical strategist for focusing the Java platform for use in middleware solutions, in particular initiating contributions to the development of the J2EE. 17 | 18 | - Charles Webb, eServer processor design, IBM Systems Group. Charles Webb has led the reinvention of IBM's eServer zSeries microprocessor designs and roadmap, including the z900 server, where he provided the bridge among architecture, hardware, compilers and system software, defining major portions of the 64- bit architecture and beyond. 
19 | 20 | 21 | The title of IBM Fellow is the company's most preeminent technical distinction and is granted in recognition of outstanding and sustained technical achievements in engineering, programming, science and technology. Only 175 individuals have earned this designation in the company's history and, including the newly named Fellows, 56 are active employees. IBM Fellows are encouraged to further enhance their potential for creative achievements and typically work on special projects or research initiatives that lead the company in exciting new directions. 22 | 23 | -------------------------------------------------------------------------------- /suim-examples/data/SeminarChallengesInSpeechRecognition.txt: -------------------------------------------------------------------------------- 1 | UIT Seminar: Challenges in Speech Recognition 2 | August 8, 2003 10:30 AM - 11:30 AM 3 | Lawrence Rabiner , Associate Director CAIP, Rutgers 4 | University, Professor Univ. of Santa Barbara 5 | Yorktown 20-043 6 | Availability: Open 7 | 8 | Speech recognition has matured to the point where it 9 | is now being widely applied in a range of applications 10 | including desktop dictation, cell phone name dialing, 11 | agent technology, automated operator services, 12 | telematics, call center automation and help desks. 13 | 14 | Although the technology is often good enough for many 15 | of these applications, there remain key challenges in 16 | virtually every aspect of speech recognition that 17 | prevent the technology from being used ubiquitously in 18 | any environment, for any speaker, and for an even 19 | broader range of applications. 
This talk will analyze 20 | the ‘Speech Circle’ that enables a person to maintain 21 | a dialog with a machine using speech recognition, 22 | spoken language understanding, dialog management and 23 | spoken language generation, and finally text-to-speech 24 | synthesis, and show where significant progress has 25 | been made, and where there remain critical problems 26 | that need to be addressed and solved. 27 | 28 | The talk will include several audio and video examples 29 | of speech recognition and speech understanding systems 30 | that have been studied in the laboratory to illustrate 31 | the challenges that remain to be solved before speech 32 | recognition is considered a solved problem. 33 | 34 | 35 | -------------------------------------------------------------------------------- /suim-examples/data/TrainableInformationExtractionSystems.txt: -------------------------------------------------------------------------------- 1 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems 2 | 3 | August 19, 2003 02:00 PM - 03:30 PM 4 | David Johnson, Frank Oles, Tong Zhang(IBM Research) 5 | Hawthorne GN-F15 6 | Availability: Open 7 | 8 | The technical objective of the TIES project is to build customizable systems that can identify named entities in text, such as persons, organizations, and locations, as well as identifying relations between those entities. The technical approach is to develop new statistical and symbolic machine learning algorithms in service of the technical objective. Also, we are working on combining statistical with symbolic techniques. The first part of this talk, given by David E. Johnson, will provide a general overview of the goals of the TIES project. The second part, given by Tong Zhang, will provide background on applying statistical machine learning to this problem domain. Tong will also describe the particular statistical approach taken, which is termed Robust Risk Minimization (RMM). 
The final part will be given by Frank J. Oles. Frank will introduce his theory of precedence-inclusion patterns. Precedence-inclusion patterns are mathematical structures possessing multiple interacting strict partial orders that satisfy axioms generalizing the familiar properties of irreflexivity and transitivity. This very general theory provides a radically new approach to symbolic, as opposed to statistical, pattern generalization that can be applied to relational learning in a number of settings, including learning based on text, on images, or on videos. 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /suim-examples/data/UIMASummerSchool2003.txt: -------------------------------------------------------------------------------- 1 | UIMA Summer School 2 | 3 | August 26, 2003 4 | UIMA 101 - The New UIMA Introduction 5 | (Hands-on Tutorial) 6 | 9:00AM-5:00PM in HAW GN-K35 7 | 8 | August 28, 2003 9 | FROST Tutorial 10 | 9:00AM-5:00PM in HAW GN-K35 11 | 12 | September 15, 2003 13 | UIMA 201: UIMA Advanced Topics 14 | (Hands-on Tutorial) 15 | 9:00AM-5:00PM in HAW 1S-F53 16 | 17 | September 17, 2003 18 | The UIMA System Integration Test and Hardening Service 19 | The "SITH" 20 | 3:00PM-4:30PM in HAW GN-K35 21 | 22 | 23 | 24 | UIMA Summer School Tutorial and Presentation Details 25 | UIMA 101: The new UIMA tutorial 26 | Tuesday August 26 9:00AM - 4:30PM in GN-K35 27 | 28 | UIMA 101 is a hands-on programming tutorial. 29 | 30 | UIMA 101 is intended for people who want a first introductory course to UIMA or for people who would like a refresher. 
31 | 32 | The tutorial covers the same concepts in the first UIMA tutorial given in 3Q 2002 except for some key updates: 33 | 34 | 1) It uses a new interface to the CAS that makes it more natural to access and update CAS feature structures using ordinary Java objects (i.e., the JCAS) and 35 | 2) It uses updated TAE interfaces that give the application developer more control over managing multiple CASs. 36 | 37 | Please NOTE expert users of UIMA can skip this one and should consider attending the Advanced Topics tutorial. 38 | 39 | Prerequisites for the UIMA 101 Tutorial 40 | 1) Java Programming 41 | 2) Some experience with Eclipse IDE helpful 42 | 43 | FROST Tutorial 44 | August 28 9:00AM - 5:00PM in GN-K35 45 | 46 | Visitors from the FROST team will be here to talk to us about FROST. 47 | 48 | UIMA 201: The UIMA Advanced Topics Tutorial 49 | September 15: 9:00AM - 5:30PM in Hawthorne 1S-F53 50 | 51 | UIMA 201 will introduce some new UIMA concepts and walk the student through hands-on examples. 52 | 53 | The advanced topics tutorial is designed for people who have some experience with UIMA and want 54 | to use new capabilities of UIMA 1.0 to address one or more of the following 55 | Advanced Topics: 56 | 57 | 1) Collection Processing and Collection Processing Engines (CPEs) 58 | 2) Multi-Threading and CAS Pooling 59 | 3) Using the UIMA adapter framework to integrate network TAEs with Java TAEs 60 | 4) A Semantic Search Application that brings it all together 61 | 62 | Prerequisites for UIMA 201 63 | 1) UIMA 101 Tutorial OR Extensive UIMA Experience 64 | 65 | The UIMA Integration Test bed Service (The "SITH") 66 | September 17 3:00PM - 4:30PM in HAW GN-K35 67 | 68 | We have developed the first version of the UIMA Integration Test bed service. 69 | 70 | This service is being developed to help test, evaluate, certify and publish UIMA compliant components. 71 | 72 | In this talk we will explain the service and what it is intended to provide the UIMA community. 
We will address the following topics: 73 | 74 | 1. SITH Services 75 | 2. How to submit components and what to expect in return 76 | 3. Overview of the test bed implementation using Collection Processing UIMA and Juru. 77 | 4. Next Steps for the SITH 78 | 79 | 80 | -------------------------------------------------------------------------------- /suim-examples/data/UIMA_Seminars.txt: -------------------------------------------------------------------------------- 1 | Upcoming UIMA Seminars 2 | 3 | April 7, 2004 Distillery Lunch Seminar 4 | UIMA and its Metadata 5 | 12:00PM-1:00PM in HAW GN-K35. 6 | 7 | Dave Ferrucci will give a UIMA overview and discuss the types of component metadata that UIMA components provide. Jon Lenchner will give a demo of the Text Analysis Engine configurator tool. 8 | 9 | 10 | April 16, 2004 KM & I Department Tea 11 | Title: An Eclipse-based TAE Configurator Tool 12 | 3:00PM-4:30PM in HAW GN-K35 . 13 | 14 | Jon Lenchner will demo an Eclipse plugin for configuring TAE descriptors, which will be available soon for you to use. No more editing XML descriptors by hand! 15 | 16 | 17 | May 11, 2004 UIMA Tutorial 18 | 9:00AM-5:00PM in HAW GN-K35. 19 | 20 | This is a full-day, hands-on tutorial on UIMA, covering the development of Text Analysis Engines and Collection Processing Engines, as well as how to include these components in your own applications. 21 | -------------------------------------------------------------------------------- /suim-examples/data/WatsonConferenceRooms.txt: -------------------------------------------------------------------------------- 1 | Conference Rooms at Watson: 2 | Location Capacity Wall Phone Ext. 
3 | 4 | Classroom Style 5 | HAW J2-B34 Seats 12 tieline 863-3130 6 | HAW J2-N07 Seats 24 tieline 863-3210 7 | YKT 20-001 Seats 36 tieline 862-4304 8 | YKT 20-051 Seats 18 tieline 862-4307 9 | 10 | Conference Style 11 | HAW 2N-F28 Seats 20 tieline 863-7583 12 | HAW 4N-B15 Seats 14 tieline 863-7126 13 | HAW 4N-B17 Seats 10 tieline 863-7089 14 | HAW 4S-K21 Seats 16 tieline 863-6386 15 | HAW GN-F14 Seats 12 tieline 863-6770 16 | HAW GN-K30 Seats 12 tieline 863-7335 17 | HAW GN-K36 Seats 10 tieline 863-6098 18 | HAW J1-N14 Seats 24 tieline 863-3629 19 | HAW J2-A16 Seats 12 tieline 863-3240 20 | HAW J2-G27 Seats 15 tieline 863-3150 21 | HAW J2-M24 Seats 8 tieline 863-3160 22 | YKT 03-135 Seats 8 tieline 862-1696 23 | YKT 03-235 Seats 8 tieline 862-4278 24 | YKT 05-135 Seats 8 tieline 862-3477 25 | YKT 05-235 Seats 8 tieline 862-4279 26 | YKT 20-006 Seats 8 tieline 862-4301 27 | YKT 20-059 Seats 20 tieline 862-4308 28 | YKT 35-132 Seats 8 tieline 862-2873 29 | YKT 35-232 Seats 8 tieline 862-2860 30 | YKT 38-023 Seats 8 tieline 862-3299 31 | YKT 39-132 Seats 8 tieline 862-3486 32 | YKT 40-100 Seats 20 tieline 862-4199 33 | YKT 40-200 Seats 20 tieline 862-1379 34 | 35 | Other 36 | HAW GN-K35 Seats 24 tieline 863-6104 37 | 38 | Theater Style 39 | HAW 1S-F40 Seats 30 tieline 863-6396 40 | YKT 20-043 Seats 50 tieline 862-4306 41 | 42 | Video Conference Room 43 | YKT 32-026 Seats 25 tieline 862-3917 44 | -------------------------------------------------------------------------------- /suim-examples/data/xml/IBM_LifeSciences.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | IBM announces $100 Million investment in Life Sciences 26 | 16 August 2000 27 | "Life sciences is one of the emerging markets at the heart of IBM's growth strategy," said John M. Thompson, IBM senior vice president & group executive, Software. 
"This investment is the first of a number of steps we will be taking to advance IBM's life sciences initiatives." In his role as newly appointed IBM Corporation vice chairman, effective September 1, Mr. Thompson will be responsible for integrating and accelerating IBM's efforts to exploit life sciences and other emerging growth areas. 28 | 29 | IBM estimates the market for IT solutions for life sciences will skyrocket from $3.5 billion today to more than $9 billion by 2003. Driving demand is the explosive growth in genomic, proteomic and pharmaceutical research. For example, the Human Genome Database is approximately three terabytes of data, or the equivalent of 150 million pages of information. The volume of life sciences data is doubling every six months. 30 | 31 | "All of this genetic data is worthless without the information technology that can help scientists manage and analyze it to unlock the pathways that will lead to new cures for many of today's diseases," said Dr. Caroline Kovac, vice president of IBM's new Life Sciences unit. "IBM can help speed this process by enabling more efficient interpretation of data and sharing of knowledge. The potential for change based on innovation in life sciences is bigger than the change caused by the digital circuit." 32 | 33 | Among the life sciences initiatives already underway at IBM are: 34 | - DiscoveryLink* -- For the first time, researchers using this combination of innovative middleware and integration services can join together information from many sources to solve complex medical research problems. DiscoveryLink creates a "virtual database" that permits data to be accessed and extracted from multiple data sources used in research and development projects. This IT solution can dramatically improve product cycle time and lower development costs for pharmaceutical, biotechnology and agri-science companies. 
35 | 36 | - Blue Gene* - IBM is building a supercomputer 100 times faster than any available today designed to advance understanding of the mechanisms behind protein folding through large-scale biomolecular simulation. In December, IBM committed $100 million to this five-year research project to advance the state-of-the-art in supercomputing for biological applications. 37 | - Bio-Dictionary* -- IBM has compiled a protein dictionary containing some 30 million protein "words" designed to accelerate the understanding of protein shapes and functions.Bio-Dictionaries for selected genomes, as well as bioinformatics algorithms for pattern discovery and other relevant applications, are available to scientists and researchers for noncommercial use through a website dedicated to life sciences content at http://www.research.ibm.com/compsci/compbio/. 38 | 39 | 40 | -------------------------------------------------------------------------------- /suim-examples/data/xml/New_IBM_Fellows.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | IBM Names Five Fellows, Company's Highest Techinical Honor 26 | 05 June 2002 27 | 28 | IBM today elevated five employees to the title of IBM Fellow -- its most prestigious technical honor. The company also presented more than $2.8 million in cash awards to employees whose technical innovation have yielded exceptional value to the company and its customers. 29 | 30 | IBM conferred the accolades and awards at its 2003 Corporate Technical Recognition Event (CTRE) in Scottsdale, Ariz. CTRE is a 40-year tradition at IBM, established to recognize exceptional technical employees and reward them for extraordinary achievements and contributions to the company's technology leadership. 31 | 32 | "Our technical employees are among the best and brightest innovators in the world. 
They share a passion for excellence that defines their work and permeates the products and services IBM delivers to its customers," said Nick Donofrio, senior vice president, technology and manufacturing for IBM. "CTRE provides the means for us to honor those who have distinguished themselves as exceptional leaders among their peers." 33 | 34 | Among the special honorees at the 2003 CTRE are five employees who earned the coveted distinction of IBM Fellow: 35 | 36 | 37 | - Grady Booch, chief scientist of Rational Software, IBM Software Group. Recognized internationally for his innovative work on software architecture, modeling, and software engineering process. Mr. Booch is one of the original authors of the Unified Modeling Language (UML), the industry-standard language of blueprints for software-intensive systems. 38 | 39 | - Dr. Donald Chamberlin, researcher, IBM Almaden Research Center. An expert in relational database languages, Dr. Chamberlin is co- inventor of SQL, the language that energized the relational database market. He has also influenced the creation of XQuery, one of a new generation of database query languages covering structured, semi-structured and unstructured data. 40 | 41 | - Dr. George Galambos, chief technology officer, IBM Global Services (IGS) in Canada; the first Fellow from Canada. Dr. Galambos specializes in high-performance, high availability designs, operational effectiveness, and risk assessment/mitigation, focusing on systems engineering and architecture reuse that enhances efficiency and stability. He is a principal driver of and contributor to the widely acclaimed "Patterns for e-business" and the Enterprise Solution Structure Reference Architectures, widely used by IGS in customer engagements. 42 | 43 | - Rod Smith, vice president of Internet emerging technologies, IBM Software Group. A leader in the areas of object-oriented programming, visual development tools, Java, XML, and Web Services. 
Rod also was the chief technical strategist for focusing the Java platform for use in middleware solutions, in particular initiating contributions to the development of the J2EE. 44 | 45 | - Charles Webb, eServer processor design, IBM Systems Group. Charles Webb has led the reinvention of IBM's eServer zSeries microprocessor designs and roadmap, including the z900 server, where he provided the bridge among architecture, hardware, compilers and system software, defining major portions of the 64- bit architecture and beyond. 46 | 47 | 48 | The title of IBM Fellow is the company's most preeminent technical distinction and is granted in recognition of outstanding and sustained technical achievements in engineering, programming, science and technology. Only 175 individuals have earned this designation in the company's history and, including the newly named Fellows, 56 are active employees. IBM Fellows are encouraged to further enhance their potential for creative achievements and typically work on special projects or research initiatives that lead the company in exciting new directions. 49 | 50 | -------------------------------------------------------------------------------- /suim-examples/data/xml/SeminarChallengesInSpeechRecognition.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | 25 | 26 | UIT Seminar: Challenges in Speech Recognition 27 | 8 August 2003 28 | 29 | UIT Seminar: Challenges in Speech Recognition 30 | August 8, 2003 10:30 AM - 11:30 AM 31 | Lawrence Rabiner , Associate Director CAIP, Rutgers 32 | University, Professor Univ. of Santa Barbara 33 | Yorktown 20-043 34 | Availability: Open 35 | 36 | Speech recognition has matured to the point where it 37 | is now being widely applied in a range of applications 38 | including desktop dictation, cell phone name dialing, 39 | agent technology, automated operator services, 40 | telematics, call center automation and help desks. 
41 | 42 | Although the technology is often good enough for many 43 | of these applications, there remain key challenges in 44 | virtually every aspect of speech recognition that 45 | prevent the technology from being used ubiquitously in 46 | any environment, for any speaker, and for an even 47 | broader range of applications. This talk will analyze 48 | the ‘Speech Circle’ that enables a person to maintain 49 | a dialog with a machine using speech recognition, 50 | spoken language understanding, dialog management and 51 | spoken language generation, and finally text-to-speech 52 | synthesis, and show where significant progress has 53 | been made, and where there remain critical problems 54 | that need to be addressed and solved. 55 | 56 | The talk will include several audio and video examples 57 | of speech recognition and speech understanding systems 58 | that have been studied in the laboratory to illustrate 59 | the challenges that remain to be solved before speech 60 | recognition is considered a solved problem. 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /suim-examples/data/xml/TrainableInformationExtractionSystems.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems 26 | 19 August 2003 27 | 28 | Adventurous Research Summer Seminar Series - Trainable Information Extraction Systems 29 | 30 | August 19, 2003 02:00 PM - 03:30 PM 31 | David Johnson, Frank Oles, Tong Zhang(IBM Research) 32 | Hawthorne GN-F15 33 | Availability: Open 34 | 35 | The technical objective of the TIES project is to build customizable systems that can identify named entities in text, such as persons, organizations, and locations, as well as identifying relations between those entities. 
The technical approach is to develop new statistical and symbolic machine learning algorithms in service of the technical objective. Also, we are working on combining statistical with symbolic techniques. The first part of this talk, given by David E. Johnson, will provide a general overview of the goals of the TIES project. The second part, given by Tong Zhang, will provide background on applying statistical machine learning to this problem domain. Tong will also describe the particular statistical approach taken, which is termed Robust Risk Minimization (RMM). The final part will be given by Frank J. Oles. Frank will introduce his theory of precedence-inclusion patterns. Precedence-inclusion patterns are mathematical structures possessing multiple interacting strict partial orders that satisfy axioms generalizing the familiar properties of irreflexivity and transitivity. This very general theory provides a radically new approach to symbolic, as opposed to statistical, pattern generalization that can be applied to relational learning in a number of settings, including learning based on text, on images, or on videos. 
36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /suim-examples/data/xml/UIMASummerSchool2003.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | UIMA Summer School 25 | 1 August 2003 26 | 27 | August 26, 2003 28 | UIMA 101 - The New UIMA Introduction 29 | (Hands-on Tutorial) 30 | 9:00AM-5:00PM in HAW GN-K35 31 | 32 | August 28, 2003 33 | FROST Tutorial 34 | 9:00AM-5:00PM in HAW GN-K35 35 | 36 | September 15, 2003 37 | UIMA 201: UIMA Advanced Topics 38 | (Hands-on Tutorial) 39 | 9:00AM-5:00PM in HAW 1S-F53 40 | 41 | September 17, 2003 42 | The UIMA System Integration Test and Hardening Service 43 | The "SITH" 44 | 3:00PM-4:30PM in HAW GN-K35 45 | 46 | 47 | 48 | UIMA Summer School Tutorial and Presentation Details 49 | UIMA 101: The new UIMA tutorial 50 | Tuesday August 26 9:00AM - 4:30PM in GN-K35 51 | 52 | UIMA 101 is a hands-on programming tutorial. 53 | 54 | UIMA 101 is intended for people who want a first introductory course to UIMA or for people who would like a refresher. 55 | 56 | The tutorial covers the same concepts in the first UIMA tutorial given in 3Q 2002 except for some key updates: 57 | 58 | 1) It uses a new interface to the CAS that makes it more natural to access and update CAS feature structures using ordinary Java objects (i.e., the JCAS) and 59 | 2) It uses updated TAE interfaces that give the application developer more control over managing multiple CASs. 60 | 61 | Please NOTE expert users of UIMA can skip this one and should consider attending the Advanced Topics tutorial. 62 | 63 | Prerequisites for the UIMA 101 Tutorial 64 | 1) Java Programming 65 | 2) Some experience with Eclipse IDE helpful 66 | 67 | FROST Tutorial 68 | August 28 9:00AM - 5:00PM in GN-K35 69 | 70 | Visitors from the FROST team will be here to talk to us about FROST. 
71 | 72 | UIMA 201: The UIMA Advanced Topics Tutorial 73 | September 15: 9:00AM - 5:30PM in Hawthorne 1S-F53 74 | 75 | UIMA 201 will introduce some new UIMA concepts and walk the student through hands-on examples. 76 | 77 | The advanced topics tutorial is designed for people who have some experience with UIMA and want 78 | to use new capabilities of UIMA 1.0 to address one or more of the following 79 | Advanced Topics: 80 | 81 | 1) Collection Processing and Collection Processing Engines (CPEs) 82 | 2) Multi-Threading and CAS Pooling 83 | 3) Using the UIMA adapter framework to integrate network TAEs with Java TAEs 84 | 4) A Semantic Search Application that brings it all together 85 | 86 | Prerequisites for UIMA 201 87 | 1) UIMA 101 Tutorial OR Extensive UIMA Experience 88 | 89 | The UIMA Integration Test bed Service (The "SITH") 90 | September 17 3:00PM - 4:30PM in HAW GN-K35 91 | 92 | We have developed the first version of the UIMA Integration Test bed service. 93 | 94 | This service is being developed to help test, evaluate, certify and publish UIMA compliant components. 95 | 96 | In this talk we will explain the service and what it is intended to provide the UIMA community. We will address the following topics: 97 | 98 | 1. SITH Services 99 | 2. How to submit components and what to expect in return 100 | 3. Overview of the test bed implementation using Collection Processing UIMA and Juru. 101 | 4. Next Steps for the SITH 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /suim-examples/data/xml/UIMA_Seminars.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | Upcoming UIMA Seminars 26 | 15 March 2004 27 | 28 | April 7, 2004 Distillery Lunch Seminar 29 | UIMA and its Metadata 30 | 12:00PM-1:00PM in HAW GN-K35. 31 | 32 | Dave Ferrucci will give a UIMA overview and discuss the types of component metadata that UIMA components provide. 
Jon Lenchner will give a demo of the Text Analysis Engine configurator tool. 33 | 34 | 35 | April 16, 2004 KM & I Department Tea 36 | Title: An Eclipse-based TAE Configurator Tool 37 | 3:00PM-4:30PM in HAW GN-K35 . 38 | 39 | Jon Lenchner will demo an Eclipse plugin for configuring TAE descriptors, which will be available soon for you to use. No more editing XML descriptors by hand! 40 | 41 | 42 | May 11, 2004 UIMA Tutorial 43 | 9:00AM-5:00PM in HAW GN-K35. 44 | 45 | This is a full-day, hands-on tutorial on UIMA, covering the development of Text Analysis Engines and Collection Processing Engines, as well as how to include these components in your own applications. 46 | 47 | -------------------------------------------------------------------------------- /suim-examples/data/xml/WatsonConferenceRooms.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | Conference Rooms at Watson 26 | 01 January 2000 27 | 28 | Conference Rooms at Watson: 29 | Location Capacity Wall Phone Ext. 
30 | 31 | Classroom Style 32 | HAW J2-B34 Seats 12 tieline 863-3130 33 | HAW J2-N07 Seats 24 tieline 863-3210 34 | YKT 20-001 Seats 36 tieline 862-4304 35 | YKT 20-051 Seats 18 tieline 862-4307 36 | 37 | Conference Style 38 | HAW 2N-F28 Seats 20 tieline 863-7583 39 | HAW 4N-B15 Seats 14 tieline 863-7126 40 | HAW 4N-B17 Seats 10 tieline 863-7089 41 | HAW 4S-K21 Seats 16 tieline 863-6386 42 | HAW GN-F14 Seats 12 tieline 863-6770 43 | HAW GN-K30 Seats 12 tieline 863-7335 44 | HAW GN-K36 Seats 10 tieline 863-6098 45 | HAW J1-N14 Seats 24 tieline 863-3629 46 | HAW J2-A16 Seats 12 tieline 863-3240 47 | HAW J2-G27 Seats 15 tieline 863-3150 48 | HAW J2-M24 Seats 8 tieline 863-3160 49 | YKT 03-135 Seats 8 tieline 862-1696 50 | YKT 03-235 Seats 8 tieline 862-4278 51 | YKT 05-135 Seats 8 tieline 862-3477 52 | YKT 05-235 Seats 8 tieline 862-4279 53 | YKT 20-006 Seats 8 tieline 862-4301 54 | YKT 20-059 Seats 20 tieline 862-4308 55 | YKT 35-132 Seats 8 tieline 862-2873 56 | YKT 35-232 Seats 8 tieline 862-2860 57 | YKT 38-023 Seats 8 tieline 862-3299 58 | YKT 39-132 Seats 8 tieline 862-3486 59 | YKT 40-100 Seats 20 tieline 862-4199 60 | YKT 40-200 Seats 20 tieline 862-1379 61 | 62 | Other 63 | HAW GN-K35 Seats 24 tieline 863-6104 64 | 65 | Theater Style 66 | HAW 1S-F40 Seats 30 tieline 863-6396 67 | YKT 20-043 Seats 50 tieline 862-4306 68 | 69 | Video Conference Room 70 | YKT 32-026 Seats 25 tieline 862-3917 71 | 72 | -------------------------------------------------------------------------------- /suim-examples/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | edu.cmu.lti 5 | suim-examples 6 | 0.0.1-SNAPSHOT 7 | SUIM Examples 8 | 9 | 10 | 2.9.3 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | edu.cmu.lti 20 | suim-scala 21 | 0.0.1-SNAPSHOT 22 | 23 | 24 | de.tudarmstadt.ukp.dkpro.core 25 | de.tudarmstadt.ukp.dkpro.core.io.text-asl 26 | 1.5.0 27 | 28 | 29 | de.tudarmstadt.ukp.dkpro.core 30 | 
de.tudarmstadt.ukp.dkpro.core.tokit-asl 31 | 1.5.0 32 | 33 | 34 | de.tudarmstadt.ukp.dkpro.core 35 | de.tudarmstadt.ukp.dkpro.core.api.segmentation-asl 36 | 1.5.0 37 | 38 | 39 | org.scalatest 40 | scalatest_2.9.2 41 | 1.7.2 42 | test 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | net.alchim31.maven 53 | scala-maven-plugin 54 | 3.1.0 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | src/main/scala 64 | src/test/scala 65 | 66 | 67 | 68 | 69 | 70 | org.apache.maven.plugins 71 | maven-compiler-plugin 72 | 73 | 1.6 74 | 1.6 75 | 76 | 77 | 78 | 79 | net.alchim31.maven 80 | scala-maven-plugin 81 | 3.1.0 82 | 83 | incremental 84 | 85 | -unchecked 86 | -deprecation 87 | -explaintypes 88 | 89 | 90 | 91 | main 92 | spark-uima-tools.App 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | compile 101 | testCompile 102 | 103 | 104 | 105 | -make:transitive 106 | -dependencyfile 107 | ${project.build.directory}/.scala_dependencies 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | org.apache.maven.plugins 116 | maven-assembly-plugin 117 | 2.2-beta-5 118 | 119 | 120 | jar-with-dependencies 121 | 122 | 123 | 124 | spark-uima-tools.App 125 | 126 | 127 | 128 | 129 | 130 | package 131 | 132 | single 133 | 134 | 135 | 136 | 137 | 138 | 139 | org.apache.maven.plugins 140 | maven-surefire-plugin 141 | 2.12 142 | 143 | true 144 | false 145 | -Xmx1024m 146 | 147 | **/*Spec.scala 148 | 149 | 150 | **/*Test.scala 151 | 152 | 153 | 154 | 155 | 156 | org.scalatest 157 | scalatest-maven-plugin 158 | 1.0-M2 159 | 160 | ${project.build.directory}/surefire-reports 161 | . 
162 | WDF TestSuite.txt 163 | 164 | 165 | 166 | test 167 | 168 | test 169 | 170 | 171 | 172 | 173 | 174 | 175 | org.apache.maven.plugins 176 | maven-source-plugin 177 | 2.1.2 178 | 179 | 180 | attach-sources 181 | 182 | jar 183 | 184 | 185 | 186 | 187 | 188 | 189 | org.apache.maven.plugins 190 | maven-resources-plugin 191 | 2.5 192 | 193 | UTF-8 194 | 195 | 196 | 197 | 198 | org.apache.maven.plugins 199 | maven-release-plugin 200 | 2.3.2 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | org.apache.maven.wagon 210 | wagon-ssh-external 211 | 1.0-beta-7 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | org.eclipse.m2e 221 | lifecycle-mapping 222 | 1.0.0 223 | 224 | 225 | 226 | 227 | 228 | 229 | net.alchim31.maven 230 | 231 | 232 | scala-maven-plugin 233 | 234 | 235 | [3.1.0,) 236 | 237 | 238 | compile 239 | testCompile 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /suim-examples/src/main/resources/META-INF/org.apache.uima.fit/types.txt: -------------------------------------------------------------------------------- 1 | classpath*:ex/TutorialTypeSystem.xml 2 | classpath*:org/apache/uima/examples/SourceDocumentInformation.xml 3 | classpath*:desc/type/LexicalUnits.xml 4 | classpath*:desc/type/metadata.xml 5 | 6 | -------------------------------------------------------------------------------- /suim-examples/src/main/resources/ex/RoomNumberAndDateTime.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | org.apache.uima.java 26 | false 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | Aggregate TAE - Room Number and DateTime Annotators 40 | Detects Room Numbers, Dates, and Times 41 | 42 | 43 | 44 | RoomNumber 45 | DateTime 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | org.apache.uima.tutorial.RoomNumber 55 | 56 | org.apache.uima.tutorial.DateAnnot 57 | 58 | 
org.apache.uima.tutorial.TimeAnnot 59 | 60 | 61 | en 62 | 63 | 64 | 65 | 66 | true 67 | true 68 | false 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /suim-examples/src/main/resources/ex/TutorialTypeSystem.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 23 | 24 | 25 | TutorialTypeSystem 26 | Type System Definition for the tutorial examples - as of Exercise 6 27 | 1.0 28 | The Apache Software Foundation 29 | 30 | 31 | org.apache.uima.tutorial.RoomNumber 32 | 33 | uima.tcas.Annotation 34 | 35 | 36 | building 37 | Building containing this room 38 | uima.cas.String 39 | 40 | 41 | 42 | 43 | org.apache.uima.tutorial.DateTimeAnnot 44 | 45 | uima.tcas.Annotation 46 | 47 | 48 | shortDateString 49 | 50 | uima.cas.String 51 | 52 | 53 | 54 | 55 | org.apache.uima.tutorial.TimeAnnot 56 | 57 | org.apache.uima.tutorial.DateTimeAnnot 58 | 59 | 60 | 61 | org.apache.uima.tutorial.DateAnnot 62 | 63 | org.apache.uima.tutorial.DateTimeAnnot 64 | 65 | 66 | 67 | org.apache.uima.tutorial.Meeting 68 | 69 | uima.tcas.Annotation 70 | 71 | 72 | room 73 | 74 | org.apache.uima.tutorial.RoomNumber 75 | 76 | 77 | date 78 | 79 | org.apache.uima.tutorial.DateAnnot 80 | 81 | 82 | startTime 83 | 84 | org.apache.uima.tutorial.TimeAnnot 85 | 86 | 87 | endTime 88 | 89 | org.apache.uima.tutorial.TimeAnnot 90 | 91 | 92 | 93 | 94 | org.apache.uima.tutorial.UimaAcronym 95 | 96 | uima.tcas.Annotation 97 | 98 | 99 | expandedForm 100 | 101 | uima.cas.String 102 | 103 | 104 | 105 | 106 | org.apache.uima.tutorial.UimaMeeting 107 | 108 | org.apache.uima.tutorial.Meeting 109 | 110 | 111 | org.apache.uima.examples.tokenizer.Token 112 | 113 | uima.tcas.Annotation 114 | 115 | 116 | org.apache.uima.examples.tokenizer.Sentence 117 | 118 | uima.tcas.Annotation 119 | 120 | 121 | -------------------------------------------------------------------------------- 
/suim-examples/src/main/resources/org/apache/uima/tutorial/ex6/uimaAcronyms.txt: -------------------------------------------------------------------------------- 1 | UIMA Unstructured Information Management Architecture 2 | SITH System Integration Testing and Hardening 3 | CPE Collection Processing Engine 4 | CPM Collection Processing Manager 5 | AE Analysis Engine 6 | CAS Common Analysis Structure 7 | JCAS Java Common Analysis Structure -------------------------------------------------------------------------------- /suim-examples/src/main/scala/edu/cmu/lti/suim/examples/Annotators.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package edu.cmu.lti.suim.examples 18 | 19 | import scala.collection.JavaConversions.collectionAsScalaIterable 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.spark.SparkContext.rddToOrderedRDDFunctions 23 | import org.apache.spark.SparkContext.rddToPairRDDFunctions 24 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription 25 | import org.apache.uima.fit.factory.CollectionReaderFactory.createReader 26 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 27 | import org.apache.uima.fit.util.JCasUtil 28 | 29 | import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase 30 | import de.tudarmstadt.ukp.dkpro.core.api.segmentation.`type`.Token 31 | import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader 32 | import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter 33 | import edu.cmu.lti.suim.SparkUimaUtils.makeRDD 34 | import edu.cmu.lti.suim.SparkUimaUtils.process 35 | 36 | 37 | object Annotators { 38 | 39 | def main(args: Array[String]) = { 40 | val sc = new SparkContext(args(0), "App", 41 | System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":")) 42 | 43 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 44 | val rdd = makeRDD(createReader(classOf[TextReader], 45 | ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "data/*.txt", 46 | ResourceCollectionReaderBase.PARAM_LANGUAGE, "en"), sc) 47 | val seg = createEngineDescription(classOf[BreakIteratorSegmenter]) 48 | val tokens = rdd.map(process(_, seg)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[Token])) 49 | val counts = tokens.map(token => token.getCoveredText()).filter(filter(_)).map((_,1)).reduceByKey(_ + _).map(pair => (pair._2, pair._1)).sortByKey(false) 50 | counts.take(20).foreach(println(_)) 51 | } 52 | 53 | def filter(input: String): Boolean = !input.forall(_.isDigit) && input.matches("""\w*""") 54 | } 55 | 
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/App.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2013 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.cmu.lti.suim.examples

import scala.collection.JavaConversions.collectionAsScalaIterable

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext.rddToPairRDDFunctions
import org.apache.uima.examples.cpe.FileSystemCollectionReader
import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription
import org.apache.uima.fit.factory.CollectionReaderFactory.createReader
import org.apache.uima.fit.factory.TypeSystemDescriptionFactory
import org.apache.uima.fit.util.JCasUtil
import org.apache.uima.tutorial.RoomNumber
import org.apache.uima.tutorial.ex1.RoomNumberAnnotator

import edu.cmu.lti.suim.SparkUimaUtils.makeRDD
import edu.cmu.lti.suim.SparkUimaUtils.process

/**
 * Example pipeline: reads the documents under data/, runs the UIMA tutorial
 * RoomNumberAnnotator on Spark, and prints how many RoomNumber annotations
 * were found per building.
 */
object App {

  /** Entry point; args(0) is the Spark master URL. */
  def main(args: Array[String]) = {
    val context = new SparkContext(args(0), "App",
      System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":"))

    // Initializes uimaFIT's type-system detection (types.txt on the classpath).
    val ts = TypeSystemDescriptionFactory.createTypeSystemDescription()
    val readerParams = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data")
    // One CAS per document produced by the file-system collection reader.
    val documents = makeRDD(createReader(classOf[FileSystemCollectionReader], readerParams: _*), context)
    val annotator = createEngineDescription(classOf[RoomNumberAnnotator])
    // Annotate each document, then flatten out every RoomNumber annotation.
    val rooms = documents
      .map(process(_, annotator))
      .flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber]))
    // Word-count style aggregation keyed on the building name.
    val perBuilding = rooms
      .map(room => room.getBuilding())
      .map(building => (building, 1))
      .reduceByKey(_ + _)
    perBuilding.foreach(println(_))
  }
}
--------------------------------------------------------------------------------
/suim-examples/src/main/scala/edu/cmu/lti/suim/examples/AppWithHDFS.scala:
--------------------------------------------------------------------------------
/*
 * Copyright 2013 Carnegie Mellon University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15 | */ 16 | 17 | package edu.cmu.lti.suim.examples 18 | 19 | import scala.collection.JavaConversions.collectionAsScalaIterable 20 | 21 | import org.apache.spark.SparkContext 22 | import org.apache.uima.examples.cpe.FileSystemCollectionReader 23 | import org.apache.uima.fit.factory.AnalysisEngineFactory 24 | import org.apache.uima.fit.factory.CollectionReaderFactory 25 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 26 | import org.apache.uima.fit.util.JCasUtil 27 | import org.apache.uima.tutorial.RoomNumber 28 | import org.apache.uima.tutorial.ex1.RoomNumberAnnotator 29 | 30 | import edu.cmu.lti.suim.SparkUimaUtils.process 31 | import edu.cmu.lti.suim.SparkUimaUtils.sequenceFile 32 | 33 | 34 | object AppWithHDFS { 35 | 36 | def main(args: Array[String]) = { 37 | val sc = new SparkContext(args(0), "App", 38 | System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":")) 39 | 40 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 41 | val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data") 42 | val rdd = sequenceFile(CollectionReaderFactory.createCollectionReader(classOf[FileSystemCollectionReader], params: _*), 43 | "hdfs://localhost:9000/file.txt",sc) 44 | val rnum = AnalysisEngineFactory.createEngineDescription(classOf[RoomNumberAnnotator]) 45 | val rooms = rdd.map(process(_, rnum)).flatMap(scas => JCasUtil.select(scas.jcas, classOf[RoomNumber])) 46 | val counts = rooms.map(room => room.getBuilding()).countByValue() 47 | println(counts) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /suim-examples/src/main/scala/edu/cmu/lti/suim/examples/SparkPipelineExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | package edu.cmu.lti.suim.examples 21 | 22 | import java.util.StringTokenizer 23 | 24 | import scala.collection.JavaConversions.bufferAsJavaList 25 | import scala.collection.JavaConversions.collectionAsScalaIterable 26 | import scala.io.Source 27 | 28 | import org.apache.spark.SparkContext 29 | import org.apache.uima.examples.cpe.FileSystemCollectionReader 30 | import org.apache.uima.fit.component.JCasAnnotator_ImplBase 31 | import org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription 32 | import org.apache.uima.fit.factory.CollectionReaderFactory 33 | import org.apache.uima.fit.factory.TypeSystemDescriptionFactory 34 | import org.apache.uima.fit.util.JCasUtil.select 35 | import org.apache.uima.jcas.JCas 36 | import org.apache.uima.tutorial.Meeting 37 | import org.apache.uima.tutorial.UimaAcronym 38 | import org.apache.uima.tutorial.UimaMeeting 39 | 40 | import edu.cmu.lti.suim.SparkUimaUtils.makeRDD 41 | import edu.cmu.lti.suim.SparkUimaUtils.process 42 | 43 | object SparkPipelineExample { 44 | 45 | def readMap(file: String) = { 46 | val s = Source.fromFile(file) 47 | s.getLines.map(line => { 48 | val pair = line.split("\t") 49 | (pair(0), pair(1)) 50 | }).toMap 51 | } 52 | 53 | def main(args: 
Array[String]) = { 54 | val sc = new SparkContext(args(0), "App", 55 | System.getenv("SPARK_HOME"), System.getenv("SPARK_CLASSPATH").split(":")) 56 | 57 | // Share variable 58 | val mMap = sc.broadcast(readMap("src/main/resources/org/apache/uima/tutorial/ex6/uimaAcronyms.txt")) 59 | val typeSystem = TypeSystemDescriptionFactory.createTypeSystemDescription() 60 | val params = Seq(FileSystemCollectionReader.PARAM_INPUTDIR, "data") 61 | val rdd = makeRDD(CollectionReaderFactory.createReader( 62 | classOf[FileSystemCollectionReader], params: _*), sc) 63 | val result = rdd.map(process(_, createEngineDescription( 64 | createEngineDescription(classOf[UimaAcronymAnnotator]), 65 | createEngineDescription(classOf[UimaMeetingAnnotator])))).cache 66 | result.flatMap(scas => select(scas.jcas, classOf[UimaAcronym])).foreach(println(_)) 67 | result.flatMap(scas => select(scas.jcas, classOf[UimaMeeting])).foreach(println(_)) 68 | } 69 | } 70 | 71 | class UimaAcronymAnnotator extends JCasAnnotator_ImplBase { 72 | 73 | val mMap = org.apache.spark.SparkEnv.get.blockManager.getSingle("broadcast_0").get.asInstanceOf[Map[String, String]] 74 | 75 | override def process(jcas: JCas) { 76 | // go through document word-by-word 77 | val text = jcas.getDocumentText(); 78 | var pos = 0; 79 | val tokenizer = new StringTokenizer(text, """ \t\n\r.<.>/?";:[{]}\|=+()!""", true); 80 | while (tokenizer.hasMoreTokens()) { 81 | val token = tokenizer.nextToken(); 82 | // look up token in map to see if it is an acronym 83 | val expandedForm = mMap.get(token); 84 | if (expandedForm.isDefined) { 85 | // create annotation 86 | val annot = new UimaAcronym(jcas, pos, pos + token.length()); 87 | annot.setExpandedForm(expandedForm.get); 88 | annot.addToIndexes(); 89 | } 90 | // incrememnt pos and go to next token 91 | pos += token.length(); 92 | } 93 | } 94 | } 95 | 96 | 97 | class UimaMeetingAnnotator extends JCasAnnotator_ImplBase { 98 | 99 | val mMap = 
org.apache.spark.SparkEnv.get.blockManager.getSingle("broadcast_0").get.asInstanceOf[Map[String, String]] 100 | 101 | override def process(jcas: JCas) { 102 | // get document text 103 | val text = jcas.getDocumentText(); 104 | 105 | // We iterate over all Meeting annotations, and if we determine that 106 | // the topic of a meeting is UIMA-related, we create a UimaMeeting 107 | // annotation. We add each UimaMeeting annotation to a list, and then 108 | // later go back and add these to the CAS indexes. We need to do this 109 | // because it's not allowed to add to an index that you're currently 110 | // iterating over. 111 | val uimaMeetings = scala.collection.mutable.Buffer[UimaMeeting]() 112 | 113 | select(jcas, classOf[Meeting]).foreach(meeting => { 114 | // get span of text within 50 chars on either side of meeting 115 | // (window size should probably be a config. param) 116 | var begin = meeting.getBegin() - 50 117 | var end = meeting.getEnd() + 50 118 | if (begin < 0) { 119 | begin = 0 120 | } 121 | if (end > text.length()) { 122 | end = text.length() 123 | } 124 | val window = text.substring(begin, end) 125 | 126 | // look for UIMA acronyms within this window 127 | val tokenizer = new StringTokenizer(window, """ \t\n\r.<.>/?";:[{]}\|=+()!"""); 128 | var continue = true 129 | while (tokenizer.hasMoreTokens() && continue) { 130 | val token = tokenizer.nextToken(); 131 | // look up token in map to see if it is an acronym 132 | if (mMap.get(token) != null) { 133 | // create annotation 134 | val annot = new UimaMeeting(jcas, meeting.getBegin(), meeting.getEnd()); 135 | annot.setRoom(meeting.getRoom()); 136 | annot.setDate(meeting.getDate()); 137 | annot.setStartTime(meeting.getStartTime()); 138 | annot.setEndTime(meeting.getEndTime()); 139 | // Add annotation to a list, to be later added to the 140 | // indexes. 141 | // We need to do this because it's not allowed to add to an 142 | // index that you're currently iterating over. 
143 | uimaMeetings.add(annot); 144 | continue = false 145 | } 146 | } 147 | }) 148 | uimaMeetings.foreach(meeting => meeting.addToIndexes()) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /suim-examples/src/test/scala/spark-uima-tools/AppSpec.scala: -------------------------------------------------------------------------------- 1 | package cmu.edu.lti.suim 2 | 3 | import org.scalatest.FlatSpec 4 | import org.scalatest.matchers.ShouldMatchers 5 | 6 | class AppSpec extends FlatSpec with ShouldMatchers { 7 | "An App" should "pass" in { 8 | (1) should equal(1) 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /suim-java/pom.xml: -------------------------------------------------------------------------------- 1 | 5 | 4.0.0 6 | 7 | edu.cmu.lti 8 | suim-java 9 | 0.0.1-SNAPSHOT 10 | SUIM Java 11 | 12 | 13 | 14 | 15 | 16 | org.apache.maven.plugins 17 | maven-compiler-plugin 18 | 19 | 1.6 20 | 1.6 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | edu.cmu.lti 30 | suim-scala 31 | 0.0.1-SNAPSHOT 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /suim-java/src/main/java/edu/cmu/lti/suim/JavaSparkUima.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package edu.cmu.lti.suim; 18 | 19 | import java.util.List; 20 | 21 | import org.apache.hadoop.io.NullWritable; 22 | import org.apache.spark.api.java.JavaRDD; 23 | import org.apache.spark.api.java.JavaSparkContext; 24 | import org.apache.spark.api.java.function.Function; 25 | import org.apache.uima.analysis_engine.AnalysisEngineDescription; 26 | import org.apache.uima.collection.CollectionReader; 27 | import org.apache.uima.fit.factory.AnalysisEngineFactory; 28 | import org.apache.uima.resource.ResourceInitializationException; 29 | 30 | public final class JavaSparkUima { 31 | 32 | public static JavaRDD sequenceFile(CollectionReader reader, String uri, JavaSparkContext sc) throws Exception { 33 | SparkUimaUtils.createSequenceFile(reader, uri); 34 | return sc.sequenceFile(uri, NullWritable.class, SCAS.class).values(); 35 | } 36 | 37 | public static JavaRDD makeRDD(CollectionReader reader, JavaSparkContext sc) throws Exception { 38 | List buffer = SparkUimaUtils.readFrom(reader); 39 | return sc.parallelize(buffer); 40 | } 41 | 42 | public final static class PipelineFunction extends Function { 43 | 44 | private static final long serialVersionUID = -6881223764488277676L; 45 | 46 | private final AnalysisEngineDescription description; 47 | 48 | public PipelineFunction(AnalysisEngineDescription... 
descs) throws ResourceInitializationException { 49 | this.description = AnalysisEngineFactory.createEngineDescription(descs); 50 | } 51 | 52 | public PipelineFunction(AnalysisEngineDescription desc) { 53 | this.description = desc; 54 | } 55 | 56 | public SCAS call(SCAS scas) { 57 | return SparkUimaUtils.process(scas, description); 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /suim-scala/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | edu.cmu.lti 5 | suim-scala 6 | 0.0.1-SNAPSHOT 7 | SUIM Scala 8 | 9 | 10 | 2.9.3 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | org.apache.uima 20 | uimaj-examples 21 | 2.4.2 22 | 23 | 24 | org.apache.uima 25 | uimaj-core 26 | 2.4.2 27 | 28 | 29 | org.apache.uima 30 | uimafit-core 31 | 2.0.0 32 | 33 | 34 | org.apache.spark 35 | spark-core_2.9.3 36 | 0.8.0-incubating 37 | 38 | 39 | org.scala-lang 40 | scala-library 41 | ${scala.version} 42 | 43 | 44 | 45 | org.scalatest 46 | scalatest_2.9.2 47 | 1.7.2 48 | test 49 | 50 | 51 | 52 | org.scalamock 53 | scalamock-scalatest-support_2.9.2 54 | 2.4 55 | test 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | net.alchim31.maven 67 | scala-maven-plugin 68 | 3.1.0 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | src/main/scala 78 | src/test/scala 79 | 80 | 81 | 82 | 83 | 84 | org.apache.maven.plugins 85 | maven-compiler-plugin 86 | 87 | 1.6 88 | 1.6 89 | 90 | 91 | 92 | 93 | net.alchim31.maven 94 | scala-maven-plugin 95 | 3.1.0 96 | 97 | incremental 98 | 99 | -unchecked 100 | -deprecation 101 | -explaintypes 102 | 103 | 104 | 105 | main 106 | spark-uima-tools.App 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | compile 115 | testCompile 116 | 117 | 118 | 119 | -make:transitive 120 | -dependencyfile 121 | ${project.build.directory}/.scala_dependencies 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | org.apache.maven.plugins 130 | maven-surefire-plugin 131 | 2.12 132 | 133 | true 
134 | false 135 | -Xmx1024m 136 | 137 | **/*Spec.scala 138 | 139 | 140 | **/*Test.scala 141 | 142 | 143 | 144 | 145 | 146 | org.scalatest 147 | scalatest-maven-plugin 148 | 1.0-M2 149 | 150 | ${project.build.directory}/surefire-reports 151 | . 152 | WDF TestSuite.txt 153 | 154 | 155 | 156 | test 157 | 158 | test 159 | 160 | 161 | 162 | 163 | 164 | 165 | org.apache.maven.plugins 166 | maven-source-plugin 167 | 2.1.2 168 | 169 | 170 | attach-sources 171 | 172 | jar 173 | 174 | 175 | 176 | 177 | 178 | 179 | org.apache.maven.plugins 180 | maven-resources-plugin 181 | 2.5 182 | 183 | UTF-8 184 | 185 | 186 | 187 | 188 | org.apache.maven.plugins 189 | maven-release-plugin 190 | 2.3.2 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | org.apache.maven.wagon 200 | wagon-ssh-external 201 | 1.0-beta-7 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | org.eclipse.m2e 211 | lifecycle-mapping 212 | 1.0.0 213 | 214 | 215 | 216 | 217 | 218 | 219 | net.alchim31.maven 220 | 221 | 222 | scala-maven-plugin 223 | 224 | 225 | [3.1.0,) 226 | 227 | 228 | compile 229 | testCompile 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /suim-scala/src/main/scala/edu/cmu/lti/suim/SCAS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package edu.cmu.lti.suim 18 | 19 | import java.io.ByteArrayInputStream 20 | import java.io.ByteArrayOutputStream 21 | import java.io.DataInput 22 | import java.io.DataOutput 23 | import java.io.Externalizable 24 | import java.io.ObjectInput 25 | import java.io.ObjectOutput 26 | 27 | import org.apache.hadoop.io.Writable 28 | import org.apache.uima.cas.CAS 29 | import org.apache.uima.cas.impl.Serialization 30 | import org.apache.uima.fit.factory.JCasFactory 31 | 32 | object SCAS { 33 | 34 | def read(in: DataInput) = { 35 | val scas = new SCAS(); 36 | scas.readFields(in); 37 | scas 38 | } 39 | } 40 | 41 | class SCAS(val cas: CAS) extends Externalizable with Writable { 42 | 43 | def this() { 44 | this(JCasFactory.createJCas().getCas()) 45 | } 46 | 47 | override def readExternal(in: ObjectInput) { 48 | readFields(in) 49 | } 50 | 51 | override def writeExternal(out: ObjectOutput) { 52 | write(out) 53 | } 54 | 55 | def jcas = cas.getJCas() 56 | 57 | override def write(out: DataOutput) { 58 | val baos = new ByteArrayOutputStream(); 59 | Serialization.serializeWithCompression(cas, baos) 60 | out.writeInt(baos.size) 61 | out.write(baos.toByteArray) 62 | } 63 | 64 | override def readFields(in: DataInput) { 65 | val size = in.readInt(); 66 | val bytes = new Array[Byte](size) 67 | in.readFully(bytes); 68 | val bais = new ByteArrayInputStream(bytes) 69 | Serialization.deserializeCAS(cas, bais); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /suim-scala/src/main/scala/edu/cmu/lti/suim/SparkUimaUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2013 Carnegie Mellon University 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package edu.cmu.lti.suim 18 | 19 | import java.net.URI 20 | 21 | import scala.collection.JavaConversions.asScalaBuffer 22 | import scala.collection.JavaConversions.bufferAsJavaList 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.fs.FileSystem 26 | import org.apache.hadoop.fs.Path 27 | import org.apache.hadoop.io.IOUtils 28 | import org.apache.hadoop.io.NullWritable 29 | import org.apache.hadoop.io.SequenceFile 30 | import org.apache.spark.SparkContext 31 | import org.apache.spark.SparkContext.rddToPairRDDFunctions 32 | import org.apache.spark.SparkContext.writableWritableConverter 33 | import org.apache.uima.analysis_engine.AnalysisEngineDescription 34 | import org.apache.uima.collection.CollectionReader 35 | import org.apache.uima.fit.factory.AnalysisEngineFactory 36 | import org.apache.uima.fit.factory.JCasFactory 37 | 38 | object SparkUimaUtils { 39 | 40 | def createSequenceFile(reader: CollectionReader, uri: String) { 41 | val conf = new Configuration() 42 | val fs = FileSystem.get(URI.create(uri), conf) 43 | val path = new Path(uri) 44 | val nw = NullWritable.get 45 | val writer = SequenceFile.createWriter(fs, conf, path, nw.getClass(), classOf[SCAS]) 46 | while (reader.hasNext()) { 47 | val jcas = JCasFactory.createJCas(); 48 | val cas = jcas.getCas() 49 | reader.getNext(cas) 50 | val scas = new SCAS(cas) 51 | writer.append(nw, scas) 52 | } 53 | IOUtils.closeStream(writer) 54 | } 55 | 56 | def sequenceFile(reader: CollectionReader, uri: String, sc: SparkContext) = { 
57 | createSequenceFile(reader, uri) 58 | sc.sequenceFile[NullWritable, SCAS](uri).values 59 | } 60 | 61 | def readFrom(reader: CollectionReader): java.util.List[SCAS] = { 62 | val buffer = collection.mutable.ArrayBuffer[SCAS]() 63 | while (reader.hasNext()) { 64 | val jcas = JCasFactory.createJCas(); 65 | val cas = jcas.getCas() 66 | reader.getNext(cas) 67 | buffer += new SCAS(cas) 68 | } 69 | buffer 70 | } 71 | 72 | def makeRDD(reader: CollectionReader, sc: SparkContext) = { 73 | val buffer = readFrom(reader) 74 | sc.parallelize(buffer) 75 | } 76 | 77 | def process(scas: SCAS, description: AnalysisEngineDescription) = { 78 | val ae = AnalysisEngineFactory.createEngine(description) 79 | ae.process(scas.jcas) 80 | scas 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /suim-scala/src/test/scala/spark-uima-tools/AppSpec.scala: -------------------------------------------------------------------------------- 1 | package cmu.edu.lti.suim 2 | 3 | import org.scalatest.FlatSpec 4 | import org.scalatest.matchers.ShouldMatchers 5 | 6 | class AppSpec extends FlatSpec with ShouldMatchers { 7 | "An App" should "pass" in { 8 | (1) should equal(1) 9 | } 10 | } 11 | --------------------------------------------------------------------------------