├── .gitignore
├── NLP-on-Spark.tar.gz
├── README.md
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── nlp
    │           └── spark
    │               └── annotate
    │                   ├── Annotation.scala
    │                   ├── Annotator.scala
    │                   └── Document.scala
    └── test
        └── scala
            └── nlp
                └── spark
                    └── annotate
                        └── AnnotationTest.scala

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Eclipse template
*.pydevproject
.metadata
.gradle
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific
.buildpath

# sbteclipse plugin
.target

# TeXlipse plugin
.texlipse
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties

## Python
*.pyc

## Checkstyle results
scalastyle-output.xml
javastyle-output.xml

## Minicluster folders
minicluster_*
--------------------------------------------------------------------------------
/NLP-on-Spark.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexander-n-thomas/nlp.spark.annotate/d3af0c7fe44272c2478aa6c8871944c84aea6514/NLP-on-Spark.tar.gz
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# nlp.spark.annotate

This is the project used in "Building Pipelines for Natural Language Understanding with Spark".

This is a simple Maven project.
Run `mvn clean install` to build it.
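
As an illustrative sketch (not part of the project), a minimal annotator built on the
`Annotator` trait might look like this; `WhitespaceTokenizer` is a hypothetical name
used only for this example:

```scala
import nlp.spark.annotate.{Annotation, Annotator, Document}

import scala.collection.immutable.TreeSet

// Hypothetical example: a whitespace tokenizer built on the Annotator trait.
object WhitespaceTokenizer extends Annotator {
  override def annotate(
      text: String,
      metadata: Map[String, String],
      annotations: TreeSet[Annotation]): Seq[Annotation] =
    // Emit a "token" annotation for each run of non-whitespace characters.
    """\S+""".r.findAllMatchIn(text)
      .map(m => Annotation("token", m.start, m.end))
      .toSeq
}

val doc = Document("example", "Hello Spark NLP")
// apply() copies the document, merging the new annotations into its TreeSet
val annotated = WhitespaceTokenizer(doc)
// annotated.annotations contains token annotations at [0, 5), [6, 11), [12, 15)
```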
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>nlp.spark</groupId>
  <artifactId>annotate</artifactId>
  <version>1.0</version>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- four more version properties appeared here (values 2.4, 2.10, 3.2.2, 2.3);
         their tag names are not recoverable from this dump -->

    <scala.tools.version>2.11</scala.tools.version>
    <scala.version>2.11.8</scala.version>
    <scalatest.version>3.0.0</scalatest.version>

    <log4j.version>1.2.17</log4j.version>
    <junit.version>4.11</junit.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.tools.version}</artifactId>
      <version>${scalatest.version}</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <artifactId>scala-library</artifactId>
          <groupId>org.scala-lang</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>${junit.version}</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>${log4j.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.2</version>
        <executions>
          <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
              <goal>add-source</goal>
              <goal>compile</goal>
            </goals>
          </execution>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
        <configuration>
          <jvmArgs>
            <jvmArg>-Xmx4G</jvmArg>
          </jvmArgs>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------
/src/main/scala/nlp/spark/annotate/Annotation.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

/**
 * This class represents an annotation in text
 * @param aType the type of the annotation (e.g. token, sentence, etc.)
 * @param begin the index of the first character of the annotation
 * @param end the index after the last character of the annotation
 * @param metadata the metadata for the annotation (e.g. lemma, sentiment, etc.)
 */
case class Annotation(aType: String, begin: Int, end: Int, metadata: Map[String, String] = Map())

/**
 * Companion object for the Annotation type
 */
object Annotation {
  /**
   * Annotation ordering: by begin ascending, then by end descending (so longer
   * annotations sort before shorter ones at the same offset), then by type name
   * @tparam A Some annotation subtype (for our examples we will only use Annotation)
   * @return The [[Ordering]] of annotations
   */
  implicit def orderingByName[A <: Annotation]: Ordering[A] = {
    Ordering.by(a => (a.begin, -a.end, a.aType))
  }
}
--------------------------------------------------------------------------------
/src/main/scala/nlp/spark/annotate/Annotator.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

import scala.collection.immutable.TreeSet

/**
 * This trait represents a function that produces annotations from a given text or document
 */
trait Annotator {
  /**
   * This takes a text, optional metadata, and optional annotations and returns a sequence of new annotations
   * @param text the text to be annotated
   * @param metadata the metadata associated with the text
   * @param annotations the annotations already found in the text
   * @return a sequence of new annotations found in the text
   */
  def annotate(
    text: String,
    metadata: Map[String, String] = Map(),
    annotations: TreeSet[Annotation] = TreeSet()): Seq[Annotation]

  /**
   * This takes a document and returns a sequence of new annotations
   * @param document the document to be annotated
   * @return a sequence of new annotations found in the text
   */
  def annotate(document: Document): Seq[Annotation] =
    annotate(document.text, document.metadata, document.annotations)

  /**
   * This takes a document and returns a new document with new annotations (if any new ones were found)
   * @param document the document to be annotated
   * @return a new document with new annotations
   */
  def apply(document: Document): Document = document.copy(annotations = document.annotations ++ annotate(document))
}
--------------------------------------------------------------------------------
/src/main/scala/nlp/spark/annotate/Document.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

import scala.collection.immutable.TreeSet

/**
 * This class represents a document with its text, metadata, and annotations
 * @param docName the name of the document
 * @param text the contents of the document
 * @param metadata the metadata associated with the document
 * @param annotations the annotations found in the document
 */
case class Document(
  docName: String,
  text: String,
  metadata: Map[String, String] = Map(),
  annotations: TreeSet[Annotation] = TreeSet())
--------------------------------------------------------------------------------
/src/test/scala/nlp/spark/annotate/AnnotationTest.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

import org.junit.runner.RunWith
import org.scalatest.FunSuite
import org.scalatest.junit.JUnitRunner

import scala.collection.immutable.TreeSet

@RunWith(classOf[JUnitRunner])
class AnnotationTest extends FunSuite {
  test("ordering") {
    val ordering = implicitly[Ordering[Annotation]]
    assert(ordering.equiv(Annotation("a", 0, 10), Annotation("a", 0, 10)))
    assert(ordering.lt(Annotation("a", 0, 10), Annotation("a", 1, 10)))
    assert(ordering.lt(Annotation("a", 0, 11), Annotation("a", 0, 10)))
    assert(ordering.lt(Annotation("a", 0, 10), Annotation("b", 0, 10)))
  }

  test("get covering") {
    // range(from, until) with equal bounds selects nothing, so this prints an empty TreeSet
    println(TreeSet(Annotation("a", 0, 10), Annotation("a", 1, 10)).range(Annotation("", 0, 10), Annotation("", 0, 10)))
  }
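
  // Illustrative addition, not in the original project: because annotations are
  // ordered by (begin, -end, aType), TreeSet.range supports span queries. An empty
  // aType paired with a maximal end acts as a lower sentinel for a given begin,
  // so the bounds below select every annotation whose begin falls in [0, 2).
  test("range over begin offsets (illustrative)") {
    val annotations = TreeSet(Annotation("a", 0, 10), Annotation("a", 1, 10), Annotation("a", 5, 10))
    val begun = annotations.range(Annotation("", 0, Int.MaxValue), Annotation("", 2, Int.MaxValue))
    assert(begun == TreeSet(Annotation("a", 0, 10), Annotation("a", 1, 10)))
  }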
}
--------------------------------------------------------------------------------