├── .gitignore
├── NLP-on-Spark.tar.gz
├── README.md
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── nlp
    │           └── spark
    │               └── annotate
    │                   ├── Annotation.scala
    │                   ├── Annotator.scala
    │                   └── Document.scala
    └── test
        └── scala
            └── nlp
                └── spark
                    └── annotate
                        └── AnnotationTest.scala

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Eclipse template
*.pydevproject
.metadata
.gradle
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific
.buildpath

# sbteclipse plugin
.target

# TeXlipse plugin
.texlipse
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties

## Python
*.pyc

## Checkstyle results
scalastyle-output.xml
javastyle-output.xml

## Minicluster folders
minicluster_*
--------------------------------------------------------------------------------
/NLP-on-Spark.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexander-n-thomas/nlp.spark.annotate/d3af0c7fe44272c2478aa6c8871944c84aea6514/NLP-on-Spark.tar.gz
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# nlp.spark.annotate

This is the project used in "Building Pipelines for Natural Language Understanding with Spark".

This is a simple Maven project.
Run `mvn clean install` to build it.
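
As an illustrative sketch (not part of the project), a minimal annotator built on the
`Annotator` trait might look like this; `WhitespaceTokenizer` is a hypothetical name
used only for this example:

```scala
import nlp.spark.annotate.{Annotation, Annotator, Document}

import scala.collection.immutable.TreeSet

// Hypothetical example: a whitespace tokenizer built on the Annotator trait.
object WhitespaceTokenizer extends Annotator {
  override def annotate(
      text: String,
      metadata: Map[String, String],
      annotations: TreeSet[Annotation]): Seq[Annotation] =
    // Emit a "token" annotation for each run of non-whitespace characters.
    """\S+""".r.findAllMatchIn(text)
      .map(m => Annotation("token", m.start, m.end))
      .toSeq
}

val doc = Document("example", "Hello Spark NLP")
// apply() copies the document, merging the new annotations into its TreeSet
val annotated = WhitespaceTokenizer(doc)
// annotated.annotations contains token annotations at [0, 5), [6, 11), [12, 15)
```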
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>nlp.spark</groupId>
  <artifactId>annotate</artifactId>
  <version>1.0</version>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- four more version properties appeared here (values 2.4, 2.10, 3.2.2, 2.3);
         their tag names are not recoverable from this dump -->

    <scala.tools.version>2.11</scala.tools.version>
    <scala.version>2.11.8</scala.version>
    <scalatest.version>3.0.0</scalatest.version>

    <log4j.version>1.2.17</log4j.version>
    <junit.version>4.11</junit.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest_${scala.tools.version}</artifactId>
      <version>${scalatest.version}</version>
      <scope>test</scope>
      <exclusions>
        <exclusion>
          <artifactId>scala-library</artifactId>
          <groupId>org.scala-lang</groupId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>${junit.version}</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>${log4j.version}</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.2.2</version>
        <executions>
          <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
              <goal>add-source</goal>
              <goal>compile</goal>
            </goals>
          </execution>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
        <configuration>
          <jvmArgs>
            <jvmArg>-Xmx4G</jvmArg>
          </jvmArgs>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------
/src/main/scala/nlp/spark/annotate/Annotation.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

/**
 * This class represents an annotation in text
 * @param aType the type of the annotation (e.g. token, sentence, etc.)
 * @param begin the index of the first character of the annotation
 * @param end the index after the last character of the annotation
 * @param metadata the metadata for the annotation (e.g. lemma, sentiment, etc.)
 */
case class Annotation(aType: String, begin: Int, end: Int, metadata: Map[String, String] = Map())

/**
 * Companion object for the Annotation type
 */
object Annotation {
  /**
   * Annotation ordering: by begin ascending, then by end descending (so longer
   * annotations sort before shorter ones at the same offset), then by type name
   * @tparam A Some annotation subtype (for our examples we will only use Annotation)
   * @return The [[Ordering]] of annotations
   */
  implicit def orderingByName[A <: Annotation]: Ordering[A] = {
    Ordering.by(a => (a.begin, -a.end, a.aType))
  }
}
--------------------------------------------------------------------------------
/src/main/scala/nlp/spark/annotate/Annotator.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

import scala.collection.immutable.TreeSet

/**
 * This trait represents a function that produces annotations from a given text or document
 */
trait Annotator {
  /**
   * This takes a text, optional metadata, and optional annotations and returns a sequence of new annotations
   * @param text the text to be annotated
   * @param metadata the metadata associated with the text
   * @param annotations the annotations already found in the text
   * @return a sequence of new annotations found in the text
   */
  def annotate(
    text: String,
    metadata: Map[String, String] = Map(),
    annotations: TreeSet[Annotation] = TreeSet()): Seq[Annotation]

  /**
   * This takes a document and returns a sequence of new annotations
   * @param document the document to be annotated
   * @return a sequence of new annotations found in the text
   */
  def annotate(document: Document): Seq[Annotation] =
    annotate(document.text, document.metadata, document.annotations)

  /**
   * This takes a document and returns a new document with new annotations (if any new ones were found)
   * @param document the document to be annotated
   * @return a new document with new annotations
   */
  def apply(document: Document): Document = document.copy(annotations = document.annotations ++ annotate(document))
}
--------------------------------------------------------------------------------
/src/main/scala/nlp/spark/annotate/Document.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

import scala.collection.immutable.TreeSet

/**
 * This class represents a document with its text, metadata, and annotations
 * @param docName the name of the document
 * @param text the contents of the document
 * @param metadata the metadata associated with the document
 * @param annotations the annotations found in the document
 */
case class Document(
  docName: String,
  text: String,
  metadata: Map[String, String] = Map(),
  annotations: TreeSet[Annotation] = TreeSet())
--------------------------------------------------------------------------------
/src/test/scala/nlp/spark/annotate/AnnotationTest.scala:
--------------------------------------------------------------------------------
package nlp.spark.annotate

import org.junit.runner.RunWith
import org.scalatest.FunSuite
import org.scalatest.junit.JUnitRunner

import scala.collection.immutable.TreeSet

@RunWith(classOf[JUnitRunner])
class AnnotationTest extends FunSuite {
  test("ordering") {
    val ordering = implicitly[Ordering[Annotation]]
    assert(ordering.equiv(Annotation("a", 0, 10), Annotation("a", 0, 10)))
    assert(ordering.lt(Annotation("a", 0, 10), Annotation("a", 1, 10)))
    assert(ordering.lt(Annotation("a", 0, 11), Annotation("a", 0, 10)))
    assert(ordering.lt(Annotation("a", 0, 10), Annotation("b", 0, 10)))
  }

  test("get covering") {
    // range(from, until) with equal bounds selects nothing, so this prints an empty TreeSet
    println(TreeSet(Annotation("a", 0, 10), Annotation("a", 1, 10)).range(Annotation("", 0, 10), Annotation("", 0, 10)))
  }
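
  // Illustrative addition, not in the original project: because annotations are
  // ordered by (begin, -end, aType), TreeSet.range supports span queries. An empty
  // aType paired with a maximal end acts as a lower sentinel for a given begin,
  // so the bounds below select every annotation whose begin falls in [0, 2).
  test("range over begin offsets (illustrative)") {
    val annotations = TreeSet(Annotation("a", 0, 10), Annotation("a", 1, 10), Annotation("a", 5, 10))
    val begun = annotations.range(Annotation("", 0, Int.MaxValue), Annotation("", 2, Int.MaxValue))
    assert(begun == TreeSet(Annotation("a", 0, 10), Annotation("a", 1, 10)))
  }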
}
--------------------------------------------------------------------------------