├── .gitignore ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── src ├── main ├── resources │ ├── en-ingredients-finder.bin │ └── trainingdata.txt └── scala │ └── io │ └── github │ └── robhinds │ └── ner │ └── NerModelTrainer.scala └── test └── scala └── io └── github └── robhinds └── ner └── NerModelTrainerSpec.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .cache-main 2 | .classpath 3 | .gradle 4 | .idea 5 | .project 6 | .settings 7 | bin/ 8 | build/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenNLP Ingredient Finder 2 | 3 | A sample project that demonstrates how to train an OpenNLP Named-Entity-Recognition model for a particular entity. 4 | 5 | This example uses training data created based on the BBC Food recipe archive to try to identify ingredients. 6 | 7 | ## How to run 8 | The sample training data and a pre-trained model are in ```src/main/resources```. There is a unit test that shows how to load the model and use it to extract named entities from an input text.
9 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | group = "io.github.robhinds" 2 | 3 | task wrapper(type: Wrapper) { 4 | gradleVersion = "2.11" 5 | } 6 | 7 | buildscript { 8 | repositories { 9 | mavenCentral() 10 | } 11 | dependencies { 12 | classpath "com.github.maiflai:gradle-scalatest:0.6-5-g9065d91" 13 | } 14 | } 15 | 16 | repositories { 17 | mavenCentral() 18 | maven { url "https://repository.apache.org/content/repositories/snapshots" } 19 | } 20 | 21 | apply { 22 | plugin "scala" 23 | plugin "com.github.maiflai.scalatest" 24 | } 25 | 26 | dependencies { 27 | compile "org.scala-lang:scala-library:2.11.8" 28 | compile "org.apache.opennlp:opennlp-tools:1.6.1-SNAPSHOT" 29 | testCompile 'org.scalatest:scalatest_2.11:3.0.0' 30 | testRuntime 'org.pegdown:pegdown:1.6.0' 31 | } -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robhinds/opennlp-ingredient-finder/2efabd8521e5808735a5de8fb90d981326dc728b/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sat Oct 29 22:22:42 BST 2016 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.11-bin.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 
############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 10 | DEFAULT_JVM_OPTS="" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 16 | MAX_FD="maximum" 17 | 18 | warn ( ) { 19 | echo "$*" 20 | } 21 | 22 | die ( ) { 23 | echo 24 | echo "$*" 25 | echo 26 | exit 1 27 | } 28 | 29 | # OS specific support (must be 'true' or 'false'). 30 | cygwin=false 31 | msys=false 32 | darwin=false 33 | case "`uname`" in 34 | CYGWIN* ) 35 | cygwin=true 36 | ;; 37 | Darwin* ) 38 | darwin=true 39 | ;; 40 | MINGW* ) 41 | msys=true 42 | ;; 43 | esac 44 | 45 | # Attempt to set APP_HOME 46 | # Resolve links: $0 may be a link 47 | PRG="$0" 48 | # Need this for relative symlinks. 49 | while [ -h "$PRG" ] ; do 50 | ls=`ls -ld "$PRG"` 51 | link=`expr "$ls" : '.*-> \(.*\)$'` 52 | if expr "$link" : '/.*' > /dev/null; then 53 | PRG="$link" 54 | else 55 | PRG=`dirname "$PRG"`"/$link" 56 | fi 57 | done 58 | SAVED="`pwd`" 59 | cd "`dirname \"$PRG\"`/" >/dev/null 60 | APP_HOME="`pwd -P`" 61 | cd "$SAVED" >/dev/null 62 | 63 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 64 | 65 | # Determine the Java command to use to start the JVM. 66 | if [ -n "$JAVA_HOME" ] ; then 67 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 68 | # IBM's JDK on AIX uses strange locations for the executables 69 | JAVACMD="$JAVA_HOME/jre/sh/java" 70 | else 71 | JAVACMD="$JAVA_HOME/bin/java" 72 | fi 73 | if [ ! -x "$JAVACMD" ] ; then 74 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 75 | 76 | Please set the JAVA_HOME variable in your environment to match the 77 | location of your Java installation." 
78 | fi 79 | else 80 | JAVACMD="java" 81 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 82 | 83 | Please set the JAVA_HOME variable in your environment to match the 84 | location of your Java installation." 85 | fi 86 | 87 | # Increase the maximum file descriptors if we can. 88 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then 89 | MAX_FD_LIMIT=`ulimit -H -n` 90 | if [ $? -eq 0 ] ; then 91 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 92 | MAX_FD="$MAX_FD_LIMIT" 93 | fi 94 | ulimit -n $MAX_FD 95 | if [ $? -ne 0 ] ; then 96 | warn "Could not set maximum file descriptor limit: $MAX_FD" 97 | fi 98 | else 99 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 100 | fi 101 | fi 102 | 103 | # For Darwin, add options to specify how the application appears in the dock 104 | if $darwin; then 105 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 106 | fi 107 | 108 | # For Cygwin, switch paths to Windows format before running java 109 | if $cygwin ; then 110 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 111 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 112 | JAVACMD=`cygpath --unix "$JAVACMD"` 113 | 114 | # We build the pattern for arguments to be converted via cygpath 115 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 116 | SEP="" 117 | for dir in $ROOTDIRSRAW ; do 118 | ROOTDIRS="$ROOTDIRS$SEP$dir" 119 | SEP="|" 120 | done 121 | OURCYGPATTERN="(^($ROOTDIRS))" 122 | # Add a user-defined pattern to the cygpath arguments 123 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 124 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 125 | fi 126 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 127 | i=0 128 | for arg in "$@" ; do 129 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 130 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 131 | 132 | if [ $CHECK -ne 0 ] 
&& [ $CHECK2 -eq 0 ] ; then ### Added a condition 133 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 134 | else 135 | eval `echo args$i`="\"$arg\"" 136 | fi 137 | i=$((i+1)) 138 | done 139 | case $i in 140 | (0) set -- ;; 141 | (1) set -- "$args0" ;; 142 | (2) set -- "$args0" "$args1" ;; 143 | (3) set -- "$args0" "$args1" "$args2" ;; 144 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 145 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 146 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 147 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 148 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 149 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 150 | esac 151 | fi 152 | 153 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 154 | function splitJvmOpts() { 155 | JVM_OPTS=("$@") 156 | } 157 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 158 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 159 | 160 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 161 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 
56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /src/main/resources/en-ingredients-finder.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robhinds/opennlp-ingredient-finder/2efabd8521e5808735a5de8fb90d981326dc728b/src/main/resources/en-ingredients-finder.bin -------------------------------------------------------------------------------- /src/main/scala/io/github/robhinds/ner/NerModelTrainer.scala: -------------------------------------------------------------------------------- 1 | package io.github.robhinds.ner 2 | 3 | import java.io.{BufferedOutputStream, FileInputStream, FileOutputStream} 4 | import java.nio.charset.Charset 5 | 6 | import opennlp.tools.ml.maxent.quasinewton.QNTrainer 7 | import opennlp.tools.ml.perceptron.PerceptronTrainer 8 | import opennlp.tools.namefind.{NameFinderME, NameSampleDataStream, TokenNameFinderFactory, TokenNameFinderModel} 9 | import 
opennlp.tools.util.{ObjectStream, PlainTextByLineStream, TrainingParameters}

/**
 * Trains an OpenNLP name-finder model for the "food" entity type from an
 * annotated text file and serializes the trained model to disk.
 */
object NerModelTrainer {

  /** Training data read when [[trainModel]] is called without arguments. */
  val DefaultTrainingDataPath: String = "src/main/resources/trainingdata.txt"

  /** Model file written when [[trainModel]] is called without arguments. */
  val DefaultModelPath: String = "src/main/resources/en-ingredients-finder.bin"

  /** The most recently trained model; `null` until a [[trainModel]] call has finished training. */
  var model: TokenNameFinderModel = _

  /**
   * Output stream used by the latest [[trainModel]] call. Retained only for
   * backward compatibility: it is already closed when that call returns.
   */
  var modelOut: BufferedOutputStream = _

  /**
   * Trains a maxent quasi-Newton name-finder model and writes it to `modelPath`.
   *
   * The trained model is also stored in [[model]].
   *
   * @param trainingDataPath UTF-8 text file with the annotated training sentences
   * @param modelPath        file the serialized model is written to (overwritten if present)
   * @throws java.io.IOException if the training data cannot be read or the model cannot be written
   */
  def trainModel(
      trainingDataPath: String = DefaultTrainingDataPath,
      modelPath: String = DefaultModelPath): Unit = {

    val trainingIn = new FileInputStream(trainingDataPath)
    val trained =
      try {
        val lines: ObjectStream[String] =
          new PlainTextByLineStream(trainingIn, Charset.forName("UTF-8"))
        val samples = new NameSampleDataStream(lines)
        try {
          val params = TrainingParameters.defaultParams()
          params.put(TrainingParameters.ALGORITHM_PARAM, QNTrainer.MAXENT_QN_VALUE)

          NameFinderME.train("en", "food", samples, params, new TokenNameFinderFactory())
        } finally {
          samples.close()
        }
      } finally {
        // Also covers the case where wrapping the raw stream fails before
        // `samples` exists; closing an already-closed stream is a no-op.
        trainingIn.close()
      }
    model = trained

    // Clean up through a local handle so a stale stream left in `modelOut`
    // by an earlier call is never the one being closed.
    val out = new BufferedOutputStream(new FileOutputStream(modelPath))
    modelOut = out
    try {
      trained.serialize(out)
    } finally {
      out.close()
    }
  }
}
--------------------------------------------------------------------------------
/src/test/scala/io/github/robhinds/ner/NerModelTrainerSpec.scala:
--------------------------------------------------------------------------------
package io.github.robhinds.ner

import java.io.{FileInputStream, IOException}

import opennlp.tools.namefind.{NameFinderME, TokenNameFinderModel}
import org.scalatest.{FunSpec, Matchers}

/**
 * End-to-end checks for [[NerModelTrainer]]: trains the model from the bundled
 * training data, then loads the serialized model and runs it over a sample recipe.
 */
class NerModelTrainerSpec extends FunSpec with Matchers {

  describe("training a model") {
    it("should create a trained model file") {
      NerModelTrainer.trainModel()

      NerModelTrainer.model should not be (null)
      val modelFile = new java.io.File("src/main/resources/en-ingredients-finder.bin")
      modelFile.exists shouldBe true
      modelFile.length should be > 0L
    }
  }

  val recipe = "Make the shortbread - cream the butter and sugar, mix in the flour, and then press into the bottom of a lined baking tray\n\nFor the caramel, gently melt all ingredients except rosemary in a pan, once melted add the rosemary and stir through, heat for two to three minutes\n\nLeave the caramel to stand for another 10 minutes or so, whilst the shortbread cools\n\nPoor the caramel through a sieve to remove the rosemary pieces, pour caramel onto the cooled shortbread\n\nPlace the caramel topped shortbread in the fridge to cool for another 20 minutes\n\nMelt the chocolate in the microwave on a low power settings (will take a couple of minutes), once it is smooth, pour on top of the caramel shortbread and put back in the fridge to set."

  describe("running trained model") {
    it("should extract ingredients") {
      val modelIn = new FileInputStream("src/main/resources/en-ingredients-finder.bin")

      try {
        // Split on runs of non-letters and drop empties: a single-character
        // split turns ", ", " - " and "\n\n" into empty-string tokens that
        // would be handed to the name finder as if they were words.
        val sampleRecipe = recipe.split("[^a-zA-Z]+").filter(_.nonEmpty)
        val model = new TokenNameFinderModel(modelIn)
        val nameFinder = new NameFinderME(model)
        val matches = nameFinder.find(sampleRecipe)
        // NOTE(review): the detected entities depend on the trained model, so
        // they are printed for inspection rather than asserted.
        matches.foreach { m =>
          sampleRecipe.slice(m.getStart, m.getEnd).foreach(println(_))
        }
      } finally {
        modelIn.close()
      }
    }
  }

}

--------------------------------------------------------------------------------