├── .gitignore ├── .travis.yml ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── lib ├── utterance_parser.rb └── utterance_parser │ ├── example.rb │ ├── parser.rb │ ├── pattern.txt │ ├── pos_tagger.rb │ ├── utils.rb │ ├── utterance.rb │ └── version.rb ├── test ├── example_test.rb ├── parser_test.rb └── test_helper.rb └── utterance_parser.gemspec /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | .yardoc 6 | Gemfile.lock 7 | InstalledFiles 8 | _yardoc 9 | coverage 10 | doc/ 11 | lib/bundler/man 12 | pkg 13 | rdoc 14 | spec/reports 15 | test/tmp 16 | test/version_tmp 17 | tmp 18 | *.bundle 19 | *.so 20 | *.o 21 | *.a 22 | mkmf.log 23 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: ruby 2 | rvm: 3 | - 2.1.1 4 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | # Specify your gem's dependencies in utterance_parser.gemspec 4 | gemspec 5 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 macournoyer 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | "Software"), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UtteranceParser 2 | 3 | A trainable natural language parser that extracts intent and entities from utterances. 4 | 5 | It uses a [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) to determine intent and [Conditional random fields](https://en.wikipedia.org/wiki/Conditional_random_field) to extract entities. 6 | 7 | For example, it can turn this: 8 | 9 | > Remind me to pick up the kids in two hours 10 | 11 | into ... 12 | 13 | ```ruby 14 | [ 15 | # intent 16 | "reminder", 17 | # entities 18 | {task: "pick up the kids", time: "in two hours"} 19 | ] 20 | ``` 21 | 22 | ## Installation 23 | 24 | Add this line to your application's Gemfile: 25 | 26 | gem 'utterance_parser' 27 | 28 | And then execute: 29 | 30 | $ bundle 31 | 32 | Or install it yourself as: 33 | 34 | $ gem install utterance_parser 35 | 36 | ## Usage 37 | 38 | ```ruby 39 | parser = UtteranceParser.new 40 | 41 | parser.train( 42 | # Utterance => intent 43 | "Hi" => "greeting", 44 | "Hello" => "greeting", 45 | 46 | "What time is it" => "time", 47 | "What's the weather outside" => "weather", 48 | 49 | # Mark entities using XML tags 50 | "Remind me to get stuff done " => "reminder", 51 | "Remind me to buy milk " => "reminder", 52 | "Remind me to pick up the kids " => "reminder", 53 | 54 | "Play some jazz" => "play", 55 | "Play some blues" => "play", 56 | "Play some rap" => "play" 57 | ) 58 | 59 | parser.parse "Hello there!" 60 | # => ["greeting", {}] 61 | 62 | parser.parse "Play some rock" 63 | # => ["play", {playlist: "rock"}] 64 | 65 | parser.parse "Remind me to buy stuff in three hours" 66 | # => ["reminder", {task: "buy stuff", time: "in three hours"}] 67 | ``` 68 | 69 | ## Contributing 70 | 71 | 1. Fork it ( https://github.com/macournoyer/utterance_parser/fork ) 72 | 2. Create your feature branch (`git checkout -b my-new-feature`) 73 | 3. Commit your changes (`git commit -am 'Add some feature'`) 74 | 4. Push to the branch (`git push origin my-new-feature`) 75 | 5. Create a new Pull Request 76 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rake/testtask" 3 | 4 | Rake::TestTask.new(:test) do |t| 5 | t.libs << "test" 6 | t.pattern = 'test/*_test.rb' 7 | end 8 | 9 | task :default => :test 10 | 11 | -------------------------------------------------------------------------------- /lib/utterance_parser.rb: -------------------------------------------------------------------------------- 1 | require_relative 'utterance_parser/version' 2 | require_relative 'utterance_parser/utils' 3 | require_relative 'utterance_parser/utterance' 4 | require_relative 'utterance_parser/example' 5 | require_relative 'utterance_parser/pos_tagger' 6 | require_relative 'utterance_parser/parser' 7 | 8 | module UtteranceParser 9 | def self.new(*args) 10 | Parser.new(*args) 11 | end 12 | end 13 | 14 | -------------------------------------------------------------------------------- /lib/utterance_parser/example.rb: -------------------------------------------------------------------------------- 1 | module UtteranceParser 2 | # An example utterance for training the parser. 3 | # Can be labeled with entities via XML tags: `Play some rap`. 4 | class Example < Utterance 5 | attr_reader :labeled_text, :intent 6 | 7 | def initialize(labeled_text, intent) 8 | super(labeled_text.gsub(TAG_RE, '\2')) 9 | @labeled_text = labeled_text 10 | @intent = intent 11 | end 12 | 13 | def ==(other) 14 | other.class == self.class && other.labeled_text == @labeled_text 15 | end 16 | 17 | def labeled_tokens 18 | labels = tags_with_position(@labeled_text) 19 | 20 | tags_with_position(PosTagger.add_tags(@text)).map do |tag, word, word_position| 21 | label = labels.detect do |name, content, label_position| 22 | # If the word position intersect with the label's, it's a match 23 | if label_position.include? word_position.begin 24 | break name 25 | end 26 | end 27 | [word, tag.upcase, label] 28 | end 29 | end 30 | 31 | private 32 | # Return tags, their content, and their position in the `text` *without the tags*. 33 | def tags_with_position(text) 34 | tags = [] 35 | tags_offset = 0 36 | 37 | text.scan(TAG_RE) do |tag, content| 38 | tags_offset += tag.size + 2 # 39 | index = $~.offset(2)[0] - tags_offset 40 | tags << [tag, content, (index..index + content.size)] 41 | tags_offset += tag.size + 3 # 42 | end 43 | 44 | tags 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /lib/utterance_parser/parser.rb: -------------------------------------------------------------------------------- 1 | require 'nbayes' 2 | require 'wapiti' 3 | 4 | module UtteranceParser 5 | class Parser 6 | def initialize(save_path=nil) 7 | build_save_paths(save_path) if save_path 8 | 9 | if save_path && File.exist?(@classifier_file) 10 | @classifier = NBayes::Base.from(@classifier_file) 11 | else 12 | @classifier = NBayes::Base.new 13 | end 14 | 15 | if save_path && File.exist?(@labeller_file) 16 | @labeller = Wapiti::Model.load(@labeller_file) 17 | else 18 | @labeller = Wapiti::Model.new pattern: "#{__dir__}/pattern.txt" 19 | end 20 | end 21 | 22 | def train(examples) 23 | case examples 24 | when Array 25 | # All good! 26 | when Hash 27 | examples = examples.map { |utterance, intent| Example.new(utterance, intent) } 28 | else 29 | raise ArgumentError, "Expected [, ...] or { utterance => intent, ... }" 30 | end 31 | 32 | examples.each do |example| 33 | @classifier.train(example.pos_tokens, example.intent) 34 | end 35 | 36 | labeled_examples = examples.map do |example| 37 | example.labeled_tokens.map { |word, tag, entity| [word, tag, entity || "_"].join(" ") } 38 | end 39 | 40 | @labeller.train labeled_examples 41 | end 42 | 43 | def parse(text) 44 | utterance = Utterance.new(text) 45 | intent = @classifier.classify(utterance.pos_tokens).max_class 46 | labeled = @labeller.label([ utterance.pos_tokens.map { |t| t.join(" ") } ]).first 47 | [intent, extract_entities(labeled)] 48 | end 49 | 50 | def save(path=nil) 51 | build_save_paths path if path 52 | 53 | if !defined?(@classifier_file) || !defined?(@labeller_file) 54 | raise ArgumentError, "Path to save directory missing" 55 | end 56 | 57 | @classifier.dump(@classifier_file) 58 | @labeller.compact 59 | @labeller.save(@labeller_file) 60 | end 61 | 62 | private 63 | # Extract entities from tokens. 64 | # Eg.:: 65 | # [ ["Play NNP", "_"], ["some DET", "_"], ["jazz NN", "category"] ] 66 | # Returns: 67 | # { category: "jazz" } 68 | def extract_entities(tokens) 69 | # FIXME this will not handle duplicated labels, eg.: category being used twice. 70 | labeled = tokens.group_by { |tagged_word, label| label } 71 | labeled.delete("_") 72 | 73 | labeled.each_with_object({}) do |(label, words), entities| 74 | entities[label.to_sym] = words.map do |word, _| 75 | # Remove the POS tag from. Eg.: `word == 'jazz NN'` 76 | word[0, word.rindex(" ")] 77 | end.join(" ") 78 | end 79 | end 80 | 81 | def build_save_paths(save_path) 82 | raise ArgumentError, "Path to save directory missing" unless File.directory?(save_path) 83 | 84 | @classifier_file = File.join(save_path, "classifier.yml") 85 | @labeller_file = File.join(save_path, "labeller.mod") 86 | end 87 | end 88 | end -------------------------------------------------------------------------------- /lib/utterance_parser/pattern.txt: -------------------------------------------------------------------------------- 1 | # Wapiti CRF template file 2 | # Based on example in https://taku910.github.io/crfpp/#templ 3 | 4 | # Unigram 5 | u:%x[-3,0] 6 | u:%x[-2,0] 7 | u:%x[-1,0] 8 | u:%x[0,0] 9 | u:%x[1,0] 10 | u:%x[2,0] 11 | u:%x[3,0] 12 | u:%x[-1,0]/%x[0,0] 13 | u:%x[0,0]/%x[1,0] 14 | 15 | u:%x[-3,1] 16 | u:%x[-2,1] 17 | u:%x[-1,1] 18 | u:%x[0,1] 19 | u:%x[1,1] 20 | u:%x[2,1] 21 | u:%x[3,1] 22 | u:%x[-2,1]/%x[-1,1] 23 | u:%x[-1,1]/%x[0,1] 24 | u:%x[0,1]/%x[1,1] 25 | u:%x[1,1]/%x[2,1] 26 | 27 | u:%x[-2,1]/%x[-1,1]/%x[0,1] 28 | u:%x[-1,1]/%x[0,1]/%x[1,1] 29 | u:%x[0,1]/%x[1,1]/%x[2,1] 30 | 31 | # Bigram 32 | B -------------------------------------------------------------------------------- /lib/utterance_parser/pos_tagger.rb: -------------------------------------------------------------------------------- 1 | # EngTagger generates two tons of warnings (metric however). 2 | # It also uses globals and monkey patches core classes. 3 | # But it's easy to install and just works, so ... 4 | # 5 | # ¯\_(ツ)_/¯ 6 | # 7 | UtteranceParser::Utils.ignore_warnings do 8 | require 'engtagger' 9 | end 10 | 11 | module UtteranceParser 12 | PosTagger = Utils.ignore_warnings { EngTagger.new } 13 | end -------------------------------------------------------------------------------- /lib/utterance_parser/utils.rb: -------------------------------------------------------------------------------- 1 | module UtteranceParser 2 | module Utils 3 | def self.ignore_warnings 4 | old_verbose, $VERBOSE = $VERBOSE, nil 5 | yield 6 | ensure 7 | $VERBOSE = old_verbose 8 | end 9 | end 10 | end -------------------------------------------------------------------------------- /lib/utterance_parser/utterance.rb: -------------------------------------------------------------------------------- 1 | module UtteranceParser 2 | class Utterance 3 | TAG_RE = /<(.+?)>(.*?)<.+?>/ 4 | 5 | attr_reader :text 6 | 7 | def initialize(text) 8 | @text = text 9 | end 10 | 11 | def ==(other) 12 | other.class == self.class && other.text == @text 13 | end 14 | 15 | def pos_tokens 16 | PosTagger.add_tags(@text).scan(TAG_RE).map { |tag, word| [word, tag.upcase] } 17 | end 18 | end 19 | end -------------------------------------------------------------------------------- /lib/utterance_parser/version.rb: -------------------------------------------------------------------------------- 1 | module UtteranceParser 2 | VERSION = "0.1.0" 3 | end 4 | -------------------------------------------------------------------------------- /test/example_test.rb: -------------------------------------------------------------------------------- 1 | require 'test_helper' 2 | 3 | class ExampleTest < Minitest::Test 4 | def test_tokens 5 | assert_equal [["What", "WP", nil], ["time", "NN", nil], ["is", "VBZ", nil], ["it", "PRP", nil], ["?", "PP", nil]], 6 | Example.new("What time is it?", "time").labeled_tokens 7 | end 8 | 9 | def test_single_labeled_tokens 10 | assert_equal [["Play", "NNP", nil], ["some", "DET", nil], ["jazz", "NN", "category"]], 11 | Example.new("Play some jazz", "play").labeled_tokens 12 | end 13 | 14 | def test_multi_labeled_tokens 15 | assert_equal [["Play", "NNP", nil], ["some", "DET", nil], 16 | ["smooth", "JJ", "category"], ["jazz", "NN", "category"], 17 | ["now", "RB", "time"], ["please", "VB", nil]], 18 | Example.new("Play some smooth jazz please", "play").labeled_tokens 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /test/parser_test.rb: -------------------------------------------------------------------------------- 1 | require 'test_helper' 2 | require 'fileutils' 3 | 4 | class ParserTest < Minitest::Test 5 | def setup 6 | @parser = Parser.new 7 | @parser.train( 8 | "Hi" => "greeting", 9 | "Hello" => "greeting", 10 | "What time is it" => "time", 11 | "What's the weather outside" => "weather", 12 | "Remind me to buy milk " => "reminder", 13 | "Remind me to pick up the kids " => "reminder", 14 | "Remind me to get stuff done " => "reminder", 15 | "Play some jazz" => "play", 16 | "Play some blues" => "play", 17 | "Play some rap" => "play", 18 | "Play something" => "play", 19 | "Play something please" => "play", 20 | ) 21 | end 22 | 23 | def test_save 24 | FileUtils.mkdir_p "tmp/parser" 25 | FileUtils.rm_rf "tmp/parser/*" 26 | @parser.save "tmp/parser" 27 | assert File.file? "tmp/parser/classifier.yml" 28 | assert File.file? "tmp/parser/classifier.yml" 29 | @parser = Parser.new("tmp/parser") 30 | end 31 | 32 | def self.test_parses(utterance, intent, entities=nil) 33 | define_method "test_parses \"#{utterance}\"" do 34 | actual_intent, actual_entities = @parser.parse utterance 35 | assert_equal actual_intent, intent, "Unexpected intent" 36 | assert_equal(actual_entities, entities, "Unexpected entities") if entities 37 | end 38 | end 39 | 40 | test_parses "Hello there!", "greeting" 41 | 42 | test_parses "Could you play something nice please", "play" 43 | 44 | test_parses "Play some rock", "play", category: "rock" 45 | 46 | test_parses "Remind me to buy stuff in three hours", 47 | "reminder", task: "buy stuff", time: "in three hours" 48 | 49 | test_parses "Remind me to go play outside tomorrow", 50 | "reminder", task: "go play outside", time: "tomorrow" 51 | end 52 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__) 2 | require 'utterance_parser' 3 | 4 | require 'minitest/autorun' 5 | 6 | class Minitest::Test 7 | include UtteranceParser 8 | end -------------------------------------------------------------------------------- /utterance_parser.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | require 'utterance_parser/version' 5 | 6 | Gem::Specification.new do |spec| 7 | spec.name = "utterance_parser" 8 | spec.version = UtteranceParser::VERSION 9 | spec.authors = ["Marc-Andre Cournoyer"] 10 | spec.email = ["macournoyer@gmail.com"] 11 | spec.summary = "Extract intent and entities from natural language utterances" 12 | spec.description = "A trainable natural language parser that extracts intent and entities from utterances." 13 | spec.homepage = "https://github.com/macournoyer/utterance_parser" 14 | spec.license = "MIT" 15 | 16 | spec.files = `git ls-files -z`.split("\x0") 17 | spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | 21 | spec.add_dependency 'engtagger', '~> 0.2' 22 | spec.add_dependency 'nbayes', '~> 0.1' 23 | spec.add_dependency 'wapiti', '~> 0.1' 24 | spec.add_development_dependency "bundler", "~> 1.6" 25 | spec.add_development_dependency "rake" 26 | spec.add_development_dependency "minitest" 27 | end 28 | --------------------------------------------------------------------------------