├── .gitignore
├── .travis.yml
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── lib
├── utterance_parser.rb
└── utterance_parser
│ ├── example.rb
│ ├── parser.rb
│ ├── pattern.txt
│ ├── pos_tagger.rb
│ ├── utils.rb
│ ├── utterance.rb
│ └── version.rb
├── test
├── example_test.rb
├── parser_test.rb
└── test_helper.rb
└── utterance_parser.gemspec
/.gitignore:
--------------------------------------------------------------------------------
1 | *.gem
2 | *.rbc
3 | .bundle
4 | .config
5 | .yardoc
6 | Gemfile.lock
7 | InstalledFiles
8 | _yardoc
9 | coverage
10 | doc/
11 | lib/bundler/man
12 | pkg
13 | rdoc
14 | spec/reports
15 | test/tmp
16 | test/version_tmp
17 | tmp
18 | *.bundle
19 | *.so
20 | *.o
21 | *.a
22 | mkmf.log
23 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: ruby
2 | rvm:
3 | - 2.1.1
4 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # Specify your gem's dependencies in utterance_parser.gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016 macournoyer
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # UtteranceParser
2 |
3 | A trainable natural language parser that extracts intent and entities from utterances.
4 |
5 | It uses a [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) to determine intent and [Conditional random fields](https://en.wikipedia.org/wiki/Conditional_random_field) to extract entities.
6 |
7 | For example, it can turn this:
8 |
9 | > Remind me to pick up the kids in two hours
10 |
11 | into ...
12 |
13 | ```ruby
14 | [
15 | # intent
16 | "reminder",
17 | # entities
18 | {task: "pick up the kids", time: "in two hours"}
19 | ]
20 | ```
21 |
22 | ## Installation
23 |
24 | Add this line to your application's Gemfile:
25 |
26 | gem 'utterance_parser'
27 |
28 | And then execute:
29 |
30 | $ bundle
31 |
32 | Or install it yourself as:
33 |
34 | $ gem install utterance_parser
35 |
36 | ## Usage
37 |
38 | ```ruby
39 | parser = UtteranceParser.new
40 |
41 | parser.train(
42 | # Utterance => intent
43 | "Hi" => "greeting",
44 | "Hello" => "greeting",
45 |
46 | "What time is it" => "time",
47 | "What's the weather outside" => "weather",
48 |
49 | # Mark entities using XML tags
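50 | "Remind me to <task>get stuff done</task> <time>tomorrow</time>" => "reminder",
51 | "Remind me to <task>buy milk</task> <time>in three hours</time>" => "reminder",
52 | "Remind me to <task>pick up the kids</task> <time>in two hours</time>" => "reminder",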
53 |
54 | "Play some <playlist>jazz</playlist>" => "play",
55 | "Play some <playlist>blues</playlist>" => "play",
56 | "Play some <playlist>rap</playlist>" => "play"
57 | )
58 |
59 | parser.parse "Hello there!"
60 | # => ["greeting", {}]
61 |
62 | parser.parse "Play some rock"
63 | # => ["play", {playlist: "rock"}]
64 |
65 | parser.parse "Remind me to buy stuff in three hours"
66 | # => ["reminder", {task: "buy stuff", time: "in three hours"}]
67 | ```
68 |
69 | ## Contributing
70 |
71 | 1. Fork it ( https://github.com/macournoyer/utterance_parser/fork )
72 | 2. Create your feature branch (`git checkout -b my-new-feature`)
73 | 3. Commit your changes (`git commit -am 'Add some feature'`)
74 | 4. Push to the branch (`git push origin my-new-feature`)
75 | 5. Create a new Pull Request
76 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require "bundler/gem_tasks"
2 | require "rake/testtask"
3 |
# Defines the `test` task: puts `test/` on the load path and runs every
# file matching `test/*_test.rb`.
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test")
  test_task.pattern = 'test/*_test.rb'
end

# Running `rake` with no arguments runs the test suite.
task(:default => :test)
10 |
11 |
--------------------------------------------------------------------------------
/lib/utterance_parser.rb:
--------------------------------------------------------------------------------
1 | require_relative 'utterance_parser/version'
2 | require_relative 'utterance_parser/utils'
3 | require_relative 'utterance_parser/utterance'
4 | require_relative 'utterance_parser/example'
5 | require_relative 'utterance_parser/pos_tagger'
6 | require_relative 'utterance_parser/parser'
7 |
module UtteranceParser
  class << self
    # Convenience constructor: `UtteranceParser.new(...)` builds a `Parser`,
    # forwarding every positional argument unchanged.
    def new(*args)
      Parser.new(*args)
    end
  end
end
13 |
14 |
--------------------------------------------------------------------------------
/lib/utterance_parser/example.rb:
--------------------------------------------------------------------------------
module UtteranceParser
  # A training example: an utterance paired with its intent.
  # Entities are marked inline with XML-style tags, e.g.
  # `Play some <category>rap</category>`.
  class Example < Utterance
    attr_reader :labeled_text, :intent

    # labeled_text - the utterance, optionally containing entity tags.
    # intent       - the intent this utterance is an example of.
    #
    # The plain text handed to Utterance is the labeled text with every
    # tag replaced by its content.
    def initialize(labeled_text, intent)
      super(labeled_text.gsub(TAG_RE, '\2'))
      @labeled_text = labeled_text
      @intent = intent
    end

    # Examples are equal when they are of the same class and carry the same
    # labeled text (the intent is not compared).
    def ==(other)
      other.class == self.class && other.labeled_text == @labeled_text
    end

    # Returns one `[word, POS_TAG, label]` triple per POS-tagged word.
    # `label` is the name of the first entity tag whose position range
    # contains the start of the word, or nil when no entity covers it.
    def labeled_tokens
      entity_spans = tags_with_position(@labeled_text)
      tagged_words = tags_with_position(PosTagger.add_tags(@text))

      tagged_words.map do |pos_tag, word, word_span|
        covering = entity_spans.find do |_name, _content, entity_span|
          entity_span.include?(word_span.begin)
        end
        [word, pos_tag.upcase, covering && covering.first]
      end
    end

    private

    # Returns `[tag, content, range]` for every tag in `text`, where `range`
    # is the position of the content in the text *without the tags*.
    # The range is inclusive, so it extends one character past the content.
    def tags_with_position(text)
      found = []
      markup_chars = 0 # tag characters seen before the current content

      text.scan(TAG_RE) do |tag, content|
        markup_chars += tag.size + 2 # opening tag: "<" + name + ">"
        start = Regexp.last_match.begin(2) - markup_chars
        found << [tag, content, (start..start + content.size)]
        # closing tag: "</" + name + ">" (assumed to match the opening name)
        markup_chars += tag.size + 3
      end

      found
    end
  end
end
48 |
--------------------------------------------------------------------------------
/lib/utterance_parser/parser.rb:
--------------------------------------------------------------------------------
1 | require 'nbayes'
2 | require 'wapiti'
3 |
module UtteranceParser
  # Trainable parser that turns an utterance into `[intent, entities]`.
  #
  # The intent comes from an NBayes classifier and the entities from a Wapiti
  # model; both are trained on POS-tagged tokens. State can be persisted to,
  # and restored from, a directory holding `classifier.yml` and `labeller.mod`.
  class Parser
    # save_path - optional directory to load previously saved models from.
    #             Each model falls back to a fresh, untrained one when its
    #             file is not present in the directory.
    #
    # Raises ArgumentError if save_path is given but is not a directory.
    def initialize(save_path=nil)
      build_save_paths(save_path) if save_path

      @classifier =
        if save_path && File.exist?(@classifier_file)
          NBayes::Base.from(@classifier_file)
        else
          NBayes::Base.new
        end

      @labeller =
        if save_path && File.exist?(@labeller_file)
          Wapiti::Model.load(@labeller_file)
        else
          Wapiti::Model.new pattern: "#{__dir__}/pattern.txt"
        end
    end

    # Train both models.
    #
    # examples - either an Array of Example objects, or a Hash mapping
    #            labeled utterance text to intent.
    #
    # Raises ArgumentError for any other argument type.
    def train(examples)
      case examples
      when Array
        # Already a list of examples.
      when Hash
        examples = examples.map { |utterance, intent| Example.new(utterance, intent) }
      else
        raise ArgumentError, "Expected [<Example>, ...] or { utterance => intent, ... }"
      end

      examples.each do |example|
        @classifier.train(example.pos_tokens, example.intent)
      end

      # One "word TAG label" line per token; "_" stands for "no entity".
      labeled_examples = examples.map do |example|
        example.labeled_tokens.map { |word, tag, entity| [word, tag, entity || "_"].join(" ") }
      end

      @labeller.train labeled_examples
    end

    # Parse `text` and return `[intent, entities]`, where `entities` is a
    # Hash of entity name (Symbol) to the matched words joined by spaces.
    def parse(text)
      # Tag once and share the tokens between the classifier and the labeller.
      tokens = Utterance.new(text).pos_tokens
      intent = @classifier.classify(tokens).max_class
      labeled = @labeller.label([ tokens.map { |token| token.join(" ") } ]).first
      [intent, extract_entities(labeled)]
    end

    # Persist both models.
    #
    # path - directory to save into; optional when a save path was already
    #        given (to the constructor or to an earlier call).
    #
    # Raises ArgumentError when no save directory is known, or when `path`
    # is not a directory.
    def save(path=nil)
      build_save_paths path if path

      if !defined?(@classifier_file) || !defined?(@labeller_file)
        raise ArgumentError, "Path to save directory missing"
      end

      @classifier.dump(@classifier_file)
      @labeller.compact
      @labeller.save(@labeller_file)
    end

    private

    # Extract entities from labeled tokens.
    # Eg.:
    #   [ ["Play NNP", "_"], ["some DET", "_"], ["jazz NN", "category"] ]
    # Returns:
    #   { category: "jazz" }
    def extract_entities(tokens)
      # FIXME this will not handle duplicated labels, eg.: category being used twice.
      labeled = tokens.group_by { |_tagged_word, label| label }
      labeled.delete("_")

      labeled.each_with_object({}) do |(label, words), entities|
        entities[label.to_sym] = words.map do |word, _|
          # Drop the trailing POS tag. Eg.: `word == 'jazz NN'`
          word[0, word.rindex(" ")]
        end.join(" ")
      end
    end

    # Record where each model is stored under `save_path`.
    def build_save_paths(save_path)
      raise ArgumentError, "Path to save directory missing" unless File.directory?(save_path)

      @classifier_file = File.join(save_path, "classifier.yml")
      @labeller_file = File.join(save_path, "labeller.mod")
    end
  end
end
--------------------------------------------------------------------------------
/lib/utterance_parser/pattern.txt:
--------------------------------------------------------------------------------
1 | # Wapiti CRF template file
2 | # Based on example in https://taku910.github.io/crfpp/#templ
3 |
4 | # Unigram
5 | u:%x[-3,0]
6 | u:%x[-2,0]
7 | u:%x[-1,0]
8 | u:%x[0,0]
9 | u:%x[1,0]
10 | u:%x[2,0]
11 | u:%x[3,0]
12 | u:%x[-1,0]/%x[0,0]
13 | u:%x[0,0]/%x[1,0]
14 |
15 | u:%x[-3,1]
16 | u:%x[-2,1]
17 | u:%x[-1,1]
18 | u:%x[0,1]
19 | u:%x[1,1]
20 | u:%x[2,1]
21 | u:%x[3,1]
22 | u:%x[-2,1]/%x[-1,1]
23 | u:%x[-1,1]/%x[0,1]
24 | u:%x[0,1]/%x[1,1]
25 | u:%x[1,1]/%x[2,1]
26 |
27 | u:%x[-2,1]/%x[-1,1]/%x[0,1]
28 | u:%x[-1,1]/%x[0,1]/%x[1,1]
29 | u:%x[0,1]/%x[1,1]/%x[2,1]
30 |
31 | # Bigram
32 | B
--------------------------------------------------------------------------------
/lib/utterance_parser/pos_tagger.rb:
--------------------------------------------------------------------------------
1 | # EngTagger generates two tons of warnings (metric however).
2 | # It also uses globals and monkey patches core classes.
3 | # But it's easy to install and just works, so ...
4 | #
5 | # ¯\_(ツ)_/¯
6 | #
# Load EngTagger with warnings silenced.
UtteranceParser::Utils.ignore_warnings { require 'engtagger' }

module UtteranceParser
  # Single shared tagger instance, built with warnings silenced.
  PosTagger = Utils.ignore_warnings do
    EngTagger.new
  end
end
--------------------------------------------------------------------------------
/lib/utterance_parser/utils.rb:
--------------------------------------------------------------------------------
module UtteranceParser
  module Utils
    # Runs the given block with `$VERBOSE` set to nil and returns the block's
    # value. The previous `$VERBOSE` is restored afterwards, even when the
    # block raises.
    def self.ignore_warnings
      previous_verbose = $VERBOSE
      $VERBOSE = nil
      yield
    ensure
      $VERBOSE = previous_verbose
    end
  end
end
--------------------------------------------------------------------------------
/lib/utterance_parser/utterance.rb:
--------------------------------------------------------------------------------
module UtteranceParser
  # A piece of text that can be split into POS-tagged tokens.
  class Utterance
    # Matches `<tag>content</tag>`: group 1 is the opening tag's name and
    # group 2 the content. The closing tag's name is not checked against
    # the opening one.
    TAG_RE = /<(.+?)>(.*?)<.+?>/

    attr_reader :text

    def initialize(text)
      @text = text
    end

    # Utterances are equal when they are of the same class and have the
    # same text.
    def ==(other)
      other.class == self.class && other.text == @text
    end

    # Keep Hash-key behaviour in line with `==`. Dispatching through `==`
    # means a subclass overriding `==` gets a matching `eql?`.
    def eql?(other)
      self == other
    end

    # Equal utterances must hash alike to work as Hash keys.
    def hash
      [self.class, @text].hash
    end

    # Returns `[word, POS_TAG]` pairs (tag upper-cased), read from the
    # tagged markup PosTagger produces for the text.
    def pos_tokens
      PosTagger.add_tags(@text).scan(TAG_RE).map { |tag, word| [word, tag.upcase] }
    end
  end
end
--------------------------------------------------------------------------------
/lib/utterance_parser/version.rb:
--------------------------------------------------------------------------------
module UtteranceParser
  # Gem version, read by the gemspec. Frozen so it cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
4 |
--------------------------------------------------------------------------------
/test/example_test.rb:
--------------------------------------------------------------------------------
1 | require 'test_helper'
2 |
# Checks that Example#labeled_tokens pairs each POS-tagged word with the
# entity label (if any) marked in the example text.
class ExampleTest < Minitest::Test
  # No entity tags: every token gets a nil label.
  def test_tokens
    assert_equal [["What", "WP", nil], ["time", "NN", nil], ["is", "VBZ", nil], ["it", "PRP", nil], ["?", "PP", nil]],
                 Example.new("What time is it?", "time").labeled_tokens
  end

  # A single one-word entity.
  def test_single_labeled_tokens
    assert_equal [["Play", "NNP", nil], ["some", "DET", nil], ["jazz", "NN", "category"]],
                 Example.new("Play some <category>jazz</category>", "play").labeled_tokens
  end

  # Two entities, the first spanning several words.
  def test_multi_labeled_tokens
    assert_equal [["Play", "NNP", nil], ["some", "DET", nil],
                  ["smooth", "JJ", "category"], ["jazz", "NN", "category"],
                  ["now", "RB", "time"], ["please", "VB", nil]],
                 Example.new("Play some <category>smooth jazz</category> <time>now</time> please", "play").labeled_tokens
  end
end
21 |
--------------------------------------------------------------------------------
/test/parser_test.rb:
--------------------------------------------------------------------------------
1 | require 'test_helper'
2 | require 'fileutils'
3 |
# End-to-end tests: train a parser on a small corpus, then check the intent
# and entities it extracts from unseen utterances.
class ParserTest < Minitest::Test
  def setup
    @parser = Parser.new
    # Entities are marked with XML-style tags.
    # NOTE(review): the <time> phrases below are reconstructed; confirm them
    # against the upstream test data.
    @parser.train(
      "Hi" => "greeting",
      "Hello" => "greeting",
      "What time is it" => "time",
      "What's the weather outside" => "weather",
      "Remind me to <task>buy milk</task> <time>in three hours</time>" => "reminder",
      "Remind me to <task>pick up the kids</task> <time>in two hours</time>" => "reminder",
      "Remind me to <task>get stuff done</task> <time>tomorrow</time>" => "reminder",
      "Play some <category>jazz</category>" => "play",
      "Play some <category>blues</category>" => "play",
      "Play some <category>rap</category>" => "play",
      "Play something" => "play",
      "Play something please" => "play",
    )
  end

  # Saving writes both model files, and a parser can be built from them.
  def test_save
    FileUtils.mkdir_p "tmp/parser"
    # rm_rf does not expand wildcards, so expand the pattern explicitly.
    FileUtils.rm_rf Dir.glob("tmp/parser/*")
    @parser.save "tmp/parser"
    assert File.file?("tmp/parser/classifier.yml")
    assert File.file?("tmp/parser/labeller.mod")
    @parser = Parser.new("tmp/parser")
  end

  # Defines a test asserting that `utterance` parses to `intent` and, when
  # given, to exactly `entities`.
  def self.test_parses(utterance, intent, entities=nil)
    define_method "test_parses \"#{utterance}\"" do
      actual_intent, actual_entities = @parser.parse utterance
      # Minitest expects (expected, actual).
      assert_equal intent, actual_intent, "Unexpected intent"
      assert_equal(entities, actual_entities, "Unexpected entities") if entities
    end
  end

  test_parses "Hello there!", "greeting"

  test_parses "Could you play something nice please", "play"

  test_parses "Play some rock", "play", category: "rock"

  test_parses "Remind me to buy stuff in three hours",
              "reminder", task: "buy stuff", time: "in three hours"

  test_parses "Remind me to go play outside tomorrow",
              "reminder", task: "go play outside", time: "tomorrow"
end
52 |
--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
1 | $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
2 | require 'utterance_parser'
3 |
4 | require 'minitest/autorun'
5 |
# Mix the library namespace into every test case so tests can refer to
# Parser, Example, etc. without the `UtteranceParser::` prefix.
Minitest::Test.send(:include, UtteranceParser)
--------------------------------------------------------------------------------
/utterance_parser.gemspec:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | lib = File.expand_path('../lib', __FILE__)
3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4 | require 'utterance_parser/version'
5 |
Gem::Specification.new do |gem|
  # Identity
  gem.name        = "utterance_parser"
  gem.version     = UtteranceParser::VERSION
  gem.authors     = ["Marc-Andre Cournoyer"]
  gem.email       = ["macournoyer@gmail.com"]
  gem.summary     = "Extract intent and entities from natural language utterances"
  gem.description = "A trainable natural language parser that extracts intent and entities from utterances."
  gem.homepage    = "https://github.com/macournoyer/utterance_parser"
  gem.license     = "MIT"

  # Packaged files: everything tracked by git.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies
  gem.add_dependency 'engtagger', '~> 0.2'
  gem.add_dependency 'nbayes', '~> 0.1'
  gem.add_dependency 'wapiti', '~> 0.1'

  # Development dependencies
  gem.add_development_dependency "bundler", "~> 1.6"
  gem.add_development_dependency "rake"
  gem.add_development_dependency "minitest"
end
28 |
--------------------------------------------------------------------------------