├── .github └── workflows │ └── build.yml ├── .gitignore ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── lib ├── mitie.rb └── mitie │ ├── binary_relation_detector.rb │ ├── binary_relation_trainer.rb │ ├── document.rb │ ├── ffi.rb │ ├── ner.rb │ ├── ner_trainer.rb │ ├── ner_training_instance.rb │ ├── text_categorizer.rb │ ├── text_categorizer_trainer.rb │ ├── utils.rb │ └── version.rb ├── mitie.gemspec ├── test ├── binary_relation_detector_test.rb ├── binary_relation_trainer_test.rb ├── document_test.rb ├── mitie_test.rb ├── ner_test.rb ├── ner_trainer_test.rb ├── ner_training_instance_test.rb ├── test_helper.rb └── text_categorizer_test.rb └── vendor └── LICENSE.txt /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | include: 9 | - ruby: 3.4 10 | os: ubuntu-latest 11 | - ruby: 3.3 12 | os: ubuntu-22.04 13 | - ruby: 3.2 14 | os: macos-latest 15 | - ruby: 3.1 16 | os: windows-latest 17 | runs-on: ${{ matrix.os }} 18 | steps: 19 | - uses: actions/checkout@v4 20 | - uses: ruby/setup-ruby@v1 21 | with: 22 | ruby-version: ${{ matrix.ruby }} 23 | bundler-cache: true 24 | - uses: actions/cache@v4 25 | with: 26 | path: MITIE-models 27 | key: models-v4 28 | id: cache-models 29 | - name: Download models 30 | if: steps.cache-models.outputs.cache-hit != 'true' 31 | run: | 32 | curl -Ls -o models.tar.bz2 https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2 33 | tar xfj models.tar.bz2 34 | - run: bundle exec rake vendor:platform 35 | - run: bundle exec rake test 36 | env: 37 | MITIE_MODELS_PATH: MITIE-models/english 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | 
/doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | *.lock 10 | *.so 11 | *.dylib 12 | *.dll 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.3.2 (2025-05-04) 2 | 3 | - Fixed memory leaks 4 | 5 | ## 0.3.1 (2024-12-29) 6 | 7 | - Fixed warning with Ruby 3.4 8 | 9 | ## 0.3.0 (2024-10-23) 10 | 11 | - Dropped support for Ruby < 3.1 12 | 13 | ## 0.2.2 (2023-06-07) 14 | 15 | - Fixed error with `dup` and `clone` 16 | 17 | ## 0.2.1 (2022-06-12) 18 | 19 | - Added `tokenize` and `tokenize_file` methods 20 | - Added support for untokenized text to text categorization 21 | 22 | ## 0.2.0 (2022-06-01) 23 | 24 | - Added support for text categorization 25 | - Added support for training binary relation detectors 26 | - Dropped support for Ruby < 2.7 27 | 28 | ## 0.1.6 (2022-03-20) 29 | 30 | - Added support for training NER models 31 | - Improved ARM detection 32 | 33 | ## 0.1.5 (2021-01-29) 34 | 35 | - Fixed issue with multibyte characters 36 | 37 | ## 0.1.4 (2020-12-28) 38 | 39 | - Added ARM shared library for Mac 40 | 41 | ## 0.1.3 (2020-12-04) 42 | 43 | - Added support for custom tokenization 44 | 45 | ## 0.1.2 (2020-09-14) 46 | 47 | - Added binary relation detection 48 | - Added `Document` class 49 | 50 | ## 0.1.1 (2020-09-14) 51 | 52 | - Added shared libraries 53 | - Improved error message when model file does not exist 54 | 55 | ## 0.1.0 (2020-09-14) 56 | 57 | - First release 58 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | 5 | gem "rake" 6 | gem "minitest", ">= 5" 7 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Boost Software 
License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 
24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MITIE Ruby 2 | 3 | [MITIE](https://github.com/mit-nlp/MITIE) - named-entity recognition, binary relation detection, and text categorization - for Ruby 4 | 5 | - Finds people, organizations, and locations in text 6 | - Detects relationships between entities, like `PERSON` was born in `LOCATION` 7 | 8 | [![Build Status](https://github.com/ankane/mitie-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/mitie-ruby/actions) 9 | 10 | ## Installation 11 | 12 | Add this line to your application’s Gemfile: 13 | 14 | ```ruby 15 | gem "mitie" 16 | ``` 17 | 18 | And download the pre-trained models for your language: 19 | 20 | - [English](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2.tar.bz2) 21 | - [Spanish](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-Spanish.zip) 22 | - [German](https://github.com/mit-nlp/MITIE/releases/download/v0.4/MITIE-models-v0.2-German.tar.bz2) 23 | 24 | ## Getting Started 25 | 26 | - [Named Entity Recognition](#named-entity-recognition) 27 | - [Binary Relation Detection](#binary-relation-detection) 28 | - [Text Categorization](#text-categorization) 29 | 30 | ## Named Entity Recognition 31 | 32 | Load an NER model 33 | 34 | ```ruby 35 | model = Mitie::NER.new("ner_model.dat") 36 | ``` 37 | 38 | Create a document 39 | 40 | ```ruby 41 | doc = model.doc("Nat works at GitHub in San Francisco") 42 | ``` 43 | 44 | Get entities 45 | 46 | ```ruby 47 | doc.entities 48 | ``` 49 | 50 | This returns 51 | 52 | ```ruby 53 | [ 54 | {text: "Nat", tag: "PERSON", score: 0.3112371212688382, offset: 0}, 55 | {text: "GitHub", tag: "ORGANIZATION", score: 0.5660115198329334, offset: 13}, 56 | {text: "San Francisco", tag: "LOCATION", score: 1.3890524313885309, offset: 23} 57 | ] 58 | ``` 59 | 60 | Get tokens 61 | 62 | 
```ruby 63 | doc.tokens 64 | ``` 65 | 66 | Get tokens and their offset 67 | 68 | ```ruby 69 | doc.tokens_with_offset 70 | ``` 71 | 72 | Get all tags for a model 73 | 74 | ```ruby 75 | model.tags 76 | ``` 77 | 78 | ### Training 79 | 80 | Load an NER model into a trainer 81 | 82 | ```ruby 83 | trainer = Mitie::NERTrainer.new("total_word_feature_extractor.dat") 84 | ``` 85 | 86 | Create training instances 87 | 88 | ```ruby 89 | tokens = ["You", "can", "do", "machine", "learning", "in", "Ruby", "!"] 90 | instance = Mitie::NERTrainingInstance.new(tokens) 91 | instance.add_entity(3..4, "topic") # machine learning 92 | instance.add_entity(6..6, "language") # Ruby 93 | ``` 94 | 95 | Add the training instances to the trainer 96 | 97 | ```ruby 98 | trainer.add(instance) 99 | ``` 100 | 101 | Train the model 102 | 103 | ```ruby 104 | model = trainer.train 105 | ``` 106 | 107 | Save the model 108 | 109 | ```ruby 110 | model.save_to_disk("ner_model.dat") 111 | ``` 112 | 113 | ## Binary Relation Detection 114 | 115 | Detect relationships between two entities, like: 116 | 117 | - `PERSON` was born in `LOCATION` 118 | - `ORGANIZATION` was founded in `LOCATION` 119 | - `FILM` was directed by `PERSON` 120 | 121 | There are 21 detectors for English. You can find them in the `binary_relations` directory in the model download. 
122 | 123 | Load a detector 124 | 125 | ```ruby 126 | detector = Mitie::BinaryRelationDetector.new("rel_classifier_organization.organization.place_founded.svm") 127 | ``` 128 | 129 | And create a document 130 | 131 | ```ruby 132 | doc = model.doc("Shopify was founded in Ottawa") 133 | ``` 134 | 135 | Get relations 136 | 137 | ```ruby 138 | detector.relations(doc) 139 | ``` 140 | 141 | This returns 142 | 143 | ```ruby 144 | [{first: "Shopify", second: "Ottawa", score: 0.17649169745814464}] 145 | ``` 146 | 147 | ### Training 148 | 149 | Load an NER model into a trainer 150 | 151 | ```ruby 152 | trainer = Mitie::BinaryRelationTrainer.new(model) 153 | ``` 154 | 155 | Add positive and negative examples to the trainer 156 | 157 | ```ruby 158 | tokens = ["Shopify", "was", "founded", "in", "Ottawa"] 159 | trainer.add_positive_binary_relation(tokens, 0..0, 4..4) 160 | trainer.add_negative_binary_relation(tokens, 4..4, 0..0) 161 | ``` 162 | 163 | Train the detector 164 | 165 | ```ruby 166 | detector = trainer.train 167 | ``` 168 | 169 | Save the detector 170 | 171 | ```ruby 172 | detector.save_to_disk("binary_relation_detector.svm") 173 | ``` 174 | 175 | ## Text Categorization 176 | 177 | Load a model into a trainer 178 | 179 | ```ruby 180 | trainer = Mitie::TextCategorizerTrainer.new("total_word_feature_extractor.dat") 181 | ``` 182 | 183 | Add labeled text to the trainer 184 | 185 | ```ruby 186 | trainer.add("This is super cool", "positive") 187 | ``` 188 | 189 | Train the model 190 | 191 | ```ruby 192 | model = trainer.train 193 | ``` 194 | 195 | Save the model 196 | 197 | ```ruby 198 | model.save_to_disk("text_categorization_model.dat") 199 | ``` 200 | 201 | Load a saved model 202 | 203 | ```ruby 204 | model = Mitie::TextCategorizer.new("text_categorization_model.dat") 205 | ``` 206 | 207 | Categorize text 208 | 209 | ```ruby 210 | model.categorize("What a super nice day") 211 | ``` 212 | 213 | ## Deployment 214 | 215 | Check out [Trove](https://github.com/ankane/trove) 
for deploying models. 216 | 217 | ```sh 218 | trove push ner_model.dat 219 | ``` 220 | 221 | ## History 222 | 223 | View the [changelog](https://github.com/ankane/mitie-ruby/blob/master/CHANGELOG.md) 224 | 225 | ## Contributing 226 | 227 | Everyone is encouraged to help improve this project. Here are a few ways you can help: 228 | 229 | - [Report bugs](https://github.com/ankane/mitie-ruby/issues) 230 | - Fix bugs and [submit pull requests](https://github.com/ankane/mitie-ruby/pulls) 231 | - Write, clarify, or fix documentation 232 | - Suggest or add new features 233 | 234 | To get started with development: 235 | 236 | ```sh 237 | git clone https://github.com/ankane/mitie-ruby.git 238 | cd mitie-ruby 239 | bundle install 240 | bundle exec rake vendor:all 241 | 242 | export MITIE_MODELS_PATH=path/to/MITIE-models/english 243 | bundle exec rake test 244 | ``` 245 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | require "rake/testtask" 3 | 4 | task default: :test 5 | Rake::TestTask.new do |t| 6 | t.libs << "test" 7 | t.pattern = "test/**/*_test.rb" 8 | end 9 | 10 | def download_file(file, sha256) 11 | require "open-uri" 12 | 13 | url = "https://github.com/ankane/ml-builds/releases/download/mitie-0.7/#{file}" 14 | puts "Downloading #{file}..." 
15 | contents = URI.parse(url).read 16 | 17 | computed_sha256 = Digest::SHA256.hexdigest(contents) 18 | raise "Bad hash: #{computed_sha256}" if computed_sha256 != sha256 19 | 20 | dest = "vendor/#{file}" 21 | File.binwrite(dest, contents) 22 | puts "Saved #{dest}" 23 | end 24 | 25 | namespace :vendor do 26 | task :linux do 27 | download_file("libmitie.so", "07b241d857a4bcd7fd97b68a87ccb06fbab70bfc621ee25aa0ea6bd7f905c45c") 28 | end 29 | 30 | task :mac do 31 | download_file("libmitie.dylib", "8c4fdbe11ef137c401141242af8030628672d64589b5e63ba9c13b7162d29d6c") 32 | download_file("libmitie.arm64.dylib", "616117825ac8a37ec1f016016868e1d72a21e5f3a90cc6b0347d4ff9dbf98088") 33 | end 34 | 35 | task :windows do 36 | download_file("mitie.dll", "dfeaaf72b12c7323d9447275af16afe5a1c64096ec2f00d04cb50f518ca19776") 37 | end 38 | 39 | task all: [:linux, :mac, :windows] 40 | 41 | task :platform do 42 | if Gem.win_platform? 43 | Rake::Task["vendor:windows"].invoke 44 | elsif RbConfig::CONFIG["host_os"] =~ /darwin/i 45 | Rake::Task["vendor:mac"].invoke 46 | else 47 | Rake::Task["vendor:linux"].invoke 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /lib/mitie.rb: -------------------------------------------------------------------------------- 1 | # stdlib 2 | require "fiddle/import" 3 | 4 | # modules 5 | require_relative "mitie/binary_relation_detector" 6 | require_relative "mitie/binary_relation_trainer" 7 | require_relative "mitie/document" 8 | require_relative "mitie/ner" 9 | require_relative "mitie/ner_training_instance" 10 | require_relative "mitie/ner_trainer" 11 | require_relative "mitie/text_categorizer" 12 | require_relative "mitie/text_categorizer_trainer" 13 | require_relative "mitie/utils" 14 | require_relative "mitie/version" 15 | 16 | module Mitie 17 | class Error < StandardError; end 18 | 19 | class << self 20 | attr_accessor :ffi_lib 21 | end 22 | lib_name = 23 | if Gem.win_platform? 
24 | "mitie.dll" 25 | elsif RbConfig::CONFIG["host_os"] =~ /darwin/i 26 | if RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i 27 | "libmitie.arm64.dylib" 28 | else 29 | "libmitie.dylib" 30 | end 31 | else 32 | "libmitie.so" 33 | end 34 | vendor_lib = File.expand_path("../vendor/#{lib_name}", __dir__) 35 | self.ffi_lib = [vendor_lib] 36 | 37 | # friendlier error message 38 | autoload :FFI, "mitie/ffi" 39 | 40 | class << self 41 | def tokenize(text) 42 | tokens_ptr = FFI.mitie_tokenize(+text.to_s) 43 | tokens_ptr.free = FFI["mitie_free"] 44 | tokens = read_tokens(tokens_ptr) 45 | tokens.each { |t| t.force_encoding(text.encoding) } 46 | tokens 47 | end 48 | 49 | def tokenize_file(filename) 50 | raise ArgumentError, "File does not exist" unless File.exist?(filename) 51 | tokens_ptr = FFI.mitie_tokenize_file(+filename) 52 | tokens_ptr.free = FFI["mitie_free"] 53 | read_tokens(tokens_ptr) 54 | end 55 | 56 | private 57 | 58 | def read_tokens(tokens_ptr) 59 | i = 0 60 | tokens = [] 61 | loop do 62 | token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr 63 | break if token.null? 
64 | tokens << token.to_s 65 | i += 1 66 | end 67 | tokens 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/mitie/binary_relation_detector.rb: -------------------------------------------------------------------------------- 1 | module Mitie 2 | class BinaryRelationDetector 3 | def initialize(path = nil, pointer: nil) 4 | if path 5 | # better error message 6 | raise ArgumentError, "File does not exist" unless File.exist?(path) 7 | @pointer = FFI.mitie_load_binary_relation_detector(+path) 8 | @pointer.free = FFI["mitie_free"] 9 | elsif pointer 10 | @pointer = pointer 11 | else 12 | raise ArgumentError, "Must pass either a path or a pointer" 13 | end 14 | end 15 | 16 | def name 17 | FFI.mitie_binary_relation_detector_name_string(pointer).to_s 18 | end 19 | 20 | def relations(doc) 21 | raise ArgumentError, "Expected Mitie::Document, not #{doc.class.name}" unless doc.is_a?(Document) 22 | 23 | entities = doc.entities 24 | combinations = [] 25 | (entities.size - 1).times do |i| 26 | combinations << [entities[i], entities[i + 1]] 27 | combinations << [entities[i + 1], entities[i]] 28 | end 29 | 30 | relations = [] 31 | combinations.each do |entity1, entity2| 32 | relation = extract_relation(doc, entity1, entity2) 33 | relations << relation if relation 34 | end 35 | relations 36 | end 37 | 38 | def save_to_disk(filename) 39 | if FFI.mitie_save_binary_relation_detector(+filename, pointer) != 0 40 | raise Error, "Unable to save detector" 41 | end 42 | nil 43 | end 44 | 45 | private 46 | 47 | def pointer 48 | @pointer 49 | end 50 | 51 | def extract_relation(doc, entity1, entity2) 52 | relation = 53 | FFI.mitie_extract_binary_relation( 54 | doc.model.pointer, 55 | doc.send(:tokens_ptr), 56 | entity1[:token_index], 57 | entity1[:token_length], 58 | entity2[:token_index], 59 | entity2[:token_length] 60 | ) 61 | relation.free = FFI["mitie_free"] 62 | 63 | score_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE, 
Fiddle::RUBY_FREE) 64 | status = FFI.mitie_classify_binary_relation(pointer, relation, score_ptr) 65 | raise Error, "Bad status: #{status}" if status != 0 66 | 67 | score = Utils.read_double(score_ptr) 68 | if score > 0 69 | { 70 | first: entity1[:text], 71 | second: entity2[:text], 72 | score: score 73 | } 74 | end 75 | end 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /lib/mitie/binary_relation_trainer.rb: -------------------------------------------------------------------------------- 1 | module Mitie 2 | class BinaryRelationTrainer 3 | def initialize(ner, name: "") 4 | @pointer = FFI.mitie_create_binary_relation_trainer(+name, ner.pointer) 5 | @pointer.free = FFI["mitie_free"] 6 | end 7 | 8 | def add_positive_binary_relation(tokens, range1, range2) 9 | check_add(tokens, range1, range2) 10 | 11 | tokens_pointer = Utils.array_to_pointer(tokens) 12 | status = FFI.mitie_add_positive_binary_relation(@pointer, tokens_pointer, range1.begin, range1.size, range2.begin, range2.size) 13 | if status != 0 14 | raise Error, "Unable to add binary relation" 15 | end 16 | end 17 | 18 | def add_negative_binary_relation(tokens, range1, range2) 19 | check_add(tokens, range1, range2) 20 | 21 | tokens_pointer = Utils.array_to_pointer(tokens) 22 | status = FFI.mitie_add_negative_binary_relation(@pointer, tokens_pointer, range1.begin, range1.size, range2.begin, range2.size) 23 | if status != 0 24 | raise Error, "Unable to add binary relation" 25 | end 26 | end 27 | 28 | def beta 29 | FFI.mitie_binary_relation_trainer_get_beta(@pointer) 30 | end 31 | 32 | def beta=(value) 33 | raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0 34 | 35 | FFI.mitie_binary_relation_trainer_set_beta(@pointer, value) 36 | end 37 | 38 | def num_threads 39 | FFI.mitie_binary_relation_trainer_get_num_threads(@pointer) 40 | end 41 | 42 | def num_threads=(value) 43 | 
FFI.mitie_binary_relation_trainer_set_num_threads(@pointer, value) 44 | end 45 | 46 | def num_positive_examples 47 | FFI.mitie_binary_relation_trainer_num_positive_examples(@pointer) 48 | end 49 | 50 | def num_negative_examples 51 | FFI.mitie_binary_relation_trainer_num_negative_examples(@pointer) 52 | end 53 | 54 | def train 55 | if num_positive_examples + num_negative_examples == 0 56 | raise Error, "You can't call train() on an empty trainer" 57 | end 58 | 59 | detector = FFI.mitie_train_binary_relation_detector(@pointer) 60 | 61 | raise Error, "Unable to create binary relation detector. Probably ran out of RAM." if detector.null? 62 | 63 | Mitie::BinaryRelationDetector.new(pointer: detector) 64 | end 65 | 66 | private 67 | 68 | def check_add(tokens, range1, range2) 69 | Utils.check_range(range1, tokens.size) 70 | Utils.check_range(range2, tokens.size) 71 | 72 | if entities_overlap?(range1, range2) 73 | raise ArgumentError, "Entities overlap" 74 | end 75 | end 76 | 77 | def entities_overlap?(range1, range2) 78 | FFI.mitie_entities_overlap(range1.begin, range1.size, range2.begin, range2.size) == 1 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /lib/mitie/document.rb: -------------------------------------------------------------------------------- 1 | module Mitie 2 | class Document 3 | attr_reader :model, :text 4 | 5 | def initialize(model, text) 6 | @model = model 7 | @text = text 8 | end 9 | 10 | def tokens 11 | @tokens ||= tokens_with_offset.map(&:first) 12 | end 13 | 14 | def tokens_with_offset 15 | @tokens_with_offset ||= begin 16 | if text.is_a?(Array) 17 | # offsets are unknown when given tokens 18 | text.map { |v| [v, nil] } 19 | else 20 | i = 0 21 | tokens = [] 22 | loop do 23 | token = (tokens_ptr + i * Fiddle::SIZEOF_VOIDP).ptr 24 | break if token.null? 
25 | offset = (offsets_ptr + i * Fiddle::SIZEOF_LONG).to_str(Fiddle::SIZEOF_LONG).unpack1("L!") 26 | tokens << [token.to_s.force_encoding(text.encoding), offset] 27 | i += 1 28 | end 29 | tokens 30 | end 31 | end 32 | end 33 | 34 | def entities 35 | @entities ||= begin 36 | entities = [] 37 | tokens = tokens_with_offset 38 | detections = FFI.mitie_extract_entities(pointer, tokens_ptr) 39 | detections.free = FFI["mitie_free"] 40 | num_detections = FFI.mitie_ner_get_num_detections(detections) 41 | num_detections.times do |i| 42 | pos = FFI.mitie_ner_get_detection_position(detections, i) 43 | len = FFI.mitie_ner_get_detection_length(detections, i) 44 | tag = FFI.mitie_ner_get_detection_tagstr(detections, i).to_s 45 | score = FFI.mitie_ner_get_detection_score(detections, i) 46 | tok = tokens[pos, len] 47 | offset = tok[0][1] 48 | 49 | entity = {} 50 | if offset 51 | finish = tok[-1][1] + tok[-1][0].bytesize 52 | entity[:text] = text.byteslice(offset...finish) 53 | else 54 | entity[:text] = tok.map(&:first) 55 | end 56 | entity[:tag] = tag 57 | entity[:score] = score 58 | entity[:offset] = offset if offset 59 | entity[:token_index] = pos 60 | entity[:token_length] = len 61 | entities << entity 62 | end 63 | entities 64 | end 65 | end 66 | 67 | private 68 | 69 | def pointer 70 | model.pointer 71 | end 72 | 73 | def tokens_ptr 74 | tokenize[0] 75 | end 76 | 77 | def offsets_ptr 78 | tokenize[1] 79 | end 80 | 81 | def tokenize 82 | @tokenize ||= begin 83 | if text.is_a?(Array) 84 | tokens_ptr = Utils.array_to_pointer(text) 85 | [tokens_ptr, nil] 86 | else 87 | offsets_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE) 88 | tokens_ptr = FFI.mitie_tokenize_with_offsets(+text, offsets_ptr) 89 | tokens_ptr.free = FFI["mitie_free"] 90 | offsets_ptr = offsets_ptr.ptr 91 | offsets_ptr.free = FFI["mitie_free"] 92 | 93 | [tokens_ptr, offsets_ptr] 94 | end 95 | end 96 | end 97 | end 98 | end 99 | 
-------------------------------------------------------------------------------- /lib/mitie/ffi.rb: -------------------------------------------------------------------------------- 1 | module Mitie 2 | module FFI 3 | extend Fiddle::Importer 4 | 5 | libs = Mitie.ffi_lib.dup 6 | begin 7 | dlload Fiddle.dlopen(libs.shift) 8 | rescue Fiddle::DLError => e 9 | retry if libs.any? 10 | raise e 11 | end 12 | 13 | # https://github.com/mit-nlp/MITIE/blob/master/mitielib/include/mitie.h 14 | 15 | extern "void mitie_free(void* object)" 16 | extern "char** mitie_tokenize(const char* text)" 17 | extern "char** mitie_tokenize_file(const char* filename)" 18 | extern "char** mitie_tokenize_with_offsets(const char* text, unsigned long** token_offsets)" 19 | 20 | # ner 21 | extern "mitie_named_entity_extractor* mitie_load_named_entity_extractor(const char* filename)" 22 | extern "unsigned long mitie_get_num_possible_ner_tags(const mitie_named_entity_extractor* ner)" 23 | extern "const char* mitie_get_named_entity_tagstr(const mitie_named_entity_extractor* ner, unsigned long idx)" 24 | extern "mitie_named_entity_detections* mitie_extract_entities(const mitie_named_entity_extractor* ner, char** tokens)" 25 | extern "unsigned long mitie_ner_get_num_detections(const mitie_named_entity_detections* dets)" 26 | extern "unsigned long mitie_ner_get_detection_position(const mitie_named_entity_detections* dets, unsigned long idx)" 27 | extern "unsigned long mitie_ner_get_detection_length(const mitie_named_entity_detections* dets, unsigned long idx)" 28 | extern "unsigned long mitie_ner_get_detection_tag(const mitie_named_entity_detections* dets, unsigned long idx)" 29 | extern "const char* mitie_ner_get_detection_tagstr(const mitie_named_entity_detections* dets, unsigned long idx)" 30 | extern "double mitie_ner_get_detection_score(const mitie_named_entity_detections* dets, unsigned long idx)" 31 | 32 | # binary relation detector 33 | extern "mitie_binary_relation_detector* 
mitie_load_binary_relation_detector(const char* filename)" 34 | extern "const char* mitie_binary_relation_detector_name_string(const mitie_binary_relation_detector* detector)" 35 | extern "int mitie_entities_overlap(unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)" 36 | extern "mitie_binary_relation* mitie_extract_binary_relation(const mitie_named_entity_extractor* ner, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)" 37 | extern "int mitie_classify_binary_relation(const mitie_binary_relation_detector* detector, const mitie_binary_relation* relation, double* score)" 38 | 39 | # text categorizer 40 | extern "mitie_text_categorizer* mitie_load_text_categorizer(const char* filename)" 41 | extern "int mitie_categorize_text(const mitie_text_categorizer* tcat, const char** tokens, char** text_tag, double* text_score)" 42 | 43 | # save 44 | extern "int mitie_save_named_entity_extractor(const char* filename, const mitie_named_entity_extractor* ner)" 45 | extern "int mitie_save_binary_relation_detector(const char* filename, const mitie_binary_relation_detector* detector)" 46 | extern "int mitie_save_text_categorizer(const char* filename, const mitie_text_categorizer* tcat)" 47 | 48 | # ner trainer 49 | extern "mitie_ner_training_instance* mitie_create_ner_training_instance(char** tokens)" 50 | extern "unsigned long mitie_ner_training_instance_num_entities(const mitie_ner_training_instance* instance)" 51 | extern "unsigned long mitie_ner_training_instance_num_tokens(const mitie_ner_training_instance* instance)" 52 | extern "int mitie_overlaps_any_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length)" 53 | extern "int mitie_add_ner_training_entity(mitie_ner_training_instance* instance, unsigned long start, unsigned long length, const char* label)" 54 | extern "mitie_ner_trainer* mitie_create_ner_trainer(const 
char* filename)" 55 | extern "unsigned long mitie_ner_trainer_size(const mitie_ner_trainer* trainer)" 56 | extern "int mitie_add_ner_training_instance(mitie_ner_trainer* trainer, const mitie_ner_training_instance* instance)" 57 | extern "void mitie_ner_trainer_set_beta(mitie_ner_trainer* trainer, double beta)" 58 | extern "double mitie_ner_trainer_get_beta(const mitie_ner_trainer* trainer)" 59 | extern "void mitie_ner_trainer_set_num_threads(mitie_ner_trainer* trainer, unsigned long num_threads)" 60 | extern "unsigned long mitie_ner_trainer_get_num_threads(const mitie_ner_trainer* trainer)" 61 | extern "mitie_named_entity_extractor* mitie_train_named_entity_extractor(const mitie_ner_trainer* trainer)" 62 | 63 | # binary relation trainer 64 | extern "mitie_binary_relation_trainer* mitie_create_binary_relation_trainer(const char* relation_name, const mitie_named_entity_extractor* ner)" 65 | extern "unsigned long mitie_binary_relation_trainer_num_positive_examples(const mitie_binary_relation_trainer* trainer)" 66 | extern "unsigned long mitie_binary_relation_trainer_num_negative_examples(const mitie_binary_relation_trainer* trainer)" 67 | extern "int mitie_add_positive_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)" 68 | extern "int mitie_add_negative_binary_relation(mitie_binary_relation_trainer* trainer, char** tokens, unsigned long arg1_start, unsigned long arg1_length, unsigned long arg2_start, unsigned long arg2_length)" 69 | extern "void mitie_binary_relation_trainer_set_beta(mitie_binary_relation_trainer* trainer, double beta)" 70 | extern "double mitie_binary_relation_trainer_get_beta(const mitie_binary_relation_trainer* trainer)" 71 | extern "void mitie_binary_relation_trainer_set_num_threads(mitie_binary_relation_trainer* trainer, unsigned long num_threads)" 72 | extern "unsigned long 
mitie_binary_relation_trainer_get_num_threads(const mitie_binary_relation_trainer* trainer)" 73 | extern "mitie_binary_relation_detector* mitie_train_binary_relation_detector(const mitie_binary_relation_trainer* trainer)" 74 | 75 | # text categorizer trainer 76 | extern "mitie_text_categorizer_trainer* mitie_create_text_categorizer_trainer(const char* filename)" 77 | extern "unsigned long mitie_text_categorizer_trainer_size(const mitie_text_categorizer_trainer* trainer)" 78 | extern "void mitie_text_categorizer_trainer_set_beta(mitie_text_categorizer_trainer* trainer, double beta)" 79 | extern "double mitie_text_categorizer_trainer_get_beta(const mitie_text_categorizer_trainer* trainer)" 80 | extern "void mitie_text_categorizer_trainer_set_num_threads(mitie_text_categorizer_trainer* trainer, unsigned long num_threads)" 81 | extern "unsigned long mitie_text_categorizer_trainer_get_num_threads(const mitie_text_categorizer_trainer* trainer)" 82 | extern "int mitie_add_text_categorizer_labeled_text(mitie_text_categorizer_trainer* trainer, const char** tokens, const char* label)" 83 | extern "mitie_text_categorizer* mitie_train_text_categorizer(const mitie_text_categorizer_trainer* trainer)" 84 | end 85 | end 86 | -------------------------------------------------------------------------------- /lib/mitie/ner.rb: -------------------------------------------------------------------------------- 1 | module Mitie 2 | class NER 3 | attr_reader :pointer 4 | 5 | def initialize(path = nil, pointer: nil) 6 | if path 7 | # better error message 8 | raise ArgumentError, "File does not exist" unless File.exist?(path) 9 | @pointer = FFI.mitie_load_named_entity_extractor(+path) 10 | @pointer.free = FFI["mitie_free"] 11 | elsif pointer 12 | @pointer = pointer 13 | else 14 | raise ArgumentError, "Must pass either a path or a pointer" 15 | end 16 | end 17 | 18 | def tags 19 | FFI.mitie_get_num_possible_ner_tags(pointer).times.map do |i| 20 | FFI.mitie_get_named_entity_tagstr(pointer, 
module Mitie
  # Builds a named entity recognition model from labeled training instances.
  #
  # Wraps a native MITIE NER trainer handle. The handle is registered with
  # mitie_free so it is released when the wrapping pointer is collected.
  class NERTrainer
    # @param filename [String] path to a feature extractor file
    # @raise [ArgumentError] if the file does not exist
    # @raise [Mitie::Error] if the native library returns a NULL trainer
    def initialize(filename)
      raise ArgumentError, "File does not exist" unless File.exist?(filename)

      # String#+@ hands the native call an unfrozen string
      trainer = FFI.mitie_create_ner_trainer(+filename)
      # a NULL handle would otherwise be passed to every later native call
      raise Error, "Unable to create trainer" if trainer.null?

      trainer.free = FFI["mitie_free"]
      @pointer = trainer
    end

    # Adds one training instance to the trainer.
    #
    # @param instance [#pointer] object exposing the native instance pointer
    # @return [Integer] the native status code (0 on success)
    # @raise [Mitie::Error] if the native call reports a non-zero status
    def add(instance)
      status = FFI.mitie_add_ner_training_instance(@pointer, instance.pointer)
      raise Error, "Unable to add training instance. Probably ran out of RAM." unless status.zero?

      # returned unchanged so existing callers that read the status keep working
      status
    end

    # @return [Float] current beta value reported by the native trainer
    def beta
      FFI.mitie_ner_trainer_get_beta(@pointer)
    end

    # @param value [Numeric] new beta, must be >= 0
    # @raise [ArgumentError] if value is negative
    def beta=(value)
      raise ArgumentError, "beta must be greater than or equal to zero" unless value >= 0

      FFI.mitie_ner_trainer_set_beta(@pointer, value)
    end

    # @return [Integer] number of threads the native trainer will use
    def num_threads
      FFI.mitie_ner_trainer_get_num_threads(@pointer)
    end

    # @param value [Integer] number of threads to use when training
    def num_threads=(value)
      FFI.mitie_ner_trainer_set_num_threads(@pointer, value)
    end

    # @return [Integer] number of training instances added so far
    def size
      FFI.mitie_ner_trainer_size(@pointer)
    end

    # Trains a model from the instances added so far.
    #
    # @return [Mitie::NER] model wrapping the trained extractor
    # @raise [Mitie::Error] if the trainer is empty or training returns NULL
    def train
      raise Error, "You can't call train() on an empty trainer" if size.zero?

      extractor = FFI.mitie_train_named_entity_extractor(@pointer)
      raise Error, "Unable to create named entity extractor. Probably ran out of RAM." if extractor.null?

      # NER.new(pointer:) stores the pointer as-is, so register the
      # deallocator here; otherwise the trained model is never released
      extractor.free = FFI["mitie_free"]
      Mitie::NER.new(pointer: extractor)
    end
  end
end
module Mitie
  # Assigns a label and score to text using a native MITIE text categorizer.
  class TextCategorizer
    # Wraps either a model loaded from disk or an existing native pointer.
    #
    # @param path [String, nil] path to a saved categorizer model
    # @param pointer [Object, nil] existing native categorizer pointer
    # @raise [ArgumentError] if the file is missing or neither argument is given
    # @raise [Mitie::Error] if the native library returns a NULL model
    def initialize(path = nil, pointer: nil)
      if path
        # better error message
        raise ArgumentError, "File does not exist" unless File.exist?(path)

        categorizer = FFI.mitie_load_text_categorizer(+path)
        # a NULL model would otherwise be passed to later native calls
        raise Error, "Unable to load model" if categorizer.null?

        categorizer.free = FFI["mitie_free"]
        @pointer = categorizer
      elsif pointer
        @pointer = pointer
      else
        raise ArgumentError, "Must pass either a path or a pointer"
      end
    end

    # Categorizes text.
    #
    # @param text [String, Array<String>] raw text (tokenized with
    #   Mitie.tokenize) or an array that is used as the token list directly
    # @return [Hash] +:tag+ (String) and +:score+ (Float)
    # @raise [Mitie::Error] if the native call reports a non-zero status
    def categorize(text)
      tokens = text.is_a?(Array) ? text : Mitie.tokenize(text)
      tokens_pointer = Utils.array_to_pointer(tokens)
      # output slots filled in by the native call: one pointer, one double
      text_tag = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP, Fiddle::RUBY_FREE)
      text_score = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE)

      if FFI.mitie_categorize_text(@pointer, tokens_pointer, text_tag, text_score) != 0
        raise Error, "Unable to categorize"
      end

      # follow the pointer stored in the slot; that string is released with mitie_free
      text_tag = text_tag.ptr
      text_tag.free = FFI["mitie_free"]

      {
        tag: text_tag.to_s,
        score: Utils.read_double(text_score)
      }
    end

    # Writes the model to disk.
    #
    # @param filename [String] destination path
    # @return [nil]
    # @raise [Mitie::Error] if the native call reports a non-zero status
    def save_to_disk(filename)
      if FFI.mitie_save_text_categorizer(+filename, @pointer) != 0
        raise Error, "Unable to save model"
      end
      nil
    end
  end
end
module Mitie
  # Internal helpers shared by the native wrappers.
  module Utils
    # Builds a NULL-terminated native array of pointers to the given strings.
    #
    # @param text [Array<String>] tokens
    # @return [Fiddle::Pointer] buffer of text.size + 1 pointer-sized slots
    def self.array_to_pointer(text)
      # malloc uses memset to set all bytes to 0, so the extra slot is the terminator
      tokens_ptr = Fiddle::Pointer.malloc(Fiddle::SIZEOF_VOIDP * (text.size + 1), Fiddle::RUBY_FREE)
      text_ptrs = text.map { |v| Fiddle::Pointer[v] }
      text_ptrs.each_with_index do |ptr, i|
        # copy the address of string i into slot i
        tokens_ptr[i * Fiddle::SIZEOF_VOIDP, Fiddle::SIZEOF_VOIDP] = ptr.ref
      end
      # keep the per-string pointers reachable for as long as the buffer is
      tokens_ptr.instance_variable_set(:@mitie_refs, text_ptrs)
      tokens_ptr
    end

    # Validates that range is a non-empty integer range inside 0...num_tokens.
    #
    # Endpoints are compared directly instead of enumerating the range, so
    # beginless, endless and non-integer ranges are rejected with the same
    # ArgumentError as any other bad range.
    #
    # @param range [Range] token index range (inclusive or exclusive end)
    # @param num_tokens [Integer] number of tokens available
    # @return [nil]
    # @raise [ArgumentError] if the range is empty, out of bounds or not an integer range
    def self.check_range(range, num_tokens)
      valid = range.is_a?(Range) && range.begin.is_a?(Integer) && range.end.is_a?(Integer)
      if valid
        first = range.begin
        last = range.exclude_end? ? range.end - 1 : range.end
        valid = first >= 0 && last >= first && last < num_tokens
      end

      raise ArgumentError, "Invalid range" unless valid
    end

    # Reads a native-endian double from the start of the pointer's memory.
    #
    # @param ptr [Fiddle::Pointer] pointer to at least SIZEOF_DOUBLE bytes
    # @return [Float]
    def self.read_double(ptr)
      ptr.to_str(Fiddle::SIZEOF_DOUBLE).unpack1("d")
    end
  end
end
require_relative "test_helper"

# Tests for Mitie::BinaryRelationTrainer: a train/save/load round trip,
# range validation, entity overlap checks and the empty-trainer error.
class BinaryRelationTrainerTest < Minitest::Test
  # Range pairs that must be rejected: empty first range, empty second
  # range, and a second range that runs past the last token.
  INVALID_RANGE_PAIRS = [
    [0...0, 4..4],
    [0..0, 4...4],
    [0..0, 4..5]
  ].freeze

  def test_works
    trainer = Mitie::BinaryRelationTrainer.new(model)
    trainer.add_positive_binary_relation(tokens, 0..0, 4..4)
    trainer.add_negative_binary_relation(tokens, 4...5, 0..0)
    assert_equal 1, trainer.num_positive_examples
    assert_equal 1, trainer.num_negative_examples
    detector = silence_stdout { trainer.train }
    assert_equal "", detector.name

    tempfile = Tempfile.new
    detector.save_to_disk(tempfile.path)
    assert File.exist?(tempfile.path)

    detector = Mitie::BinaryRelationDetector.new(tempfile.path)
    doc = model.doc("Shopify was founded in Ottawa")

    relations = detector.relations(doc)
    assert_equal 1, relations.size

    relation = relations.first
    assert_equal "Shopify", relation[:first]
    assert_equal "Ottawa", relation[:second]
    assert relation[:score]
  ensure
    # do not leave the saved model behind; tempfile is nil if an earlier step failed
    tempfile&.close
    tempfile&.unlink
  end

  def test_add_positive_binary_relation_invalid_range
    assert_invalid_ranges(:add_positive_binary_relation)
  end

  def test_add_negative_binary_relation_invalid_range
    assert_invalid_ranges(:add_negative_binary_relation)
  end

  def test_add_positive_binary_relation_entities_overlap
    trainer = Mitie::BinaryRelationTrainer.new(model)

    error = assert_raises(ArgumentError) do
      trainer.add_positive_binary_relation(tokens, 0..1, 1..2)
    end
    assert_equal "Entities overlap", error.message
  end

  def test_add_negative_binary_relation_entities_overlap
    trainer = Mitie::BinaryRelationTrainer.new(model)

    error = assert_raises(ArgumentError) do
      trainer.add_negative_binary_relation(tokens, 0..1, 1..2)
    end
    assert_equal "Entities overlap", error.message
  end

  def test_empty_trainer
    trainer = Mitie::BinaryRelationTrainer.new(model)
    error = assert_raises(Mitie::Error) do
      trainer.train
    end
    assert_equal "You can't call train() on an empty trainer", error.message
  end

  private

  # Asserts that the given trainer method rejects every invalid range pair.
  def assert_invalid_ranges(method_name)
    trainer = Mitie::BinaryRelationTrainer.new(model)

    INVALID_RANGE_PAIRS.each do |first, second|
      error = assert_raises(ArgumentError) do
        trainer.public_send(method_name, tokens, first, second)
      end
      assert_equal "Invalid range", error.message
    end
  end

  def tokens
    ["Shopify", "was", "founded", "in", "Ottawa"]
  end
end
require_relative "test_helper"

# Tests for the module-level tokenizers Mitie.tokenize and Mitie.tokenize_file.
class MitieTest < Minitest::Test
  def test_tokenize
    tokens = Mitie.tokenize(text)
    assert_equal ["Nat", "works", "at", "GitHub", "in", "San", "Francisco"], tokens
    assert tokens.all? { |t| t.encoding == Encoding::UTF_8 }
  end

  # tokens carry the encoding of the input string
  def test_tokenize_encoding
    tokens = Mitie.tokenize(text.dup.force_encoding(Encoding::US_ASCII))
    assert tokens.all? { |t| t.encoding == Encoding::US_ASCII }
  end

  def test_tokenize_nil
    assert_equal [], Mitie.tokenize(nil)
  end

  def test_tokenize_file
    tempfile = Tempfile.new
    tempfile.write(text)
    # flush so the tokenizer, which opens the path itself, sees the content
    tempfile.flush
    tokens = Mitie.tokenize_file(tempfile.path)
    assert_equal ["Nat", "works", "at", "GitHub", "in", "San", "Francisco"], tokens
    assert tokens.all? { |t| t.encoding == Encoding::ASCII_8BIT }
  ensure
    # do not leave the fixture file behind; tempfile is nil if creation failed
    tempfile&.close
    tempfile&.unlink
  end

  def test_tokenize_file_missing
    error = assert_raises(ArgumentError) do
      Mitie.tokenize_file("missing.txt")
    end
    assert_equal "File does not exist", error.message
  end
end
require_relative "test_helper"

# Tests for Mitie::NERTrainer: accessors, training, and error cases.
class NERTrainerTest < Minitest::Test
  def test_beta_accessors
    subject = build_trainer
    subject.beta = 2.0
    assert_equal 2.0, subject.beta
  end

  def test_beta_writer_raises_on_invalid_input
    subject = build_trainer
    error = assert_raises(ArgumentError) { subject.beta = -0.5 }
    assert_equal "beta must be greater than or equal to zero", error.message
  end

  def test_num_threads_accessors
    subject = build_trainer
    subject.num_threads = 2
    assert_equal 2, subject.num_threads
  end

  def test_train
    words = %w[You can do machine learning in Ruby !]
    example = Mitie::NERTrainingInstance.new(words)
    example.add_entity(3..4, "topic")
    example.add_entity(6..6, "language")

    subject = build_trainer
    subject.add(example)
    subject.num_threads = 2
    trained = silence_stdout { subject.train }

    assert_kind_of Mitie::NER, trained

    first_entity = trained.doc("Hello Ruby").entities.first
    assert_equal "Ruby", first_entity[:text]
    assert_equal "language", first_entity[:tag]
  end

  def test_empty_trainer
    subject = build_trainer
    error = assert_raises(Mitie::Error) { subject.train }
    assert_equal "You can't call train() on an empty trainer", error.message
  end

  def test_missing_file
    error = assert_raises(ArgumentError) { Mitie::NERTrainer.new("missing.dat") }
    assert_equal "File does not exist", error.message
  end

  private

  # Fresh trainer backed by the shared feature extractor file.
  def build_trainer
    Mitie::NERTrainer.new(feature_extractor_path)
  end
end
require "bundler/setup"
Bundler.require(:default)
require "minitest/autorun"
require "minitest/pride"

# Shared helpers for every test case: optional GC stress mode (enabled by
# the STRESS environment variable), a memoized NER model, fixture paths and
# a stdout silencer.
class Minitest::Test
  def setup
    return unless stress?

    # reference the constant so it is autoloaded before GC.stress is on
    Mitie::FFI
    GC.stress = true
  end

  def teardown
    return unless stress?

    GC.stress = false
  end

  # Truthy when the STRESS environment variable is set.
  def stress?
    ENV.fetch("STRESS", nil)
  end

  # memoize for performance
  def model
    @@model ||= Mitie::NER.new("#{models_path}/ner_model.dat")
  end

  # Directory holding the MITIE model files; raises KeyError when unset.
  def models_path
    ENV.fetch("MITIE_MODELS_PATH")
  end

  def feature_extractor_path
    "#{models_path}/total_word_feature_extractor.dat"
  end

  def text
    "Nat works at GitHub in San Francisco"
  end

  # capture_io does not suppress output, so redirect the STDOUT
  # stream itself to the null device while the block runs
  def silence_stdout
    saved_stream = STDOUT.dup
    STDOUT.reopen(IO::NULL)
    STDOUT.sync = true
    yield
  ensure
    STDOUT.reopen(saved_stream)
    saved_stream.close
  end
end
| assert_equal "positive", result[:tag] 33 | assert_in_delta 0.0684, result[:score] 34 | end 35 | 36 | def test_empty_trainer 37 | trainer = Mitie::TextCategorizerTrainer.new(feature_extractor_path) 38 | error = assert_raises(Mitie::Error) do 39 | trainer.train 40 | end 41 | assert_equal "You can't call train() on an empty trainer", error.message 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /vendor/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Boost Software License - Version 1.0 - August 17th, 2003 2 | 3 | Permission is hereby granted, free of charge, to any person or organization 4 | obtaining a copy of the software and accompanying documentation covered by 5 | this license (the "Software") to use, reproduce, display, distribute, 6 | execute, and transmit the Software, and to prepare derivative works of the 7 | Software, and to permit third-parties to whom the Software is furnished to 8 | do so, all subject to the following: 9 | 10 | The copyright notices in the Software and this entire statement, including 11 | the above license grant, this restriction and the following disclaimer, 12 | must be included in all copies of the Software, in whole or in part, and 13 | all derivative works of the Software, unless such copies or derivative 14 | works are solely in the form of machine-executable object code generated by 15 | a source language processor. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 20 | SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 21 | FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 22 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23 | DEALINGS IN THE SOFTWARE. 
24 | --------------------------------------------------------------------------------