├── .gitignore ├── .ruby-gemset ├── .ruby-version ├── Gemfile ├── Gemfile.lock ├── LICENSE ├── README.markdown ├── Rakefile ├── examples └── decision_tree │ ├── cross_validation.rb │ ├── decision_tree.rb │ └── main.rb ├── lib ├── prolly.rb └── prolly │ ├── ps.rb │ ├── ps │ └── storage │ │ ├── base.rb │ │ ├── mongodb.rb │ │ ├── redis.rb │ │ └── rubylist.rb │ ├── rand_var.rb │ └── rand_var │ ├── entropy.rb │ ├── infogain.rb │ ├── pdf.rb │ └── prob.rb ├── prolly.gemspec └── specs ├── ps_spec.rb └── rand_var_spec.rb /.gitignore: -------------------------------------------------------------------------------- 1 | # ignore emacs and other backup files 2 | *~ 3 | \#*\# 4 | .\#* 5 | *.swp 6 | *.swo 7 | 8 | data/ 9 | profile/ 10 | 11 | 12 | -------------------------------------------------------------------------------- /.ruby-gemset: -------------------------------------------------------------------------------- 1 | prolly 2 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | 2.2.0 2 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | #gem "redis", "~>3.2.1" 4 | gem "moped", "~>2.0.3" 5 | gem "rspec" 6 | 7 | -------------------------------------------------------------------------------- /Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | bson (2.3.0) 5 | connection_pool (2.1.1) 6 | diff-lcs (1.2.5) 7 | moped (2.0.3) 8 | bson (~> 2.2) 9 | connection_pool (~> 2.0) 10 | optionable (~> 0.2.0) 11 | optionable (0.2.0) 12 | rspec (3.1.0) 13 | rspec-core (~> 3.1.0) 14 | rspec-expectations (~> 3.1.0) 15 | rspec-mocks (~> 3.1.0) 16 | rspec-core (3.1.7) 17 | rspec-support (~> 3.1.0) 18 | rspec-expectations (3.1.2) 19 | diff-lcs (>= 1.2.0, < 2.0) 20 | rspec-support (~> 3.1.0) 21 | rspec-mocks (3.1.3) 22 | rspec-support (~> 3.1.0) 23 | rspec-support (3.1.2) 24 | 25 | PLATFORMS 26 | ruby 27 | 28 | DEPENDENCIES 29 | moped (~> 2.0.3) 30 | rspec 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # Prolly 2 | 3 | **Prolly is a Domain Specific Language (DSL) for expressing probabilities in code.** 4 | Just like a database has a query language (SQL), this is a query language 5 | specifically for answering questions about probabilities of events based on the 6 | samples you've seen before. 7 | 8 | So instead of counting all the events yourself, you just express 9 | probabilities, entropies, and information gain much like how math books express it. 10 | Being able to express probabilities is useful for writing machine learning 11 | algorithms at a higher level of abstraction. The right level abstraction makes things 12 | easier to build. 13 | 14 | We can now making decisions in code not just based on the current data, like `if` 15 | statements do, but we can make decisions based on the chance of prior data and 16 | the current data, and that makes for smarter software. 17 | 18 | ## What can I use this for? 19 | 20 | There are examples of using Prolly to write learning algorithms. 21 | 22 | - [Decision Tree](https://github.com/iamwilhelm/prolly/tree/master/examples/decision_tree) 23 | 24 | 25 | ## Quick intro 26 | 27 | Prolly makes it easy to express probabilities from data. It can also calculate 28 | entropies of random variables as well as the information gain. 29 | 30 | Here's how to express Bayes Rule in Prolly: 31 | 32 | ``` 33 | Ps.rv(color: blue).given(size: red).prob * Ps.rv(size: red).prob 34 | / Ps.rv(color: blue).prob 35 | ``` 36 | 37 | And the above will calculate P(Size=red | Color= blue) 38 | 39 | ## Installing 40 | 41 | Use ruby gems to install 42 | 43 | `gem install prolly` 44 | 45 | If you use Bundler, just add it to your Gemfile, and then run `bundle install` 46 | 47 | ## Usage 48 | 49 | We first add samples of observable events to be able to estimate the probability of the events we've seen. Then we can query it with Prolly to know the probability of different events. 50 | 51 | ### Adding samples 52 | 53 | Now we add the samples of data that we've observed for the random variable. Presumably, we have a large 54 | enough dataset that we can reasonably estimate each specified RV. 55 | 56 | ``` 57 | require 'prolly' 58 | include Prolly 59 | 60 | Ps.add({ color: :blue, size: :small }) 61 | Ps.add({ color: :blue, size: :big }) 62 | Ps.add({ color: :blue, size: :big }) 63 | Ps.add({ color: :green, size: :big }) 64 | Ps.add({ color: :green, size: :small }) 65 | ``` 66 | 67 | Now that we have samples to estimate our probabilities, we're good to go on how to express them. 68 | 69 | >Note that you need you'll need to `include Prolly` into whatever namespace you're using it in, in order to call `Ps.add`. Otherwise, you'll need 70 | to type: `Prolly::Ps.add`, if `Ps` is already taken in your namespace. 71 | 72 | ### Expressing Stochastics through Probability Space 73 | 74 | `Ps` is short for Probability Space. It's normally denoted by Ω, U (for universal set), or S (for sample set) in probability textbooks. It's the set of all events that could happen. 75 | 76 | You start with probability space. 77 | ``` 78 | Ps 79 | ``` 80 | then pick an specified random variable to examine 81 | ``` 82 | Ps.rv(color: :blue) 83 | ``` 84 | And if necessary, pick a conditional random variable 85 | ``` 86 | Ps.rv(color: :blue).given(size: :small) 87 | ``` 88 | Then pick the operation, where it can be `count`, `prob`, `pdf`, `entropy`, or `infogain`. 89 | ``` 90 | Ps.rv(color: :blue).given(size: :small).prob 91 | ``` 92 | And that will give you the probability of the random variable Color is :blue given that the Size was :small. 93 | 94 | ### Probabilities 95 | 96 | What is the probability there is a blue marble? 97 | ```ruby 98 | # P(C = blue) 99 | Ps.rv(color: :blue).prob 100 | ``` 101 | 102 | What is the joint probability there is a blue marble that also has a rough texture? 103 | ```ruby 104 | # P(C = blue, T = rough) 105 | Ps.rv(color: :blue, texture: :rough).prob 106 | ``` 107 | 108 | What is the probability a marble is small or med sized? 109 | ```ruby 110 | # P(S = small, med) 111 | Ps.rv(size: [:small, :med]).prob 112 | ``` 113 | 114 | What is the probability of a blue marble given that the marble is small? 115 | ```ruby 116 | # P(C = blue | S = small) 117 | Ps.rv(color: :blue).given(size: :small).prob 118 | ``` 119 | 120 | What is the probability of a blue marble and rough texture given that the marble is small? 121 | ```ruby 122 | # P(C = blue, T = rough | S = small) 123 | Ps.rv(color: :blue, texture: :rough).given(size: :small).prob 124 | ``` 125 | 126 | ### Probability density functions 127 | 128 | Probability density for a random variable. 129 | ```ruby 130 | Ps.rv(:color).pdf 131 | ``` 132 | 133 | Probability density for a conditional random variable. 134 | ```ruby 135 | Ps.rv(:color).given(size: :small).pdf 136 | ``` 137 | 138 | ### Entropy 139 | 140 | Entropy of the RV color. 141 | ```ruby 142 | # H(C) 143 | Ps.rv(:color).entropy 144 | ``` 145 | 146 | Entropy of color given the marble is small 147 | ```ruby 148 | # H(C | S = small) 149 | Ps.rv(:color).given(size: :small).entropy 150 | ``` 151 | 152 | ### Information Gain 153 | 154 | Information gain of color and size. 155 | ```ruby 156 | # IG(C | S) 157 | Ps.rv(:color).given(:size).infogain 158 | ``` 159 | 160 | Information gain of color and size, when we already know texture and opacity. 161 | ```ruby 162 | # IG(C | S, T=smooth, O=opaque) 163 | Ps.rv(:color).given(:size, { texture: :smooth, opacity: :opaque }).infogain 164 | ``` 165 | 166 | ### Counts 167 | 168 | At the base of all the probabilities are counts of stuff. 169 | ```ruby 170 | Ps.rv(color: :blue).count 171 | ``` 172 | 173 | ```ruby 174 | Ps.rv(:color).given(:size).count 175 | ``` 176 | ## Full Reference 177 | 178 | A random variable can be specified `Ps.rv(:color)` or unspecified `Ps.rv(color: :blue)`. So too can conditional random variables be specified or unspecified. 179 | 180 | Prolly currently supports five operations. 181 | 182 | - .prob() · Calculates probability, a fractional number representing the belief you have that an event will occur; based on the amount of evidence you've seen for that event. 183 | - .pdf() · Calculates probability density function, a hash of all possible probabilities for the random variable. 184 | - .entropy() · Calculates entropy, a fractional number representing the spikiness or smoothness of a density function, which implies how much information is in the random variable. 185 | - .infogain() · Calculates information gain, a fractional number representing the amount of information (that is, reduction in uncertainty) that knowing either variable provides about the other. 186 | - .count() · Counts the number of events satisfying the conditions. 187 | 188 | Each of the operations will only work with certain combinations of random variables. The possibilities are listed below, and Prolly will throw an exception if it's violated. 189 | 190 | Legend: 191 | - ✓ available for this operator 192 | - Δ! available, but not yet implemented for this operator. 193 | 194 | ### The Probability Operator: .prob() 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 |
n/a.given(:size).given(size: :small).given(size: :small, weight: :fat).given(:size, weight: :fat).given(:size, :weight)
rv(color: :blue)
rv(color: [:blue, :green])
rv(color: :blue, texture: :rough)
rv(:color)
rv(:color, :texture)
252 | 253 | ### The Probability Density Function Operator: .pdf() 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 |
n/a.given(:size).given(size: :small).given(size: :small, weight: :fat).given(:size, weight: :fat).given(:size, :weight)
rv(color: :blue)
rv(color: [:blue, :green])
rv(color: :blue, texture: :rough)
rv(:color)
rv(:color, :texture)Δ!Δ!Δ!Δ!Δ!
311 | 312 | ### The Entropy Operator: .entropy() 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 |
n/a.given(:size).given(size: :small).given(size: :small, weight: :fat).given(:size, weight: :fat).given(:size, :weight)
rv(color: :blue)
rv(color: [:blue, :green])
rv(color: :blue, texture: :rough)
rv(:color)
rv(:color, :texture)Δ!Δ!
370 | 371 | ### The Information Gain Operator: .infogain() 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 |
n/a.given(:size).given(size: :small).given(size: :small, weight: :fat).given(:size, weight: :fat).given(:size, :weight)
rv(color: :blue)
rv(color: [:blue, :green])
rv(color: :blue, texture: :rough)
rv(:color)
rv(:color, :texture)
429 | 430 | ### The Count Operator: .count() 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 |
n/a.given(:size).given(size: :small).given(size: :small, weight: :fat).given(:size, weight: :fat).given(:size, :weight)
rv(color: :blue)
rv(color: [:blue, :green])
rv(color: :blue, texture: :rough)
rv(:color)
rv(:color, :texture)
488 | 489 | ## Stores 490 | 491 | Prolly can use different stores to remember the prior event data from which it 492 | calculates the probability. Currently Prolly implements a RubyList store and a 493 | Mongodb store. 494 | 495 | ### Implementing new stores 496 | 497 | The interface for a new store is pretty easy. It just needs to implement six methods: 498 | 499 | #### initialize 500 | 501 | This just brings up the store, and connects to it, and whatever else you need to do in the beginning. 502 | 503 | #### reset 504 | 505 | This should just clear the entire store of the data in the collection. 506 | 507 | #### add(datum) 508 | 509 | Adds one row of data to the store. 510 | 511 | #### count(rvs, options = {}) 512 | 513 | Counts the number of samples that satisfy the RVs requested. `rvs` can be either an Array or a Hash. When it's an array, you must count all 514 | samples that have all the RVs. 515 | 516 | When it's a hash, you must look for all samples that not only have the random variables, but also have the matching designated 517 | values. Note that the values can be an array. When that happens, the user is indicating that it also would like any of the values the RV to match. 518 | 519 | #### rand_vars 520 | 521 | Return a list of all random variables 522 | 523 | #### uniq_vals(name) 524 | 525 | Return a list of all uniq values of a random variable. 526 | 527 | ## Motivation 528 | 529 | A couple years back, I was reading [a blog post](http://weblog.raganwald.com/2008/02/naive-approach-to-hiring-people.html) by Raganwald, where I read this quote: 530 | 531 |
532 | A very senior Microsoft developer who moved to Google told me that Google works and thinks at a higher level of abstraction than Microsoft. “Google uses Bayesian filtering the way Microsoft uses the if statement,” he said. 533 | 534 | —Joel Spolsky, Microsoft Jet 535 |
536 | 537 | That got me thinking very literally. What would it look like if we have probability 538 | statements to use natively like we have "if" statements? How would that change how 539 | we code? That would mean we could make decisions not just on the information we 540 | have on hand, but the prior information we saw before. 541 | 542 | ## Contributing 543 | 544 | Write some specs, make sure the entire thing passes. Then submit a pull request. 545 | 546 | ## Contributors 547 | 548 | - Wil Chung 549 | 550 | ## License 551 | 552 | MIT license 553 | 554 | ## Changelog 555 | 556 | ### v0.0.1 557 | 558 | - Initial release with counts, probs, pdf, entropy, and infogain. 559 | - implements two stores, RubyList and Mongodb 560 | 561 | ### v0.0.2 562 | 563 | - fix: RubyList storage had a bug that counted incorrectly. 564 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | 2 | def run(cmd) 3 | IO.popen(cmd) { |f| f.each_line { |l| puts l } } 4 | end 5 | 6 | desc "run tests" 7 | task :test do 8 | run "rspec ./specs" 9 | end 10 | 11 | task :default => :test 12 | 13 | namespace :example do 14 | 15 | desc "decision tree" 16 | task :decision_tree do 17 | run "ruby -Ilib examples/decision_tree/main.rb" 18 | end 19 | 20 | end 21 | -------------------------------------------------------------------------------- /examples/decision_tree/cross_validation.rb: -------------------------------------------------------------------------------- 1 | #datum = { size: :large } 2 | #classification = dt.classify(:color, datum) 3 | # 4 | #puts classification 5 | 6 | 7 | require_relative 'decision_tree' 8 | 9 | class CrossValidation 10 | 11 | def initialize(cols, options = {}) 12 | @data = [] 13 | @cols = cols 14 | end 15 | 16 | def setup 17 | yield self 18 | partition_sets 19 | end 20 | 21 | def add(datum) 22 | @data << datum 23 | end 24 | 25 | def run(target_rv) 26 | 27 | learner_and_errors = models.map do |columns| 28 | puts "Model size: #{columns.length}" 29 | ::Ps.reset 30 | learner = ::DecisionTree::Machine.new 31 | 32 | puts "loading..." 33 | learner.load(@training_set) 34 | 35 | puts "learning #{target_rv} for #{columns}" 36 | 37 | learner.learn(target_rv) do |rv| 38 | columns.include?(rv) 39 | end 40 | 41 | puts "checking for errors..." 42 | errors = @cross_validation_set.map do |datum| 43 | expected = [datum[target_rv]] 44 | actual = learner.classify(datum) 45 | expected != actual 46 | end 47 | 48 | error_rate = errors.count { |e| e == true }.to_f / errors.length 49 | puts "Error rate is #{error_rate.round(4)}" 50 | puts 51 | 52 | [learner, columns, error_rate] 53 | end 54 | 55 | puts "!!!!!!!!!!!! learner_and_errors !!!!!!!!!!!!!!" 56 | puts Hash[*learner_and_errors.flatten].values.inspect 57 | puts 58 | 59 | learner_and_errors.each do |learner, columns, _| 60 | # run model on test set for generalized error 61 | errors = @test_set.map do |datum| 62 | expected = [datum[target_rv]] 63 | actual = learner.classify(datum) 64 | expected != actual 65 | end 66 | 67 | error_rate = errors.count { |e| e == true }.to_f / errors.length 68 | 69 | puts "#{columns.inspect} error rate: #{error_rate.round(4)}" 70 | end 71 | puts 72 | 73 | # select model based on least model error 74 | learner, _ = learner_and_errors.min { |a, b| a[1] <=> b[1] } 75 | 76 | # run model on test set for generalized error 77 | errors = @test_set.map do |datum| 78 | expected = [datum[target_rv]] 79 | actual = learner.classify(datum) 80 | expected != actual 81 | end 82 | 83 | error_rate = errors.count { |e| e == true }.to_f / errors.length 84 | 85 | puts "Test set error rate: #{error_rate.round(4)}" 86 | end 87 | 88 | private 89 | 90 | def partition_sets 91 | sixty = (@data.length * 0.6).floor 92 | eighty = (@data.length * 0.8).floor 93 | 94 | @training_set = @data[0..sixty] 95 | @cross_validation_set = @data[sixty..eighty] 96 | @test_set = @data[eighty..@data.length] 97 | end 98 | 99 | def models 100 | [4,4].map { |n| 101 | @cols.reject { |c| 102 | [:fnlwgt, :education_num, :native_country].include?(c) 103 | }.sample(n) 104 | } 105 | end 106 | 107 | end 108 | -------------------------------------------------------------------------------- /examples/decision_tree/decision_tree.rb: -------------------------------------------------------------------------------- 1 | 2 | require 'prolly' 3 | #require 'ruby-prof' 4 | 5 | include Prolly 6 | 7 | module DecisionTree 8 | class << self 9 | def load(cols, filepath) 10 | File.open(filepath, 'r') do |f| 11 | f.each_line do |line| 12 | next if line.chomp.empty? 13 | yield Hash[*cols.zip(line.chomp.split(/,\s*/)).flatten] 14 | end 15 | end 16 | end 17 | end 18 | 19 | class Machine 20 | attr_reader :tree 21 | 22 | def initialize 23 | @tree = nil 24 | end 25 | 26 | def load(data_set) 27 | data_set.each { |datum| 28 | add(datum) 29 | } 30 | end 31 | 32 | def add(example) 33 | ::Ps.add(example) 34 | end 35 | 36 | def split_rv(rv) 37 | if rv.class == Hash 38 | rkey = rv.keys.first 39 | rval = rv[rkey] 40 | else 41 | rkey = rv 42 | rval = nil 43 | end 44 | return rkey, rval 45 | end 46 | 47 | def putss(rvs, str) 48 | space = @rv_size - rvs.size 49 | puts (" " * (space) * 4) + str 50 | end 51 | 52 | def classify(datum) 53 | classify_helper(@tree, datum) 54 | # recursively traverse down the tree and figure out the decision. 55 | #classify_helper(....?) 56 | end 57 | 58 | # recursive. 59 | def classify_helper(node, datum) 60 | if node.kind_of?(Hash) 61 | max_result = node.max { |a, b| a[1] <=> b[1] } 62 | return max_result[0] 63 | else 64 | # puts "node: #{node}" 65 | unless datum.has_key?(node.name) 66 | raise "Missing column #{node.name} in datum" 67 | end 68 | 69 | unless node.children.has_key?([datum[node.name]]) 70 | return nil 71 | end 72 | 73 | # puts datum[node.name] 74 | return classify_helper(node.children[[datum[node.name]]], datum) 75 | end 76 | end 77 | 78 | def learn(rv_target, &block) 79 | #RubyProf.start 80 | 81 | tkey, tval = split_rv(rv_target) 82 | rvs = ::Ps.rv.reject { |rv| rv == tkey } 83 | rvs.reject! do |key| 84 | !block.call(key) 85 | end 86 | @rv_size = rvs.size 87 | 88 | @tree = create_node(rv_target, {}, rvs, &block) 89 | 90 | #result = RubyProf.stop 91 | #printer = RubyProf::MultiPrinter.new(result) 92 | #printer.print(:path => "./profile", :profile => "profile", :min_percent => 2) 93 | end 94 | 95 | # rv_target - the variable we're trying to learn 96 | # rv_parents - hash of past decisions in branch 97 | # rand_vars - remaining rand_vars to decide on 98 | # block - for filtering which key to use 99 | def create_node(rv_target, rv_parents, rand_vars, &block) 100 | tkey, tval = split_rv(rv_target) 101 | #pkey, pval = split_rv(rv_parent) 102 | 103 | # calculate all gains for remaining rand vars 104 | gains = rand_vars.map do |key| 105 | ig = ::Ps.rv(tkey).given(key, rv_parents).infogain 106 | putss rand_vars, "#{tkey} | #{key}, #{rv_parents} = #{ig}" 107 | [ key, ig ] 108 | end 109 | putss rand_vars, "Gains: #{gains.to_s}" 110 | 111 | # find the next RV 112 | # use the rkey and remove it from list of candidate rand_vars 113 | rkey, _ = gains.max { |a, b| 114 | if a[1].nan? and b[1].nan? 115 | 0 116 | elsif a[1].nan? 117 | -1 118 | elsif b[1].nan? 119 | 1 120 | else 121 | a[1] <=> b[1] 122 | end 123 | } 124 | gains.delete_if { |ig| ig[0] == rkey } 125 | new_rand_vars = gains.map { |g| g[0] } 126 | 127 | # create node to attach to parent node 128 | putss rand_vars, "Using :#{rkey} for node with parents #{rv_parents} to create node" 129 | node = Node.new(rkey) 130 | 131 | # create a child node for every value of selected rkey 132 | ::Ps.uniq_vals([rkey]).each do |rval| 133 | rval_str = rval.first 134 | new_rv_parents = rv_parents.clone.merge(rkey => rval_str) 135 | 136 | putss rand_vars, "P(#{tkey} | #{new_rv_parents}) =" 137 | prob_distr = ::Ps.rv(tkey).given(new_rv_parents).pdf 138 | putss rand_vars, "-- #{prob_distr}" 139 | 140 | ## base case 0 141 | #if gains.empty? 142 | # putss rand_vars, "Base Case 0 #{rkey}: no more rvs" 143 | # node.add(rval_str, prob_distr) 144 | # next 145 | #end 146 | 147 | # base case 2 148 | if gains.all? { |ig| ig[1] == 0.0 } 149 | putss rand_vars, gains.inspect 150 | putss rand_vars, "Base Case 2 #{rkey}: Gains all zero" 151 | node.add(rval, prob_distr) 152 | next 153 | end 154 | 155 | # base case 1 156 | ent = ::Ps.rv(tkey).given(new_rv_parents).entropy 157 | putss rand_vars, "H(#{tkey} | #{new_rv_parents}) =" 158 | putss rand_vars, "-- #{ent}" 159 | if ent == 0.0 160 | putss rand_vars, "Base Case 1 #{rkey}: H(#{tkey} | #{new_rv_parents}) = 0" 161 | node.add(rval, prob_distr) 162 | next 163 | end 164 | 165 | putss rand_vars, "Creating child node for #{rkey} = #{rval}" 166 | child_node = create_node(rv_target, new_rv_parents, new_rand_vars, &block) 167 | node.add(rval, child_node) 168 | end 169 | 170 | puts 171 | 172 | return node 173 | end 174 | end 175 | 176 | class Node 177 | attr_accessor :name 178 | attr_reader :children 179 | 180 | def initialize(name = nil) 181 | @name = name 182 | @children = {} 183 | end 184 | 185 | def add(val, node) 186 | return if node.nil? 187 | @children[val] = node 188 | end 189 | 190 | def inspect 191 | result = "{ " 192 | result += %Q{"name": "#{@name}", } 193 | result += %Q{"children": \{} 194 | @children.each do |child_name, child_node| 195 | result += %Q{"#{child_name}"} 196 | result += " => " 197 | result += child_node.inspect 198 | result += ", " 199 | end 200 | result += " }" 201 | result += " }" 202 | end 203 | 204 | end 205 | 206 | end 207 | 208 | 209 | -------------------------------------------------------------------------------- /examples/decision_tree/main.rb: -------------------------------------------------------------------------------- 1 | 2 | require_relative 'cross_validation' 3 | require_relative 'decision_tree' 4 | 5 | cols = [ 6 | :age, :workclass, :fnlwgt, :education, :education_num, :marital_status, 7 | :occupation, :relationship, :race, :sex, :capital_gain, :capital_loss, 8 | :hours_per_week, :native_country, :income 9 | ] 10 | 11 | def discretize(datum) 12 | Hash[*datum.flat_map { |k, v| 13 | discretized_value = case k 14 | when :age 15 | discretize_age(v) 16 | when :capital_gain 17 | discretize_capital_gain(v) 18 | when :capital_loss 19 | discretize_capital_loss(v) 20 | when :hours_per_week 21 | discretize_hours_per_week(v) 22 | else 23 | v 24 | end 25 | [k, discretized_value] 26 | }] 27 | end 28 | 29 | def discretize_age(v) 30 | v = v.to_i 31 | if v <= 18 32 | return "<= 18" 33 | elsif v > 18 and v <= 25 34 | return "> 18 and <= 25" 35 | elsif v > 25 and v <= 30 36 | return "> 25 and <= 30" 37 | elsif v > 40 and v <= 50 38 | return "> 40 and 50" 39 | elsif v > 50 and v <= 60 40 | return "> 50 and 60" 41 | else 42 | return "> 60" 43 | end 44 | end 45 | 46 | def discretize_capital_gain(v) 47 | v = v.to_i 48 | if v <= 10000 49 | return "<= 10000" 50 | elsif v > 10000 and v <= 20000 51 | return "10000 and 20000" 52 | elsif v > 20000 and v <= 30000 53 | return "20000 and 30000" 54 | elsif v > 30000 and v <= 40000 55 | return "30000 and 40000" 56 | elsif v > 40000 and v <= 50000 57 | return "40000 and 50000" 58 | elsif v > 50000 and v <= 60000 59 | return "50000 and 60000" 60 | elsif v > 60000 and v <= 70000 61 | return "60000 and 70000" 62 | elsif v > 70000 and v <= 80000 63 | return "70000 and 80000" 64 | elsif v > 80000 and v <= 90000 65 | return "80000 and 90000" 66 | else 67 | return "< 90000" 68 | end 69 | end 70 | 71 | def discretize_capital_loss(v) 72 | v = v.to_i 73 | if v <= 10000 74 | return "<= 10000" 75 | elsif v > 10000 and v <= 20000 76 | return "10000 and 20000" 77 | elsif v > 20000 and v <= 30000 78 | return "20000 and 30000" 79 | elsif v > 30000 and v <= 40000 80 | return "30000 and 40000" 81 | elsif v > 40000 and v <= 50000 82 | return "40000 and 50000" 83 | elsif v > 50000 and v <= 60000 84 | return "50000 and 60000" 85 | elsif v > 60000 and v <= 70000 86 | return "60000 and 70000" 87 | elsif v > 70000 and v <= 80000 88 | return "70000 and 80000" 89 | elsif v > 80000 and v <= 90000 90 | return "80000 and 90000" 91 | else 92 | return "< 90000" 93 | end 94 | end 95 | 96 | def discretize_hours_per_week(v) 97 | v = v.to_i 98 | if v <= 20 99 | return "< 20" 100 | elsif v > 20 and v <= 30 101 | return "20 and 30" 102 | elsif v > 30 and v <= 40 103 | return "30 and 40" 104 | else 105 | return "> 40" 106 | end 107 | end 108 | 109 | cv = CrossValidation.new(cols, {}) 110 | 111 | cv.setup do |cv| 112 | DecisionTree.load(cols, "data/adult.csv") do |example| 113 | data = discretize(example) 114 | data.delete(:fnlwgt) 115 | data.delete(:education_num) 116 | data.delete(:native_country) 117 | cv.add(data) 118 | end 119 | end 120 | 121 | cv.run(:income) 122 | 123 | 124 | -------------------------------------------------------------------------------- /lib/prolly.rb: -------------------------------------------------------------------------------- 1 | require 'prolly/ps' 2 | 3 | -------------------------------------------------------------------------------- /lib/prolly/ps.rb: -------------------------------------------------------------------------------- 1 | require "forwardable" 2 | require "prolly/rand_var" 3 | 4 | require "prolly/ps/storage/rubylist" 5 | require "prolly/ps/storage/mongodb" 6 | #require "prolly/ps/storage/redis" 7 | 8 | module Prolly 9 | class Ps 10 | 11 | class << self 12 | def ps 13 | @ps ||= Ps.new 14 | end 15 | 16 | def import(data) 17 | ps.import(data) 18 | end 19 | 20 | def reset 21 | ps.reset 22 | end 23 | 24 | def add(datum) 25 | ps.add(datum) 26 | end 27 | 28 | def rv(*rand_vars) 29 | if rand_vars.empty? 30 | ps.rand_vars 31 | else 32 | RandVar.new(ps, *rand_vars) 33 | end 34 | end 35 | 36 | def stash 37 | ps.stash 38 | end 39 | 40 | # unique values for a random variable. 41 | # 42 | # If there are multiple random variables, then we get combinations of the unique 43 | # values of the random variables 44 | def uniq_vals(uspec_rvs) 45 | 46 | def combo(list_of_vals) 47 | if list_of_vals.length == 1 48 | list_of_vals.first.map { |e| [e] } 49 | else 50 | combinations = combo(list_of_vals[1..-1]) 51 | list_of_vals.first.flat_map { |val| combinations.map { |e| [val] + e } } 52 | end 53 | end 54 | 55 | combo(uspec_rvs.map { |uspec_rv| @ps.uniq_vals(uspec_rv) }) 56 | end 57 | end 58 | 59 | extend Forwardable 60 | 61 | def_delegators :@storage, :reset, :add, :count, :rand_vars, :uniq_vals, :import 62 | 63 | def initialize(storage = nil) 64 | #@storage = Storage::Mongodb.new() 65 | @storage = Storage::Rubylist.new() 66 | #@storage = Storage::Redis.new() 67 | end 68 | 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/prolly/ps/storage/base.rb: -------------------------------------------------------------------------------- 1 | 2 | module Prolly 3 | class Ps 4 | module Storage 5 | 6 | class Base 7 | 8 | def initialize 9 | end 10 | 11 | def reset 12 | @stash ||= {} 13 | @stash_stats ||= { hits: 0, misses: 0 } 14 | @stash_time ||= {} 15 | end 16 | 17 | def import(data) 18 | data.each { |datum| add(datum) } 19 | end 20 | 21 | def add(datum) 22 | raise StandardError.new("not implemented") 23 | end 24 | 25 | def count(rvs, options = {}) 26 | raise StandardError.new("not implemented") 27 | end 28 | 29 | def rand_vars 30 | end 31 | 32 | def uniq_vals(name) 33 | end 34 | 35 | end 36 | 37 | end 38 | end 39 | end 40 | 41 | -------------------------------------------------------------------------------- /lib/prolly/ps/storage/mongodb.rb: -------------------------------------------------------------------------------- 1 | require 'date' 2 | require 'moped' 3 | 4 | require 'prolly/ps/storage/base' 5 | 6 | module Prolly 7 | class Ps 8 | module Storage 9 | 10 | class Mongodb < Base 11 | 12 | attr_reader :session 13 | 14 | def initialize 15 | @session ||= Moped::Session.new(["127.0.0.1:27017", "127.0.0.1:27018"]) 16 | @session.use 'pspace' 17 | 18 | super 19 | @rand_vars = [] 20 | end 21 | 22 | def reset 23 | super 24 | @session['samples'].drop 25 | end 26 | 27 | def add(datum) 28 | # create an index for each new datum key 29 | #new_rvs(datum).each do |rv| 30 | # @session.indexes.create(rv.to_sym => 1) 31 | #end 32 | 33 | record_new_rand_vars(datum) 34 | 35 | @session[:samples].insert(datum) 36 | end 37 | 38 | def count(rvs, options = {}) 39 | reload = options["reload"] || false 40 | if rvs.kind_of?(Array) 41 | @session[:samples].find( 42 | Hash[*rvs.flat_map { |rv| [rv, { '$exists' => true }] }] 43 | ).count 44 | elsif rvs.kind_of?(Hash) 45 | @session[:samples].find(to_query_hash(rvs)).count 46 | end 47 | end 48 | 49 | def rand_vars 50 | @session[:rand_vars].find.map { |rv| rv[:name] } 51 | end 52 | 53 | def uniq_vals(name) 54 | @session[:samples].aggregate([ 55 | { "$match" => { name.to_sym => { "$exists" => true } } }, 56 | { "$group" => { "_id": "$#{name}" } } 57 | ]).map { |e| e["_id"] } 58 | end 59 | 60 | private 61 | 62 | def new_rvs(datum) 63 | return datum.keys - rand_vars 64 | end 65 | 66 | def record_new_rand_vars(datum) 67 | new_rvs(datum).each do |rv| 68 | @session[:rand_vars].insert({ name: rv }) 69 | end 70 | end 71 | 72 | def to_query_hash(rvs) 73 | Hash[*rvs.flat_map { |k, v| 74 | [k, v.kind_of?(Array) ? { "$in" => v } : v] 75 | }] 76 | end 77 | 78 | end 79 | 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /lib/prolly/ps/storage/redis.rb: -------------------------------------------------------------------------------- 1 | require "redis" 2 | 3 | require 'prolly/ps/storage/base' 4 | 5 | module Prolly 6 | class Ps 7 | module Storage 8 | 9 | class Redis 10 | 11 | def initialize(data) 12 | @redis = ::Redis.new(host: "localhost", port: "6379") 13 | reset 14 | import(data) unless data.nil? 15 | end 16 | 17 | def reset 18 | @redis.keys("pspace:*").each { |k| @redis.del k } 19 | end 20 | 21 | def import(data) 22 | data.each { |datum| add(datum) } 23 | end 24 | 25 | def add(datum) 26 | datum.each do |rv, val| 27 | @redis.sadd "pspace:rand_vars", rv 28 | @redis.sadd "pspace:uniq_vals:#{rv}", val 29 | 30 | @redis.PFADD "pspace:count:#{rv}", datum.object_id.to_i 31 | @redis.PFADD "pspace:count:#{rv}=#{val}", datum.object_id.to_i 32 | 33 | end 34 | end 35 | 36 | def count(rvs, options = {}) 37 | if rvs.kind_of?(Array) 38 | @redis.pfcount *rvs.map { |rv| "pspace:count:#{rv}" } 39 | elsif rvs.kind_of?(Hash) 40 | @redis.pfcount *rvs.map { |rv, val| "pspace:count:#{rv}=#{val}" } 41 | end 42 | end 43 | 44 | def rand_vars 45 | @redis.smembers "pspace:rand_vars" 46 | end 47 | 48 | def uniq_vals(rv) 49 | @redis.smembers "pspace:uniq_vals:#{rv}" 50 | end 51 | 52 | end 53 | 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/prolly/ps/storage/rubylist.rb: -------------------------------------------------------------------------------- 1 | require 'prolly/ps/storage/base' 2 | 3 | module Prolly 4 | class Ps 5 | module Storage 6 | 7 | class Rubylist < Base 8 | 9 | def initialize 10 | super 11 | end 12 | 13 | def reset 14 | super 15 | @data = [] 16 | @uniq_vals = {} 17 | end 18 | 19 | def add(datum) 20 | @data << datum 21 | end 22 | 23 | def count(rvs, options = {}) 24 | reload = options[:reload] || false 25 | if rvs.kind_of?(Array) 26 | value = @data.count { |e| rvs.all? { |rv| e.has_key?(rv) } } 27 | elsif rvs.kind_of?(Hash) 28 | value = @data.count { |e| 29 | rvs.map { |rkey, rval| 30 | vals = rval.kind_of?(Array) ? rval : [rval] 31 | vals.include?(e[rkey]) 32 | }.all? 33 | } 34 | end 35 | return value 36 | end 37 | 38 | def rand_vars 39 | @data.first.keys 40 | end 41 | 42 | def uniq_vals(name) 43 | @uniq_vals[name] ||= @data.map { |li| li.has_key?(name) ? li[name] : nil }.uniq 44 | end 45 | 46 | private 47 | 48 | def explain(rvs, options = {}) 49 | end 50 | 51 | def stats(options = {}) 52 | end 53 | 54 | def display_stats 55 | require 'pp' 56 | puts "------------- Stats! --------------------" 57 | puts 58 | pp @stash_time.sort { |a, b| b[1][:usage] <=> a[1][:usage] }[0..10] 59 | puts 60 | pp @stash_time.sort { |a, b| b[1][:elapsed] <=> a[1][:elapsed] }[0..10] 61 | puts 62 | puts @stash_stats.inspect 63 | puts 64 | end 65 | 66 | end 67 | 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/prolly/rand_var.rb: -------------------------------------------------------------------------------- 1 | require 'prolly/rand_var/prob' 2 | require 'prolly/rand_var/pdf' 3 | require 'prolly/rand_var/entropy' 4 | require 'prolly/rand_var/infogain' 5 | 6 | module Prolly 7 | 8 | class RandVar 9 | 10 | include Prob 11 | include Pdf 12 | include Entropy 13 | include Infogain 14 | 15 | def initialize(pspace, *rand_vars) 16 | @pspace = pspace 17 | 18 | @uspec_rv, @spec_rv = parse(rand_vars) 19 | 20 | @uspec_gv = [] 21 | @spec_gv = {} 22 | end 23 | 24 | # parses rand_var arguments 25 | # 26 | # random variable are passed in as arguments to a method. It can take the format of: 27 | # 28 | # :size 29 | # 30 | # { size: :large, color: :green } 31 | # 32 | # [ :size, { color: :green, texture: :rough } ] 33 | # 34 | def parse(rand_vars) 35 | if rand_vars.kind_of?(Hash) 36 | specified_rvs = rand_vars 37 | unspecified_rvs = [] 38 | elsif rand_vars.kind_of?(Array) 39 | specified_rvs, unspecified_rvs = rand_vars.partition { |e| e.kind_of?(Hash) } 40 | specified_rvs = specified_rvs.inject({}) { |t, e| t.merge(e) } 41 | else # if it's a symbol 42 | specified_rvs = [] 43 | unspecified_rvs = [rand_vars] 44 | end 45 | 46 | return unspecified_rvs, specified_rvs 47 | end 48 | 49 | def given(*rand_vars) 50 | @uspec_gv, @spec_gv = parse(rand_vars) 51 | 52 | return self 53 | end 54 | 55 | def count 56 | if !@spec_rv.empty? 57 | if @uspec_gv.empty? and @spec_gv.empty? 58 | @pspace.count(@spec_rv) 59 | else 60 | @pspace.count(@spec_rv.merge(@spec_gv)) 61 | end 62 | else 63 | @pspace.count(@uspec_rv) 64 | end 65 | end 66 | 67 | end 68 | 69 | end 70 | -------------------------------------------------------------------------------- /lib/prolly/rand_var/entropy.rb: -------------------------------------------------------------------------------- 1 | module Prolly 2 | class RandVar 3 | 4 | module Entropy 5 | 6 | # Entropy doesn't take hashes (for now?) 7 | # If it did, I'm not sure what H(color=green) means at all. 8 | def entropy 9 | if !@spec_rv.empty? 10 | raise "Cannot use entropy with specified random variables" 11 | else 12 | #puts "H(#{@rv} | #{@gv})" 13 | 14 | if @uspec_gv.empty?# and @spec_gv.empty? 15 | entropy_rv 16 | else 17 | entropy_rv_gv 18 | end 19 | 20 | end 21 | end 22 | 23 | private 24 | 25 | # H(color) 26 | # H(color, size) 27 | # H(color | size=small) 28 | # H(color, size | texture=smooth) 29 | # H(color | size=small, texture=smooth) 30 | def entropy_rv 31 | distr = pdf 32 | distr.inject(0) do |t, kv| 33 | name, pn = kv 34 | t += -pn * (pn == 0 ? 0.0 : Math.log(pn)) / Math.log(10) 35 | end 36 | end 37 | 38 | # H(color | size) 39 | # H(color, weight | size, texture = smooth) 40 | # H(color | size, texture = smooth) 41 | def entropy_rv_gv 42 | ::Ps.uniq_vals(@uspec_gv).inject(0) do |t, gv_vals| 43 | uspec_gv_speced = Hash[*@uspec_gv.zip(gv_vals).flatten] 44 | gv = @spec_gv.merge(uspec_gv_speced) 45 | 46 | pn = Ps.rv(gv).given(@spec_gv).prob 47 | hn = Ps.rv(*@uspec_rv).given(gv).entropy 48 | 49 | #puts "P(#{gv} | #{@spec_gv}) = #{pn}" 50 | #puts "H(#{@uspec_rv} | #{gv}) = #{hn}" 51 | #puts " #{Ps.rv(*@uspec_rv).given(gv).prob}" 52 | 53 | t += (pn * hn) 54 | end 55 | end 56 | 57 | end 58 | 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /lib/prolly/rand_var/infogain.rb: -------------------------------------------------------------------------------- 1 | module Prolly 2 | class RandVar 3 | 4 | module Infogain 5 | 6 | # I(Y | X) 7 | # I(Y | X, A = a) 8 | # I(Y | X, A = a, B = b) 9 | def infogain 10 | raise "Need given var" if @uspec_gv.empty? and @spec_gv.empty? 11 | raise "Need unspecified given var" if @uspec_gv.empty? 12 | raise "Need unspecified rand var" if @uspec_rv.empty? 13 | 14 | # puts "I(#{@rv} | #{@gv})" 15 | Ps.rv(*@uspec_rv).given(@spec_gv).entropy - 16 | Ps.rv(*@uspec_rv).given(*@uspec_gv, @spec_gv).entropy 17 | end 18 | 19 | end 20 | 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/prolly/rand_var/pdf.rb: -------------------------------------------------------------------------------- 1 | module Prolly 2 | class RandVar 3 | 4 | module Pdf 5 | 6 | def pdf 7 | if !@spec_rv.empty? 8 | 9 | raise StandardError.new("Cannot use pdf on this RV") 10 | 11 | #if @uspec_gv.empty? and @spec_gv.empty? 12 | # prob_rv_eq 13 | #else 14 | # prob_rv_eq_gv_eq 15 | #end 16 | 17 | else 18 | #puts "distr : #{@rv.to_s} : #{@gv.to_s}" 19 | 20 | if @uspec_gv.empty? and @spec_gv.empty? 21 | prob_rv 22 | elsif not @spec_gv.empty? 23 | prob_rv_gv_eq 24 | else 25 | prob_rv_gv 26 | end 27 | 28 | end 29 | 30 | end 31 | 32 | private 33 | 34 | # P(color) = [P(color=green), P(color=blue)] 35 | # P(color, size) = [every combo of color and size] 36 | def prob_rv 37 | distr = ::Ps.uniq_vals(@uspec_rv).flat_map do |rv_vals| 38 | spec_rv = Hash[*@uspec_rv.zip(rv_vals).flatten] 39 | [rv_vals, Ps.rv(spec_rv).prob] 40 | end 41 | 42 | Hash[*distr] 43 | end 44 | 45 | # P(color | size=small) = 46 | # [P(color=green | size=small), P(color=blue | size=small)] 47 | # P(color | size=small, texture=smooth) = 48 | # [P(every color | size=small, texture=smooth)] 49 | def prob_rv_gv_eq 50 | distr = ::Ps.uniq_vals(@uspec_rv).flat_map do |rv_vals| 51 | spec_rv = Hash[*@uspec_rv.zip(rv_vals).flatten] 52 | [rv_vals, Ps.rv(spec_rv).given(@spec_gv).prob] 53 | end 54 | 55 | Hash[*distr] 56 | end 57 | 58 | # P(color | size) = 59 | # [P(color=green | size), P(color=blue | size)] 60 | # TODO not tested 61 | def prob_rv_gv 62 | rv = @uspec_rv.first 63 | gv = @uspec_gv.first 64 | 65 | distr = @pspace.uniq_vals(rv).flat_map do |rv_val| 66 | #puts "rv | gv : #{rv.to_s} | #{@gv.to_s}" 67 | 68 | [rv_val, Ps.rv(rv.to_sym => rv_val).given(gv.to_sym).prob] 69 | end 70 | Hash[*distr] 71 | end 72 | 73 | 74 | 75 | end 76 | 77 | end 78 | end 79 | -------------------------------------------------------------------------------- /lib/prolly/rand_var/prob.rb: -------------------------------------------------------------------------------- 1 | module Prolly 2 | class RandVar 3 | 4 | module Prob 5 | 6 | def prob 7 | #puts "P(#{@rv} | #{@gv})" 8 | raise StandardError.new("Cannot use prob on this RV") if @spec_rv.empty? 9 | 10 | if @uspec_gv.empty? and @spec_gv.empty? 11 | prob_rv_eq 12 | else 13 | prob_rv_eq_gv_eq 14 | end 15 | end 16 | 17 | private 18 | 19 | # P(color=green) 20 | # P(color=green, size=small) 21 | # P(color=[green, blue]) 22 | def prob_rv_eq 23 | numer = self.count() 24 | denom = @pspace.count(@spec_rv.keys) 25 | 26 | if denom == 0.0 27 | return 0.0 28 | else 29 | return numer.to_f / denom 30 | end 31 | end 32 | 33 | # P(color=green | size=small) 34 | # P(color=green, size=small | texture=smooth) 35 | # P(color=green | size=small, texture=smooth) 36 | def prob_rv_eq_gv_eq 37 | numer = @pspace.count(@spec_rv.merge(@spec_gv)) 38 | denom = @pspace.count(@spec_gv) 39 | 40 | if denom == 0.0 41 | return 0.0 42 | else 43 | return numer.to_f / denom 44 | end 45 | end 46 | 47 | # P(color=green | size) 48 | # 49 | # For now, this is like P(color=green) 50 | def prob_rv_eq_gv 51 | numer = @pspace.count(@spec_rv) 52 | denom = @pspace.count(@uspec_gv) 53 | 54 | return numer.to_f / denom 55 | end 56 | 57 | 58 | end 59 | 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /prolly.gemspec: -------------------------------------------------------------------------------- 1 | 2 | Gem::Specification.new do |s| 3 | s.name = "prolly" 4 | s.version = '0.0.2' 5 | s.date = '2015-02-15' 6 | s.summary = 'Domain Specific Language for expressing probabilities in code' 7 | s.description = "Just like a database has a query language like SQL this is a query language specifically for answering questions about probabilities of events based on the samples you have seen before" 8 | s.authors = ["Wil Chung"] 9 | s.email = "iamwil@gmail.com" 10 | s.files = [Dir.glob(File.join('lib', '**', '**')), 'LICENSE', 'README.markdown'].flatten 11 | s.add_runtime_dependency('moped', '~> 2.0', '>= 2.0.3') 12 | s.homepage = "https://github.com/iamwilhelm/prolly" 13 | s.license = 'MIT' 14 | end 15 | 16 | -------------------------------------------------------------------------------- /specs/ps_spec.rb: -------------------------------------------------------------------------------- 1 | $:.unshift "lib" 2 | 3 | require "rspec" 4 | require "prolly" 5 | 6 | include Prolly 7 | 8 | describe Ps do 9 | let(:data) { 10 | [ 11 | { :color => :green, :size => :small, :texture => :smooth }, 12 | { :color => :blue, :size => :small, :texture => :rough }, 13 | { :color => :blue, :size => :med, :texture => :smooth }, 14 | { :color => :green, :size => :large, :texture => :rough }, 15 | { :color => :green, :size => :small, :texture => :smooth }, 16 | ] 17 | } 18 | 19 | before do 20 | Ps.reset 21 | Ps.import(data) 22 | end 23 | 24 | describe "#uniq_vals" do 25 | context "when asking for uniq vals of colors" do 26 | it "is green and blue" do 27 | result = Ps.uniq_vals([:color]) 28 | expect(result).to include([:blue]) 29 | expect(result).to include([:green]) 30 | end 31 | end 32 | 33 | context "when asking for uniq vals of colors and sizes" do 34 | it "is combinations of green blue, small, med, and large" do 35 | result = Ps.uniq_vals([:color, :size]) 36 | expect(result).to include([:green, :small]) 37 | expect(result).to include([:green, :med]) 38 | expect(result).to include([:green, :large]) 39 | expect(result).to include([:blue, :small]) 40 | expect(result).to include([:blue, :med]) 41 | expect(result).to include([:blue, :large]) 42 | end 43 | end 44 | 45 | context "when asking for uniq vals of colors, sizes, and textures" do 46 | it "is combinations of blue, green, small, med, large, smooth, and rough" do 47 | result = Ps.uniq_vals([:color, :size, :texture]) 48 | expect(result).to include([:green, :small, :smooth]) 49 | expect(result).to include([:green, :med , :smooth]) 50 | expect(result).to include([:green, :large, :smooth]) 51 | expect(result).to include([:blue, :small , :smooth]) 52 | expect(result).to include([:blue, :med , :smooth]) 53 | expect(result).to include([:blue, :large , :smooth]) 54 | 55 | expect(result).to include([:green, :small, :rough]) 56 | expect(result).to include([:green, :med , :rough]) 57 | expect(result).to include([:green, :large, :rough]) 58 | expect(result).to include([:blue, :small , :rough]) 59 | expect(result).to include([:blue, :med , :rough]) 60 | expect(result).to include([:blue, :large , :rough]) 61 | end 62 | end 63 | end 64 | 65 | end 66 | -------------------------------------------------------------------------------- /specs/rand_var_spec.rb: -------------------------------------------------------------------------------- 1 | require "rspec" 2 | 3 | require "prolly" 4 | 5 | include Prolly 6 | 7 | describe RandVar do 8 | let(:data) { 9 | [ 10 | { color: :green, size: :small, texture: :smooth, weight: :fat , opacity: :transparent }, 11 | { color: :blue, size: :small, texture: :rough , weight: :thin, opacity: :transparent }, 12 | { color: :blue, size: :med, texture: :smooth, weight: :thin, opacity: :opaque }, 13 | { color: :green, size: :large, texture: :rough , weight: :thin, opacity: :opaque }, 14 | { color: :green, size: :small, texture: :smooth, weight: :thin, opacity: :opaque }, 15 | ] 16 | } 17 | 18 | before(:each) do 19 | Ps.reset 20 | Ps.import(data) 21 | end 22 | 23 | describe "#count" do 24 | context "when counting color" do 25 | # TODO or should it be the count distribution? 26 | it "is size of entire set" do 27 | expect(Ps.rv(:color).count).to eq(5) 28 | end 29 | end 30 | 31 | context "when counting color, size" do 32 | it "is size of entire set with both attributes" do 33 | expect(Ps.rv(:color, :size).count).to eq(5) 34 | end 35 | end 36 | 37 | context "when counting color = green" do 38 | it "is 3, number of rows with color = green" do 39 | expect(Ps.rv(color: :green).count).to eq(3) 40 | end 41 | end 42 | 43 | context "when counting color = green, size = small" do 44 | it "is 2, number of rows with both color = green and size = small" do 45 | expect(Ps.rv(color: :green, size: :small).count).to eq(2) 46 | end 47 | end 48 | 49 | context "when counting color = green | size == small" do 50 | it "is 2" do 51 | expect(Ps.rv(color: :green).given(size: :small).count).to eq(2) 52 | end 53 | end 54 | 55 | context "when counting color = green, size = small | texture = smooth" do 56 | it "is 2" do 57 | expect( 58 | Ps.rv(color: :green).given(size: :small, texture: :smooth).count 59 | ).to eq(2) 60 | end 61 | end 62 | 63 | context "when counting color = blue | size = small, texture = rough" do 64 | it "is 1" do 65 | expect( 66 | Ps.rv(color: :blue).given(size: :small, texture: :rough).count 67 | ).to eq(1) 68 | end 69 | end 70 | 71 | context "when counting color = blue | size = small, texture = smooth" do 72 | it "is 0" do 73 | expect( 74 | Ps.rv(color: :blue).given(size: :small, texture: :smooth).count 75 | ).to eq(0) 76 | end 77 | end 78 | end 79 | 80 | describe "#prob" do 81 | 82 | describe "#prob_rv_eq" do 83 | context "when prob color = green" do 84 | it "is 3/5" do 85 | expect(Ps.rv(color: :green).prob).to be_within(0.001).of(3.0 / 5) 86 | end 87 | end 88 | 89 | context "when prob color = green, size = small" do 90 | it "is 2/5" do 91 | expect( 92 | Ps.rv(color: :green, size: :small).prob 93 | ).to be_within(0.001).of(2.0 / 5) 94 | end 95 | end 96 | 97 | context "when prob size = [small, med]" do 98 | it "is 4 / 5" do 99 | expect(Ps.rv(size: [:small, :med]).prob).to be_within(0.001).of(4.0 / 5) 100 | end 101 | end 102 | end 103 | 104 | describe "#prob_rv_eq_gv_eq" do 105 | context "when prob color = green | size == small" do 106 | it "is 2/3" do 107 | result = Ps.rv(color: :green).given(size: :small).prob 108 | expect(result).to be_within(0.001).of(2.0 / 3) 109 | end 110 | end 111 | 112 | context "when prob color=green, size=small | texture=smooth" do 113 | it "is 2/3" do 114 | result = Ps.rv(color: :green, size: :small).given(texture: :smooth).prob 115 | expect(result).to be_within(0.001).of(2.0 / 3) 116 | end 117 | end 118 | 119 | context "when prob color=green | size=small, texture=smooth" do 120 | it "is 1.0" do 121 | result = Ps.rv(color: :green).given(size: :small, texture: :smooth).prob 122 | expect(result).to be_within(0.001).of(1.0) 123 | end 124 | end 125 | 126 | context "when prob color=green | size=[small,med]" do 127 | it "is 1/2" do 128 | result = Ps.rv(color: :green).given(size: [:small, :med]).prob 129 | expect(result).to be_within(0.001).of(1.0 / 2) 130 | end 131 | end 132 | end 133 | 134 | describe "#prob_rv_eq_gv" do 135 | context "when prob size=large | texture" do 136 | it "is 1/5" do 137 | result = Ps.rv(size: :large).given(:texture).prob 138 | expect(result).to be_within(0.001).of(1.0 / 5) 139 | end 140 | end 141 | end 142 | 143 | describe "#prob_rv" do 144 | context "when prob color" do 145 | it "is the entire distribution as a hash" do 146 | result = Ps.rv(:color).pdf 147 | expect(result).to be_an_instance_of(Hash) 148 | expect(result.keys).to include([:blue]) 149 | expect(result.keys).to include([:green]) 150 | end 151 | 152 | it "has probs that sum to 1" do 153 | result = Ps.rv(:color).pdf 154 | sum = result.values.inject(0) { |t,e| t += e } 155 | expect(sum).to eq(1.0) 156 | end 157 | 158 | it "is a distribution of blue = 2/5 and green = 3/5" do 159 | result = Ps.rv(:color).pdf 160 | expect(result).to be_an_instance_of(Hash) 161 | expect(result[[:blue]]).to eql(2.0 / 5) 162 | expect(result[[:green]]).to eql(3.0 / 5) 163 | end 164 | end 165 | 166 | context "when prob color, size" do 167 | it "is a distribution" do 168 | result = Ps.rv(:color, :size).pdf 169 | expect(result.keys).to include([:green, :small]) 170 | expect(result.keys).to include([:green, :med]) 171 | expect(result.keys).to include([:green, :large]) 172 | expect(result.keys).to include([:blue , :small]) 173 | expect(result.keys).to include([:blue , :med]) 174 | expect(result.keys).to include([:blue , :large]) 175 | end 176 | 177 | it "has probs that sum to 1" do 178 | result = Ps.rv(:color, :size).pdf 179 | sum = result.values.inject(0) { |t,e| t += e } 180 | expect(sum).to eq(1.0) 181 | end 182 | 183 | it "is a distribution of color and size" do 184 | result = Ps.rv(:color, :size).pdf 185 | expect(result[[:green, :small]]).to eql(2.0 / 5) 186 | expect(result[[:green, :med ]]).to eql(0.0 / 5) 187 | expect(result[[:green, :large]]).to eql(1.0 / 5) 188 | expect(result[[:blue , :small]]).to eql(1.0 / 5) 189 | expect(result[[:blue , :med ]]).to eql(1.0 / 5) 190 | expect(result[[:blue , :large]]).to eql(0.0 / 5) 191 | end 192 | end 193 | end 194 | 195 | describe "#prob_rv_gv_eq" do 196 | context "when prob color | size = small" do 197 | it "is a distribution" do 198 | result = Ps.rv(:color).given(size: :small).pdf 199 | expect(result).to be_an_instance_of(Hash) 200 | expect(result.keys).to include([:blue]) 201 | expect(result.keys).to include([:green]) 202 | end 203 | 204 | it "has probs that sum to 1" do 205 | result = Ps.rv(:color).given(size: :small).pdf 206 | sum = result.values.inject(0) { |t,e| t += e } 207 | expect(sum).to eq(1.0) 208 | end 209 | 210 | it "is a distribution of blue = 1/3, green = 2/3" do 211 | result = Ps.rv(:color).given(size: :small).pdf 212 | expect(result[[:blue]]).to eql(1.0 / 3) 213 | expect(result[[:green]]).to eql(2.0 / 3) 214 | end 215 | end 216 | 217 | context "when prob color | size = small, texture = smooth" do 218 | it "is a distribution" do 219 | result = Ps.rv(:color).given(size: :small, texture: :smooth).pdf 220 | expect(result).to be_an_instance_of(Hash) 221 | expect(result.keys).to include([:blue]) 222 | expect(result.keys).to include([:green]) 223 | end 224 | 225 | it "has probs that sum to 1" do 226 | result = Ps.rv(:color).given(size: :small, texture: :smooth).pdf 227 | sum = result.values.inject(0) { |t,e| t += e } 228 | expect(sum).to eq(1.0) 229 | end 230 | 231 | it "is a distribution of blue and green" do 232 | result = Ps.rv(:color).given(size: :small, texture: :smooth).pdf 233 | expect(result[[:blue]]).to eql(0.0 / 2) 234 | expect(result[[:green]]).to eql(2.0 / 2) 235 | end 236 | end 237 | end 238 | 239 | context "when prob color = green | size = small" do 240 | it "is ..." do 241 | #result = Ps.rv(:color).given(:size).prob 242 | end 243 | end 244 | end 245 | 246 | describe "#entropy" do 247 | 248 | describe "#entropy_rv" do 249 | context "when entropy of color" do 250 | it "is H(X) = -∑ (pn log pn)" do 251 | result = Ps.rv(:color).entropy 252 | expect(result).to be_within(0.001).of( 253 | -0.6 * Math.log(0.6) / Math.log(10) - 254 | 0.4 * Math.log(0.4) / Math.log(10) 255 | ) 256 | end 257 | end 258 | 259 | context "when entropy of color, size" do 260 | it "is H(color,size)" do 261 | result = Ps.rv(:color, :size).entropy 262 | expect(result).to be_within(0.001).of( 263 | -0.4 * Math.log(0.4) / Math.log(10) + 264 | -0.2 * Math.log(0.2) / Math.log(10) * 3 265 | ) 266 | end 267 | end 268 | 269 | context "when entropy of color | size = small" do 270 | it "is H(color | size = small)" do 271 | result = Ps.rv(:color).given(size: :small).entropy 272 | expect(result).to be_within(0.001).of( 273 | -(1.0/3) * Math.log(1.0/3) / Math.log(10) + 274 | -(2.0/3) * Math.log(2.0/3) / Math.log(10) 275 | ) 276 | end 277 | 278 | it "is H(color, size | texture = smooth)" do 279 | result = Ps.rv(:color, :size).given(texture: :smooth).entropy 280 | expect(result).to be_within(0.001).of( 281 | -(1.0/3) * Math.log(1.0/3) / Math.log(10) + 282 | -(2.0/3) * Math.log(2.0/3) / Math.log(10) 283 | ) 284 | end 285 | 286 | it "is H(color | size=small, texture=smooth)" do 287 | result = Ps.rv(:color).given(size: :small, texture: :smooth).entropy 288 | expect(result).to be_within(0.001).of( 289 | -(1.0) * Math.log(1.0) / Math.log(10) 290 | ) 291 | end 292 | end 293 | end 294 | 295 | describe "#entropy_rv_gv" do 296 | context "when entropy of color | size" do 297 | it "is H(color | size)" do 298 | result = Ps.rv(:color).given(:size).entropy 299 | expect(result).to be_within(0.001).of( 300 | # :small * (:green | :small + :blue | :small) 301 | (3.0 / 5) * (-(2.0/3) * Math.log(2.0/3) / Math.log(10) + 302 | -(1.0/3) * Math.log(1.0/3) / Math.log(10)) + 303 | # :med 304 | (1.0 / 5) * (-(0.0) + 305 | -(1.0) * Math.log(1.0) / Math.log(10)) + 306 | # :large 307 | (1.0 / 5) * (-(0.0) + 308 | -(1.0) * Math.log(1.0) / Math.log(10)) 309 | ) 310 | end 311 | 312 | it "is H(color, texture | size, weight = thin)" do 313 | result = Ps.rv(:color, :texture).given(:size, weight: :thin) 314 | 315 | expect(result.entropy).to be_within(0.001).of( 316 | # :small * (:green, :smooth | :small, :thin + 317 | # :blue, :smooth | :small, :thin + 318 | # :green, :rough | :small, :thin + 319 | # :blue, :rough | :small, :thin ) 320 | (2.0/4) * (-(1.0/2) * Math.log(1.0/2) / Math.log(10) + 321 | -(0.0/2) * 0.0 / Math.log(10) + 322 | -(0.0/2) * 0.0 / Math.log(10) + 323 | -(1.0/2) * Math.log(1.0/2) / Math.log(10) 324 | ) + 325 | # :med 326 | (1.0/4) * (-(0.0/1) * 0.0 / Math.log(10) + 327 | -(1.0/1) * Math.log(1.0/1) / Math.log(10) + 328 | -(0.0/1) * 0.0 / Math.log(10) + 329 | -(0.0/1) * 0.0 / Math.log(10) 330 | ) + 331 | # :large 332 | (1.0/4) * (-(0.0/1) * 0.0 / Math.log(10) + 333 | -(0.0/1) * 0.0 / Math.log(10) + 334 | -(1.0/1) * Math.log(1.0/1) / Math.log(10) + 335 | -(0.0/1) * 0.0 / Math.log(10) 336 | ) 337 | ) 338 | end 339 | 340 | it "is H(color | size, weight = thin)" do 341 | result = Ps.rv(:color).given(:size, weight: :thin) 342 | 343 | expect(result.entropy).to be_within(0.001).of( 344 | # :small * (:green | :small, :thin + :blue | :small, :thin) 345 | (2.0/4) * (-(1.0/2) * Math.log(1.0/2) / Math.log(10) + 346 | -(1.0/2) * Math.log(1.0/2) / Math.log(10)) + 347 | # :med 348 | (1.0/4) * (-(0.0) + 349 | -(1.0) * Math.log(1.0) / Math.log(10)) + 350 | # :large 351 | (1.0/4) * (-(0.0) + 352 | -(1.0) * Math.log(1.0) / Math.log(10)) 353 | ) 354 | end 355 | 356 | it "is H(color | texture, opacity=opaque, weight=thin)" do 357 | result = Ps.rv(:color).given(:texture, 358 | opacity: :opaque, 359 | weight: :thin) 360 | expect(result.entropy).to be_within(0.001).of( 361 | # :smooth * (:green | :smooth + :blue | :smooth) 362 | (2.0/3) * (-(1.0/2) * Math.log(1.0/2) / Math.log(10) + 363 | -(1.0/2) * Math.log(1.0/2) / Math.log(10)) + 364 | # :rough * (:green | :rough + :blue | :rough) 365 | (1.0/3) * (-(1.0/1) * Math.log(1.0/1) / Math.log(10) + 366 | -(0.0/1) * 0.0 / Math.log(10)) 367 | ) 368 | end 369 | 370 | end 371 | end 372 | end 373 | 374 | describe "#infogain" do 375 | 376 | context "when color | size" do 377 | # I(color | size) = H(color) - H(color | size) 378 | it "is the infogain(color | size)" do 379 | result = Ps.rv(:color).given(:size).infogain 380 | expect(result).to be_within(0.001).of( 381 | Ps.rv(:color).entropy - Ps.rv(:color).given(:size).entropy 382 | ) 383 | end 384 | end 385 | 386 | context "when color | size, weight = thin" do 387 | # I(color | size, weight = thin) = 388 | # H(color | weight = thin) - H(color | size, weight = thin)" 389 | it "is infogain(color | weight = thin)" do 390 | result = Ps.rv(:color).given(:size, weight: :thin).infogain 391 | 392 | expect(result).to be_within(0.001).of( 393 | Ps.rv(:color).given(weight: :thin).entropy - 394 | Ps.rv(:color).given(:size, weight: :thin).entropy 395 | ) 396 | end 397 | end 398 | 399 | context "when color | texture, weight = thin, opacity = opaque)" do 400 | it "is infogain(color | texture, weight = thin, opacity = opaque)" do 401 | result = Ps.rv(:color).given(:texture, 402 | weight: :thin, opacity: :opaque).infogain 403 | 404 | expect(result).to be_within(0.001).of( 405 | Ps.rv(:color).given(weight: :thin, opacity: :opaque).entropy - 406 | Ps.rv(:color).given(:texture, 407 | weight: :thin, opacity: :opaque).entropy 408 | ) 409 | end 410 | end 411 | 412 | end 413 | 414 | end 415 | --------------------------------------------------------------------------------