├── lib ├── cmfrec │ ├── version.rb │ ├── data.rb │ ├── ffi.rb │ └── recommender.rb └── cmfrec.rb ├── .gitignore ├── Gemfile ├── test ├── support │ ├── ratings.csv │ ├── user_info.csv │ ├── item_info.csv │ ├── explicit.py │ └── implicit.py ├── data_test.rb ├── test_helper.rb ├── similar_test.rb ├── implicit_test.rb ├── explicit_test.rb └── recommender_test.rb ├── cmfrec.gemspec ├── vendor.yml ├── .github └── workflows │ └── build.yml ├── LICENSE.txt ├── CHANGELOG.md ├── Rakefile └── README.md /lib/cmfrec/version.rb: -------------------------------------------------------------------------------- 1 | module Cmfrec 2 | VERSION = "0.3.4" 3 | end 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | *.lock 10 | /vendor/ 11 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gemspec 4 | 5 | gem "rake" 6 | gem "minitest" 7 | gem "ngt" 8 | gem "rover-df", platform: :mri 9 | gem "csv" 10 | -------------------------------------------------------------------------------- /test/support/ratings.csv: -------------------------------------------------------------------------------- 1 | user_id,item_id,rating 2 | 1,0,1.4921923425606358 3 | 3,1,2.6847079310837447 4 | 0,4,3.8581788594815807 5 | 0,4,3.0713429913298755 6 | 3,1,0.7076907232051566 7 | 1,2,1.5844475118558095 8 | 3,4,3.8858294019971025 9 | 1,2,3.6319018703788792 10 | 3,4,3.0402603509354296 11 | 0,3,2.0968771164188356 12 | -------------------------------------------------------------------------------- /test/support/user_info.csv: -------------------------------------------------------------------------------- 1 | a,b,c,d,user_id 2 
| 0.1753226729488545,0.19443089035631161,-0.5352490231162413,0.7773512055030873,0 3 | 0.17133845294792382,-0.44285143947433503,1.704903771988004,0.9243458534996555,1 4 | -1.3048612409805047,-0.38057503543909377,-0.7436270102848858,-0.43712177454042656,2 5 | -0.4264500924926471,1.3814073030992386,0.09837051208332934,-0.3694574833330476,3 6 | -------------------------------------------------------------------------------- /test/data_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class DataTest < Minitest::Test 4 | def test_load_movielens 5 | ratings, user_info, item_info = Cmfrec.load_movielens 6 | assert_equal 100000, ratings.size 7 | assert_equal 943, user_info.size 8 | assert_equal 1664, item_info.size 9 | assert ratings.all? { |v| v[:user_id] } 10 | assert ratings.all? { |v| v[:item_id] } 11 | end 12 | end 13 | -------------------------------------------------------------------------------- /test/support/item_info.csv: -------------------------------------------------------------------------------- 1 | e,f,g,h,i,item_id 2 | -1.27321995082579,1.014986802932659,-1.4810597099625187,-0.2870998934432889,-0.05682428151725377,0 3 | -0.7881051069122398,0.06770978759740764,-1.0389059260000584,-0.26367744698372175,0.538250568591776,1 4 | 1.5282602461441552,1.9566768523529174,0.4782003522287232,0.11934520974720628,-0.8574652055233603,2 5 | -0.7619478040496863,0.6393048990026347,0.04752605319893533,3.114833654896224,-0.5324615930156734,3 6 | 0.34558367589718153,-0.4255140429914282,-2.2387713760258197,1.7553969593118812,-0.2641933631054802,4 7 | -------------------------------------------------------------------------------- /test/support/explicit.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from cmfrec import CMF 3 | 4 | ratings = pd.read_csv("test/support/ratings.csv") 5 | ratings.columns = ["UserId", "ItemId", "Rating"] 6 
| user_info = pd.read_csv("test/support/user_info.csv") 7 | user_info.rename(columns={"user_id": "UserId"}, inplace=True) 8 | item_info = pd.read_csv("test/support/item_info.csv") 9 | item_info.rename(columns={"item_id": "ItemId"}, inplace=True) 10 | 11 | model = CMF(k=8, verbose=False) 12 | model.fit(X=ratings, U=user_info, I=item_info) 13 | 14 | print(list(model.predict(user=[3, 3], item=[2, 4]))) 15 | -------------------------------------------------------------------------------- /test/support/implicit.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from cmfrec import CMF_implicit 3 | 4 | ratings = pd.read_csv("test/support/ratings.csv") 5 | ratings.columns = ["UserId", "ItemId", "Value"] 6 | user_info = pd.read_csv("test/support/user_info.csv") 7 | user_info.rename(columns={"user_id": "UserId"}, inplace=True) 8 | item_info = pd.read_csv("test/support/item_info.csv") 9 | item_info.rename(columns={"item_id": "ItemId"}, inplace=True) 10 | 11 | model = CMF_implicit(k=8, verbose=False) 12 | model.fit(X=ratings, U=user_info, I=item_info) 13 | 14 | print(list(model.predict(user=[3, 3], item=[2, 4]))) 15 | -------------------------------------------------------------------------------- /cmfrec.gemspec: -------------------------------------------------------------------------------- 1 | require_relative "lib/cmfrec/version" 2 | 3 | Gem::Specification.new do |spec| 4 | spec.name = "cmfrec" 5 | spec.version = Cmfrec::VERSION 6 | spec.summary = "Recommendations for Ruby using collective matrix factorization" 7 | spec.homepage = "https://github.com/ankane/cmfrec-ruby" 8 | spec.license = "MIT" 9 | 10 | spec.author = "Andrew Kane" 11 | spec.email = "andrew@ankane.org" 12 | 13 | spec.files = Dir["*.{md,txt}", "{lib,vendor}/**/*"] 14 | spec.require_path = "lib" 15 | 16 | spec.required_ruby_version = ">= 3.1" 17 | 18 | spec.add_dependency "fiddle", ">= 1.1.7" 19 | end 20 | 
-------------------------------------------------------------------------------- /vendor.yml: -------------------------------------------------------------------------------- 1 | platforms: 2 | x86_64-linux: 3 | url: https://github.com/ankane/ml-builds/releases/download/cmfrec-3.4.2/cmfrec-3.4.2-x86_64-linux.zip 4 | sha256: cc501fb99347d4fefd4cd279099f1b4a2fe4dc3f3e36ac6728d56fc5a2917116 5 | aarch64-linux: 6 | url: https://github.com/ankane/ml-builds/releases/download/cmfrec-3.4.2/cmfrec-3.4.2-aarch64-linux.zip 7 | sha256: a2973156970bd95170fcec53ed05e0883829d993f6db2d5144d64290d58df13c 8 | x86_64-darwin: 9 | url: https://github.com/ankane/ml-builds/releases/download/cmfrec-3.4.2/cmfrec-3.4.2-x86_64-darwin.zip 10 | sha256: 0d872f2fc4358dad47a18a592c836c23df0d04e25c3ca35f435ac78cabb6fe63 11 | arm64-darwin: 12 | url: https://github.com/ankane/ml-builds/releases/download/cmfrec-3.4.2/cmfrec-3.4.2-aarch64-darwin.zip 13 | sha256: 519661a1ff6906f13b387ea50d3e7118356eecd2234c3f07aeb3f8137fe4a829 14 | -------------------------------------------------------------------------------- /test/test_helper.rb: -------------------------------------------------------------------------------- 1 | require "bundler/setup" 2 | Bundler.require(:default) 3 | require "minitest/autorun" 4 | 5 | class Minitest::Test 6 | def setup 7 | if stress? 8 | # autoload before GC.stress 9 | Cmfrec::FFI.name 10 | skip if is_a?(DataTest) 11 | GC.stress = true 12 | end 13 | end 14 | 15 | def teardown 16 | GC.stress = false if stress? 17 | end 18 | 19 | def stress? 
# stdlib
require "etc"
require "fiddle/import"

# modules
require_relative "cmfrec/data"
require_relative "cmfrec/recommender"
require_relative "cmfrec/version"

# Top-level namespace. Loading this file works out which vendored shared
# library matches the host platform and stores its path in Cmfrec.ffi_lib.
module Cmfrec
  class Error < StandardError; end

  extend Data

  class << self
    # Ordered list of shared library paths to try when loading the native library
    attr_accessor :ffi_lib
  end

  # Both checks return a match index or nil, so they are used as truthy flags
  on_arm = RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i
  on_mac = RbConfig::CONFIG["host_os"] =~ /darwin/i

  # Path of the library relative to the gem's vendor directory
  relative_lib =
    if Gem.win_platform?
      "x64-mingw/cmfrec.dll"
    elsif on_mac
      on_arm ? "arm64-darwin/libcmfrec.dylib" : "x86_64-darwin/libcmfrec.dylib"
    else
      on_arm ? "aarch64-linux/libcmfrec.so" : "x86_64-linux/libcmfrec.so"
    end

  self.ffi_lib = [File.expand_path("../vendor/#{relative_lib}", __dir__)]

  # friendlier error message: FFI is autoloaded, so cmfrec/ffi is only
  # evaluated when the constant is first referenced
  autoload :FFI, "cmfrec/ffi"
end
(c) 2020 David Cortes 4 | Copyright (c) 2020-2025 Andrew Kane 5 | 6 | All rights reserved. 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to 10 | deal in the Software without restriction, including without limitation the 11 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 12 | sell copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in 16 | all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 24 | IN THE SOFTWARE. 
25 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.3.4 (2025-11-22) 2 | 3 | - Added ARM shared library for Linux 4 | 5 | ## 0.3.3 (2025-09-30) 6 | 7 | - Fixed error with Rover 0.5+ 8 | 9 | ## 0.3.2 (2025-05-04) 10 | 11 | - Fixed crash with Fiddle 1.1.7+ 12 | - Fixed memory leaks 13 | 14 | ## 0.3.1 (2024-12-29) 15 | 16 | - Removed dependency on `base64` gem for serialization 17 | 18 | ## 0.3.0 (2024-10-23) 19 | 20 | - Changed dataset directory to match XDG Base Directory Specification 21 | - Removed dependency on `csv` gem for `load_movielens` 22 | - Dropped support for marshal serialization 23 | - Dropped support for Ruby < 3.1 24 | 25 | ## 0.2.1 (2022-07-11) 26 | 27 | - Added support for JSON serialization 28 | 29 | ## 0.2.0 (2022-06-14) 30 | 31 | - Updated cmfrec to 3.4.2 32 | - Fixed missing item ids with `load_movielens` 33 | - Dropped support for Ruby < 2.7 34 | 35 | ## 0.1.7 (2022-03-22) 36 | 37 | - Improved ARM detection 38 | - Fixed error with `load_movielens` 39 | - Fixed duplicates in `item_info` with `load_movielens` 40 | 41 | ## 0.1.6 (2021-08-12) 42 | 43 | - Added `user_ids` and `item_ids` methods 44 | - Added `user_id` argument to `user_factors` 45 | - Added `item_id` argument to `item_factors` 46 | - Added `user_id` argument to `user_bias` 47 | - Added `item_id` argument to `item_bias` 48 | - Added `item_ids` argument to `new_user_recs` 49 | - Fixed order for `user_recs` 50 | 51 | ## 0.1.5 (2021-08-10) 52 | 53 | - Fixed issue with `user_recs` and `new_user_recs` returning rated items 54 | - Fixed error with `new_user_recs` 55 | 56 | ## 0.1.4 (2021-02-04) 57 | 58 | - Added support for saving and loading recommenders 59 | - Added `similar_users` and `similar_items` 60 | - Improved ARM detection 61 | 62 | ## 0.1.3 (2020-12-28) 63 | 64 | - Added ARM shared library for Mac 65 | 66 | ## 0.1.2 (2020-12-09) 67 | 68 | - 
require "bundler/gem_tasks"
require "rake/testtask"

Rake::TestTask.new do |t|
  t.pattern = "test/**/*_test.rb"
end

task default: :test

# ensure vendor files exist
task :ensure_vendor do
  vendor_config.fetch("platforms").each_key do |k|
    raise "Missing directory: #{k}" unless Dir.exist?("vendor/#{k}")
  end
end

# refuse to build the gem unless every configured platform has been vendored
Rake::Task["build"].enhance [:ensure_vendor]

# Downloads the prebuilt library archive for +platform+, verifies its
# SHA-256 against vendor.yml, and unpacks it into vendor/<platform>.
#
# @param platform [String] a key under "platforms" in vendor.yml
# @raise [KeyError] if the platform (or its url/sha256) is not configured
# @raise [RuntimeError] if the checksum does not match or unzip fails
def download_platform(platform)
  # Digest and Tempfile are used below, so require them explicitly rather
  # than relying on another library having loaded them
  require "digest"
  require "fileutils"
  require "open-uri"
  require "tempfile"
  require "tmpdir"

  config = vendor_config.fetch("platforms").fetch(platform)
  url = config.fetch("url")
  sha256 = config.fetch("sha256")

  puts "Downloading #{url}..."
  contents = URI.parse(url).read

  # verify before anything touches the vendor directory
  computed_sha256 = Digest::SHA256.hexdigest(contents)
  raise "Bad hash: #{computed_sha256}" if computed_sha256 != sha256

  vendor = File.expand_path("vendor", __dir__)
  FileUtils.mkdir_p(vendor)

  dest = File.join(vendor, platform)
  FileUtils.rm_r(dest) if Dir.exist?(dest)

  file = Tempfile.new(binmode: true)
  begin
    file.write(contents)
    # unzip reads the file by path, so buffered bytes must reach disk first
    file.flush

    # run apt install unzip on Linux
    system "unzip", "-q", file.path, "-d", dest, exception: true
  ensure
    # close and delete the archive even when unzip fails
    file.close!
  end
end

# Parsed contents of vendor.yml, loaded once and memoized.
#
# @return [Hash] the YAML document (expects a top-level "platforms" mapping)
def vendor_config
  @vendor_config ||= begin
    require "yaml"
    YAML.safe_load_file("vendor.yml")
  end
end

namespace :vendor do
  # download the libraries for every platform listed in vendor.yml
  task :all do
    vendor_config.fetch("platforms").each_key do |k|
      download_platform(k)
    end
  end

  # download only the library matching the machine running rake
  task :platform do
    if Gem.win_platform?
      # NOTE(review): the vendor.yml seen alongside this file has no
      # "x64-mingw" entry, so this fetch would raise KeyError - confirm
      # that Windows libraries are meant to be installed manually
      download_platform("x64-mingw")
    elsif RbConfig::CONFIG["host_os"] =~ /darwin/i
      if RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i
        download_platform("arm64-darwin")
      else
        download_platform("x86_64-darwin")
      end
    else
      if RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/i
        download_platform("aarch64-linux")
      else
        download_platform("x86_64-linux")
      end
    end
  end
end
assert_elements_in_delta [0.66010979, 0.27917186], recs.map { |r| r[:score] } 38 | 39 | recs = recommender.user_recs(3) 40 | assert_equal [0, 3, 2], recs.map { |r| r[:item_id] } 41 | 42 | new_data = data.select { |d| d[:user_id] == 3 }.map(&:dup) 43 | new_data.each { |d| d.delete(:user_id) } 44 | new_user_info = user_info.find { |d| d[:user_id] == 3 } 45 | 46 | # data + user info 47 | recs = recommender.new_user_recs(new_data, user_info: new_user_info) 48 | assert_equal [0, 3, 2], recs.map { |r| r[:item_id] } 49 | 50 | # data 51 | recs = recommender.new_user_recs(new_data) 52 | assert_equal [0, 3, 2], recs.map { |r| r[:item_id] } 53 | 54 | # user info 55 | recs = recommender.new_user_recs([], user_info: new_user_info) 56 | assert_equal [4, 0, 1, 3, 2], recs.map { |r| r[:item_id] } 57 | 58 | # user bias 59 | recommender.user_ids.each do |user_id| 60 | assert_nil recommender.user_bias(user_id) 61 | end 62 | assert_nil recommender.user_bias("unknown") 63 | 64 | # item bias 65 | recommender.item_ids.each do |item_id| 66 | assert_nil recommender.item_bias(item_id) 67 | end 68 | assert_nil recommender.item_bias("unknown") 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /test/explicit_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class ExplicitTest < Minitest::Test 4 | def test_explicit 5 | data = read_csv("ratings") 6 | user_info = read_csv("user_info") 7 | item_info = read_csv("item_info") 8 | 9 | recommender = Cmfrec::Recommender.new(factors: 3, verbose: false) 10 | recommender.fit(data, user_info: user_info, item_info: item_info) 11 | 12 | assert_explicit(recommender, data, user_info, item_info) 13 | end 14 | 15 | def test_explicit_json 16 | data = read_csv("ratings") 17 | user_info = read_csv("user_info") 18 | item_info = read_csv("item_info") 19 | 20 | recommender = Cmfrec::Recommender.new(factors: 3, verbose: false) 21 | 
recommender.fit(data, user_info: user_info, item_info: item_info) 22 | 23 | recommender = Cmfrec::Recommender.load_json(recommender.to_json) 24 | assert_explicit(recommender, data, user_info, item_info) 25 | end 26 | 27 | # TODO better test 28 | def test_add_implicit_features 29 | data = read_csv("ratings") 30 | recommender = Cmfrec::Recommender.new(add_implicit_features: true, verbose: false) 31 | recommender.fit(data) 32 | 33 | recs = recommender.user_recs(3, item_ids: [2, 4]) 34 | assert_equal [4, 2], recs.map { |r| r[:item_id] } 35 | assert_elements_in_delta [2.82454054, 2.59874401], recs.map { |r| r[:score] } 36 | end 37 | 38 | def assert_explicit(recommender, data, user_info, item_info) 39 | assert_in_delta 2.6053429099247047, recommender.global_mean 40 | assert_kind_of Array, recommender.user_factors 41 | assert_kind_of Array, recommender.item_factors 42 | assert_kind_of Array, recommender.user_bias 43 | assert_kind_of Array, recommender.item_bias 44 | 45 | expected = [-0.08009341941042647, -0.020419767633096483, 0.06021799829862086, 0.0] 46 | assert_elements_in_delta expected, recommender.user_bias 47 | 48 | expected = [-0.09391428617760313, -0.14812063585792457, 0.23961739305090726, 0.01382086676717879, -0.05169852652770836] 49 | assert_elements_in_delta expected, recommender.item_bias 50 | 51 | recs = recommender.user_recs(3, item_ids: [2, 4]) 52 | assert_equal [4, 2], recs.map { |r| r[:item_id] } 53 | assert_elements_in_delta [2.82454054, 2.59874401], recs.map { |r| r[:score] } 54 | 55 | recs = recommender.user_recs(3, item_ids: [1, 2, 4], count: 2) 56 | assert_equal [4, 2], recs.map { |r| r[:item_id] } 57 | 58 | recs = recommender.user_recs(3) 59 | assert_equal [2, 3, 0], recs.map { |r| r[:item_id] } 60 | assert_elements_in_delta [2.59874401, 2.53322462, 2.49100886], recs.map { |r| r[:score] } 61 | 62 | new_data = data.select { |d| d[:user_id] == 3 }.map(&:dup) 63 | new_data.each { |d| d.delete(:user_id) } 64 | new_user_info = user_info.find { |d| 
module Cmfrec
  # Dataset helpers. The methods are written as instance methods so the
  # module can be mixed in with `extend`.
  module Data
    # Loads the MovieLens 100K dataset, downloading and caching the three
    # source files on first use.
    #
    # Items are identified by movie title (not the numeric MovieLens id), and
    # items whose title was already seen are dropped from the item info.
    #
    # @return [Array] three arrays of hashes: ratings
    #   (+:user_id+, +:item_id+, +:rating+), user info
    #   (+:user_id+, +:region0+..+:region9+) and item info
    #   (+:item_id+, +:year+, +:genre_*+)
    # @raise [Cmfrec::Error] if a downloaded file fails hash verification
    def load_movielens
      # Date.strptime is used for release dates; load the library here so the
      # method does not depend on some other gem having required it
      require "date"

      data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data",
        file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490")
      user_path = download_file("ml-100k/u.user", "https://files.grouplens.org/datasets/movielens/ml-100k/u.user",
        file_hash: "f120e114da2e8cf314fd28f99417c94ae9ddf1cb6db8ce0e4b5995d40e90e62c")
      item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item",
        file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701")

      user_info = []
      File.foreach(user_path) do |line|
        row = line.split("|")
        user = {user_id: row[0].to_i}
        # one-hot encode the first character of the fifth field as region0..region9
        10.times do |i|
          user[:"region#{i}"] = row[4][0] == i.to_s ? 1 : 0
        end
        user_info << user
      end

      item_info = []
      movies = {}      # raw item id => title, used to translate ratings below
      movie_names = {} # titles already emitted into item_info
      genres = %w(unknown action adventure animation childrens comedy crime documentary drama fantasy filmnoir horror musical mystery romance scifi thriller war western)
      File.foreach(item_path) do |line|
        # the item file is read as ISO-8859-1 and converted to UTF-8
        row = line.encode("UTF-8", "ISO-8859-1").split("|")
        movies[row[0]] = row[1]

        # filter duplicates
        next if movie_names[row[1]]
        movie_names[row[1]] = true

        # rows without a release date fall back to 1970
        year = row[2].empty? ? 1970 : Date.strptime(row[2], "%d-%b-%Y").year
        item = {item_id: row[1], year: year}
        # genre flags start at the sixth field, in the order listed above
        genres.each_with_index do |genre, i|
          item[:"genre_#{genre}"] = row[i + 5].to_i
        end
        item_info << item
      end

      data = []
      File.foreach(data_path) do |line|
        row = line.split("\t")
        data << {
          user_id: row[0].to_i,
          item_id: movies[row[1]],
          rating: row[2].to_i
        }
      end

      [data, user_info, item_info]
    end

    private

    # Returns the cached path for +fname+, downloading it from +origin+ first
    # when it is not cached yet.
    #
    # The cache lives under $XDG_CACHE_HOME/cmfrec, or ~/.cache/cmfrec when
    # the variable is unset. A download is streamed to a temporary file and
    # only moved into the cache after its digest matches.
    #
    # @param fname [String] path of the file relative to the cache directory
    # @param origin [String] URL to download from
    # @param file_hash [String] expected SHA-256 hex digest of the file
    # @return [String] path of the cached file
    # @raise [Cmfrec::Error] if the downloaded bytes do not match +file_hash+
    def download_file(fname, origin, file_hash:)
      require "digest"
      require "fileutils"
      require "net/http"
      require "securerandom"
      require "tmpdir"

      cache_home = ENV["XDG_CACHE_HOME"] || "#{ENV.fetch("HOME")}/.cache"
      dest = "#{cache_home}/cmfrec/#{fname}"
      FileUtils.mkdir_p(File.dirname(dest))

      return dest if File.exist?(dest)

      # random suffix so concurrent downloads cannot write to the same path
      temp_path = File.join(Dir.tmpdir, "cmfrec-#{SecureRandom.hex(8)}")

      digest = Digest::SHA2.new

      uri = URI(origin)

      begin
        # Net::HTTP automatically adds Accept-Encoding for compression
        # of response bodies and automatically decompresses gzip
        # and deflate responses unless a Range header was sent.
        # https://ruby-doc.org/stdlib-2.6.4/libdoc/net/http/rdoc/Net/HTTP.html
        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
          request = Net::HTTP::Get.new(uri)

          puts "Downloading data from #{origin}"
          File.open(temp_path, "wb") do |f|
            http.request(request) do |response|
              # hash the body while streaming so the file is read only once
              response.read_body do |chunk|
                f.write(chunk)
                digest.update(chunk)
              end
            end
          end
        end

        if digest.hexdigest != file_hash
          raise Error, "Bad hash: #{digest.hexdigest}"
        end

        puts "Hash verified: #{file_hash}"

        FileUtils.mv(temp_path, dest)
      ensure
        # no-op after a successful move; removes the partial or rejected
        # download when anything above raised
        FileUtils.rm_f(temp_path)
      end

      dest
    end
  end
end
10 | raise e 11 | end 12 | 13 | # https://github.com/david-cortes/cmfrec/blob/master/src/cmfrec.h 14 | 15 | # determined by CMakeLists.txt 16 | typealias "int_t", "int" 17 | typealias "real_t", "double" 18 | 19 | extern "int_t fit_collective_explicit_als(real_t *restrict biasA, real_t *restrict biasB, real_t *restrict A, real_t *restrict B, real_t *restrict C, real_t *restrict D, real_t *restrict Ai, real_t *restrict Bi, bool add_implicit_features, bool reset_values, int_t seed, real_t *restrict glob_mean, real_t *restrict U_colmeans, real_t *restrict I_colmeans, int_t m, int_t n, int_t k, int_t ixA[], int_t ixB[], real_t *restrict X, size_t nnz, real_t *restrict Xfull, real_t *restrict weight, bool user_bias, bool item_bias, bool center, real_t lam, real_t *restrict lam_unique, real_t l1_lam, real_t *restrict l1_lam_unique, bool scale_lam, bool scale_lam_sideinfo, bool scale_bias_const, real_t *scaling_biasA, real_t *scaling_biasB, real_t *restrict U, int_t m_u, int_t p, real_t *restrict II, int_t n_i, int_t q, int_t U_row[], int_t U_col[], real_t *restrict U_sp, size_t nnz_U, int_t I_row[], int_t I_col[], real_t *restrict I_sp, size_t nnz_I, bool NA_as_zero_X, bool NA_as_zero_U, bool NA_as_zero_I, int_t k_main, int_t k_user, int_t k_item, real_t w_main, real_t w_user, real_t w_item, real_t w_implicit, int_t niter, int nthreads, bool verbose, bool handle_interrupt, bool use_cg, int_t max_cg_steps, bool precondition_cg, bool finalize_chol, bool nonneg, int_t max_cd_steps, bool nonneg_C, bool nonneg_D, bool precompute_for_predictions, bool include_all_X, real_t *restrict B_plus_bias, real_t *restrict precomputedBtB, real_t *restrict precomputedTransBtBinvBt, real_t *restrict precomputedBtXbias, real_t *restrict precomputedBeTBeChol, real_t *restrict precomputedBiTBi, real_t *restrict precomputedTransCtCinvCt, real_t *restrict precomputedCtCw, real_t *precomputedCtUbias)" 20 | extern "int_t fit_collective_implicit_als(real_t *restrict A, real_t *restrict B, real_t 
*restrict C, real_t *restrict D, bool reset_values, int_t seed, real_t *restrict U_colmeans, real_t *restrict I_colmeans, int_t m, int_t n, int_t k, int_t ixA[], int_t ixB[], real_t *restrict X, size_t nnz, real_t lam, real_t *restrict lam_unique, real_t l1_lam, real_t *restrict l1_lam_unique, real_t *restrict U, int_t m_u, int_t p, real_t *restrict II, int_t n_i, int_t q, int_t U_row[], int_t U_col[], real_t *restrict U_sp, size_t nnz_U, int_t I_row[], int_t I_col[], real_t *restrict I_sp, size_t nnz_I, bool NA_as_zero_U, bool NA_as_zero_I, int_t k_main, int_t k_user, int_t k_item, real_t w_main, real_t w_user, real_t w_item, real_t *restrict w_main_multiplier, real_t alpha, bool adjust_weight, bool apply_log_transf, int_t niter, int nthreads, bool verbose, bool handle_interrupt, bool use_cg, int_t max_cg_steps, bool precondition_cg, bool finalize_chol, bool nonneg, int_t max_cd_steps, bool nonneg_C, bool nonneg_D, bool precompute_for_predictions, real_t *restrict precomputedBtB, real_t *restrict precomputedBeTBe, real_t *restrict precomputedBeTBeChol, real_t *precomputedCtUbias)" 21 | extern "int_t predict_X_old_collective_explicit(int_t row[], int_t col[], real_t *restrict predicted, size_t n_predict, real_t *restrict A, real_t *restrict biasA, real_t *restrict B, real_t *restrict biasB, real_t glob_mean, int_t k, int_t k_user, int_t k_item, int_t k_main, int_t m, int_t n_max, int nthreads)" 22 | extern "int_t predict_X_old_collective_implicit(int_t row[], int_t col[], real_t *restrict predicted, size_t n_predict, real_t *restrict A, real_t *restrict B, int_t k, int_t k_user, int_t k_item, int_t k_main, int_t m, int_t n, int nthreads)" 23 | extern "int_t topN_old_collective_explicit(real_t *a_vec, real_t a_bias, real_t *A, real_t *biasA, int_t row_index, real_t *B, real_t *biasB, real_t glob_mean, int_t k, int_t k_user, int_t k_item, int_t k_main, int_t *include_ix, int_t n_include, int_t *exclude_ix, int_t n_exclude, int_t *outp_ix, real_t *outp_score, int_t 
n_top, int_t n, int_t n_max, bool include_all_X, int nthreads)" 24 | extern "int_t topN_old_collective_implicit(real_t *a_vec, real_t *A, int_t row_index, real_t *B, int_t k, int_t k_user, int_t k_item, int_t k_main, int_t *include_ix, int_t n_include, int_t *exclude_ix, int_t n_exclude, int_t *outp_ix, real_t *outp_score, int_t n_top, int_t n, int nthreads)" 25 | extern "int_t topN_new_collective_explicit(bool user_bias, real_t *u_vec, int_t p, real_t *u_vec_sp, int_t u_vec_X_col[], size_t nnz_u_vec, real_t *u_bin_vec, int_t pbin, bool NA_as_zero_U, bool NA_as_zero_X, bool nonneg, real_t *C, real_t *Cb, real_t glob_mean, real_t *biasB, real_t *U_colmeans, real_t *Xa, int_t X_col[], size_t nnz, real_t *Xa_dense, int_t n, real_t *weight, real_t *B, real_t *Bi, bool add_implicit_features, int_t k, int_t k_user, int_t k_item, int_t k_main, real_t lam, real_t *lam_unique, real_t l1_lam, real_t *l1_lam_unique, bool scale_lam, bool scale_lam_sideinfo, bool scale_bias_const, real_t scaling_biasA, real_t w_main, real_t w_user, real_t w_implicit, int_t n_max, bool include_all_X, real_t *BtB, real_t *TransBtBinvBt, real_t *BtXbias, real_t *BeTBeChol, real_t *BiTBi, real_t *CtCw, real_t *TransCtCinvCt, real_t *CtUbias, real_t *B_plus_bias, int_t *include_ix, int_t n_include, int_t *exclude_ix, int_t n_exclude, int_t *outp_ix, real_t *outp_score, int_t n_top, int nthreads)" 26 | extern "int_t topN_new_collective_implicit(int_t n, real_t *u_vec, int_t p, real_t *u_vec_sp, int_t u_vec_X_col[], size_t nnz_u_vec, bool NA_as_zero_U, bool nonneg, real_t *U_colmeans, real_t *B, real_t *C, real_t *Xa, int_t X_col[], size_t nnz, int_t k, int_t k_user, int_t k_item, int_t k_main, real_t lam, real_t l1_lam, real_t alpha, real_t w_main, real_t w_user, real_t w_main_multiplier, bool apply_log_transf, real_t *BeTBe, real_t *BtB, real_t *BeTBeChol, real_t *CtUbias, int_t *include_ix, int_t n_include, int_t *exclude_ix, int_t n_exclude, int_t *outp_ix, real_t *outp_score, int_t n_top, int 
nthreads)" 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cmfrec Ruby 2 | 3 | :fire: Recommendations for Ruby, powered by [cmfrec](https://github.com/david-cortes/cmfrec) 4 | 5 | - Supports side information :tada: 6 | - Works with explicit and implicit feedback 7 | - Uses high-performance matrix factorization 8 | 9 | [![Build Status](https://github.com/ankane/cmfrec-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/cmfrec-ruby/actions) 10 | 11 | ## Installation 12 | 13 | Add this line to your application’s Gemfile: 14 | 15 | ```ruby 16 | gem "cmfrec" 17 | ``` 18 | 19 | For Windows, also follow [these instructions](#windows-installation). 20 | 21 | ## Getting Started 22 | 23 | Create a recommender 24 | 25 | ```ruby 26 | recommender = Cmfrec::Recommender.new 27 | ``` 28 | 29 | If users rate items directly, this is known as explicit feedback. Fit the recommender with: 30 | 31 | ```ruby 32 | recommender.fit([ 33 | {user_id: 1, item_id: 1, rating: 5}, 34 | {user_id: 2, item_id: 1, rating: 3} 35 | ]) 36 | ``` 37 | 38 | > IDs can be integers, strings, or any other data type 39 | 40 | If users don’t rate items directly (for instance, they’re purchasing items or reading posts), this is known as implicit feedback. 
Leave out the rating, or use a value like number of purchases, number of page views, or time spent on page: 41 | 42 | ```ruby 43 | recommender.fit([ 44 | {user_id: 1, item_id: 1, value: 1}, 45 | {user_id: 2, item_id: 1, value: 1} 46 | ]) 47 | ``` 48 | 49 | > Use `value` instead of `rating` for implicit feedback 50 | 51 | Get recommendations for a user in the training data 52 | 53 | ```ruby 54 | recommender.user_recs(user_id) 55 | ``` 56 | 57 | Get recommendations for a new user 58 | 59 | ```ruby 60 | recommender.new_user_recs([ 61 | {item_id: 1, rating: 5}, 62 | {item_id: 2, rating: 3} 63 | ]) 64 | ``` 65 | 66 | Use the `count` option to specify the number of recommendations (default is 5) 67 | 68 | ```ruby 69 | recommender.user_recs(user_id, count: 3) 70 | ``` 71 | 72 | Get predicted ratings for specific users and items 73 | 74 | ```ruby 75 | recommender.predict([{user_id: 1, item_id: 2}, {user_id: 2, item_id: 4}]) 76 | ``` 77 | 78 | ## Side Information 79 | 80 | Add side information about users, items, or both 81 | 82 | ```ruby 83 | user_info = [ 84 | {user_id: 1, cats: 1, dogs: 0}, 85 | {user_id: 2, cats: 2, dogs: 1} 86 | ] 87 | item_info = [ 88 | {item_id: 1, genre_comedy: 1, genre_drama: 0}, 89 | {item_id: 2, genre_comedy: 0, genre_drama: 1} 90 | ] 91 | recommender.fit(ratings, user_info: user_info, item_info: item_info) 92 | ``` 93 | 94 | Get recommendations for a new user with ratings and side information 95 | 96 | ```ruby 97 | ratings = [ 98 | {item_id: 1, rating: 5}, 99 | {item_id: 2, rating: 3} 100 | ] 101 | recommender.new_user_recs(ratings, user_info: {cats: 0, dogs: 2}) 102 | ``` 103 | 104 | Get recommendations with only side information 105 | 106 | ```ruby 107 | recommender.new_user_recs([], user_info: {cats: 0, dogs: 2}) 108 | ``` 109 | 110 | ## Similarity 111 | 112 | Add this line to your application’s Gemfile: 113 | 114 | ```ruby 115 | gem "ngt" 116 | ``` 117 | 118 | Get similar users 119 | 120 | ```ruby 121 | recommender.similar_users(user_id) 122 
| ``` 123 | 124 | Get similar items - “users who liked this item also liked” 125 | 126 | ```ruby 127 | recommender.similar_items(item_id) 128 | ``` 129 | 130 | ## Examples 131 | 132 | ### MovieLens 133 | 134 | Load the data 135 | 136 | ```ruby 137 | ratings, user_info, item_info = Cmfrec.load_movielens 138 | ``` 139 | 140 | Create a recommender and get predictions 141 | 142 | ```ruby 143 | recommender = Cmfrec::Recommender.new(factors: 20) 144 | recommender.fit(ratings.first(80000), user_info: user_info, item_info: item_info) 145 | recommender.predict(ratings.last(20000)) 146 | ``` 147 | 148 | ### Ahoy 149 | 150 | [Ahoy](https://github.com/ankane/ahoy) is a great source for implicit feedback 151 | 152 | ```ruby 153 | views = Ahoy::Event.where(name: "Viewed post").group(:user_id).group_prop(:post_id).count 154 | 155 | data = 156 | views.map do |(user_id, post_id), count| 157 | { 158 | user_id: user_id, 159 | item_id: post_id, 160 | value: count 161 | } 162 | end 163 | ``` 164 | 165 | Create a recommender and get recommended posts for a user 166 | 167 | ```ruby 168 | recommender = Cmfrec::Recommender.new 169 | recommender.fit(data) 170 | recommender.user_recs(current_user.id) 171 | ``` 172 | 173 | ## Options 174 | 175 | Specify the number of factors and epochs 176 | 177 | ```ruby 178 | Cmfrec::Recommender.new(factors: 8, epochs: 20) 179 | ``` 180 | 181 | If recommendations look off, try changing `factors`. The default is 8, but 3 could be good for some applications and 300 good for others. 
182 | 183 | ### Explicit Feedback 184 | 185 | Add implicit features 186 | 187 | ```ruby 188 | Cmfrec::Recommender.new(add_implicit_features: true) 189 | ``` 190 | 191 | Disable bias 192 | 193 | ```ruby 194 | Cmfrec::Recommender.new(user_bias: false, item_bias: false) 195 | ``` 196 | 197 | ## Data 198 | 199 | Data can be an array of hashes 200 | 201 | ```ruby 202 | [{user_id: 1, item_id: 1, rating: 5}, {user_id: 2, item_id: 1, rating: 3}] 203 | ``` 204 | 205 | Or a Rover data frame 206 | 207 | ```ruby 208 | Rover.read_csv("ratings.csv") 209 | ``` 210 | 211 | ## Storing Recommenders 212 | 213 | Store the recommender 214 | 215 | ```ruby 216 | json = recommender.to_json 217 | File.write("recommender.json", json) 218 | ``` 219 | 220 | The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system, or use a tool like [Trove](https://github.com/ankane/trove). Also, user and item IDs should be integers or strings for this. 221 | 222 | Load a recommender 223 | 224 | ```ruby 225 | json = File.read("recommender.json") 226 | recommender = Cmfrec::Recommender.load_json(json) 227 | ``` 228 | 229 | Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples) for Disco, which has a similar API. For explicit feedback, you should [disable the bias](#explicit-feedback) with this approach. 
230 | 231 | ## Reference 232 | 233 | Get ids 234 | 235 | ```ruby 236 | recommender.user_ids 237 | recommender.item_ids 238 | ``` 239 | 240 | Get the global mean 241 | 242 | ```ruby 243 | recommender.global_mean 244 | ``` 245 | 246 | Get the factors 247 | 248 | ```ruby 249 | recommender.user_factors 250 | recommender.item_factors 251 | ``` 252 | 253 | Get the bias 254 | 255 | ```ruby 256 | recommender.user_bias 257 | recommender.item_bias 258 | ``` 259 | 260 | ## Windows Installation 261 | 262 | On Windows, build the [cmfrec C shared library](https://github.com/david-cortes/cmfrec#instalation) and set: 263 | 264 | ```ruby 265 | Cmfrec.ffi_lib = "path/to/cmfrec.dll" 266 | ``` 267 | 268 | ## History 269 | 270 | View the [changelog](https://github.com/ankane/cmfrec-ruby/blob/master/CHANGELOG.md) 271 | 272 | ## Contributing 273 | 274 | Everyone is encouraged to help improve this project. Here are a few ways you can help: 275 | 276 | - [Report bugs](https://github.com/ankane/cmfrec-ruby/issues) 277 | - Fix bugs and [submit pull requests](https://github.com/ankane/cmfrec-ruby/pulls) 278 | - Write, clarify, or fix documentation 279 | - Suggest or add new features 280 | 281 | To get started with development: 282 | 283 | ```sh 284 | git clone https://github.com/ankane/cmfrec-ruby.git 285 | cd cmfrec-ruby 286 | bundle install 287 | bundle exec rake vendor:all 288 | bundle exec rake test 289 | ``` 290 | -------------------------------------------------------------------------------- /test/recommender_test.rb: -------------------------------------------------------------------------------- 1 | require_relative "test_helper" 2 | 3 | class RecommenderTest < Minitest::Test 4 | def test_example 5 | data = [ 6 | {user_id: 0, item_id: 1, rating: 3}, 7 | {user_id: 0, item_id: 4, rating: 3}, 8 | {user_id: 0, item_id: 5, rating: 4}, 9 | {user_id: 0, item_id: 6, rating: 5}, 10 | {user_id: 1, item_id: 2, rating: 2}, 11 | {user_id: 2, item_id: 0, rating: 1}, 12 | {user_id: 2, item_id: 3, 
rating: 5}, 13 | {user_id: 2, item_id: 5, rating: 1}, 14 | {user_id: 3, item_id: 4, rating: 4}, 15 | {user_id: 3, item_id: 5, rating: 2}, 16 | {user_id: 3, item_id: 7, rating: 3}, 17 | {user_id: 4, item_id: 1, rating: 3}, 18 | {user_id: 5, item_id: 0, rating: 1}, 19 | {user_id: 5, item_id: 2, rating: 3}, 20 | {user_id: 5, item_id: 3, rating: 2}, 21 | {user_id: 5, item_id: 6, rating: 5} 22 | ] 23 | 24 | recommender = Cmfrec::Recommender.new(factors: 3, user_bias: false, item_bias: false, verbose: false) 25 | recommender.fit(data) 26 | assert_in_delta 2.9375, recommender.global_mean 27 | end 28 | 29 | def test_rated 30 | data = [ 31 | {user_id: 1, item_id: "A"}, 32 | {user_id: 1, item_id: "B"}, 33 | {user_id: 1, item_id: "C"}, 34 | {user_id: 1, item_id: "D"}, 35 | {user_id: 2, item_id: "C"}, 36 | {user_id: 2, item_id: "D"}, 37 | {user_id: 2, item_id: "E"}, 38 | {user_id: 2, item_id: "F"} 39 | ] 40 | recommender = Cmfrec::Recommender.new(verbose: false) 41 | recommender.fit(data) 42 | assert_equal ["E", "F"], recommender.user_recs(1).map { |r| r[:item_id] }.sort 43 | assert_equal ["A", "B"], recommender.user_recs(2).map { |r| r[:item_id] }.sort 44 | 45 | new_data = ["B", "C", "D", "E"].map { |v| {item_id: v} } 46 | assert_equal ["A", "F"], recommender.new_user_recs(new_data).map { |r| r[:item_id] }.sort 47 | end 48 | 49 | def test_rated_all 50 | recommender = Cmfrec::Recommender.new(verbose: false) 51 | recommender.fit([{user_id: 1, item_id: "A"}]) 52 | assert_empty recommender.user_recs(1) 53 | assert_empty recommender.new_user_recs([{user_id: 1, item_id: "A"}]) 54 | end 55 | 56 | def test_no_bias 57 | data = read_csv("ratings") 58 | recommender = Cmfrec::Recommender.new(user_bias: false, item_bias: false, verbose: false) 59 | recommender.fit(data) 60 | assert_nil recommender.user_bias 61 | assert_nil recommender.item_bias 62 | end 63 | 64 | def test_ids 65 | data = [ 66 | {user_id: 1, item_id: "A"}, 67 | {user_id: 1, item_id: "B"}, 68 | {user_id: 2, item_id: "B"} 69 
| ] 70 | recommender = Cmfrec::Recommender.new(verbose: false) 71 | recommender.fit(data) 72 | assert_equal [1, 2], recommender.user_ids 73 | assert_equal ["A", "B"], recommender.item_ids 74 | end 75 | 76 | def test_factors 77 | data = [ 78 | {user_id: 1, item_id: "A"}, 79 | {user_id: 1, item_id: "B"}, 80 | {user_id: 2, item_id: "B"} 81 | ] 82 | recommender = Cmfrec::Recommender.new(factors: 20, verbose: false) 83 | recommender.fit(data) 84 | 85 | assert_equal [2, 20], [recommender.user_factors.size, recommender.user_factors[0].size] 86 | assert_equal [2, 20], [recommender.item_factors.size, recommender.item_factors[0].size] 87 | 88 | assert_equal 20, recommender.user_factors(1).size 89 | assert_equal 20, recommender.item_factors("A").size 90 | 91 | assert_nil recommender.user_factors(3) 92 | assert_nil recommender.item_factors("C") 93 | end 94 | 95 | def test_user_recs_item_ids 96 | recommender = Cmfrec::Recommender.new(verbose: false) 97 | recommender.fit([ 98 | {user_id: 1, item_id: 1, rating: 5}, 99 | {user_id: 1, item_id: 2, rating: 3} 100 | ]) 101 | assert_equal [2], recommender.user_recs(1, item_ids: [2]).map { |r| r[:item_id] } 102 | end 103 | 104 | # Python library gets a_vec from -1 index (bug?) 
105 | def test_user_recs_new_user 106 | data = read_csv("ratings") 107 | recommender = Cmfrec::Recommender.new(verbose: false) 108 | recommender.fit(data) 109 | assert_empty recommender.user_recs(1000) 110 | end 111 | 112 | # only return items that exist 113 | def test_user_recs_new_item 114 | data = read_csv("ratings") 115 | recommender = Cmfrec::Recommender.new(verbose: false) 116 | recommender.fit(data) 117 | assert_empty recommender.user_recs(3, item_ids: [1000]) 118 | end 119 | 120 | def test_new_user_recs_item_ids 121 | recommender = Cmfrec::Recommender.new(verbose: false) 122 | recommender.fit([ 123 | {user_id: 1, item_id: 1, rating: 5}, 124 | {user_id: 1, item_id: 2, rating: 3} 125 | ]) 126 | assert_equal [2], recommender.new_user_recs([], item_ids: [2]).map { |r| r[:item_id] } 127 | end 128 | 129 | def test_new_user_recs_new_item 130 | recommender = Cmfrec::Recommender.new(verbose: false) 131 | recommender.fit([{user_id: 1, item_id: "A"}]) 132 | assert_equal ["A"], recommender.new_user_recs([]).map { |r| r[:item_id] } 133 | assert_equal ["A"], recommender.new_user_recs([{item_id: "B"}]).map { |r| r[:item_id] } 134 | end 135 | 136 | def test_predict 137 | data = read_csv("ratings") 138 | recommender = Cmfrec::Recommender.new(verbose: false) 139 | recommender.fit(data) 140 | 141 | predict_data = [{user_id: 3, item_id: 2}, {user_id: 3, item_id: 4}] 142 | assert_elements_in_delta [2.59874401, 2.82454054], recommender.predict(predict_data) 143 | end 144 | 145 | def test_predict_new_user 146 | data = read_csv("ratings") 147 | recommender = Cmfrec::Recommender.new(verbose: false) 148 | recommender.fit(data) 149 | 150 | bias_index = recommender.instance_variable_get(:@item_map)[2] 151 | expected = recommender.global_mean + recommender.item_bias[bias_index] 152 | assert_elements_in_delta [expected], recommender.predict([{user_id: 1000, item_id: 2}]) 153 | end 154 | 155 | def test_predict_new_item 156 | data = read_csv("ratings") 157 | recommender = 
Cmfrec::Recommender.new(verbose: false) 158 | recommender.fit(data) 159 | 160 | bias_index = recommender.instance_variable_get(:@user_map)[3] 161 | expected = recommender.global_mean + recommender.user_bias[bias_index] 162 | assert_elements_in_delta [expected], recommender.predict([{user_id: 3, item_id: 1000}]) 163 | end 164 | 165 | def test_predict_new_user_and_item 166 | data = read_csv("ratings") 167 | recommender = Cmfrec::Recommender.new(verbose: false) 168 | recommender.fit(data) 169 | 170 | expected = recommender.global_mean 171 | assert_elements_in_delta [expected], recommender.predict([{user_id: 1000, item_id: 1000}]) 172 | end 173 | 174 | def test_predict_user_recs_consistent 175 | data = read_csv("ratings") 176 | recommender = Cmfrec::Recommender.new(verbose: false) 177 | recommender.fit(data) 178 | 179 | expected = data.first(5).map { |v| recommender.user_recs(v[:user_id], item_ids: [v[:item_id]]).first[:score] } 180 | predictions = recommender.predict(data.first(5)) 181 | 5.times do |i| 182 | assert_in_delta expected[i], predictions[i] 183 | end 184 | end 185 | 186 | def test_no_training_data 187 | recommender = Cmfrec::Recommender.new 188 | error = assert_raises(ArgumentError) do 189 | recommender.fit([]) 190 | end 191 | assert_equal "No training data", error.message 192 | end 193 | 194 | def test_missing_user_id 195 | recommender = Cmfrec::Recommender.new 196 | error = assert_raises(ArgumentError) do 197 | recommender.fit([{item_id: 1, rating: 5}]) 198 | end 199 | assert_equal "Missing user_id", error.message 200 | end 201 | 202 | def test_missing_item_id 203 | recommender = Cmfrec::Recommender.new 204 | error = assert_raises(ArgumentError) do 205 | recommender.fit([{user_id: 1, rating: 5}]) 206 | end 207 | assert_equal "Missing item_id", error.message 208 | end 209 | 210 | def test_user_info_missing_user_id 211 | data = read_csv("ratings") 212 | recommender = Cmfrec::Recommender.new 213 | error = assert_raises(ArgumentError) do 214 | 
recommender.fit(data, user_info: [{a: 1}]) 215 | end 216 | assert_equal "Missing user_id", error.message 217 | end 218 | 219 | def test_item_info_missing_item_id 220 | data = read_csv("ratings") 221 | recommender = Cmfrec::Recommender.new 222 | error = assert_raises(ArgumentError) do 223 | recommender.fit(data, item_info: [{a: 1}]) 224 | end 225 | assert_equal "Missing item_id", error.message 226 | end 227 | 228 | def test_not_fit 229 | recommender = Cmfrec::Recommender.new 230 | error = assert_raises do 231 | recommender.user_recs(1) 232 | end 233 | assert_equal "Not fit", error.message 234 | end 235 | 236 | def test_rover 237 | skip if ["jruby", "truffleruby"].include?(RUBY_ENGINE) 238 | 239 | data = Rover.read_csv("test/support/ratings.csv") 240 | user_info = Rover.read_csv("test/support/user_info.csv") 241 | item_info = Rover.read_csv("test/support/item_info.csv") 242 | recommender = Cmfrec::Recommender.new(verbose: false) 243 | recommender.fit(data, user_info: user_info, item_info: item_info) 244 | recommender.new_user_recs(data) 245 | recommender.predict(data) 246 | end 247 | end 248 | -------------------------------------------------------------------------------- /lib/cmfrec/recommender.rb: -------------------------------------------------------------------------------- 1 | module Cmfrec 2 | class Recommender 3 | attr_reader :global_mean 4 | 5 | def initialize(factors: 8, epochs: 10, verbose: true, user_bias: true, item_bias: true, add_implicit_features: false) 6 | set_params( 7 | k: factors, 8 | niter: epochs, 9 | verbose: verbose, 10 | user_bias: user_bias, 11 | item_bias: item_bias, 12 | add_implicit_features: add_implicit_features 13 | ) 14 | 15 | @fit = false 16 | @user_map = {} 17 | @item_map = {} 18 | @user_info_map = {} 19 | @item_info_map = {} 20 | end 21 | 22 | def fit(train_set, user_info: nil, item_info: nil) 23 | reset 24 | partial_fit(train_set, user_info: user_info, item_info: item_info) 25 | end 26 | 27 | def predict(data) 28 | check_fit 29 
# Returns recommendations for a user that was present in the training data.
#
# The user's factor row is sliced out of @a using the full row width
# (@k_user + @k + @k_main). That is the width @a is allocated with in
# partial_fit and read with in user_factors; slicing with @k alone only
# lines up while @k_user and @k_main are both zero.
#
# @param user_id the id the user was trained with (a key of @user_map)
# @param count [Integer] forwarded to top_n as the number of items wanted
# @param item_ids [Array, nil] forwarded to top_n to restrict the candidates
# @return [Array] whatever top_n returns, or [] when the user is unknown
# @raise [RuntimeError] via check_fit when the recommender has not been fit
def user_recs(user_id, count: 5, item_ids: nil)
  check_fit
  user = @user_map[user_id]

  # no items if user is unknown
  # TODO maybe most popular items
  return [] unless user

  row_bytes = (@k_user + @k + @k_main) * Fiddle::SIZEOF_DOUBLE
  a_vec = @a[user * row_bytes, row_bytes]
  a_bias = @bias_a ? @bias_a[user * Fiddle::SIZEOF_DOUBLE, Fiddle::SIZEOF_DOUBLE].unpack1("d") : 0

  # @rated[user] will be nil for recommenders saved before 0.1.5
  rated = (@rated[user] || {}).keys

  top_n(a_vec: a_vec, a_bias: a_bias, count: count, rated: rated, item_ids: item_ids, row_index: user)
end
92 | # warn "[cmfrec] Unknown items: #{unknown_data.map { |d| d[:item_id] }.join(", ")}" 93 | end 94 | 95 | rated_ids = data.map { |d| @item_map[d[:item_id]] } 96 | 97 | nnz = data.size 98 | 99 | u_vec_sp = [] 100 | u_vec_x_col = [] 101 | if user_info 102 | user_info.each do |k, v| 103 | next if k == :user_id 104 | 105 | uc = @user_info_map[k] 106 | raise "Bad key: #{k}" unless uc 107 | 108 | u_vec_x_col << uc 109 | u_vec_sp << v 110 | end 111 | end 112 | p_ = @user_info_map.size 113 | nnz_u_vec = u_vec_sp.size 114 | u_vec_x_col = int_ptr(u_vec_x_col) 115 | u_vec_sp = real_ptr(u_vec_sp) 116 | 117 | u_vec = nil 118 | u_bin_vec = nil 119 | pbin = 0 120 | 121 | weight = nil 122 | lam_unique = nil 123 | l1_lam_unique = nil 124 | n_max = @n 125 | 126 | if data.any? 127 | if @implicit 128 | ratings = data.map { |d| d[:value] || 1 } 129 | else 130 | ratings = data.map { |d| d[:rating] } 131 | check_ratings(ratings) 132 | end 133 | xa = real_ptr(ratings) 134 | x_col = int_ptr(rated_ids) 135 | else 136 | xa = nil 137 | x_col = nil 138 | end 139 | xa_dense = nil 140 | 141 | rated = rated_ids.uniq 142 | 143 | prep = prepare_top_n(count: count, rated: rated, item_ids: item_ids) 144 | return [] if prep.empty? 
145 | include_ix, n_include, exclude_ix, n_exclude, outp_ix, outp_score, count = prep 146 | 147 | if @implicit 148 | args = [ 149 | @n, 150 | u_vec, p_, 151 | u_vec_sp, u_vec_x_col, nnz_u_vec, 152 | @na_as_zero_user, 153 | @nonneg, 154 | @u_colmeans, 155 | @b, @c, 156 | xa, x_col, nnz, 157 | @k, @k_user, @k_item, @k_main, 158 | @lambda_, @l1_lambda, @alpha, @w_main, @w_user, 159 | @w_main_multiplier, 160 | @apply_log_transf, 161 | nil, #BeTBe, 162 | nil, #BtB, 163 | nil, #BeTBeChol, 164 | nil, #CtUbias, 165 | include_ix, n_include, 166 | exclude_ix, n_exclude, 167 | outp_ix, outp_score, 168 | count, @nthreads 169 | ] 170 | check_status FFI.topN_new_collective_implicit(*fiddle_args(args)) 171 | else 172 | cb = nil 173 | scaling_bias_a = 0 174 | 175 | args = [ 176 | @user_bias, 177 | u_vec, p_, 178 | u_vec_sp, u_vec_x_col, nnz_u_vec, 179 | u_bin_vec, pbin, 180 | @na_as_zero_user, @na_as_zero, 181 | @nonneg, 182 | @c, cb, 183 | @global_mean, @bias_b, 184 | @u_colmeans, 185 | xa, x_col, nnz, 186 | xa_dense, @n, 187 | weight, 188 | @b, 189 | @bi, @add_implicit_features, 190 | @k, @k_user, @k_item, @k_main, 191 | @lambda_, lam_unique, 192 | @l1_lambda, l1_lam_unique, 193 | @scale_lam, @scale_lam_sideinfo, 194 | @scale_bias_const, scaling_bias_a, 195 | @w_main, @w_user, @w_implicit, 196 | n_max, @include_all_x, 197 | nil, #BtB, 198 | nil, #TransBtBinvBt, 199 | nil, #BtXbias, 200 | nil, #BeTBeChol, 201 | nil, #BiTBi, 202 | nil, #CtCw, 203 | nil, #TransCtCinvCt, 204 | nil, #CtUbias, 205 | nil, #B_plus_bias, 206 | include_ix, n_include, 207 | exclude_ix, n_exclude, 208 | outp_ix, outp_score, 209 | count, @nthreads 210 | ] 211 | check_status FFI.topN_new_collective_explicit(*fiddle_args(args)) 212 | end 213 | 214 | top_n_output(outp_ix, outp_score) 215 | end 216 | 217 | def user_ids 218 | @user_map.keys 219 | end 220 | 221 | def item_ids 222 | @item_map.keys 223 | end 224 | 225 | def user_factors(user_id = nil) 226 | read_factors(@a, [@m, @m_u].max, @k_user + @k + @k_main, 
user_id, @user_map) 227 | end 228 | 229 | def item_factors(item_id = nil) 230 | read_factors(@b, [@n, @n_i].max, @k_item + @k + @k_main, item_id, @item_map) 231 | end 232 | 233 | def user_bias(user_id = nil) 234 | read_bias(@bias_a, user_id, @user_map) if @bias_a 235 | end 236 | 237 | def item_bias(item_id = nil) 238 | read_bias(@bias_b, item_id, @item_map) if @bias_b 239 | end 240 | 241 | def similar_items(item_id, count: 5) 242 | check_fit 243 | similar(item_id, @item_map, item_factors, count, item_index) 244 | end 245 | alias_method :item_recs, :similar_items 246 | 247 | def similar_users(user_id, count: 5) 248 | check_fit 249 | similar(user_id, @user_map, user_factors, count, user_index) 250 | end 251 | 252 | def to_json 253 | require "json" 254 | 255 | obj = { 256 | implicit: @implicit 257 | } 258 | 259 | # options 260 | obj[:factors] = @k 261 | obj[:epochs] = @niter 262 | obj[:verbose] = @verbose 263 | 264 | # factors 265 | obj[:user_ids] = @user_map.keys 266 | obj[:item_ids] = @item_map.keys 267 | obj[:rated] = @user_map.map { |_, u| (@rated[u] || {}).keys } 268 | obj[:user_factors] = json_dump_ptr(@a) 269 | obj[:item_factors] = json_dump_ptr(@b) 270 | 271 | # bias 272 | obj[:user_bias] = json_dump_ptr(@bias_a) 273 | obj[:item_bias] = json_dump_ptr(@bias_b) 274 | 275 | # mean 276 | obj[:global_mean] = @global_mean 277 | 278 | unless (@user_info_map.keys + @item_info_map.keys).all? 
# Finds the nearest neighbours of one row of `factors` using `index`.
#
# One extra result is requested so the query row itself can be dropped.
# The query row is removed by its index id rather than by position: when
# several rows are at the same distance (e.g. identical factor vectors) the
# query is not guaranteed to be the first hit.
#
# TODO include bias
#
# @param id the user or item id to look up in `map`
# @param map [Hash] id => zero-based row position
# @param factors [Array<Array<Float>>] one factor row per entry of `map`
# @param count [Integer] maximum number of neighbours to return
# @param index an object responding to `search(vector, size:)` and returning
#   hashes with :id and :distance
# @return [Array<Hash>] hashes with :item_id and :score (the :item_id key is
#   used for users as well); [] when `id` is not in `map`
def similar(id, map, factors, count, index)
  i = map[id]
  return [] unless i

  keys = map.keys
  # ids from batch_insert start at 1 instead of 0
  own_id = i + 1

  neighbors = index.search(factors[i], size: count + 1)
  neighbors = neighbors.reject { |v| v[:id] == own_id }.first(count)

  neighbors.map do |v|
    {
      item_id: keys[v[:id] - 1],
      # convert cosine distance to cosine similarity
      score: 1 - v[:distance]
    }
  end
end
for new memory 361 | def partial_fit(train_set, user_info: nil, item_info: nil) 362 | train_set = to_dataset(train_set) 363 | 364 | unless @fit 365 | @implicit = !train_set.any? { |v| v[:rating] } 366 | end 367 | 368 | unless @implicit 369 | ratings = train_set.map { |o| o[:rating] } 370 | check_ratings(ratings) 371 | end 372 | 373 | check_training_set(train_set) 374 | update_maps(train_set) 375 | 376 | x_row = [] 377 | x_col = [] 378 | x_val = [] 379 | value_key = @implicit ? :value : :rating 380 | @rated = Hash.new { |hash, key| hash[key] = {} } 381 | train_set.each do |v| 382 | u = @user_map[v[:user_id]] 383 | i = @item_map[v[:item_id]] 384 | @rated[u][i] = true 385 | 386 | x_row << u 387 | x_col << i 388 | x_val << (v[value_key] || 1) 389 | end 390 | @rated.default = nil 391 | 392 | @m = @user_map.size 393 | @n = @item_map.size 394 | nnz = train_set.size 395 | 396 | x_row = int_ptr(x_row) 397 | x_col = int_ptr(x_col) 398 | x = real_ptr(x_val) 399 | 400 | x_full = nil 401 | weight = nil 402 | lam_unique = nil 403 | l1_lam_unique = nil 404 | 405 | uu = nil 406 | ii = nil 407 | 408 | # side info 409 | u_row, u_col, u_sp, nnz_u, @m_u, p_ = process_info(user_info, @user_map, @user_info_map, :user_id) 410 | i_row, i_col, i_sp, nnz_i, @n_i, q = process_info(item_info, @item_map, @item_info_map, :item_id) 411 | 412 | @precompute_for_predictions = false 413 | 414 | # initialize w/ normal distribution 415 | reset_values = !@fit 416 | 417 | @a = Fiddle::Pointer.malloc([@m, @m_u].max * (@k_user + @k + @k_main) * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 418 | @b = Fiddle::Pointer.malloc([@n, @n_i].max * (@k_item + @k + @k_main) * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 419 | @c = p_ > 0 ? Fiddle::Pointer.malloc(p_ * (@k_user + @k) * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) : nil 420 | @d = q > 0 ? 
Fiddle::Pointer.malloc(q * (@k_item + @k) * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) : nil 421 | 422 | @bias_a = nil 423 | @bias_b = nil 424 | 425 | u_colmeans = Fiddle::Pointer.malloc(p_ * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 426 | i_colmeans = Fiddle::Pointer.malloc(q * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 427 | 428 | if @implicit 429 | set_implicit_vars 430 | 431 | args = [ 432 | @a, @b, 433 | @c, @d, 434 | reset_values, @random_state, 435 | u_colmeans, i_colmeans, 436 | @m, @n, @k, 437 | x_row, x_col, x, nnz, 438 | @lambda_, lam_unique, 439 | @l1_lambda, l1_lam_unique, 440 | uu, @m_u, p_, 441 | ii, @n_i, q, 442 | u_row, u_col, u_sp, nnz_u, 443 | i_row, i_col, i_sp, nnz_i, 444 | @na_as_zero_user, @na_as_zero_item, 445 | @k_main, @k_user, @k_item, 446 | @w_main, @w_user, @w_item, real_ptr([@w_main_multiplier]), 447 | @alpha, @adjust_weight, @apply_log_transf, 448 | @niter, @nthreads, @verbose, @handle_interrupt, 449 | @use_cg, @max_cg_steps, @precondition_cg, @finalize_chol, 450 | @nonneg, @max_cd_steps, @nonneg_c, @nonneg_d, 451 | @precompute_for_predictions, 452 | nil, #precomputedBtB, 453 | nil, #precomputedBeTBe, 454 | nil, #precomputedBeTBeChol 455 | nil #precomputedCtUbias 456 | ] 457 | check_status FFI.fit_collective_implicit_als(*fiddle_args(args)) 458 | 459 | @global_mean = 0 460 | else 461 | @bias_a = Fiddle::Pointer.malloc([@m, @m_u].max * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) if @user_bias 462 | @bias_b = Fiddle::Pointer.malloc([@n, @n_i].max * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) if @item_bias 463 | 464 | if @add_implicit_features 465 | @ai = Fiddle::Pointer.malloc([@m, @m_u].max * (@k + @k_main) * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 466 | @bi = Fiddle::Pointer.malloc([@n, @n_i].max * (@k + @k_main) * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 467 | else 468 | @ai = nil 469 | @bi = nil 470 | end 471 | 472 | glob_mean = Fiddle::Pointer.malloc(Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 473 | 474 | # TODO add 475 | scaling_bias_a 
# Stores the model hyperparameters in instance variables.
#
# Boolean-like options are coerced with `!!`, the step counts and the random
# seed with `to_i`, and a negative `nthreads` is replaced by
# `Etc.nprocessors`. `include_all_x` is now honoured instead of always being
# forced to true (its default is still true).
#
# `center_u` and `center_i` are accepted but not stored yet (see TODO below).
def set_params(
  k: 40, lambda_: 10.0, method: "als", use_cg: true,
  user_bias: true, item_bias: true, center: true, add_implicit_features: false,
  scale_lam: false, scale_lam_sideinfo: false, scale_bias_const: false,
  k_user: 0, k_item: 0, k_main: 0,
  w_main: 1.0, w_user: 1.0, w_item: 1.0, w_implicit: 0.5,
  l1_lambda: 0.0, center_u: true, center_i: true,
  maxiter: 800, niter: 10, parallelize: "separate", corr_pairs: 4,
  max_cg_steps: 3, precondition_cg: false, finalize_chol: true,
  na_as_zero: false, na_as_zero_user: false, na_as_zero_item: false,
  nonneg: false, nonneg_c: false, nonneg_d: false, max_cd_steps: 100,
  precompute_for_predictions: true, include_all_x: true,
  use_float: true, random_state: 1, verbose: true, print_every: 10,
  handle_interrupt: true, produce_dicts: false, nthreads: -1
)
  @k = k
  @k_user = k_user
  @k_item = k_item
  @k_main = k_main
  @lambda_ = lambda_
  @w_main = w_main
  @w_user = w_user
  @w_item = w_item
  @w_implicit = w_implicit
  @user_bias = !!user_bias
  @item_bias = !!item_bias
  @method = method
  @add_implicit_features = !!add_implicit_features
  @use_cg = !!use_cg
  @max_cg_steps = max_cg_steps.to_i
  @max_cd_steps = max_cd_steps.to_i
  @finalize_chol = !!finalize_chol
  @maxiter = maxiter
  @niter = niter
  @parallelize = parallelize
  @na_as_zero = !!na_as_zero
  @na_as_zero_user = !!na_as_zero_user
  @na_as_zero_item = !!na_as_zero_item
  @nonneg = !!nonneg
  @nonneg_c = !!nonneg_c
  @nonneg_d = !!nonneg_d
  @precompute_for_predictions = !!precompute_for_predictions
  # previously hard-coded to true, which silently ignored the keyword
  @include_all_x = !!include_all_x
  @use_float = !!use_float
  @verbose = !!verbose
  @print_every = print_every
  @corr_pairs = corr_pairs
  @random_state = random_state.to_i
  @produce_dicts = !!produce_dicts
  @handle_interrupt = !!handle_interrupt
  # a negative thread count means "use every available processor"
  nthreads = Etc.nprocessors if nthreads < 0
  @nthreads = nthreads

  @center = center
  @scale_lam = scale_lam
  @scale_lam_sideinfo = scale_lam_sideinfo
  @scale_bias_const = scale_bias_const
  @l1_lambda = l1_lambda
  @precondition_cg = precondition_cg

  # TODO center_u, center_i
end
} 595 | raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? } 596 | 597 | train_set.each do |v| 598 | @user_map[v[:user_id]] ||= @user_map.size 599 | @item_map[v[:item_id]] ||= @item_map.size 600 | end 601 | end 602 | 603 | def check_ratings(ratings) 604 | unless ratings.all? { |r| !r.nil? } 605 | raise ArgumentError, "Missing ratings" 606 | end 607 | unless ratings.all? { |r| r.is_a?(Numeric) } 608 | raise ArgumentError, "Ratings must be numeric" 609 | end 610 | end 611 | 612 | def check_training_set(train_set) 613 | raise ArgumentError, "No training data" if train_set.empty? 614 | end 615 | 616 | def check_fit 617 | raise "Not fit" unless @fit 618 | end 619 | 620 | def to_dataset(dataset) 621 | if defined?(Rover::DataFrame) && dataset.is_a?(Rover::DataFrame) 622 | # convert keys to symbols 623 | dataset.each_row.map { |v| v.transform_keys(&:to_sym) } 624 | elsif defined?(Daru::DataFrame) && dataset.is_a?(Daru::DataFrame) 625 | # convert keys to symbols 626 | dataset = dataset.dup 627 | new_names = dataset.vectors.to_a.map { |k| [k, k.to_sym] }.to_h 628 | dataset.rename_vectors!(new_names) 629 | dataset.to_a[0] 630 | else 631 | dataset 632 | end 633 | end 634 | 635 | def read_factors(ptr, d1, d2, id, map) 636 | width = d2 * Fiddle::SIZEOF_DOUBLE 637 | if id 638 | i = map[id] 639 | ptr[i * width, width].unpack("d*") if i 640 | else 641 | arr = [] 642 | offset = 0 643 | d1.times do |i| 644 | arr << ptr[offset, width].unpack("d*") 645 | offset += width 646 | end 647 | arr 648 | end 649 | end 650 | 651 | def read_bias(ptr, id, map) 652 | if id 653 | i = map[id] 654 | ptr[i * Fiddle::SIZEOF_DOUBLE, Fiddle::SIZEOF_DOUBLE].unpack1("d") if i 655 | else 656 | real_array(ptr) 657 | end 658 | end 659 | 660 | def prepare_top_n(count: nil, rated: nil, item_ids: nil) 661 | if item_ids 662 | # remove missing ids 663 | item_ids = item_ids.map { |v| @item_map[v] }.compact 664 | return [] if item_ids.empty? 
665 | 666 | include_ix = int_ptr(item_ids) 667 | n_include = item_ids.size 668 | 669 | count = n_include if n_include < count 670 | else 671 | include_ix = nil 672 | n_include = 0 673 | end 674 | 675 | if rated && !item_ids 676 | # assumes rated is unique and all items are known 677 | # calling code is responsible for this 678 | exclude_ix = int_ptr(rated) 679 | n_exclude = rated.size 680 | remaining = @item_map.size - n_exclude 681 | return [] if remaining == 0 682 | count = remaining if remaining < count 683 | else 684 | exclude_ix = nil 685 | n_exclude = 0 686 | end 687 | 688 | outp_ix = Fiddle::Pointer.malloc(count * Fiddle::SIZEOF_INT, Fiddle::RUBY_FREE) 689 | outp_score = Fiddle::Pointer.malloc(count * Fiddle::SIZEOF_DOUBLE, Fiddle::RUBY_FREE) 690 | 691 | [include_ix, n_include, exclude_ix, n_exclude, outp_ix, outp_score, count] 692 | end 693 | 694 | def top_n(a_vec:, a_bias:, count:, rated: nil, item_ids: nil, row_index:) 695 | prep = prepare_top_n(count: count, rated: rated, item_ids: item_ids) 696 | return [] if prep.empty? 697 | include_ix, n_include, exclude_ix, n_exclude, outp_ix, outp_score, count = prep 698 | 699 | if @implicit 700 | check_status FFI.topN_old_collective_implicit( 701 | a_vec, 702 | @a, row_index, 703 | @b, 704 | @k, @k_user, @k_item, @k_main, 705 | include_ix, n_include, 706 | exclude_ix, n_exclude, 707 | outp_ix, outp_score, 708 | count, @n, @nthreads 709 | ) 710 | else 711 | # TODO add param 712 | n_max = @n 713 | 714 | check_status FFI.topN_old_collective_explicit( 715 | a_vec, a_bias, 716 | @a, @bias_a, row_index, 717 | @b, 718 | @bias_b, 719 | @global_mean, 720 | @k, @k_user, @k_item, @k_main, 721 | include_ix, n_include, 722 | exclude_ix, n_exclude, 723 | outp_ix, outp_score, 724 | count, @n, n_max, @include_all_x ? 
1 : 0, @nthreads 725 | ) 726 | end 727 | 728 | top_n_output(outp_ix, outp_score) 729 | end 730 | 731 | def top_n_output(outp_ix, outp_score) 732 | imap = @item_map.map(&:reverse).to_h 733 | item_ids = int_array(outp_ix).map { |v| imap[v] } 734 | scores = real_array(outp_score) 735 | 736 | item_ids.zip(scores).map do |item_id, score| 737 | {item_id: item_id, score: score} 738 | end 739 | end 740 | 741 | def fiddle_args(args) 742 | args 743 | end 744 | 745 | def check_status(ret_val) 746 | case ret_val 747 | when 0 748 | # success 749 | when 1 750 | raise "Could not allocate sufficient memory" 751 | else 752 | raise "Bad status: #{ret_val}" 753 | end 754 | end 755 | 756 | def process_info(info, map, info_map, key) 757 | return [nil, nil, nil, 0, 0, 0] unless info 758 | 759 | info = to_dataset(info) 760 | 761 | row = [] 762 | col = [] 763 | val = [] 764 | info.each do |ri| 765 | rk = ri[key] 766 | raise ArgumentError, "Missing #{key}" unless rk 767 | 768 | r = (map[rk] ||= map.size) 769 | ri.each do |k, v| 770 | next if k == key 771 | row << r 772 | col << (info_map[k] ||= info_map.size) 773 | val << v 774 | end 775 | end 776 | [int_ptr(row), int_ptr(col), real_ptr(val), val.size, map.size, info_map.size] 777 | end 778 | 779 | def int_ptr(v) 780 | v.pack("i*") 781 | end 782 | 783 | def real_ptr(v) 784 | v.pack("d*") 785 | end 786 | 787 | def int_array(ptr) 788 | ptr.to_str(ptr.size).unpack("i*") 789 | end 790 | 791 | def real_array(ptr) 792 | ptr.to_str(ptr.size).unpack("d*") 793 | end 794 | 795 | def set_implicit_vars 796 | @w_main_multiplier = 1.0 797 | @alpha = 1.0 798 | @adjust_weight = false # downweight? 
799 | @apply_log_transf = false 800 | 801 | # different defaults 802 | @lambda_ = 1e0 803 | @w_user = 10 804 | @w_item = 10 805 | @finalize_chol = false 806 | end 807 | 808 | def json_dump_ptr(ptr) 809 | [ptr.to_str(ptr.size)].pack("m0") if ptr 810 | end 811 | 812 | def json_load_ptr(str) 813 | Fiddle::Pointer[str.unpack1("m0")] if str 814 | end 815 | 816 | def json_load(obj) 817 | @implicit = obj["implicit"] 818 | 819 | # options 820 | set_params( 821 | k: obj["factors"], 822 | niter: obj["epochs"], 823 | verbose: obj["verbose"], 824 | user_bias: !obj["user_bias"].nil?, 825 | item_bias: !obj["item_bias"].nil?, 826 | add_implicit_features: obj["add_implicit_features"] 827 | ) 828 | 829 | # factors 830 | @user_map = obj["user_ids"].map.with_index.to_h 831 | @item_map = obj["item_ids"].map.with_index.to_h 832 | @rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] } 833 | @a = json_load_ptr(obj["user_factors"]) 834 | @b = json_load_ptr(obj["item_factors"]) 835 | 836 | # bias 837 | @bias_a = json_load_ptr(obj["user_bias"]) 838 | @bias_b = json_load_ptr(obj["item_bias"]) 839 | 840 | # mean 841 | @global_mean = obj["global_mean"] 842 | 843 | # side info 844 | @user_info_map = obj["user_info_ids"].map(&:to_sym).map.with_index.to_h 845 | @item_info_map = obj["item_info_ids"].map(&:to_sym).map.with_index.to_h 846 | @c = json_load_ptr(obj["user_info_factors"]) 847 | @d = json_load_ptr(obj["item_info_factors"]) 848 | 849 | # implicit features 850 | @add_implicit_features = obj["add_implicit_features"] 851 | @ai = json_load_ptr(obj["user_factors_implicit"]) 852 | @bi = json_load_ptr(obj["item_factors_implicit"]) 853 | 854 | unless @implicit 855 | @min_rating = obj["min_rating"] 856 | @max_rating = obj["max_rating"] 857 | end 858 | 859 | @u_colmeans = json_load_ptr(obj["user_means"]) 860 | 861 | @m = @user_map.size 862 | @n = @item_map.size 863 | @m_u = @user_info_map.size 864 | @n_i = @item_info_map.size 865 | 866 | set_implicit_vars if @implicit 
867 | 868 | @fit = @m > 0 869 | end 870 | end 871 | end 872 | --------------------------------------------------------------------------------