├── .gitignore ├── CMakeLists.txt ├── README.md ├── include └── batch_learn.hpp ├── python ├── batch_learn │ ├── __init__.py │ └── writer.pyx └── setup.py └── src ├── batch-learn.cpp ├── commands ├── command.hpp ├── convert.cpp ├── convert.hpp ├── ffm.hpp ├── model.cpp ├── model.hpp └── nn.hpp ├── models ├── ffm.cpp ├── ffm.hpp ├── model.hpp ├── nn.cpp └── nn.hpp └── util ├── common.hpp ├── dataset.hpp ├── model.hpp └── nn.hpp /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.6) 2 | project(batch-learn) 3 | 4 | find_package(OpenMP REQUIRED) 5 | find_package(Boost REQUIRED program_options) 6 | 7 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 9 | 10 | include_directories(${Boost_INCLUDE_DIR}) 11 | link_directories(${Boost_LIBRARY_DIRS}) 12 | 13 | file(GLOB SRCS src/*.cpp src/**/*.cpp) 14 | include_directories(include) 15 | 16 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -O3 -std=c++11 -march=native") 17 | 18 | add_executable(${PROJECT_NAME} ${SRCS}) 19 | target_link_libraries(${PROJECT_NAME} boost_program_options) 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Batch Learn 2 | 3 | Batch-Learn is an implementation of ML algorithms which may be applied to on-disk data batch-by-batch, without loading full dataset to memory. 4 | 5 | Algorithms included: 6 | * FFM 7 | * NN / MLP 8 | 9 | It's extracted from the [code](https://github.com/alno/kaggle-outbrain-click-prediction) written during [Outbrain Click Prediction](https://www.kaggle.com/c/outbrain-click-prediction/) competition on Kaggle and now is undergoing some rewrite and refactoring. 10 | 11 | ## Installation 12 | 13 | Batch-learn uses [CMake](https://cmake.org/) as a build tool and depends on following libraries: 14 | * boost-program-options 15 | * boost-iostreams 16 | 17 | To compile code you need to install boost libraries and then call: 18 | 19 | mkdir build 20 | cd build 21 | cmake .. 22 | make 23 | 24 | ## Usage 25 | 26 | First, you need to convert to batch-learn format: 27 | 28 | batch-learn convert -f ffm -b 24 ffm_dataset.txt -O bl_dataset 29 | 30 | To train ffm model and make predictions on test dataset: 31 | 32 | batch-learn ffm --train tr1 --test te1 --pred pred.txt 33 | 34 | You also may specify validation dataset: 35 | 36 | batch-learn ffm --train tr1 --test te1 --val va1 --pred pred.txt 37 | 38 | To get list of available commands just run: 39 | 40 | batch-learn help 41 | 42 | To get help about some specific command: 43 | 44 | batch-learn help ffm 45 | -------------------------------------------------------------------------------- /include/batch_learn.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace batch_learn { 8 | 9 | const uint32_t file_format_version = 1; 10 | 11 | struct feature { 12 | uint32_t index; // Feature index consists of two parts: field (in high bits) and in-field index (in low bits), number of bits to store index specified in file header 13 | float value; 14 | }; 15 | 16 | struct file_index { 17 | uint64_t n_examples; // Number of examples; 18 | uint32_t n_fields; // Number of feature fields (max + 1) 19 | uint32_t n_indices; // Number of feature in-field indices (max + 1) 20 | 21 | uint32_t n_index_bits; // Number of bits used to store index part (should be enough to include max index) 22 | 23 | std::vector labels; // Target values of examples (size N) 24 | std::vector offsets; // Offsets of example data (size N +1) in number of features 25 | std::vector groups; // Group identifiers for MAP calculation 26 | }; 27 | 28 | // Index IO functions 29 | 30 | inline void write_index(const std::string & file_name, const file_index & index) { 31 | using namespace std; 32 | 33 | if (index.labels.size() != index.n_examples) 34 | throw runtime_error("Invalid index labels size"); 35 | 36 | if (index.offsets.size() != index.n_examples + 1) 37 | throw runtime_error("Invalid index offsets size"); 38 | 39 | if (index.groups.size() != index.n_examples) 40 | throw runtime_error("Invalid index groups size"); 41 | 42 | if ((1ul << index.n_index_bits) < index.n_indices) 43 | throw runtime_error("Not enough index bits allocated to store max index"); 44 | 45 | if ((1ul << (32 - index.n_index_bits)) < index.n_fields) 46 | throw runtime_error("Not enough field bits allocated to store max field"); 47 | 48 | FILE * file = fopen(file_name.c_str(), "wb"); 49 | 50 | if(file == nullptr) 51 | throw runtime_error(string("Can't open index file ") + file_name); 52 | 53 | if (fwrite(&file_format_version, sizeof(uint32_t), 1, file) != 1) 54 | throw runtime_error("Error writing format version"); 55 | 56 | // Header 57 | 58 | if (fwrite(&index.n_examples, sizeof(uint64_t), 1, file) != 1) 59 | throw runtime_error("Error writing example count"); 60 | 61 | if (fwrite(&index.n_fields, sizeof(uint32_t), 1, file) != 1) 62 | throw runtime_error("Error writing field count"); 63 | 64 | if (fwrite(&index.n_indices, sizeof(uint32_t), 1, file) != 1) 65 | throw runtime_error("Error writing index count"); 66 | 67 | if (fwrite(&index.n_index_bits, sizeof(uint32_t), 1, file) != 1) 68 | throw runtime_error("Error writing index bit count"); 69 | 70 | // Index itself 71 | 72 | if (fwrite(index.labels.data(), sizeof(float), index.labels.size(), file) != index.labels.size()) 73 | throw runtime_error("Error writing labels"); 74 | 75 | if (fwrite(index.offsets.data(), sizeof(uint64_t), index.offsets.size(), file) != index.offsets.size()) 76 | throw runtime_error("Error writing offsets"); 77 | 78 | if (fwrite(index.groups.data(), sizeof(uint64_t), index.groups.size(), file) != index.groups.size()) 79 | throw runtime_error("Error writing groups"); 80 | 81 | fclose(file); 82 | }; 83 | 84 | inline file_index read_index(const std::string & file_name) { 85 | using namespace std; 86 | 87 | file_index index; 88 | FILE * file = fopen(file_name.c_str(), "rb"); 89 | 90 | if(file == nullptr) 91 | throw runtime_error(string("Can't open index file ") + file_name); 92 | 93 | uint32_t version; 94 | 95 | if (fread(&version, sizeof(uint32_t), 1, file) != 1) 96 | throw runtime_error("Error reading version"); 97 | 98 | if (version != file_format_version) 99 | throw runtime_error("File format version mismatch"); 100 | 101 | // Header 102 | 103 | if (fread(&index.n_examples, sizeof(uint64_t), 1, file) != 1) 104 | throw runtime_error("Error reading example count"); 105 | 106 | if (fread(&index.n_fields, sizeof(uint32_t), 1, file) != 1) 107 | throw runtime_error("Error reading field count"); 108 | 109 | if (fread(&index.n_indices, sizeof(uint32_t), 1, file) != 1) 110 | throw runtime_error("Error reading index count"); 111 | 112 | if (fread(&index.n_index_bits, sizeof(uint32_t), 1, file) != 1) 113 | throw runtime_error("Error reading index bit count"); 114 | 115 | // Reserve space for y and offsets 116 | index.labels.resize(index.n_examples, 0); 117 | index.offsets.resize(index.n_examples + 1, 0); 118 | index.groups.resize(index.n_examples, 0); 119 | 120 | if (fread(index.labels.data(), sizeof(float), index.labels.size(), file) != index.labels.size()) 121 | throw runtime_error("Error reading labels"); 122 | 123 | if (fread(index.offsets.data(), sizeof(uint64_t), index.offsets.size(), file) != index.offsets.size()) 124 | throw runtime_error("Error reading offsets"); 125 | 126 | if (fread(index.groups.data(), sizeof(uint64_t), index.groups.size(), file) != index.groups.size()) 127 | throw runtime_error("Error reading groups"); 128 | 129 | fclose(file); 130 | 131 | return index; 132 | }; 133 | 134 | // Batch IO functions 135 | 136 | inline void read_batch(const std::string & file_name, uint64_t from, uint64_t to, std::vector & features) { 137 | using namespace std; 138 | 139 | if (to < from) 140 | throw runtime_error("Wrong range"); 141 | 142 | features.resize(to - from); 143 | 144 | // Empty range, no need to read 145 | if (to == from) 146 | return; 147 | 148 | FILE * file = fopen(file_name.c_str(), "rb"); 149 | 150 | if (file == nullptr) 151 | throw runtime_error(string("Can't open data file ") + file_name); 152 | 153 | if (fseek((FILE *)file, from * sizeof(feature), SEEK_SET) != 0) 154 | throw new runtime_error("Can't set file pos"); 155 | 156 | if (fread(features.data(), sizeof(feature), features.size(), (FILE *)file) != features.size()) 157 | throw new runtime_error("Can't read data"); 158 | 159 | fclose(file); 160 | } 161 | 162 | inline std::vector read_batch(const std::string & file_name, uint64_t from, uint64_t to) { 163 | std::vector features(to - from); 164 | read_batch(file_name, from, to, features); 165 | return features; 166 | } 167 | 168 | // Data file writer 169 | 170 | class stream_data_writer { 171 | FILE * file; 172 | uint64_t offset; 173 | public: 174 | stream_data_writer(const std::string & file_name): offset(0) { 175 | using namespace std; 176 | 177 | file = fopen(file_name.c_str(), "wb"); 178 | 179 | if (file == nullptr) 180 | throw runtime_error(string("Can't open data file ") + file_name); 181 | } 182 | 183 | ~stream_data_writer() { 184 | fclose((FILE *)file); 185 | } 186 | 187 | uint64_t write(const std::vector & features) { 188 | if (fwrite(features.data(), sizeof(feature), features.size(), (FILE *)file) != features.size()) 189 | throw std::runtime_error("Error writing example count"); 190 | 191 | offset += features.size(); 192 | 193 | return offset; 194 | } 195 | }; 196 | 197 | 198 | class file_writer { 199 | private: 200 | std::string filename; 201 | uint32_t index_mask; 202 | file_index index; 203 | stream_data_writer data_writer; 204 | 205 | public: 206 | file_writer(const std::string& _filename, uint32_t index_bits, uint32_t n_fields = 0, uint32_t n_indices = 0) 207 | : filename(_filename) 208 | , index_mask((1 << index_bits) - 1) 209 | , data_writer(filename + ".data") 210 | { 211 | index.n_examples = 0; 212 | index.n_fields = n_fields; 213 | index.n_indices = n_indices; 214 | index.n_index_bits = index_bits; 215 | index.offsets.push_back(0); 216 | } 217 | 218 | void write_row(const std::vector & features, float y, uint64_t group) { 219 | index.n_examples ++; 220 | index.labels.push_back(y); 221 | index.groups.push_back(group); 222 | index.offsets.push_back(data_writer.write(features)); 223 | } 224 | 225 | void write_row(size_t n_features, const int* fields, const int* indices, const float* values, float y, uint64_t group) { 226 | std::vector features(n_features); 227 | for (size_t i = 0; i < n_features; ++ i) { 228 | uint32_t idx = indices[i] & index_mask; 229 | 230 | features[i].index = (fields[i] << index.n_index_bits) | idx; 231 | features[i].value = values[i]; 232 | 233 | if (uint32_t(fields[i]) >= index.n_fields) 234 | index.n_fields = fields[i] + 1; 235 | 236 | if (idx >= index.n_indices) 237 | index.n_indices = idx + 1; 238 | } 239 | 240 | write_row(features, y, group); 241 | } 242 | 243 | void write_index() { 244 | batch_learn::write_index(filename + ".index", index); 245 | } 246 | }; 247 | 248 | 249 | }; 250 | -------------------------------------------------------------------------------- /python/batch_learn/__init__.py: -------------------------------------------------------------------------------- 1 | from .writer import Writer 2 | 3 | __all__ = ['Writer'] 4 | -------------------------------------------------------------------------------- /python/batch_learn/writer.pyx: -------------------------------------------------------------------------------- 1 | from libcpp.string cimport string 2 | from libcpp cimport bool 3 | 4 | import numpy as np 5 | cimport numpy as np 6 | 7 | cdef extern from "../../include/batch_learn.hpp" namespace "batch_learn": 8 | cdef cppclass file_writer: 9 | file_writer(string, int) 10 | void write_row(int, int*, int*, float*, float, int) 11 | void write_index() 12 | 13 | cdef class Writer: 14 | cdef file_writer* _writer 15 | cdef bool _closed 16 | 17 | def __cinit__(self, filename, int index_bits): 18 | self._writer = new file_writer(filename.encode('utf-8'), index_bits) 19 | self._closed = False 20 | 21 | def __dealloc__(self): 22 | if not self._closed: 23 | self.close() 24 | 25 | def write_index(self): 26 | self._check_not_closed() 27 | self._writer.write_index() 28 | 29 | def write_row(self, fields, indices, values, y, group=0): 30 | self._check_not_closed() 31 | self._write_row( 32 | np.asarray(fields, dtype=np.uint32, order='c'), 33 | np.asarray(indices, dtype=np.uint32, order='c'), 34 | np.asarray(values, dtype=np.float32, order='c'), 35 | y, 36 | group 37 | ) 38 | 39 | def close(self): 40 | self._check_not_closed() 41 | del self._writer 42 | self._closed = True 43 | 44 | def _check_not_closed(self): 45 | if self._closed: 46 | raise RuntimeError("Writer already closed") 47 | 48 | def _write_row( 49 | self, 50 | np.ndarray[uint, ndim=1, mode='c'] fields, 51 | np.ndarray[uint, ndim=1, mode='c'] indices, 52 | np.ndarray[float, ndim=1, mode='c'] values, 53 | float y, 54 | int group 55 | ): 56 | self._writer.write_row(fields.shape[0], fields.data, indices.data, values.data, y, group) 57 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | name='batch-learn', 6 | version='0.1.0', 7 | packages=find_packages(exclude=['contrib', 'docs', 'tests']), 8 | ext_modules=cythonize( 9 | "batch_learn/writer.pyx", 10 | #sources=["Rectangle.cpp"], # additional source file(s) 11 | language="c++", 12 | ) 13 | ) 14 | -------------------------------------------------------------------------------- /src/batch-learn.cpp: -------------------------------------------------------------------------------- 1 | #include "commands/convert.hpp" 2 | #include "commands/ffm.hpp" 3 | #include "commands/nn.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | static std::unordered_map> commands; 12 | 13 | 14 | class help_command : public command { 15 | protected: 16 | std::string command_name; 17 | public: 18 | help_command() { 19 | using namespace boost::program_options; 20 | 21 | options_desc.add_options() 22 | ("command", value(&command_name), "command name to get help about"); 23 | 24 | positional_options_desc.add("command", -1); 25 | } 26 | 27 | virtual std::string name() { return "help"; } 28 | virtual std::string description() { return "get the help"; } 29 | 30 | virtual int run() { 31 | using namespace std; 32 | 33 | if (commands.count(command_name) == 0) { 34 | cout << "Supported commands:" << endl; 35 | 36 | for (auto it = commands.begin(); it != commands.end(); ++ it) { 37 | cout << " "; 38 | cout.width(10); 39 | cout << left << it->first << it->second->description() << endl; 40 | } 41 | } else { 42 | commands[command_name]->print_help(); 43 | } 44 | 45 | return 0; 46 | } 47 | }; 48 | 49 | 50 | 51 | int main(int ac, char* av[]) { 52 | using namespace std; 53 | 54 | // Prepare commands 55 | commands.insert(make_pair("help", unique_ptr(new help_command()))); 56 | commands.insert(make_pair("convert", unique_ptr(new convert_command()))); 57 | commands.insert(make_pair("ffm", unique_ptr(new ffm_command()))); 58 | commands.insert(make_pair("nn", unique_ptr(new nn_command()))); 59 | 60 | // Check if command specified 61 | if (ac <= 1) { 62 | cout << "No command specified" << endl; 63 | 64 | commands["help"]->run(); 65 | 66 | return -1; 67 | } 68 | 69 | // Extract command and check if it's supported 70 | string cmd_name(av[1]); 71 | 72 | if (commands.count(cmd_name) == 0) { 73 | cout << "Unknown command " << cmd_name << " specified" << endl; 74 | 75 | commands["help"]->run(); 76 | 77 | return -2; 78 | } 79 | 80 | auto & cmd = commands[cmd_name]; 81 | 82 | // Try to parse command options 83 | try { 84 | cmd->parse_options(ac - 1, av + 1); 85 | } catch (const std::exception & e) { 86 | cout << "Error: " << e.what() << endl; 87 | cmd->print_help(); 88 | return -3; 89 | } 90 | 91 | // Print help if required 92 | if (cmd->options_vm.count("help") > 0) { 93 | cmd->print_help(); 94 | return 0; 95 | } 96 | 97 | // Run command 98 | try { 99 | return cmd->run(); 100 | } catch (std::exception & ex) { 101 | cerr << endl << "Error: " << ex.what() << endl; 102 | return -4; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/commands/command.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | 8 | class command { 9 | protected: 10 | boost::program_options::options_description options_desc; 11 | boost::program_options::positional_options_description positional_options_desc; 12 | public: 13 | boost::program_options::variables_map options_vm; 14 | public: 15 | command() { 16 | options_desc.add_options()("help", "show command help"); 17 | } 18 | 19 | virtual ~command() {} 20 | 21 | virtual std::string name() = 0; 22 | virtual std::string description() = 0; 23 | 24 | virtual void parse_options(int ac, char * av[]) { 25 | using namespace boost::program_options; 26 | 27 | store(command_line_parser(ac, av).options(options_desc).positional(positional_options_desc).run(), options_vm); 28 | notify(options_vm); 29 | } 30 | 31 | virtual void print_help() { 32 | std::cout << "Supported command options:" << std::endl; 33 | std::cout << options_desc << std::endl; 34 | } 35 | 36 | virtual int run() = 0; 37 | }; 38 | -------------------------------------------------------------------------------- /src/commands/convert.cpp: -------------------------------------------------------------------------------- 1 | #include "convert.hpp" 2 | 3 | #include 4 | 5 | #include 6 | 7 | #include 8 | 9 | constexpr uint max_line_size = 100000; 10 | 11 | inline uint32_t h(uint32_t x) { 12 | x = ((x >> 16) ^ x) * 0x45d9f3b; 13 | x = ((x >> 16) ^ x) * 0x45d9f3b; 14 | x = (x >> 16) ^ x; 15 | return x; 16 | } 17 | 18 | int convert_command::run() { 19 | if (input_format_name == std::string("ffm")) { 20 | convert_from_ffm(); 21 | } else { 22 | std::cout << "Error: unknown input format, supported formats: ffm" << std::endl; 23 | return -1; 24 | } 25 | 26 | return 0; 27 | } 28 | 29 | void convert_command::convert_from_ffm() { 30 | using namespace std; 31 | using namespace batch_learn; 32 | using boost::format; 33 | 34 | cout << "Converting " << input_file_name << " to " << output_file_name << " using " << index_bits << " index bits... "; 35 | cout.flush(); 36 | 37 | FILE * input_file = fopen(input_file_name.c_str(), "r"); 38 | if (input_file == nullptr) 39 | throw runtime_error("Error opening input file"); 40 | 41 | file_index output_index; 42 | output_index.n_examples = 0; 43 | output_index.n_fields = 0; 44 | output_index.n_indices = 0; 45 | output_index.n_index_bits = index_bits; 46 | output_index.offsets.push_back(0); 47 | 48 | stream_data_writer output_data_writer(output_file_name + ".data"); 49 | vector features; 50 | char line[max_line_size]; 51 | 52 | uint64_t i = 0; 53 | while (fgets(line, max_line_size, input_file) != nullptr) { 54 | features.clear(); 55 | 56 | ++ i; 57 | 58 | char *y_char = strtok(line, " \t"); 59 | float y = (atoi(y_char) > 0) ? 1.0f : -1.0f; 60 | 61 | while (true) { 62 | char * feature_char = strtok(nullptr, " \t"); 63 | 64 | if (feature_char == nullptr || *feature_char == '\n') 65 | break; 66 | 67 | 68 | char * index_delim = strpbrk(feature_char, ":"); 69 | 70 | if (index_delim == nullptr) 71 | throw std::runtime_error(str(format("Invalid feature spec '%s' at line %d") % feature_char % i)); 72 | 73 | char * value_delim = strpbrk(index_delim + 1, ":"); 74 | 75 | if (value_delim == nullptr) 76 | throw std::runtime_error(str(format("Invalid feature spec '%s' at line %d") % feature_char % i)); 77 | 78 | if (feature_char[0] == ':' || feature_char[0] == 0) 79 | throw std::runtime_error(str(format("Empty field in '%s' at line %d") % feature_char % i)); 80 | 81 | if (index_delim[1] == ':' || index_delim[1] == 0) 82 | throw std::runtime_error(str(format("Empty index in '%s' at line %d") % feature_char % i)); 83 | 84 | if (value_delim[1] == 0) 85 | throw std::runtime_error(str(format("Empty value in '%s' at line %d") % feature_char % i)); 86 | 87 | index_delim[0] = 0; 88 | value_delim[0] = 0; 89 | 90 | uint field = atoi(feature_char); 91 | uint index = atoi(index_delim + 1); 92 | float value = atof(value_delim + 1); 93 | 94 | if (rehash_indexes > 0) 95 | index = h(index) % rehash_indexes; 96 | 97 | if (field >= output_index.n_fields) 98 | output_index.n_fields = field + 1; 99 | 100 | if (index >= output_index.n_indices) 101 | output_index.n_indices = index + 1; 102 | 103 | feature f; 104 | f.index = (field << index_bits) | index; 105 | f.value = value; 106 | 107 | features.push_back(f); 108 | } 109 | 110 | output_index.n_examples ++; 111 | output_index.labels.push_back(y); 112 | output_index.groups.push_back(0); // No group support in ffm format 113 | output_index.offsets.push_back(output_data_writer.write(features)); 114 | 115 | if (output_index.n_examples % progress_step == 0) { 116 | uint progress = output_index.n_examples; 117 | std::string unit; 118 | 119 | if (progress_step % 1000000 == 0) { 120 | progress /= 1000000; 121 | unit = "M"; 122 | } else if (progress_step % 1000 == 0) { 123 | progress /= 1000; 124 | unit = "K"; 125 | } 126 | 127 | cout << progress << unit << "... "; 128 | cout.flush(); 129 | } 130 | } 131 | 132 | fclose(input_file); 133 | 134 | write_index(output_file_name + ".index", output_index); 135 | 136 | cout << "Done." << endl; 137 | } 138 | -------------------------------------------------------------------------------- /src/commands/convert.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "command.hpp" 4 | 5 | 6 | class convert_command : public command { 7 | protected: 8 | std::string input_file_name, output_file_name, input_format_name; 9 | uint index_bits, progress_step, rehash_indexes; 10 | public: 11 | convert_command(): rehash_indexes(0) { 12 | using namespace boost::program_options; 13 | 14 | options_desc.add_options() 15 | ("bits,b", value(&index_bits)->default_value(24), "number of bits to store feature indices") 16 | ("rehash", value(&rehash_indexes), "rehash feature indices to given max") 17 | ("progress,p", value(&progress_step)->default_value(1000000), "print progress every N examples") 18 | ("format,f", value(&input_format_name)->required(), "input format name (only ffm supported for now)") 19 | ("input-file,I", value(&input_file_name)->required(), "input file name") 20 | ("output-file,O", value(&output_file_name)->required(), "output file name"); 21 | 22 | positional_options_desc.add("input-file", 1).add("output-file", 1); 23 | } 24 | 25 | virtual std::string name() { return "convert"; } 26 | virtual std::string description() { return "convert file to batch-learn binary format"; } 27 | 28 | virtual int run(); 29 | private: 30 | void convert_from_ffm(); 31 | }; 32 | -------------------------------------------------------------------------------- /src/commands/ffm.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "model.hpp" 4 | #include "../models/ffm.hpp" 5 | 6 | 7 | class ffm_command : public model_command { 8 | protected: 9 | uint n_dim; 10 | float eta, lambda; 11 | bool dropout; 12 | public: 13 | ffm_command() { 14 | using namespace boost::program_options; 15 | 16 | options_desc.add_options() 17 | ("dim,k", value(&n_dim)->default_value(4), "dimensions") 18 | ("eta", value(&eta)->default_value(0.2), "learning rate") 19 | ("lambda", value(&lambda)->default_value(0.00002), "l2 regularization coeff") 20 | ("dropout", value(&dropout)->default_value(true), "dropout"); 21 | } 22 | 23 | virtual std::string name() { return "ffm"; } 24 | virtual std::string description() { return "train and apply ffm model"; } 25 | 26 | virtual std::unique_ptr create_model(uint32_t n_fields, uint32_t n_indices, uint32_t n_index_bits) { 27 | return std::unique_ptr(new ffm_model(n_fields, n_indices, n_index_bits, n_dim, seed, eta, lambda, dropout)); 28 | } 29 | }; 30 | -------------------------------------------------------------------------------- /src/commands/model.cpp: -------------------------------------------------------------------------------- 1 | #include "model.hpp" 2 | 3 | #include "../util/dataset.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | // Batch configuration 17 | const uint32_t batch_size = 20000; 18 | const uint32_t mini_batch_size = 24; 19 | 20 | 21 | static std::default_random_engine rnd(2017); 22 | 23 | 24 | std::vector> generate_mini_batches(uint64_t begin, uint64_t end) { 25 | std::vector> batches; 26 | 27 | for (uint64_t mini_batch_start = begin; mini_batch_start < end; mini_batch_start += mini_batch_size) 28 | batches.push_back(std::make_pair(mini_batch_start, min(mini_batch_start + mini_batch_size, end))); 29 | 30 | return batches; 31 | } 32 | 33 | 34 | float compute_norm(batch_learn::feature * fa, batch_learn::feature * fb) { 35 | float norm = 0; 36 | 37 | for (batch_learn::feature * f = fa; f != fb; ++ f) 38 | norm += f->value * f->value; 39 | 40 | return norm; 41 | } 42 | 43 | 44 | double train_on_dataset(model & m, const batch_learn_dataset & dataset) { 45 | time_t start_time = time(nullptr); 46 | 47 | std::cout << " Training... "; 48 | std::cout.flush(); 49 | 50 | auto batches = dataset.generate_batches(batch_size); 51 | 52 | std::shuffle(batches.begin(), batches.end(), rnd); 53 | 54 | double loss = 0.0; 55 | uint64_t cnt = 0; 56 | 57 | // Iterate over batches, read each and then iterate over examples 58 | #pragma omp parallel for schedule(dynamic, 1) reduction(+: loss) reduction(+: cnt) 59 | for (uint64_t bi = 0; bi < batches.size(); ++ bi) { 60 | auto batch_start_index = batches[bi].first; 61 | auto batch_end_index = batches[bi].second; 62 | 63 | auto batch_start_offset = dataset.index.offsets[batch_start_index]; 64 | auto batch_end_offset = dataset.index.offsets[batch_end_index]; 65 | 66 | auto mini_batches = generate_mini_batches(batch_start_index, batch_end_index); 67 | 68 | std::vector batch_features = batch_learn::read_batch(dataset.data_file_name, batch_start_offset, batch_end_offset); 69 | batch_learn::feature * batch_features_data = batch_features.data(); 70 | 71 | std::shuffle(mini_batches.begin(), mini_batches.end(), rnd); 72 | 73 | for (auto mb = mini_batches.begin(); mb != mini_batches.end(); ++ mb) { 74 | for (auto ei = mb->first; ei < mb->second; ++ ei) { 75 | float y = dataset.index.labels[ei]; 76 | 77 | auto start_offset = dataset.index.offsets[ei] - batch_start_offset; 78 | auto end_offset = dataset.index.offsets[ei+1] - batch_start_offset; 79 | 80 | float norm = compute_norm(batch_features_data + start_offset, batch_features_data + end_offset); 81 | 82 | float t = m.predict(batch_features_data + start_offset, batch_features_data + end_offset, norm, true); 83 | float expnyt = exp(-y*t); 84 | 85 | m.update(batch_features_data + start_offset, batch_features_data + end_offset, norm, -y * expnyt / (1+expnyt)); 86 | 87 | loss += log(1+exp(-y*t)); 88 | } 89 | } 90 | 91 | cnt += batch_end_index - batch_start_index; 92 | } 93 | 94 | std::cout << cnt << " examples processed in " << (time(nullptr) - start_time) << " seconds, loss = " << std::fixed << std::setprecision(5) << (loss / cnt) << std::endl; 95 | 96 | return loss; 97 | } 98 | 99 | 100 | double evaluate_on_dataset(model & m, const batch_learn_dataset & dataset) { 101 | time_t start_time = time(nullptr); 102 | 103 | std::cout << " Evaluating... "; 104 | std::cout.flush(); 105 | 106 | auto batches = dataset.generate_batches(batch_size); 107 | 108 | double loss = 0.0; 109 | uint32_t cnt = 0; 110 | 111 | std::vector predictions(dataset.index.n_examples); 112 | 113 | // Iterate over batches, read each and then iterate over examples 114 | #pragma omp parallel for schedule(dynamic, 1) reduction(+: loss) reduction(+: cnt) 115 | for (uint32_t bi = 0; bi < batches.size(); ++ bi) { 116 | auto batch_start_index = batches[bi].first; 117 | auto batch_end_index = batches[bi].second; 118 | 119 | auto batch_start_offset = dataset.index.offsets[batch_start_index]; 120 | auto batch_end_offset = dataset.index.offsets[batch_end_index]; 121 | 122 | std::vector batch_features = batch_learn::read_batch(dataset.data_file_name, batch_start_offset, batch_end_offset); 123 | batch_learn::feature * batch_features_data = batch_features.data(); 124 | 125 | for (auto ei = batch_start_index; ei < batch_end_index; ++ ei) { 126 | float y = dataset.index.labels[ei]; 127 | 128 | auto start_offset = dataset.index.offsets[ei] - batch_start_offset; 129 | auto end_offset = dataset.index.offsets[ei+1] - batch_start_offset; 130 | 131 | float norm = compute_norm(batch_features_data + start_offset, batch_features_data + end_offset); 132 | float t = m.predict(batch_features_data + start_offset, batch_features_data + end_offset, norm, false); 133 | 134 | loss += log(1+exp(-y*t)); 135 | predictions[ei] = 1 / (1+exp(-t)); 136 | } 137 | 138 | cnt += batch_end_index - batch_start_index; 139 | } 140 | 141 | std::cout << cnt << " examples processed in " << (time(nullptr) - start_time) << " seconds, loss = " << std::fixed << std::setprecision(5) << (loss / cnt) << std::endl; 142 | 143 | return loss; 144 | } 145 | 146 | void predict_on_dataset(model & m, const batch_learn_dataset & dataset, std::ostream & out) { 147 | time_t start_time = time(nullptr); 148 | 149 | std::cout << " Predicting... "; 150 | std::cout.flush(); 151 | 152 | auto batches = dataset.generate_batches(batch_size); 153 | 154 | uint64_t cnt = 0; 155 | 156 | // Iterate over batches, read each and then iterate over examples 157 | for (uint64_t bi = 0; bi < batches.size(); ++ bi) { 158 | auto batch_start_index = batches[bi].first; 159 | auto batch_end_index = batches[bi].second; 160 | 161 | auto batch_start_offset = dataset.index.offsets[batch_start_index]; 162 | auto batch_end_offset = dataset.index.offsets[batch_end_index]; 163 | 164 | std::vector batch_features = batch_learn::read_batch(dataset.data_file_name, batch_start_offset, batch_end_offset); 165 | batch_learn::feature * batch_features_data = batch_features.data(); 166 | 167 | for (auto ei = batch_start_index; ei < batch_end_index; ++ ei) { 168 | auto start_offset = dataset.index.offsets[ei] - batch_start_offset; 169 | auto end_offset = dataset.index.offsets[ei+1] - batch_start_offset; 170 | 171 | float norm = compute_norm(batch_features_data + start_offset, batch_features_data + end_offset); 172 | float t = m.predict(batch_features_data + start_offset, batch_features_data + end_offset, norm, false); 173 | 174 | out << 1/(1+exp(-t)) << std::endl; 175 | } 176 | 177 | cnt += batch_end_index - batch_start_index; 178 | } 179 | 180 | std::cout << cnt << " examples processed in " << (time(nullptr) - start_time) << " seconds" << std::endl; 181 | } 182 | 183 | 184 | int model_command::run() { 185 | using namespace std; 186 | 187 | omp_set_num_threads(n_threads); 188 | rnd.seed(seed); 189 | 190 | auto ds_train = batch_learn_dataset(train_file_name); 191 | 192 | auto model = create_model(ds_train.index.n_fields, ds_train.index.n_indices, ds_train.index.n_index_bits); 193 | 194 | if (val_file_name.empty()) { // No validation set given, just train 195 | for (uint epoch = 0; epoch < n_epochs; ++ epoch) { 196 | cout << "Epoch " << epoch << "..." << endl; 197 | 198 | train_on_dataset(*model, ds_train); 199 | } 200 | } else { // Train with validation each epoch 201 | auto ds_val = batch_learn_dataset(val_file_name); 202 | 203 | if (ds_val.index.n_index_bits != ds_train.index.n_index_bits) 204 | throw std::runtime_error("Mismatching index bits in train and val"); 205 | 206 | for (uint epoch = 0; epoch < n_epochs; ++ epoch) { 207 | cout << "Epoch " << epoch << "..." << endl; 208 | 209 | train_on_dataset(*model, ds_train); 210 | evaluate_on_dataset(*model, ds_val); 211 | } 212 | } 213 | 214 | // Predict on test if given 215 | if (!test_file_name.empty() && !pred_file_name.empty()) { 216 | auto ds_test = batch_learn_dataset(test_file_name); 217 | 218 | if (ds_test.index.n_index_bits != ds_train.index.n_index_bits) 219 | throw std::runtime_error("Mismatching index bits in train and test"); 220 | 221 | ofstream out(pred_file_name); 222 | predict_on_dataset(*model, ds_test, out); 223 | } 224 | 225 | return 0; 226 | } 227 | -------------------------------------------------------------------------------- /src/commands/model.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "command.hpp" 4 | #include "../models/model.hpp" 5 | 6 | 7 | class model_command : public command { 8 | protected: 9 | std::string train_file_name, val_file_name, test_file_name, pred_file_name; 10 | uint n_epochs, n_threads, seed; 11 | public: 12 | model_command(): seed(0) { 13 | using namespace boost::program_options; 14 | 15 | options_desc.add_options() 16 | ("train", value(&train_file_name)->required(), "train dataset file") 17 | ("val", value(&val_file_name), "validation dataset file") 18 | ("test", value(&test_file_name), "test dataset file") 19 | ("pred", value(&pred_file_name), "file to save predictions") 20 | ("seed,s", value(&seed), "random seed") 21 | ("epochs", value(&n_epochs)->default_value(10), "number of epochs") 22 | ("threads,t", value(&n_threads)->default_value(4), "number of threads"); 23 | 24 | positional_options_desc.add("test", 1).add("pred", 1); 25 | } 26 | 27 | virtual int run(); 28 | virtual std::unique_ptr create_model(uint32_t n_fields, uint32_t n_indices, uint32_t n_index_bits) = 0; 29 | }; 30 | -------------------------------------------------------------------------------- /src/commands/nn.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "model.hpp" 4 | #include "../models/nn.hpp" 5 | 6 | 7 | class nn_command : public model_command { 8 | protected: 9 | float eta, lambda; 10 | public: 11 | nn_command() { 12 | using namespace boost::program_options; 13 | 14 | options_desc.add_options() 15 | ("eta", value(&eta)->default_value(0.02), "learning rate") 16 | ("lambda", value(&lambda)->default_value(0.00002), "l2 regularization coeff"); 17 | } 18 | 19 | virtual std::string name() { return "nn"; } 20 | virtual std::string description() { return "train and apply nn model"; } 21 | 22 | virtual std::unique_ptr create_model(uint32_t n_fields, uint32_t n_indices, uint32_t n_index_bits) { 23 | return std::unique_ptr(new nn_model(n_indices, n_index_bits, seed, eta, lambda)); 24 | } 25 | }; 26 | -------------------------------------------------------------------------------- /src/models/ffm.cpp: -------------------------------------------------------------------------------- 1 | #include "ffm.hpp" 2 | 3 | #include "../util/model.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | 13 | class state { 14 | public: 15 | std::vector dropout_mask; 16 | float dropout_mult; 17 | public: 18 | void init_train_dropout_mask(int len) { 19 | dropout_mult = 2.0; 20 | dropout_mask.resize((len + 63) / 64); 21 | 22 | for (uint64_t * p = dropout_mask.data(); p != dropout_mask.data() + dropout_mask.size(); ++ p) 23 | if (_rdrand64_step((unsigned long long *)p) != 1) 24 | throw std::runtime_error("Error generating random number!"); 25 | 26 | } 27 | 28 | void init_test_dropout_mask(int len) { 29 | dropout_mult = 1.0; 30 | dropout_mask.resize((len + 63) / 64); 31 | 32 | memset(dropout_mask.data(), 0xFF, dropout_mask.size() * sizeof(uint64_t)); 33 | } 34 | }; 35 | 36 | static thread_local state local_state; 37 | 38 | 39 | template 40 | static void init_ffm_weights(float * weights, uint64_t n, uint32_t n_dim, uint32_t n_dim_aligned, D gen, std::default_random_engine & rnd) { 41 | float * w = weights; 42 | 43 | for(uint64_t i = 0; i < n; i++) { 44 | for (uint d = 0; d < n_dim; d++, w++) 45 | *w = gen(rnd); 46 | 47 | for (uint d = n_dim; d < n_dim_aligned; d++, w++) 48 | *w = 0; 49 | 50 | for (uint d = n_dim_aligned; d < 2*n_dim_aligned; d++, w++) 51 | *w = 1; 52 | } 53 | } 54 | 55 | 56 | static void init_lin_weights(float * weights, uint64_t n) { 57 | float * w = weights; 58 | 59 | for(uint64_t i = 0; i < n; i++) { 60 | *w++ = 0; 61 | *w++ = 1; 62 | } 63 | } 64 | 65 | 66 | ffm_model::ffm_model(uint32_t n_fields, uint32_t n_indices, uint32_t n_index_bits, uint32_t n_dim, int seed, float eta, float lambda, bool dropout) { 67 | this->n_fields = n_fields; 68 | this->n_indices = n_indices; 69 | this->n_index_bits = n_index_bits; 70 | this->n_dim = n_dim; 71 | this->eta = eta; 72 | this->lambda = lambda; 73 | this->dropout = dropout; 74 | 75 | n_dim_aligned = ((n_dim - 1) / align_floats + 1) * align_floats; 76 | 77 | index_stride = n_fields * n_dim_aligned * 2; 78 | field_stride = n_dim_aligned * 2; 79 | index_mask = (1ul << n_index_bits) - 1; 80 | 81 | std::default_random_engine rnd(seed); 82 | 83 | bias_w = 0; 84 | bias_wg = 1; 85 | 86 | try { 87 | uint64_t total_weights = size_t(n_indices) * n_fields * n_dim_aligned * 2 + n_indices * 2; 88 | 89 | std::cout << "Indices: " << n_indices << ", fields: " << n_fields << std::endl; 90 | std::cout << "Allocating " << (total_weights * sizeof(float) / 1024 / 1024) << " MB memory for model weights... "; 91 | std::cout.flush(); 92 | 93 | ffm_weights = malloc_aligned(uint64_t(n_indices) * n_fields * n_dim_aligned * 2); 94 | lin_weights = malloc_aligned(n_indices * 2); 95 | 96 | std::cout << "done." << std::endl; 97 | } catch (std::bad_alloc & e) { 98 | throw std::runtime_error("Can't allocate weights memory"); 99 | } 100 | 101 | std::cout << "Initializing weights... "; 102 | std::cout.flush(); 103 | 104 | init_ffm_weights(ffm_weights, size_t(n_indices) * n_fields, n_dim, n_dim_aligned, std::uniform_real_distribution(0.0, 1.0/sqrt(n_dim)), rnd); 105 | init_lin_weights(lin_weights, n_indices); 106 | 107 | std::cout << "done." << std::endl; 108 | } 109 | 110 | 111 | ffm_model::~ffm_model() { 112 | free(ffm_weights); 113 | free(lin_weights); 114 | } 115 | 116 | 117 | float ffm_model::predict(const batch_learn::feature * start, const batch_learn::feature * end, float norm, bool train) { 118 | uint feature_count = end - start; 119 | uint interaction_count = feature_count * (feature_count + 1) / 2; 120 | 121 | if (train && dropout) 122 | local_state.init_train_dropout_mask(interaction_count); 123 | else 124 | local_state.init_test_dropout_mask(interaction_count); 125 | 126 | auto dropout_mask = local_state.dropout_mask.data(); 127 | float dropout_mult = local_state.dropout_mult; 128 | 129 | 130 | float linear_total = bias_w; 131 | float linear_norm = end - start; 132 | 133 | __m256 xmm_total = _mm256_set1_ps(0); 134 | 135 | uint i = 0; 136 | 137 | for (const batch_learn::feature * fa = start; fa != end; ++ fa) { 138 | uint index_a = fa->index & index_mask; 139 | uint field_a = fa->index >> n_index_bits; 140 | float value_a = fa->value; 141 | 142 | // Check index/field bounds 143 | if (index_a >= n_indices || field_a >= n_fields) 144 | continue; 145 | 146 | linear_total += value_a * lin_weights[index_a*2] / linear_norm; 147 | 148 | for (const batch_learn::feature * fb = start; fb != fa; ++ fb, ++ i) { 149 | uint index_b = fb->index & index_mask; 150 | uint field_b = fb->index >> n_index_bits; 151 | float value_b = fb->value; 152 | 153 | // Check index/field bounds 154 | if (index_b >= n_indices || field_b >= n_fields) 155 | continue; 156 | 157 | if (test_mask_bit(dropout_mask, i) == 0) 158 | continue; 159 | 160 | float * wa = ffm_weights + index_a * index_stride + field_b * field_stride; 161 | float * wb = ffm_weights + index_b * index_stride + field_a * field_stride; 162 | 163 | __m256 xmm_val = _mm256_set1_ps(dropout_mult * value_a * value_b / norm); 164 | 165 | for(uint d = 0; d < n_dim; d += 8) { 166 | __m256 xmm_wa = _mm256_load_ps(wa + d); 167 | __m256 xmm_wb = _mm256_load_ps(wb + d); 168 | 169 | xmm_total = _mm256_add_ps(xmm_total, _mm256_mul_ps(_mm256_mul_ps(xmm_wa, xmm_wb), xmm_val)); 170 | } 171 | } 172 | } 173 | 174 | return sum(xmm_total) + linear_total; 175 | } 176 | 177 | 178 | void ffm_model::update(const batch_learn::feature * start, const batch_learn::feature * end, float norm, float kappa) { 179 | auto dropout_mask = local_state.dropout_mask.data(); 180 | float dropout_mult = local_state.dropout_mult; 181 | 182 | float linear_norm = end - start; 183 | 184 | __m256 xmm_eta = _mm256_set1_ps(eta); 185 | __m256 xmm_lambda = _mm256_set1_ps(lambda); 186 | 187 | uint i = 0; 188 | 189 | for (const batch_learn::feature * fa = start; fa != end; ++ fa) { 190 | uint index_a = fa->index & index_mask; 191 | uint field_a = fa->index >> n_index_bits; 192 | float value_a = fa->value; 193 | 194 | // Check index/field bounds 195 | if (index_a >= n_indices || field_a >= n_fields) 196 | continue; 197 | 198 | float g = lambda * lin_weights[index_a*2] + kappa * value_a / linear_norm; 199 | float wg = lin_weights[index_a*2 + 1] + g*g; 200 | 201 | lin_weights[index_a*2] -= eta * g / sqrt(wg); 202 | lin_weights[index_a*2 + 1] = wg; 203 | 204 | for (const batch_learn::feature * fb = start; fb != fa; ++ fb, ++ i) { 205 | uint index_b = fb->index & index_mask; 206 | uint field_b = fb->index >> n_index_bits; 207 | float value_b = fb->value; 208 | 209 | // Check index/field bounds 210 | if (index_b >= n_indices || field_b >= n_fields) 211 | continue; 212 | 213 | if (test_mask_bit(dropout_mask, i) == 0) 214 | continue; 215 | 216 | float * wa = ffm_weights + index_a * index_stride + field_b * field_stride; 217 | float * wb = ffm_weights + index_b * index_stride + field_a * field_stride; 218 | 219 | float * wga = wa + n_dim_aligned; 220 | float * wgb = wb + n_dim_aligned; 221 | 222 | __m256 xmm_kappa_val = _mm256_set1_ps(kappa * dropout_mult * value_a * value_b / norm); 223 | 224 | for(uint d = 0; d < n_dim; d += 8) { 225 | // Load weights 226 | __m256 xmm_wa = _mm256_load_ps(wa + d); 227 | __m256 xmm_wb = _mm256_load_ps(wb + d); 228 | 229 | __m256 xmm_wga = _mm256_load_ps(wga + d); 230 | __m256 xmm_wgb = _mm256_load_ps(wgb + d); 231 | 232 | // Compute gradient values 233 | __m256 xmm_ga = _mm256_add_ps(_mm256_mul_ps(xmm_lambda, xmm_wa), _mm256_mul_ps(xmm_kappa_val, xmm_wb)); 234 | __m256 xmm_gb = _mm256_add_ps(_mm256_mul_ps(xmm_lambda, xmm_wb), _mm256_mul_ps(xmm_kappa_val, xmm_wa)); 235 | 236 | // Update weights 237 | xmm_wga = _mm256_add_ps(xmm_wga, _mm256_mul_ps(xmm_ga, xmm_ga)); 238 | xmm_wgb = _mm256_add_ps(xmm_wgb, _mm256_mul_ps(xmm_gb, xmm_gb)); 239 | 240 | xmm_wa = _mm256_sub_ps(xmm_wa, _mm256_mul_ps(xmm_eta, _mm256_mul_ps(_mm256_rsqrt_ps(xmm_wga), xmm_ga))); 241 | xmm_wb = _mm256_sub_ps(xmm_wb, _mm256_mul_ps(xmm_eta, _mm256_mul_ps(_mm256_rsqrt_ps(xmm_wgb), xmm_gb))); 242 | 243 | // Store weights 244 | _mm256_store_ps(wa + d, xmm_wa); 245 | _mm256_store_ps(wb + d, xmm_wb); 246 | 247 | _mm256_store_ps(wga + d, xmm_wga); 248 | _mm256_store_ps(wgb + d, xmm_wgb); 249 | } 250 | } 251 | } 252 | 253 | // Update bias 254 | bias_wg += kappa*kappa; 255 | bias_w -= eta * kappa / sqrt(bias_wg); 256 | } 257 | -------------------------------------------------------------------------------- /src/models/ffm.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "model.hpp" 4 | 5 | 6 | class ffm_model : public model { 7 | uint32_t n_fields, n_indices, n_index_bits, n_dim; 8 | 9 | uint32_t n_dim_aligned, index_stride, field_stride, index_mask; 10 | 11 | float * ffm_weights; 12 | float * lin_weights; 13 | 14 | float bias_w; 15 | float bias_wg; 16 | 17 | float eta; 18 | float lambda; 19 | bool dropout; 20 | public: 21 | ffm_model(uint32_t n_fields, uint32_t n_indices, uint32_t n_index_bits, uint32_t n_dim, int seed, float eta, float lambda, bool dropout); 22 | virtual ~ffm_model(); 23 | 24 | virtual float predict(const batch_learn::feature * start, const batch_learn::feature * end, float norm, bool train); 25 | virtual void update(const batch_learn::feature * start, const batch_learn::feature * end, float norm, float kappa); 26 | }; 27 | -------------------------------------------------------------------------------- /src/models/model.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | class model { 6 | public: 7 | model() {} 8 | virtual ~model() {} 9 | 10 | virtual float predict(const batch_learn::feature * start, const batch_learn::feature * end, float norm, bool train) = 0; 11 | virtual void update(const batch_learn::feature * start, const batch_learn::feature * end, float norm, float kappa) = 0; 12 | }; 13 | -------------------------------------------------------------------------------- /src/models/nn.cpp: -------------------------------------------------------------------------------- 1 | #include "nn.hpp" 2 | 3 | #include "../util/model.hpp" 4 | #include "../util/nn.hpp" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | constexpr uint l0_output_size = aligned_float_array_size(96); 13 | constexpr uint l1_output_size = aligned_float_array_size(64); 14 | constexpr uint l2_output_size = aligned_float_array_size(48); 15 | 16 | constexpr uint l1_layer_size = l0_output_size * (l1_output_size - 1); 17 | constexpr uint l2_layer_size = l1_output_size * (l2_output_size - 1); 18 | constexpr uint l3_layer_size = l2_output_size; 19 | 20 | 21 | 22 | class state_buffer { 23 | public: 24 | float * l0_output; 25 | float * l0_output_grad; 26 | float * l0_dropout_mask; 27 | 28 | float * l1_output; 29 | float * l1_output_grad; 30 | float * l1_dropout_mask; 31 | 32 | float * l2_output; 33 | float * l2_output_grad; 34 | float * l2_dropout_mask; 35 | 36 | std::default_random_engine gen; 37 | public: 38 | state_buffer() { 39 | l0_output = malloc_aligned(l0_output_size); 40 | l0_output_grad = malloc_aligned(l0_output_size); 41 | l0_dropout_mask = malloc_aligned(l0_output_size); 42 | 43 | l1_output = malloc_aligned(l1_output_size); 44 | l1_output_grad = malloc_aligned(l1_output_size); 45 | l1_dropout_mask = malloc_aligned(l1_output_size); 46 | 47 | l2_output = malloc_aligned(l2_output_size); 48 | l2_output_grad = malloc_aligned(l2_output_size); 49 | l2_dropout_mask = malloc_aligned(l2_output_size); 50 | } 51 | 52 | ~state_buffer() { 53 | free(l0_output); 54 | free(l0_output_grad); 55 | free(l0_dropout_mask); 56 | 57 | free(l1_output); 58 | free(l1_output_grad); 59 | free(l1_dropout_mask); 60 | 61 | free(l2_output); 62 | free(l2_output_grad); 63 | free(l2_dropout_mask); 64 | } 65 | }; 66 | 67 | static thread_local state_buffer local_state_buffer; 68 | 69 | 70 | nn_model::nn_model(uint32_t n_indices, uint32_t n_index_bits, int seed, float eta, float lambda) { 71 | this->n_indices = n_indices; 72 | this->n_index_bits = n_index_bits; 73 | this->eta = eta; 74 | this->lambda = lambda; 75 | 76 | index_mask = (1u << n_index_bits) - 1; 77 | 78 | std::default_random_engine rnd(seed); 79 | 80 | lin_w = malloc_aligned(n_indices * l0_output_size); 81 | lin_wg = malloc_aligned(n_indices * l0_output_size); 82 | 83 | l1_w = malloc_aligned(l1_layer_size); 84 | l1_wg = malloc_aligned(l1_layer_size); 85 | 86 | l2_w = malloc_aligned(l2_layer_size); 87 | l2_wg = malloc_aligned(l2_layer_size); 88 | 89 | l3_w = malloc_aligned(l3_layer_size); 90 | l3_wg = malloc_aligned(l3_layer_size); 91 | 92 | fill_with_rand(lin_w, n_indices * l0_output_size, std::uniform_real_distribution(-0.1, 0.1), rnd); 93 | fill_with_ones(lin_wg, n_indices * l0_output_size); 94 | 95 | fill_with_rand(l1_w, l1_layer_size, std::normal_distribution(0, 2/sqrt(l0_output_size)), rnd); 96 | fill_with_ones(l1_wg, l1_layer_size); 97 | 98 | fill_with_rand(l2_w, l2_layer_size, std::normal_distribution(0, 2/sqrt(l1_output_size)), rnd); 99 | fill_with_ones(l2_wg, l2_layer_size); 100 | 101 | fill_with_rand(l3_w, l3_layer_size, std::normal_distribution(0, 2/sqrt(l2_output_size)), rnd); 102 | fill_with_ones(l3_wg, l3_layer_size); 103 | } 104 | 105 | 106 | nn_model::~nn_model() { 107 | free(lin_w); 108 | free(lin_wg); 109 | 110 | free(l1_w); 111 | free(l1_wg); 112 | 113 | free(l2_w); 114 | free(l2_wg); 115 | } 116 | 117 | 118 | float nn_model::predict(const batch_learn::feature * start, const batch_learn::feature * end, float norm, bool train) { 119 | float linear_norm = end - start; 120 | state_buffer & buf = local_state_buffer; 121 | 122 | float * l0_output = buf.l0_output; 123 | float * l0_dropout_mask = buf.l0_dropout_mask; 124 | 125 | float * l1_output = buf.l1_output; 126 | float * l1_dropout_mask = buf.l1_dropout_mask; 127 | 128 | float * l2_output = buf.l2_output; 129 | float * l2_dropout_mask = buf.l2_dropout_mask; 130 | 131 | auto & gen = buf.gen; 132 | 133 | std::uniform_real_distribution dropout_distr(0, 1); 134 | 135 | if (train) { // Apply dropout only in train 136 | float l0_dropout_prob = 0;//0.02; 137 | float l1_dropout_prob = 0;//0.02; 138 | float l2_dropout_prob = 0;//0.02; 139 | 140 | float l0_dropout_scale = 1 / (1 - l0_dropout_prob); 141 | float l1_dropout_scale = 1 / (1 - l1_dropout_prob); 142 | float l2_dropout_scale = 1 / (1 - l2_dropout_prob); 143 | 144 | // Prepare dropout masks 145 | l0_dropout_mask[0] = 1.0; // No dropout on bias 146 | for (uint j = 1; j < l0_output_size; ++ j) 147 | l0_dropout_mask[j] = (dropout_distr(gen) >= l0_dropout_prob) * l0_dropout_scale; 148 | 149 | l1_dropout_mask[0] = 1.0; // No dropout on bias 150 | for (uint j = 1; j < l1_output_size; ++ j) 151 | l1_dropout_mask[j] = (dropout_distr(gen) >= l1_dropout_prob) * l1_dropout_scale; 152 | 153 | l2_dropout_mask[0] = 1.0; // No dropout on bias 154 | for (uint j = 1; j < l2_output_size; ++ j) 155 | l2_dropout_mask[j] = (dropout_distr(gen) >= l2_dropout_prob) * l2_dropout_scale; 156 | } else { 157 | fill_with_ones(l0_dropout_mask, l0_output_size); 158 | fill_with_ones(l1_dropout_mask, l1_output_size); 159 | fill_with_ones(l2_dropout_mask, l2_output_size); 160 | } 161 | 162 | // Compute activations 163 | 164 | fill_with_zero(l0_output, l0_output_size); 165 | fill_with_zero(l1_output, l1_output_size); 166 | fill_with_zero(l2_output, l2_output_size); 167 | 168 | for (const batch_learn::feature * fa = start; fa != end; ++ fa) { 169 | uint index = fa->index & index_mask; 170 | float value = fa->value; 171 | 172 | // Check index bounds 173 | if (index >= n_indices) 174 | continue; 175 | 176 | float * wl = lin_w + index * l0_output_size; 177 | 178 | __m256 ymm_val = _mm256_set1_ps(value / linear_norm); 179 | for(uint d = 0; d < l0_output_size; d += 8) { 180 | _mm256_store_ps(l0_output + d, _mm256_load_ps(l0_output + d) + _mm256_load_ps(wl + d) * ymm_val); 181 | } 182 | } 183 | 184 | l0_output[0] = 1.0; // Layer 0 bias, here we rewritre some computation results, but who cares 185 | l1_output[0] = 1.0; // Layer 1 bias 186 | l2_output[0] = 1.0; // Layer 2 bias 187 | 188 | // Layer 0 relu 189 | for (uint j = 1; j < l0_output_size; ++ j) 190 | l0_output[j] = relu(l0_output[j]) * l0_dropout_mask[j]; 191 | 192 | // Layer 1 forward pass 193 | for (uint j = 1; j < l1_output_size; ++ j) 194 | l1_output[j] = relu(forward_pass(l0_output_size, l0_output, l1_w + (j - 1) * l0_output_size)) * l1_dropout_mask[j]; 195 | 196 | // Layer 2 forward pass 197 | for (uint j = 1; j < l2_output_size; ++ j) 198 | l2_output[j] = relu(forward_pass(l1_output_size, l1_output, l2_w + (j - 1) * l1_output_size)) * l2_dropout_mask[j]; 199 | 200 | // Layer 3 forward pass 201 | return forward_pass(l2_output_size, l2_output, l3_w); 202 | } 203 | 204 | 205 | void nn_model::update(const batch_learn::feature * start, const batch_learn::feature * end, float norm, float kappa) { 206 | float linear_norm = end - start; 207 | state_buffer & buf = local_state_buffer; 208 | 209 | float * l0_output = buf.l0_output; 210 | float * l0_output_grad = buf.l0_output_grad; 211 | float * l0_dropout_mask = buf.l0_dropout_mask; 212 | 213 | float * l1_output = buf.l1_output; 214 | float * l1_output_grad = buf.l1_output_grad; 215 | float * l1_dropout_mask = buf.l1_dropout_mask; 216 | 217 | float * l2_output = buf.l2_output; 218 | float * l2_output_grad = buf.l2_output_grad; 219 | float * l2_dropout_mask = buf.l2_dropout_mask; 220 | 221 | fill_with_zero(l0_output_grad, l0_output_size); 222 | fill_with_zero(l1_output_grad, l1_output_size); 223 | fill_with_zero(l2_output_grad, l2_output_size); 224 | 225 | backward_pass(l2_output_size, l2_output, l2_output_grad, l3_w, l3_wg, kappa, eta, lambda); 226 | 227 | // Backprop layer 2 228 | for (uint j = 1, ofs = 0; j < l2_output_size; ++ j, ofs += l1_output_size) { 229 | float l2_grad = l2_output_grad[j] * l2_dropout_mask[j]; 230 | 231 | if (l2_output[j] <= 0) // Relu activation: grad in negative part is zero 232 | l2_grad = 0; 233 | 234 | backward_pass(l1_output_size, l1_output, l1_output_grad, l2_w + ofs, l2_wg + ofs, l2_grad, eta, lambda); 235 | } 236 | 237 | // Backprop layer 1 238 | for (uint j = 1, ofs = 0; j < l1_output_size; ++ j, ofs += l0_output_size) { 239 | float l1_grad = l1_output_grad[j] * l1_dropout_mask[j]; 240 | 241 | if (l1_output[j] <= 0) // Relu activation: grad in negative part is zero 242 | l1_grad = 0; 243 | 244 | backward_pass(l0_output_size, l0_output, l0_output_grad, l1_w + ofs, l1_wg + ofs, l1_grad, eta, lambda); 245 | } 246 | 247 | // Backprop layer 0 248 | l0_output_grad[0] = 0; 249 | for (uint j = 1; j < l0_output_size; ++ j) { 250 | float l0_grad = l0_output_grad[j] * l0_dropout_mask[j]; 251 | 252 | if (l0_output[j] <= 0) // Relu activation: grad in negative part is zero 253 | l0_grad = 0; 254 | 255 | l0_output_grad[j] = l0_grad; 256 | } 257 | 258 | // Update linear and interaction weights 259 | __m256 ymm_eta = _mm256_set1_ps(eta); 260 | __m256 ymm_lambda = _mm256_set1_ps(lambda); 261 | 262 | for (const batch_learn::feature * fa = start; fa != end; ++ fa) { 263 | uint index = fa->index & index_mask; 264 | float value = fa->value; 265 | 266 | // Check index bounds 267 | if (index >= n_indices) 268 | continue; 269 | 270 | float * wl = lin_w + index * l0_output_size; 271 | float * wgl = lin_wg + index * l0_output_size; 272 | 273 | __m256 ymm_val = _mm256_set1_ps(value / linear_norm); 274 | 275 | for (uint d = 0; d < l0_output_size; d += 8) { 276 | __m256 ymm_kappa_val = _mm256_load_ps(l0_output_grad + d) * ymm_val; 277 | 278 | // Load weights 279 | __m256 ymm_wl = _mm256_load_ps(wl + d); 280 | __m256 ymm_wgl = _mm256_load_ps(wgl + d); 281 | 282 | // Compute gradient values 283 | __m256 ymm_g = ymm_lambda * ymm_wl + ymm_kappa_val; 284 | 285 | // Update weights 286 | ymm_wgl = ymm_wgl + ymm_g * ymm_g; 287 | ymm_wl = ymm_wl - ymm_eta * ymm_g * _mm256_rsqrt_ps(ymm_wgl); 288 | 289 | // Store weights 290 | _mm256_store_ps(wl + d, ymm_wl); 291 | _mm256_store_ps(wgl + d, ymm_wgl); 292 | } 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /src/models/nn.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "model.hpp" 4 | 5 | 6 | class nn_model : public model { 7 | float * lin_w; 8 | float * lin_wg; 9 | 10 | float * l1_w; 11 | float * l1_wg; 12 | 13 | float * l2_w; 14 | float * l2_wg; 15 | 16 | float * l3_w; 17 | float * l3_wg; 18 | 19 | float eta; 20 | float lambda; 21 | 22 | uint32_t n_indices, n_index_bits, index_mask; 23 | public: 24 | nn_model(uint32_t n_indices, uint32_t n_index_bits, int seed, float eta, float lambda); 25 | virtual ~nn_model(); 26 | 27 | virtual float predict(const batch_learn::feature * start, const batch_learn::feature * end, float norm, bool train); 28 | virtual void update(const batch_learn::feature * start, const batch_learn::feature * end, float norm, float kappa); 29 | }; 30 | -------------------------------------------------------------------------------- /src/util/common.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | T min(T a, T b) { 5 | return a < b ? a : b; 6 | } 7 | -------------------------------------------------------------------------------- /src/util/dataset.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common.hpp" 4 | 5 | #include 6 | 7 | 8 | class batch_learn_dataset { 9 | public: 10 | batch_learn::file_index index; 11 | std::string data_file_name; 12 | 13 | batch_learn_dataset(const std::string & file_name) { 14 | std::cout << "Loading " << file_name << ".index... "; 15 | std::cout.flush(); 16 | 17 | index = batch_learn::read_index(file_name + ".index"); 18 | data_file_name = file_name + ".data"; 19 | 20 | std::cout << index.n_examples << " examples" << std::endl; 21 | } 22 | 23 | std::vector> generate_batches(uint64_t batch_size) const { 24 | std::vector> batches; 25 | 26 | for (uint64_t batch_start = 0; batch_start < index.n_examples; batch_start += batch_size) 27 | batches.push_back(std::make_pair(batch_start, min(batch_start + batch_size, index.n_examples))); 28 | 29 | return batches; 30 | } 31 | }; 32 | -------------------------------------------------------------------------------- /src/util/model.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | // Define intrinsic missing in gcc 8 | #define _mm256_set_m128(v0, v1) _mm256_insertf128_ps(_mm256_castps128_ps256(v1), (v0), 1) 9 | 10 | 11 | 12 | constexpr uint32_t align_bytes = 32; 13 | constexpr uint32_t align_floats = align_bytes / sizeof(float); 14 | 15 | 16 | inline float sum(__m256 val) { 17 | __m128 s = _mm256_extractf128_ps(_mm256_add_ps(val, _mm256_permute2f128_ps(val, val, 1)), 0); 18 | 19 | s = _mm_hadd_ps(s, s); 20 | s = _mm_hadd_ps(s, s); 21 | 22 | float sum; 23 | _mm_store_ss(&sum, s); 24 | 25 | return sum; 26 | } 27 | 28 | constexpr uint aligned_float_array_size(uint cnt) { 29 | return ((cnt - 1) / align_floats + 1) * align_floats; 30 | } 31 | 32 | 33 | template 34 | inline T * malloc_aligned(size_t size) { 35 | void *ptr; 36 | 37 | int status = posix_memalign(&ptr, align_bytes, size*sizeof(T)); 38 | 39 | if(status != 0) 40 | throw std::bad_alloc(); 41 | 42 | return (T*) ptr; 43 | } 44 | 45 | 46 | template 47 | inline void fill_with_zero(T * weights, size_t n) { 48 | T * w = weights; 49 | 50 | for(size_t i = 0; i < n; i++) 51 | *w++ = T(0); 52 | } 53 | 54 | 55 | template 56 | static void fill_with_rand(float * weights, uint64_t n, D gen, std::default_random_engine & rnd) { 57 | float * w = weights; 58 | 59 | for(uint64_t i = 0; i < n; i++) { 60 | *w++ = gen(rnd); 61 | } 62 | } 63 | 64 | 65 | template 66 | inline void fill_with_ones(T * weights, size_t n) { 67 | T * w = weights; 68 | 69 | for(size_t i = 0; i < n; i++) 70 | *w++ = T(1); 71 | } 72 | 73 | 74 | inline uint test_mask_bit(uint64_t * mask, uint i) { 75 | return (mask[i >> 6] >> (i & 63)) & 1; 76 | } 77 | 78 | 79 | template 80 | inline T min(T a, T b) { 81 | return a < b ? a : b; 82 | } 83 | 84 | 85 | template 86 | inline int sgn(T val) { 87 | return (T(0) < val) - (val < T(0)); 88 | } 89 | 90 | 91 | inline float relu(float val) { 92 | return val > 0 ? val : 0; 93 | } 94 | 95 | inline bool isnan(float val) { 96 | return val != val; 97 | } 98 | -------------------------------------------------------------------------------- /src/util/nn.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | inline void backward_pass(uint input_size, float * input, float * input_grad, float * w, float * wg, float grad, float eta, float lambda) { 5 | __m256 ymm_eta = _mm256_set1_ps(eta); 6 | __m256 ymm_lambda = _mm256_set1_ps(lambda); 7 | __m256 ymm_grad = _mm256_set1_ps(grad); 8 | 9 | for (uint i = 0; i < input_size; i += 8) { 10 | __m256 ymm_w = _mm256_load_ps(w + i); 11 | 12 | __m256 ymm_g = ymm_lambda * ymm_w + ymm_grad * _mm256_load_ps(input + i); 13 | __m256 ymm_wg = _mm256_load_ps(wg + i) + ymm_g * ymm_g; 14 | 15 | _mm256_store_ps(input_grad + i, ymm_grad * ymm_w + _mm256_load_ps(input_grad + i)); 16 | 17 | _mm256_store_ps(w + i, ymm_w - ymm_eta * ymm_g * _mm256_rsqrt_ps(ymm_wg)); 18 | _mm256_store_ps(wg + i, ymm_wg); 19 | } 20 | } 21 | 22 | inline float forward_pass(uint input_size, float * input, float * w) { 23 | __m256 ymm_total = _mm256_set1_ps(0); 24 | 25 | for (uint i = 0; i < input_size; i += 8) 26 | ymm_total += _mm256_load_ps(input + i) * _mm256_load_ps(w + i); 27 | 28 | return sum(ymm_total); 29 | } 30 | --------------------------------------------------------------------------------