├── libmysqlhll ├── constants.hpp ├── CMakeLists.txt ├── SerializedHyperLogLog.hpp ├── base64 │ └── base64.h └── mysqlhll.cxx ├── .gitmodules ├── CMakeLists.txt ├── .gitignore ├── sql ├── generate_data.rb ├── udf.sql └── example.sql └── README.md /libmysqlhll/constants.hpp: -------------------------------------------------------------------------------- 1 | #define HLL_BIT_WIDTH 10 2 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libmysqlhll/cpp-hyperloglog"] 2 | path = libmysqlhll/cpp-hyperloglog 3 | url = https://github.com/amirtuval/cpp-HyperLogLog.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.8) 2 | project (MYSQLHLL) 3 | 4 | if (NOT CMAKE_BUILD_TYPE) 5 | message(STATUS "No build type selected, default to Release") 6 | set(CMAKE_BUILD_TYPE "Release") 7 | endif() 8 | 9 | add_subdirectory (libmysqlhll) 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | CMakeCache.txt 2 | CMakeFiles 3 | Makefile 4 | cmake_install.cmake 5 | install_manifest.txt 6 | 7 | 8 | *.so 9 | *.*~ 10 | libmysqlhll-test 11 | *vcxproj* 12 | *.sln 13 | *.suo 14 | *.sdf 15 | *.opensdf 16 | Win32/ 17 | libmysqlhll/mysqlhll.dir/ 18 | Debug/ 19 | Release/ 20 | x64/ -------------------------------------------------------------------------------- /libmysqlhll/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | IF (WIN32) 2 | set(MYSQL_INCLUDE_DIR "C:\\Program Files\\MySQL\\MySQL Server 5.6\\include\\" CACHE STRING "MySQL include directory") 3 | ELSE() 4 | set(MYSQL_INCLUDE_DIR "/usr/include/mysql" CACHE 
require 'csv'

# Generates data.csv, the sample input loaded by sql/example.sql.
# Each row is: url, user_id, visit_time, visit_length_in_minutes.

URLS = ['http://www.google.com', 'http://www.cnn.com', 'http://www.yahoo.com', 'http://www.abc.com', 'http://www.wikipedia.org']
ROW_COUNT = 50000
USER_COUNT = 10000
HOURS = 60*60        # seconds in one hour
DAYS = 24 * HOURS    # seconds in one day

CSV.open('data.csv', 'w') do |csv|
  ROW_COUNT.times do
    # rand(range) draws a uniform integer from the inclusive range directly,
    # instead of materialising the whole range as an array for every row.
    csv << [
      URLS.sample,
      rand(1..USER_COUNT),
      Time.now - rand(0..7) * DAYS - rand(0..12) * HOURS,
      rand(0..60)
    ]
  end
end
!defined(SERIALIZED_HYPERLOGLOG_HPP) 2 | #define SERIALIZED_HYPERLOGLOG_HPP 3 | 4 | #include 5 | #include 6 | 7 | #include "hyperloglog.hpp" 8 | #include "base64.h" 9 | 10 | class SerializedHyperLogLog : public hll::HyperLogLog { 11 | 12 | public: 13 | SerializedHyperLogLog(uint8_t b, bool legacyMode=true) : HyperLogLog(b, legacyMode) { 14 | } 15 | 16 | char* toString(char* result) { 17 | sprintf(result, "%d|%d|bin_", (legacyMode_ ? 1 : 0), b_); 18 | memcpy(&result[strlen(result)], &M_[0], M_.size()); 19 | return result; 20 | } 21 | 22 | int stringLength() { 23 | return 10 + M_.size(); 24 | } 25 | 26 | static SerializedHyperLogLog* fromString(const char* encoded) { 27 | if (encoded == NULL) return NULL; 28 | const char* firstSep = (const char*)memchr(encoded, '|', 4); 29 | if (firstSep == NULL) return NULL; 30 | 31 | int m; 32 | const char* data; 33 | bool legacyMode; 34 | 35 | const char* secondSep = (const char*) memchr(&firstSep[1], '|', 4); 36 | if (secondSep == NULL) { // check if string has 2 '|' 37 | sscanf(encoded, "%d|", &m); 38 | data = &firstSep[1]; 39 | legacyMode = true; 40 | } else { 41 | int legacyModeInt; 42 | sscanf(encoded, "%d|%d|", &legacyModeInt, &m); 43 | legacyMode = legacyModeInt == 0 ? 
false : true; 44 | data = &secondSep[1]; 45 | } 46 | 47 | SerializedHyperLogLog* result = new SerializedHyperLogLog(m, legacyMode); 48 | 49 | const unsigned char* decoded; 50 | bool freeDecoded; 51 | if (strncmp(data, "bin_", 4) == 0) { 52 | decoded = (const unsigned char*)&data[4]; 53 | freeDecoded = false; 54 | } else { 55 | size_t outputLength; 56 | decoded = base64_decode(data, strlen(data), &outputLength); 57 | freeDecoded = true; 58 | } 59 | 60 | if (decoded == NULL) return NULL; 61 | 62 | memcpy(&result->M_[0], decoded, result->M_.size()); 63 | 64 | if (freeDecoded) 65 | free((void*)decoded); 66 | 67 | return result; 68 | } 69 | 70 | }; 71 | 72 | #endif 73 | -------------------------------------------------------------------------------- /sql/example.sql: -------------------------------------------------------------------------------- 1 | # This is a SQL script that demonstrates the various features of the hyperloglog functions 2 | 3 | drop database if exists hll_test; 4 | 5 | create database hll_test; 6 | 7 | use hll_test; 8 | 9 | # create raw data table 10 | drop table if exists user_visits; 11 | create table user_visits ( 12 | id integer primary key auto_increment, 13 | url varchar(255), 14 | user_id integer, 15 | visit_time datetime, 16 | visit_length_in_minutes integer 17 | ); 18 | 19 | # you must enable loading local files into mysql. 
See http://dev.mysql.com/doc/refman/5.1/en/load-data-local.html 20 | load data local infile 'data.csv' into table user_visits columns terminated by ',' (url, user_id, visit_time, visit_length_in_minutes); 21 | 22 | # get accurate and estimated counts for google visits per day 23 | select date(visit_time), count(distinct user_id) accurate_user_count, hll_compute(user_id) estimated_user_count 24 | from user_visits 25 | where url like '%google%' 26 | group by date(visit_time); 27 | 28 | # get accurate and estimated counts for the last 3 days per url 29 | select url, count(distinct user_id) accurate_user_count, hll_compute(user_id) estimated_user_count 30 | from user_visits 31 | where visit_time >= date_sub('2014-06-16',INTERVAL 3 DAY) 32 | group by url; 33 | 34 | 35 | # create aggregated table 36 | drop table if exists daily_user_visits; 37 | create table daily_user_visits ( 38 | day date, 39 | url varchar(255), 40 | user_hll varchar(5468), 41 | visit_length_in_minutes integer, 42 | unique(day, url) 43 | ); 44 | 45 | replace into daily_user_visits(day, url, user_hll, visit_length_in_minutes) 46 | select date(visit_time), url, hll_create(user_id), sum(visit_length_in_minutes) 47 | from user_visits 48 | group by date(visit_time), url; 49 | 50 | 51 | # get estimated counts for google visits per day 52 | select day, hll_merge_compute(user_hll) estimated_user_count 53 | from daily_user_visits 54 | where url like '%google%' 55 | group by day; 56 | 57 | # get accurate and estimated counts for the last 3 days per url 58 | select url, hll_merge_compute(user_hll) estimated_user_count 59 | from daily_user_visits 60 | where day >= date_sub('2014-06-16',INTERVAL 3 DAY) 61 | group by url; 62 | 63 | 64 | # build aggregation incrementally 65 | 66 | truncate table daily_user_visits; 67 | 68 | # first insert, no unique violation 69 | insert into daily_user_visits(day, url, user_hll, visit_length_in_minutes) 70 | select date(visit_time), url, hll_create(user_id), 
/* The 64 characters of the base64 alphabet, indexed by 6-bit value. */
static char encoding_table[] = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
                                'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
                                'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                                'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
                                'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
                                'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
                                'w', 'x', 'y', 'z', '0', '1', '2', '3',
                                '4', '5', '6', '7', '8', '9', '+', '/'};
/* Lazily built reverse lookup (byte value -> 6-bit value); NULL until built. */
static char *decoding_table = NULL;
/* Number of '=' padding characters, indexed by input_length % 3. */
static int mod_table[] = {0, 2, 1};

/* Marker stored in decoding_table for bytes outside the base64 alphabet
   (valid entries are 0..63). */
enum { BASE64_INVALID_SEXTET = 64 };


/*
 * Encodes input_length bytes of data as padded base64.
 *
 * Returns a malloc'ed, NUL-terminated string (caller frees) and stores its
 * length, excluding the terminator, in *output_length. Returns NULL when the
 * allocation fails.
 */
char *base64_encode(const unsigned char *data,
                    size_t input_length,
                    size_t *output_length) {

    *output_length = 4 * ((input_length + 2) / 3);

    char *encoded_data = (char*)malloc(*output_length + 1);
    if (encoded_data == NULL) return NULL;

    size_t in = 0;
    size_t out = 0;
    while (in < input_length) {

        /* Gather up to three input bytes; missing trailing bytes count as 0. */
        uint32_t octet_a = data[in++];
        uint32_t octet_b = in < input_length ? data[in++] : 0;
        uint32_t octet_c = in < input_length ? data[in++] : 0;

        uint32_t triple = (octet_a << 0x10) + (octet_b << 0x08) + octet_c;

        encoded_data[out++] = encoding_table[(triple >> 3 * 6) & 0x3F];
        encoded_data[out++] = encoding_table[(triple >> 2 * 6) & 0x3F];
        encoded_data[out++] = encoding_table[(triple >> 1 * 6) & 0x3F];
        encoded_data[out++] = encoding_table[(triple >> 0 * 6) & 0x3F];
    }

    /* Overwrite the characters produced from the missing bytes with '='. */
    for (int pad = 0; pad < mod_table[input_length % 3]; pad++)
        encoded_data[*output_length - 1 - pad] = '=';

    encoded_data[*output_length] = '\0';
    return encoded_data;
}

/*
 * Builds decoding_table if it does not exist yet. Every one of the 256
 * entries is initialised, so bytes outside the alphabet can be detected.
 * On allocation failure decoding_table stays NULL.
 */
void build_decoding_table() {

    if (decoding_table != NULL) return;   /* already built: do not leak it */

    char *table = (char*)malloc(256);
    if (table == NULL) return;

    for (int i = 0; i < 256; i++)
        table[i] = BASE64_INVALID_SEXTET;

    for (int i = 0; i < 64; i++)
        table[(unsigned char) encoding_table[i]] = (char)i;

    decoding_table = table;
}

/*
 * Decodes input_length characters of padded base64.
 *
 * Returns a malloc'ed buffer (caller frees) and stores the number of decoded
 * bytes in *output_length. Empty input yields a valid buffer with
 * *output_length == 0. Returns NULL when input_length is not a multiple of 4,
 * when the input contains a byte that is neither in the base64 alphabet nor
 * '=', or when an allocation fails.
 */
unsigned char *base64_decode(const char *data,
                             size_t input_length,
                             size_t *output_length) {

    if (decoding_table == NULL) build_decoding_table();
    if (decoding_table == NULL) return NULL;

    if (input_length % 4 != 0) return NULL;

    *output_length = input_length / 4 * 3;
    if (input_length > 0) {
        /* Only look at the trailing characters when there are any. */
        if (data[input_length - 1] == '=') (*output_length)--;
        if (data[input_length - 2] == '=') (*output_length)--;
    }

    /* Never request zero bytes: malloc(0) may return NULL, which callers
       would mistake for an error. */
    unsigned char *decoded_data =
        (unsigned char*)malloc(*output_length > 0 ? *output_length : 1);
    if (decoded_data == NULL) return NULL;

    size_t in = 0;
    size_t out = 0;
    while (in < input_length) {

        uint32_t triple = 0;
        for (int k = 0; k < 4; k++) {
            /* Index by unsigned value so bytes >= 0x80 stay inside the table. */
            const unsigned char c = (unsigned char)data[in++];
            uint32_t sextet = 0;   /* '=' contributes zero bits */
            if (c != '=') {
                const char value = decoding_table[c];
                if (value == BASE64_INVALID_SEXTET) {
                    free(decoded_data);
                    return NULL;
                }
                sextet = (uint32_t)value;
            }
            triple = (triple << 6) + sextet;
        }

        if (out < *output_length) decoded_data[out++] = (triple >> 2 * 8) & 0xFF;
        if (out < *output_length) decoded_data[out++] = (triple >> 1 * 8) & 0xFF;
        if (out < *output_length) decoded_data[out++] = (triple >> 0 * 8) & 0xFF;
    }

    return decoded_data;
}

/*
 * Releases decoding_table. Safe to call repeatedly; the table is rebuilt on
 * the next base64_decode() call.
 */
void base64_cleanup() {
    free(decoding_table);
    decoding_table = NULL;
}
18 | 19 | Usage 20 | ===== 21 | 22 | The plugin includes 4 aggregate functions: 23 | 24 | HLL_CREATE - Given a list of values, this function will return the HLL string computed from these values. 25 | HLL_COMPUTE - Given a list of values, this function will return an integer representing the estimated distinct count of these values. 26 | HLL_MERGE - Given a list of HLL strings, this function will return the HLL string that is the combination of all the hll strings. 27 | HLL_MERGE_COMPUTE - Given a list of HLL strings, this function will return an integer representing the estimated distinct count of these values. 28 | 29 | HyperLogLog stores its data as a byte vector. An HLL string is the base64 representation of the byte vector. 30 | In its default implementation, the plugin uses hyperloglog with a 10 bit width, resulting in a 1024 bytes vector. Base64 string of that is ~1400 chars long. 31 | 32 | [Here's a more detailed example](sql/example.sql). 33 | 34 | **NOTE:** Each of the above functions can accept any number of arguments. Each value will be treated as an additional value added. 35 | So you can issue the following statement, for example: 36 | 37 | ```sql 38 | mysql> select hll_compute(1,2,4,3,4,2,1); 39 | +----------------------------+ 40 | | hll_compute(1,2,4,3,4,2,1) | 41 | +----------------------------+ 42 | | 4 | 43 | +----------------------------+ 44 | 1 row in set (0.01 sec) 45 | ``` 46 | 47 | Installation 48 | ============ 49 | 50 | Compilation 51 | ----------- 52 | 53 | The project uses [CMake](http://www.cmake.org/) as a build tool, and the code is platform independent, so it should compile fine on most platforms (linux, windows, mac). 54 | There might be some issues around compiling cmake that are beyond the scope of this guide to resolve. Please consult the documentation/relevant forums for it. For example, CMake has some issues with compiling 64 bit projects on windows. 
55 | 56 | **Linux** 57 | 58 | Tested on ubuntu precise (12.04) 64 bit, but should work pretty much the same on most linux distros. 59 | 60 | **Prerequisites:** Make sure you have the cmake, build-essential and libmysqlclient-dev packages installed. 61 | 62 | Run the following from the project's root directory 63 | 64 | ```bash 65 | git submodule update --init 66 | cmake . 67 | make 68 | ``` 69 | 70 | **NOTE:** You may need to tell cmake where to find mysql header files. The default is "/usr/include/mysql", but if they are located in another directory on your machine, add "-DMYSQL_INCLUDE_DIR={DIR}" to the cmake command line. 71 | 72 | After a successful compilation, you will have the libmysqlhll.so binary under the libmysqlhll/ directory. 73 | 74 | **Windows** 75 | 76 | Tested on Windows 7 with Visual Studio 2010 Professional, but should work pretty much the same on most windows machines. 77 | 78 | **Prerequisites**: Make sure you have CMake installed. 79 | 80 | Run the following from the project's root directory, in a Visual Studio command prompt (open either 32 or 64 bit command prompt, depending on your chosen target CPU): 81 | 82 | ``` 83 | git submodule update --init 84 | cmake . 85 | msbuild mysqlhll.sln /p:Configuration=Release 86 | ``` 87 | 88 | **NOTE:** You may need to tell cmake where to find mysql header files. The default is "C:\Program Files\MySQL\MySQL Server 5.6\include", but if they are located in another directory on your machine, add "-DMYSQL_INCLUDE_DIR={DIR}" to the cmake command line. 89 | 90 | After that, mysqlhll.dll file will be located under libmysqlhll\Release. 91 | 92 | 93 | MySQL Installation 94 | ------------------ 95 | 96 | Once the binary is compiled (.so on linux, .dll on windows), you have to copy it to the MySQL plugins dir. 97 | If you are not sure where that is, you can check by running this command 98 | 99 | `show variables like '%plugin%';` 100 | 101 | in mysql client (On linux, it is usually /usr/lib/mysql/plugin/). 
102 | 103 | After that, log in to mysql as root, and run [the functions installation script](sql/udf.sql). Replace .so with .dll if you are on windows. 104 | 105 | 106 | Customization 107 | ============= 108 | 109 | If you would like to change the bit width of the HyperLogLog algorithm, you can change it by editing the [constants.hpp file](libmysqlhll/constants.hpp) before compiling the project. 110 | The default is 10, resulting in 2**10 (1024) bytes of storage. This can go as high as 16 to get better accuracy with more storage, or you can lower it to save storage. 111 | -------------------------------------------------------------------------------- /libmysqlhll/mysqlhll.cxx: -------------------------------------------------------------------------------- 1 | #ifdef WIN32 2 | #include 3 | typedef signed char int8_t; 4 | #define EXPORT __declspec(dllexport) 5 | #else 6 | #define EXPORT 7 | #endif 8 | 9 | #include 10 | 11 | #include "constants.hpp" 12 | #include "SerializedHyperLogLog.hpp" 13 | 14 | #define HLL_LEGACY_BIT_WIDTH 12 15 | 16 | extern "C" { 17 | 18 | #ifndef NDEBUG 19 | #define LOG(...) fprintf(stderr, __VA_ARGS__); 20 | #else 21 | #define LOG(...) 
22 | #endif 23 | 24 | my_bool EXPORT hll_create_init(UDF_INIT *initid, UDF_ARGS *args, char *message); 25 | void EXPORT hll_create_deinit(UDF_INIT *initid); 26 | char EXPORT *hll_create(UDF_INIT *initid, UDF_ARGS *args, char *result, 27 | unsigned long *length, char *is_null, char *error); 28 | void EXPORT hll_create_clear(UDF_INIT* initid, char* is_null, char* message); 29 | void EXPORT hll_create_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message); 30 | 31 | my_bool EXPORT hll_create_legacy_init(UDF_INIT *initid, UDF_ARGS *args, char *message); 32 | void EXPORT hll_create_legacy_deinit(UDF_INIT *initid); 33 | char EXPORT *hll_create_legacy(UDF_INIT *initid, UDF_ARGS *args, char *result, 34 | unsigned long *length, char *is_null, char *error); 35 | void EXPORT hll_create_legacy_clear(UDF_INIT* initid, char* is_null, char* message); 36 | void EXPORT hll_create_legacy_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message); 37 | 38 | my_bool EXPORT hll_compute_init(UDF_INIT *initid, UDF_ARGS *args, char *message); 39 | void EXPORT hll_compute_deinit(UDF_INIT *initid); 40 | long long EXPORT hll_compute(UDF_INIT *initid, UDF_ARGS *args, char *result, 41 | unsigned long *length, char *is_null, char *error); 42 | void EXPORT hll_compute_clear(UDF_INIT* initid, char* is_null, char* message); 43 | void EXPORT hll_compute_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message); 44 | 45 | my_bool EXPORT hll_merge_init(UDF_INIT *initid, UDF_ARGS *args, char *message); 46 | void EXPORT hll_merge_deinit(UDF_INIT *initid); 47 | char EXPORT *hll_merge(UDF_INIT *initid, UDF_ARGS *args, char *result, 48 | unsigned long *length, char *is_null, char *error); 49 | void EXPORT hll_merge_clear(UDF_INIT* initid, char* is_null, char* message); 50 | void EXPORT hll_merge_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message); 51 | 52 | my_bool EXPORT hll_merge_compute_init(UDF_INIT *initid, UDF_ARGS *args, char *message); 53 | void 
EXPORT hll_merge_compute_deinit(UDF_INIT *initid); 54 | long long EXPORT hll_merge_compute(UDF_INIT *initid, UDF_ARGS *args, char *result, 55 | unsigned long *length, char *is_null, char *error); 56 | void EXPORT hll_merge_compute_clear(UDF_INIT* initid, char* is_null, char* message); 57 | void EXPORT hll_merge_compute_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message); 58 | 59 | class Data { 60 | public: 61 | SerializedHyperLogLog* shll; 62 | char* result; 63 | 64 | Data(bool need_result, SerializedHyperLogLog* hll) { 65 | init(need_result, hll); 66 | } 67 | 68 | Data(bool need_result, int bitWidth, bool legacyMode) { 69 | init(need_result, new SerializedHyperLogLog(bitWidth, legacyMode)); 70 | } 71 | 72 | ~Data() { 73 | if (shll != NULL) 74 | delete shll; 75 | if (result != NULL) { 76 | free(result); 77 | } 78 | } 79 | 80 | private: 81 | void init(bool need_result, SerializedHyperLogLog* hll) { 82 | shll = hll; 83 | 84 | if (need_result) { 85 | result = (char*)malloc(10000); 86 | } else { 87 | result = NULL; 88 | } 89 | } 90 | }; 91 | 92 | my_bool init(UDF_INIT *initid, UDF_ARGS *args, char *message, bool need_result, int bitWidth, bool legacyMode, const char* function_name) { 93 | if (args->arg_count == 0) { 94 | sprintf(message, "Wrong arguments to %s(); Must have at least 1 argument", function_name); 95 | return 1; 96 | } 97 | 98 | for(int i = 0; i < args->arg_count; ++i) { 99 | args->arg_type[i] = STRING_RESULT; 100 | } 101 | 102 | initid->ptr = (char*)new Data(need_result, bitWidth, legacyMode); 103 | return 0; 104 | } 105 | 106 | my_bool EXPORT hll_create_init(UDF_INIT *initid, UDF_ARGS *args, char *message) { 107 | return init(initid, args, message, true, HLL_BIT_WIDTH, false, "HLL_CREATE"); 108 | } 109 | 110 | my_bool EXPORT hll_create_legacy_init(UDF_INIT *initid, UDF_ARGS *args, char *message) { 111 | return init(initid, args, message, true, HLL_LEGACY_BIT_WIDTH, true, "HLL_CREATE_LEGACY"); 112 | } 113 | 114 | Data* data(UDF_INIT 
*initid) { 115 | return (Data*)initid->ptr; 116 | } 117 | 118 | SerializedHyperLogLog* shll(UDF_INIT *initid) { 119 | return data(initid)->shll; 120 | } 121 | 122 | void EXPORT hll_create_deinit(UDF_INIT *initid) { 123 | delete data(initid); 124 | } 125 | 126 | void EXPORT hll_create_legacy_deinit(UDF_INIT *initid) { 127 | hll_create_deinit(initid); 128 | } 129 | 130 | char EXPORT *hll_create(UDF_INIT *initid, UDF_ARGS *args, char *result, 131 | unsigned long *length, char *is_null, char *error) { 132 | 133 | char* hll_result = data(initid)->result; 134 | if (shll(initid) == NULL) { 135 | hll_result[0] = '\0'; 136 | *length = 0; 137 | } else { 138 | shll(initid)->toString(hll_result); 139 | *length = shll(initid)->stringLength(); 140 | } 141 | 142 | return hll_result; 143 | } 144 | 145 | char EXPORT *hll_create_legacy(UDF_INIT *initid, UDF_ARGS *args, char *result, 146 | unsigned long *length, char *is_null, char *error) { 147 | return hll_create(initid, args, result, length, is_null, error); 148 | } 149 | 150 | void EXPORT hll_create_clear(UDF_INIT* initid, char* is_null, char* message) { 151 | if (shll(initid) != NULL) 152 | shll(initid)->clear(); 153 | } 154 | 155 | void EXPORT hll_create_legacy_clear(UDF_INIT* initid, char* is_null, char* message) { 156 | hll_create_clear(initid, is_null, message); 157 | } 158 | 159 | void get_value_and_length(UDF_ARGS* args, int i, const char** value, uint32_t* length) { 160 | *value = (args->args[i] == NULL ? "" : args->args[i]); 161 | *length = (args->args[i] == NULL ? 
0 : args->lengths[i]); 162 | } 163 | 164 | void EXPORT hll_create_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message) { 165 | for(int i = 0; i < args->arg_count; ++i) { 166 | const char* value; 167 | uint32_t length; 168 | get_value_and_length(args, i, &value, &length); 169 | shll(initid)->add(value, length); 170 | } 171 | } 172 | 173 | void EXPORT hll_create_legacy_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message) { 174 | hll_create_add(initid, args, is_null, message); 175 | } 176 | 177 | my_bool EXPORT hll_compute_init(UDF_INIT *initid, UDF_ARGS *args, char *message) { 178 | return init(initid, args, message, false, HLL_BIT_WIDTH, false, "HLL_COMPUTE"); 179 | } 180 | 181 | void EXPORT hll_compute_deinit(UDF_INIT *initid) { 182 | return hll_create_deinit(initid); 183 | } 184 | 185 | long long EXPORT hll_compute(UDF_INIT *initid, UDF_ARGS *args, char *result, 186 | unsigned long *length, char *is_null, char *error) { 187 | return shll(initid)->estimate(); 188 | } 189 | 190 | void EXPORT hll_compute_clear(UDF_INIT* initid, char* is_null, char* message) { 191 | hll_create_clear(initid, is_null, message); 192 | } 193 | 194 | void EXPORT hll_compute_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message) { 195 | hll_create_add(initid, args, is_null, message); 196 | } 197 | 198 | my_bool merge_init(UDF_INIT *initid, UDF_ARGS *args, char *message, bool need_result, const char* function_name) { 199 | if (args->arg_count == 0) { 200 | sprintf(message,"Wrong arguments to %s(); Must have at least 1 argument", function_name); 201 | return 1; 202 | } 203 | 204 | for(int i = 0; i < args->arg_count; ++i) { 205 | if (args->arg_type[i] != STRING_RESULT) { 206 | sprintf(message,"Wrong arguments to %s(); All arguments must be of type string", function_name); 207 | return 1; 208 | } 209 | } 210 | 211 | initid->ptr = (char*)new Data(need_result, NULL); 212 | return 0; 213 | } 214 | 215 | my_bool EXPORT hll_merge_init(UDF_INIT *initid, 
UDF_ARGS *args, char *message) { 216 | return merge_init(initid, args, message, true, "HLL_MERGE"); 217 | } 218 | 219 | void EXPORT hll_merge_deinit(UDF_INIT *initid) { 220 | delete data(initid); 221 | } 222 | 223 | char EXPORT *hll_merge(UDF_INIT *initid, UDF_ARGS *args, char *result, 224 | unsigned long *length, char *is_null, char *error) { 225 | return hll_create(initid, args, result, length, is_null, error); 226 | } 227 | 228 | void EXPORT hll_merge_clear(UDF_INIT* initid, char* is_null, char* message) { 229 | if (shll(initid) != NULL) 230 | shll(initid)->clear(); 231 | } 232 | 233 | void EXPORT hll_merge_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message) { 234 | for(int i = 0; i < args->arg_count; ++i) { 235 | uint32_t length; 236 | const char* arg; 237 | get_value_and_length(args, i, &arg, &length); 238 | if (length == 0) continue; // NULL handling 239 | 240 | char* hll_str = (char*)malloc(length + 1); 241 | 242 | memcpy(hll_str, arg, length); 243 | hll_str[length] = '\0'; 244 | 245 | SerializedHyperLogLog* current_shll = SerializedHyperLogLog::fromString(hll_str); 246 | free(hll_str); 247 | 248 | if (current_shll != NULL) { 249 | if (shll(initid) != NULL) { 250 | shll(initid)->merge(*current_shll); 251 | delete current_shll; 252 | } else { 253 | data(initid)-> shll = current_shll; 254 | } 255 | } 256 | } 257 | } 258 | 259 | my_bool EXPORT hll_merge_compute_init(UDF_INIT *initid, UDF_ARGS *args, char *message) { 260 | return merge_init(initid, args, message, false, "HLL_MERGE_COMPUTE"); 261 | } 262 | 263 | void EXPORT hll_merge_compute_deinit(UDF_INIT *initid) { 264 | hll_merge_deinit(initid); 265 | } 266 | 267 | long long EXPORT hll_merge_compute(UDF_INIT *initid, UDF_ARGS *args, char *result, 268 | unsigned long *length, char *is_null, char *error) { 269 | if (shll(initid) == NULL) return 0; 270 | return shll(initid)->estimate(); 271 | } 272 | 273 | void EXPORT hll_merge_compute_clear(UDF_INIT* initid, char* is_null, char* message) { 274 | 
hll_merge_clear(initid, is_null, message); 275 | } 276 | 277 | void EXPORT hll_merge_compute_add(UDF_INIT* initid, UDF_ARGS* args, char* is_null, char* message) { 278 | hll_merge_add(initid, args, is_null, message); 279 | } 280 | 281 | } 282 | --------------------------------------------------------------------------------