├── .gitignore ├── LICENSE.txt ├── OWNERS ├── README.md ├── es-5.4.3 ├── build.gradle ├── settings.gradle └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── elasticsearch │ │ │ └── plugin │ │ │ └── search │ │ │ └── hyperloglogplusplugin │ │ │ ├── HyperLogLogPlusAggregationPlugin.java │ │ │ ├── HyperUniqueSumAggregationBuilder.java │ │ │ ├── HyperUniqueSumAggregator.java │ │ │ ├── HyperUniqueSumAggregatorFactory.java │ │ │ └── InternalHyperUniqueSum.java │ └── plugin-metadata │ │ └── plugin-security.policy │ └── test │ └── java │ └── org │ └── elasticsearch │ └── plugin │ └── search │ └── hyperloglogplusplugin │ └── HyperLogLogPlusAggregationPluginTests.java ├── es-5.5.0 ├── build.gradle ├── settings.gradle └── src │ ├── main │ ├── java │ │ └── org │ │ │ └── elasticsearch │ │ │ └── plugin │ │ │ └── search │ │ │ └── hyperloglogplusplugin │ │ │ ├── HyperLogLogPlusAggregationPlugin.java │ │ │ ├── HyperUniqueSumAggregationBuilder.java │ │ │ ├── HyperUniqueSumAggregator.java │ │ │ ├── HyperUniqueSumAggregatorFactory.java │ │ │ └── InternalHyperUniqueSum.java │ └── plugin-metadata │ │ └── plugin-security.policy │ └── test │ └── java │ └── org │ └── elasticsearch │ └── plugin │ └── search │ └── hyperloglogplusplugin │ └── HyperLogLogPlusAggregationPluginTests.java └── spark ├── README.md └── hyperloglogUDAF.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .gradle/ 3 | build/ 4 | out/ 5 | *.swp 6 | *-execution-hints.log 7 | *-execution-times.log 8 | 9 | // ignore the downloaded model files in git 10 | src/test/resources/models/ 11 | 12 | // intellij 13 | *.iml 14 | *.ipr 15 | *.iws 16 | 17 | // eclipse 18 | .project 19 | .classpath 20 | eclipse-build 21 | */.project 22 | */.classpath 23 | */eclipse-build 24 | .settings 25 | !/.settings/org.eclipse.core.resources.prefs 26 | !/.settings/org.eclipse.jdt.core.prefs 27 | !/.settings/org.eclipse.jdt.ui.prefs 28 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /OWNERS: --------------------------------------------------------------------------------
polloi-dev@bazaarvoice.com
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

# Elasticsearch HyperLogLogPlus Aggregator

A simple aggregation that merges HyperLogLogPlus serialized objects which have been saved as a binary field in Elasticsearch.

## Background

The [HyperLogLogPlus](https://en.wikipedia.org/wiki/HyperLogLog) data structure allows us to compute the cardinality of a multiset with a small trade-off in accuracy.
One of the interesting properties of HyperLogLog is that several HyperLogLog structures can be merged to compute the cardinality of the combined multiset.

This plugin uses the HyperLogLogPlus implementation from [stream-lib](https://github.com/addthis/stream-lib). We first considered Elasticsearch's own HLL implementation, but decided
to go with stream-lib because the ES implementation may change between releases and direct use of its HyperLogLogPlus class is not supported.

### Sample Use Case

#### Unique audience count

Given a data set of social media posts with a few billion visits and 10 million posts (with hashtags), we must count the distinct number of users who have seen a given combination of
hashtags. One approach would be to index each user as a document, with posts and their hashtags as properties of the document. This demands a huge index, and the resulting view is very "user centric".

Instead, the approach used here is to index each post as a document with a HyperLogLog binary field representing the unique visitors of that post. Then, using the
`hyperlogsum` aggregation provided by this plugin, unique visitors can be computed across any post dimension (such as hashtags here).

The HyperLogLog binary field to index can be computed in one of the following ways (a sketch follows this list):

1 - As demonstrated in the [integration tests](./es-5.4.3/src/test/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperLogLogPlusAggregationPluginTests.java),
by using stream-lib directly with the same precision the plugin uses internally (`HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION`, `HyperUniqueSumAggregationBuilder.SERIALIZED_SPARSE_PRECISION`).

2 - If you use Spark to index data into Elasticsearch, an example [UDAF](/spark/hyperloglogUDAF.scala) has been provided which internally uses the same precision.

3 - You could also build a custom UDAF for your stack, modeled on the Spark version (feel free to contribute back!).
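A minimal sketch of option 1, assuming only stream-lib on the classpath. The class name `HllFieldExample` and the sample visitor IDs are invented for illustration; the precisions and the Java-serialization format mirror the plugin's own test helper and `HyperUniqueSumAggregator.serializeHyperLogLogPlus`. It also demonstrates the merge property described in Background:

```java
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.Base64;

import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;

public class HllFieldExample {

    // Must match the plugin's expected precisions
    // (HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION / SERIALIZED_SPARSE_PRECISION).
    static final int DENSE_PRECISION = 14;
    static final int SPARSE_PRECISION = 25;

    public static void main(String[] args) throws IOException, CardinalityMergeException {
        // One sketch per post: offer each visitor ID.
        HyperLogLogPlus post1 = new HyperLogLogPlus(DENSE_PRECISION, SPARSE_PRECISION);
        HyperLogLogPlus post2 = new HyperLogLogPlus(DENSE_PRECISION, SPARSE_PRECISION);
        for (int user = 0; user < 100; user++) {
            post1.offer("user-" + user);
        }
        for (int user = 50; user < 150; user++) {
            post2.offer("user-" + user);   // 50 of these users overlap with post1
        }

        // The merge property: roughly 150 distinct users across both posts.
        HyperLogLogPlus merged = (HyperLogLogPlus) post1.merge(post2);
        System.out.println("approximate unique visitors: " + merged.cardinality());

        // Base64-encoded Java serialization, ready for the binary "hll" field.
        System.out.println("hll field value: " + toBase64(post1));
    }

    static String toBase64(HyperLogLogPlus hll) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(hll);          // same format the plugin deserializes
        }
        return Base64.getEncoder().encodeToString(bytes.toByteArray());
    }
}
```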
## Usage

### Elasticsearch Versions

Since this plugin might change with minor versions of ES, the source code is organized into ES-version-specific directories. Currently ES 5.4.3 and ES 5.5.0 are supported.

Please create an issue if a different ES-version-specific release needs to be made.

## Setup

Switch to the ES-version-specific directory:

```bash
cd es-5.4.3
```

To install this plugin, you first need to create a zip distribution by running

```bash
gradle clean assemble
```

This will produce a zip file in `build/distributions`.

After building the zip file, you can install it like this:

```bash
bin/elasticsearch-plugin install file:///path/to/elasticsearch-hyperloglogplus/build/distributions/elasticsearch-hyperloglogplus.zip
```


### Elasticsearch Mappings

Since a binary ES field is used to store the HLL, `doc_values` must be set to `true` in the mapping declaration.

### Indexing Example

The hll field should be mapped with `"type": "binary"` and `"doc_values": true`, for example via an index template as in the example below:

```
curl -XPUT http://localhost:9200/_template/template_1 -d '
{
  "order": 0,
  "template": "*",
  "mappings": {
    "product": {
      "properties": {
        "hll": {
          "type": "binary",
          "doc_values": true
        },
        "desc": {
          "type": "text",
          "fields": {
            "keyword": {
              "type": "keyword"
            }
          }
        }
      }
    }
  },
  "aliases": {}
}'
```

Sample Python code to populate the index:

```python
import requests

data = {
    "desc" : "bar",
    "hll" : "rO0ABXNyAFBjb20uY2xlYXJzcHJpbmcuYW5hbHl0aWNzLnN0cmVhbS5jYXJkaW5hbGl0eS5IeXBlckxvZ0xvZ1BsdXMkU2VyaWFsaXphdGlvbkhvbGRlct2sxediKNllDAAAeHB6AAACVf////4OGQHIAearF8aBOYiDAfAZlpgGrusBwO8M7ij6uSHKkgOA0RDW9Wuyig30qCvu9CTQ3yT62grKsRjumQeE9QG+vQ3c7ATc6TC0OsjSH/CaE6RY+PYD0I8WtvM1tMcBlswJ7vsQ0E/6/wLEvSWW/ibm4Q6YL9KADMyeCLq1JsDWKszxE7CzBIibCsrVCfS3CPCbG9TsA8TpVeaUCejZE9KlH4iPCNSNEczxB/pu+NUPutEC/KgVxpwBzKpI0poTwk6i6CO83APe/waKqRv+tg328CewmxLugwLqwA7GxwimiQWIswuC/i/WsxCo0Rrs2QbAywi2yATqkyz46SLO6wic/gXyGaTcAaiHMezAAvzfCrrwEsyUGbhHnKUDhLgj9IMSvogC7MMIrusBjPMD2L0FkvQPrKcfoJpM2r1K7rEGtP0S2rAr+tJM9oga3q8OivkJrPAF4uQojscWmMw62towntIEoN0DzowQjvAq7NYI4IQJ7sojsq0L5I8B+OoUptgohswMrNEa2Pw5iIMB5uEdurUmioM40M4CgrEj2OIKqIcPmosL7NkGvsMK5LIisOkH0PELuucEirAitKEH7NkG9JcBzPICnrAO8qEh0LkJkKoN0MIQ1pgLwu8XruUm3IQQ5rgVwt0RgJIwzK4DqMEw4IoZkO8KrqUP1uIIns4nyokO2kuorxPUr1OEvweWogL8qyDkpAyyig64shDqoS++zQvu4xbOuhL48grqohHg8QfSlQKO+wPmH6brQqSiCJSiAqSZVujUA9jDArqyC/T1Xng="
}


url = 'http://localhost:9200/newindex/product/a'

for i in range(1, 100):
    requests.post(url + str(i), json=data)
```


### Querying with custom aggregation

```
curl -XPOST 'localhost:9200/_search?pretty=true&size=0' -d '{ "query" : {"match_all" : {}}, "aggs" : { "desc.keyword" : { "terms" : { "field" : "desc.keyword"} , "aggs" : { "uniq" : {"hyperlogsum" : { "field" : "hll" } } } } } }'

{
  "took" : 36,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 99,
    "max_score" : 0.0,
    "hits" : [ ]
  },
  "aggregations" : {
    "desc.keyword" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "bar",
          "doc_count" : 99,
          "uniq" : {
            "value" : 200.0
          }
        }
      ]
    }
  }
}

```

## Bugs & TODO

* Use an encoding format for HLL (like Redis)
* Consider LogLog-Beta
* An HLL mapping type

## Acknowledgement

Thanks to [David Pilato's](https://github.com/dadoonet) Gradle cookiecutter generator for making plugin projects easier.
164 | 165 | -------------------------------------------------------------------------------- /es-5.4.3/build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | mavenLocal() 4 | mavenCentral() 5 | jcenter() 6 | } 7 | 8 | dependencies { 9 | classpath "org.elasticsearch.gradle:build-tools:5.4.3" 10 | } 11 | } 12 | 13 | group = 'org.elasticsearch.plugin.search' 14 | version = '0.0.1-SNAPSHOT' 15 | 16 | apply plugin: 'java' 17 | apply plugin: 'elasticsearch.esplugin' 18 | apply plugin: 'idea' 19 | 20 | esplugin { 21 | name 'elasticsearch-hyperloglogplussum' 22 | description 'Allows aggregation of HyperloglogPlus serialized objects' 23 | classname 'org.elasticsearch.plugin.search.hyperloglogplusplugin.HyperLogLogPlusAggregationPlugin' 24 | 25 | } 26 | 27 | dependencies { 28 | compile 'com.clearspring.analytics:stream:2.9.5' 29 | testCompile 'org.elasticsearch.test:framework:5.4.3' 30 | } 31 | 32 | 33 | checkstyleMain.enabled = true 34 | checkstyleTest.enabled = true 35 | dependencyLicenses.enabled = false 36 | thirdPartyAudit.enabled = false 37 | 38 | 39 | -------------------------------------------------------------------------------- /es-5.4.3/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'elasticsearch-hyperloglogplus' 2 | -------------------------------------------------------------------------------- /es-5.4.3/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperLogLogPlusAggregationPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import org.elasticsearch.plugins.Plugin; 4 | import org.elasticsearch.plugins.SearchPlugin; 5 | 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | 11 | public class HyperLogLogPlusAggregationPlugin extends Plugin implements SearchPlugin { 12 | 13 | @Override 14 | public List getAggregations() { 15 | ArrayList aggregationSpecs = new ArrayList<>(1); 16 | aggregationSpecs.add(new AggregationSpec(HyperUniqueSumAggregationBuilder.NAME, HyperUniqueSumAggregationBuilder::new, HyperUniqueSumAggregationBuilder::parse) 17 | .addResultReader(InternalHyperUniqueSum::new)); 18 | return aggregationSpecs; 19 | } 20 | 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /es-5.4.3/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperUniqueSumAggregationBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import org.elasticsearch.common.io.stream.StreamInput; 4 | import org.elasticsearch.common.io.stream.StreamOutput; 5 | import org.elasticsearch.common.xcontent.ObjectParser; 6 | import org.elasticsearch.common.xcontent.XContentBuilder; 7 | import org.elasticsearch.index.query.QueryParseContext; 8 | import org.elasticsearch.search.aggregations.AggregationBuilder; 9 | import org.elasticsearch.search.aggregations.AggregatorFactories.Builder; 10 | import org.elasticsearch.search.aggregations.AggregatorFactory; 11 | import org.elasticsearch.search.aggregations.support.*; 12 | import org.elasticsearch.search.internal.SearchContext; 13 | 14 | import java.io.IOException; 15 | 16 | public class HyperUniqueSumAggregationBuilder extends ValuesSourceAggregationBuilder { 17 | public static final 
String NAME = "hyperlogsum"; 18 | public static final int SERIALIZED_SPARSE_PRECISION = 25; 19 | public static final int SERIALIZED_DENSE_PRECISION = 14; 20 | 21 | 22 | private static final ObjectParser PARSER; 23 | 24 | static { 25 | PARSER = new ObjectParser<>(HyperUniqueSumAggregationBuilder.NAME); 26 | ValuesSourceParserHelper.declareBytesFields(PARSER, false, false); 27 | } 28 | 29 | public static AggregationBuilder parse(String aggregationName, QueryParseContext context) throws IOException { 30 | return PARSER.parse(context.parser(), new HyperUniqueSumAggregationBuilder(aggregationName), context); 31 | } 32 | 33 | public HyperUniqueSumAggregationBuilder(String name) { 34 | super(name, ValuesSourceType.BYTES, ValueType.STRING); 35 | } 36 | 37 | public HyperUniqueSumAggregationBuilder(StreamInput in) throws IOException { 38 | super(in, ValuesSourceType.BYTES, ValueType.STRING); 39 | } 40 | 41 | @Override 42 | protected void innerWriteTo(StreamOutput streamOutput) throws IOException { 43 | //noop 44 | } 45 | 46 | @Override 47 | protected HyperUniqueSumAggregatorFactory innerBuild(SearchContext context, ValuesSourceConfig config, 48 | AggregatorFactory parent, Builder subFactoriesBuilder) throws IOException { 49 | return new HyperUniqueSumAggregatorFactory(name, config, context, parent, subFactoriesBuilder, metaData); 50 | } 51 | 52 | @Override 53 | public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { 54 | return builder; 55 | } 56 | 57 | @Override 58 | protected int innerHashCode() { 59 | return 0; 60 | } 61 | 62 | @Override 63 | protected boolean innerEquals(Object obj) { 64 | return true; 65 | } 66 | 67 | @Override 68 | public String getType() { 69 | return NAME; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /es-5.4.3/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperUniqueSumAggregator.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; 4 | import org.apache.lucene.index.LeafReaderContext; 5 | import org.apache.lucene.util.BytesRef; 6 | import org.elasticsearch.ElasticsearchGenerationException; 7 | import org.elasticsearch.common.lease.Releasables; 8 | import org.elasticsearch.common.util.BigArrays; 9 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; 10 | 11 | import org.elasticsearch.common.util.ObjectArray; 12 | import org.elasticsearch.index.fielddata.SortedBinaryDocValues; 13 | import org.elasticsearch.search.DocValueFormat; 14 | import org.elasticsearch.search.aggregations.Aggregator; 15 | import org.elasticsearch.search.aggregations.InternalAggregation; 16 | import org.elasticsearch.search.aggregations.LeafBucketCollector; 17 | import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; 18 | import org.elasticsearch.search.aggregations.AggregatorFactories; 19 | import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregator; 20 | import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; 21 | import org.elasticsearch.search.aggregations.support.ValuesSource; 22 | import org.elasticsearch.search.internal.SearchContext; 23 | 24 | import java.io.ByteArrayOutputStream; 25 | import java.io.ByteArrayInputStream; 26 | import java.io.ObjectOutputStream; 27 | import java.io.ObjectInputStream; 28 | import 
java.io.IOException; 29 | import java.io.Serializable; 30 | import java.util.Arrays; 31 | import java.util.List; 32 | import java.util.Map; 33 | 34 | public class HyperUniqueSumAggregator extends NumericMetricsAggregator.SingleValue { 35 | 36 | private final ValuesSource valuesSource; 37 | private final DocValueFormat format; 38 | private ObjectArray hyperLogLogPlusPlusObjectArray; 39 | 40 | HyperUniqueSumAggregator(String name, ValuesSource valuesSource, DocValueFormat formatter, SearchContext context, 41 | Aggregator parent, List pipelineAggregators, Map metaData) throws IOException { 42 | super(name, context, parent, pipelineAggregators, metaData); 43 | this.valuesSource = valuesSource; 44 | this.format = formatter; 45 | if (valuesSource != null) { 46 | hyperLogLogPlusPlusObjectArray = context.bigArrays().newObjectArray(1); 47 | } 48 | } 49 | 50 | public static BytesRef serializeHyperLogLogPlus(Serializable obj) { 51 | ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(512); 52 | ObjectOutputStream out = null; 53 | try { 54 | out = new ObjectOutputStream(byteArrayOutputStream); 55 | out.writeObject(obj); 56 | } catch (IOException e) { 57 | throw new ElasticsearchGenerationException("Failed to serialize HLLPlus ", e); 58 | } finally { 59 | if (out != null) { 60 | try { 61 | out.close(); 62 | } catch (IOException e) { 63 | throw new RuntimeException("Exception on closing HLLPlus output stream ", e); 64 | } 65 | } 66 | } 67 | return new BytesRef(byteArrayOutputStream.toByteArray()); 68 | } 69 | 70 | public static HyperLogLogPlus deserializeHyperLogLogPlus(BytesRef bytesRef) { 71 | byte[] bytesToDeserialize = Arrays.copyOfRange(bytesRef.bytes, bytesRef.offset, bytesRef.offset + bytesRef.length); 72 | ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytesToDeserialize); 73 | ObjectInputStream in = null; 74 | try { 75 | in = new ObjectInputStream(byteArrayInputStream); 76 | if (in == null) { 77 | return null; 78 | } 79 | return (HyperLogLogPlus) in.readObject(); 80 | } catch (Exception e) { 81 | throw new ElasticsearchGenerationException("Failed to deserialize HLLPlus ", e); 82 | } finally { 83 | if (in != null) { 84 | try { 85 | in.close(); 86 | } catch (IOException e) { 87 | throw new RuntimeException("Exception on closing HLLPlus stream ", e); 88 | } 89 | } 90 | } 91 | 92 | } 93 | 94 | @Override 95 | public boolean needsScores() { 96 | return false; 97 | } 98 | 99 | @Override 100 | public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, 101 | final LeafBucketCollector sub) throws IOException { 102 | if (valuesSource == null) { 103 | return LeafBucketCollector.NO_OP_COLLECTOR; 104 | } 105 | final BigArrays bigArrays = context.bigArrays(); 106 | final SortedBinaryDocValues values = valuesSource.bytesValues(ctx); 107 | return new LeafBucketCollectorBase(sub, values) { 108 | @Override 109 | public void collect(int doc, long bucket) throws IOException { 110 | hyperLogLogPlusPlusObjectArray = bigArrays.grow(hyperLogLogPlusPlusObjectArray, bucket + 1); 111 | values.setDocument(doc); 112 | final int valuesCount = values.count(); 113 | HyperLogLogPlus hll; 114 | for (int i = 0; i < valuesCount; i++) { 115 | hll = deserializeHyperLogLogPlus(values.valueAt(i)); 116 | HyperLogLogPlus current = hyperLogLogPlusPlusObjectArray.get(bucket); 117 | if (current == null) { 118 | hyperLogLogPlusPlusObjectArray.set(bucket, hll); 119 | } else { 120 | try { 121 | hyperLogLogPlusPlusObjectArray.set(bucket, (HyperLogLogPlus) hll.merge(current)); 122 | } catch 
(CardinalityMergeException cme) { 123 | throw new ElasticsearchGenerationException("Failed to merge HyperLogLogPlus structures ", cme); 124 | } 125 | } 126 | 127 | } 128 | } 129 | }; 130 | } 131 | 132 | @Override 133 | public InternalAggregation buildAggregation(long bucket) throws IOException { 134 | if (valuesSource == null || bucket >= hyperLogLogPlusPlusObjectArray.size()) { 135 | return buildEmptyAggregation(); 136 | } 137 | 138 | BytesRef bytesRefToSerialize = HyperUniqueSumAggregator.serializeHyperLogLogPlus(hyperLogLogPlusPlusObjectArray.get(bucket)); 139 | return new InternalHyperUniqueSum(name, bytesRefToSerialize.bytes, format, pipelineAggregators(), metaData()); 140 | } 141 | 142 | @Override 143 | public InternalAggregation buildEmptyAggregation() { 144 | return new InternalHyperUniqueSum(name, new BytesRef().bytes, format, pipelineAggregators(), metaData()); 145 | } 146 | 147 | @Override 148 | public void doClose() { 149 | Releasables.close(hyperLogLogPlusPlusObjectArray); 150 | } 151 | 152 | @Override 153 | public double metric(long owningBucketOrd) { 154 | return hyperLogLogPlusPlusObjectArray.get(owningBucketOrd).cardinality(); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /es-5.4.3/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperUniqueSumAggregatorFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import org.elasticsearch.search.aggregations.Aggregator; 4 | import org.elasticsearch.search.aggregations.AggregatorFactories; 5 | import org.elasticsearch.search.aggregations.AggregatorFactory; 6 | import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; 7 | import org.elasticsearch.search.aggregations.support.ValuesSource; 8 | import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory; 9 | import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; 10 | import org.elasticsearch.search.internal.SearchContext; 11 | 12 | import java.io.IOException; 13 | import java.util.List; 14 | import java.util.Map; 15 | 16 | public class HyperUniqueSumAggregatorFactory extends ValuesSourceAggregatorFactory { 17 | 18 | public HyperUniqueSumAggregatorFactory(String name, ValuesSourceConfig config, SearchContext context, 19 | AggregatorFactory parent, AggregatorFactories.Builder subFactoriesBuilder, Map metaData) throws IOException { 20 | super(name, config, context, parent, subFactoriesBuilder, metaData); 21 | 22 | } 23 | 24 | @Override 25 | protected Aggregator createUnmapped(Aggregator parent, List pipelineAggregators, Map metaData) 26 | throws IOException { 27 | return new HyperUniqueSumAggregator(name, null, config.format(), context, parent, pipelineAggregators, metaData); 28 | } 29 | 30 | @Override 31 | protected Aggregator doCreateInternal(ValuesSource.Bytes valuesSource, Aggregator parent, boolean collectsFromSingleBucket, 32 | List pipelineAggregators, Map metaData) throws IOException { 33 | return new HyperUniqueSumAggregator(name, valuesSource, config.format(), context, parent, pipelineAggregators, metaData); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /es-5.4.3/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/InternalHyperUniqueSum.java: -------------------------------------------------------------------------------- 1 | package 
org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; 4 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; 5 | import org.apache.lucene.util.BytesRef; 6 | import org.elasticsearch.ElasticsearchGenerationException; 7 | import org.elasticsearch.common.io.stream.StreamInput; 8 | import org.elasticsearch.common.io.stream.StreamOutput; 9 | import org.elasticsearch.common.xcontent.XContentBuilder; 10 | import org.elasticsearch.search.DocValueFormat; 11 | import org.elasticsearch.search.aggregations.InternalAggregation; 12 | import org.elasticsearch.search.aggregations.metrics.InternalNumericMetricsAggregation; 13 | import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; 14 | 15 | import java.io.IOException; 16 | import java.util.List; 17 | import java.util.Map; 18 | 19 | public class InternalHyperUniqueSum extends InternalNumericMetricsAggregation.SingleValue { 20 | private final byte[] hyperLogLogPlusBytes; 21 | 22 | InternalHyperUniqueSum(String name, byte[] hyperLogLogPlusBytes, DocValueFormat formatter, List pipelineAggregators, 23 | Map metaData) { 24 | super(name, pipelineAggregators, metaData); 25 | this.hyperLogLogPlusBytes = hyperLogLogPlusBytes; 26 | this.format = formatter; 27 | } 28 | 29 | 30 | public InternalHyperUniqueSum(StreamInput in) throws IOException { 31 | super(in); 32 | format = in.readNamedWriteable(DocValueFormat.class); 33 | hyperLogLogPlusBytes = in.readByteArray(); 34 | } 35 | 36 | @Override 37 | protected void doWriteTo(StreamOutput out) throws IOException { 38 | out.writeNamedWriteable(format); 39 | out.writeByteArray(hyperLogLogPlusBytes); 40 | } 41 | 42 | @Override 43 | public String getWriteableName() { 44 | return HyperUniqueSumAggregationBuilder.NAME; 45 | } 46 | 47 | @Override 48 | public double value() { 49 | return HyperUniqueSumAggregator.deserializeHyperLogLogPlus(new BytesRef(hyperLogLogPlusBytes)).cardinality(); 50 | } 51 | 52 | 53 | @Override 54 | public InternalHyperUniqueSum doReduce(List aggregations, ReduceContext reduceContext) { 55 | HyperLogLogPlus total = new HyperLogLogPlus(HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION, HyperUniqueSumAggregationBuilder.SERIALIZED_SPARSE_PRECISION); 56 | for (InternalAggregation aggregation : aggregations) { 57 | byte[] bytes = ((InternalHyperUniqueSum) aggregation).hyperLogLogPlusBytes; 58 | if (bytes != null && bytes.length > 0) { 59 | HyperLogLogPlus current = HyperUniqueSumAggregator.deserializeHyperLogLogPlus(new BytesRef(bytes)); 60 | 61 | if (current != null) { 62 | try { 63 | total = (HyperLogLogPlus) total.merge(current); 64 | } catch (CardinalityMergeException cme) { 65 | throw new ElasticsearchGenerationException("Failed to merge HLL+ ", cme); 66 | 67 | } 68 | } 69 | } 70 | 71 | } 72 | return new InternalHyperUniqueSum(name, HyperUniqueSumAggregator.serializeHyperLogLogPlus(total).bytes, format, pipelineAggregators(), getMetaData()); 73 | } 74 | 75 | @Override 76 | public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { 77 | builder.field(CommonFields.VALUE.getPreferredName(), value()); 78 | if (format != DocValueFormat.RAW) { 79 | builder.field(CommonFields.VALUE_AS_STRING.getPreferredName(), format.format(value())); 80 | } 81 | return builder; 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /es-5.4.3/src/main/plugin-metadata/plugin-security.policy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/bazaarvoice/elasticsearch-hyperloglog/f3d2bb0c80ec8c0c0ec15ae94575a08ff0a7d4f5/es-5.4.3/src/main/plugin-metadata/plugin-security.policy -------------------------------------------------------------------------------- /es-5.4.3/src/test/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperLogLogPlusAggregationPluginTests.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | 4 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; 5 | import org.elasticsearch.action.index.IndexRequestBuilder; 6 | import org.elasticsearch.action.search.SearchResponse; 7 | import org.elasticsearch.plugins.Plugin; 8 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 9 | import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; 10 | import static org.elasticsearch.index.query.QueryBuilders.termQuery; 11 | import static org.hamcrest.Matchers.containsString; 12 | import static org.hamcrest.Matchers.equalTo; 13 | import static org.hamcrest.Matchers.notNullValue; 14 | 15 | import org.elasticsearch.search.aggregations.AggregationBuilders; 16 | import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; 17 | import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregation; 18 | import org.elasticsearch.test.ESIntegTestCase; 19 | 20 | import java.util.*; 21 | 22 | @ESIntegTestCase.SuiteScopeTestCase 23 | public class HyperLogLogPlusAggregationPluginTests extends ESIntegTestCase { 24 | 25 | // adds string representations of integers 0 .. max-1 26 | private String getHLLStringForTesting(int max) { 27 | HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION, HyperUniqueSumAggregationBuilder.SERIALIZED_SPARSE_PRECISION); 28 | for ( int i = 0; i < max ; i ++){ 29 | hyperLogLogPlus.offer(Integer.toString(i)); 30 | } 31 | return Base64.getEncoder().encodeToString(HyperUniqueSumAggregator.serializeHyperLogLogPlus(hyperLogLogPlus).bytes); 32 | } 33 | 34 | private String getHLLStringForTestingWithWrongPrecision(int max) { 35 | HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION + 5, HyperUniqueSumAggregationBuilder.SERIALIZED_SPARSE_PRECISION + 5); 36 | for ( int i = 0; i < max ; i ++){ 37 | hyperLogLogPlus.offer(Integer.toString(i)); 38 | } 39 | return Base64.getEncoder().encodeToString(HyperUniqueSumAggregator.serializeHyperLogLogPlus(hyperLogLogPlus).bytes); 40 | } 41 | 42 | @Override 43 | public void setupSuiteScopeCluster() throws Exception { 44 | 45 | List builders ; 46 | 47 | prepareCreate("idx_no_hll") 48 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 49 | .execute() 50 | .actionGet(); 51 | 52 | prepareCreate("idx_hll") 53 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 54 | .execute() 55 | .actionGet(); 56 | 57 | prepareCreate("idx_invalid_hll") 58 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 59 | .execute() 60 | .actionGet(); 61 | 62 | prepareCreate("idx_wrong_precision_hll") 63 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 64 | .execute() 65 | .actionGet(); 66 | 67 | builders = new ArrayList<>(); 68 | for (int i = 0; i < 2; i++) { 69 | 
builders.add(client().prepareIndex("idx_no_hll", "type", ""+i).setSource(jsonBuilder() 70 | .startObject() 71 | .field("value", i*2) 72 | .endObject())); 73 | } 74 | indexRandom(true, builders); 75 | builders = new ArrayList<>(); 76 | 77 | List hllEntries = new ArrayList(Arrays.asList("fred", "barney", "wilma")); 78 | List tags = new ArrayList(Arrays.asList("crazy","mayBeCrazy")); 79 | 80 | // First document will contain 100 unique values 81 | // second document will same values in first document + 10 additional unique values 82 | // total unique values in index should be 110 83 | for (int i = 0; i < 2; i++) { 84 | builders.add(client().prepareIndex("idx_hll", "type", "" + i).setSource(jsonBuilder() 85 | .startObject() 86 | .field("tag", tags.get(i)) 87 | .field("hll", getHLLStringForTesting(100 + i*10 )) 88 | .endObject())); 89 | } 90 | indexRandom(true, builders); 91 | 92 | builders = new ArrayList<>(); 93 | for (int i = 0; i < 2; i++) { 94 | builders.add(client().prepareIndex("idx_invalid_hll", "type", ""+i).setSource(jsonBuilder() 95 | .startObject() 96 | .field("value", i * 2) 97 | .field("hll",Base64.getEncoder().encode("invalid hll string".getBytes())) 98 | .endObject())); 99 | } 100 | indexRandom(true, builders); 101 | 102 | builders = new ArrayList<>(); 103 | for (int i = 0; i < 2; i++) { 104 | builders.add(client().prepareIndex("idx_wrong_precision_hll", "type", ""+i).setSource(jsonBuilder() 105 | .startObject() 106 | .field("value", i * 2) 107 | .field("hll", getHLLStringForTestingWithWrongPrecision(100 + i*10 )) 108 | .endObject())); 109 | } 110 | indexRandom(true, builders); 111 | 112 | ensureSearchable(); 113 | } 114 | 115 | public void testEmptyAggregation() throws Exception { 116 | 117 | SearchResponse searchResponse = client().prepareSearch("idx_no_hll") 118 | .setQuery(matchAllQuery()) 119 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 120 | .execute().actionGet(); 121 | 122 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(2L)); 123 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = searchResponse.getAggregations().get("hyperlog"); 124 | assertThat(numericMetricsAggregation, notNullValue()); 125 | assertEquals("expected 0.0 ", "0.0", numericMetricsAggregation.getValueAsString()); 126 | 127 | } 128 | 129 | public void testUniqueSum1() throws Exception { 130 | 131 | SearchResponse searchResponse = client().prepareSearch("idx_hll") 132 | .setQuery(matchAllQuery()) 133 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 134 | .execute().actionGet(); 135 | 136 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(2L)); 137 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = searchResponse.getAggregations().get("hyperlog"); 138 | assertThat(numericMetricsAggregation, notNullValue()); 139 | assertThat(numericMetricsAggregation.value(), equalTo(110.0)); 140 | 141 | } 142 | 143 | public void testUniqueSum2() throws Exception { 144 | 145 | SearchResponse searchResponse = client().prepareSearch("idx_hll") 146 | .setQuery(termQuery("tag", "crazy")) 147 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 148 | .execute().actionGet(); 149 | 150 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(1L)); 151 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = searchResponse.getAggregations().get("hyperlog"); 152 | assertThat(numericMetricsAggregation, notNullValue()); 153 | assertThat(numericMetricsAggregation.value(), 
equalTo(100.0)); 154 | 155 | } 156 | 157 | public void testInBuckets() throws Exception { 158 | 159 | SearchResponse searchResponse = client().prepareSearch("idx_hll") 160 | .setQuery(matchAllQuery()) 161 | .addAggregation(AggregationBuilders.terms("tag") 162 | .field("tag") 163 | .subAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll"))) 164 | .execute().actionGet(); 165 | 166 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(2L)); 167 | StringTerms stringTerms = searchResponse.getAggregations().get("tag"); 168 | assertThat(stringTerms.getBuckets().size(), equalTo(2)); 169 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = stringTerms.getBucketByKey("crazy").getAggregations().get("hyperlog"); 170 | assertThat(numericMetricsAggregation.value(), equalTo(100.0)); 171 | numericMetricsAggregation = stringTerms.getBucketByKey("mayBeCrazy").getAggregations().get("hyperlog"); 172 | assertThat(numericMetricsAggregation.value(), equalTo(110.0)); 173 | 174 | } 175 | 176 | public void testInvalidHLL() throws Exception { 177 | 178 | try { 179 | SearchResponse searchResponse = client().prepareSearch("idx_invalid_hll") 180 | .setQuery(matchAllQuery()) 181 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 182 | .execute().actionGet(); 183 | 184 | }catch (Exception ex){ 185 | assertThat(ex.toString(),containsString("Failed to deserialize HLLPlus")); 186 | } 187 | 188 | } 189 | 190 | 191 | @Override 192 | protected Collection> transportClientPlugins() { 193 | return Collections.singletonList(HyperLogLogPlusAggregationPlugin.class); 194 | } 195 | 196 | @Override 197 | protected Collection> getMockPlugins(){ 198 | ArrayList> mocks = new ArrayList<>(super.getMockPlugins()); 199 | mocks.add(HyperLogLogPlusAggregationPlugin.class); 200 | return mocks; 201 | } 202 | 203 | } 204 | 205 | -------------------------------------------------------------------------------- /es-5.5.0/build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | mavenLocal() 4 | mavenCentral() 5 | jcenter() 6 | } 7 | 8 | dependencies { 9 | classpath "org.elasticsearch.gradle:build-tools:5.5.0" 10 | } 11 | } 12 | 13 | group = 'org.elasticsearch.plugin.search' 14 | version = '0.1' 15 | 16 | apply plugin: 'java' 17 | apply plugin: 'elasticsearch.esplugin' 18 | apply plugin: 'idea' 19 | 20 | esplugin { 21 | name 'elasticsearch-hyperloglogplussum' 22 | description 'Allows aggregation of HyperloglogPlus serialized objects' 23 | classname 'org.elasticsearch.plugin.search.hyperloglogplusplugin.HyperLogLogPlusAggregationPlugin' 24 | 25 | } 26 | 27 | dependencies { 28 | compile 'com.clearspring.analytics:stream:2.9.5' 29 | testCompile 'org.elasticsearch.test:framework:5.5.0' 30 | } 31 | 32 | 33 | checkstyleMain.enabled = true 34 | checkstyleTest.enabled = true 35 | dependencyLicenses.enabled = false 36 | thirdPartyAudit.enabled = false 37 | 38 | 39 | -------------------------------------------------------------------------------- /es-5.5.0/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'elasticsearch-hyperloglogplus' 2 | -------------------------------------------------------------------------------- /es-5.5.0/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperLogLogPlusAggregationPlugin.java: -------------------------------------------------------------------------------- 1 | package 
org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import org.elasticsearch.plugins.Plugin; 4 | import org.elasticsearch.plugins.SearchPlugin; 5 | 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | 11 | public class HyperLogLogPlusAggregationPlugin extends Plugin implements SearchPlugin { 12 | 13 | @Override 14 | public List getAggregations() { 15 | ArrayList aggregationSpecs = new ArrayList<>(1); 16 | aggregationSpecs.add(new AggregationSpec(HyperUniqueSumAggregationBuilder.NAME, HyperUniqueSumAggregationBuilder::new, HyperUniqueSumAggregationBuilder::parse) 17 | .addResultReader(InternalHyperUniqueSum::new)); 18 | return aggregationSpecs; 19 | } 20 | 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /es-5.5.0/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperUniqueSumAggregationBuilder.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import org.elasticsearch.common.io.stream.StreamInput; 4 | import org.elasticsearch.common.io.stream.StreamOutput; 5 | import org.elasticsearch.common.xcontent.ObjectParser; 6 | import org.elasticsearch.common.xcontent.XContentBuilder; 7 | import org.elasticsearch.index.query.QueryParseContext; 8 | import org.elasticsearch.search.aggregations.AggregationBuilder; 9 | import org.elasticsearch.search.aggregations.AggregatorFactories.Builder; 10 | import org.elasticsearch.search.aggregations.AggregatorFactory; 11 | import org.elasticsearch.search.aggregations.support.*; 12 | import org.elasticsearch.search.internal.SearchContext; 13 | 14 | import java.io.IOException; 15 | 16 | public class HyperUniqueSumAggregationBuilder extends ValuesSourceAggregationBuilder { 17 | public static final String NAME = "hyperlogsum"; 18 | public static final int SERIALIZED_SPARSE_PRECISION = 25; 19 | public static final int SERIALIZED_DENSE_PRECISION = 14; 20 | 21 | 22 | private static final ObjectParser PARSER; 23 | 24 | static { 25 | PARSER = new ObjectParser<>(HyperUniqueSumAggregationBuilder.NAME); 26 | ValuesSourceParserHelper.declareBytesFields(PARSER, false, false); 27 | } 28 | 29 | public static AggregationBuilder parse(String aggregationName, QueryParseContext context) throws IOException { 30 | return PARSER.parse(context.parser(), new HyperUniqueSumAggregationBuilder(aggregationName), context); 31 | } 32 | 33 | public HyperUniqueSumAggregationBuilder(String name) { 34 | super(name, ValuesSourceType.BYTES, ValueType.STRING); 35 | } 36 | 37 | public HyperUniqueSumAggregationBuilder(StreamInput in) throws IOException { 38 | super(in, ValuesSourceType.BYTES, ValueType.STRING); 39 | } 40 | 41 | @Override 42 | protected void innerWriteTo(StreamOutput streamOutput) throws IOException { 43 | //noop 44 | } 45 | 46 | @Override 47 | protected HyperUniqueSumAggregatorFactory innerBuild(SearchContext context, ValuesSourceConfig config, 48 | AggregatorFactory parent, Builder subFactoriesBuilder) throws IOException { 49 | return new HyperUniqueSumAggregatorFactory(name, config, context, parent, subFactoriesBuilder, metaData); 50 | } 51 | 52 | @Override 53 | public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { 54 | return builder; 55 | } 56 | 57 | @Override 58 | protected int innerHashCode() { 59 | return 0; 60 | } 61 | 62 | @Override 63 | protected boolean innerEquals(Object obj) { 64 | return true; 65 | } 66 | 67 | 
@Override 68 | public String getType() { 69 | return NAME; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /es-5.5.0/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperUniqueSumAggregator.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; 4 | import org.apache.lucene.index.LeafReaderContext; 5 | import org.apache.lucene.util.BytesRef; 6 | import org.elasticsearch.ElasticsearchGenerationException; 7 | import org.elasticsearch.common.lease.Releasables; 8 | import org.elasticsearch.common.util.BigArrays; 9 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; 10 | 11 | import org.elasticsearch.common.util.ObjectArray; 12 | import org.elasticsearch.index.fielddata.SortedBinaryDocValues; 13 | import org.elasticsearch.search.DocValueFormat; 14 | import org.elasticsearch.search.aggregations.Aggregator; 15 | import org.elasticsearch.search.aggregations.InternalAggregation; 16 | import org.elasticsearch.search.aggregations.LeafBucketCollector; 17 | import org.elasticsearch.search.aggregations.LeafBucketCollectorBase; 18 | import org.elasticsearch.search.aggregations.AggregatorFactories; 19 | import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregator; 20 | import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; 21 | import org.elasticsearch.search.aggregations.support.ValuesSource; 22 | import org.elasticsearch.search.internal.SearchContext; 23 | 24 | import java.io.ByteArrayOutputStream; 25 | import java.io.ByteArrayInputStream; 26 | import java.io.ObjectOutputStream; 27 | import java.io.ObjectInputStream; 28 | import java.io.IOException; 29 | import java.io.Serializable; 30 | import java.util.Arrays; 31 | import java.util.List; 32 | import java.util.Map; 33 | 34 | public class HyperUniqueSumAggregator extends NumericMetricsAggregator.SingleValue { 35 | 36 | private final ValuesSource valuesSource; 37 | private final DocValueFormat format; 38 | private ObjectArray hyperLogLogPlusPlusObjectArray; 39 | 40 | HyperUniqueSumAggregator(String name, ValuesSource valuesSource, DocValueFormat formatter, SearchContext context, 41 | Aggregator parent, List pipelineAggregators, Map metaData) throws IOException { 42 | super(name, context, parent, pipelineAggregators, metaData); 43 | this.valuesSource = valuesSource; 44 | this.format = formatter; 45 | if (valuesSource != null) { 46 | hyperLogLogPlusPlusObjectArray = context.bigArrays().newObjectArray(1); 47 | } 48 | } 49 | 50 | public static BytesRef serializeHyperLogLogPlus(Serializable obj) { 51 | ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(512); 52 | ObjectOutputStream out = null; 53 | try { 54 | out = new ObjectOutputStream(byteArrayOutputStream); 55 | out.writeObject(obj); 56 | } catch (IOException e) { 57 | throw new ElasticsearchGenerationException("Failed to serialize HLLPlus ", e); 58 | } finally { 59 | if (out != null) { 60 | try { 61 | out.close(); 62 | } catch (IOException e) { 63 | throw new RuntimeException("Exception on closing HLLPlus output stream ", e); 64 | } 65 | } 66 | } 67 | return new BytesRef(byteArrayOutputStream.toByteArray()); 68 | } 69 | 70 | public static HyperLogLogPlus deserializeHyperLogLogPlus(BytesRef bytesRef) { 71 | byte[] bytesToDeserialize = Arrays.copyOfRange(bytesRef.bytes, 
bytesRef.offset, bytesRef.offset + bytesRef.length); 72 | ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytesToDeserialize); 73 | ObjectInputStream in = null; 74 | try { 75 | in = new ObjectInputStream(byteArrayInputStream); 76 | if (in == null) { 77 | return null; 78 | } 79 | return (HyperLogLogPlus) in.readObject(); 80 | } catch (Exception e) { 81 | throw new ElasticsearchGenerationException("Failed to deserialize HLLPlus ", e); 82 | } finally { 83 | if (in != null) { 84 | try { 85 | in.close(); 86 | } catch (IOException e) { 87 | throw new RuntimeException("Exception on closing HLLPlus stream ", e); 88 | } 89 | } 90 | } 91 | 92 | } 93 | 94 | @Override 95 | public boolean needsScores() { 96 | return false; 97 | } 98 | 99 | @Override 100 | public LeafBucketCollector getLeafCollector(LeafReaderContext ctx, 101 | final LeafBucketCollector sub) throws IOException { 102 | if (valuesSource == null) { 103 | return LeafBucketCollector.NO_OP_COLLECTOR; 104 | } 105 | final BigArrays bigArrays = context.bigArrays(); 106 | final SortedBinaryDocValues values = valuesSource.bytesValues(ctx); 107 | return new LeafBucketCollectorBase(sub, values) { 108 | @Override 109 | public void collect(int doc, long bucket) throws IOException { 110 | hyperLogLogPlusPlusObjectArray = bigArrays.grow(hyperLogLogPlusPlusObjectArray, bucket + 1); 111 | values.setDocument(doc); 112 | final int valuesCount = values.count(); 113 | HyperLogLogPlus hll; 114 | for (int i = 0; i < valuesCount; i++) { 115 | hll = deserializeHyperLogLogPlus(values.valueAt(i)); 116 | HyperLogLogPlus current = hyperLogLogPlusPlusObjectArray.get(bucket); 117 | if (current == null) { 118 | hyperLogLogPlusPlusObjectArray.set(bucket, hll); 119 | } else { 120 | try { 121 | hyperLogLogPlusPlusObjectArray.set(bucket, (HyperLogLogPlus) hll.merge(current)); 122 | } catch (CardinalityMergeException cme) { 123 | throw new ElasticsearchGenerationException("Failed to merge HyperLogLogPlus structures ", cme); 124 | } 125 | } 126 | 127 | } 128 | } 129 | }; 130 | } 131 | 132 | @Override 133 | public InternalAggregation buildAggregation(long bucket) throws IOException { 134 | if (valuesSource == null || bucket >= hyperLogLogPlusPlusObjectArray.size()) { 135 | return buildEmptyAggregation(); 136 | } 137 | 138 | BytesRef bytesRefToSerialize = HyperUniqueSumAggregator.serializeHyperLogLogPlus(hyperLogLogPlusPlusObjectArray.get(bucket)); 139 | return new InternalHyperUniqueSum(name, bytesRefToSerialize.bytes, format, pipelineAggregators(), metaData()); 140 | } 141 | 142 | @Override 143 | public InternalAggregation buildEmptyAggregation() { 144 | return new InternalHyperUniqueSum(name, new BytesRef().bytes, format, pipelineAggregators(), metaData()); 145 | } 146 | 147 | @Override 148 | public void doClose() { 149 | Releasables.close(hyperLogLogPlusPlusObjectArray); 150 | } 151 | 152 | @Override 153 | public double metric(long owningBucketOrd) { 154 | return hyperLogLogPlusPlusObjectArray.get(owningBucketOrd).cardinality(); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /es-5.5.0/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperUniqueSumAggregatorFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import org.elasticsearch.search.aggregations.Aggregator; 4 | import org.elasticsearch.search.aggregations.AggregatorFactories; 5 | import 
6 | import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; 7 | import org.elasticsearch.search.aggregations.support.ValuesSource; 8 | import org.elasticsearch.search.aggregations.support.ValuesSourceAggregatorFactory; 9 | import org.elasticsearch.search.aggregations.support.ValuesSourceConfig; 10 | import org.elasticsearch.search.internal.SearchContext; 11 | 12 | import java.io.IOException; 13 | import java.util.List; 14 | import java.util.Map; 15 |
16 | public class HyperUniqueSumAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource.Bytes, HyperUniqueSumAggregatorFactory> { 17 | 18 | public HyperUniqueSumAggregatorFactory(String name, ValuesSourceConfig<ValuesSource.Bytes> config, SearchContext context, 19 | AggregatorFactory<?> parent, AggregatorFactories.Builder subFactoriesBuilder, Map<String, Object> metaData) throws IOException { 20 | super(name, config, context, parent, subFactoriesBuilder, metaData); 21 | 22 | } 23 |
24 | @Override 25 | protected Aggregator createUnmapped(Aggregator parent, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) 26 | throws IOException { 27 | return new HyperUniqueSumAggregator(name, null, config.format(), context, parent, pipelineAggregators, metaData); 28 | } 29 |
30 | @Override 31 | protected Aggregator doCreateInternal(ValuesSource.Bytes valuesSource, Aggregator parent, boolean collectsFromSingleBucket, 32 | List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException { 33 | return new HyperUniqueSumAggregator(name, valuesSource, config.format(), context, parent, pipelineAggregators, metaData); 34 | } 35 | } 36 |
-------------------------------------------------------------------------------- /es-5.5.0/src/main/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/InternalHyperUniqueSum.java: --------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | import com.clearspring.analytics.stream.cardinality.CardinalityMergeException; 4 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; 5 | import org.apache.lucene.util.BytesRef; 6 | import org.elasticsearch.ElasticsearchGenerationException; 7 | import org.elasticsearch.common.io.stream.StreamInput; 8 | import org.elasticsearch.common.io.stream.StreamOutput; 9 | import org.elasticsearch.common.xcontent.XContentBuilder; 10 | import org.elasticsearch.search.DocValueFormat; 11 | import org.elasticsearch.search.aggregations.InternalAggregation; 12 | import org.elasticsearch.search.aggregations.metrics.InternalNumericMetricsAggregation; 13 | import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator; 14 | 15 | import java.io.IOException; 16 | import java.util.List; 17 | import java.util.Map; 18 |
19 | public class InternalHyperUniqueSum extends InternalNumericMetricsAggregation.SingleValue { 20 | private final byte[] hyperLogLogPlusBytes; 21 | 22 | InternalHyperUniqueSum(String name, byte[] hyperLogLogPlusBytes, DocValueFormat formatter, List<PipelineAggregator> pipelineAggregators, 23 | Map<String, Object> metaData) { 24 | super(name, pipelineAggregators, metaData); 25 | this.hyperLogLogPlusBytes = hyperLogLogPlusBytes; 26 | this.format = formatter; 27 | } 28 | 29 |
30 | public InternalHyperUniqueSum(StreamInput in) throws IOException { 31 | super(in); 32 | format = in.readNamedWriteable(DocValueFormat.class); 33 | hyperLogLogPlusBytes = in.readByteArray(); 34 | } 35 |
36 | @Override 37 | protected void doWriteTo(StreamOutput out) throws IOException { 38 | out.writeNamedWriteable(format); 39 |
out.writeByteArray(hyperLogLogPlusBytes); 40 | } 41 |
42 | @Override 43 | public String getWriteableName() { 44 | return HyperUniqueSumAggregationBuilder.NAME; 45 | } 46 |
47 | @Override 48 | public double value() { 49 | return HyperUniqueSumAggregator.deserializeHyperLogLogPlus(new BytesRef(hyperLogLogPlusBytes)).cardinality(); 50 | } 51 | 52 |
53 | @Override 54 | public InternalHyperUniqueSum doReduce(List<InternalAggregation> aggregations, ReduceContext reduceContext) { 55 | HyperLogLogPlus total = new HyperLogLogPlus(HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION, HyperUniqueSumAggregationBuilder.SERIALIZED_SPARSE_PRECISION); 56 | for (InternalAggregation aggregation : aggregations) { 57 | byte[] bytes = ((InternalHyperUniqueSum) aggregation).hyperLogLogPlusBytes; 58 | if (bytes != null && bytes.length > 0) { 59 | HyperLogLogPlus current = HyperUniqueSumAggregator.deserializeHyperLogLogPlus(new BytesRef(bytes)); 60 | 61 | if (current != null) { 62 | try { 63 | total = (HyperLogLogPlus) total.merge(current); 64 | } catch (CardinalityMergeException cme) { 65 | throw new ElasticsearchGenerationException("Failed to merge HLL+ ", cme); 66 | 67 | } 68 | } 69 | } 70 | 71 | } 72 | return new InternalHyperUniqueSum(name, HyperUniqueSumAggregator.serializeHyperLogLogPlus(total).bytes, format, pipelineAggregators(), getMetaData()); 73 | } 74 |
75 | @Override 76 | public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { 77 | builder.field(CommonFields.VALUE.getPreferredName(), value()); 78 | if (format != DocValueFormat.RAW) { 79 | builder.field(CommonFields.VALUE_AS_STRING.getPreferredName(), format.format(value())); 80 | } 81 | return builder; 82 | } 83 | 84 | } 85 |
-------------------------------------------------------------------------------- /es-5.5.0/src/main/plugin-metadata/plugin-security.policy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bazaarvoice/elasticsearch-hyperloglog/f3d2bb0c80ec8c0c0ec15ae94575a08ff0a7d4f5/es-5.5.0/src/main/plugin-metadata/plugin-security.policy -------------------------------------------------------------------------------- /es-5.5.0/src/test/java/org/elasticsearch/plugin/search/hyperloglogplusplugin/HyperLogLogPlusAggregationPluginTests.java: --------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.search.hyperloglogplusplugin; 2 | 3 | 4 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus; 5 | import org.elasticsearch.action.index.IndexRequestBuilder; 6 | import org.elasticsearch.action.search.SearchResponse; 7 | import org.elasticsearch.plugins.Plugin; 8 | import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; 9 | import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; 10 | import static org.elasticsearch.index.query.QueryBuilders.termQuery; 11 | import static org.hamcrest.Matchers.containsString; 12 | import static org.hamcrest.Matchers.equalTo; 13 | import static org.hamcrest.Matchers.notNullValue; 14 | 15 | import org.elasticsearch.search.aggregations.AggregationBuilders; 16 | import org.elasticsearch.search.aggregations.bucket.terms.StringTerms; 17 | import org.elasticsearch.search.aggregations.metrics.NumericMetricsAggregation; 18 | import org.elasticsearch.test.ESIntegTestCase; 19 | 20 | import java.util.*; 21 | 22 | @ESIntegTestCase.SuiteScopeTestCase 23 | public class HyperLogLogPlusAggregationPluginTests extends ESIntegTestCase {
24 | 25 | // adds string representations of integers 0 .. max-1 26 | private String getHLLStringForTesting(int max) { 27 | HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION, HyperUniqueSumAggregationBuilder.SERIALIZED_SPARSE_PRECISION); 28 | for (int i = 0; i < max; i++) { 29 | hyperLogLogPlus.offer(Integer.toString(i)); 30 | } 31 | return Base64.getEncoder().encodeToString(HyperUniqueSumAggregator.serializeHyperLogLogPlus(hyperLogLogPlus).bytes); 32 | } 33 |
34 | private String getHLLStringForTestingWithWrongPrecision(int max) { 35 | HyperLogLogPlus hyperLogLogPlus = new HyperLogLogPlus(HyperUniqueSumAggregationBuilder.SERIALIZED_DENSE_PRECISION + 5, HyperUniqueSumAggregationBuilder.SERIALIZED_SPARSE_PRECISION + 5); 36 | for (int i = 0; i < max; i++) { 37 | hyperLogLogPlus.offer(Integer.toString(i)); 38 | } 39 | return Base64.getEncoder().encodeToString(HyperUniqueSumAggregator.serializeHyperLogLogPlus(hyperLogLogPlus).bytes); 40 | } 41 |
42 | @Override 43 | public void setupSuiteScopeCluster() throws Exception { 44 | 45 | List<IndexRequestBuilder> builders; 46 | 47 | prepareCreate("idx_no_hll") 48 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 49 | .execute() 50 | .actionGet(); 51 | 52 | prepareCreate("idx_hll") 53 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 54 | .execute() 55 | .actionGet(); 56 | 57 | prepareCreate("idx_invalid_hll") 58 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 59 | .execute() 60 | .actionGet(); 61 | 62 | prepareCreate("idx_wrong_precision_hll") 63 | .addMapping("type", "tag", "type=keyword", "hll", "type=binary,doc_values=true") 64 | .execute() 65 | .actionGet(); 66 |
67 | builders = new ArrayList<>(); 68 | for (int i = 0; i < 2; i++) { 69 | builders.add(client().prepareIndex("idx_no_hll", "type", "" + i).setSource(jsonBuilder() 70 | .startObject() 71 | .field("value", i * 2) 72 | .endObject())); 73 | } 74 | indexRandom(true, builders); 75 | builders = new ArrayList<>(); 76 | 77 | List<String> hllEntries = new ArrayList<>(Arrays.asList("fred", "barney", "wilma")); 78 | List<String> tags = new ArrayList<>(Arrays.asList("crazy", "mayBeCrazy")); 79 |
80 | // The first document will contain 100 unique values, 81 | // the second document will contain the same values as the first plus 10 additional unique values, 82 | // so the total number of unique values in the index should be 110 83 | for (int i = 0; i < 2; i++) { 84 | builders.add(client().prepareIndex("idx_hll", "type", "" + i).setSource(jsonBuilder() 85 | .startObject() 86 | .field("tag", tags.get(i)) 87 | .field("hll", getHLLStringForTesting(100 + i * 10)) 88 | .endObject())); 89 | } 90 | indexRandom(true, builders); 91 |
92 | builders = new ArrayList<>(); 93 | for (int i = 0; i < 2; i++) { 94 | builders.add(client().prepareIndex("idx_invalid_hll", "type", "" + i).setSource(jsonBuilder() 95 | .startObject() 96 | .field("value", i * 2) 97 | .field("hll", Base64.getEncoder().encode("invalid hll string".getBytes())) 98 | .endObject())); 99 | } 100 | indexRandom(true, builders); 101 |
102 | builders = new ArrayList<>(); 103 | for (int i = 0; i < 2; i++) { 104 | builders.add(client().prepareIndex("idx_wrong_precision_hll", "type", "" + i).setSource(jsonBuilder() 105 | .startObject() 106 | .field("value", i * 2) 107 | .field("hll", getHLLStringForTestingWithWrongPrecision(100 + i * 10)) 108 | .endObject())); 109 | } 110 | indexRandom(true, builders); 111 | 112 | ensureSearchable(); 113 | } 114 | 115 | public void
testEmptyAggregation() throws Exception { 116 | 117 | SearchResponse searchResponse = client().prepareSearch("idx_no_hll") 118 | .setQuery(matchAllQuery()) 119 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 120 | .execute().actionGet(); 121 | 122 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(2L)); 123 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = searchResponse.getAggregations().get("hyperlog"); 124 | assertThat(numericMetricsAggregation, notNullValue()); 125 | assertEquals("expected 0.0", "0.0", numericMetricsAggregation.getValueAsString()); 126 | 127 | } 128 |
129 | public void testUniqueSum1() throws Exception { 130 | 131 | SearchResponse searchResponse = client().prepareSearch("idx_hll") 132 | .setQuery(matchAllQuery()) 133 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 134 | .execute().actionGet(); 135 | 136 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(2L)); 137 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = searchResponse.getAggregations().get("hyperlog"); 138 | assertThat(numericMetricsAggregation, notNullValue()); 139 | assertThat(numericMetricsAggregation.value(), equalTo(110.0)); 140 | 141 | } 142 |
143 | public void testUniqueSum2() throws Exception { 144 | 145 | SearchResponse searchResponse = client().prepareSearch("idx_hll") 146 | .setQuery(termQuery("tag", "crazy")) 147 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 148 | .execute().actionGet(); 149 | 150 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(1L)); 151 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = searchResponse.getAggregations().get("hyperlog"); 152 | assertThat(numericMetricsAggregation, notNullValue()); 153 | assertThat(numericMetricsAggregation.value(), equalTo(100.0)); 154 | 155 | } 156 |
157 | public void testInBuckets() throws Exception { 158 | 159 | SearchResponse searchResponse = client().prepareSearch("idx_hll") 160 | .setQuery(matchAllQuery()) 161 | .addAggregation(AggregationBuilders.terms("tag") 162 | .field("tag") 163 | .subAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll"))) 164 | .execute().actionGet(); 165 | 166 | assertThat(searchResponse.getHits().getTotalHits(), equalTo(2L)); 167 | StringTerms stringTerms = searchResponse.getAggregations().get("tag"); 168 | assertThat(stringTerms.getBuckets().size(), equalTo(2)); 169 | NumericMetricsAggregation.SingleValue numericMetricsAggregation = stringTerms.getBucketByKey("crazy").getAggregations().get("hyperlog"); 170 | assertThat(numericMetricsAggregation.value(), equalTo(100.0)); 171 | numericMetricsAggregation = stringTerms.getBucketByKey("mayBeCrazy").getAggregations().get("hyperlog"); 172 | assertThat(numericMetricsAggregation.value(), equalTo(110.0)); 173 | 174 | } 175 |
176 | public void testInvalidHLL() throws Exception { 177 | 178 | try { 179 | client().prepareSearch("idx_invalid_hll") 180 | .setQuery(matchAllQuery()) 181 | .addAggregation(new HyperUniqueSumAggregationBuilder("hyperlog").field("hll")) 182 | .execute().actionGet(); 183 | fail("expected the search to fail while deserializing the invalid HLL bytes"); 184 | } catch (Exception ex) { 185 | assertThat(ex.toString(), containsString("Failed to deserialize HLLPlus")); 186 | } 187 | 188 | } 189 | 190 |
191 | @Override 192 | protected Collection<Class<? extends Plugin>> transportClientPlugins() { 193 | return Collections.singletonList(HyperLogLogPlusAggregationPlugin.class); 194 | } 195 | 196 | @Override 197 | protected Collection<Class<? extends Plugin>>
getMockPlugins() { 198 | ArrayList<Class<? extends Plugin>> mocks = new ArrayList<>(super.getMockPlugins()); 199 | mocks.add(HyperLogLogPlusAggregationPlugin.class); 200 | return mocks; 201 | } 202 | 203 | } 204 | 205 |
-------------------------------------------------------------------------------- /spark/README.md: --------------------------------------------------------------------------------
1 | # Spark UDAF to write HyperLogLog byte arrays 2 | 3 | 1 - Reference [stream-lib](https://mvnrepository.com/artifact/com.clearspring.analytics/stream/2.9.5) in your build configuration. 4 | 5 | 2 - Copy the Spark UDAF code [hyperloglogUDAF.scala](./hyperloglogUDAF.scala) into your Spark project. This UDAF collects column values into a `HyperLogLogPlus` structure. 6 | 7 | ### Example 8 | ```scala 9 | val hll = new HyperLogPlusPlusAgg() 10 | 11 | val visitorHLL = pageViews 12 | .groupBy($"product") 13 | .agg(hll($"VisitorID").as("HLL")) 14 | .toDF() 15 | 16 | ``` 17 |
18 | The resulting DataFrame contains an "HLL" column: a Base64-encoded String representation of the serialized HyperLogLogPlus object built 19 | from the VisitorID values of each group. It can be indexed into Elasticsearch as usual, for example with the [elasticsearch-spark](https://github.com/elastic/elasticsearch-hadoop#apache-spark) library: 20 | 21 | ```scala 22 | import org.elasticsearch.spark.sql._ 23 | 24 | df.saveToEs(...) 25 | 26 | ```
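27 | 28 | To sanity-check the encoded column before indexing it, the Base64 string can be decoded and deserialized back into a `HyperLogLogPlus` with stream-lib. The following is a minimal sketch (not part of the UDAF; `approxCount` is an illustrative helper, and it assumes the `product` column is a String): 29 | 30 | ```scala 31 | import java.io.{ByteArrayInputStream, ObjectInputStream} 32 | import java.util.Base64 33 | 34 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus 35 | 36 | // decode one Base64-encoded HLL string produced by HyperLogPlusPlusAgg 37 | // and return its estimated distinct count (sketch for sanity-checking only) 38 | def approxCount(base64HLL: String): Long = { 39 |   val bytes = Base64.getDecoder.decode(base64HLL) 40 |   val in = new ObjectInputStream(new ByteArrayInputStream(bytes)) 41 |   try in.readObject.asInstanceOf[HyperLogLogPlus].cardinality() 42 |   finally in.close() 43 | } 44 | 45 | visitorHLL.select("product", "HLL").collect().foreach { row => 46 |   println(s"${row.getString(0)} -> ~${approxCount(row.getString(1))} unique visitors") 47 | } 48 | ```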
-------------------------------------------------------------------------------- /spark/hyperloglogUDAF.scala: --------------------------------------------------------------------------------
1 | import java.io._ 2 | import java.util.Base64 3 | 4 | import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} 5 | import org.apache.spark.sql.Row 6 | import org.apache.spark.sql.types._ 7 | import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus 8 | /** 9 | * UDAF which builds an HLL+ sketch from String column values and returns 10 | * the serialized HLL object encoded as a Base64 String 11 | * 12 | * Uses the stream-lib HLL+ implementation 13 | */ 14 | class HyperLogPlusPlusAgg extends UserDefinedAggregateFunction { 15 |
16 | @throws(classOf[IOException]) 17 | def serializeHLL(obj: Object): Array[Byte] = { 18 | val baos: ByteArrayOutputStream = new ByteArrayOutputStream(512) 19 | var out: ObjectOutputStream = null 20 | try { 21 | out = new ObjectOutputStream(baos) 22 | out.writeObject(obj) 23 | } finally { 24 | if (out != null) { 25 | out.close() 26 | } 27 | } 28 | return baos.toByteArray 29 | } 30 |
31 | @throws(classOf[ClassNotFoundException]) 32 | @throws(classOf[IOException]) 33 | def deserializeHLL(bytes: Array[Byte]): HyperLogLogPlus = { 34 | val bais: ByteArrayInputStream = new ByteArrayInputStream(bytes) 35 | var in: ObjectInputStream = null 36 | try { 37 | in = new ObjectInputStream(bais) 38 | return in.readObject.asInstanceOf[HyperLogLogPlus] 39 | } finally { 40 | if (in != null) { 41 | in.close() 42 | } 43 | } 44 | } 45 | // input could be any type, but we use String; this could be made generic 46 | override def inputSchema: org.apache.spark.sql.types.StructType = 47 | StructType(StructField("value", StringType) :: Nil) 48 |
49 | // internal buffer field holding the serialized aggregate 50 | override def bufferSchema: StructType = StructType( 51 | StructField("hllbits", BinaryType) :: Nil 52 | ) 53 | 54 | // output will be a Base64-encoded HLL byte array 55 | override def dataType: DataType = StringType 56 | 57 | override def deterministic: Boolean = true 58 |
59 | // initialize the buffer with an empty HLL (the precisions must match what the consumer of the sketch expects) 60 | override def initialize(buffer: MutableAggregationBuffer): Unit = { 61 | val hll = new HyperLogLogPlus(14, 25) 62 | buffer(0) = serializeHLL(hll) 63 | } 64 |
65 | // deserialize the buffer's HLL, offer the new input value, and serialize it back 66 | override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 67 | val hll = deserializeHLL(buffer.getAs[Array[Byte]](0)) 68 | hll.offer(input.getAs[String](0)) 69 | buffer(0) = serializeHLL(hll) 70 | } 71 |
72 | // merge HLLs to buffer 73 | override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 74 | val mergedHLL = deserializeHLL(buffer1.getAs[Array[Byte]](0)) 75 | .merge(deserializeHLL(buffer2.getAs[Array[Byte]](0))) 76 | buffer1(0) = serializeHLL(mergedHLL) 77 | } 78 |
79 | // convert the serialized HLL from the buffer to Base64 80 | override def evaluate(buffer: Row): Any = { 81 | new String(Base64.getEncoder.encode(buffer.getAs[Array[Byte]](0))) 82 | } 83 | } 84 | --------------------------------------------------------------------------------