├── App.config
├── README.md
├── LICENSE
└── EliasFanoCompression.cs
/App.config:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | EliasFanoCompression
2 | ====================
3 |
4 | EliasFanoCompression: quasi-succinct compression of sorted integers in C#
5 |
6 | Elias-Fano encoding is **quasi-succinct**, which means it is **almost as good as the theoretically best possible compression scheme** for sorted integers.
7 | While it can be used to compress any sorted list of integers, we will use it for compressing posting lists of inverted indexes.
8 | Based on a research paper by Sebastiano Vigna: http://vigna.di.unimi.it/ftp/papers/QuasiSuccinctIndices.pdf
9 |
10 | #### Blog Post
11 | [Elias-Fano: quasi-succinct compression of sorted integers in C#](https://seekstorm.com/blog/elias-fano-succinct-compression-sorted-integers-csharp/)
12 |
13 | ```
14 | Copyright (C) 2016 Wolf Garbe
15 | Version: 1.0
16 | Author: Wolf Garbe
17 | Maintainer: Wolf Garbe
18 | URL: https://seekstorm.com/blog/elias-fano-succinct-compression-sorted-integers-csharp/
19 | Description: https://seekstorm.com/blog/elias-fano-succinct-compression-sorted-integers-csharp/
20 |
21 | License:
22 | This program is free software; you can redistribute it and/or modify
23 | it under the terms of the GNU Lesser General Public License,
24 | version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
25 | http://www.opensource.org/licenses/LGPL-3.0
26 | ```
27 |
28 | ---
29 |
30 | **EliasFanoCompression** is contributed by [**SeekStorm** - the high-performance Search as a Service & search API](https://seekstorm.com)
31 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/EliasFanoCompression.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Diagnostics;
4 |
5 |
6 | // EliasFanoCompression: quasi-succinct compression of sorted integers in C#
7 | //
8 | // Elias-Fano encoding is quasi-succinct, which means it is almost as good as the theoretically best possible compression scheme for sorted integers.
9 | // While it can be used to compress any sorted list of integers, we will use it for compressing posting lists of inverted indexes.
10 | // Based on a research paper by Sebastiano Vigna: http://vigna.di.unimi.it/ftp/papers/QuasiSuccinctIndices.pdf
11 | //
12 | // Copyright (C) 2016 Wolf Garbe
13 | // Version: 1.0
14 | // Author: Wolf Garbe
15 | // Maintainer: Wolf Garbe
16 | // URL: https://seekstorm.com/blog/elias-fano-succinct-compression-sorted-integers-csharp/
17 | // Description: https://seekstorm.com/blog/elias-fano-succinct-compression-sorted-integers-csharp/
18 | //
19 | // License:
20 | // This program is free software; you can redistribute it and/or modify
21 | // it under the terms of the GNU Lesser General Public License,
22 | // version 3.0 (LGPL-3.0) as published by the Free Software Foundation.
23 | // http://www.opensource.org/licenses/LGPL-3.0
24 |
25 |
26 | static class EliasFanoCompression
27 | {
28 | public static Random rnd = new Random(500);
29 |
30 | // generates a sorted list of n integers with no duplicates within range
31 | public static List generatePostingList(int n, int range)
32 | {
33 | if ((n < 1) || (n > range) || (range < 1)) Console.WriteLine("n within 1...range and range>0!");
34 |
35 | List postingList = new List(n);
36 |
37 | // hashset fits in RAM && enough gaps (n*1.1 hs = new HashSet();
42 | while (hs.Count < n)
43 | {
44 | uint docID = (uint)rnd.Next(1, range);
45 |
46 | // make sure docid are unique!
47 | // strictly positive delta, no zero allowed (we dont allow a zero for the docid because then the delta for the first docid in a posting list could be zero)
48 | if (hs.Add(docID)) postingList.Add(docID);
49 | }
50 | postingList.Sort();
51 | }
52 | else
53 | {
54 | // slow for sparse lists as it loops through whole range, fast for dense lists, no hashset required, no sorting required
55 | for (uint i = 1; i <= range; i++)
56 | {
// derived from: if ( rnd.Next(range) ...
// NOTE(review): the rest of the comment above, the tail of generatePostingList and the head of the
// signature below are missing from this listing; the signature is restored from the call in Main.

// Elias-Fano compression of a strictly increasing list of docIDs (all > 0).
//
// Layout written to compressedBuffer:
//   bytes 0..3 : postingList.Count, least significant byte first
//   byte  4    : lowBitsLength (number of explicitly stored low bits per docID delta)
//   bytes 5..  : low bits of (delta - 1) for every docID, bit-packed, most significant bit first
//   bytes (lowBitsLength * Count / 8 + 6).. : high bits of (delta - 1) as unary codes (0=1, 1=01, 2=001, ...)
//
// postingList               : sorted docIDs, strictly increasing, no zero (the stored delta is docID - previous - 1)
// compressedBuffer          : destination, must be large enough for the encoded data
// compressedBufferPointer2  : on return, the index one past the last byte written (total compressed length)
//
// An empty postingList is encoded as the header only (count 0); compressedBufferPointer2 is then 6,
// which is where the decoder expects the high-bits section to start, and no statistics line is printed.
// Throws ArgumentException if the docIDs are not strictly increasing or contain 0.
//
// Background:
// https://en.wikipedia.org/wiki/Unary_coding
// http://vigna.di.unimi.it/ftp/papers/QuasiSuccinctIndices.pdf
// http://shonan.nii.ac.jp/seminar/029/wp-content/uploads/sites/12/2013/07/Sebastiano_Shonan.pdf
// http://www.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf
// http://hpc.isti.cnr.it/hpcworkshop2014/PartitionedEliasFanoIndexes.pdf
public static void EliasFanoCompress(List<uint> postingList, byte[] compressedBuffer, ref int compressedBufferPointer2)
{
    Stopwatch sw = Stopwatch.StartNew();

    int docIdCount = postingList.Count;
    int compressedBufferPointer1 = 0;

    // store postingList.Count for decompression: LSB first
    compressedBuffer[compressedBufferPointer1++] = (byte)(docIdCount & 255);
    compressedBuffer[compressedBufferPointer1++] = (byte)((docIdCount >> 8) & 255);
    compressedBuffer[compressedBufferPointer1++] = (byte)((docIdCount >> 16) & 255);
    compressedBuffer[compressedBufferPointer1++] = (byte)((docIdCount >> 24) & 255);

    if (docIdCount == 0)
    {
        // nothing to encode: header only, zero low bits, empty high-bits section
        compressedBuffer[compressedBufferPointer1++] = 0;
        compressedBufferPointer2 = 6;
        return;
    }

    // number of low bits stored verbatim: floor(log2(u / n)), never negative
    uint largestDocId = postingList[docIdCount - 1];
    double averageDelta = (double)largestDocId / (double)docIdCount;
    double averageDeltaLog = Math.Log(averageDelta, 2);
    int lowBitsLength = (int)Math.Floor(averageDeltaLog); if (lowBitsLength < 0) lowBitsLength = 0;
    ulong lowBitsMask = (((ulong)1 << lowBitsLength) - 1);

    // store lowerBitsLength for decompression
    compressedBuffer[compressedBufferPointer1++] = (byte)lowBitsLength;

    // +6 : for docid number, lowerBitsLength and ceiling (5 header bytes + rounding up the low-bits section)
    compressedBufferPointer2 = lowBitsLength * docIdCount / 8 + 6;

    uint lastDocID = 0;

    // low-bits bit buffer: at most 7 pending bits + lowBitsLength (< 32) new bits, so 64 bits are always enough
    ulong lowBuffer = 0;
    int lowBufferLength = 0;

    // high-bits bit buffer: always holds fewer than 8 pending bits between steps
    uint highBuffer = 0;
    int highBufferLength = 0;

    foreach (uint docID in postingList)
    {
        // docID strictly monotone/increasing numbers, docIdDelta strictly positive, no zero allowed
        if (docID <= lastDocID)
        {
            throw new ArgumentException("postingList must be strictly increasing and must not contain 0", "postingList");
        }
        uint docIdDelta = (docID - lastDocID - 1);

        // low bits: store the lower l = log(u / n) bits explicitly (bit packing)
        lowBuffer <<= lowBitsLength;
        lowBuffer |= (docIdDelta & lowBitsMask);
        lowBufferLength += lowBitsLength;

        // flush complete bytes to compressedBuffer
        while (lowBufferLength > 7)
        {
            lowBufferLength -= 8;
            compressedBuffer[compressedBufferPointer1++] = (byte)(lowBuffer >> lowBufferLength);
        }

        // high bits: unary code = (docIdDelta >> lowBitsLength) zero bits followed by a single one bit.
        // The zero run can be far longer than 64 bits for skewed lists, so it is emitted at most one byte
        // at a time; shifting a 64-bit buffer by the whole code length would silently wrap the shift count.
        uint zeroRun = docIdDelta >> lowBitsLength;
        while (zeroRun > 0)
        {
            int take = (int)Math.Min(zeroRun, (uint)(8 - highBufferLength));
            highBuffer <<= take;
            highBufferLength += take;
            zeroRun -= (uint)take;

            if (highBufferLength == 8)
            {
                compressedBuffer[compressedBufferPointer2++] = (byte)highBuffer;
                highBuffer = 0;
                highBufferLength = 0;
            }
        }

        // terminating one bit of the unary code
        highBuffer = (highBuffer << 1) | 1u;
        highBufferLength++;
        if (highBufferLength == 8)
        {
            compressedBuffer[compressedBufferPointer2++] = (byte)highBuffer;
            highBuffer = 0;
            highBufferLength = 0;
        }

        lastDocID = docID;
    }

    // final flush: left-align the remaining bits, pad with zeros
    if (lowBufferLength > 0)
    {
        compressedBuffer[compressedBufferPointer1++] = (byte)(lowBuffer << (8 - lowBufferLength));
    }

    if (highBufferLength > 0)
    {
        compressedBuffer[compressedBufferPointer2++] = (byte)(highBuffer << (8 - highBufferLength));
    }

    Console.WriteLine("\rCompression: " + sw.ElapsedMilliseconds.ToString("N0") + " ms " + postingList.Count.ToString("N0") +" DocID delta: " + averageDelta.ToString("N2") + " low bits: " + lowBitsLength.ToString() + " bits/DocID: " + ((double)compressedBufferPointer2 * (double)8 / (double)postingList.Count).ToString("N2")+" (" + (2+averageDeltaLog).ToString("N2")+") uncompressed: " + ((ulong)postingList.Count*4).ToString("N0") + " compressed: " + compressedBufferPointer2.ToString("N0") +" ratio: "+ ((double)postingList.Count * 4/ compressedBufferPointer2).ToString("N2")) ;
}
167 |
// Lookup tables for decoding one byte of the unary-coded high-bits stream at a time.
// Bits are read from the most significant bit (bit 7) down to bit 0.

// [b, k] = number of zero bits in byte b directly before its k-th one bit (k = 0 is the first one bit),
// counted from the previous one bit in the same byte, or from the start of the byte for k = 0
public static uint[,] decodingTableHighBits = new uint[256, 8];

// [b] = number of one bits in byte b = number of docIDs whose unary code ends in this byte
public static byte[] decodingTableDocIdNumber = new byte[256];

// [b] = number of zero bits after the last one bit in byte b (8 if b == 0); they belong to the next docID
public static byte[] decodingTableHighBitsCarryover = new byte[256];

// Fills the three decoding tables above for all 256 byte values.
// Safe to call repeatedly: each call recomputes every entry from scratch and yields the same tables.
public static void eliasFanoCreateDecodingTable()
{
    for (int byteValue = 0; byteValue < 256; byteValue++)
    {
        // counted locally (not in decodingTableDocIdNumber itself) so a repeated call
        // does not continue from the previous call's totals
        byte oneCount = 0;
        byte zeroCount = 0;

        for (int bit = 7; bit >= 0; bit--)
        {
            if ((byteValue & (1 << bit)) != 0)
            {
                // unary code of high bits of nth docid within this byte
                decodingTableHighBits[byteValue, oneCount] = zeroCount;
                oneCount++;
                zeroCount = 0;
            }
            else
            {
                // count 0 since last 1 within byteValue
                zeroCount++;
            }
        }

        // docIdNumber = number of docid = number of 1 within one byte
        decodingTableDocIdNumber[byteValue] = oneCount;

        // number of trailing zeros (zeros carryover), if whole byte=0 then unaryCodeLength+=8
        decodingTableHighBitsCarryover[byteValue] = zeroCount;
    }
}
199 |
// Decodes an Elias-Fano compressed posting list into postingList, using the decoding tables
// filled by eliasFanoCreateDecodingTable (which must have been called beforehand).
//
// compressedBuffer        : header (4-byte count LSB first, 1-byte lowBitsLength), then the bit-packed
//                           low bits, then the unary-coded high bits
// compressedBufferPointer : index one past the last high-bits byte to read
// postingList             : receives the decoded docIDs (an array is faster than a list, at the price of a fixed size)
// resultPointer           : index of the next free slot in postingList; advanced by one per decoded docID
public static void EliasFanoDecompress(byte[] compressedBuffer, int compressedBufferPointer, uint[] postingList, ref int resultPointer)
{
    Stopwatch timer = Stopwatch.StartNew();

    // header: docID count (LSB first) followed by the number of low bits per docID
    int lowPos = 0;
    int postingListCount = compressedBuffer[lowPos++];
    postingListCount |= (int)compressedBuffer[lowPos++] << 8;
    postingListCount |= (int)compressedBuffer[lowPos++] << 16;
    postingListCount |= (int)compressedBuffer[lowPos++] << 24;
    byte lowBitsLength = compressedBuffer[lowPos++];

    ulong previousDocId = 0;
    ulong value = 0;

    // low-bits reader state: the last byte read and how many of its bits are still unused
    byte unusedLowBits = 0;
    byte lastLowByte = 0;

    // previously read high-bits byte; starts as 1 because that byte value carries over zero trailing zeros
    byte highByte = 1;

    int highPos = lowBitsLength * postingListCount / 8 + 6;
    while (highPos < compressedBufferPointer)
    {
        // zeros after the last one bit of the previous byte belong to the unary code continuing here
        // (a byte that is entirely zero contributes 8)
        value += decodingTableHighBitsCarryover[highByte];
        highByte = compressedBuffer[highPos];
        highPos++;

        // every one bit in this byte terminates the unary code of one docID
        byte codesInByte = decodingTableDocIdNumber[highByte];
        for (byte slot = 0; slot < codesInByte; slot++)
        {
            // start with the bits left over from the previously read low-bits byte ...
            uint leftoverMask = (1u << unusedLowBits) - 1u;
            value = (value << unusedLowBits) | (lastLowByte & leftoverMask);

            // ... and append whole bytes until at least lowBitsLength bits are available
            while (unusedLowBits < lowBitsLength)
            {
                value <<= 8;

                lastLowByte = compressedBuffer[lowPos++];
                value |= lastLowByte;
                unusedLowBits += 8;
            }
            unusedLowBits -= lowBitsLength;

            // drop the surplus bits on the right, they belong to the next docID
            value >>= unusedLowBits;

            // add the high bits found in this byte, then turn the stored (delta - 1) into an absolute docID
            value += (decodingTableHighBits[highByte, slot] << lowBitsLength) + previousDocId + 1u;

            postingList[resultPointer++] = (uint)value;
            previousDocId = value;
            value = 0;
        }
    }

    Console.WriteLine("\rDecompression: " + timer.ElapsedMilliseconds.ToString("N0") + " ms " + postingListCount.ToString("N0") + " DocID");
}
260 |
// Benchmark driver: for posting list lengths 10, 100, ... up to maximumPostingListLength it generates a
// random posting list, compresses it, decompresses it again and verifies that the round trip is lossless.
static void Main(string[] args)
{
    // init
    Console.Write("Create decoding table...");
    eliasFanoCreateDecodingTable();

    // Widening the window only makes the statistics lines fit on one row; it is not supported on
    // non-Windows platforms and fails when output is redirected, which must not abort the benchmark.
    try
    {
        Console.SetWindowSize(Math.Min(180, Console.LargestWindowWidth), Console.WindowHeight);
    }
    catch (PlatformNotSupportedException)
    {
        // resizing is Windows-only: keep the current window size
    }
    catch (System.IO.IOException)
    {
        // no console window attached (e.g. redirected output): nothing to resize
    }
    catch (ArgumentOutOfRangeException)
    {
        // requested size not valid for the current console buffer: keep the current window size
    }

    int indexedPages = 1000000000;

    // may be increased to 1,000,000,000 (>2 GB) if: >=16 GB RAM, 64 bit Windows, .NET version >= 4.5,
    // large arrays allowed in the config file, Project / Properties / Build / Prefer 32-bit disabled!
    // NOTE(review): the config element name is missing from this copy - presumably gcAllowVeryLargeObjects, see link.
    // http://stackoverflow.com/questions/25225249/maxsize-of-array-in-net-framework-4-5
    int maximumPostingListLength = 1000000000;

    for (int postingListLength = 10; postingListLength <= maximumPostingListLength; postingListLength *= 10)
    {
        // posting list creation
        Console.Write("\rCreate posting list...");
        List<uint> postingList1 = generatePostingList(postingListLength, indexedPages);

        // compression
        Console.Write("\rCompress posting list...");
        // maximum compressed size: (2 + ceil(log2(u / n))) bits per docID, plus header and rounding
        int maxCompressedSize = (int)((2 + Math.Ceiling(Math.Log((double)postingList1[postingList1.Count-1] / (double)postingList1.Count, 2))) * postingList1.Count / 8) + 6;
        byte[] compressedBuffer1 = new byte[maxCompressedSize];
        int compressedBufferPointer1 = 0;
        EliasFanoCompress(postingList1, compressedBuffer1, ref compressedBufferPointer1);

        // decompression
        Console.Write("Decompress posting list...");
        uint[] postingList10 = new uint[postingListLength];
        int resultPointer1 = 0;
        EliasFanoDecompress(compressedBuffer1, compressedBufferPointer1, postingList10, ref resultPointer1);

        // verification: same number of docIDs and identical values
        Console.Write("Verify posting list...");
        bool error = false;
        for (int i = 0; i < resultPointer1; i++) if (postingList1[i] != postingList10[i]) { error=true;break; }
        if (resultPointer1 != postingList1.Count) error = true;
        if (error) Console.WriteLine("\rVerification failed! ");
    }


    Console.WriteLine("\rPress any key to exit");
    try
    {
        Console.ReadKey();
    }
    catch (InvalidOperationException)
    {
        // input is redirected (no interactive console): exit without waiting
    }
}
307 | }
308 |
309 |
--------------------------------------------------------------------------------