├── README.md
├── bench.c
├── images
    ├── graph1.png
    └── graph2.png
├── octosort.c
└── octosort.h


/README.md:
--------------------------------------------------------------------------------
  1 | Origin
  2 | ------
  3 | Octosort is a block merge sort based on [WikiSort](https://github.com/BonzaiThePenguin/WikiSort) and [quadsort](https://github.com/scandum/quadsort). This document primarily lists notable differences and some benchmarks.
  4 | 
  5 | Octo swap
  6 | ---------
  7 | Just as quadsort has the quad swap, octosort has the octo swap. The swap sorts between 4 and 8 elements at a time and performs runs on reverse ordered data.
  8 | 
  9 | Monobound binary search
 10 | -----------------------
 11 | WikiSort's binary search has been replaced with a [monobound binary search](https://github.com/scandum/binary_search), which is up to two times faster.
 12 | 
 13 | Gries-Mills rotation
 14 | --------------------
 15 | WikiSort's triple reversal rotation has been replaced with a Gries-Mills rotation, which is up to two times faster.
 16 | 
 17 | Quad merge
 18 | ----------
 19 | WikiSort already implemented a quad merge, which has been updated to no longer detect reverse order runs, since that's taken care of by the octo swap.
 20 | 
 21 | Tail merge
 22 | ----------
 23 | Quadsort's tail merge routine was added to perform partially in-place merges.
 24 | 
 25 | Data Types
 26 | ----------
 27 | Support was added for long doubles and 8, 16, 32, and 64 bit data types. By using 32 or 64 bit pointers it's possible to sort any other data type.
 28 | 
 29 | Interface
 30 | ---------
 31 | The interface was changed to use the same one as qsort, which is described in [man qsort](https://man7.org/linux/man-pages/man3/qsort.3p.html).
 32 | 
 33 | Memory
 34 | ------
 35 | By default octosort uses 512 elements worth of stack memory.
 36 | 
 37 | The minimum memory requirement for octosort is 1 element of stack memory; it can also be configured to use n / 2 memory.
 38 | 
 39 | Big O
 40 | -----
 41 | ```cobol
 42 |                  ┌───────────────────────┐┌───────────────────────┐
 43 |                  │comparisons            ││swap memory            │
 44 | ┌───────────────┐├───────┬───────┬───────┤├───────┬───────┬───────┤┌──────┐┌─────────┐┌─────────┐
 45 | │name           ││min    │avg    │max    ││min    │avg    │max    ││stable││partition││adaptive │
 46 | ├───────────────┤├───────┼───────┼───────┤├───────┼───────┼───────┤├──────┤├─────────┤├─────────┤
 47 | │mergesort      ││n log n│n log n│n log n││n      │n      │n      ││yes   ││no       ││no       │
 48 | ├───────────────┤├───────┼───────┼───────┤├───────┼───────┼───────┤├──────┤├─────────┤├─────────┤
 49 | │octosort       ││n      │n log n│n log n││1      │1      │1      ││yes   ││no       ││yes      │
 50 | ├───────────────┤├───────┼───────┼───────┤├───────┼───────┼───────┤├──────┤├─────────┤├─────────┤
 51 | │quicksort      ││n      │n log n│n²     ││1      │1      │1      ││no    ││yes      ││no       │
 52 | └───────────────┘└───────┴───────┴───────┘└───────┴───────┴───────┘└──────┘└─────────┘└─────────┘
 53 | ```
 54 | 
 55 | Benchmarks
 56 | ----------
 57 | The following benchmark was run on WSL 2 using gcc version 7.5.0 (Ubuntu 7.5.0-3ubuntu1~18.04).
 58 | The source code was compiled using gcc -O3 bench.c. Each test was run 100 times
 59 | and only the best run is reported. It was generated by running the benchmark using
 60 | 100000 100 as the arguments.
 61 | 
 62 | ![Graph](/images/graph1.png)
 63 | 
 64 | <details><summary><b>data table</b></summary>
 65 | 
 66 | |      Name |    Items | Type |     Best |  Average |  Compares | Samples |     Distribution |
 67 | | --------- | -------- | ---- | -------- | -------- | --------- | ------- | ---------------- |
 68 | |     qsort |   100000 |   32 | 0.008508 | 0.008779 |   1536367 |     100 |     random order |
 69 | |  octosort |   100000 |   32 | 0.008792 | 0.008889 |   1800800 |     100 |     random order |
 70 | |           |          |      |          |          |           |         |                  |
 71 | |     qsort |   100000 |   32 | 0.002024 | 0.002225 |    815024 |     100 |  ascending order |
 72 | |  octosort |   100000 |   32 | 0.000328 | 0.000345 |    116524 |     100 |  ascending order |
 73 | |           |          |      |          |          |           |         |                  |
 74 | |     qsort |   100000 |   32 | 0.002831 | 0.003088 |    915020 |     100 |    ascending saw |
 75 | |  octosort |   100000 |   32 | 0.001537 | 0.001565 |    370372 |     100 |    ascending saw |
 76 | |           |          |      |          |          |           |         |                  |
 77 | |     qsort |   100000 |   32 | 0.006426 | 0.006722 |   1531997 |     100 |    generic order |
 78 | |  octosort |   100000 |   32 | 0.006437 | 0.006515 |   1633855 |     100 |    generic order |
 79 | |           |          |      |          |          |           |         |                  |
 80 | |     qsort |   100000 |   32 | 0.002456 | 0.002657 |    853904 |     100 | descending order |
 81 | |  octosort |   100000 |   32 | 0.000221 | 0.000227 |     99999 |     100 | descending order |
 82 | |           |          |      |          |          |           |         |                  |
 83 | |     qsort |   100000 |   32 | 0.002832 | 0.003001 |   1063907 |     100 |   descending saw |
 84 | |  octosort |   100000 |   32 | 0.001738 | 0.001849 |    693171 |     100 |   descending saw |
 85 | |           |          |      |          |          |           |         |                  |
 86 | |     qsort |   100000 |   32 | 0.003744 | 0.003939 |   1012256 |     100 |      random tail |
 87 | |  octosort |   100000 |   32 | 0.002684 | 0.002740 |    630603 |     100 |      random tail |
 88 | |           |          |      |          |          |           |         |                  |
 89 | |     qsort |   100000 |   32 | 0.005464 | 0.005732 |   1200738 |     100 |      random half |
 90 | |  octosort |   100000 |   32 | 0.004859 | 0.004911 |   1022394 |     100 |      random half |
 91 | |           |          |      |          |          |           |         |                  |
 92 | |     qsort |   100000 |   32 | 0.004147 | 0.004685 |   1209200 |     100 |  ascending tiles |
 93 | |  octosort |   100000 |   32 | 0.003146 | 0.003437 |    790377 |     100 |  ascending tiles |
 94 | 
 95 | </details>
 96 | 
 97 | 
 98 | The following benchmark was generated using 1000000 0 0 as the argument.
 99 | 
100 | ![Graph](/images/graph2.png)
101 | 
102 | <details><summary><b>data table</b></summary>
103 | 
104 | |      Name |    Items | Type |     Best |  Average |  Compares | Samples |     Distribution |
105 | | --------- | -------- | ---- | -------- | -------- | --------- | ------- | ---------------- |
106 | |     qsort |        4 |   32 | 0.001369 | 0.001439 |         5 |     100 |         random 4 |
107 | |  octosort |        4 |   32 | 0.000765 | 0.000776 |         6 |     100 |         random 4 |
108 | |           |          |      |          |          |           |         |                  |
109 | |     qsort |        8 |   32 | 0.001511 | 0.001555 |        17 |     100 |         random 8 |
110 | |  octosort |        8 |   32 | 0.000893 | 0.000939 |        19 |     100 |         random 8 |
111 | |           |          |      |          |          |           |         |                  |
112 | |     qsort |       16 |   32 | 0.001587 | 0.001952 |        46 |     100 |        random 16 |
113 | |  octosort |       16 |   32 | 0.001221 | 0.001281 |        55 |     100 |        random 16 |
114 | |           |          |      |          |          |           |         |                  |
115 | |     qsort |       32 |   32 | 0.001795 | 0.002612 |       121 |     100 |        random 32 |
116 | |  octosort |       32 |   32 | 0.001319 | 0.001602 |       124 |     100 |        random 32 |
117 | |           |          |      |          |          |           |         |                  |
118 | |     qsort |       64 |   32 | 0.002037 | 0.003018 |       309 |     100 |        random 64 |
119 | |  octosort |       64 |   32 | 0.001492 | 0.002195 |       319 |     100 |        random 64 |
120 | |           |          |      |          |          |           |         |                  |
121 | |     qsort |      128 |   32 | 0.002304 | 0.003754 |       745 |     100 |       random 128 |
122 | |  octosort |      128 |   32 | 0.001674 | 0.003189 |       775 |     100 |       random 128 |
123 | |           |          |      |          |          |           |         |                  |
124 | |     qsort |      256 |   32 | 0.003293 | 0.005024 |      1738 |     100 |       random 256 |
125 | |  octosort |      256 |   32 | 0.001909 | 0.003613 |      1806 |     100 |       random 256 |
126 | |           |          |      |          |          |           |         |                  |
127 | |     qsort |      512 |   32 | 0.005293 | 0.006220 |      3968 |     100 |       random 512 |
128 | |  octosort |      512 |   32 | 0.003113 | 0.005086 |      4112 |     100 |       random 512 |
129 | |           |          |      |          |          |           |         |                  |
130 | |     qsort |     1024 |   32 | 0.006530 | 0.007128 |      8962 |     100 |      random 1024 |
131 | |  octosort |     1024 |   32 | 0.005290 | 0.006494 |     10031 |     100 |      random 1024 |
132 | |           |          |      |          |          |           |         |                  |
133 | |     qsort |     2048 |   32 | 0.007341 | 0.007810 |     19962 |     100 |      random 2048 |
134 | |  octosort |     2048 |   32 | 0.006943 | 0.007444 |     22885 |     100 |      random 2048 |
135 | |           |          |      |          |          |           |         |                  |
136 | |     qsort |     4096 |   32 | 0.008086 | 0.008499 |     43966 |     100 |      random 4096 |
137 | |  octosort |     4096 |   32 | 0.008295 | 0.008441 |     51035 |     100 |      random 4096 |
138 | |           |          |      |          |          |           |         |                  |
139 | |     qsort |     8192 |   32 | 0.008740 | 0.009142 |     96149 |     100 |      random 8192 |
140 | |  octosort |     8192 |   32 | 0.009122 | 0.009198 |    112238 |     100 |      random 8192 |
141 | |           |          |      |          |          |           |         |                  |
142 | |     qsort |    16384 |   32 | 0.009405 | 0.009830 |    208702 |     100 |     random 16384 |
143 | |  octosort |    16384 |   32 | 0.009827 | 0.009949 |    244511 |     100 |     random 16384 |
144 | |           |          |      |          |          |           |         |                  |
145 | |     qsort |    32768 |   32 | 0.010039 | 0.010421 |    450105 |     100 |     random 32768 |
146 | |  octosort |    32768 |   32 | 0.010525 | 0.010680 |    529041 |     100 |     random 32768 |
147 | |           |          |      |          |          |           |         |                  |
148 | |     qsort |    65536 |   32 | 0.010708 | 0.011123 |    965773 |     100 |     random 65536 |
149 | |  octosort |    65536 |   32 | 0.011250 | 0.011431 |   1138363 |     100 |     random 65536 |
150 | |           |          |      |          |          |           |         |                  |
151 | |     qsort |   131072 |   32 | 0.011316 | 0.011698 |   2062601 |     100 |    random 131072 |
152 | |  octosort |   131072 |   32 | 0.011982 | 0.012159 |   2437514 |     100 |    random 131072 |
153 | 
154 | </details>
155 | 


--------------------------------------------------------------------------------
/bench.c:
--------------------------------------------------------------------------------
  1 | /*
  2 | 	Copyright (C) 2014-2021 Igor van den Hoven ivdhoven@gmail.com
  3 | */
  4 | 
  5 | /*
  6 | 	Permission is hereby granted, free of charge, to any person obtaining
  7 | 	a copy of this software and associated documentation files (the
  8 | 	"Software"), to deal in the Software without restriction, including
  9 | 	without limitation the rights to use, copy, modify, merge, publish,
 10 | 	distribute, sublicense, and/or sell copies of the Software, and to
 11 | 	permit persons to whom the Software is furnished to do so, subject to
 12 | 	the following conditions:
 13 | 
 14 | 	The above copyright notice and this permission notice shall be
 15 | 	included in all copies or substantial portions of the Software.
 16 | 
 17 | 	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 18 | 	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 19 | 	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 20 | 	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 21 | 	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 22 | 	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 23 | 	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 24 | */
 25 | 
 26 | /*
 27 | 	To compile use:
 28 | 	
 29 | 	gcc -O3 bench.c
 30 | 	
 31 | 	or
 32 | 	
 33 | 	g++ -O3 -w -fpermissive bench.c
 34 | */
 35 | 
 36 | #include <stdlib.h>
 37 | #include <stdio.h>
 38 | #include <string.h>
 39 | #include <sys/time.h>
 40 | #include <time.h>
 41 | #include <errno.h>
 42 | #include <math.h>
 43 | 
 44 | #include "octosort.h"
 45 | 
 46 | //#define cmp(a,b) (*(a) > *(b))
 47 | 
 48 | //typedef int CMPFUNC (const void *a, const void *b);
 49 | 
 50 | typedef void SRTFUNC(void *array, size_t nmemb, size_t size, CMPFUNC *cmpf);
 51 | 
 52 | 
 53 | // Must prevent inlining so the benchmark is fair against qsort.
 54 | 
 55 | // Remove __attribute__ ((noinline)) and comparisons++ for full throttle.
 56 | 
 57 | size_t comparisons;
 58 | 
 59 | __attribute__ ((noinline)) int cmp_int(const void * a, const void * b)
 60 | {
 61 | 	const int fa = *(const int *) a;
 62 | 	const int fb = *(const int *) b;
 63 | 
 64 | 	comparisons++;
 65 | 
 66 | 	return fa - fb;
 67 | }
 68 | 
 69 | __attribute__ ((noinline)) int cmp_stable(const void * a, const void * b)
 70 | {
 71 | 	comparisons++;
 72 | 
 73 | 	return *(int *) a / 100000 - *(int *) b / 100000;
 74 | }
 75 | 
 76 | __attribute__ ((noinline)) int cmp_long(const void * a, const void * b)
 77 | {
 78 | 	comparisons++;
 79 | 
 80 | 	return (*(long long *) a > *(long long *) b) - (*(long long *) a < *(long long *) b);
 81 | 
 82 | //	return *(long long *) a > *(long long *) b;
 83 | }
 84 | 
 85 | __attribute__ ((noinline)) int cmp_long_double(const void * a, const void * b)
 86 | {
 87 | 	const long double fa = *(const long double *) a;
 88 | 	const long double fb = *(const long double *) b;
 89 | 
 90 | 	comparisons++;
 91 | 
 92 | 	if (isnan(fa) || isnan(fb))
 93 | 	{
 94 | 		return isnan(fa) - isnan(fb);
 95 | 	}
 96 | 	return ((fa > fb) - (fa < fb));
 97 | }
 98 | 
 99 | 
/*
	Comparator for arrays of C strings: a and b each point to a char
	pointer, and the strings they reference are ordered with strcmp().
*/
int cmp_str(const void * a, const void * b)
{
	const char *lhs = *(const char * const *) a;
	const char *rhs = *(const char * const *) b;

	return strcmp(lhs, rhs);
}
104 | 
/*
	Three-way comparator for float values, using the qsort() calling
	convention.

	a, b    pointers to the two float values to compare

	Returns a negative value when *a < *b, zero when they are equal and a
	positive value when *a > *b. A NaN compares greater than any number and
	equal to another NaN, matching cmp_long_double.
*/
int cmp_float(const void * a, const void * b)
{
	const float fa = *(const float *) a;
	const float fb = *(const float *) b;

	if (isnan(fa) || isnan(fb))
	{
		return (isnan(fa) != 0) - (isnan(fb) != 0);
	}

	// Returning fa - fb converted to int truncates toward zero, so values
	// less than 1.0 apart would compare as equal, and large differences
	// would not fit in an int. Compare explicitly instead.
	return (fa > fb) - (fa < fb);
}
109 | 
110 | 
/*
	Returns the time reported by gettimeofday() as a single count of
	microseconds (seconds * 1000000 + microseconds).
*/
long long utime()
{
	struct timeval tv;
	long long usec;

	gettimeofday(&tv, NULL);

	usec = tv.tv_sec * 1000000LL;
	usec += tv.tv_usec;

	return usec;
}
119 | 
/*
	Seeds the rand() generator. srand() takes an unsigned int, so the seed
	is narrowed to that type before being passed on.
*/
void seed_rand(unsigned long long seed)
{
	const unsigned int narrowed = (unsigned int) seed;

	srand(narrowed);
}
124 | 
125 | void test_sort(void *array, void *unsorted, void *valid, int minimum, int maximum, int samples, int repetitions, SRTFUNC *srt, const char *name, char *desc, size_t size, CMPFUNC *cmpf)
126 | {
127 | 	long long start, end, total, best, average;
128 | 	size_t rep, sam, max;
129 | 	long long *ptla = (long long *) array, *ptlv = valid;
130 | 	long double *ptda = (long double *) array, *ptdv = valid;
131 | 	int *pta = (int *) array, *ptv = (int *) valid, cnt;
132 | 
133 | 	if (*name == '*')
134 | 	{
135 | 		if (!strcmp(desc, "random order") || !strcmp(desc, "random 1-4") || !strcmp(desc, "random 4"))
136 | 		{
137 | 			if (comparisons)
138 | 			{
139 | 				printf("%s\n", "|      Name |    Items | Type |     Best |  Average |  Compares | Samples |     Distribution |");
140 | 				printf("%s\n", "| --------- | -------- | ---- | -------- | -------- | --------- | ------- | ---------------- |");
141 | 			}
142 | 			else
143 | 			{
144 | 				printf("%s\n", "|      Name |    Items | Type |     Best |  Average |     Loops | Samples |     Distribution |");
145 | 				printf("%s\n", "| --------- | -------- | ---- | -------- | -------- | --------- | ------- | ---------------- |");
146 | 			}
147 | 		}
148 | 		else
149 | 		{
150 | 				printf("%s\n", "|           |          |      |          |          |           |         |                  |");
151 | 		}
152 | 		return;
153 | 	}
154 | 
155 | 	best = average = 0;
156 | 
157 | 	if (minimum == 7 && maximum == 7)
158 | 	{
159 | 		printf("\e[1;32m%10d %10d %10d %10d %10d %10d %10d\e[0m\n", pta[0], pta[1], pta[2], pta[3], pta[4], pta[5], pta[6]);
160 | 	}
161 | 
162 | 	for (sam = 0 ; sam < samples ; sam++)
163 | 	{
164 | 		total = 0;
165 | 
166 | 		max = minimum;
167 | 
168 | 		if (repetitions > 1)
169 | 		{
170 | 			start = utime();
171 | 
172 | 			for (rep = 0 ; rep < repetitions ; rep++)
173 | 			{
174 | 				memcpy(array, unsorted, max * size);
175 | 
176 | 				comparisons = 0;
177 | 
178 | 				switch (*name)
179 | 				{
180 | 					case 'q':
181 | 						qsort(array, max, size, cmpf);
182 | 						break;
183 | 
184 | 					case 'o':
185 | 						octosort(array, max, size, cmpf);
186 | 						break;
187 | 				}
188 | 
189 | 				if (minimum < maximum && ++max > maximum)
190 | 				{
191 | 					max = minimum;
192 | 				}
193 | 			}
194 | 			end = utime();
195 | 		}
196 | 		else
197 | 		{
198 | 			memcpy(array, unsorted, max * size);
199 | 
200 | 			comparisons = 0;
201 | 
202 | 			start = utime();
203 | 
204 | 			switch (*name)
205 | 			{
206 | 				case 'q':
207 | 					qsort(array, max, size, cmpf);
208 | 					break;
209 | 				case 'o':
210 | 					octosort(array, max, size, cmpf);
211 | 					break;
212 | 			}
213 | 			end = utime();
214 | 		}
215 | 
216 | 		total = end - start;
217 | 
218 | 		if (!best || total < best)
219 | 		{
220 | 			best = total;
221 | 		}
222 | 		average += total;
223 | 	}
224 | 
225 | 	if (minimum == 7 && maximum == 7)
226 | 	{
227 | 		printf("\e[1;32m%10d %10d %10d %10d %10d %10d %10d\e[0m\n", pta[0], pta[1], pta[2], pta[3], pta[4], pta[5], pta[6]);
228 | 	}
229 | 
230 | 	if (repetitions == 0)
231 | 	{
232 | 		return;
233 | 	}
234 | 
235 | 	average /= samples;
236 | 
237 | 	if (cmpf == cmp_stable)
238 | 	{
239 | 		for (cnt = 1 ; cnt < maximum ; cnt++)
240 | 		{
241 | 			if (pta[cnt - 1] > pta[cnt])
242 | 			{
243 | 				sprintf(desc, "\e[1;31m%16s\e[0m", "unstable");
244 | 
245 | 				break;
246 | 			}
247 | 		}
248 | 	}
249 | 
250 | 	if (comparisons)
251 | 	{
252 | 		printf("|%10s | %8d | %4d | %f | %f | %9d | %7d | %16s |\n", name, maximum, (int) size * 8, best / 1000000.0, average / 1000000.0, (int) comparisons, samples, desc);
253 | 	}
254 | 	else
255 | 	{
256 | 		printf("|%10s | %8d | %4d | %f | %f | %9d | %7d | %16s |\n", name, maximum, (int) size * 8, best / 1000000.0, average / 1000000.0, repetitions, samples, desc);
257 | 	}
258 | 
259 | 	if (minimum != maximum || cmpf == cmp_stable)
260 | 	{
261 | 		return;
262 | 	}
263 | 
264 | 	for (cnt = 1 ; cnt < maximum ; cnt++)
265 | 	{
266 | 		if (size == sizeof(int))
267 | 		{
268 | 			if (pta[cnt - 1] > pta[cnt])
269 | 			{
270 | 				printf("%17s: not properly sorted at index %d. (%d vs %d\n", name, cnt, pta[cnt - 1], pta[cnt]);
271 | 				break;
272 | 			}
273 | 			if (pta[cnt - 1] == pta[cnt])
274 | 			{
275 | //				printf("%17s: Found a repeat value at index %d. (%d)\n", name, cnt, pta[cnt]);
276 | 			}
277 | 		}
278 | 		else if (size == sizeof(long long))
279 | 		{
280 | 			if (ptla[cnt - 1] > ptla[cnt])
281 | 			{
282 | 				printf("%17s: not properly sorted at index %d. (%lld vs %lld\n", name, cnt, ptla[cnt - 1], ptla[cnt]);
283 | 				break;
284 | 			}
285 | 		}
286 | 		else if (size == sizeof(long double))
287 | 		{
288 | 			if (cmp_long_double(&ptda[cnt - 1], &ptda[cnt]) > 0)
289 | 			{
290 | 				printf("%17s: not properly sorted at index %d. (%Lf vs %Lf\n", name, cnt, ptda[cnt - 1], ptda[cnt]);
291 | 				break;
292 | 			}
293 | 		}
294 | 	}
295 | 
296 | 	for (cnt = 1 ; cnt < maximum ; cnt++)
297 | 	{
298 | 		if (size == sizeof(int))
299 | 		{
300 | 			if (pta[cnt] != ptv[cnt])
301 | 			{
302 | 				printf("         validate: array[%d] != valid[%d]. (%d vs %d\n", cnt, cnt, pta[cnt], ptv[cnt]);
303 | 				break;
304 | 			}
305 | 		}
306 | 		else if (size == sizeof(long long))
307 | 		{
308 | 			if (ptla[cnt] != ptlv[cnt])
309 | 			{
310 | 				printf("         validate: array[%d] != valid[%d]. (%lld vs %lld\n", cnt, cnt, ptla[cnt], ptlv[cnt]);
311 | 				break;
312 | 			}
313 | 		}
314 | 		else if (size == sizeof(long double))
315 | 		{
316 | 			if (ptda[cnt] != ptdv[cnt])
317 | 			{
318 | 				printf("         validate: array[%d] != valid[%d]. (%Lf vs %Lf\n", cnt, cnt, ptda[cnt], ptdv[cnt]);
319 | 				break;
320 | 			}
321 | 		}
322 | 	}
323 | }
324 | 
325 | void validate()
326 | {
327 | 	int seed = time(NULL);
328 | 	int cnt, val, max = 2000000;
329 | 
330 | 	int *a_array, *r_array, *v_array;
331 | 
332 | 	seed_rand(seed);
333 | 
334 | 	a_array = (int *) malloc(max * sizeof(int));
335 | 	r_array = (int *) malloc(max * sizeof(int));
336 | 	v_array = (int *) malloc(max * sizeof(int));
337 | 
338 | 	for (cnt = 0 ; cnt < max ; cnt++)
339 | 	{
340 | 		r_array[cnt] = rand();
341 | 	}
342 | 
343 | 
344 | 	for (cnt = 1 ; cnt < 100 ; cnt++)
345 | 	{
346 | 		memcpy(a_array, r_array, max * sizeof(int));
347 | 		memcpy(v_array, r_array, max * sizeof(int));
348 | 
349 | 		octosort(a_array, cnt, sizeof(int), cmp_int);
350 | 		qsort(v_array, cnt, sizeof(int), cmp_int);
351 | 
352 | 		for (val = 0 ; val < cnt ; val++)
353 | 		{
354 | 			if (val && v_array[val - 1] > v_array[val])
355 | 			{
356 | 				printf("\e[1;31mvalidate rand: seed %d: size: %d Not properly sorted at index %d.\n", seed, cnt, val);
357 | 				return;
358 | 			}
359 | 
360 | 			if (a_array[val] != v_array[val])
361 | 			{
362 | 				printf("\e[1;31mvalidate rand: seed %d: size: %d Not verified at index %d.\n", seed, cnt, val);
363 | 				return;
364 | 			}
365 | 		}
366 | 	}
367 | 
368 | 	// ascending saw
369 | 
370 | 	for (cnt = 0 ; cnt < 1000 ; cnt++)
371 | 	{
372 | 		r_array[cnt] = rand();
373 | 	}
374 | 
375 |         octosort(r_array + max / 4 * 0, max / 4, sizeof(int), cmp_int);
376 |         octosort(r_array + max / 4 * 1, max / 4, sizeof(int), cmp_int);
377 |         octosort(r_array + max / 4 * 2, max / 4, sizeof(int), cmp_int);
378 |         octosort(r_array + max / 4 * 3, max / 4, sizeof(int), cmp_int);
379 | 
380 |         for (cnt = 1 ; cnt < 1000 ; cnt += 7)
381 | 	{
382 | 		memcpy(a_array, r_array, max * sizeof(int));
383 | 	        memcpy(v_array, r_array, max * sizeof(int));
384 | 
385 | 	        octosort(a_array, cnt, sizeof(int), cmp_int);
386 | 	        qsort(v_array, cnt, sizeof(int), cmp_int);
387 | 
388 | 		for (val = 0 ; val < cnt ; val++)
389 | 		{
390 | 			if (val && v_array[val - 1] > v_array[val])
391 | 			{
392 | 				printf("\e[1;31mvalidate ascending saw: seed %d: size: %d Not properly sorted at index %d.\n", seed, cnt, val);
393 | 				return;
394 | 			}
395 | 
396 | 			if (a_array[val] != v_array[val])
397 | 			{
398 | 				printf("\e[1;31mvalidate ascending saw: seed %d: size: %d Not verified at index %d.\n", seed, cnt, val);
399 | 				return;
400 | 			}
401 | 		}
402 | 	}
403 | 
404 |         // descending saw
405 | 
406 |         for (cnt = 0 ; cnt < 1000 ; cnt++)
407 |         {
408 |                 r_array[cnt] = (max - cnt - 1) % 100000;
409 |         }
410 | 
411 |         for (cnt = 1 ; cnt < 1000 ; cnt += 7)
412 | 	{
413 | 		memcpy(a_array, r_array, max * sizeof(int));
414 | 	        memcpy(v_array, r_array, max * sizeof(int));
415 | 
416 | 	        octosort(a_array, cnt, sizeof(int), cmp_int);
417 | 	        qsort(v_array, cnt, sizeof(int), cmp_int);
418 | 
419 | 		for (val = 0 ; val < cnt ; val++)
420 | 		{
421 | 			if (val && v_array[val - 1] > v_array[val])
422 | 			{
423 | 				printf("\e[1;31mvalidate descending saw: seed %d: size: %d Not properly sorted at index %d.\n", seed, cnt, val);
424 | 				return;
425 | 			}
426 | 
427 | 			if (a_array[val] != v_array[val])
428 | 			{
429 | 				printf("\e[1;31mvalidate descending saw: seed %d: size: %d Not verified at index %d.\n", seed, cnt, val);
430 | 				return;
431 | 			}
432 | 		}
433 | 	}
434 | 
435 | 	// random tail
436 | 
437 | 	for (cnt = 0 ; cnt < max * 3 / 4 ; cnt++)
438 | 	{
439 | 		r_array[cnt] = cnt;
440 | 	}
441 | 
442 | 	for (cnt = max * 3 / 4 ; cnt < max ; cnt++)
443 | 	{
444 | 		r_array[cnt] = rand();
445 | 	}
446 | 
447 |         for (cnt = 1 ; cnt < 1000 ; cnt += 7)
448 | 	{
449 | 		memcpy(a_array, r_array, max * sizeof(int));
450 | 	        memcpy(v_array, r_array, max * sizeof(int));
451 | 
452 | 	        octosort(a_array, cnt, sizeof(int), cmp_int);
453 | 	        qsort(v_array, cnt, sizeof(int), cmp_int);
454 | 
455 | 		for (val = 0 ; val < cnt ; val++)
456 | 		{
457 | 			if (val && v_array[val - 1] > v_array[val])
458 | 			{
459 | 				printf("\e[1;31mvalidate rand tail: seed %d: size: %d Not properly sorted at index %d.\n", seed, cnt, val);
460 | 				return;
461 | 			}
462 | 
463 | 			if (a_array[val] != v_array[val])
464 | 			{
465 | 				printf("\e[1;31mvalidate rand tail: seed %d: size: %d Not verified at index %d.\n", seed, cnt, val);
466 | 				return;
467 | 			}
468 | 		}
469 | 	}
470 | 
471 | 	free(a_array);
472 | 	free(r_array);
473 | 	free(v_array);
474 | }
475 | 
476 | 
477 | int main(int argc, char **argv)
478 | {
479 | 	int max = 100000;
480 | 	int samples = 10;
481 | 	int repetitions = 1;
482 | 	int seed = 0;
483 | 	int cnt, rnd, lst;
484 | 	int *a_array, *r_array, *v_array;
485 | 	long long *la_array, *lr_array, *lv_array;
486 | 	long double *da_array, *dr_array, *dv_array;
487 | 
488 | 	char dist[40], *sorts[] = { "*", "qsort", "octosort" };
489 | 
490 | 	if (argc >= 1 && argv[1] && *argv[1])
491 | 	{
492 | 		max = atoi(argv[1]);
493 | 	}
494 | 
495 | 	if (argc >= 2 && argv[2] && *argv[2])
496 | 	{
497 | 		samples = atoi(argv[2]);
498 | 	}
499 | 
500 | 	if (argc >= 3 && argv[3] && *argv[3])
501 | 	{
502 | 		repetitions = atoi(argv[3]);
503 | 	}
504 | 
505 | 	if (argc >= 4 && argv[4] && *argv[4])
506 | 	{
507 | 		seed = atoi(argv[4]);
508 | 	}
509 | 
510 | 	validate();
511 | 
512 | 	rnd = seed ? seed : time(NULL);
513 | 
514 | 	a_array = (int *) malloc(max * sizeof(int));
515 | 	r_array = (int *) malloc(max * sizeof(int));
516 | 	v_array = (int *) malloc(max * sizeof(int));
517 | 
518 | 	printf("Info: int = %lu, long long = %lu, long double = %lu\n\n", sizeof(int) * 8, sizeof(long long) * 8, sizeof(long double) * 8);
519 | 
520 | 	printf("Benchmark: array size: %d, samples: %d, repetitions: %d, seed: %d\n\n", max, samples, repetitions, rnd);
521 | 
522 | 	if (samples == 0 && repetitions == 0)
523 | 	{
524 | 		goto small_range_test;
525 | 	}
526 | 
527 | 	// 128 bit
528 | 
529 | 	da_array = (long double *) malloc(max * sizeof(long double));
530 | 	dr_array = (long double *) malloc(max * sizeof(long double));
531 | 	dv_array = (long double *) malloc(max * sizeof(long double));
532 | 
533 | 	if (da_array == NULL || dr_array == NULL || dv_array == NULL)
534 | 	{
535 | 		printf("main(%d,%d,%d): malloc: %s\n", max, samples, repetitions, strerror(errno));
536 | 
537 | 		return 0;
538 | 	}
539 | 
540 | 	seed_rand(rnd);
541 | 
542 | 	for (cnt = 0 ; cnt < max ; cnt++)
543 | 	{
544 | 		dr_array[cnt] = rand() + 1.0 / (long double) (rand() + (rand() << 30LL));
545 | 	}
546 | 
547 | 	memcpy(dv_array, dr_array, max * sizeof(long double));
548 | 	qsort(dv_array, max, sizeof(long double), cmp_long_double);
549 | 
550 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
551 | 	{
552 | 		test_sort(da_array, dr_array, dv_array, max, max, samples, repetitions, qsort, sorts[cnt], "random order", sizeof(long double), cmp_long_double);
553 | 	}
554 | 
555 | 	free(da_array);
556 | 	free(dr_array);
557 | 	free(dv_array);
558 | 
559 | 	printf("\n");
560 | 	
561 | 	// 64 bit
562 | 
563 | 	la_array = (long long *) malloc(max * sizeof(long long));
564 | 	lr_array = (long long *) malloc(max * sizeof(long long));
565 | 	lv_array = (long long *) malloc(max * sizeof(long long));
566 | 
567 | 	if (la_array == NULL || lr_array == NULL || lv_array == NULL)
568 | 	{
569 | 		printf("main(%d,%d,%d): malloc: %s\n", max, samples, repetitions, strerror(errno));
570 | 
571 | 		return 0;
572 | 	}
573 | 
574 | 	seed_rand(rnd);
575 | 
576 | 	for (cnt = 0 ; cnt < max ; cnt++)
577 | 	{
578 | 		lr_array[cnt] = rand();
579 | 		lr_array[cnt] += (unsigned long long) rand() << 32ULL;
580 | 	}
581 | 
582 | 	memcpy(lv_array, lr_array, max * sizeof(long long));
583 | 	qsort(lv_array, max, sizeof(long long), cmp_long);
584 | 
585 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
586 | 	{
587 | 		test_sort(la_array, lr_array, lv_array, max, max, samples, repetitions, qsort, sorts[cnt], "random order", sizeof(long long), cmp_long);
588 | 	}
589 | 
590 | 	printf("\n");
591 | 
592 | 	free(la_array);
593 | 	free(lr_array);
594 | 	free(lv_array);
595 | 
596 | 	// 32 bit
597 | 
598 | 	// random
599 | 
600 | 	seed_rand(rnd);
601 | 
602 | 	for (cnt = 0 ; cnt < max ; cnt++)
603 | 	{
604 | 		r_array[cnt] = rand();
605 | 	}
606 | 
607 | 	memcpy(v_array, r_array, max * sizeof(int));
608 | 	qsort(v_array, max, sizeof(int), cmp_int);
609 | 
610 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
611 | 	{
612 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "random order", sizeof(int), cmp_int);
613 | 	}
614 | 
615 | 	// ascending
616 | 
617 | 	for (cnt = 0 ; cnt < max ; cnt++)
618 | 	{
619 | 		r_array[cnt] = cnt;
620 | 	}
621 | 
622 |         memcpy(v_array, r_array, max * sizeof(int));
623 |         memcpy(r_array, v_array, max * sizeof(int));
624 | 
625 | 	qsort(v_array, max, sizeof(int), cmp_int);
626 | 
627 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
628 | 	{
629 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "ascending order", sizeof(int), cmp_int);
630 | 	}
631 | 
632 | 	// ascending saw
633 | 
634 | 	for (cnt = 0 ; cnt < max ; cnt++)
635 | 	{
636 | 		r_array[cnt] = rand();
637 | 	}
638 | 
639 |         memcpy(v_array, r_array, max * sizeof(int));
640 |         qsort(v_array + max / 4 * 0, max / 4, sizeof(int), cmp_int);
641 |         qsort(v_array + max / 4 * 1, max / 4, sizeof(int), cmp_int);
642 |         qsort(v_array + max / 4 * 2, max / 4, sizeof(int), cmp_int);
643 |         qsort(v_array + max / 4 * 3, max / 4, sizeof(int), cmp_int);
644 |         memcpy(r_array, v_array, max * sizeof(int));
645 | 
646 | 	qsort(v_array, max, sizeof(int), cmp_int);
647 | 
648 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
649 | 	{
650 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "ascending saw", sizeof(int), cmp_int);
651 | 	}
652 | 
653 | 	// generic
654 | 
655 | 	for (cnt = 0 ; cnt < max ; cnt++)
656 | 	{
657 | 		r_array[cnt] = rand() % 100;
658 | 	}
659 | 
660 | 	memcpy(v_array, r_array, max * sizeof(int));
661 | 	qsort(v_array, max, sizeof(int), cmp_int);
662 | 
663 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
664 | 	{
665 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "generic order", sizeof(int), cmp_int);
666 | 	}
667 | 
668 |         // descending
669 | 
670 |         for (cnt = 0 ; cnt < max ; cnt++)
671 |         {
672 |                 r_array[cnt] = (max - cnt);
673 |         }
674 | 
675 |         memcpy(v_array, r_array, max * sizeof(int));
676 |         qsort(v_array, max, sizeof(int), cmp_int);
677 | 
678 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
679 | 	{
680 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "descending order", sizeof(int), cmp_int);
681 | 	}
682 | 
683 |         // descending saw
684 | 
685 |         for (cnt = 0 ; cnt < max ; cnt++)
686 |         {
687 |                 r_array[cnt] = (max - cnt - 1) % 10000;
688 |         }
689 | 
690 |         memcpy(v_array, r_array, max * sizeof(int));
691 |         qsort(v_array, max, sizeof(int), cmp_int);
692 | 
693 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
694 | 	{
695 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "descending saw", sizeof(int), cmp_int);
696 | 	}
697 | 
698 | 	// random tail
699 | 
700 | 	seed_rand(rnd);
701 | 
702 | 	for (cnt = 0 ; cnt < max ; cnt++)
703 | 	{
704 | 		r_array[cnt] = rand();
705 | 	}
706 | 
707 | 	memcpy(v_array, r_array, max * sizeof(int));
708 | 	qsort(v_array, max * 3 / 4, sizeof(int), cmp_int);
709 | 	memcpy(r_array, v_array, max * sizeof(int));
710 | 	qsort(v_array, max, sizeof(int), cmp_int);
711 | 
712 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
713 | 	{
714 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "random tail", sizeof(int), cmp_int);
715 | 	}
716 | 
717 | 	seed_rand(rnd);
718 | 
719 | 	for (cnt = 0 ; cnt < max ; cnt++)
720 | 	{
721 | 		r_array[cnt] = rand();
722 | 	}
723 | 
724 |         memcpy(v_array, r_array, max * sizeof(int));
725 | 	qsort(v_array, max / 2, sizeof(int), cmp_int);
726 | 
727 | 	memcpy(r_array, v_array, max * sizeof(int));
728 | 	qsort(v_array, max, sizeof(int), cmp_int);
729 | 
730 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
731 | 	{
732 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], "random half", sizeof(int), cmp_int);
733 | 	}
734 | 
735 | 	// tiles
736 | 
737 | 	for (cnt = 0 ; cnt < max ; cnt++)
738 | 	{
739 | 		if (cnt % 2 == 0)
740 | 		{
741 | 			r_array[cnt] = 16777216 + cnt;
742 | 		}
743 | 		else
744 | 		{
745 | 			r_array[cnt] = 33554432 + cnt;
746 | 		}
747 | 	}
748 | 
749 | 	memcpy(v_array, r_array, max * sizeof(int));
750 | 	qsort(v_array, max, sizeof(int), cmp_int);
751 | 
752 | 	for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
753 | 	{
754 | 		strcpy(dist, "ascending tiles");
755 | 
756 | 		test_sort(a_array, r_array, v_array, max, max, samples, repetitions, qsort, sorts[cnt], dist, sizeof(int), cmp_stable);
757 | 	}
758 | 
759 | 	if (repetitions > 0)
760 | 	{
761 | 		goto end;
762 | 	}
763 | 
764 | 	small_range_test:
765 | 
766 | 	if (max >= 8192)
767 | 	{
768 | 		goto large_range_test;
769 | 	}
770 | 
771 | 	for (lst = 1, samples = 32768, repetitions = 4 ; repetitions <= 4096 ; repetitions *= 2, samples /= 2)
772 | 	{
773 | 		if (max >= repetitions)
774 | 		{
775 | 			sprintf(dist, "random %d-%d", lst, repetitions);
776 | 
777 | 			srand(rnd);
778 | 
779 | 			for (cnt = 0 ; cnt < repetitions ; cnt++)
780 | 			{
781 | 				r_array[cnt] = rand();
782 | 			}
783 | 
784 | 			memcpy(v_array, r_array, repetitions * sizeof(int));
785 | 			qsort(v_array, repetitions, sizeof(int), cmp_int);
786 | 
787 | 			for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
788 | 			{
789 | 				test_sort(a_array, r_array, v_array, lst, repetitions, 100, samples, qsort,           sorts[cnt],             dist, sizeof(int), cmp_int);
790 | 			}
791 | 			lst = repetitions + 1;
792 | 		}
793 | 	}
794 | 
795 | 	goto end;
796 | 
797 | 	large_range_test:
798 | 
799 | 	for (samples = 32768, repetitions = 4 ; samples > 0 ; repetitions *= 2, samples /= 2)
800 | 	{
801 | 		if (max >= repetitions)
802 | 		{
803 | 			srand(rnd);
804 | 
805 | 			for (cnt = 0 ; cnt < repetitions ; cnt++)
806 | 			{
807 | 				r_array[cnt] = rand();
808 | 			}
809 | 
810 | 			memcpy(v_array, r_array, repetitions * sizeof(int));
811 | 			qsort(v_array, repetitions, sizeof(int), cmp_int);
812 | 
813 | 			sprintf(dist, "random %d", repetitions);
814 | 
815 | 			for (cnt = 0 ; cnt < sizeof(sorts) / sizeof(char *) ; cnt++)
816 | 			{
817 | 				test_sort(a_array, r_array, v_array, repetitions, repetitions, 100, samples, qsort,           sorts[cnt],             dist, sizeof(int), cmp_int);
818 | 			}
819 | 		}
820 | 	}
821 | 
822 | 	end:
823 | 
824 | 
825 | 	free(a_array);
826 | 	free(r_array);
827 | 	free(v_array);
828 | 
829 | 	return 0;
830 | }
831 | 


--------------------------------------------------------------------------------
/images/graph1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scandum/octosort/73605cdbdfec66e7112c6a3a3830748fcd3bb665/images/graph1.png


--------------------------------------------------------------------------------
/images/graph2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scandum/octosort/73605cdbdfec66e7112c6a3a3830748fcd3bb665/images/graph2.png


--------------------------------------------------------------------------------
/octosort.c:
--------------------------------------------------------------------------------
   1 | /*
   2 | 	Copyright (C) 2014-2021 Igor van den Hoven ivdhoven@gmail.com
   3 | */
   4 | 
   5 | /*
   6 | 	Permission is hereby granted, free of charge, to any person obtaining
   7 | 	a copy of this software and associated documentation files (the
   8 | 	"Software"), to deal in the Software without restriction, including
   9 | 	without limitation the rights to use, copy, modify, merge, publish,
  10 | 	distribute, sublicense, and/or sell copies of the Software, and to
  11 | 	permit persons to whom the Software is furnished to do so, subject to
  12 | 	the following conditions:
  13 | 
  14 | 	The above copyright notice and this permission notice shall be
  15 | 	included in all copies or substantial portions of the Software.
  16 | 
  17 | 	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  18 | 	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  19 | 	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  20 | 	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  21 | 	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  22 | 	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  23 | 	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  24 | */
  25 | 
  26 | /*
  27 | 	octosort 1.0
  28 | */
  29 | 
  30 | /*
  31 | 	octosort is based on WikiSort and quadsort
  32 | 
  33 | 	WikiSort: https://github.com/BonzaiThePenguin/WikiSort
  34 | 	quadsort: https://github.com/scandum/quadsort
  35 | 	searches: https://github.com/scandum/binary_search
  36 | */
  37 | 
  38 | // binary insertion sort for up to 8 elements
  39 | 
  40 | void FUNC(octo_tail_insert)(VAR *array, VAR *pta, CMPFUNC *cmp)
  41 | {
  42 | 	VAR *pte, key;
  43 | 
  44 | 	pte = pta--;
  45 | 
  46 | 	if (cmp(pta, pte) > 0)
  47 | 	{
  48 | 		key = *pte;
  49 | 
  50 | 		if (cmp(pta - 3, &key) > 0)
  51 | 		{
  52 | 			*pte-- = *pta--; *pte-- = *pta--; *pte-- = *pta--; *pte-- = *pta--;
  53 | 		}
  54 | 		
  55 | 		if (pta >= array + 1 && cmp(pta - 1, &key) > 0)
  56 | 		{
  57 | 			*pte-- = *pta--; *pte-- = *pta--;
  58 | 		}
  59 | 
  60 | 		if (pta >= array && cmp(pta, &key) > 0)
  61 | 		{
  62 | 			*pte-- = *pta;
  63 | 		}
  64 | 		*pte = key;
  65 | 	}
  66 | }
  67 | 
// sort arrays of length 4 to 8 with reverse order run detection

// Sorts the nmemb elements starting at array[start] (the switch below handles
// nmemb values 4 through 8).
//
// ptz is either NULL or the first element of a pending run that ends right
// before this block and still has to be reversed.
//
// Returns a run start (ptz when one is pending, otherwise the start of this
// block) when the whole block is strictly descending according to cmp and, if a
// run is pending, continues it; in that case the block is left untouched.
// Otherwise the block is sorted in place, a pending run [ptz, block start) is
// reversed, and NULL is returned.

VAR *FUNC(octo_swap)(VAR array[], VAR *ptz, size_t start, size_t nmemb, CMPFUNC *cmp)
{
	VAR *pta, swap;
	size_t i;

	pta = array + start;

	// stage 1: order the pairs (0,1) and (2,3); four strictly descending
	// elements are handed to the run detection at Swapper instead

	if (cmp(&pta[0], &pta[1]) > 0)
	{
		if (cmp(&pta[2], &pta[3]) > 0)
		{
			if (cmp(&pta[1], &pta[2]) > 0)
			{
				goto Swapper;
			}
			swap = pta[2]; pta[2] = pta[3]; pta[3] = swap;
		}
		swap = pta[0]; pta[0] = pta[1]; pta[1] = swap;
	}
	else if (cmp(&pta[2], &pta[3]) > 0)
	{
		swap = pta[2]; pta[2] = pta[3]; pta[3] = swap;
	}

	// stage 2: merge the two ordered pairs; nothing to do when pta[1] <= pta[2]

	if (cmp(&pta[1], &pta[2]) > 0)
	{
		if (cmp(&pta[0], &pta[2]) <= 0)
		{
			if (cmp(&pta[1], &pta[3]) <= 0)
			{
				swap = pta[1]; pta[1] = pta[2]; pta[2] = swap;
			}
			else
			{
				swap = pta[1]; pta[1] = pta[2]; pta[2] = pta[3]; pta[3] = swap;
			}
		}
		else if (cmp(&pta[0], &pta[3]) > 0)
		{
			swap = pta[1]; pta[1] = pta[3]; pta[3] = swap;
			swap = pta[0]; pta[0] = pta[2]; pta[2] = swap;
		}
		else if (cmp(&pta[1], &pta[3]) <= 0)
		{
			swap = pta[1]; pta[1] = pta[0]; pta[0] = pta[2]; pta[2] = swap;
		}
		else
		{
			swap = pta[1]; pta[1] = pta[0]; pta[0] = pta[2]; pta[2] = pta[3]; pta[3] = swap;
		}
	}

	// insert the remaining elements one by one into the sorted first four

	for (i = 4 ; i < nmemb ; i++)
	{
		FUNC(octo_tail_insert)(pta, &pta[i], cmp);
	}

	// this block is not part of a descending run, so a pending run ends here:
	// reverse [ptz, pta) by swapping from both ends towards the middle

	if (ptz)
	{
		do
		{
			swap = *ptz;
			*ptz++ = *--pta;
			*pta = swap;
		}
		while (ptz < pta);
	}
	return NULL;

	Swapper:

	// the first four elements are strictly descending; the block can only extend
	// or start a run when no run is pending or the previous element is larger
	// than pta[0]

	if (ptz == NULL || cmp(&pta[-1], &pta[0]) > 0)
	{
		// each case checks one more adjacent pair and deliberately falls through;
		// a break means the block is not descending all the way to its end
		switch (nmemb)
		{
			case 8:
				if (cmp(&pta[6], &pta[7]) <= 0)
				{
					break;
				}
			case 7:
				if (cmp(&pta[5], &pta[6]) <= 0)
				{
					break;
				}
			case 6:
				if (cmp(&pta[4], &pta[5]) <= 0)
				{
					break;
				}
			case 5:
				if (cmp(&pta[3], &pta[4]) <= 0)
				{
					break;
				}
			case 4:
				// whole block is descending: report the run start and leave the data untouched
				return ptz ? ptz : pta;
		}
	}

	// not a full run: reverse the four descending elements so they are ascending

	swap = pta[0]; pta[0] = pta[3]; pta[3] = swap;
	swap = pta[1]; pta[1] = pta[2]; pta[2] = swap;

	for (i = 4 ; i < nmemb ; i++)
	{
		FUNC(octo_tail_insert)(pta, &pta[i], cmp);
	}

	// same pending-run reversal as above

	if (ptz)
	{
		do
		{
			swap = *ptz;
			*ptz++ = *--pta;
			*pta = swap;
		}
		while (ptz < pta);
	}
	return NULL;
}
 190 | 
 191 | // find the index of the first value within the range that is equal to array[index]
 192 | 
 193 | size_t FUNC(monobound_binary_first)(const VAR array[], const VAR value, const Range range, CMPFUNC *cmp)
 194 | {
 195 | 	size_t top, mid, end = range.end;
 196 | 
 197 | 	if (range.start >= end)
 198 | 	{
 199 | 		return range.start;
 200 | 	}
 201 | 
 202 | 	top = end - range.start;
 203 | 
 204 | 	while (top > 1)
 205 | 	{
 206 | 		mid = top / 2;
 207 | 
 208 | 		if (cmp(&value, &array[end - mid]) <= 0)
 209 | 		{
 210 | 			end -= mid;
 211 | 		}
 212 | 		top -= mid;
 213 | 	}
 214 | 
 215 | 	if (cmp(&value, &array[end-1]) <= 0)
 216 | 	{
 217 | 		return --end;
 218 | 	}
 219 | 	return end;
 220 | }
 221 | 
 222 | // find the index of the last value within the range that is equal to array[index], plus 1
 223 | 
 224 | size_t FUNC(monobound_binary_last)(const VAR array[], const VAR value, const Range range, CMPFUNC *cmp)
 225 | {
 226 | 	size_t top, mid, start = range.start;
 227 | 
 228 | 	if (start >= range.end)
 229 | 	{
 230 | 		return start;
 231 | 	}
 232 | 
 233 | 	top = range.end - start;
 234 | 
 235 | 	while (top > 1)
 236 | 	{
 237 | 		mid = top / 2;
 238 | 
 239 | 		if (cmp(&array[start + mid], &value) <= 0)
 240 | 		{
 241 | 			start += mid;
 242 | 		}
 243 | 		top -= mid;
 244 | 	}
 245 | 
 246 | 	if (cmp(&array[start], &value) <= 0)
 247 | 	{
 248 | 		return ++start;
 249 | 	}
 250 | 	return start;
 251 | }
 252 | 
 253 | // combine a linear search with a binary search to reduce the number of comparisons in situations
 254 | // where have some idea as to how many unique values there are and where the next value might be
 255 | 
 256 | size_t FUNC(FindFirstForward)(const VAR array[], const VAR value, const Range range, CMPFUNC *cmp, const size_t unique)
 257 | {
 258 | 	size_t skip, index;
 259 | 
 260 | 	skip = Max(range_length(range) / unique, 1);
 261 | 
 262 | 	for (index = range.start + skip ; cmp(&value, &array[index - 1]) > 0 ; index += skip)
 263 | 	{
 264 | 		if (index >= range.end - skip)
 265 | 		{
 266 | 			return FUNC(monobound_binary_first)(array, value, new_range(index, range.end), cmp);
 267 | 		}
 268 | 	}
 269 | 	return FUNC(monobound_binary_first)(array, value, new_range(index - skip, index), cmp);
 270 | }
 271 | 
 272 | size_t FUNC(FindLastForward)(const VAR array[], const VAR value, const Range range, CMPFUNC *cmp, const size_t unique)
 273 | {
 274 | 	size_t skip, index;
 275 | 
 276 | 	if (range_length(range) == 0)
 277 | 		return range.start;
 278 | 
 279 | 	skip = Max(range_length(range)/unique, 1);
 280 | 
 281 | 	for (index = range.start + skip; cmp(&array[index - 1], &value) <= 0 ; index += skip)
 282 | 	{
 283 | 		if (index >= range.end - skip)
 284 | 		{
 285 | 			return FUNC(monobound_binary_last)(array, value, new_range(index, range.end), cmp);
 286 | 		}
 287 | 	}
 288 | 	return FUNC(monobound_binary_last)(array, value, new_range(index - skip, index), cmp);
 289 | }
 290 | 
 291 | size_t FUNC(FindFirstBackward)(const VAR array[], const VAR value, const Range range, CMPFUNC *cmp, const size_t unique)
 292 | {
 293 | 	size_t skip, index;
 294 | 
 295 | 	if (range_length(range) == 0)
 296 | 		return range.start;
 297 | 
 298 | 	skip = Max(range_length(range)/unique, 1);
 299 | 
 300 | 	for (index = range.end - skip; index > range.start && cmp(&value, &array[index - 1]) <= 0 ; index -= skip)
 301 | 	{
 302 | 		if (index < range.start + skip)
 303 | 		{
 304 | 			return FUNC(monobound_binary_first)(array, value, new_range(range.start, index), cmp);
 305 | 		}
 306 | 	}
 307 | 	return FUNC(monobound_binary_first)(array, value, new_range(index, index + skip), cmp);
 308 | }
 309 | 
 310 | size_t FUNC(FindLastBackward)(const VAR array[], const VAR value, const Range range, CMPFUNC *cmp, const size_t unique)
 311 | {
 312 | 	size_t skip, index;
 313 | 
 314 | 	if (range_length(range) == 0)
 315 | 		return range.start;
 316 | 
 317 | 	skip = Max(range_length(range)/unique, 1);
 318 | 
 319 | 	for (index = range.end - skip; index > range.start && cmp(&array[index - 1], &value) > 0 ; index -= skip)
 320 | 	{
 321 | 		if (index < range.start + skip)
 322 | 		{
 323 | 			return FUNC(monobound_binary_last)(array, value, new_range(range.start, index), cmp);
 324 | 		}
 325 | 	}
 326 | 	return FUNC(monobound_binary_last)(array, value, new_range(index, index + skip), cmp);
 327 | }
 328 | 
 329 | // monobound binary insertion sort
 330 | 
 331 | void FUNC(monobound_sort)(VAR array[], const Range range, CMPFUNC *cmp)
 332 | {
 333 | 	VAR *start, *pta, *end, key;
 334 | 	size_t i, mid, top, nmemb;
 335 | 
 336 | 	start = array + range.start;
 337 | 	nmemb = range.end - range.start;
 338 | 
 339 | 	for (i = 1 ; i < nmemb ; i++)
 340 | 	{
 341 | 		pta = end = start + i;
 342 | 
 343 | 		if (cmp(--pta, end) <= 0)
 344 | 		{
 345 | 			continue;
 346 | 		}
 347 | 		top = i;
 348 | 
 349 | 		while (top > 1)
 350 | 		{
 351 | 			mid = top / 2;
 352 | 
 353 | 			if (cmp(pta - mid, end) > 0)
 354 | 			{
 355 | 				pta -= mid;
 356 | 			}
 357 | 			top -= mid;
 358 | 		}
 359 | 
 360 | 		key = *end;
 361 | 
 362 | 		memmove(pta + 1, pta, (end - pta) * sizeof(VAR));
 363 | 
 364 | 		*pta = key;
 365 | 	}
 366 | }
 367 | 
 368 | // swap a series of values in the array
 369 | 
 370 | void FUNC(forward_block_swap)(VAR array[], const size_t start1, const size_t start2, size_t block_size)
 371 | {
 372 | 	VAR *pta, *ptb, swap;
 373 | 
 374 | 	pta = array + start1;
 375 | 	ptb = array + start2;
 376 | 
 377 | 	while (block_size--)
 378 | 	{
 379 | 		swap = *pta; *pta++ = *ptb; *ptb++ = swap;
 380 | 	}
 381 | }
 382 | 
 383 | void FUNC(backward_block_swap)(VAR array[], const size_t start1, const size_t start2, size_t block_size)
 384 | {
 385 | 	VAR *pta, *ptb, swap;
 386 | 
 387 | 	pta = array + start1 + block_size;
 388 | 	ptb = array + start2 + block_size;
 389 | 
 390 | 	while (block_size--)
 391 | 	{
 392 | 		swap = *--pta; *pta = *--ptb; *ptb = swap;
 393 | 	}
 394 | }
 395 | 
 396 | // rotate the values in an array ([0 1 2 3] becomes [1 2 3 0] if we rotate by 1)
 397 | // this assumes that 0 <= amount <= range.length()
 398 | 
 399 | void FUNC(Rotate)(VAR array[], const size_t amount, const Range range)
 400 | {
 401 | 	size_t start = range.start;
 402 | 	size_t left  = amount;
 403 | 	size_t right = range.end - range.start - amount;
 404 | 	size_t min   = left <= right ? left : right;
 405 | 
 406 | 	// Gries-Mills rotation
 407 | 
 408 | 	while (min > 1)
 409 | 	{
 410 | 		if (left <= right)
 411 | 		{
 412 | 			do
 413 | 			{
 414 | 				FUNC(forward_block_swap)(array, start, start + left, left);
 415 | 
 416 | 				start += left;
 417 | 				right -= left;
 418 | 			}
 419 | 			while (left <= right);
 420 | 
 421 | 			min = right;
 422 | 		}
 423 | 		else
 424 | 		{
 425 | 			do
 426 | 			{
 427 | 				FUNC(backward_block_swap)(array, start + left - right, start + left, right);
 428 | 
 429 | 				left -= right;
 430 | 			}
 431 | 			while (right <= left);
 432 | 
 433 | 			min = left;
 434 | 		}
 435 | 	}
 436 | 
 437 | 	if (min)
 438 | 	{
 439 | 		if (left <= right)
 440 | 		{
 441 | 			VAR swap = array[start];
 442 | 			memmove(&array[start], &array[start + 1], (right) * sizeof(VAR));
 443 | 			array[start + right] = swap;
 444 | 		}
 445 | 		else
 446 | 		{
 447 | 			VAR swap = array[start + left];
 448 | 			memmove(&array[start + 1], &array[start], (left) * sizeof(VAR));
 449 | 			array[start] = swap;
 450 | 		}
 451 | 	}
 452 | }
 453 | 
 454 | // merge two ranges from one array into another array
 455 | 
 456 | void FUNC(forward_merge_into)(VAR *dest, VAR *from, size_t nmemb, size_t block, CMPFUNC *cmp)
 457 | {
 458 | 	VAR *l, *r, *m, *e; // left, right, middle, end
 459 | 
 460 | 	l = from;
 461 | 	r = from + block;
 462 | 	m = r;
 463 | 	e = l + nmemb;
 464 | 
 465 | 	while (1)
 466 | 	{
 467 | 		if (cmp(l, r) <= 0)
 468 | 		{
 469 | 			*dest++ = *l++;
 470 | 
 471 | 			if (l == m)
 472 | 			{
 473 | 				do *dest++ = *r++; while (r < e);
 474 | 
 475 | 				return;
 476 | 			}
 477 | 		}
 478 | 		else
 479 | 		{
 480 | 			*dest++ = *r++;
 481 | 
 482 | 			if (r == e)
 483 | 			{
 484 | 				do *dest++ = *l++; while (l < m);
 485 | 
 486 | 				return;
 487 | 			}
 488 | 		}
 489 | 	}
 490 | }
 491 | 
 492 | void FUNC(external_backward_merge)(VAR *array, VAR *swap, size_t nmemb, size_t block, CMPFUNC *cmp)
 493 | {
 494 | 	VAR *r, *m, *e, *s; // right, middle, end, swap
 495 | 
 496 | 	m = array + block;
 497 | 	e = array + nmemb - 1;
 498 | 	r = m--;
 499 | 
 500 | 	if (cmp(m, r) <= 0)
 501 | 	{
 502 | 		return;
 503 | 	}
 504 | 
 505 | 	while (cmp(m, e) <= 0)
 506 | 	{
 507 | 		e--;
 508 | 	}
 509 | 
 510 | 	s = swap;
 511 | 
 512 | 	do *s++ = *r++; while (r <= e);
 513 | 
 514 | 	s--;
 515 | 
 516 | 	*e-- = *m--;
 517 | 
 518 | 	if (cmp(array, swap) <= 0)
 519 | 	{
 520 | 		while (1)
 521 | 		{
 522 | 			if (cmp(m, s) > 0)
 523 | 			{
 524 | 				*e-- = *m--;
 525 | 			}
 526 | 			else
 527 | 			{
 528 | 				*e-- = *s--;
 529 | 
 530 | 				if (s < swap)
 531 | 				{
 532 | 					return;
 533 | 				}
 534 | 			}
 535 | 		}
 536 | 	}
 537 | 	else
 538 | 	{
 539 | 		while (1)
 540 | 		{
 541 | 			if (cmp(m, s) > 0)
 542 | 			{
 543 | 				*e-- = *m--;
 544 | 
 545 | 				if (m < array)
 546 | 				{
 547 | 					do *e-- = *s--; while (s >= swap);
 548 | 
 549 | 					return;
 550 | 				}
 551 | 			}
 552 | 			else
 553 | 			{
 554 | 				*e-- = *s--;
 555 | 			}
 556 | 		}
 557 | 	}
 558 | }
 559 | 
 560 | // merge operation using an external buffer
 561 | 
 562 | void FUNC(MergeExternal)(VAR array[], const Range A, const Range B, CMPFUNC *cmp, VAR *cache)
 563 | {
 564 | 	VAR *A_index = &cache[0];
 565 | 	VAR *B_index = &array[B.start];
 566 | 	VAR *insert_index = &array[A.start];
 567 | 	VAR *A_last = &cache[range_length(A)];
 568 | 	VAR *B_last = &array[B.end];
 569 | 
 570 | 	if (range_length(B) > 0 && range_length(A) > 0)
 571 | 	{
 572 | 		while (1)
 573 | 		{
 574 | 			if (cmp(A_index, B_index) <= 0)
 575 | 			{
 576 | 				*insert_index++ = *A_index++;
 577 | 
 578 | 				if (A_index == A_last)
 579 | 					break;
 580 | 			}
 581 | 			else
 582 | 			{
 583 | 				*insert_index++ = *B_index++;
 584 | 
 585 | 				if (B_index == B_last)
 586 | 					break;
 587 | 			}
 588 | 		}
 589 | 	}
 590 | 	// copy the remainder of A into the final array
 591 | 
 592 | 	memcpy(insert_index, A_index, (A_last - A_index) * sizeof(VAR));
 593 | }
 594 | 
 595 | // merge operation using an internal buffer
 596 | 
 597 | // whenever we find a value to add to the final array, swap it with the value that's
 598 | // already in that spot when this algorithm is finished, the 'I' range will contain
 599 | // its original contents, but in a different order
 600 | 
 601 | void FUNC(MergeInternal)(VAR array[], const Range A, const Range B, CMPFUNC *cmp, const Range I)
 602 | {
 603 | 	VAR swap, *pta, *ptb, *pti;
 604 | 	size_t a = 0, b = 0, i = 0;
 605 | 	size_t length_A = range_length(A);
 606 | 	size_t length_B = range_length(B);
 607 | 
 608 | 	if (length_A > 0 && length_B > 0)
 609 | 	{
 610 | 		pta = array + A.start;
 611 | 		ptb = array + B.start;
 612 | 		pti = array + I.start;
 613 | 
 614 | 		while (1)
 615 | 		{
 616 | 			if (cmp(&pti[a], &ptb[b]) <= 0)
 617 | 			{
 618 | 				swap = pta[i]; pta[i++] = pti[a]; pti[a] = swap;
 619 | 
 620 | 				if (++a >= length_A)
 621 | 					break;
 622 | 			}
 623 | 			else
 624 | 			{
 625 | 				swap = pta[i]; pta[i++] = ptb[b]; ptb[b] = swap;
 626 | 
 627 | 				if (++b >= length_B)
 628 | 					break;
 629 | 			}
 630 | 		}
 631 | 	}
 632 | 	FUNC(backward_block_swap)(array, I.start + a, A.start + i, length_A - a);
 633 | }
 634 | 
 635 | // merge operation without a buffer
 636 | 
 637 | // this just repeatedly binary searches into B and rotates A into position.
 638 | // the paper suggests using the 'rotation-based Hwang and Lin algorithm' here,
 639 | // but I decided to stick with this because it had better situational performance
 640 | 
 641 | // (Hwang and Lin is designed for merging subarrays of very different sizes,
 642 | // but WikiSort almost always uses subarrays that are roughly the same size)
 643 | 
 644 | // normally this is incredibly suboptimal, but this function is only called
 645 | // when none of the A or B blocks in any subarray contained 2√A unique values,
 646 | // which places a hard limit on the number of times this will ACTUALLY need
 647 | // to binary search and rotate.
 648 | 
 649 | // according to my analysis the worst case is √A rotations performed on √A items
 650 | // once the constant factors are removed, which ends up being O(n)
 651 | 
 652 | // again, this is NOT a general-purpose solution – it only works well in this case!
 653 | // kind of like how the O(n^2) insertion sort is used in some places
 654 | 
 655 | void FUNC(MergeInPlace)(VAR array[], Range A, Range B, CMPFUNC *cmp, VAR *cache, const size_t cache_size)
 656 | {
 657 | 	if (range_length(A) == 0 || range_length(B) == 0)
 658 | 	{
 659 | 		return;
 660 | 	}
 661 | 
 662 | 	while (1)
 663 | 	{
 664 | 		// find the first place in B where the first item in A needs to be inserted
 665 | 		size_t mid = FUNC(monobound_binary_first)(array, array[A.start], B, cmp);
 666 | 
 667 | 		// rotate A into place
 668 | 		size_t amount = mid - A.end;
 669 | 
 670 | 		FUNC(Rotate)(array, range_length(A), new_range(A.start, mid));
 671 | 
 672 | 		if (B.end == mid)
 673 | 		{
 674 | 			break;
 675 | 		}
 676 | 
 677 | 		// calculate the new A and B ranges
 678 | 
 679 | 		B.start = mid;
 680 | 		A = new_range(A.start + amount, B.start);
 681 | 		A.start = FUNC(monobound_binary_last)(array, array[A.start], A, cmp);
 682 | 
 683 | 		if (range_length(A) == 0)
 684 | 		{
 685 | 			break;
 686 | 		}
 687 | 	}
 688 | }
 689 | 
 690 | // bottom-up merge sort combined with an in-place merge algorithm for O(1) memory use
 691 | 
 692 | void FUNC(octosort)(VAR array[], size_t size, VAR *external_cache, size_t cache_size, CMPFUNC *cmp)
 693 | {
 694 | 	VAR swap, stack_cache[512], *cache = external_cache;
 695 | 
 696 | 	#if DYNAMIC_CACHE
 697 | 		// turns into a full-throttle merge sort since everything fits into the cache
 698 | 
 699 | 		if (cache == NULL)
 700 | 		{
 701 | 			cache_size = 1 + size / 2;
 702 | 
 703 | 			cache = (VAR *) malloc(cache_size * sizeof(VAR));
 704 | 
 705 | 			if (cache == NULL)
 706 | 			{
 707 | 				external_cache = cache = stack_cache;
 708 | 
 709 | 				cache_size = 512;
 710 | 			}
 711 | 		}
 712 | 	#else
 713 | 		// since the cache size is fixed, it's still O(1) memory
 714 | 		// the minimum stack size is typically 8192 KB, so 512 elements should fit comfortably
 715 | 		// removing the cache entirely gives 60% of the performance of qsort()
 716 | 
 717 | 		if (cache == NULL)
 718 | 		{
 719 | 			cache = stack_cache;
 720 | 
 721 | 			cache_size = 512;
 722 | 		}
 723 | 	#endif
 724 | 
 725 | 	// if the array is of size 1, 2, 3 .. 8 sort them like so:
 726 | 
 727 | 	if (size <= 8)
 728 | 	{
 729 | 		FUNC(monobound_sort)(array, new_range(0, size), cmp);
 730 | 
 731 | 		goto End;
 732 | 	}
 733 | 
 734 | 	WikiIterator iterator = WikiIterator_new(size, 4);
 735 | 
 736 | 	VAR *pto = NULL;
 737 | 
 738 | 	// sort groups of 4-8 items at a time
 739 | 
 740 | 	while (!WikiIterator_finished(&iterator))
 741 | 	{
 742 | 		Range range = WikiIterator_nextRange(&iterator);
 743 | 
 744 | 		pto = FUNC(octo_swap)(array, pto, range.start, range.end - range.start, cmp);
 745 | 	}
 746 | 
 747 | 	if (pto)
 748 | 	{
 749 | 		VAR *pta = array + size - 1;
 750 | 		VAR *ptz = pto;
 751 | 
 752 | 		do
 753 | 		{
 754 | 			swap = *ptz;
 755 | 			*ptz = *pta;
 756 | 			*pta = swap;
 757 | 		}
 758 | 		while (++ptz < --pta);
 759 | 
 760 | 		if (pto == array)
 761 | 		{
 762 | 			goto End;
 763 | 		}
 764 | 	}
 765 | 
 766 | 	// then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc.
 767 | 
 768 | 	while (1)
 769 | 	{
 770 | 		// if every A and B block will fit into the cache, use a special branch specifically for merging with the cache
 771 | 		// (we use < rather than <= since the block size might be one more than iterator.length())
 772 | 		if (WikiIterator_length(&iterator) < cache_size)
 773 | 		{
 774 | 			// perform a quad merge if the four subarrays fit into the cache
 775 | 
 776 | 			// array: [A][B][C][D]
 777 | 			// cache: [A  B]       Step 1
 778 | 			// cache:       [C  D] Step 2
 779 | 			// array: [A  B  C  D] Step 3
 780 | 
 781 | 			if ((WikiIterator_length(&iterator) + 1) * 4 <= cache_size && (WikiIterator_length(&iterator) + 1) * 4 <= size)
 782 | 			{
 783 | 				WikiIterator_begin(&iterator);
 784 | 
 785 | 				while (!WikiIterator_finished(&iterator))
 786 | 				{
 787 | 					Range A = WikiIterator_nextRange(&iterator);
 788 | 					Range B = WikiIterator_nextRange(&iterator);
 789 | 					Range C = WikiIterator_nextRange(&iterator);
 790 | 					Range D = WikiIterator_nextRange(&iterator);
 791 | 
 792 | 					if (cmp(&array[A.end - 1], &array[B.start]) <= 0)
 793 | 					{
 794 | 						if (cmp(&array[C.end - 1], &array[D.start]) <= 0)
 795 | 						{
 796 | 							if (cmp(&array[B.end - 1], &array[C.start]) <= 0)
 797 | 							{
 798 | 								continue; // A through D are in order, skip doing anything else
 799 | 							}
 800 | 							// A and B are in order and C and D are in order, copy to cache
 801 | 							memcpy(&cache[0],                 &array[A.start], range_length(A) * sizeof(VAR));
 802 | 							memcpy(&cache[A.end - A.start], &array[B.start], range_length(B) * sizeof(VAR));
 803 | 							memcpy(&cache[B.end - A.start], &array[C.start], range_length(C) * sizeof(VAR));
 804 | 							memcpy(&cache[C.end - A.start], &array[D.start], range_length(D) * sizeof(VAR));
 805 | 
 806 | 							goto Step3;
 807 | 						}
 808 | 						// A and B are in order, copy to cache
 809 | 						memcpy(&cache[0],                 &array[A.start], range_length(A) * sizeof(VAR));
 810 | 						memcpy(&cache[A.end - A.start], &array[B.start], range_length(B) * sizeof(VAR));
 811 | 
 812 | 						goto Step2;
 813 | 					}
 814 | 					// Step1:
 815 | 
 816 | 					// A and B are not in order, merge to cache
 817 | 					FUNC(forward_merge_into)(cache, array + A.start, B.end - A.start, A.end - A.start, cmp);
 818 | 
 819 | 					if (cmp(&array[C.end - 1], &array[D.start]) <= 0) // C and D are in order, copy to cache
 820 | 					{
 821 | 						memcpy(&cache[B.end - A.start], &array[C.start], range_length(C) * sizeof(VAR));
 822 | 						memcpy(&cache[C.end - A.start], &array[D.start], range_length(D) * sizeof(VAR));
 823 | 					}
 824 | 					else
 825 | 					{
 826 | 						Step2:
 827 | 
 828 | 						// C and D are not in order, merge to cache
 829 | 						FUNC(forward_merge_into)(&cache[B.end - A.start], &array[C.start], D.end - C.start, C.end - C.start, cmp);
 830 | 					}
 831 | 					Step3:
 832 | 
 833 | 					// merge A through D from the cache back into the array
 834 | 					FUNC(forward_merge_into)(&array[A.start], &cache[0], D.end - A.start, B.end - A.start, cmp);
 835 | 				}
 836 | 
 837 | 				// we merged two levels at the same time, so we're done with this level already
 838 | 				// iterator.nextLevel() is called again at the bottom of this outer merge loop
 839 | 
 840 | 				WikiIterator_nextLevel(&iterator);
 841 | 			}
 842 | 			else
 843 | 			{
 844 | 				WikiIterator_begin(&iterator);
 845 | 
 846 | 				while (!WikiIterator_finished(&iterator))
 847 | 				{
 848 | 					Range A = WikiIterator_nextRange(&iterator);
 849 | 					Range B = WikiIterator_nextRange(&iterator);
 850 | 
 851 | 					if (cmp(&array[A.end - 1], &array[B.start]) <= 0)
 852 | 					{
 853 | 						continue; // A and B are in order, skip doing anything else
 854 | 					}
 855 | 					// A and B are not in order, merge through the cache
 856 | 					FUNC(external_backward_merge)(array + A.start, cache, B.end - A.start, range_length(A), cmp);
 857 | 				}
 858 | 			}
 859 | 		}
 860 | 		else
 861 | 		{
 862 | 			// this is where the in-place merge logic starts!
 863 | 			// 1. pull out two internal buffers each containing √A unique values
 864 | 			//	1a. adjust block_size and buffer_size if we couldn't find enough unique values
 865 | 			// 2. loop over the A and B subarrays within this level of the merge sort
 866 | 			// 3. break A and B into blocks of size 'block_size'
 867 | 			// 4. "tag" each of the A blocks with values from the first internal buffer
 868 | 			// 5. roll the A blocks through the B blocks and drop/rotate them where they belong
 869 | 			// 6. merge each A block with any B values that follow, using the cache or the second internal buffer
 870 | 			// 7. sort the second internal buffer if it exists
 871 | 			// 8. redistribute the two internal buffers back into the array
 872 | 
 873 | 			size_t block_size = monobound_sqrt(WikiIterator_length(&iterator));
 874 | 
 875 | 			size_t buffer_size = WikiIterator_length(&iterator) / block_size + 1;
 876 | 
 877 | 			// as an optimization, we really only need to pull out the internal buffers once for each level of merges
 878 | 			// after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level
 879 | 
 880 | 			Range buffer1, buffer2, A, B;
 881 | 			size_t find_separately = 0;
 882 | 			size_t index, last, count, find, start, pull_index = 0;
 883 | 
 884 | 			struct
 885 | 			{
 886 | 				size_t from;
 887 | 				size_t to;
 888 | 				size_t count;
 889 | 				Range range;
 890 | 			}
 891 | 			pull[2];
 892 | 
 893 | 			pull[0].from = pull[0].to = pull[0].count = 0; pull[0].range = new_range(0, 0);
 894 | 			pull[1].from = pull[1].to = pull[1].count = 0; pull[1].range = new_range(0, 0);
 895 | 
 896 | 			buffer1 = new_range(0, 0);
 897 | 			buffer2 = new_range(0, 0);
 898 | 
 899 | 			find = buffer_size * 2; // find two internal buffers of size 'buffer_size' each
 900 | 
 901 | 			if (block_size <= cache_size)
 902 | 			{
 903 | 				find = buffer_size; // if every A block fits into the cache then we won't need the second internal buffer
 904 | 			}
 905 | 			else if (find > WikiIterator_length(&iterator))
 906 | 			{
 907 | 				find_separately = 1; // we can't fit both buffers into the same A or B subarray, so find two buffers separately
 908 | 				find = buffer_size;
 909 | 			}
 910 | 
 911 | 			// we need to find either a single contiguous space containing 2√A unique values, which will be split up into two buffers of size √A each,
 912 | 			// or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values,
 913 | 			// OR if we couldn't find that many unique values, we need the largest possible buffer we can get
 914 | 
 915 | 			// in the case where it couldn't find a single buffer of at least √A unique values,
 916 | 			// all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace)
 917 | 
 918 | 			WikiIterator_begin(&iterator);
 919 | 
 920 | 			while (!WikiIterator_finished(&iterator))
 921 | 			{
 922 | 				A = WikiIterator_nextRange(&iterator);
 923 | 				B = WikiIterator_nextRange(&iterator);
 924 | 
 925 | 				// just store information about where the values will be pulled from and to,
 926 | 				// as well as how many values there are, to create the two internal buffers
 927 | 
 928 | 				// check A for the number of unique values we need to fill an internal buffer
 929 | 				// these values will be pulled out to the start of A
 930 | 				for (last = A.start, count = 1; count < find; last = index, count++)
 931 | 				{
 932 | 					index = FUNC(FindLastForward)(array, array[last], new_range(last + 1, A.end), cmp, find - count);
 933 | 
 934 | 					if (index == A.end)
 935 | 						break;
 936 | 				}
 937 | 				index = last;
 938 | 
 939 | 				if (count >= buffer_size)
 940 | 				{
 941 | 					// keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer
 942 | 					PULL(A.start);
 943 | 					pull_index = 1;
 944 | 
 945 | 					if (count == buffer_size + buffer_size)
 946 | 					{
 947 | 						// we were able to find a single contiguous section containing 2√A unique values,
 948 | 						// so this section can be used to contain both of the internal buffers we'll need
 949 | 						buffer1 = new_range(A.start, A.start + buffer_size);
 950 | 						buffer2 = new_range(A.start + buffer_size, A.start + count);
 951 | 						break;
 952 | 					}
 953 | 					else if (find == buffer_size + buffer_size)
 954 | 					{
 955 | 						// we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
 956 | 						// so we still need to find a second separate buffer of at least √A unique values
 957 | 						buffer1 = new_range(A.start, A.start + count);
 958 | 						find = buffer_size;
 959 | 					}
 960 | 					else if (block_size <= cache_size)
 961 | 					{
 962 | 						// we found the first and only internal buffer that we need, so we're done!
 963 | 						buffer1 = new_range(A.start, A.start + count);
 964 | 						break;
 965 | 					}
 966 | 					else if (find_separately)
 967 | 					{
 968 | 						// found one buffer, but now find the other one
 969 | 						buffer1 = new_range(A.start, A.start + count);
 970 | 						find_separately = 0;
 971 | 					}
 972 | 					else
 973 | 					{
 974 | 						// we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
 975 | 						buffer2 = new_range(A.start, A.start + count);
 976 | 						break;
 977 | 					}
 978 | 				}
 979 | 				else if (pull_index == 0 && count > range_length(buffer1))
 980 | 				{
 981 | 					// keep track of the largest buffer we were able to find
 982 | 					buffer1 = new_range(A.start, A.start + count);
 983 | 					PULL(A.start);
 984 | 				}
 985 | 
 986 | 				// check B for the number of unique values we need to fill an internal buffer
 987 | 				// these values will be pulled out to the end of B
 988 | 				for (last = B.end - 1, count = 1; count < find; last = index - 1, count++)
 989 | 				{
 990 | 					index = FUNC(FindFirstBackward)(array, array[last], new_range(B.start, last), cmp, find - count);
 991 | 					if (index == B.start)
 992 | 						break;
 993 | 				}
 994 | 				index = last;
 995 | 
 996 | 				if (count >= buffer_size)
 997 | 				{
 998 | 					// keep track of the range within the array where we'll need to "pull out" these values to create the internal buffer
 999 | 					PULL(B.end);
1000 | 					pull_index = 1;
1001 | 
1002 | 					if (count == buffer_size + buffer_size)
1003 | 					{
1004 | 						// we were able to find a single contiguous section containing 2√A unique values,
1005 | 						// so this section can be used to contain both of the internal buffers we'll need
1006 | 						buffer1 = new_range(B.end - count, B.end - buffer_size);
1007 | 						buffer2 = new_range(B.end - buffer_size, B.end);
1008 | 						break;
1009 | 					}
1010 | 					else if (find == buffer_size + buffer_size)
1011 | 					{
1012 | 						// we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
1013 | 						// so we still need to find a second separate buffer of at least √A unique values
1014 | 						buffer1 = new_range(B.end - count, B.end);
1015 | 						find = buffer_size;
1016 | 					}
1017 | 					else if (block_size <= cache_size)
1018 | 					{
1019 | 						// we found the first and only internal buffer that we need, so we're done!
1020 | 						buffer1 = new_range(B.end - count, B.end);
1021 | 						break;
1022 | 					}
1023 | 					else if (find_separately)
1024 | 					{
1025 | 						// found one buffer, but now find the other one
1026 | 						buffer1 = new_range(B.end - count, B.end);
1027 | 						find_separately = 0;
1028 | 					}
1029 | 					else
1030 | 					{
1031 | 						// buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
1032 | 						// we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
1033 | 						if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;
1034 | 
1035 | 						// we found a second buffer in an 'B' subarray containing √A unique values, so we're done!
1036 | 						buffer2 = new_range(B.end - count, B.end);
1037 | 						break;
1038 | 					}
1039 | 				}
1040 | 				else if (pull_index == 0 && count > range_length(buffer1))
1041 | 				{
1042 | 					// keep track of the largest buffer we were able to find
1043 | 					buffer1 = new_range(B.end - count, B.end);
1044 | 					PULL(B.end);
1045 | 				}
1046 | 			}
1047 | 
1048 | 			// pull out the two ranges so we can use them as internal buffers
1049 | 			for (pull_index = 0; pull_index < 2; pull_index++)
1050 | 			{
1051 | 				Range range;
1052 | 				size_t length = pull[pull_index].count;
1053 | 
1054 | 				if (pull[pull_index].to < pull[pull_index].from)
1055 | 				{
1056 | 					// we're pulling the values out to the left, which means the start of an A subarray
1057 | 					index = pull[pull_index].from;
1058 | 					for (count = 1; count < length; count++)
1059 | 					{
1060 | 						index = FUNC(FindFirstBackward)(array, array[index - 1], new_range(pull[pull_index].to, pull[pull_index].from - (count - 1)), cmp, length - count);
1061 | 						range = new_range(index + 1, pull[pull_index].from + 1);
1062 | 						FUNC(Rotate)(array, range_length(range) - count, range);
1063 | 						pull[pull_index].from = index + count;
1064 | 					}
1065 | 				}
1066 | 				else if (pull[pull_index].to > pull[pull_index].from)
1067 | 				{
1068 | 					// we're pulling values out to the right, which means the end of a B subarray
1069 | 					index = pull[pull_index].from + 1;
1070 | 					for (count = 1; count < length; count++)
1071 | 					{
1072 | 						index = FUNC(FindLastForward)(array, array[index], new_range(index, pull[pull_index].to), cmp, length - count);
1073 | 						range = new_range(pull[pull_index].from, index - 1);
1074 | 						FUNC(Rotate)(array, count, range);
1075 | 						pull[pull_index].from = index - 1 - count;
1076 | 					}
1077 | 				}
1078 | 			}
1079 | 
1080 | 			// adjust block_size and buffer_size based on the values we were able to pull out
1081 | 			buffer_size = range_length(buffer1);
1082 | 			block_size = WikiIterator_length(&iterator)/buffer_size + 1;
1083 | 
1084 | 			// the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks,
1085 | 			// so this was originally here to test the math for adjusting block_size above
1086 | 			// assert((WikiIterator_length(&iterator) + 1)/block_size <= buffer_size);
1087 | 
1088 | 			// now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort!
1089 | 			WikiIterator_begin(&iterator);
1090 | 
1091 | 			while (!WikiIterator_finished(&iterator))
1092 | 			{
1093 | 				A = WikiIterator_nextRange(&iterator);
1094 | 				B = WikiIterator_nextRange(&iterator);
1095 | 
1096 | 				// remove any parts of A or B that are being used by the internal buffers
1097 | 				start = A.start;
1098 | 
1099 | 				if (start == pull[0].range.start)
1100 | 				{
1101 | 					if (pull[0].from > pull[0].to)
1102 | 					{
1103 | 						A.start += pull[0].count;
1104 | 
1105 | 						// if the internal buffer takes up the entire A or B subarray, then there's nothing to merge
1106 | 						// this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4,
1107 | 						// which also only happens when cache_size is small or 1 since it'd otherwise use MergeExternal
1108 | 						if (range_length(A) == 0)
1109 | 							continue;
1110 | 					}
1111 | 					else if (pull[0].from < pull[0].to)
1112 | 					{
1113 | 						B.end -= pull[0].count;
1114 | 						if (range_length(B) == 0)
1115 | 							continue;
1116 | 					}
1117 | 				}
1118 | 
1119 | 				if (start == pull[1].range.start)
1120 | 				{
1121 | 					if (pull[1].from > pull[1].to)
1122 | 					{
1123 | 						A.start += pull[1].count;
1124 | 						if (range_length(A) == 0)
1125 | 							continue;
1126 | 					}
1127 | 					else if (pull[1].from < pull[1].to)
1128 | 					{
1129 | 						B.end -= pull[1].count;
1130 | 						if (range_length(B) == 0)
1131 | 							continue;
1132 | 					}
1133 | 				}
1134 | 
1135 | 				if (cmp(&array[A.end - 1], &array[B.start]) > 0) // A and B are not in order, so merge them
1136 | 				{
1137 | 					Range blockA, firstA, lastA, lastB, blockB;
1138 | 					size_t indexA, findA;
1139 | 
1140 | 					// break the remainder of A into blocks. firstA is the uneven-sized first A block
1141 | 					blockA = new_range(A.start, A.end);
1142 | 					firstA = new_range(A.start, A.start + range_length(blockA) % block_size);
1143 | 
1144 | 					// swap the first value of each A block with the value in buffer1
1145 | 					for (indexA = buffer1.start, index = firstA.end; index < blockA.end; indexA++, index += block_size) 
1146 | 					{
1147 | 						SWAP(array[indexA], array[index]);
1148 | 					}
1149 | 
1150 | 					// start rolling the A blocks through the B blocks!
1151 | 					// whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well
1152 | 					lastA = firstA;
1153 | 					lastB = new_range(0, 0);
1154 | 					blockB = new_range(B.start, B.start + Min(block_size, range_length(B)));
1155 | 					blockA.start += range_length(firstA);
1156 | 					indexA = buffer1.start;
1157 | 
1158 | 					// if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it
1159 | 					// otherwise, if the second buffer is available, block swap the contents into that
1160 | 					if (range_length(lastA) <= cache_size)
1161 | 					{
1162 | 						memcpy(&cache[0], &array[lastA.start], range_length(lastA) * sizeof(VAR));
1163 | 					}
1164 | 					else if (range_length(buffer2) > 0)
1165 | 					{
1166 | 						FUNC(forward_block_swap)(array, lastA.start, buffer2.start, range_length(lastA));
1167 | 					}
1168 | 
1169 | 					if (range_length(blockA) > 0)
1170 | 					{
1171 | 						while (1)
1172 | 						{
1173 | 							// if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block,
1174 | 							// then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks.
1175 | 
1176 | 							if (range_length(blockB) == 0 || (range_length(lastB) > 0 && cmp(&array[indexA], &array[lastB.end - 1]) <= 0))
1177 | 							{
1178 | 								// figure out where to split the previous B block, and rotate it at the split
1179 | 								size_t B_split = FUNC(monobound_binary_first)(array, array[indexA], lastB, cmp);
1180 | 								size_t B_remaining = lastB.end - B_split;
1181 | 
1182 | 								// swap the minimum A block to the beginning of the rolling A blocks
1183 | 								size_t minA = blockA.start;
1184 | 								for (findA = minA + block_size; findA < blockA.end; findA += block_size)
1185 | 								{
1186 | 									if (cmp(&array[minA], &array[findA]) > 0)
1187 | 									{
1188 | 										minA = findA;
1189 | 									}
1190 | 								}
1191 | 								FUNC(forward_block_swap)(array, blockA.start, minA, block_size);
1192 | 
1193 | 								// swap the first item of the previous A block back with its original value, which is stored in buffer1
1194 | 								SWAP(array[blockA.start], array[indexA]);
1195 | 								indexA++;
1196 | 
1197 | 								 // locally merge the previous A block with the B values that follow it if lastA fits into the external cache
1198 | 								 // we'll use that (with MergeExternal), or if the second internal buffer exists we'll use that (with MergeInternal),
1199 | 								 // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace)
1200 | 
1201 | 								if (range_length(lastA) <= cache_size)
1202 | 								{
1203 | 									FUNC(MergeExternal)(array, lastA, new_range(lastA.end, B_split), cmp, cache);
1204 | 								}
1205 | 								else if (range_length(buffer2) > 0)
1206 | 								{
1207 | 									FUNC(MergeInternal)(array, lastA, new_range(lastA.end, B_split), cmp, buffer2);
1208 | 								}
1209 | 								else
1210 | 								{
1211 | 									FUNC(MergeInPlace)(array, lastA, new_range(lastA.end, B_split), cmp, cache, cache_size);
1212 | 								}
1213 | 
1214 | 								if (range_length(buffer2) > 0 || block_size <= cache_size)
1215 | 								{
1216 | 									// copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway
1217 | 
1218 | 									if (block_size <= cache_size)
1219 | 									{
1220 | 										memcpy(&cache[0], &array[blockA.start], block_size * sizeof(VAR));
1221 | 									}
1222 | 									else
1223 | 									{
1224 | 										FUNC(forward_block_swap)(array, blockA.start, buffer2.start, block_size);
1225 | 									}
1226 | 
1227 | 									// this is equivalent to rotating, but faster
1228 | 									// the area normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it
1229 | 									// either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs
1230 | 									FUNC(forward_block_swap)(array, B_split, blockA.start + block_size - B_remaining, B_remaining);
1231 | 								}
1232 | 								else
1233 | 								{
1234 | 									// we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation
1235 | 									FUNC(Rotate)(array, blockA.start - B_split, new_range(B_split, blockA.start + block_size));
1236 | 								}
1237 | 
1238 | 								// update the range for the remaining A blocks, and the range remaining from the B block after it was split
1239 | 								lastA = new_range(blockA.start - B_remaining, blockA.start - B_remaining + block_size);
1240 | 								lastB = new_range(lastA.end, lastA.end + B_remaining);
1241 | 
1242 | 								// if there are no more A blocks remaining, this step is finished!
1243 | 								blockA.start += block_size;
1244 | 								if (range_length(blockA) == 0)
1245 | 								{
1246 | 									break;
1247 | 								}
1248 | 
1249 | 							}
1250 | 							else if (range_length(blockB) < block_size)
1251 | 							{
1252 | 								// move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation
1253 | 								// the cache is disabled here since it might contain the contents of the previous A block
1254 | 								FUNC(Rotate)(array, blockB.start - blockA.start, new_range(blockA.start, blockB.end));
1255 | 
1256 | 								lastB = new_range(blockA.start, blockA.start + range_length(blockB));
1257 | 								blockA.start += range_length(blockB);
1258 | 								blockA.end += range_length(blockB);
1259 | 								blockB.end = blockB.start;
1260 | 							}
1261 | 							else
1262 | 							{
1263 | 								// roll the leftmost A block to the end by swapping it with the next B block
1264 | 								FUNC(forward_block_swap)(array, blockA.start, blockB.start, block_size);
1265 | 								lastB = new_range(blockA.start, blockA.start + block_size);
1266 | 
1267 | 								blockA.start += block_size;
1268 | 								blockA.end += block_size;
1269 | 								blockB.start += block_size;
1270 | 
1271 | 								if (blockB.end > B.end - block_size)
1272 | 								{
1273 | 									blockB.end = B.end;
1274 | 								}
1275 | 								else
1276 | 								{
1277 | 									blockB.end += block_size;
1278 | 								}
1279 | 							}
1280 | 						}
1281 | 					}
1282 | 
1283 | 					// merge the last A block with the remaining B values
1284 | 					if (range_length(lastA) <= cache_size)
1285 | 					{
1286 | 						FUNC(MergeExternal)(array, lastA, new_range(lastA.end, B.end), cmp, cache);
1287 | 					}
1288 | 					else if (range_length(buffer2) > 0)
1289 | 					{
1290 | 						FUNC(MergeInternal)(array, lastA, new_range(lastA.end, B.end), cmp, buffer2);
1291 | 					}
1292 | 					else
1293 | 					{
1294 | 						FUNC(MergeInPlace)(array, lastA, new_range(lastA.end, B.end), cmp, cache, cache_size);
1295 | 					}
1296 | 				}
1297 | 			}
1298 | 
1299 | 			// when we're finished with this merge step we should have the one or two internal buffers left over, where the second buffer is all jumbled up
1300 | 			// insertion sort the second buffer, then redistribute the buffers back into the array using the opposite process used for creating the buffer
1301 | 
1302 | 			// While an unstable sort like quicksort could be applied here, in benchmarks it was consistently slightly slower than a simple insertion sort,
1303 | 			// even for tens of millions of items. this may be because insertion sort is quite fast when the data is already somewhat sorted, like it is here
1304 | 
1305 | 			FUNC(monobound_sort)(array, buffer2, cmp);
1306 | 
1307 | 			for (pull_index = 0; pull_index < 2; pull_index++)
1308 | 			{
1309 | 				size_t amount, unique = pull[pull_index].count * 2;
1310 | 				if (pull[pull_index].from > pull[pull_index].to)
1311 | 				{
1312 | 					// the values were pulled out to the left, so redistribute them back to the right
1313 | 					Range buffer = new_range(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count);
1314 | 					while (range_length(buffer) > 0)
1315 | 					{
1316 | 						index = FUNC(FindFirstForward)(array, array[buffer.start], new_range(buffer.end, pull[pull_index].range.end), cmp, unique);
1317 | 						amount = index - buffer.end;
1318 | 						FUNC(Rotate)(array, range_length(buffer), new_range(buffer.start, index));
1319 | 						buffer.start += (amount + 1);
1320 | 						buffer.end += amount;
1321 | 						unique -= 2;
1322 | 					}
1323 | 				}
1324 | 				else if (pull[pull_index].from < pull[pull_index].to)
1325 | 				{
1326 | 					// the values were pulled out to the right, so redistribute them back to the left
1327 | 					Range buffer = new_range(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end);
1328 | 					while (range_length(buffer) > 0)
1329 | 					{
1330 | 						index = FUNC(FindLastBackward)(array, array[buffer.end - 1], new_range(pull[pull_index].range.start, buffer.start), cmp, unique);
1331 | 						amount = buffer.start - index;
1332 | 						FUNC(Rotate)(array, amount, new_range(index, buffer.end));
1333 | 						buffer.start -= amount;
1334 | 						buffer.end -= (amount + 1);
1335 | 						unique -= 2;
1336 | 					}
1337 | 				}
1338 | 			}
1339 | 		}
1340 | 
1341 | 		// double the size of each A and B subarray that will be merged in the next level
1342 | 
1343 | 		if (!WikiIterator_nextLevel(&iterator))
1344 | 		{
1345 | 			break;
1346 | 		}
1347 | 	}
1348 | 
1349 | 	End:
1350 | 
1351 | 	#if DYNAMIC_CACHE
1352 | 
1353 | 	if (cache != external_cache)
1354 | 	{
1355 | 		free(cache);
1356 | 	}
1357 | 
1358 | 	#endif
1359 | 
1360 | 	return;
1361 | }
1362 | 


--------------------------------------------------------------------------------
/octosort.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | 	Copyright (C) 2014-2021 Igor van den Hoven ivdhoven@gmail.com
  3 | */
  4 | 
  5 | /*
  6 | 	Permission is hereby granted, free of charge, to any person obtaining
  7 | 	a copy of this software and associated documentation files (the
  8 | 	"Software"), to deal in the Software without restriction, including
  9 | 	without limitation the rights to use, copy, modify, merge, publish,
 10 | 	distribute, sublicense, and/or sell copies of the Software, and to
 11 | 	permit persons to whom the Software is furnished to do so, subject to
 12 | 	the following conditions:
 13 | 
 14 | 	The above copyright notice and this permission notice shall be
 15 | 	included in all copies or substantial portions of the Software.
 16 | 
 17 | 	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 18 | 	EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 19 | 	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 20 | 	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 21 | 	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 22 | 	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 23 | 	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 24 | */
 25 | 
 26 | /*
 27 | 	octosort 1.0
 28 | */
 29 | 
 30 | /*
 31 | 	octosort is based on WikiSort and quadsort
 32 | 
 33 | 	WikiSort: https://github.com/BonzaiThePenguin/WikiSort
 34 | 	quadsort: https://github.com/scandum/quadsort
 35 | 	searches: https://github.com/scandum/binary_search
 36 | */
 37 | 
 38 | #ifndef OCTOSORT_H
 39 | #define OCTOSORT_H
 40 | 
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
 45 | 
 46 | //#define cmp(a,b) (*(a) > *(b))
 47 | 
// Comparison callback type: receives pointers to two elements and returns an int.
// The merge code calls it as cmp(&x, &y) and treats a result > 0 as "x and y are out of order".
typedef int CMPFUNC (const void *a, const void *b);
 49 | 
 50 | // Set to 1 to see how it performs when given more memory
 51 | 
 52 | #define DYNAMIC_CACHE 0
 53 | 
 54 | // utilities
 55 | 
// Exchange two lvalues through a temporary named 'swap'. The macro does not declare 'swap' itself,
// so the code using it must have a suitably typed variable of that name in scope.
// Both arguments are expanded twice, so they should not have side effects.
#define SWAP(value1, value2) {swap = value1;value1 = value2;value2 = swap;}

// Record in pull[pull_index] the covering range (A.start to B.end), the value count, the source
// position (index) and the destination (_to).
// The macro relies on the names pull, pull_index, A, B, count and index being in scope where it is used.
// It expands to several statements without enclosing braces, so it is only safe inside a braced block.
#define PULL(_to) \
	pull[pull_index].range = new_range(A.start, B.end); \
	pull[pull_index].count = count; \
	pull[pull_index].from = index; \
	pull[pull_index].to = _to
 63 | 
 64 | // not as fast as math.h's sqrt() but it's portable
 65 | 
 66 | size_t monobound_sqrt(const size_t size)
 67 | {
 68 | 	size_t bot, mid, top, sum;
 69 | 
 70 | 	bot = 0;
 71 | 	top = 65536;
 72 | 
 73 | 	while (top > 1)
 74 | 	{
 75 | 		mid = top / 2;
 76 | 		sum = bot + mid;
 77 | 
 78 | 		if (sum * sum <= size)
 79 | 		{
 80 | 			bot += mid;
 81 | 		}
 82 | 		top -= mid;
 83 | 	}
 84 | 	return bot;
 85 | }
 86 | 
 87 | size_t Min(const size_t a, const size_t b)
 88 | {
 89 | 	return a < b ? a : b;
 90 | }
 91 | 
 92 | size_t Max(const size_t a, const size_t b)
 93 | {
 94 | 	return a > b ? a : b;
 95 | }
 96 | 
 97 | // 63 -> 32, 64 -> 64, etc. this comes from Hacker's Delight
 98 | 
 99 | size_t FloorPowerOfTwo (const size_t value)
100 | {
101 | 	size_t x = value;
102 | 	x = x | (x >> 1);
103 | 	x = x | (x >> 2);
104 | 	x = x | (x >> 4);
105 | 	x = x | (x >> 8);
106 | 	x = x | (x >> 16);
107 | #if __LP64__
108 | 	x = x | (x >> 32);
109 | #endif
110 | 	return x - (x >> 1);
111 | }
112 | 
// A span of positions within the array being sorted.
// The length of a span is end - start, so 'end' is one past the last position covered.

typedef struct {
	size_t start; // first position covered
	size_t end;   // one past the last position covered
} Range;
121 | 
122 | size_t range_length(Range range)
123 | {
124 | 	return range.end - range.start;
125 | }
126 | 
127 | Range new_range(const size_t start, const size_t end)
128 | {
129 | 	return (Range) {start, end};
130 | }
131 | 
132 | 
// Iterator that hands out consecutive ranges of the array for each level of the bottom-up merge sort.
// The merge sort only operates on powers of two, so positions are scaled down to that power of two
// and a fraction (numerator / denominator) is used to scale them back to the real array size.

typedef struct {
	size_t size;           // total number of elements; iteration is finished once 'decimal' reaches it
	size_t power_of_two;   // FloorPowerOfTwo(size)
	size_t numerator;      // fractional part of the current position, in units of 1/denominator
	size_t decimal;        // whole part of the current position
	size_t denominator;    // power_of_two / min_level
	size_t decimal_step;   // whole part of the range length at the current level
	size_t numerator_step; // fractional part of the range length at the current level
} WikiIterator;
148 | 
149 | void WikiIterator_begin(WikiIterator *me)
150 | {
151 | 	me->numerator = me->decimal = 0;
152 | }
153 | 
154 | Range WikiIterator_nextRange(WikiIterator *me)
155 | {
156 | 	size_t start = me->decimal;
157 | 
158 | 	me->decimal += me->decimal_step;
159 | 	me->numerator += me->numerator_step;
160 | 
161 | 	if (me->numerator >= me->denominator)
162 | 	{
163 | 		me->numerator -= me->denominator;
164 | 		me->decimal++;
165 | 	}
166 | 
167 | 	return new_range(start, me->decimal);
168 | }
169 | 
170 | size_t WikiIterator_finished(WikiIterator *me)
171 | {
172 | 	return (me->decimal >= me->size);
173 | }
174 | 
175 | size_t WikiIterator_nextLevel(WikiIterator *me)
176 | {
177 | 	me->decimal_step += me->decimal_step;
178 | 	me->numerator_step += me->numerator_step;
179 | 
180 | 	if (me->numerator_step >= me->denominator)
181 | 	{
182 | 		me->numerator_step -= me->denominator;
183 | 		me->decimal_step++;
184 | 	}
185 | 
186 | 	return (me->decimal_step < me->size);
187 | }
188 | 
189 | size_t WikiIterator_length(WikiIterator *me)
190 | {
191 | 	return me->decimal_step;
192 | }
193 | 
194 | WikiIterator WikiIterator_new(size_t size2, size_t min_level)
195 | {
196 | 	WikiIterator me;
197 | 
198 | 	me.size = size2;
199 | 	me.power_of_two = FloorPowerOfTwo(me.size);
200 | 	me.denominator = me.power_of_two/min_level;
201 | 	me.numerator_step = me.size % me.denominator;
202 | 	me.decimal_step = me.size/me.denominator;
203 | 
204 | 	WikiIterator_begin(&me);
205 | 
206 | 	return me;
207 | }
208 | 
209 | //////////////////////////////////////////////////////////
210 | //┌────────────────────────────────────────────────────┐//
211 | //│                █████┐    ██████┐ ██████┐████████┐  │//
212 | //│               ██┌──██┐   ██┌──██┐└─██┌─┘└──██┌──┘  │//
213 | //│               └█████┌┘   ██████┌┘  ██│     ██│     │//
214 | //│               ██┌──██┐   ██┌──██┐  ██│     ██│     │//
215 | //│               └█████┌┘   ██████┌┘██████┐   ██│     │//
216 | //│                └────┘    └─────┘ └─────┘   └─┘     │//
217 | //└────────────────────────────────────────────────────┘//
218 | //////////////////////////////////////////////////////////
219 | 
// Instantiate the octosort.c template with VAR = char.
// Every FUNC(NAME) in the included file expands to NAME8 (e.g. octosort8) and STRUCT(NAME) to struct NAME8.
#undef VAR
#undef FUNC
#undef STRUCT

#define VAR char
#define FUNC(NAME) NAME##8
#define STRUCT(NAME) struct NAME##8

#include "octosort.c"
229 | 
230 | //////////////////////////////////////////////////////////
231 | //┌────────────────────────────────────────────────────┐//
232 | //│           ▄██┐   █████┐    ██████┐ ██████┐████████┐│//
233 | //│          ████│  ██┌───┘    ██┌──██┐└─██┌─┘└──██┌──┘│//
234 | //│          └─██│  ██████┐    ██████┌┘  ██│     ██│   │//
235 | //│            ██│  ██┌──██┐   ██┌──██┐  ██│     ██│   │//
236 | //│          ██████┐└█████┌┘   ██████┌┘██████┐   ██│   │//
237 | //│          └─────┘ └────┘    └─────┘ └─────┘   └─┘   │//
238 | //└────────────────────────────────────────────────────┘//
239 | //////////////////////////////////////////////////////////
240 | 
// Instantiate the octosort.c template with VAR = short.
// Every FUNC(NAME) in the included file expands to NAME16 (e.g. octosort16) and STRUCT(NAME) to struct NAME16.
#undef VAR
#undef FUNC
#undef STRUCT

#define VAR short
#define FUNC(NAME) NAME##16
#define STRUCT(NAME) struct NAME##16

#include "octosort.c"
250 | 
251 | //////////////////////////////////////////////////////////
252 | // ┌───────────────────────────────────────────────────┐//
253 | // │       ██████┐ ██████┐    ██████┐ ██████┐████████┐ │//
254 | // │       └────██┐└────██┐   ██┌──██┐└─██┌─┘└──██┌──┘ │//
255 | // │        █████┌┘ █████┌┘   ██████┌┘  ██│     ██│    │//
256 | // │        └───██┐██┌───┘    ██┌──██┐  ██│     ██│    │//
257 | // │       ██████┌┘███████┐   ██████┌┘██████┐   ██│    │//
258 | // │       └─────┘ └──────┘   └─────┘ └─────┘   └─┘    │//
259 | // └───────────────────────────────────────────────────┘//
260 | //////////////////////////////////////////////////////////
261 | 
// Instantiate the octosort.c template with VAR = int.
// Every FUNC(NAME) in the included file expands to NAME32 (e.g. octosort32) and STRUCT(NAME) to struct NAME32.
#undef VAR
#undef FUNC
#undef STRUCT

#define VAR int
#define FUNC(NAME) NAME##32
#define STRUCT(NAME) struct NAME##32

#include "octosort.c"
271 | 
272 | //////////////////////////////////////////////////////////
273 | // ┌───────────────────────────────────────────────────┐//
274 | // │        █████┐ ██┐  ██┐   ██████┐ ██████┐████████┐ │//
275 | // │       ██┌───┘ ██│  ██│   ██┌──██┐└─██┌─┘└──██┌──┘ │//
276 | // │       ██████┐ ███████│   ██████┌┘  ██│     ██│    │//
277 | // │       ██┌──██┐└────██│   ██┌──██┐  ██│     ██│    │//
278 | // │       └█████┌┘     ██│   ██████┌┘██████┐   ██│    │//
279 | // │        └────┘      └─┘   └─────┘ └─────┘   └─┘    │//
280 | // └───────────────────────────────────────────────────┘//
281 | //////////////////////////////////////////////////////////
282 | 
// Instantiate the octosort.c template with VAR = long long.
// Every FUNC(NAME) in the included file expands to NAME64 (e.g. octosort64) and STRUCT(NAME) to struct NAME64.
#undef VAR
#undef FUNC
#undef STRUCT

#define VAR long long
#define FUNC(NAME) NAME##64
#define STRUCT(NAME) struct NAME##64

#include "octosort.c"
292 | 
293 | //////////////////////////////////////////////////////////
294 | //┌────────────────────────────────────────────────────┐//
295 | //│  ▄██┐  ██████┐  █████┐    ██████┐ ██████┐████████┐ │//
296 | //│ ████│  └────██┐██┌──██┐   ██┌──██┐└─██┌─┘└──██┌──┘ │//
297 | //│ └─██│   █████┌┘└█████┌┘   ██████┌┘  ██│     ██│    │//
298 | //│   ██│  ██┌───┘ ██┌──██┐   ██┌──██┐  ██│     ██│    │//
299 | //│ ██████┐███████┐└█████┌┘   ██████┌┘██████┐   ██│    │//
300 | //│ └─────┘└──────┘ └────┘    └─────┘ └─────┘   └─┘    │//
301 | //└────────────────────────────────────────────────────┘//
302 | //////////////////////////////////////////////////////////
303 | 
// Instantiate the octosort.c template with VAR = long double.
// Every FUNC(NAME) in the included file expands to NAME128 (e.g. octosort128) and STRUCT(NAME) to struct NAME128.
#undef VAR
#undef FUNC
#undef STRUCT

#define VAR long double
#define FUNC(NAME) NAME##128
#define STRUCT(NAME) struct NAME##128

#include "octosort.c"
313 | 
314 | 
315 | ////////////////////////////////////////////////////////////////////////////////
316 | //┌──────────────────────────────────────────────────────────────────────────┐//
317 | //│    ██████┐  ██████┐████████┐ ██████┐ ███████┐ ██████┐ ██████┐ ████████┐  │//
318 | //│   ██┌───██┐██┌────┘└──██┌──┘██┌───██┐██┌────┘██┌───██┐██┌──██┐└──██┌──┘  │//
319 | //│   ██│   ██│██│        ██│   ██│   ██│███████┐██│   ██│██████┌┘   ██│     │//
320 | //│   ██│   ██│██│        ██│   ██│   ██│└────██│██│   ██│██┌──██┐   ██│     │//
321 | //│   └██████┌┘└██████┐   ██│   └██████┌┘███████│└██████┌┘██│  ██│   ██│     │//
322 | //│    └─────┘  └─────┘   └─┘    └─────┘ └──────┘ └─────┘ └─┘  └─┘   └─┘     │//
323 | //└──────────────────────────────────────────────────────────────────────────┘//
324 | ////////////////////////////////////////////////////////////////////////////////
325 | 
326 | void octosort(void *array, size_t nmemb, size_t size, CMPFUNC *cmp)
327 | {
328 | 	if (nmemb < 2)
329 | 	{
330 | 		return;
331 | 	}
332 | 
333 | 	switch (size)
334 | 	{
335 | 		case sizeof(char):
336 | 			return octosort8(array, nmemb, NULL, 0, cmp);
337 | 
338 | 		case sizeof(short):
339 | 			return octosort16(array, nmemb, NULL, 0, cmp);
340 | 
341 | 		case sizeof(int):
342 | 			return octosort32(array, nmemb, NULL, 0, cmp);
343 | 
344 | 		case sizeof(long long):
345 | 			return octosort64(array, nmemb, NULL, 0, cmp);
346 | 
347 | 		case sizeof(long double):
348 | 			return octosort128(array, nmemb, NULL, 0, cmp);
349 | 
350 | 		default:
351 | 			return assert(size == sizeof(char) || size == sizeof(short) || size == sizeof(int) || size == sizeof(long long) || size == sizeof(long double));
352 | 	}
353 | }
354 | 
// Remove the configuration and helper macros defined by this header so they are no longer
// defined for code that follows the #include.
#undef DYNAMIC_CACHE
#undef PULL
#undef SWAP

#undef VAR
#undef FUNC
#undef STRUCT
362 | 
363 | #endif
364 | 


--------------------------------------------------------------------------------