├── __init__.py ├── .gitignore ├── MANIFEST ├── test_versions.sh ├── LICENSE ├── setup.py ├── README.md ├── test.py └── lazysorted.c /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | dist 4 | README.txt 5 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | README.txt 3 | lazysorted.c 4 | setup.py 5 | -------------------------------------------------------------------------------- /test_versions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script for testing on different python versions 3 | 4 | function restore { 5 | if [ -f test.py.bak ] 6 | then 7 | mv test.py.bak test.py 8 | fi 9 | } 10 | 11 | function check_success { 12 | if [ $? -ne 0 ] 13 | then 14 | restore 15 | exit 1 16 | fi 17 | } 18 | 19 | rm -rf build 20 | pandoc --from=markdown --to=rst --output=README.txt README.md 21 | if [ "$1" == "notest" ] 22 | then 23 | notest=1 24 | else 25 | notest=0 26 | fi 27 | 28 | for version in 2.5 2.6 2.7 3.1 3.2 3.3 29 | do 30 | echo -e "\n\nPYTHON $version\n==========\n" 31 | CFLAGS="-UNDEBUG" python$version setup.py build 32 | check_success 33 | echo "" 34 | 35 | loc=$version 36 | if [ "$version" == "2.7-dbg" ] 37 | then 38 | loc="2.7-pydebug" 39 | fi 40 | if [ "${version%.*}" == "3" ] 41 | then 42 | if [ ! 
-f test.py.bak ] 43 | then 44 | 2to3 --no-diffs -w test.py 45 | fi 46 | fi 47 | 48 | if [ $notest -eq 0 ] 49 | then 50 | PYTHONPATH="build/lib.linux-x86_64-$loc/" python$version -c "import lazysorted; print(lazysorted);" 51 | PYTHONPATH="build/lib.linux-x86_64-$loc/" python$version test.py 52 | check_success 53 | fi 54 | done 55 | 56 | restore 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Naftali Harris 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 13 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 14 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 15 | DISCLAIMED. IN NO EVENT SHALL NAFTALI HARRIS BE LIABLE FOR ANY 16 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 17 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 18 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 19 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 20 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup, Extension 2 | 3 | module1 = Extension('lazysorted', sources=['lazysorted.c']) 4 | 5 | f = open("README.txt") 6 | readme = f.read() 7 | f.close() 8 | 9 | setup(name='lazysorted', 10 | version='0.1.1', 11 | description='A partially and lazily sorted list data structure', 12 | author='Naftali Harris', 13 | author_email='naftaliharris@gmail.com', 14 | url='www.naftaliharris.com', 15 | keywords=["sort", "sorting", "partial", "lazy", "list"], 16 | classifiers=[ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: BSD License", 20 | "Operating System :: OS Independent", 21 | "Programming Language :: Python :: 2", 22 | "Programming Language :: Python :: 2.5", 23 | "Programming Language :: Python :: 2.6", 24 | "Programming Language :: Python :: 2.7", 25 | "Programming Language :: Python :: 3", 26 | "Programming Language :: Python :: 3.1", 27 | "Programming Language :: Python :: 3.2", 28 | "Programming Language :: Python :: 3.3", 29 | "Programming Language :: Python :: Implementation :: CPython", 30 | "Topic :: Software Development :: Libraries :: Python Modules", 31 | ], 32 | ext_modules=[module1], 33 | long_description=readme) 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lazysorted 2 | ========== 3 | 4 | lazysorted is a Python extension module for sorting sequences lazily. It 5 | presents the programmer with the abstraction that they are actually working 6 | with a sorted list, when in fact the list is only physically sorted when the 7 | programmer requests elements from it, and even then it is only sorted partially, 8 | just enough to return whatever was requested. 
9 | 10 | The LazySorted object has a constructor that implements the same interface as 11 | the builtin `sorted(...)` function, and it supports most of the non-mutating 12 | methods of a python list. 13 | 14 | Since the LazySorted object only sorts as much as necessary, it can be faster 15 | than using the builtin `sorted(...)` for tasks that do not require the entire 16 | data to be sorted, like: 17 | 18 | 1. Computing medians 19 | 2. Computing [truncated means](http://en.wikipedia.org/wiki/Truncated%5Fmean) 20 | 3. Quickly iterating through the first few sorted elements of a list 21 | 4. Computing the deciles or quartiles of some data 22 | 23 | 24 | How to use it 25 | ------------- 26 | 27 | You can use LazySorted in much the same way you use the `sorted(...)` function 28 | and the python lists it produces: 29 | 30 | ```python 31 | from lazysorted import LazySorted 32 | from math import floor, ceil 33 | 34 | 35 | def median(xs): 36 | """An expected linear time median function""" 37 | ls = LazySorted(xs) 38 | n = len(ls) 39 | if n == 0: 40 | raise ValueError("Need a non-empty iterable") 41 | elif n % 2 == 1: 42 | return ls[n//2] 43 | else: 44 | return sum(ls[(n/2-1):(n/2+1)]) / 2.0 45 | 46 | 47 | def top_k(xs, k, key=None, reverse=False): 48 | """Efficiently computes the top k elements of xs using the given key, or 49 | the bottom k if reverse=True""" 50 | ls = LazySorted(xs, key=key, reverse=reverse) 51 | return ls[0:k] 52 | 53 | 54 | def trimmed_mean(xs, alpha=0.05): 55 | """Computes the mean of the xs from the alpha to (1-alpha) quantiles 56 | in expected linear time. 
More robust than the ordinary sample mean.""" 57 | if not 0 <= alpha < 0.5: 58 | raise ValueError("alpha must be in [0, 0.5)") 59 | 60 | ls = LazySorted(xs) 61 | n = len(ls) 62 | if n == 0: 63 | raise ValueError("Need a non-empty iterable") 64 | lower = int(floor(n * alpha)) 65 | upper = int(ceil(n * (1 - alpha))) 66 | return sum(ls.between(lower, upper)) / (upper - lower) 67 | 68 | ``` 69 | 70 | In addition to the `__len__` and `__getitem__` methods demonstrated above, 71 | LazySorted also supports the `__iter__`, `__contains__`, `index`, and `count` 72 | methods, just like a regular python list: 73 | 74 | ```python 75 | >>> import random 76 | >>> from lazysorted import LazySorted 77 | >>> xs = list(range(1000)) + 5 * [1234] 78 | >>> random.shuffle(xs) 79 | >>> ls = LazySorted(xs) 80 | >>> for x in ls: 81 | ... print(x) 82 | ... if x >= 3: 83 | ... break 84 | 0 85 | 1 86 | 2 87 | 3 88 | >>> 1235 in ls 89 | False 90 | >>> ls.index(821) 91 | 821 92 | >>> ls.count(1234) 93 | 5 94 | 95 | ``` 96 | 97 | Although the LazySorted constructor pretends to be equivalent to the `sorted` 98 | function, and the LazySorted object pretends to be equivalent to a sorted python 99 | list, there are a few differences between them: 100 | 101 | 1. LazySorted objects are immutable, while python lists are not. 102 | 2. Sorting with the builtin `sorted` function is guaranteed to be stable, (ie, 103 | preserve the original order of elements that compare equal), while 104 | LazySorted sorting is not stable. 105 | 3. The LazySorted object has a `between(i, j)` method, which returns a list of 106 | all the items whose sorted indices are in `range(i, j)`, but not necessarily 107 | in order. This is useful, for example, for throwing away outliers when 108 | computing an alpha-trimmed mean. 109 | 110 | When the APIs differ between python2.x and python3.x, lazysorted implements the 111 | python3.x version. 
So the LazySorted constructor does not support the `cmp` 112 | argument that was removed in python3.x, and the LazySorted object does not 113 | support the `__getslice__` method that was also removed in python3.x. 114 | 115 | All of the LazySorted methods have pretty good documentation, which can be 116 | accessed through the builtin `help(...)` function. 117 | 118 | I've tested lazysorted and found it to work for CPython versions 2.5, 2.6, 2.7, 119 | 3.1, 3.2, and 3.3. I haven't tested 3.0. 120 | 121 | 122 | How it works 123 | ------------ 124 | 125 | In short, LazySorted works by using quicksort partitions lazily and keeping 126 | track of the indices used as pivots. 127 | 128 | **[quicksort](http://en.wikipedia.org/wiki/Quicksort)** sorts a list by picking 129 | an element of the list to be the "pivot", and then partitioning the data into 130 | the part that's greater than or equal to the pivot and the part that's less 131 | than the pivot. These two parts are then recursively sorted with quicksort. 132 | 133 | **[quickselect](http://en.wikipedia.org/wiki/Quickselect)** finds the kth 134 | smallest element of a list by picking a pivot element and partitioning the 135 | data, as in quicksort. Then the algorithm recurses into the larger or smaller 136 | part of the list, depending on whether k is larger or smaller than the index of 137 | the pivot element. 138 | 139 | There are two key observations to make from these algorithms: First of all, if 140 | we are only interested in part of a sorted list, we only need to recurse into 141 | the part we are interested in after doing a partition. Second of all, after 142 | doing some partitions, the list is partially sorted, with the pivots all in 143 | their sorted order and the elements between two pivots guaranteed to be bigger 144 | than the pivot to their left and smaller than the pivot to their right.
145 | 146 | So whenever some data is queried from a LazySorted object, we first look 147 | through the pivots to see which pivots delimit the data we want. Then we 148 | partition sublist(s) as necessary and recurse into the side(s) that our data is 149 | in. 150 | 151 | There are also some implementation details that help lazysorted to run quickly: 152 | First of all, pivot elements are chosen to be the median of three randomly selected 153 | elements, which makes the partition likely to be more balanced and guarantees 154 | average case O(n log n) behavior. 155 | 156 | Second of all, for sufficiently small lists, lazysorted uses insertion sort 157 | instead of quicksort, which is faster on small lists. Both of these tricks are 158 | well-known to speed up quicksort implementations. 159 | 160 | Thirdly, since it's important to find the pivots that bound an index quickly, 161 | lazysorted stores the pivots in a binary search tree, so that these sorts of 162 | lookups occur in O(log n) expected time. The BST lazysorted uses is a 163 | [Treap](http://en.wikipedia.org/wiki/Treap), selected for its overall expected 164 | speed, especially in insertion and deletion. 165 | 166 | lazysorted also makes a big effort to delete irrelevant pivots from the BST; 167 | for example, if there are three pivots at indices 5, 26, and 42, and both the 168 | data (between 5 and 26) and (between 26 and 42) is sorted, then we can remove 169 | the irrelevant pivot 26, and just say that the data between indices 5 and 42 is 170 | sorted. 171 | 172 | 173 | Installation 174 | ------------ 175 | 176 | lazysorted requires the python headers (Python.h).
I believe they ship with 177 | OSX, but if you don't have them they can be installed on a debian-like system 178 | with 179 | 180 | $ sudo apt-get install python-dev 181 | 182 | Then you can install lazysorted with 183 | 184 | $ sudo python setup.py install 185 | 186 | Alternatively, you can install lazysorted from pypi with 187 | 188 | $ easy_install --user lazysorted 189 | 190 | or 191 | 192 | $ pip install lazysorted 193 | 194 | though you'll still need the python headers for it to build properly. 195 | 196 | 197 | Testing 198 | ------- 199 | 200 | I've put in a fair bit of effort to test that lazysorted actually does what 201 | it's supposed to. You can test it yourself (after installing it) with 202 | 203 | $ python test.py 204 | 205 | 206 | FAQ 207 | --- 208 | 209 | **Doesn't numpy have a median and percentile function?** 210 | 211 | Yes, but it's implemented by sorting the entire array and then reading off the 212 | requested values, not with quickselect or another O(n) selection algorithm. 213 | And LazySorted is empirically faster, as you can see from benchmark.py. 214 | 215 | **Isn't python3.4 going to have a statistics module with a median function?** 216 | 217 | Yes, and I'm really excited about it! This is 218 | [PEP450](http://www.python.org/dev/peps/pep-0450/). Unfortunately, the current 219 | implementation is in pure python, and computes the median by sorting the data 220 | and picking off the middle element. 221 | 222 | **Doesn't the standard library have a heapq module?** 223 | 224 | Yes, but it lacks the full generality of this module. For example, you can use 225 | it to get the k smallest elements in O(n log k) time, but not k arbitrary 226 | contiguous elements. This module represents a different paradigm: you're 227 | allowed to program as if your list were sorted, and let the data structure deal 228 | with the details. 229 | 230 | **How is lazysorted licensed?** 231 | 232 | lazysorted is BSD-licensed.
So you can use it pretty much however you like! 233 | See LICENSE for details. 234 | 235 | **What should I not use lazysorted for?** 236 | 237 | 1. Applications requiring a stable sort; the quicksort partitions make the 238 | order of equal elements in the sorted list undefined. 239 | 2. Applications requiring guaranteed fast worst-case performance. Although 240 | it's very unlikely, many operations in LazySorted run in worst case O(n^2) 241 | time. 242 | 3. Applications requiring high security. The random number generator is 243 | insecure and seeded from system time, so an (ambitious) attacker could 244 | reverse engineer the random number generator and feed LazySorted 245 | pathological lists that make it run in O(n^2) time. 246 | 4. Sorting entire lists: The builtin `sorted(...)` is *very* impressively 247 | designed and implemented. It also has the advantage of running faster than 248 | O(n log n) on lists with partial structure. 249 | 250 | **How does lazysorted work at scale?** 251 | 252 | Unfortunately, only okay. This turns out to be primarily due to the fact that 253 | CPython deals with python objects by passing around pointers to them, causing 254 | cache misses when the list and its elements no longer fit in cache. The gory 255 | details can be found in a blog post I wrote about [Memory Locality and Python 256 | Objects](http://www.naftaliharris.com/blog/heapobjects). 257 | 258 | However, this effect doesn't kick in until lists grow larger than about 100K 259 | values, and even past that lazysorted remains faster than complete sorting. 260 | 261 | 262 | Contact me! 263 | ----------- 264 | 265 | If you use this software and feel so inclined, I'd greatly appreciate hearing 266 | what you are using it for! You can hit me up on Twitter 267 | [@naftaliharris](https://twitter.com/naftaliharris), or at my email address 268 | on my [contact page](http://www.naftaliharris.com/contact/). 
269 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | """test.py""" 2 | 3 | import unittest 4 | import random 5 | from itertools import islice 6 | import doctest 7 | import lazysorted 8 | from lazysorted import LazySorted 9 | 10 | 11 | class TestLazySorted(unittest.TestCase): 12 | test_lengths = range(18) + [31, 32, 33, 63, 64, 65, 127, 128, 129] 13 | 14 | def test_creation(self): 15 | """LazySorted objects can be created from any iterable""" 16 | x = LazySorted([]) 17 | x = LazySorted([1, 2, 3, 4]) 18 | x = LazySorted(x for x in range(100) if x % 3 == 0) 19 | x = LazySorted((3, -2, 5)) 20 | x = LazySorted(xrange(100)) 21 | x = LazySorted(xrange(0)) 22 | x = LazySorted({"foo": 10, "bar": 3, "baz": 9}) 23 | 24 | def test_random_select(self): 25 | """Selection should work once""" 26 | for n in TestLazySorted.test_lengths: 27 | xs = range(n) 28 | for k in xrange(1, n): 29 | for rep in xrange(10): 30 | random.shuffle(xs) 31 | self.assertEqual(LazySorted(xs)[k], k, 32 | msg="xs = %s; k = %d" % (xs, k)) 33 | 34 | def test_multiple_select(self): 35 | """Selection should work many times in a row""" 36 | for n in TestLazySorted.test_lengths: 37 | xs = range(n) 38 | ks = 2 * range(n) # include multiple accesses 39 | for rep in xrange(10): 40 | random.shuffle(xs) 41 | random.shuffle(ks) 42 | ls = LazySorted(xs) 43 | for k in ks: 44 | self.assertEqual(ls[k], k, msg="xs = %s; ks = %s; k = %d" % 45 | (xs, ks, k)) 46 | 47 | def test_len(self): 48 | """the __len__ method and len(.) 
builtin should work""" 49 | for n in TestLazySorted.test_lengths: 50 | xs = range(n) 51 | ls = LazySorted(xs) 52 | self.assertEqual(len(ls), n) 53 | self.assertEqual(ls.__len__(), n) 54 | 55 | def test_select_range(self): 56 | """selecting contiguous forward ranges should work""" 57 | for n in TestLazySorted.test_lengths: 58 | xs = range(n) 59 | for list_rep in xrange(5): 60 | random.shuffle(xs) 61 | ls = LazySorted(xs) 62 | for select_rep in xrange(128): 63 | a, b = random.randrange(n + 1), random.randrange(n + 1) 64 | a, b = min(a, b), max(a, b) 65 | self.assertEqual(ls[a:b], range(a, b), msg="xs = %s; " 66 | "(a, b) = (%d, %d); select_rep = %d" % 67 | (xs, a, b, select_rep)) 68 | 69 | def test_full_range(self): 70 | """selecting slice objects with steps should work""" 71 | for n in TestLazySorted.test_lengths: 72 | xs = range(n) 73 | ys = range(n) 74 | for list_rep in xrange(5): 75 | random.shuffle(xs) 76 | ls = LazySorted(xs) 77 | for select_rep in xrange(16): 78 | a = random.randrange(-n, n + 1) 79 | b = random.randrange(-n, n + 1) 80 | c = random.randrange(1, n + 3) * random.choice([-1, 1]) 81 | self.assertEqual(ls[a:b:c], ys[a:b:c], msg="xs = %s; " 82 | "called ls[%d:%d:%d]" % (xs, a, b, c)) 83 | 84 | def test_step(self): 85 | """selecting slice objects with only a step defined should work""" 86 | steps = [-64, -16, -2, -1, 1, 2, 16, 64] 87 | for n in TestLazySorted.test_lengths: 88 | xs = range(n) 89 | ys = range(n) 90 | for list_rep in xrange(5): 91 | random.shuffle(xs) 92 | ls = LazySorted(xs) 93 | random.shuffle(steps) 94 | for step in steps: 95 | self.assertEqual(ls[::step], ys[::step]) 96 | 97 | def test_between(self): 98 | """the between method should work""" 99 | for n in TestLazySorted.test_lengths: 100 | xs = range(n) 101 | ys = range(n) 102 | for rep in xrange(100): 103 | a = random.randrange(-n, n + 1) 104 | b = random.randrange(-n, n + 1) 105 | 106 | random.shuffle(xs) 107 | ls = LazySorted(xs) 108 | between = ls.between(a, b) 109 | 110 | 
self.assertEqual(len(between), len(ys[a:b]), msg="n = %d; " 111 | "called ls.between(%d, %d)" % (n, a, b)) 112 | self.assertEqual(set(between), set(ys[a:b]), msg="n = %d; " 113 | "called ls.between(%d, %d)" % (n, a, b)) 114 | 115 | def test_README(self): 116 | """the examples in the README should all be correct""" 117 | failures, tests = doctest.testfile('README.md') 118 | self.assertEqual(failures, 0) 119 | 120 | def test_contains(self): 121 | """The __contains__ method and `in' keyword should work""" 122 | for n in TestLazySorted.test_lengths: 123 | xs = range(n) 124 | ys = range(0, n, 5) + [-4, -3, -2, -1, 0, n, n + 1, n + 2, 3.3] 125 | for rep in xrange(10): 126 | random.shuffle(xs) 127 | random.shuffle(ys) 128 | 129 | ls = LazySorted(xs) 130 | for y in ys: 131 | self.assertEqual(y in xs, y in ls, msg="ys = %s; xs = %s" % 132 | (ys, xs)) 133 | 134 | ls = LazySorted(xs) 135 | for y in ys: 136 | self.assertEqual(xs.__contains__(y), ls.__contains__(y), 137 | msg="ys = %s; xs = %s" % (ys, xs)) 138 | 139 | def test_simple_index(self): 140 | """The index method should work""" 141 | for n in TestLazySorted.test_lengths: 142 | xs = range(n) 143 | ys = range(n) 144 | for rep in xrange(5): 145 | random.shuffle(xs) 146 | random.shuffle(ys) 147 | ls = LazySorted(xs) 148 | 149 | for y in ys: 150 | self.assertEqual(ls.index(y), y) 151 | 152 | def test_index_valueerror(self): 153 | """The index method should raise a ValueError if item not in list""" 154 | for n in TestLazySorted.test_lengths: 155 | xs = range(n) 156 | for rep in xrange(5): 157 | random.shuffle(xs) 158 | ls = LazySorted(xs) 159 | 160 | self.assertRaises(ValueError, lambda: ls.index(-1)) 161 | self.assertRaises(ValueError, lambda: ls.index(n)) 162 | self.assertRaises(ValueError, lambda: ls.index(5.5)) 163 | 164 | def test_index_nonunique(self): 165 | """The index method should work in the presence of nonunique items""" 166 | for a in xrange(1, 32): 167 | for b in xrange(1, 32): 168 | xs = a * ["a"] + b * ["b"] 
169 | for rep in xrange(3): 170 | random.shuffle(xs) 171 | ls = LazySorted(xs) 172 | 173 | self.assertEqual(ls.index("b"), a) 174 | self.assertEqual(ls.index("a"), 0) 175 | 176 | for rep in xrange(3): 177 | random.shuffle(xs) 178 | ls = LazySorted(xs) 179 | 180 | self.assertEqual(ls.index("a"), 0) 181 | self.assertEqual(ls.index("b"), a) 182 | 183 | def test_count_nonunique(self): 184 | """The count method should work in the presence of nonunique items""" 185 | for a in xrange(1, 32): 186 | for b in xrange(1, 32): 187 | xs = a * ["a"] + b * ["b"] 188 | for rep in xrange(3): 189 | random.shuffle(xs) 190 | ls = LazySorted(xs) 191 | 192 | self.assertEqual(ls.count("b"), b) 193 | self.assertEqual(ls.count("a"), a) 194 | 195 | for rep in xrange(3): 196 | random.shuffle(xs) 197 | ls = LazySorted(xs) 198 | 199 | self.assertEqual(ls.count("a"), a) 200 | self.assertEqual(ls.count("b"), b) 201 | 202 | def test_count_simple(self): 203 | """The count method should work on simple queries""" 204 | for n in TestLazySorted.test_lengths: 205 | xs = range(n) 206 | ys = range(0, n, 5) + [-4, -3, -2, -1, 0, n, n + 1, n + 2, 3.3] 207 | for rep in xrange(5): 208 | random.shuffle(xs) 209 | random.shuffle(ys) 210 | ls = LazySorted(xs) 211 | for y in ys: 212 | self.assertEqual(ls.count(y), 1 if (isinstance(y, int) and 213 | 0 <= y < n) else 0) 214 | 215 | def test_count_manynonunique(self): 216 | """The count method should work with very many nonunique items""" 217 | for rep in xrange(2000): 218 | items = range(random.randint(1, 50)) 219 | random.shuffle(items) 220 | itemcounts = [random.randint(0, 16) for _ in items] 221 | xs = [y for x in [[i] * itemcounts[i] for i in items] for y in x] 222 | 223 | ls = LazySorted(xs) 224 | for item in items: 225 | self.assertEqual(ls.count(item), itemcounts[item]) 226 | 227 | for n in TestLazySorted.test_lengths: 228 | ls = LazySorted([0] * n) 229 | self.assertEqual(ls.count(0), n) 230 | 231 | def test_sorting(self): 232 | """Iteration should be 
equivalent to sorting""" 233 | for length in TestLazySorted.test_lengths: 234 | items = range(length) 235 | random.shuffle(items) 236 | self.assertEqual(list(LazySorted(items)), range(length)) 237 | 238 | def test_interupted_iter(self): 239 | """Iteration should work even if it's interrupted by other calls""" 240 | for rep in xrange(100): 241 | items = range(512) 242 | random.shuffle(items) 243 | ls = LazySorted(items) 244 | it = iter(ls) 245 | self.assertEqual(list(islice(it, 30)), range(0, 30)) 246 | _ = ls[random.randrange(512)] 247 | _ = random.randrange(-100, 600) in ls 248 | self.assertEqual(list(islice(it, 30)), range(30, 60)) 249 | 250 | def test_reverse(self): 251 | """Reverse iteration should be equivalent to reverse sorting""" 252 | for length in TestLazySorted.test_lengths: 253 | items = range(length) 254 | random.shuffle(items) 255 | self.assertEqual(list(LazySorted(items, reverse=True)), 256 | range(length-1, -1, -1)) 257 | 258 | def test_keys(self): 259 | """Using keys should work fine, with or without reverse""" 260 | for rep in xrange(100): 261 | items = [(random.random(), random.random()) for _ in xrange(256)] 262 | random.shuffle(items) 263 | for reverse in [True, False]: 264 | self.assertEqual(list(LazySorted(items, key=lambda x: x[0])), 265 | sorted(items, key=lambda x: x[0])) 266 | self.assertEqual(list(LazySorted(items, key=lambda x: x[1])), 267 | sorted(items, key=lambda x: x[1])) 268 | 269 | def test_API(self): 270 | """The sorted(...) 
API should be implemented except for cmp""" 271 | xs = range(10) 272 | for tryme in [lambda: LazySorted(xs, reverse="foo"), 273 | lambda: LazySorted(xs, key="foo"), 274 | lambda: LazySorted(xs, reverse=True, key="foo"), 275 | lambda: LazySorted(xs, key=5), 276 | lambda: LazySorted(xs, reverse="foo", key=lambda x: x), 277 | lambda: LazySorted(xs, reverse=True, key=5)]: 278 | self.assertRaises(TypeError, tryme) 279 | 280 | # NB: LazySorted(xs, reverse=1.5) will succeed in python2.6 and down, 281 | # even though it should really fail. This was fixed in python2.7 and 282 | # up. See issue 5080 for details: http://bugs.python.org/issue5080 283 | 284 | # Keyword order shouldn't matter if they're named, but should if not 285 | LazySorted(xs, key=lambda x: x, reverse=False) 286 | LazySorted(xs, reverse=False, key=lambda x: x) 287 | LazySorted(xs, lambda x: x, False) 288 | self.assertRaises(TypeError, lambda: LazySorted(xs, 0, lambda x: x)) 289 | 290 | # You can't call LazySorted without arguments 291 | self.assertRaises(TypeError, lambda: LazySorted()) 292 | 293 | # You can't use a key with the wrong number of arguments 294 | for key in [lambda: "foo", lambda x, y: x + y]: 295 | self.assertRaises(TypeError, lambda: LazySorted(xs, key=key)[3]) 296 | self.assertRaises(TypeError, lambda: LazySorted(xs, key=key)[3]) 297 | 298 | def test_new_init(self): 299 | """Calling just __new__ should give you a working LazySorted object""" 300 | ls = LazySorted.__new__(LazySorted, []) 301 | self.assertRaises(IndexError, lambda: ls[4]) 302 | self.assertRaises(TypeError, lambda: ls["foo"]) 303 | self.assertRaises(ValueError, lambda: ls.index(4)) 304 | self.assertRaises(ValueError, lambda: ls.index("foo")) 305 | self.assertFalse("foo" in ls) 306 | self.assertFalse(4 in ls) 307 | self.assertEqual(ls.count("foo"), 0) 308 | self.assertEqual(ls[0:5], []) 309 | self.assertEqual(len(ls), 0) 310 | self.assertEqual(list(ls), []) 311 | 312 | def test_subclassing(self): 313 | """LazySorted should be 
subclassable""" 314 | class MyLS(LazySorted): 315 | pass 316 | 317 | 318 | if __name__ == "__main__": 319 | unittest.main() 320 | -------------------------------------------------------------------------------- /lazysorted.c: -------------------------------------------------------------------------------- 1 | /* LazySorted objects */ 2 | 3 | #include 4 | #include 5 | 6 | /* Parameters for the sorting function */ 7 | 8 | /* SORT_THRESH: Sort if the sublist has SORT_THRESH or fewer elements */ 9 | #define SORT_THRESH 16 /* Should be at least three because of prefetch */ 10 | 11 | /* CONTIG_THRESH: When computing slices with integer step sizes, sort all data 12 | * between start and stop and then populate the list with it if 13 | * |step| <= CONTIG_THRESH, otherwise select each element individually. 14 | * CONTIG_THRESH should always be bigger than SORT_THRESH */ 15 | #define CONTIG_THRESH 32 16 | 17 | /* Macro definitions to deal different python versions */ 18 | #if PY_MAJOR_VERSION >= 3 19 | #define PyString_FromString PyUnicode_FromString 20 | #define PyString_Format PyUnicode_Format 21 | #define PyInt_FromSsize_t PyLong_FromSsize_t 22 | #endif 23 | 24 | #if PY_VERSION_HEX < 0x03020000 25 | #define PySlice_GetIndicesEx(item, \ 26 | length, start, stop, step, slicelength) \ 27 | PySlice_GetIndicesEx((PySliceObject*)item, \ 28 | length, start, stop, step, slicelength) 29 | #endif 30 | 31 | /* Macros for python2.5 */ 32 | #ifndef PyVarObject_HEAD_INIT 33 | #define PyVarObject_HEAD_INIT(type, size) \ 34 | PyObject_HEAD_INIT(type) size, 35 | #endif 36 | 37 | #ifndef Py_SIZE 38 | #define Py_SIZE(ob) (((PyVarObject*)(ob))->ob_size) 39 | #endif 40 | 41 | #ifndef Py_Type 42 | #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type) 43 | #endif 44 | 45 | /* Macros to support different compilers */ 46 | #if !(defined(__GNUC__) || defined(__clang__)) 47 | #define __builtin_prefetch(x) 48 | #endif 49 | 50 | /* Definitions and functions for the binary search tree of pivot points. 
51 | * The BST implementation is a Treap, selected because of its general speed, 52 | * especially when inserting and removing elements, which happens a lot in this 53 | * application. */ 54 | 55 | typedef struct PivotNode { 56 | Py_ssize_t idx; /* The index it represents */ 57 | int flags; /* Descriptors of the data between pivots */ 58 | int priority; /* Priority in the Treap */ 59 | struct PivotNode *left; 60 | struct PivotNode *right; 61 | struct PivotNode *parent; 62 | } PivotNode; 63 | 64 | /* SORTED_RIGHT means the pivot is to the right of a sorted region. 65 | * SORTED_LEFT means the pivot is the left of a sorted region */ 66 | #define SORTED_RIGHT 1 67 | #define SORTED_LEFT 2 68 | #define UNSORTED 0 69 | #define SORTED_BOTH 3 70 | 71 | /* The LazySorted object */ 72 | typedef struct { 73 | PyObject_HEAD 74 | PyListObject *xs; /* Partially sorted list */ 75 | PivotNode *root; /* Root of the pivot BST */ 76 | PyObject *keyfunc; /* The key function */ 77 | int reverse; /* 1 for reverse order */ 78 | } LSObject; 79 | 80 | static PyTypeObject LS_Type; 81 | #define LSObject_Check(v) (Py_TYPE(v) == &LS_Type) 82 | 83 | /* Returns the next (bigger) pivot, or NULL if it's the last pivot */ 84 | PivotNode * 85 | next_pivot(PivotNode *current) 86 | { 87 | PivotNode *curr = current; 88 | if (curr->right != NULL) { 89 | curr = curr->right; 90 | while (curr->left != NULL) { 91 | curr = curr->left; 92 | } 93 | } 94 | else { 95 | while (curr->parent != NULL && curr->parent->idx < curr->idx) { 96 | curr = curr->parent; 97 | } 98 | 99 | if (curr->parent == NULL) { 100 | return NULL; 101 | } 102 | else { 103 | curr = curr->parent; 104 | } 105 | } 106 | 107 | assert(curr->idx > current->idx); 108 | return curr; 109 | } 110 | 111 | /* A recursive function getting the consistency of a node. 112 | * Does not assume that the node is the root of the tree, and does NOT examine 113 | * the parentage of node. 
This is important, because it is often called on 114 | * nodes whose future parents don't know them yet, like in merge_trees(.) */ 115 | #ifndef NDEBUG 116 | static void 117 | assert_node(PivotNode *node) 118 | { 119 | if (node->left != NULL) { 120 | assert(node->left->idx < node->idx); 121 | assert(node->left->priority <= node->priority); 122 | assert(node->left->parent == node); 123 | assert_node(node->left); 124 | } 125 | if (node->right != NULL) { 126 | assert(node->right->idx > node->idx); 127 | assert(node->right->priority <= node->priority); 128 | assert(node->right->parent == node); 129 | assert_node(node->right); 130 | } 131 | } 132 | 133 | /* A series of assert statements that the tree structure is consistent */ 134 | static void 135 | assert_tree(PivotNode *root) 136 | { 137 | assert(root != NULL); 138 | assert(root->parent == NULL); 139 | assert_node(root); 140 | } 141 | 142 | /* A series of assert statements that the tree's flags are consistent */ 143 | static void 144 | assert_tree_flags(PivotNode *root) 145 | { 146 | PivotNode *prev = NULL; 147 | PivotNode *curr = root; 148 | while (curr->left != NULL) 149 | curr = curr->left; 150 | while (curr != NULL) { 151 | if (curr->flags & SORTED_LEFT) 152 | assert(next_pivot(curr)->flags & SORTED_RIGHT); 153 | if (curr->flags & SORTED_RIGHT) 154 | assert(prev->flags & SORTED_LEFT); 155 | 156 | prev = curr; 157 | curr = next_pivot(curr); 158 | } 159 | } 160 | #else 161 | /* Silences -Wunused-parameter */ 162 | #define assert_node(x) 163 | #define assert_tree(x) 164 | #define assert_tree_flags(x) 165 | #endif 166 | 167 | 168 | /* Inserts an index, returning a pointer to the node, or NULL on error. 169 | * *root is the root of the tree, while start is the node to insert from. 
170 | */ 171 | static PivotNode *insert_pivot(Py_ssize_t, int, PivotNode **, PivotNode *) 172 | Py_GCC_ATTRIBUTE((warn_unused_result)); 173 | 174 | static PivotNode * 175 | insert_pivot(Py_ssize_t k, int flags, PivotNode **root, PivotNode *start) 176 | { 177 | /* Build the node */ 178 | PivotNode *node = (PivotNode *)PyMem_Malloc(sizeof(PivotNode)); 179 | if (node == NULL) 180 | return (PivotNode *)PyErr_NoMemory(); 181 | node->idx = k; 182 | node->flags = flags; 183 | node->priority = rand(); 184 | node->left = NULL; 185 | node->right = NULL; 186 | 187 | /* Special case the empty tree */ 188 | if (*root == NULL) { 189 | node->parent = NULL; 190 | *root = node; 191 | return node; 192 | } 193 | 194 | /* Put the node in its sorted order */ 195 | PivotNode *current = start; 196 | while (1) { 197 | if (current->idx < k) { 198 | if (current->right == NULL) { 199 | current->right = node; 200 | node->parent = current; 201 | break; 202 | } 203 | current = current->right; 204 | } 205 | else if (current->idx > k) { 206 | if (current->left == NULL) { 207 | current->left = node; 208 | node->parent = current; 209 | break; 210 | } 211 | current = current->left; 212 | } 213 | else { 214 | /* The pivot BST should always have unique pivots */ 215 | PyErr_SetString(PyExc_SystemError, "All pivots must be unique"); 216 | return NULL; 217 | } 218 | } 219 | 220 | /* Reestablish the treap invariant if necessary by tree rotations */ 221 | PivotNode *child, *parent, *grandparent; 222 | while (node->priority > node->parent->priority) { 223 | /* (parent) (node) 224 | * / \ 225 | * / \ 226 | * / \ 227 | * (node) -> (parent) 228 | * \ / 229 | * \ / 230 | * (child) (child) 231 | */ 232 | if (node->idx < node->parent->idx) { 233 | child = node->right; 234 | parent = node->parent; 235 | grandparent = parent->parent; 236 | 237 | node->parent = grandparent; 238 | node->right = parent; 239 | parent->parent = node; 240 | parent->left = child; 241 | if (child != NULL) 242 | child->parent = parent; 243 
| } 244 | /* (parent) (node) 245 | * \ / 246 | * \ / 247 | * \ / 248 | * (node) -> (parent) 249 | * / \ 250 | * / \ 251 | * (child) (child) 252 | */ 253 | else { 254 | child = node->left; 255 | parent = node->parent; 256 | grandparent = parent->parent; 257 | 258 | node->parent = grandparent; 259 | node->left = parent; 260 | parent->parent = node; 261 | parent->right = child; 262 | if (child != NULL) 263 | child->parent = parent; 264 | } 265 | 266 | /* Adjust node->parent's child pointer to point to node */ 267 | if (node->parent != NULL) { 268 | if (k < node->parent->idx) { 269 | node->parent->left = node; 270 | } 271 | else { 272 | node->parent->right = node; 273 | } 274 | } 275 | else { /* The node has propogated up to the root */ 276 | *root = node; 277 | break; 278 | } 279 | } 280 | 281 | assert_tree(*root); 282 | assert_tree_flags(*root); 283 | return node; 284 | } 285 | 286 | /* Takes two trees and merges them into one while preserving the treap 287 | * invariant. left must have a smaller index than right. 
*/ 288 | static PivotNode * 289 | merge_trees(PivotNode *left, PivotNode *right) 290 | { 291 | assert(left != NULL || right != NULL); 292 | 293 | if (left == NULL) 294 | return right; 295 | if (right == NULL) 296 | return left; 297 | 298 | assert(left->parent == right->parent); 299 | assert(left->idx < right->idx); 300 | assert_node(left); 301 | assert_node(right); 302 | 303 | if (left->priority > right->priority) { 304 | right->parent = left; 305 | left->right = merge_trees(left->right, right); 306 | 307 | assert_node(left); 308 | return left; 309 | } 310 | else { 311 | left->parent = right; 312 | right->left = merge_trees(left, right->left); 313 | 314 | assert_node(right); 315 | return right; 316 | } 317 | } 318 | 319 | static void 320 | delete_node(PivotNode *node, PivotNode **root) 321 | { 322 | assert_tree(*root); 323 | 324 | if (node->left == NULL) { 325 | /* node has at most one child in node->right, so we just have the 326 | * grandparent adopt it, if node is not the root. If node is the root, 327 | * we promote the child to root. 
*/ 328 | if (node->parent != NULL) { 329 | if (node->parent->left == node) { 330 | node->parent->left = node->right; 331 | } 332 | else { 333 | node->parent->right = node->right; 334 | } 335 | } 336 | else { /* Node is the root */ 337 | *root = node->right; 338 | } 339 | 340 | if (node->right != NULL) { 341 | node->right->parent = node->parent; 342 | } 343 | 344 | PyMem_Free(node); 345 | } 346 | else { 347 | if (node->right == NULL) { 348 | /* node has a single child in node->left, so have grandparent 349 | * adopt it as above */ 350 | if (node->parent != NULL) { 351 | if (node->parent->left == node) { 352 | node->parent->left = node->left; 353 | } 354 | else { 355 | node->parent->right = node->left; 356 | } 357 | } 358 | else { /* Node is the root */ 359 | *root = node->left; 360 | } 361 | 362 | /* node->left is not NULL because of the outer if-else statement */ 363 | node->left->parent = node->parent; 364 | 365 | PyMem_Free(node); 366 | } 367 | else { 368 | /* The hard case: node has two children. 
We merge the two children 369 | * into one treap, and then replace node by this treap */ 370 | PivotNode *children = merge_trees(node->left, node->right); 371 | 372 | if (node->parent != NULL) { 373 | if (node->parent->left == node) { 374 | node->parent->left = children; 375 | } 376 | else { 377 | node->parent->right = children; 378 | } 379 | } 380 | else { /* Node is the root */ 381 | *root = children; 382 | } 383 | 384 | /* children is not NULL since node has two children */ 385 | children->parent = node->parent; 386 | 387 | PyMem_Free(node); 388 | } 389 | } 390 | 391 | assert_tree(*root); 392 | } 393 | 394 | /* If a sorted pivot is between two sorted section, removes the sorted pivot */ 395 | static void 396 | depivot(PivotNode *left, PivotNode *right, PivotNode **root) 397 | { 398 | assert_tree(*root); 399 | assert_tree_flags(*root); 400 | assert(left->flags & SORTED_LEFT); 401 | assert(right->flags & SORTED_RIGHT); 402 | 403 | if (left->flags & SORTED_RIGHT) { 404 | delete_node(left, root); 405 | } 406 | 407 | if (right->flags & SORTED_LEFT) { 408 | delete_node(right, root); 409 | } 410 | 411 | assert_tree(*root); 412 | assert_tree_flags(*root); 413 | } 414 | 415 | /* If the value at middle is equal to the value at left, left is removed. 416 | * If the value at middle is equal to the value at right, right is removed. 
417 | * Returns 0 on success, or -1 on failure */ 418 | 419 | static int uniq_pivots(PivotNode *, PivotNode *, PivotNode *, LSObject *) 420 | Py_GCC_ATTRIBUTE((warn_unused_result)); 421 | 422 | static int 423 | uniq_pivots(PivotNode *left, PivotNode *middle, PivotNode *right, LSObject *ls) 424 | { 425 | assert_tree(ls->root); 426 | assert_tree_flags(ls->root); 427 | assert(left->idx < middle->idx && middle->idx < right->idx); 428 | int cmp; 429 | 430 | if (left->idx >= 0) { 431 | if ((cmp = PyObject_RichCompareBool(ls->xs->ob_item[left->idx], 432 | ls->xs->ob_item[middle->idx], 433 | Py_EQ)) < 0) { 434 | return -1; 435 | } 436 | else if (cmp) { 437 | middle->flags = left->flags; 438 | delete_node(left, &ls->root); 439 | } 440 | } 441 | 442 | if (right->idx < Py_SIZE(ls->xs)) { 443 | if ((cmp = PyObject_RichCompareBool(ls->xs->ob_item[middle->idx], 444 | ls->xs->ob_item[right->idx], 445 | Py_EQ)) < 0) { 446 | return -1; 447 | } 448 | else if (cmp) { 449 | middle->flags = right->flags; 450 | delete_node(right, &ls->root); 451 | } 452 | } 453 | 454 | assert_tree(ls->root); 455 | assert_tree_flags(ls->root); 456 | return 0; 457 | } 458 | 459 | /* Finds PivotNodes left and right that bound the index */ 460 | /* Never returns k in right_node, only the left, if applicable */ 461 | static void 462 | bound_idx(Py_ssize_t k, PivotNode *root, PivotNode **left, PivotNode **right) 463 | { 464 | assert_tree(root); 465 | assert_tree_flags(root); 466 | 467 | *left = NULL; 468 | *right = NULL; 469 | PivotNode *current = root; 470 | while (current != NULL) { 471 | if (current->idx < k) { 472 | *left = current; 473 | current = current->right; 474 | } 475 | else if (current->idx > k) { 476 | *right = current; 477 | current = current->left; 478 | } 479 | else { 480 | *left = current; 481 | break; 482 | } 483 | } 484 | 485 | assert(*left != NULL && ((*left)->idx == k || *right != NULL)); 486 | assert((*left)->idx == k || *right == next_pivot(*left)); 487 | } 488 | 489 | static void 490 
| free_tree(PivotNode *root) 491 | { 492 | assert_node(root); /* might not be the actual root because of recursion */ 493 | 494 | if (root->left != NULL) 495 | free_tree(root->left); 496 | if (root->right != NULL) 497 | free_tree(root->right); 498 | 499 | PyMem_Free(root); 500 | } 501 | 502 | static void 503 | LS_dealloc(LSObject *self) 504 | { 505 | Py_DECREF(self->xs); 506 | Py_XDECREF(self->keyfunc); 507 | if (self->root != NULL) { 508 | free_tree(self->root); 509 | } 510 | Py_TYPE(self)->tp_free((PyObject*)self); 511 | } 512 | 513 | static PyObject * 514 | newLSObject(PyTypeObject *type, PyObject *args, PyObject *kwds) 515 | { 516 | LSObject *self; 517 | PyListObject *xs; 518 | PyObject *sequence = NULL; 519 | PyObject *keyfunc = NULL; 520 | int reverse = 0; 521 | static char *kwdlist[] = {"sequence", "key", "reverse", 0}; 522 | 523 | if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|Oi:LazySorted", 524 | kwdlist, &sequence, &keyfunc, &reverse)) 525 | return NULL; 526 | 527 | PyObject *list_args = Py_BuildValue("(O)", sequence); 528 | if (list_args == NULL) 529 | return NULL; 530 | 531 | xs = (PyListObject *)PyList_Type.tp_new(&PyList_Type, list_args, NULL); 532 | if (xs == NULL) { 533 | Py_DECREF(list_args); 534 | return NULL; 535 | } 536 | 537 | if (PyList_Type.tp_init((PyObject *)xs, list_args, NULL)) { 538 | Py_DECREF(list_args); 539 | Py_DECREF(xs); 540 | return NULL; 541 | } 542 | Py_DECREF(list_args); 543 | 544 | self = (LSObject *)type->tp_alloc(type, 0); 545 | if (self == NULL) { 546 | Py_DECREF(xs); 547 | return NULL; 548 | } 549 | self->root = NULL; 550 | self->keyfunc = NULL; 551 | self->reverse = 0; 552 | self->xs = xs; 553 | 554 | if (insert_pivot(-1, UNSORTED, &self->root, self->root) == NULL) { 555 | Py_DECREF(self); 556 | return NULL; 557 | } 558 | 559 | if (insert_pivot(Py_SIZE(xs), UNSORTED, &self->root, self->root) == NULL) { 560 | Py_DECREF(self); 561 | return NULL; 562 | } 563 | 564 | if (reverse) 565 | self->reverse = 1; 566 | 567 | if 
(keyfunc == Py_None) 568 | keyfunc = NULL; 569 | 570 | if (keyfunc != NULL) { 571 | /* Since we sort lazily, we wouldn't discover that the key isn't 572 | * callable until we actually attempted sorting. So let's try to help 573 | * the user by failing fast if this is the case. */ 574 | if (!PyCallable_Check(keyfunc)) { 575 | PyErr_SetString(PyExc_TypeError, "key must be callable"); 576 | Py_DECREF(self); 577 | return NULL; 578 | } 579 | self->keyfunc = keyfunc; 580 | Py_INCREF(self->keyfunc); 581 | } 582 | 583 | return (PyObject *)self; 584 | } 585 | 586 | /* Private helper functions for partial sorting */ 587 | 588 | /* These macros are basically taken from list.c 589 | * Returns 1 if x < y, 0 if x >= y, and -1 on error */ 590 | /* #define ISLT(X, Y) PyObject_RichCompareBool(X, Y, Py_LT) */ 591 | 592 | static inline int islt(PyObject *, PyObject *, LSObject *) 593 | Py_GCC_ATTRIBUTE((warn_unused_result)); 594 | 595 | static inline int 596 | islt(PyObject *x, PyObject *y, LSObject *ls) 597 | { 598 | if (ls->keyfunc != NULL) { 599 | PyObject *x_cmp, *y_cmp; 600 | 601 | PyObject *x_arg = Py_BuildValue("(O)", x); 602 | x_cmp = PyObject_CallObject(ls->keyfunc, x_arg); 603 | Py_DECREF(x_arg); 604 | if (x_cmp == NULL) { 605 | return -1; 606 | } 607 | 608 | PyObject *y_arg = Py_BuildValue("(O)", y); 609 | y_cmp = PyObject_CallObject(ls->keyfunc, y_arg); 610 | Py_DECREF(y_arg); 611 | if (y_cmp == NULL) { 612 | Py_DECREF(x_cmp); 613 | return -1; 614 | } 615 | 616 | int res = ls->reverse ? PyObject_RichCompareBool(x_cmp, y_cmp, Py_GT) 617 | : PyObject_RichCompareBool(x_cmp, y_cmp, Py_LT); 618 | 619 | Py_DECREF(x_cmp); 620 | Py_DECREF(y_cmp); 621 | return res; 622 | } else { 623 | return ls->reverse ? 
PyObject_RichCompareBool(x, y, Py_GT) 624 | : PyObject_RichCompareBool(x, y, Py_LT); 625 | } 626 | } 627 | 628 | #define IFLT(X, Y) if ((ltflag = islt(X, Y, ls)) < 0) goto fail; \ 629 | if(ltflag) 630 | 631 | /* N.B: No semicolon at the end, so that you can include one yourself */ 632 | #define SWAP(i, j) tmp = ob_item[i]; \ 633 | ob_item[i] = ob_item[j]; \ 634 | ob_item[j] = tmp 635 | 636 | /* Picks a pivot point among the indices left <= i < right. Returns -1 on 637 | * error */ 638 | 639 | static Py_ssize_t pick_pivot(LSObject *, Py_ssize_t, Py_ssize_t) 640 | Py_GCC_ATTRIBUTE((warn_unused_result)); 641 | 642 | static Py_ssize_t 643 | pick_pivot(LSObject *ls, Py_ssize_t left, Py_ssize_t right) 644 | { 645 | PyObject **ob_item = ls->xs->ob_item; 646 | 647 | /* Use median of three trick */ 648 | Py_ssize_t idx1 = left + rand() % (right - left); 649 | Py_ssize_t idx2 = left + rand() % (right - left); 650 | Py_ssize_t idx3 = left + rand() % (right - left); 651 | 652 | int ltflag; 653 | IFLT(ob_item[idx1], ob_item[idx3]) { 654 | IFLT(ob_item[idx1], ob_item[idx2]) { 655 | /* 1 2 3 vs. 
1 3 2 */ 656 | IFLT(ob_item[idx2], ob_item[idx3]) { 657 | return idx2; 658 | } 659 | else { 660 | return idx3; 661 | } 662 | } 663 | else { 664 | /* 2 1 3 */ 665 | return idx1; 666 | } 667 | } 668 | else { 669 | IFLT(ob_item[idx3], ob_item[idx2]) { 670 | /* 3 1 2 vs 3 2 1 */ 671 | IFLT(ob_item[idx1], ob_item[idx2]) { 672 | return idx1; 673 | } 674 | else { 675 | return idx2; 676 | } 677 | } 678 | else { 679 | /* 2 3 1 */ 680 | return idx3; 681 | } 682 | } 683 | 684 | fail: 685 | return -1; 686 | } 687 | 688 | /* Partitions the data between left and right into 689 | * [less than region | greater or equal to region] 690 | * and returns the pivot index, or -1 on error */ 691 | static Py_ssize_t partition(LSObject *, Py_ssize_t, Py_ssize_t) 692 | Py_GCC_ATTRIBUTE((warn_unused_result)); 693 | 694 | static Py_ssize_t 695 | partition(LSObject *ls, Py_ssize_t left, Py_ssize_t right) 696 | { 697 | PyObject **ob_item = ls->xs->ob_item; 698 | 699 | PyObject *tmp; /* Used by SWAP macro */ 700 | PyObject *pivot; 701 | int ltflag; 702 | 703 | Py_ssize_t piv_idx = pick_pivot(ls, left, right); 704 | if (piv_idx < 0) { 705 | return -1; 706 | } 707 | pivot = ob_item[piv_idx]; 708 | 709 | SWAP(left, piv_idx); 710 | Py_ssize_t last_less = left; 711 | 712 | /* Invariant: last_less and everything to its left is less than 713 | * pivot or the pivot itself */ 714 | 715 | Py_ssize_t i; 716 | for (i = left + 1; i < right - 3; i++) { 717 | /* 718 | This single line boosts performance by a factor of around 2 on GCC. 719 | The optimal lookahead distance i+3 was chosen by experimentation. 
720 | See http://www.naftaliharris.com/blog/2x-speedup-with-one-line-of-code/ 721 | */ 722 | __builtin_prefetch(ob_item[i+3]); 723 | IFLT(ob_item[i], pivot) { 724 | last_less++; 725 | SWAP(i, last_less); 726 | } 727 | } 728 | assert(right - left >= 3); /* partition isn't called on small lists */ 729 | for (i = right - 3; i < right; i++) { 730 | IFLT(ob_item[i], pivot) { 731 | last_less++; 732 | SWAP(i, last_less); 733 | } 734 | } 735 | 736 | SWAP(left, last_less); 737 | return last_less; 738 | 739 | fail: /* From IFLT macro */ 740 | return -1; 741 | } 742 | 743 | /* Runs insertion sort on the items left <= i < right */ 744 | static int insertion_sort(LSObject *, Py_ssize_t, Py_ssize_t) 745 | Py_GCC_ATTRIBUTE((warn_unused_result)); 746 | 747 | static int 748 | insertion_sort(LSObject *ls, Py_ssize_t left, Py_ssize_t right) 749 | { 750 | PyObject **ob_item = ls->xs->ob_item; 751 | 752 | PyObject *tmp; 753 | Py_ssize_t i, j; 754 | 755 | for (i = left; i < right; i++) { 756 | tmp = ob_item[i]; 757 | int ltflag = 0; 758 | for (j = i; j > 0 && (ltflag = islt(tmp, ob_item[j - 1], ls)) > 0; j--) 759 | ob_item[j] = ob_item[j - 1]; 760 | ob_item[j] = tmp; 761 | if (ltflag < 0) { 762 | return -1; 763 | } 764 | } 765 | return 0; 766 | } 767 | 768 | /* Runs quicksort on the items left <= i < right, returning 0 on success 769 | * or -1 on error. Does not affect stored pivots at all. 
*/ 770 | static int quick_sort(LSObject *, Py_ssize_t, Py_ssize_t) 771 | Py_GCC_ATTRIBUTE((warn_unused_result)); 772 | 773 | static int 774 | quick_sort(LSObject *ls, Py_ssize_t left, Py_ssize_t right) 775 | { 776 | if (right - left <= SORT_THRESH) { 777 | return insertion_sort(ls, left, right); 778 | } 779 | 780 | Py_ssize_t piv_idx = partition(ls, left, right); 781 | if (piv_idx < 0) 782 | return -1; 783 | 784 | if (quick_sort(ls, left, piv_idx) < 0) 785 | return -1; 786 | 787 | if (quick_sort(ls, piv_idx + 1, right) < 0) 788 | return -1; 789 | 790 | return 0; 791 | } 792 | 793 | /* Sorts the list ls sufficiently such that ls->xs->ob_item[k] is actually the 794 | * kth value in sorted order. Returns 0 on success and -1 on error. */ 795 | static int sort_point(LSObject *, Py_ssize_t) 796 | Py_GCC_ATTRIBUTE((warn_unused_result)); 797 | 798 | static int 799 | sort_point(LSObject *ls, Py_ssize_t k) 800 | { 801 | /* Find the best possible bounds */ 802 | PivotNode *left, *right, *middle; 803 | bound_idx(k, ls->root, &left, &right); 804 | 805 | /* bound_idx never returns k in right, but right might be NULL if 806 | * left->idx == k, so check left->idx first. 
*/ 807 | if (left->idx == k || right->flags & SORTED_RIGHT) { 808 | return 0; 809 | } 810 | 811 | /* Run quickselect */ 812 | Py_ssize_t piv_idx; 813 | 814 | while (left->idx + 1 + SORT_THRESH <= right->idx) { 815 | piv_idx = partition(ls, left->idx + 1, right->idx); 816 | if (piv_idx < 0) { 817 | return -1; 818 | } 819 | if (piv_idx < k) { 820 | if (left->right == NULL) { 821 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, left); 822 | } 823 | else { 824 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, right); 825 | } 826 | if (middle == NULL) 827 | return -1; 828 | 829 | if (uniq_pivots(left, middle, right, ls) < 0) return -1; 830 | left = middle; 831 | } 832 | else if (piv_idx > k) { 833 | if (left->right == NULL) { 834 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, left); 835 | } 836 | else { 837 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, right); 838 | } 839 | if (middle == NULL) 840 | return -1; 841 | 842 | if (uniq_pivots(left, middle, right, ls) < 0) return -1; 843 | right = middle; 844 | } 845 | else { 846 | if (left->right == NULL) { 847 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, left); 848 | } 849 | else { 850 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, right); 851 | } 852 | if (middle == NULL) 853 | return -1; 854 | 855 | if (uniq_pivots(left, middle, right, ls) < 0) return -1; 856 | return 0; 857 | } 858 | } 859 | 860 | if (insertion_sort(ls, left->idx + 1, right->idx) < 0) { 861 | return -1; 862 | } 863 | left->flags |= SORTED_LEFT; 864 | right->flags |= SORTED_RIGHT; 865 | depivot(left, right, &ls->root); 866 | 867 | return 0; 868 | } 869 | 870 | /* Sorts the list ls sufficiently such that everything between indices start 871 | * and stop is in sorted order. Returns 0 on success and -1 on error. 
*/ 872 | static int sort_range(LSObject *, Py_ssize_t, Py_ssize_t) 873 | Py_GCC_ATTRIBUTE((warn_unused_result)); 874 | 875 | static int 876 | sort_range(LSObject *ls, Py_ssize_t start, Py_ssize_t stop) 877 | { 878 | /* The xs list is always partially sorted, with pivots partioning up the 879 | * space, like in this picture: 880 | * 881 | * | ~~~~~ | ~~~ | ~~~~~ | ~~ | ~~~~~~~ | 882 | * 883 | * '|' indicates a pivot and '~~' indicates unsorted data. 884 | * 885 | * So we iterate through the regions bounding our data, and sort them. 886 | */ 887 | 888 | assert(0 <= start && start < stop && stop <= Py_SIZE(ls->xs)); 889 | 890 | if (sort_point(ls, start) < 0) 891 | return -1; 892 | if (sort_point(ls, stop) < 0) 893 | return -1; 894 | 895 | PivotNode *current, *next; 896 | bound_idx(start, ls->root, ¤t, &next); 897 | if (current->idx == start) 898 | next = next_pivot(current); 899 | 900 | while (current->idx < stop) { 901 | if (current->flags & SORTED_LEFT) { 902 | assert(next->flags & SORTED_RIGHT); 903 | } 904 | else { 905 | /* Since we are sorting the entire region, we don't need to keep 906 | * track of pivots, and so we can use vanilla quicksort */ 907 | if (quick_sort(ls, current->idx + 1, next->idx) < 0) { 908 | return -1; 909 | } 910 | current->flags |= SORTED_LEFT; 911 | next->flags |= SORTED_RIGHT; 912 | } 913 | 914 | if (current->flags & SORTED_RIGHT) { 915 | delete_node(current, &ls->root); 916 | } 917 | 918 | current = next; 919 | next = next_pivot(current); 920 | } 921 | 922 | assert(current->flags & SORTED_RIGHT); 923 | if (current->flags & SORTED_LEFT) { 924 | delete_node(current, &ls->root); 925 | } 926 | 927 | return 0; 928 | } 929 | 930 | /* Returns the first index of item in the list, or -2 on error, or -1 if item 931 | * is not present. Places item in that first idx, but makes no guarantees 932 | * any duplicate versions of item will immediately follow. 
Eg, it's possible 933 | * calling find_item on some list with item = 1 will result in the following 934 | * list: 935 | * [0, 0, 0, 1, 2, 2, 1, 2, 1, 1, 2] 936 | */ 937 | static Py_ssize_t find_item(LSObject *, PyObject *) 938 | Py_GCC_ATTRIBUTE((warn_unused_result)); 939 | 940 | static Py_ssize_t 941 | find_item(LSObject *ls, PyObject *item) 942 | { 943 | PivotNode *left = NULL; 944 | PivotNode *right = NULL; 945 | PivotNode *middle; 946 | PivotNode *current = ls->root; 947 | int ltflag; 948 | Py_ssize_t xs_len = Py_SIZE(ls->xs); 949 | Py_ssize_t left_idx, right_idx; 950 | 951 | while (current != NULL) { 952 | if (current->idx == -1) { 953 | left = current; 954 | current = current->right; 955 | } 956 | else if (current->idx == xs_len) { 957 | right = current; 958 | current = current->left; 959 | } 960 | else { 961 | IFLT(ls->xs->ob_item[current->idx], item) { 962 | left = current; 963 | current = current->right; 964 | } 965 | else { 966 | right = current; 967 | current = current->left; 968 | } 969 | } 970 | } 971 | 972 | if (left->flags & SORTED_LEFT) { 973 | assert(right->flags & SORTED_RIGHT); 974 | left_idx = left->idx + 1; 975 | right_idx = right->idx == xs_len ? 
xs_len : right->idx + 1; 976 | } 977 | else { 978 | Py_ssize_t piv_idx; 979 | while (left->idx + 1 + SORT_THRESH <= right->idx) { 980 | if ((piv_idx = partition(ls, left->idx + 1, right->idx)) < 0) { 981 | return -2; 982 | } 983 | IFLT(ls->xs->ob_item[piv_idx], item) { 984 | if (left->right == NULL) { 985 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, left); 986 | } 987 | else { 988 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, right); 989 | } 990 | if (middle == NULL) 991 | return -2; 992 | 993 | if (uniq_pivots(left, middle, right, ls) < 0) return -1; 994 | left = middle; 995 | } 996 | else { 997 | if (left->right == NULL) { 998 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, left); 999 | } 1000 | else { 1001 | middle = insert_pivot(piv_idx, UNSORTED, &ls->root, right); 1002 | } 1003 | if (middle == NULL) 1004 | return -2; 1005 | 1006 | if (uniq_pivots(left, middle, right, ls) < 0) return -1; 1007 | right = middle; 1008 | } 1009 | } 1010 | 1011 | left_idx = left->idx + 1; 1012 | right_idx = right->idx == xs_len ? 
xs_len : right->idx + 1; 1013 | 1014 | if (insertion_sort(ls, left->idx + 1, right->idx) < 0) { 1015 | return -2; 1016 | } 1017 | left->flags |= SORTED_LEFT; 1018 | right->flags |= SORTED_RIGHT; 1019 | depivot(left, right, &ls->root); 1020 | } 1021 | 1022 | /* TODO: Do binary search now */ 1023 | Py_ssize_t k; 1024 | int cmp = 0; 1025 | for (k = left_idx; cmp == 0 && k < right_idx; k++) { 1026 | cmp = PyObject_RichCompareBool(item, ls->xs->ob_item[k], Py_EQ); 1027 | } 1028 | 1029 | if (cmp < 0) { 1030 | return -2; 1031 | } 1032 | else if (cmp == 0) { 1033 | return -1; 1034 | } 1035 | else { 1036 | return k - 1; /* -1 since incremented in for loop */ 1037 | } 1038 | 1039 | fail: 1040 | return -2; 1041 | } 1042 | 1043 | /* Public facing LazySorted methods */ 1044 | 1045 | static PyObject *idxerr = NULL; 1046 | 1047 | static PyObject * 1048 | ls_subscript(LSObject* self, PyObject* item) 1049 | { 1050 | Py_ssize_t xs_len = Py_SIZE(self->xs); 1051 | 1052 | if (PyIndex_Check(item)) { 1053 | Py_ssize_t k; 1054 | k = PyNumber_AsSsize_t(item, PyExc_IndexError); 1055 | if (k == -1 && PyErr_Occurred()) 1056 | return NULL; 1057 | if (k < 0) 1058 | k += xs_len; 1059 | 1060 | if (k < 0 || k >= xs_len) { 1061 | if (idxerr == NULL) { 1062 | idxerr = PyString_FromString("LazySorted index out of range"); 1063 | if (idxerr == NULL) 1064 | return NULL; 1065 | } 1066 | PyErr_SetObject(PyExc_IndexError, idxerr); 1067 | return NULL; 1068 | } 1069 | 1070 | if (sort_point(self, k) < 0) 1071 | return NULL; 1072 | 1073 | Py_INCREF(self->xs->ob_item[k]); 1074 | return self->xs->ob_item[k]; 1075 | } 1076 | else if (PySlice_Check(item)) { 1077 | Py_ssize_t start, stop, step, slicelength; 1078 | 1079 | if (PySlice_GetIndicesEx(item, Py_SIZE(self->xs), 1080 | &start, &stop, &step, &slicelength) < 0) { 1081 | return NULL; 1082 | } 1083 | 1084 | if (slicelength <= 0) { 1085 | return PyList_New(0); 1086 | } 1087 | else if (-CONTIG_THRESH <= step && step <= CONTIG_THRESH) { 1088 | Py_ssize_t left = 
start < stop ? start : stop; 1089 | Py_ssize_t right = start < stop ? stop : start; 1090 | 1091 | if (step < 0) { 1092 | left++; 1093 | right++; 1094 | } 1095 | 1096 | if (sort_range(self, left, right) < 0) { 1097 | return NULL; 1098 | } 1099 | 1100 | PyListObject *result = (PyListObject *)PyList_New(slicelength); 1101 | if (result == NULL) 1102 | return NULL; 1103 | 1104 | Py_ssize_t k, j; 1105 | for (k = start, j = 0; j < slicelength; k += step, j++) { 1106 | Py_INCREF(self->xs->ob_item[k]); 1107 | result->ob_item[j] = self->xs->ob_item[k]; 1108 | } 1109 | 1110 | return (PyObject *)result; 1111 | } 1112 | else { 1113 | PyListObject *result = (PyListObject *)PyList_New(slicelength); 1114 | if (result == NULL) 1115 | return NULL; 1116 | 1117 | Py_ssize_t k, j; 1118 | for (k = start, j = 0; j < slicelength; k += step, j++) { 1119 | if (sort_point(self, k) < 0) 1120 | return NULL; 1121 | Py_INCREF(self->xs->ob_item[k]); 1122 | result->ob_item[j] = self->xs->ob_item[k]; 1123 | } 1124 | 1125 | return (PyObject *)result; 1126 | } 1127 | } 1128 | else { 1129 | PyErr_Format(PyExc_TypeError, 1130 | "list indices must be integers, not %.200s", 1131 | item->ob_type->tp_name); 1132 | return NULL; 1133 | } 1134 | } 1135 | 1136 | /* Returns (possibly unsorted) data in a specified contiguous range */ 1137 | static PyObject * 1138 | between(LSObject *self, PyObject *args) 1139 | { 1140 | Py_ssize_t left; 1141 | Py_ssize_t right; 1142 | 1143 | if (!PyArg_ParseTuple(args, "nn:list", &left, &right)) 1144 | return NULL; 1145 | 1146 | Py_ssize_t xlen = Py_SIZE(self->xs); 1147 | if (left < 0) { 1148 | left += xlen; 1149 | } 1150 | else if (left > xlen) { 1151 | left = xlen; 1152 | } 1153 | 1154 | if (right < 0) { 1155 | right += xlen; 1156 | } 1157 | else if (right > xlen) { 1158 | right = xlen; 1159 | } 1160 | 1161 | if (left >= right || right <= 0) { 1162 | return PyList_New(0); 1163 | } 1164 | 1165 | if (left != 0 && sort_point(self, left) < 0) 1166 | return NULL; 1167 | if (right 
!= xlen && sort_point(self, right) < 0) 1168 | return NULL; 1169 | 1170 | PyListObject *result = (PyListObject *)PyList_New(right - left); 1171 | if (result == NULL) 1172 | return NULL; 1173 | 1174 | Py_ssize_t k; 1175 | for (k = left; k < right; k++) { 1176 | Py_INCREF(self->xs->ob_item[k]); 1177 | result->ob_item[k - left] = self->xs->ob_item[k]; 1178 | } 1179 | 1180 | return (PyObject *)result; 1181 | } 1182 | 1183 | static PyObject * 1184 | ls_index(LSObject *self, PyObject *args) 1185 | { 1186 | PyObject *item; 1187 | if (!PyArg_ParseTuple(args, "O:list", &item)) 1188 | return NULL; 1189 | 1190 | Py_ssize_t index = find_item(self, item); 1191 | if (index == -2) { 1192 | return NULL; 1193 | } 1194 | else if (index == -1) { 1195 | PyObject *err_format, *err_string, *format_tuple; 1196 | err_format = PyString_FromString("%r is not in list"); 1197 | if (err_format == NULL) 1198 | return NULL; 1199 | format_tuple = PyTuple_Pack(1, item); 1200 | if (format_tuple == NULL) 1201 | return NULL; 1202 | err_string = PyString_Format(err_format, format_tuple); 1203 | Py_DECREF(format_tuple); 1204 | if (err_string == NULL) 1205 | return NULL; 1206 | PyErr_SetObject(PyExc_ValueError, err_string); 1207 | Py_DECREF(err_string); 1208 | return NULL; 1209 | } 1210 | else { 1211 | return PyInt_FromSsize_t(index); 1212 | } 1213 | } 1214 | 1215 | static PyObject * 1216 | ls_count(LSObject *self, PyObject *args) 1217 | { 1218 | PyObject *item; 1219 | if (!PyArg_ParseTuple(args, "O:list", &item)) 1220 | return NULL; 1221 | 1222 | Py_ssize_t k = find_item(self, item); 1223 | if (k == -2) { 1224 | return NULL; 1225 | } 1226 | 1227 | if (k == -1) { 1228 | return PyInt_FromSsize_t(0); 1229 | } 1230 | else { 1231 | /* Figure out where items may be */ 1232 | PivotNode *left, *right; 1233 | bound_idx(k, self->root, &left, &right); 1234 | if (right == NULL) { 1235 | right = next_pivot(left); 1236 | } 1237 | 1238 | int xs_len = Py_SIZE(self->xs); 1239 | int cmp; 1240 | for (cmp = 1; right->idx 
< xs_len && cmp; right = next_pivot(right)) { 1241 | cmp = PyObject_RichCompareBool(item, 1242 | self->xs->ob_item[right->idx], 1243 | Py_EQ); 1244 | if (cmp < 0) { 1245 | return NULL; 1246 | } 1247 | } 1248 | 1249 | /* TODO: do some additional sorting here to take advantage of the 1250 | * compares. Or refactor the code substantially or something. */ 1251 | Py_ssize_t count = 1; 1252 | for (k++; k < right->idx; k++) { 1253 | cmp = PyObject_RichCompareBool(item, self->xs->ob_item[k], Py_EQ); 1254 | if (cmp < 0) { 1255 | return NULL; 1256 | } 1257 | else if (cmp) { 1258 | count++; 1259 | } 1260 | } 1261 | return PyInt_FromSsize_t(count); 1262 | } 1263 | } 1264 | 1265 | static int 1266 | ls_contains(LSObject *self, PyObject *item) 1267 | { 1268 | Py_ssize_t idx = find_item(self, item); 1269 | if (idx == -2) { 1270 | return -1; 1271 | } 1272 | else if (idx == -1) { 1273 | return 0; 1274 | } 1275 | else { 1276 | return 1; 1277 | } 1278 | } 1279 | 1280 | static PyObject * 1281 | ls_pivots(LSObject *self) 1282 | { 1283 | PyObject *result = PyList_New(0); 1284 | if (result == NULL) 1285 | return NULL; 1286 | 1287 | PyObject *unsorted = PyString_FromString("UNSORTED"); 1288 | if (unsorted == NULL) 1289 | return NULL; 1290 | PyObject *sortedright = PyString_FromString("SORTED_RIGHT"); 1291 | if (sortedright == NULL) 1292 | return NULL; 1293 | PyObject *sortedleft = PyString_FromString("SORTED_LEFT"); 1294 | if (sortedleft == NULL) 1295 | return NULL; 1296 | PyObject *sortedboth = PyString_FromString("SORTED_BOTH"); 1297 | if (sortedboth == NULL) 1298 | return NULL; 1299 | 1300 | PyObject *flags[4] = {unsorted, sortedright, sortedleft, sortedboth}; 1301 | 1302 | PivotNode *curr = self->root; 1303 | while (curr->left != NULL) 1304 | curr = curr->left; 1305 | 1306 | Py_ssize_t i; 1307 | PyObject *index; 1308 | PyObject *tuple; 1309 | for (i = 0; curr != NULL; i++, curr = next_pivot(curr)) { 1310 | index = PyInt_FromSsize_t(curr->idx); 1311 | if (index == NULL) { 1312 | 
Py_DECREF(result); 1313 | return NULL; 1314 | } 1315 | tuple = PyTuple_Pack(2, index, flags[curr->flags]); 1316 | if (tuple == NULL) { 1317 | Py_DECREF(index); 1318 | Py_DECREF(result); 1319 | return NULL; 1320 | } 1321 | PyList_Append(result, tuple); 1322 | } 1323 | 1324 | return (PyObject*)result; 1325 | } 1326 | 1327 | static Py_ssize_t 1328 | ls_length(LSObject *self) 1329 | { 1330 | return Py_SIZE(self->xs); 1331 | } 1332 | 1333 | /* The LazySorted iterator object */ 1334 | /* TODO: Be a little smarter here, (keep track of pivots, etc) */ 1335 | typedef struct { 1336 | PyObject_HEAD 1337 | LSObject *ls; /* The referenced lazysorted object */ 1338 | Py_ssize_t i; /* The next location to check */ 1339 | } LSIterObject; 1340 | 1341 | static PyTypeObject LSIter_Type; 1342 | #define LSIterObject_Check(v) (Py_TYPE(v) == &LSIter_Type) 1343 | 1344 | static PyMethodDef LSIterObject_methods[] = { 1345 | {NULL, NULL} /* sentinel */ 1346 | }; 1347 | 1348 | PyObject* 1349 | LSObject_iter(PyObject *self) 1350 | { 1351 | LSIterObject *it; 1352 | 1353 | if (!LSObject_Check(self)) { 1354 | PyErr_BadInternalCall(); 1355 | return NULL; 1356 | } 1357 | it = PyObject_GC_New(LSIterObject, &LSIter_Type); 1358 | if (it == NULL) 1359 | return NULL; 1360 | it->i = 0; 1361 | Py_INCREF(self); 1362 | it->ls = (LSObject *)self; 1363 | 1364 | return (PyObject *)it; 1365 | } 1366 | 1367 | static void 1368 | LSIterObject_dealloc(LSIterObject *it) 1369 | { 1370 | Py_XDECREF(it->ls); 1371 | PyObject_GC_Del(it); 1372 | } 1373 | 1374 | PyObject* 1375 | LSObject_iternext(PyObject *self) 1376 | { 1377 | LSIterObject *lsi = (LSIterObject *)self; 1378 | if (lsi->i < ls_length(lsi->ls)) { 1379 | if (sort_point(lsi->ls, lsi->i) < 0) { 1380 | return NULL; 1381 | } 1382 | PyObject *res = lsi->ls->xs->ob_item[lsi->i]; 1383 | Py_INCREF(res); 1384 | (lsi->i)++; 1385 | return res; 1386 | } else { 1387 | PyErr_SetNone(PyExc_StopIteration); 1388 | return NULL; 1389 | } 1390 | } 1391 | 1392 | static 
/* Type object for the iterator returned by iter(LazySorted).
 *
 * Statically initialised with &PyType_Type as its type.  The flags are
 * Py_TPFLAGS_DEFAULT only (no Py_TPFLAGS_HAVE_GC) and tp_traverse/tp_clear
 * are unset.  Slots after tp_members are left to C's implicit
 * zero-initialisation.  The slot names in the comments follow the Python 2
 * layout (e.g. tp_compare).
 */
static PyTypeObject LSIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "LazySortedIterator",               /* tp_name */
    sizeof(LSIterObject),               /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)LSIterObject_dealloc,   /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_compare */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    0,                                  /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter: an iterator is its own iterator */
    (iternextfunc)LSObject_iternext,    /* tp_iternext */
    LSIterObject_methods,               /* tp_methods */
    0,                                  /* tp_members */
};


/* Python-visible methods of LazySorted.
 *
 * Each entry is {name, C implementation, calling convention, docstring}.
 * __getitem__ is registered with METH_COEXIST so that this explicit entry
 * is used for attribute lookup of "__getitem__" alongside the mapping
 * slot, which points at the same ls_subscript function.
 */
/* TODO: This documentation sucks */
static PyMethodDef LS_methods[] = {
    {"__getitem__", (PyCFunction)ls_subscript, METH_O|METH_COEXIST,
        PyDoc_STR(
"__getitem__ allows you to access items from the LazySorted instance. It is\n"
"equivalent to the subscript notation, ie, LS.__getitem__(x) is the same\n"
"thing as LS[x], where x is integer or slice object.\n"
"When __getitem__ is called, it sorts the internal list only enough so that\n"
"it can get your query, and then returns it.\n"
"\n"
"Examples:\n"
" >>> xs = range(100)\n"
" >>> random.shuffle(xs)\n"
" >>> ls = LazySorted(xs)\n"
" >>> ls[26]\n"
" 26\n"
" >>> ls[5:10]\n"
" [5, 6, 7, 8, 9]\n"
" >>> ls[::20]\n"
" [0, 20, 40, 60, 80]"
        )},
    {"between", (PyCFunction)between, METH_VARARGS,
        PyDoc_STR(
"between allows you to access all points that are between particular\n"
"indices. The order of the points it returns, however, is undefined. This is\n"
"useful if you want to throw away outliers in some data set, for example.\n"
"\n"
"Examples:\n\n"
" >>> xs = range(100)\n"
" >>> random.shuffle(xs)\n"
" >>> ls = LazySorted(xs)\n"
" >>> set(ls.between(5, 95)) == set(range(5, 95))\n"
" True"
        )},
    {"index", (PyCFunction)ls_index, METH_VARARGS,
        PyDoc_STR(
"Returns the first index of item in the list, or raises a ValueError if it\n"
"isn't present"
        )},
    {"count", (PyCFunction)ls_count, METH_VARARGS,
        PyDoc_STR(
"Returns the number of times the item appears in the list"
        )},
    /* Leading underscore: debugging aid, not part of the public interface */
    {"_pivots", (PyCFunction)ls_pivots, METH_NOARGS,
        PyDoc_STR(
"Returns the list of pivot indices, for debugging"
        )},
    {NULL,              NULL}           /* sentinel */
};

/* Sequence protocol for LazySorted: only len() and the `in` operator are
 * provided here; every other sequence slot is left empty. */
static PySequenceMethods ls_as_sequence = {
    (lenfunc)ls_length,                 /* sq_length */
    0,                                  /* sq_concat */
    0,                                  /* sq_repeat */
    0,                                  /* sq_item */
    0,                                  /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)ls_contains,            /* sq_contains */
    0,                                  /* sq_inplace_concat */
    0,                                  /* sq_inplace_repeat */
};
/* Mapping protocol for LazySorted: length and read-only subscripting
 * (ls[x]).  The item-assignment slot is NULL. */
static PyMappingMethods ls_as_mapping = {
    (lenfunc)ls_length,                 /* mp_length */
    (binaryfunc)ls_subscript,           /* mp_subscript */
    NULL,                               /* mp_ass_subscript */
};

/* Type object for lazysorted.LazySorted.
 *
 * Wires together the sequence and mapping protocol tables, the iterator
 * factory (tp_iter), the method table and the constructor (tp_new).
 * Py_TPFLAGS_BASETYPE is set, so the type may be subclassed from Python.
 */
static PyTypeObject LS_Type = {
    /* The ob_type field must be initialized in the module init function
     * to be portable to Windows without using C++. */
    PyVarObject_HEAD_INIT(NULL, 0)
    "lazysorted.LazySorted",/*tp_name*/
    sizeof(LSObject),       /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    (destructor)LS_dealloc, /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    &ls_as_sequence,        /*tp_as_sequence*/
    &ls_as_mapping,         /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT |
        Py_TPFLAGS_BASETYPE, /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    LSObject_iter,          /*tp_iter*/
    0,                      /*tp_iternext*/
    LS_methods,             /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    PyType_GenericAlloc,    /*tp_alloc*/
    newLSObject,            /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};

/* List of functions defined in the module */
/* (empty: the module exposes only the LazySorted type) */
static PyMethodDef ls_methods[] = {
    {NULL,              NULL}           /* sentinel */
};
It\n" 1549 | "presents the programmer with the abstraction that they are actually working\n" 1550 | "with a sorted list, when in fact the list is only physically sorted when\n" 1551 | "the programmer requests elements from it, and even then it is only sorted\n" 1552 | "partially just enough to return whatever was requested.\n" 1553 | "\n" 1554 | "The LazySorted object has a constructor that implements the same interface\n" 1555 | "as the builtin `sorted(...)` function, and it supports most of the non-\n" 1556 | "mutating methods of a python list.\n" 1557 | ); 1558 | 1559 | /* Initialization function for the module */ 1560 | #if PY_MAJOR_VERSION >= 3 1561 | PyMODINIT_FUNC 1562 | PyInit_lazysorted(void) 1563 | { 1564 | srand(time(NULL)); 1565 | 1566 | PyObject *m; 1567 | 1568 | /* Finalize the type object including setting type of the new type 1569 | * object; doing it here is required for portability, too. */ 1570 | 1571 | if (PyType_Ready(&LS_Type) < 0) 1572 | return NULL; 1573 | 1574 | /* Create the module and add the functions */ 1575 | static struct PyModuleDef moduledef = { 1576 | PyModuleDef_HEAD_INIT, 1577 | "lazysorted", /* m_name */ 1578 | module_doc, /* m_doc */ 1579 | -1, /* m_size */ 1580 | ls_methods, /* m_methods */ 1581 | NULL, /* m_reload */ 1582 | NULL, /* m_traverse */ 1583 | NULL, /* m_clear */ 1584 | NULL, /* m_free */ 1585 | }; 1586 | m = PyModule_Create(&moduledef); 1587 | if (m == NULL) 1588 | return NULL; 1589 | 1590 | PyModule_AddObject(m, "LazySorted", (PyObject *)&LS_Type); 1591 | return m; 1592 | } 1593 | #else 1594 | PyMODINIT_FUNC 1595 | initlazysorted(void) 1596 | { 1597 | srand(time(NULL)); 1598 | 1599 | PyObject *m; 1600 | 1601 | /* Finalize the type object including setting type of the new type 1602 | * object; doing it here is required for portability, too. 
*/ 1603 | 1604 | if (PyType_Ready(&LS_Type) < 0) 1605 | return; 1606 | 1607 | /* Create the module and add the functions */ 1608 | m = Py_InitModule3("lazysorted", ls_methods, module_doc); 1609 | if (m == NULL) 1610 | return; 1611 | 1612 | PyModule_AddObject(m, "LazySorted", (PyObject *)&LS_Type); 1613 | return; 1614 | } 1615 | #endif 1616 | --------------------------------------------------------------------------------