├── redis_import_tools
│   ├── __init__.py
│   ├── examples
│   │   ├── set-vs-list.txt
│   │   ├── redis-import-set_csv.py
│   │   ├── redis-import-set_groupby.py
│   │   ├── redis-import-set_split.py
│   │   ├── redis-import-set_zipfile.py
│   │   └── one_grams.py
│   └── commands.py
├── requirements.pip
├── .gitignore
├── setup.py
├── bin
│   └── redis-import-set
├── LICENSE
└── README.rst

--------------------------------------------------------------------------------
/redis_import_tools/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.pip:
--------------------------------------------------------------------------------
redis==2.2.0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.egg-info
*.pyc
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='redis-import-tools',
    version='0.1',
    long_description=__doc__,
    packages=['redis_import_tools'],
    include_package_data=True,
    zip_safe=False,
    #install_requires=['redis==2.2.0'],
    scripts=['bin/redis-import-set']
)
--------------------------------------------------------------------------------
/redis_import_tools/examples/set-vs-list.txt:
--------------------------------------------------------------------------------
List import is twice as fast as set import:

brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ time redis-import-set set test2 < /tmp/activity_ids.txt

real    4m5.133s
user    3m44.330s
sys     0m1.730s
brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ time redis-import-set list testlist < /tmp/activity_ids.txt

real    2m10.471s
user    1m57.740s
sys     0m1.820s
brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ redis-cli scard test2
(integer) 4617374
brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ redis-cli llen testlist
(integer) 4617374
--------------------------------------------------------------------------------
/redis_import_tools/examples/redis-import-set_csv.py:
--------------------------------------------------------------------------------
# redis-import-set: SADD the first column of tab-delimited stdin, batched
# through a pipeline
import sys
from csv import reader

import redis


if __name__ == '__main__':

    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    try:
        keyname = sys.argv[1]
    except IndexError:
        raise Exception("You must specify the name for the Set")

    for line in reader(sys.stdin, delimiter='\t'):
        pipeline_redis.sadd(keyname, line[0])
        count += 1
        if not count % 10000:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()
--------------------------------------------------------------------------------
/redis_import_tools/examples/redis-import-set_groupby.py:
--------------------------------------------------------------------------------
# redis-import-set: collapse consecutive duplicate values with groupby so each
# distinct value in sorted input triggers only one SADD
import sys
from csv import reader
from itertools import groupby

import redis


if __name__ == '__main__':

    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    try:
        keyname = sys.argv[1]
    except IndexError:
        raise Exception("You must specify the name for the Set")

    for k, _ in groupby(reader(sys.stdin, delimiter='\t'),
                        lambda x: x[0]):
        pipeline_redis.sadd(keyname, k)
        count += 1
        if not count % 10000:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()
--------------------------------------------------------------------------------
for the Set") 18 | 19 | for k, _ in groupby(reader(sys.stdin, delimiter='\t'), 20 | lambda x:x[0]): 21 | pipeline_redis.sadd(keyname, k) 22 | count += 1 23 | if not count % 10000: 24 | pipeline_redis.execute() 25 | 26 | 27 | -------------------------------------------------------------------------------- /redis_import_tools/examples/redis-import-set_split.py: -------------------------------------------------------------------------------- 1 | # redis-import-set 2 | import fileinput 3 | import sys 4 | from csv import reader 5 | from itertools import count, islice 6 | 7 | import redis 8 | 9 | 10 | if __name__ == '__main__': 11 | 12 | r = redis.Redis() 13 | pipeline_redis = r.pipeline() 14 | count = 0 15 | try: 16 | keyname = sys.argv[1] 17 | except IndexError: 18 | raise Exception("You must specify the name for the Set") 19 | 20 | for line in groupby(sys.stdin, 21 | lambda x: 22 | pipeline_redis.sadd(keyname, line.split('\t')[0]) 23 | count += 1 24 | if not count % 10000: 25 | pipeline_redis.execute() 26 | 27 | 28 | -------------------------------------------------------------------------------- /redis_import_tools/examples/redis-import-set_zipfile.py: -------------------------------------------------------------------------------- 1 | # redis-import-set 2 | import sys 3 | from itertools import count, groupby 4 | from zipfile import ZipFile 5 | 6 | import redis 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | r = redis.Redis() 12 | pipeline_redis = r.pipeline() 13 | count = 0 14 | try: 15 | keyname = sys.argv[1] 16 | except IndexError: 17 | raise Exception("You must specify the name for the Set") 18 | 19 | zf = ZipFile(sys.stdin) 20 | arch = zf.namelist()[0] 21 | 22 | for k, _ in groupby(zf.open(arch), 23 | lambda x:x.split('\t')[0]): 24 | pipeline_redis.sadd(keyname, k) 25 | count += 1 26 | if not count % 10000: 27 | pipeline_redis.execute() 28 | 29 | 30 | -------------------------------------------------------------------------------- /bin/redis-import-set: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import sys 4 | 5 | from redis_import_tools import commands 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser( 10 | description='Tools for importing data into Redis.') 11 | 12 | parser.add_argument('-b', '--batch-size', default=10000, 13 | help='number of SADD operations to send per pipeline batch') 14 | 15 | #parser.add_argument('-u', '--unsorted', action="store_true", 16 | # help='indicates that the input is unsorted') 17 | 18 | parser.add_argument('cmd', 19 | help='Redis command') 20 | parser.add_argument('key', 21 | help='name of the key for the command') 22 | 23 | options = parser.parse_args() 24 | 25 | cmd = options.cmd 26 | if cmd == 'set': 27 | commands.load_set(options.key, sys.stdin, batch_size=options.batch_size) 28 | elif cmd == 'list': 29 | commands.load_list(options.key, sys.stdin, batch_size=options.batch_size) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Brian Luft 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and 
/redis_import_tools/examples/one_grams.py:
--------------------------------------------------------------------------------
# Benchmark: time a groupby pass over the rows of a Google Books 1-gram archive
import os
import time
from csv import reader
from glob import glob
from itertools import groupby
from optparse import OptionParser
from zipfile import ZipFile


if __name__ == '__main__':

    parser = OptionParser()
    parser.add_option('-d', '--datadir',
                      help="Directory containing data files",
                      metavar="DIR")

    options, args = parser.parse_args()

    if not options.datadir:
        #TODO: create a temp dir and fetch the data
        parser.error("-d/--datadir is required for now")
    else:
        datadir = options.datadir

    #for onegram_archive in glob(os.path.join(datadir, '*.zip')):
    t0 = time.clock()
    onegram_archive = glob(os.path.join(datadir, '*.zip'))[0]
    archive_lines = 0
    zf = ZipFile(onegram_archive)
    archive_name = zf.namelist()[0]
    print archive_name
    for gram, _ in groupby(reader(zf.open(archive_name), delimiter='\t'),
                           lambda x: x[0]):
        archive_lines += 1
        if archive_lines % 100000 == 0:
            print archive_lines
            print time.clock() - t0
            t0 = time.clock()
--------------------------------------------------------------------------------
/redis_import_tools/commands.py:
--------------------------------------------------------------------------------
"""wrappers for Redis insert commands"""
from csv import reader
from itertools import groupby

import redis


def load_set(key, IN, **kwargs):
    """Add the first column of tab-delimited input to the Set named by key."""
    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    batch_size = kwargs.get('batch_size', 1000)

    #seed with None so rows with no columns are silently skipped
    seen = set([None])
    for member, _ in groupby(reader(IN, delimiter='\t'),
                             lambda x: x[0] if len(x) else None):
        if member not in seen:
            pipeline_redis.sadd(key, member.rstrip())
            count += 1
            seen.add(member)
            if not count % batch_size:
                pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()


def load_list(key, IN, **kwargs):
    """RPUSH each line of input onto the List named by key."""
    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    batch_size = kwargs.get('batch_size', 1000)

    for line in IN:
        pipeline_redis.rpush(key, line.rstrip())
        count += 1
        if not count % batch_size:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()


def load_hash_list(IN, **kwargs):
    """HMSET each (key, mapping) pair from an iterable of 2-tuples."""
    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    batch_size = kwargs.get('batch_size', 1000)

    for key, mapping in IN:
        pipeline_redis.hmset(key, mapping)
        count += 1
        if not count % batch_size:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
redis-import-tools
==================

A collection of utilities for importing data into Redis

Commands
--------

redis-import-set
    Create a Redis Set from a column of values in a text file


Installation
------------

It is assumed that you have Redis version >= 1.3 installed and configured.

::

    pip install redis-import-tools


Introduction
------------

Let's start with a trivial example. We'd like to load our local words dictionary into a Redis set. One approach might be::

    $ cat /usr/share/dict/words \
      | xargs -d "\n" -I word redis-cli sadd engdict word > /dev/null

We're piping the contents of the dictionary file to ``xargs``, which will run an ``SADD`` command for each
word in the dictionary and add it to a set called ``engdict``.

While the one-liner is nice, the performance is terrible::

    real    5m36.977s
    user    0m3.560s
    sys     1m17.490s

We're making one query to Redis for each word in the dictionary, and there are::

    $ wc -l /usr/share/dict/words
    98569 /usr/share/dict/words

98569 words in the dictionary. Five minutes to get roughly 100,000 requests into Redis isn't acceptable. Also, it looks like some words didn't make
it into the set::

    $ redis-cli
    redis> scard engdict
    (integer) 95426


One obvious place for improvement is to use the Redis pipelining feature to cut down significantly on the number of requests made.
redis-cli is a convenience tool, and I suspect it wasn't designed for a workload that forks a new process per command. By building on
the solid redis-py client library, we can come up with some basic utilities that offer great performance and some flexibility
for populating Redis from data sources such as CSV/TSV files.

With code like the following, we can send data to Redis in batches (10000 values per request)::

    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    for line in sys.stdin:
        pipeline_redis.sadd(keyname, line.rstrip())
        count += 1
        if not count % 10000:
            pipeline_redis.execute()
    pipeline_redis.execute()

This code is the basic idea behind the redis-import-set command. Here's how to use the command to perform the desired operation::

    $ redis-import-set set engdict < /usr/share/dict/words

Performance is now very acceptable::

    real    0m2.838s
    user    0m2.530s
    sys     0m0.050s

And the set count matches the input count::

    redis> scard engdict
    (integer) 98569


About Filtering, Sorting, and Compression...
--------------------------------------------

Often you will be starting with an input set that contains extraneous data (columns and/or rows you won't need).

The performance characteristics of these data processing steps can vary depending on where they are handled. For example,
streaming through Python's ZipFile appeared to add significant overhead in the scenarios I tested. I may add some basic fallbacks
for decompressing the input source in the future, and I'm still deciding which features I want to support for I/O with
compressed/archive data formats. Since decompression is always a first step, it is easiest to rely on the native OS
compression tools and let the redis-import-tools commands assume (uncompressed) textual input.

The csv module seems to add a minimal amount of overhead, so leaving it in is worthwhile: it is probably more robust than
naively using ``string.split()``.
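To see what that robustness buys, here is a small, contrived illustration (the sample row is invented):
``csv.reader`` understands quoted fields that contain the delimiter, while a naive split truncates them::

    from StringIO import StringIO
    from csv import reader

    # a TSV row whose first (quoted) field contains an embedded tab
    row = '"one\tgram"\t1900\t42\n'

    print next(reader(StringIO(row), delimiter='\t'))[0]  # one<TAB>gram
    print row.split('\t')[0]                              # "one  (truncated)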
Case Study
----------

Using the corpus of English 1-grams from Google Books: TODO

The first archive contains 29232733 rows, of which 420044 are unique.

Extract the first column into a file, which we'll later use as input to redis-import-set::

    $ time unzip -p googlebooks-eng-us-all-1gram-20090715-0.csv.zip | cut -f 1 | uniq > eng-us-all-1gram-0-uniq-grams

    real    0m12.706s
    user    0m15.820s
    sys     0m1.460s

    $ wc -l eng-us-all-1gram-0-uniq-grams
    420044 eng-us-all-1gram-0-uniq-grams

Import to a Redis set called ``1g``::

    $ time redis-import-set set 1g < eng-us-all-1gram-0-uniq-grams

    real    0m12.995s
    user    0m11.130s
    sys     0m0.120s

Let's see how it fares if the input has duplicates::

    $ unzip -p googlebooks-eng-us-all-1gram-20090715-0.csv.zip | cut -f 1 > eng-us-all-1gram-0-grams
    $ time redis-import-set set 1g < eng-us-all-1gram-0-grams

    real    0m31.068s
    user    0m28.910s
    sys     0m0.160s

    $ wc -l eng-us-all-1gram-0-grams
    29232733 eng-us-all-1gram-0-grams

Internally, redis-import-set uses ``itertools.groupby`` to avoid sending redundant ``SADD`` operations for repeated
entries.
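Sorted input arrives with its duplicates adjacent, so ``groupby`` collapses each run of equal keys into a single
group and each distinct value costs only one ``SADD``. A minimal sketch with made-up rows::

    from itertools import groupby

    rows = [['aa', '1'], ['aa', '2'], ['ab', '1'], ['ac', '1'], ['ac', '2']]

    # three groups: aa, ab, ac -- one SADD each instead of five
    for member, _ in groupby(rows, lambda x: x[0]):
        print member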
Here are the results for just using the raw CSV file, taking advantage of the ``redis-import-set`` behavior of
defaulting to the first column::

    $ unzip googlebooks-eng-us-all-1gram-20090715-0.csv.zip
    $ time redis-import-set set 1g < googlebooks-eng-us-all-1gram-20090715-0.csv

    real    0m39.420s
    user    0m37.200s
    sys     0m0.360s

This is good considering that the input with duplicates is 70x bigger, yet the execution time only tripled
compared to the unique-inputs case.

What happens if we try to process unsorted data with many duplicates? The groupby filter won't have any effect,
and consequently we'll be sending many more requests than needed, containing mostly redundant data. To illustrate,
we'll cut the years column out of the corpus file, giving us a huge input containing only a few hundred
distinct values::

    # Slice out the years column from the corpus
    $ time unzip -p googlebooks-eng-us-all-1gram-20090715-0.csv.zip | cut -f 2 > eng-us-all-1gram-0-years

    real    0m14.114s
    user    0m13.190s
    sys     0m1.320s

    $ wc -l eng-us-all-1gram-0-years
    29232733 eng-us-all-1gram-0-years

    $ time redis-import-set set years < eng-us-all-1gram-0-years

    real    13m50.783s
    user    12m39.700s
    sys     0m4.450s

Ouch! This is problematic. However, we can still work around it by using a Python set internally to track which
items we've already sent to ``SADD``. After making this change, we have::

    $ time redis-import-set set years < eng-us-all-1gram-0-years

    real    0m26.108s
    user    0m25.970s
    sys     0m0.060s

Back in business. For many inputs the distinct count will be a small percentage of the total, but in general it
won't be desirable to automatically cache set members in the command. A forthcoming change will require
a command line argument to signify that the input is unsorted and that the cache should be used.
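The cache is only a few extra lines layered onto the groupby loop; essentially what ``load_set`` in
``commands.py`` now does::

    seen = set()
    for member, _ in groupby(reader(IN, delimiter='\t'),
                             lambda x: x[0] if len(x) else None):
        if member is not None and member not in seen:
            pipeline_redis.sadd(key, member.rstrip())
            seen.add(member)   # remember it so repeats are never re-sent

The commented-out ``-u``/``--unsorted`` option in ``bin/redis-import-set`` is the placeholder for that switch.
--------------------------------------------------------------------------------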