├── redis_import_tools
│   ├── __init__.py
│   ├── examples
│   │   ├── set-vs-list.txt
│   │   ├── redis-import-set_csv.py
│   │   ├── redis-import-set_groupby.py
│   │   ├── redis-import-set_split.py
│   │   ├── redis-import-set_zipfile.py
│   │   └── one_grams.py
│   └── commands.py
├── requirements.pip
├── .gitignore
├── setup.py
├── bin
│   └── redis-import-set
├── LICENSE
└── README.rst

--------------------------------------------------------------------------------
/redis_import_tools/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.pip:
--------------------------------------------------------------------------------
redis==2.2.0
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.egg-info
*.pyc
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='redis-import-tools',
    version='0.1',
    long_description=__doc__,
    packages=['redis_import_tools'],
    include_package_data=True,
    zip_safe=False,
    #install_requires=['redis==2.2.0'],
    scripts=['bin/redis-import-set']
)
--------------------------------------------------------------------------------
/redis_import_tools/examples/set-vs-list.txt:
--------------------------------------------------------------------------------
List import is twice as fast as set import:

brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ time redis-import-set set test2 < /tmp/activity_ids.txt

real    4m5.133s
user    3m44.330s
sys     0m1.730s
brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ time redis-import-set list testlist < /tmp/activity_ids.txt

real    2m10.471s
user    1m57.740s
sys     0m1.820s
brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ redis-cli scard test2
(integer) 4617374
brian@airtank:~/work/gamesradar/gamesradar/gamesradar$ redis-cli llen testlist
(integer) 4617374
--------------------------------------------------------------------------------
/redis_import_tools/examples/redis-import-set_csv.py:
--------------------------------------------------------------------------------
# redis-import-set: SADD the first column of tab-delimited stdin, batched
# through a pipeline
import sys
from csv import reader

import redis


if __name__ == '__main__':

    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    try:
        keyname = sys.argv[1]
    except IndexError:
        raise Exception("You must specify the name for the Set")

    for line in reader(sys.stdin, delimiter='\t'):
        pipeline_redis.sadd(keyname, line[0])
        count += 1
        if not count % 10000:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()
--------------------------------------------------------------------------------
/redis_import_tools/examples/redis-import-set_groupby.py:
--------------------------------------------------------------------------------
# redis-import-set: collapse consecutive duplicate values with groupby so each
# distinct value in sorted input triggers only one SADD
import sys
from csv import reader
from itertools import groupby

import redis


if __name__ == '__main__':

    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    try:
        keyname = sys.argv[1]
    except IndexError:
        raise Exception("You must specify the name for the Set")

    for k, _ in groupby(reader(sys.stdin, delimiter='\t'),
                        lambda x: x[0]):
        pipeline_redis.sadd(keyname, k)
        count += 1
        if not count % 10000:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()
--------------------------------------------------------------------------------
for the Set") 18 | 19 | for k, _ in groupby(reader(sys.stdin, delimiter='\t'), 20 | lambda x:x[0]): 21 | pipeline_redis.sadd(keyname, k) 22 | count += 1 23 | if not count % 10000: 24 | pipeline_redis.execute() 25 | 26 | 27 | -------------------------------------------------------------------------------- /redis_import_tools/examples/redis-import-set_split.py: -------------------------------------------------------------------------------- 1 | # redis-import-set 2 | import fileinput 3 | import sys 4 | from csv import reader 5 | from itertools import count, islice 6 | 7 | import redis 8 | 9 | 10 | if __name__ == '__main__': 11 | 12 | r = redis.Redis() 13 | pipeline_redis = r.pipeline() 14 | count = 0 15 | try: 16 | keyname = sys.argv[1] 17 | except IndexError: 18 | raise Exception("You must specify the name for the Set") 19 | 20 | for line in groupby(sys.stdin, 21 | lambda x: 22 | pipeline_redis.sadd(keyname, line.split('\t')[0]) 23 | count += 1 24 | if not count % 10000: 25 | pipeline_redis.execute() 26 | 27 | 28 | -------------------------------------------------------------------------------- /redis_import_tools/examples/redis-import-set_zipfile.py: -------------------------------------------------------------------------------- 1 | # redis-import-set 2 | import sys 3 | from itertools import count, groupby 4 | from zipfile import ZipFile 5 | 6 | import redis 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | r = redis.Redis() 12 | pipeline_redis = r.pipeline() 13 | count = 0 14 | try: 15 | keyname = sys.argv[1] 16 | except IndexError: 17 | raise Exception("You must specify the name for the Set") 18 | 19 | zf = ZipFile(sys.stdin) 20 | arch = zf.namelist()[0] 21 | 22 | for k, _ in groupby(zf.open(arch), 23 | lambda x:x.split('\t')[0]): 24 | pipeline_redis.sadd(keyname, k) 25 | count += 1 26 | if not count % 10000: 27 | pipeline_redis.execute() 28 | 29 | 30 | -------------------------------------------------------------------------------- /bin/redis-import-set: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import sys 4 | 5 | from redis_import_tools import commands 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser( 10 | description='Tools for importing data into Redis.') 11 | 12 | parser.add_argument('-b', '--batch-size', default=10000, 13 | help='number of SADD operations to send per pipeline batch') 14 | 15 | #parser.add_argument('-u', '--unsorted', action="store_true", 16 | # help='indicates that the input is unsorted') 17 | 18 | parser.add_argument('cmd', 19 | help='Redis command') 20 | parser.add_argument('key', 21 | help='name of the key for the command') 22 | 23 | options = parser.parse_args() 24 | 25 | cmd = options.cmd 26 | if cmd == 'set': 27 | commands.load_set(options.key, sys.stdin, batch_size=options.batch_size) 28 | elif cmd == 'list': 29 | commands.load_list(options.key, sys.stdin, batch_size=options.batch_size) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Brian Luft 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and 
/redis_import_tools/examples/one_grams.py:
--------------------------------------------------------------------------------
# Benchmark: time a groupby pass over the rows of a Google Books 1-gram archive
import os
import time
from csv import reader
from glob import glob
from itertools import groupby
from optparse import OptionParser
from zipfile import ZipFile


if __name__ == '__main__':

    parser = OptionParser()
    parser.add_option('-d', '--datadir',
                      help="Directory containing data files",
                      metavar="DIR")

    options, args = parser.parse_args()

    if not options.datadir:
        #TODO: create a temp dir and fetch the data
        parser.error("-d/--datadir is required for now")
    else:
        datadir = options.datadir

    #for onegram_archive in glob(os.path.join(datadir, '*.zip')):
    t0 = time.clock()
    onegram_archive = glob(os.path.join(datadir, '*.zip'))[0]
    archive_lines = 0
    zf = ZipFile(onegram_archive)
    archive_name = zf.namelist()[0]
    print archive_name
    for gram, _ in groupby(reader(zf.open(archive_name), delimiter='\t'),
                           lambda x: x[0]):
        archive_lines += 1
        if archive_lines % 100000 == 0:
            print archive_lines
            print time.clock() - t0
            t0 = time.clock()
--------------------------------------------------------------------------------
/redis_import_tools/commands.py:
--------------------------------------------------------------------------------
"""wrappers for Redis insert commands"""
from csv import reader
from itertools import groupby

import redis


def load_set(key, IN, **kwargs):
    """Add the first column of tab-delimited input to the Set named by key."""
    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    batch_size = kwargs.get('batch_size', 1000)

    #seed with None so rows with no columns are silently skipped
    seen = set([None])
    for member, _ in groupby(reader(IN, delimiter='\t'),
                             lambda x: x[0] if len(x) else None):
        if member not in seen:
            pipeline_redis.sadd(key, member.rstrip())
            count += 1
            seen.add(member)
            if not count % batch_size:
                pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()


def load_list(key, IN, **kwargs):
    """RPUSH each line of input onto the List named by key."""
    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    batch_size = kwargs.get('batch_size', 1000)

    for line in IN:
        pipeline_redis.rpush(key, line.rstrip())
        count += 1
        if not count % batch_size:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()


def load_hash_list(IN, **kwargs):
    """HMSET each (key, mapping) pair from an iterable of 2-tuples."""
    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    batch_size = kwargs.get('batch_size', 1000)

    for key, mapping in IN:
        pipeline_redis.hmset(key, mapping)
        count += 1
        if not count % batch_size:
            pipeline_redis.execute()
    #send the last batch
    pipeline_redis.execute()
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
redis-import-tools
==================

A collection of utilities for importing data into Redis

Commands
--------

redis-import-set
    Create a Redis Set from a column of values in a text file


Installation
------------

It is assumed that you have Redis version >= 1.3 installed and configured.

::

    pip install redis-import-tools


Introduction
------------

Let's start with a trivial example. We'd like to load our local words dictionary into a Redis set. One approach might be::

    $ cat /usr/share/dict/words \
      | xargs -d "\n" -I word redis-cli sadd engdict word > /dev/null

We're piping the contents of the dictionary file to ``xargs``, which will run an ``SADD`` command for each
word in the dictionary and add it to a set called ``engdict``.

While the one-liner is nice, the performance is terrible::

    real    5m36.977s
    user    0m3.560s
    sys     1m17.490s

We're making one query to Redis for each word in the dictionary, and there are::

    $ wc -l /usr/share/dict/words
    98569 /usr/share/dict/words

98569 words in the dictionary. Five minutes to get roughly 100,000 requests into Redis isn't acceptable. Also, it looks like some words didn't make
it into the set::

    $ redis-cli
    redis> scard engdict
    (integer) 95426


One obvious place for improvement is to use the Redis pipelining feature to cut down significantly on the number of requests made.
redis-cli is a convenience tool, and I suspect it wasn't designed for a workload that forks a new process per command. By building on
the solid redis-py client library, we can come up with some basic utilities that offer great performance and some flexibility
for populating Redis from data sources such as CSV/TSV files.

With code like the following, we can send data to Redis in batches (10000 values per request)::

    r = redis.Redis()
    pipeline_redis = r.pipeline()
    count = 0
    for line in sys.stdin:
        pipeline_redis.sadd(keyname, line.rstrip())
        count += 1
        if not count % 10000:
            pipeline_redis.execute()
    pipeline_redis.execute()

This code is the basic idea behind the redis-import-set command. Here's how to use the command to perform the desired operation::

    $ redis-import-set set engdict < /usr/share/dict/words

Performance is now very acceptable::

    real    0m2.838s
    user    0m2.530s
    sys     0m0.050s

And the set count matches the input count::

    redis> scard engdict
    (integer) 98569


About Filtering, Sorting, and Compression...
--------------------------------------------

Often you will be starting with an input set that contains extraneous data (columns and/or rows you won't need).

The performance characteristics of these data processing steps can vary depending on where they are handled. For example,
streaming through Python's ZipFile appeared to add significant overhead in the scenarios I tested. I may add some basic fallbacks
for decompressing the input source in the future, and I'm still deciding which features I want to support for I/O with
compressed/archive data formats. Since decompression is always a first step, it is easiest to rely on the native OS
compression tools and let the redis-import-tools commands assume (uncompressed) textual input.

The csv module seems to add a minimal amount of overhead, so leaving it in is worthwhile: it is probably more robust than
naively using ``string.split()``.
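To see what that robustness buys, here is a small, contrived illustration (the sample row is invented):
``csv.reader`` understands quoted fields that contain the delimiter, while a naive split truncates them::

    from StringIO import StringIO
    from csv import reader

    # a TSV row whose first (quoted) field contains an embedded tab
    row = '"one\tgram"\t1900\t42\n'

    print next(reader(StringIO(row), delimiter='\t'))[0]  # one<TAB>gram
    print row.split('\t')[0]                              # "one  (truncated)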
Case Study
----------

Using the corpus of English 1-grams from Google Books: TODO

The first archive contains 29232733 rows, of which 420044 are unique.

Extract the first column into a file, which we'll later use as input to redis-import-set::

    $ time unzip -p googlebooks-eng-us-all-1gram-20090715-0.csv.zip | cut -f 1 | uniq > eng-us-all-1gram-0-uniq-grams

    real    0m12.706s
    user    0m15.820s
    sys     0m1.460s

    $ wc -l eng-us-all-1gram-0-uniq-grams
    420044 eng-us-all-1gram-0-uniq-grams

Import to a Redis set called ``1g``::

    $ time redis-import-set set 1g < eng-us-all-1gram-0-uniq-grams

    real    0m12.995s
    user    0m11.130s
    sys     0m0.120s

Let's see how it fares if the input has duplicates::

    $ unzip -p googlebooks-eng-us-all-1gram-20090715-0.csv.zip | cut -f 1 > eng-us-all-1gram-0-grams
    $ time redis-import-set set 1g < eng-us-all-1gram-0-grams

    real    0m31.068s
    user    0m28.910s
    sys     0m0.160s

    $ wc -l eng-us-all-1gram-0-grams
    29232733 eng-us-all-1gram-0-grams

Internally, redis-import-set uses ``itertools.groupby`` to avoid sending redundant ``SADD`` operations for repeated
entries.
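Sorted input arrives with its duplicates adjacent, so ``groupby`` collapses each run of equal keys into a single
group and each distinct value costs only one ``SADD``. A minimal sketch with made-up rows::

    from itertools import groupby

    rows = [['aa', '1'], ['aa', '2'], ['ab', '1'], ['ac', '1'], ['ac', '2']]

    # three groups: aa, ab, ac -- one SADD each instead of five
    for member, _ in groupby(rows, lambda x: x[0]):
        print member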
Here are the results for just using the raw CSV file, taking advantage of the ``redis-import-set`` behavior of
defaulting to the first column::

    $ unzip googlebooks-eng-us-all-1gram-20090715-0.csv.zip
    $ time redis-import-set set 1g < googlebooks-eng-us-all-1gram-20090715-0.csv

    real    0m39.420s
    user    0m37.200s
    sys     0m0.360s

This is good considering that the input with duplicates is 70x bigger, yet the execution time only tripled
compared to the unique-inputs case.

What happens if we try to process unsorted data with many duplicates? The groupby filter won't have any effect,
and consequently we'll be sending many more requests than needed, containing mostly redundant data. To illustrate,
we'll cut the years column out of the corpus file, giving us a huge input containing only a few hundred
distinct values::

    # Slice out the years column from the corpus
    $ time unzip -p googlebooks-eng-us-all-1gram-20090715-0.csv.zip | cut -f 2 > eng-us-all-1gram-0-years

    real    0m14.114s
    user    0m13.190s
    sys     0m1.320s

    $ wc -l eng-us-all-1gram-0-years
    29232733 eng-us-all-1gram-0-years

    $ time redis-import-set set years < eng-us-all-1gram-0-years

    real    13m50.783s
    user    12m39.700s
    sys     0m4.450s

Ouch! This is problematic. However, we can still work around it by using a Python set internally to track which
items we've already sent to ``SADD``. After making this change, we have::

    $ time redis-import-set set years < eng-us-all-1gram-0-years

    real    0m26.108s
    user    0m25.970s
    sys     0m0.060s

Back in business. For many inputs the distinct count will be a small percentage of the total, but in general it
won't be desirable to automatically cache set members in the command. A forthcoming change will require
a command line argument to signify that the input is unsorted and that the cache should be used.
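The cache is only a few extra lines layered onto the groupby loop; essentially what ``load_set`` in
``commands.py`` now does::

    seen = set()
    for member, _ in groupby(reader(IN, delimiter='\t'),
                             lambda x: x[0] if len(x) else None):
        if member is not None and member not in seen:
            pipeline_redis.sadd(key, member.rstrip())
            seen.add(member)   # remember it so repeats are never re-sent

The commented-out ``-u``/``--unsorted`` option in ``bin/redis-import-set`` is the placeholder for that switch.
--------------------------------------------------------------------------------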