├── .gitignore
├── bin
│   └── datasets
├── readme.md
├── recommender_datasets
│   ├── __init__.py
│   ├── _common.py
│   ├── amazon.py
│   ├── amazon_jmcauley.py
│   ├── gowalla.py
│   ├── movielens.py
│   ├── output.py
│   ├── verification.py
│   └── yoochoose.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
*~
*.pyc
*egg*
*#*
--------------------------------------------------------------------------------
/bin/datasets:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import numpy as np

from recommender_datasets import (amazon,
                                  amazon_jmcauley,
                                  gowalla,
                                  movielens,
                                  output,
                                  verification,
                                  yoochoose)


def download_movielens():

    params = (
        ('movielens_100K',
         movielens.read_movielens_100K),
        ('movielens_1M',
         movielens.read_movielens_1M),
        ('movielens_10M',
         movielens.read_movielens_10M),
        ('movielens_20M',
         movielens.read_movielens_20M),
    )

    for out_file_path, data_fnc in params:
        print('Processing {}'.format(out_file_path))
        output.write_csv_data(out_file_path,
                              data_fnc())
        output.write_hdf5_data(out_file_path,
                               data_fnc())

        print('Verifying {}'.format(out_file_path))
        verification.verify(out_file_path)


def download_gowalla():

    header = ('user_id',
              'timestamp',
              'latitude',
              'longitude',
              'item_id')

    output.write_csv_data('gowalla',
                          gowalla.read_gowalla(),
                          header=header)
    output.write_hdf5_data('gowalla',
                           gowalla.read_gowalla(),
                           header=header,
                           dtype=(np.int32,
                                  np.int32,
                                  np.float32,
                                  np.float32,
                                  np.int32))
    verification.verify('gowalla', columns=header)


def download_amazon():

    header = ('user_id',
              'item_id',
              'rating',
              'timestamp',
              'features_item_id',
              'features_feature_id')

    output.write_hdf5_data('amazon_co_purchasing',
                           amazon.read_amazon_co_purchasing(),
                           header=header,
                           dtype=(np.int32,
                                  np.int32,
                                  np.float32,
                                  np.float32,
                                  np.int32,
                                  np.int32))


def download_yoochoose():

    header = ('user_id', 'item_id', 'timestamp')

    for variant in ('buys', 'clicks'):
        data = yoochoose.read_yoochoose(variant)
        output.write_hdf5_data('yoochoose_{}'.format(variant),
                               data,
                               header=header,
                               dtype=(np.int32,
                                      np.int32,
                                      np.float32))


def download_amazon_jmcauley():

    header = ('user_id', 'item_id', 'rating', 'timestamp')

    for variant in amazon_jmcauley.VARIANTS:
        data = amazon_jmcauley.read_amazon(variant)
        output.write_hdf5_data('amazon_jmcauley_{}'.format(variant),
                               data,
                               header=header,
                               dtype=(np.int32,
                                      np.int32,
                                      np.float32,
                                      np.float32))


if __name__ == '__main__':
    download_amazon_jmcauley()
    download_yoochoose()
    download_movielens()
    download_amazon()
    download_gowalla()
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Recommender datasets

Parses and packages popular recommender datasets as easy-to-use CSV and HDF5 files. See the releases page for download links.
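A downloaded HDF5 file can be read back with `h5py`. A minimal sketch (the file name is a placeholder for whichever release asset you downloaded; the column names match those written by `output.write_hdf5_data`):

```python
import h5py

with h5py.File('movielens_20M.hdf5', 'r') as data:
    user_ids = data['/user_id'][:]
    item_ids = data['/item_id'][:]
    ratings = data['/rating'][:]
    timestamps = data['/timestamp'][:]

print(user_ids.shape, ratings.mean())
```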
--------------------------------------------------------------------------------
/recommender_datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maciejkula/recommender_datasets/bbca3e736f885c95ff1c38dd0b100a0e081adf31/recommender_datasets/__init__.py
--------------------------------------------------------------------------------
/recommender_datasets/_common.py:
--------------------------------------------------------------------------------
import os

import requests


def get_data_home():

    return os.path.join(os.path.expanduser('~'),
                        '.recommender_datasets')


def create_data_dir(path):

    if not os.path.isdir(path):
        os.makedirs(path)


def download(url, dest_path):

    req = requests.get(url, stream=True)
    req.raise_for_status()

    with open(dest_path, 'wb') as fd:
        for chunk in req.iter_content(chunk_size=2**20):
            fd.write(chunk)


def get_data(url, dest_subdir, dest_filename, download_if_missing=True):

    data_home = get_data_home()
    data_dir = os.path.join(os.path.abspath(data_home), dest_subdir)

    create_data_dir(data_dir)

    dest_path = os.path.join(data_dir, dest_filename)

    if not os.path.isfile(dest_path):
        if download_if_missing:
            download(url, dest_path)
        else:
            raise IOError('Dataset missing.')

    return dest_path
--------------------------------------------------------------------------------
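The download helper above caches files under ~/.recommender_datasets/<dest_subdir>/ and only hits the network when the file is missing. A minimal usage sketch (the URL and file names are hypothetical):

from recommender_datasets import _common

path = _common.get_data('https://example.com/interactions.csv.gz',
                        'example_dataset',
                        'interactions.csv.gz')

# A second call returns the cached copy without downloading again.
assert path == _common.get_data('https://example.com/interactions.csv.gz',
                                'example_dataset',
                                'interactions.csv.gz',
                                download_if_missing=False)
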
/recommender_datasets/amazon.py:
--------------------------------------------------------------------------------
import array
import datetime
import gzip
from itertools import islice
import time

import numpy as np

from recommender_datasets import _common


def _read_blocks(path):

    with gzip.open(path, 'r') as source_file:

        block = []

        for line in islice(source_file, 3, None):
            line = line.decode('utf-8')
            if line.startswith('Id:'):
                if block:
                    yield block
                block = [line.replace('\r\n', '')]
                continue
            else:
                block.append(line.replace('\r\n', ''))

        # Yield the final block as well, so the last product is not dropped.
        if block:
            yield block


def _parse_category(cat_string):

    start = cat_string.find('[')
    stop = cat_string.find(']')

    return int(cat_string[start + 1:stop])


def _parse_categories(lines):

    num_categories = int(lines[0].split(':')[-1].strip())

    categories = []

    for category_line in lines[1:1 + num_categories]:

        cat_strings = category_line.strip().split('|')
        cat_ids = [_parse_category(x) for x in cat_strings if x]

        categories += cat_ids

    return categories


def _parse_reviews(lines):

    customer_ids = []
    ratings = []
    dates = []

    for line in lines[1:]:

        if not line:
            continue

        # 'cutomer' is not a typo here: the SNAP dump spells it that way.
        date_stop = line.find('cutomer')
        year, month, day = line[:date_stop].split('-')
        dates.append(datetime.date(year=int(year),
                                   month=int(month),
                                   day=int(day)))

        rating_start = line.find('rating')
        rating_stop = line.find('votes')
        rating = int(line[rating_start:rating_stop].split(':')[1])
        ratings.append(rating)

        customer_ids.append(line[date_stop:rating_start].split(':')[1].strip())

    return customer_ids, ratings, dates


def _parse_block(lines):

    ITEM_ID_LINE = 0

    item_id = int(lines[ITEM_ID_LINE].split(':')[1]) + 1
    categories = []
    user_ids = []
    ratings = []
    dates = []

    for line_num, line in enumerate(lines):
        if 'categories' in line:
            categories = _parse_categories(lines[line_num:])

        if 'reviews:' in line:
            (user_ids,
             ratings,
             dates) = _parse_reviews(lines[line_num:])

    return (item_id,
            categories,
            user_ids,
            ratings,
            dates)


def read_amazon_co_purchasing():

    path = _common.get_data('https://snap.stanford.edu/data/bigdata/amazon/amazon-meta.txt.gz',
                            'amazon',
                            'amazon_co_purchasing.gz')

    user_dict = {}
    feature_dict = {}

    interaction_user_ids = array.array('i')
    interaction_item_ids = array.array('i')
    interaction_ratings = array.array('f')
    interaction_timestamps = array.array('f')

    feature_item_ids = array.array('i')
    feature_ids = array.array('i')

    failed_parses = []
    total_parses = 0

    for block in _read_blocks(path):
        total_parses += 1
        try:
            (item_id,
             categories,
             user_ids,
             ratings,
             dates) = _parse_block(block)
        except Exception as e:
            print('Parse failed')
            failed_parses.append((e, block))
            # Skip the block: its variables would otherwise be stale or unset.
            continue

        user_ids = [user_dict.setdefault(x, len(user_dict))
                    for x in user_ids]

        interaction_user_ids.extend(user_ids)
        interaction_item_ids.extend([item_id] * len(user_ids))
        interaction_ratings.extend(ratings)
        interaction_timestamps.extend([int(time.mktime(x.timetuple()))
                                       for x in dates])

        categories = [feature_dict.setdefault(x, len(feature_dict))
                      for x in categories]

        feature_item_ids.extend([item_id] * len(categories))
        feature_ids.extend(categories)

    print('Num of failed parses: {} (out of {})'.format(len(failed_parses),
                                                        total_parses))

    return (np.array(interaction_user_ids),
            np.array(interaction_item_ids),
            np.array(interaction_ratings),
            np.array(interaction_timestamps),
            np.array(feature_item_ids),
            np.array(feature_ids))
--------------------------------------------------------------------------------
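As a sanity check on the review parser above, this is what `_parse_reviews` extracts from one made-up review line written in the format of the SNAP metadata dump (the element at index 0 is the reviews header and is skipped):

from recommender_datasets import amazon

lines = ['  reviews: total: 1  downloaded: 1  avg rating: 5',
         '    2000-7-28  cutomer: A2ABCDEFGHIJKL  rating: 5  votes:  10  helpful:   9']

customer_ids, ratings, dates = amazon._parse_reviews(lines)
# customer_ids == ['A2ABCDEFGHIJKL']
# ratings      == [5]
# dates        == [datetime.date(2000, 7, 28)]
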
/recommender_datasets/amazon_jmcauley.py:
--------------------------------------------------------------------------------
import array

import numpy as np

from recommender_datasets import _common


BASE_URL = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'

VARIANTS = ('books', 'electronics')


def _to_numpy(data):

    uid_map = {}
    iid_map = {}

    uids = array.array('i')
    iids = array.array('i')
    ratings = array.array('f')
    timestamps = array.array('f')

    for uid, iid, rating, timestamp in data:
        uid = uid_map.setdefault(uid, len(uid_map) + 1)
        iid = iid_map.setdefault(iid, len(iid_map) + 1)

        uids.append(uid)
        iids.append(iid)
        ratings.append(rating)
        timestamps.append(timestamp)

    return (np.array(uids, dtype=np.int32),
            np.array(iids, dtype=np.int32),
            np.array(ratings, dtype=np.float32),
            np.array(timestamps, dtype=np.float32))


def _read_data(variant):

    file_path = _common.get_data(BASE_URL +
                                 'ratings_{}.csv'.format(variant.title()),
                                 'amazon',
                                 'ratings_{}.csv'.format(variant))

    with open(file_path, 'r') as datafile:
        for line in datafile:
            uid, iid, rating, timestamp = line.split(',')

            yield uid, iid, float(rating), float(timestamp)


def read_amazon(variant):

    data = _read_data(variant)

    uids, iids, ratings, timestamps = _to_numpy(data)

    return uids, iids, ratings, timestamps
--------------------------------------------------------------------------------
/recommender_datasets/gowalla.py:
--------------------------------------------------------------------------------
import datetime
import gzip


from recommender_datasets import _common

URL = 'https://snap.stanford.edu/data/loc-gowalla_totalCheckins.txt.gz'


def _read_data(path):

    with gzip.GzipFile(path) as datafile:
        for line in datafile:
            (user_id, time, lat,
             lon, location_id) = line.decode('utf-8').split('\t')

            yield (int(user_id),
                   int(datetime.datetime.strptime(time,
                                                  "%Y-%m-%dT%H:%M:%SZ")
                       .timestamp()),
                   float(lat),
                   float(lon),
                   int(location_id))


def read_gowalla():

    zip_path = _common.get_data(URL,
                                'gowalla',
                                'gowalla.txt.gz')

    for line in _read_data(zip_path):
        yield line
--------------------------------------------------------------------------------
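Each line of the Gowalla check-in file is tab-separated as user, ISO-8601 time, latitude, longitude, location, and gowalla's `_read_data` turns it into a purely numeric tuple. A sketch with made-up values (note that `strptime` yields a naive datetime, so the exact epoch value depends on the local timezone):

import datetime

line = '0\t2010-10-19T23:55:27Z\t30.2359\t-97.7951\t22847\n'
user_id, time_str, lat, lon, location_id = line.split('\t')

checkin = (int(user_id),
           int(datetime.datetime.strptime(time_str,
                                          '%Y-%m-%dT%H:%M:%SZ').timestamp()),
           float(lat),
           float(lon),
           int(location_id))
# e.g. (0, 1287532527, 30.2359, -97.7951, 22847) when the local timezone is UTC
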
/recommender_datasets/movielens.py:
--------------------------------------------------------------------------------
import itertools
import os
import zipfile

from recommender_datasets import _common


URL_PREFIX = 'http://files.grouplens.org/datasets/movielens/'
URL_100K = 'ml-100k.zip'
URL_1M = 'ml-1m.zip'
URL_10M = 'ml-10m.zip'
URL_20M = 'ml-20m.zip'


def _read_data(path, archive_path):

    with zipfile.ZipFile(path) as archive:
        with archive.open(archive_path) as datafile:
            for line in datafile:
                yield line.decode('utf-8')


def _parse_line(line, separator='::'):

    uid, iid, rating, timestamp = line.split(separator)

    return (int(uid), int(iid), float(rating), int(timestamp))


def _make_contiguous(data, separator):

    user_map = {}
    item_map = {}

    for line in data:
        uid, iid, rating, timestamp = _parse_line(line, separator=separator)

        uid = user_map.setdefault(uid, len(user_map) + 1)
        iid = item_map.setdefault(iid, len(item_map) + 1)

        yield uid, iid, rating, timestamp


def read_movielens_100K():

    zip_path = _common.get_data(URL_PREFIX + URL_100K,
                                'movielens',
                                'movielens_100k.zip')

    archive_path = os.path.join('ml-100k', 'u.data')

    for line in _read_data(zip_path, archive_path):
        yield _parse_line(line, separator='\t')


def read_movielens_1M():

    zip_path = _common.get_data(URL_PREFIX + URL_1M,
                                'movielens',
                                'movielens_1M.zip')

    archive_path = os.path.join('ml-1m', 'ratings.dat')

    data = _read_data(zip_path, archive_path)

    for line in _make_contiguous(data, separator='::'):
        yield line


def read_movielens_10M():

    zip_path = _common.get_data(URL_PREFIX + URL_10M,
                                'movielens',
                                'movielens_10M.zip')

    archive_path = os.path.join('ml-10M100K', 'ratings.dat')

    data = _read_data(zip_path, archive_path)

    for line in _make_contiguous(data, separator='::'):
        yield line


def read_movielens_20M():

    zip_path = _common.get_data(URL_PREFIX + URL_20M,
                                'movielens',
                                'movielens_20M.zip')

    archive_path = os.path.join('ml-20m', 'ratings.csv')

    data = itertools.islice(_read_data(zip_path, archive_path), 1, None)

    for line in _make_contiguous(data, separator=','):
        yield line
--------------------------------------------------------------------------------
/recommender_datasets/output.py:
--------------------------------------------------------------------------------
import array
import os
import zipfile

import h5py
import numpy as np

from recommender_datasets import _common


SEPARATOR = ','


def _array_from_dtype(dtype):

    if dtype == np.float32:
        return array.array('f')
    else:
        return array.array('i')


def _to_numpy(data, dtypes):

    arrays = tuple(_array_from_dtype(x)
                   for x in dtypes)

    for row in data:
        for (arr, elem) in zip(arrays, row):
            arr.append(elem)

    return tuple(np.array(arr, dtype=dtype)
                 for (arr, dtype) in
                 zip(arrays, dtypes))


def _serialize(row):

    row = [str(x) for x in row]

    return (SEPARATOR.join(row) + '\n').encode('utf-8')


def write_csv_data(filename, data,
                   header=('user_id', 'item_id', 'rating', 'timestamp')):

    output_dir = os.path.join(_common.get_data_home(), 'output')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    path = os.path.join(output_dir, filename) + '.zip'

    with zipfile.ZipFile(path, mode='w') as archive:
        with archive.open('data.csv', mode='w') as output_file:
            output_file.write(_serialize(header))

            for row in data:
                output_file.write(_serialize(row))


def write_hdf5_data(filename, data,
                    header=('user_id',
                            'item_id',
                            'rating',
                            'timestamp'),
                    dtype=(np.int32,
                           np.int32,
                           np.float32,
                           np.int32)):

    # Readers hand over either a tuple of numpy arrays or a generator of rows;
    # only the former can be written directly, the latter is converted first.
    if isinstance(data, tuple) and isinstance(data[0], np.ndarray):
        arrays = data
    else:
        arrays = _to_numpy(data, dtype)

    output_dir = os.path.join(_common.get_data_home(), 'output')

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    path = os.path.join(output_dir, filename + '.hdf5')

    with h5py.File(path, "w") as archive:
        for (arr, name) in zip(arrays, header):
            archive.create_dataset(name, data=arr, compression='gzip')
--------------------------------------------------------------------------------
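A minimal round-trip sketch for the writers above (it assumes write access to ~/.recommender_datasets/output/; the toy rows are made up):

import os

import h5py

from recommender_datasets import _common, output

rows = [(1, 10, 5.0, 100), (2, 20, 3.0, 200)]

output.write_csv_data('toy_dataset', rows)
output.write_hdf5_data('toy_dataset', rows)

path = os.path.join(_common.get_data_home(), 'output', 'toy_dataset.hdf5')

with h5py.File(path, 'r') as archive:
    assert archive['/user_id'][:].tolist() == [1, 2]
    assert archive['/rating'][:].tolist() == [5.0, 3.0]
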
/recommender_datasets/verification.py:
--------------------------------------------------------------------------------
import os
import zipfile

import h5py
import numpy as np
import pandas as pd


from recommender_datasets import _common


def _read_csv(fname, columns=('user_id', 'item_id', 'rating', 'timestamp')):

    with zipfile.ZipFile(fname, mode='r') as archive:
        with archive.open('data.csv') as datafile:
            df = pd.read_csv(datafile)
            data = df[list(columns)]

    return data


def _read_hdf5(fname, columns=('user_id', 'item_id', 'rating', 'timestamp')):

    with h5py.File(fname, 'r') as data:
        return {column: data['/{}'.format(column)][:]
                for column in columns}


def verify(path, columns=('user_id', 'item_id', 'rating', 'timestamp')):

    output_dir = os.path.join(_common.get_data_home(), 'output')
    path = os.path.join(output_dir, path)

    csv_data = _read_csv(path + '.zip', columns)
    hdf5_data = _read_hdf5(path + '.hdf5', columns)

    return all(
        np.all(csv_data[column] == hdf5_data[column])
        for column in columns
    )
--------------------------------------------------------------------------------
/recommender_datasets/yoochoose.py:
--------------------------------------------------------------------------------
import array
import datetime
import os
import subprocess
import time

import numpy as np

from recommender_datasets import _common


def _to_numpy(data):

    uid_map = {}
    iid_map = {}

    uids = array.array('i')
    iids = array.array('i')
    timestamps = array.array('f')

    for uid, iid, timestamp in data:
        uid = uid_map.setdefault(uid, len(uid_map) + 1)
        iid = iid_map.setdefault(iid, len(iid_map) + 1)

        uids.append(uid)
        iids.append(iid)
        timestamps.append(timestamp)

    return (np.array(uids, dtype=np.int32),
            np.array(iids, dtype=np.int32),
            np.array(timestamps, dtype=np.float32))


def _read_data(variant):

    zip_path = _common.get_data('https://s3-eu-west-1.amazonaws.com/'
                                'yc-rdata/yoochoose-data.7z',
                                'yoochoose',
                                'yoochoose.7z')

    dest_dir = os.path.dirname(zip_path)

    for suffix in ('buys', 'clicks'):
        if not os.path.exists(os.path.join(
                dest_dir, 'yoochoose-{}.dat'.format(suffix))):
            subprocess.check_call(['7z',
                                   '-o{}'.format(dest_dir),
                                   'x',
                                   zip_path])

    fname = os.path.join(dest_dir, 'yoochoose-{}.dat'.format(variant))
    with open(fname, 'r') as datafile:
        for line in datafile:
            uid, timestamp, iid = line.split(',')[:3]

            timestamp = time.mktime(
                datetime.datetime.strptime(timestamp,
                                           '%Y-%m-%dT%H:%M:%S.%fZ')
                .timetuple())

            yield int(uid), int(iid), timestamp


def read_yoochoose(variant):

    data = _read_data(variant)

    uids, iids, timestamps = _to_numpy(data)

    return uids, iids, timestamps
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


setup(
    name='recommender_datasets',
    version='0.1.0',
    install_requires=['click', 'numpy', 'scipy', 'requests', 'h5py', 'pandas'],
    packages=['recommender_datasets'],
    license='MIT',
    classifiers=['Development Status :: 3 - Alpha',
                 'License :: OSI Approved :: MIT License',
                 'Topic :: Scientific/Engineering :: Artificial Intelligence'],
)
--------------------------------------------------------------------------------
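A single dataset can also be packaged without going through bin/datasets. A sketch (it downloads MovieLens 100K into ~/.recommender_datasets/ on first use and writes the packaged files to ~/.recommender_datasets/output/):

from recommender_datasets import movielens, output, verification

output.write_csv_data('movielens_100K', movielens.read_movielens_100K())
output.write_hdf5_data('movielens_100K', movielens.read_movielens_100K())

assert verification.verify('movielens_100K')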