├── .gitignore
├── License
├── README.md
├── docs
│   ├── Datasets
│   │   ├── amazon.md
│   │   ├── anime.md
│   │   ├── bookx.md
│   │   ├── diginetica.md
│   │   ├── epinions.md
│   │   ├── goodreads.md
│   │   ├── jester.md
│   │   ├── lastfm.md
│   │   ├── movielens.md
│   │   ├── msd.md
│   │   ├── netflix.md
│   │   ├── rekko.md
│   │   ├── retail_rocket.md
│   │   ├── steam.md
│   │   └── yoochoose.md
│   ├── directory.md
│   └── index.md
├── mkdocs.yml
├── rs_datasets
│   ├── __init__.py
│   ├── amazon.py
│   ├── anime.py
│   ├── book_crossing.py
│   ├── data_loader
│   │   ├── __init__.py
│   │   ├── archives.py
│   │   └── loaders.py
│   ├── diginetica.py
│   ├── epinions.py
│   ├── generic_dataset.py
│   ├── goodreads.py
│   ├── in_progress.py
│   ├── jester.py
│   ├── lastfm.py
│   ├── movielens.py
│   ├── msd.py
│   ├── netflix.py
│   ├── rekko.py
│   ├── retail_rocket.py
│   ├── steam.py
│   └── yoochoose.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /venv/
2 | /rs_datasets.egg-info/
3 | /.idea/git
4 | __pycache__/

--------------------------------------------------------------------------------
/License:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Yan-Martin Tamm
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Welcome to rs_datasets
2 | 
3 | ![](https://img.shields.io/pypi/v/rs_datasets?color=%2300ccff)
4 | ![](https://img.shields.io/badge/datasets-15-00ccff)
5 | 
6 | This tool allows you to download, unpack and read
7 | recommender systems datasets into `pandas.DataFrame` as easily as `data = Dataset()`.
8 | 
9 | ## Installation
10 | 
11 | ```
12 | pip install rs_datasets
13 | ```
14 | 
15 | ## Documentation
16 | Please see the [documentation](https://darel13712.github.io/rs_datasets/) for this project to
17 | see available datasets and examples of use.
18 | 
19 | ## Example of use
20 | 
21 | ```python
22 | from rs_datasets import MovieLens
23 | ml = MovieLens()
24 | ml.info()
25 | ```
26 | ```text
27 | ratings
28 |    user_id  item_id  rating  timestamp
29 | 0        1        1     4.0  964982703
30 | 1        1        3     4.0  964981247
31 | 2        1        6     4.0  964982224
32 | items
33 |    item_id  ...  genres
34 | 0        1  ...  Adventure|Animation|Children|Comedy|Fantasy
35 | 1        2  ...  Adventure|Children|Fantasy
36 | 2        3  ...  Comedy|Romance
37 | [3 rows x 3 columns]
38 | tags
39 |    user_id  item_id  tag              timestamp
40 | 0        2    60756  funny            1445714994
41 | 1        2    60756  Highly quotable  1445714996
42 | 2        2    60756  will ferrell     1445714992
43 | links
44 |    item_id  imdb_id  tmdb_id
45 | 0        1   114709    862.0
46 | 1        2   113497   8844.0
47 | 2        3   113228  15602.0
48 | ```
49 | Loaded DataFrames are available as class attributes.
50 | 
51 | ## Note
52 | 
53 | This package relies on `datatable` to read files.
54 | There are some known issues with reading some of the datasets, which should be solved with the release of `datatable==1.1.0`,
55 | but its releases are infrequent. If you experience problems with reading datasets, you may try downgrading datatable
56 | to 0.11 or 0.9, or you can install a dev build `1.1.0a2102` or newer from [s3](https://h2o-release.s3.amazonaws.com/datatable/index.html).
57 | Find your Python version, copy the link for the whl and run `pip install <link>`. Sorry for the inconvenience.

--------------------------------------------------------------------------------
/docs/Datasets/amazon.md:
--------------------------------------------------------------------------------
1 | # Amazon
2 | 
3 | These are ratings by category for [Amazon Review Data (2018)](https://nijianmo.github.io/amazon/index.html).
4 | You can download the 5-core reviews from the dataset site. Full reviews and metadata are available on request as well.
5 | 
6 | ## Stats
7 | 
8 | Use the `category` parameter to specify which category to download.
9 | 
10 | | Key                 | Ratings    |
11 | | ------------------- | ---------- |
12 | | fashion             | 883,636    |
13 | | beauty              | 371,345    |
14 | | appliances          | 602,777    |
15 | | arts                | 2,875,917  |
16 | | automotive          | 7,990,166  |
17 | | books               | 51,311,621 |
18 | | cds                 | 4,543,369  |
19 | | phones              | 10,063,255 |
20 | | clothing            | 32,292,099 |
21 | | music               | 1,584,082  |
22 | | electronics         | 20,994,353 |
23 | | cards               | 147,194    |
24 | | grocery             | 5,074,160  |
25 | | kitchen             | 21,928,568 |
26 | | scientific          | 1,758,333  |
27 | | kindle              | 5,722,988  |
28 | | luxury              | 574,628    |
29 | | subscriptions       | 89,689     |
30 | | movies              | 8,765,568  |
31 | | musical instruments | 1,512,530  |
32 | | office              | 5,581,313  |
33 | | garden              | 5,236,058  |
34 | | pet                 | 6,542,483  |
35 | | pantry              | 471,614    |
36 | | software            | 459,436    |
37 | | sports              | 12,980,837 |
38 | | tools               | 9,015,203  |
39 | | toys                | 8,201,231  |
40 | | games               | 2,565,349  |
41 | 
42 | 
43 | 
44 | ## Example
45 | 
46 | ```python
47 | from rs_datasets import Amazon
48 | amazon = Amazon('cards')
49 | amazon.info()
50 | ```
51 | ```text
52 | ratings
53 |       user_id         item_id  rating   timestamp
54 | 0  B001GXRQW0   APV13CM0919JD     1.0  1229644800
55 | 1  B001GXRQW0  A3G8U1G1V082SN     5.0  1229472000
56 | 2  B001GXRQW0   A11T2Q0EVTUWP     5.0  1229472000
57 | ```

--------------------------------------------------------------------------------
/docs/Datasets/anime.md:
--------------------------------------------------------------------------------
1 | # Anime
2 | 
3 | [Anime Recommendations Database](https://www.kaggle.com/CooperUnion/anime-recommendations-database).
4 | 
5 | !!! note
6 |     You have to [configure](https://github.com/Kaggle/kaggle-api#:~:text=API%20credentials,file%20containing%20your%20API%20credentials.)
7 |     a Kaggle API token for auto download.
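For reference, the `kaggle` package looks for the token at `~/.kaggle/kaggle.json`. A quick way to check that it is in place before triggering the download (a minimal sketch, not part of `rs_datasets`):

```python
from pathlib import Path

token = Path.home() / '.kaggle' / 'kaggle.json'
if not token.exists():
    print('Create a token at kaggle.com -> Account -> Create New API Token')
```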
8 | 
9 | ## Stats
10 | - 7,813,737 ratings
11 | - 73,515 users
12 | - 11,200 anime titles
13 | 
14 | ## Example
15 | 
16 | ```python
17 | from rs_datasets import Anime
18 | anime = Anime()
19 | anime.info()
20 | ```
21 | ```text
22 | ratings
23 |    user_id  item_id  rating
24 | 0        1       20      -1
25 | 1        1       24      -1
26 | 2        1       79      -1
27 | 
28 | titles
29 |    item_id  name                              genre                                               type   episodes  rating  members
30 | 0    32281  Kimi no Na wa.                    Drama, Romance, School, Supernatural                Movie  1         9.37    200630
31 | 1     5114  Fullmetal Alchemist: Brotherhood  Action, Adventure, Drama, Fantasy, Magic, Mili...   TV     64        9.26    793665
32 | 2    28977  Gintama°                          Action, Comedy, Historical, Parody, Samurai, S...   TV     51        9.25    114262
33 | 
34 | ```

--------------------------------------------------------------------------------
/docs/Datasets/bookx.md:
--------------------------------------------------------------------------------
1 | # Book-Crossing
2 | 
3 | [Book-crossing dataset](http://www2.informatik.uni-freiburg.de/~cziegler/BX/)
4 | contains user ratings of books on a scale from 1 to 10.
5 | Books have extra information and links to covers.
6 | Users have age and location features. The `item_id` values are actual book ISBNs.
7 | 
8 | ## Stats
9 | 
10 | - 278,858 users
11 | - 271,379 items
12 | - 1,149,780 ratings
13 | 
14 | ## Example
15 | 
16 | ```python
17 | from rs_datasets import BookCrossing
18 | bx = BookCrossing()
19 | bx.info()
20 | ```
21 | ```text
22 | ratings
23 |    user_id  item_id     rating
24 | 0   276725  034545104X  0
25 | 1   276726  0155061224  5
26 | 2   276727  0446520802  0
27 | 
28 | items
29 |    item_id     title                 author                year  publisher                img_s       img_m       img_l
30 | 0  0195153448  Classical Mythology   Mark P. O. Morford    2002  Oxford University Press  http://...  http://...  http://...
31 | 1  0002005018  Clara Callan          Richard Bruce Wright  2001  HarperFlamingo Canada    http://...  http://...  http://...
32 | 2  0060973129  Decision in Normandy  Carlo D'Este          1991  HarperPerennial          http://...  http://...  http://...
33 | 
34 | users
35 |    user_id  location                         age
36 | 0        1  nyc, new york, usa               NaN
37 | 1        2  stockton, california, usa        18.0
38 | 2        3  moscow, yukon territory, russia  NaN
39 | ```
40 | 

--------------------------------------------------------------------------------
/docs/Datasets/diginetica.md:
--------------------------------------------------------------------------------
1 | # Diginetica
2 | 
3 | [Diginetica dataset](https://competitions.codalab.org/competitions/11161#learn_the_details-data2)
4 | for session-based recommendations in an e-commerce website.
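Session-based models usually consume per-session item sequences rather than raw event rows. A minimal preprocessing sketch using the `views` table shown in the example below (sorting by `timeframe` as a proxy for event order is an assumption):

```python
from rs_datasets import Diginetica

d = Diginetica()
# one ordered list of item ids per session
sessions = (
    d.views.sort_values(['session_id', 'timeframe'])
           .groupby('session_id')['item_id']
           .apply(list)
)
```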
5 | 
6 | 
7 | ## Stats
8 | - 1,235,380 views
9 | - 18,025 purchases
10 | - 232,816 users
11 | - 184,047 items
12 | 
13 | 
14 | ## Example
15 | 
16 | ```python
17 | from rs_datasets import Diginetica
18 | d = Diginetica()
19 | d.info()
20 | ```
21 | ```text
22 | items
23 |    item_id  log2price  name_tokens
24 | 0        1         10  4875,776,56689,18212,18212,4896
25 | 1    69585          6  7583,18117,41805,41805,2371
26 | 2    90939          6  604,18117,41805,41805,2371
27 | 
28 | categories
29 |    item_id  category_id
30 | 0   139578         1096
31 | 1   417975         1096
32 | 2   291805         1096
33 | 
34 | purchases
35 |    session_id  user_id  timeframe   date        order_id  item_id
36 | 0         150  18278.0  17100868    2016-05-06  16421     25911
37 | 1         151  NaN      6454547     2016-05-06  16290     175874
38 | 2         156  7.0      1721689387  2016-05-27  21173     35324
39 | 
40 | views
41 |    session_id  user_id  item_id  timeframe  date
42 | 0           1  NaN      81766    526309     2016-05-09
43 | 1           1  NaN      31331    1031018    2016-05-09
44 | 2           1  NaN      32118    243569     2016-05-09
45 | 
46 | queries
47 |    query_id  session_id  user_id  timeframe  duration  date        tokens                           category_id  items                                              is_test
48 | 0         1           1  NaN      16327074   311       2016-05-09  16655,244087,51531,529597,58153  0            7518,71,30311,7837,30792,8252,81766,9338,62220...  False
49 | 1         2           2  NaN      705527     314       2016-05-09  528941,529116                    0            70095,15964,8627,134850,32754,100747,74771,314...  False
50 | 2         3           3  NaN      0          502       2016-05-09  133713,16655,138389              0            59081,51125,9338,9550,32087,62793,2717,10403,3...  True
51 | ```

--------------------------------------------------------------------------------
/docs/Datasets/epinions.md:
--------------------------------------------------------------------------------
1 | # Epinions
2 | 
3 | [Epinions dataset](http://www.trustlet.org/downloaded_epinions.html)
4 | contains item ratings and trust statements between users.
5 | 
6 | There are no distrust statements (block list) in the dataset,
7 | only trust statements (web of trust),
8 | because the block list is kept private and not shown on the site.
9 | 
10 | ## Stats
11 | 
12 | - 49,290 users, who rated a total of
13 | - 139,738 different items at least once, writing
14 | - 664,824 reviews and issuing
15 | - 487,181 trust statements.
16 | 
17 | ## Example
18 | 
19 | ```python
20 | from rs_datasets import Epinions
21 | epinions = Epinions()
22 | epinions.info()
23 | ```
24 | ```text
25 | ratings
26 |    user_id  item_id  rating
27 | 0        1      100       4
28 | 1        1      101       5
29 | 2        1      102       3
30 | 
31 | trust
32 |    source_user_id  target_user_id  trust_value
33 | 0           22605           42915         True
34 | 1           22605            5052         True
35 | 2           22605           42913         True
36 | ```
37 | 

--------------------------------------------------------------------------------
/docs/Datasets/goodreads.md:
--------------------------------------------------------------------------------
1 | # Goodreads
2 | 
3 | This dataset is the [user-book interactions](https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/shelves)
4 | part of the Goodreads dataset.
5 | 
6 | You can find text reviews and additional data on the dataset site.
7 | 
8 | ## Stats
9 | 
10 | - 876,145 users
11 | - 2,360,650 books
12 | - 228,648,342 interactions
13 | 
14 | ## Extra parameters
15 | 
16 | - `read_maps=False`
17 | 
18 |     Ids in interactions are encoded to save memory.
19 |     You can read the mappings, but there is no point
20 |     unless you use the rest of the dataset, which is not included here.
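If you do load the maps, decoding the compact ids back to the original Goodreads ids is a pair of merges. A sketch based on the column names shown in the example below (the renames only avoid column collisions; they are not part of the package):

```python
from rs_datasets import Goodreads

goodreads = Goodreads(read_maps=True)
user_map = goodreads.users.rename(columns={'user_id': 'goodreads_user_id'})
book_map = goodreads.books.rename(columns={'book_id': 'goodreads_book_id'})
decoded = (
    goodreads.ratings
    .merge(user_map, left_on='user_id', right_on='user_id_csv')
    .merge(book_map, left_on='item_id', right_on='book_id_csv')
)
```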
21 | 
22 | ## Example
23 | 
24 | ```python
25 | from rs_datasets import Goodreads
26 | goodreads = Goodreads(read_maps=True)
27 | goodreads.info()
28 | ```
29 | ```text
30 | ratings
31 |    user_id  item_id  is_read  rating  is_reviewed
32 | 0        0      948     True       5        False
33 | 1        0      947     True       5         True
34 | 2        0      946     True       5        False
35 | 
36 | books
37 |    book_id_csv   book_id
38 | 0            0  34684622
39 | 1            1  34536488
40 | 2            2  34017076
41 | 
42 | users
43 |    user_id_csv  user_id
44 | 0            0  8842281e1d1347389f2ab93d60773d4d
45 | 1            1  72fb0d0087d28c832f15776b0d936598
46 | 2            2  ab2923b738ea3082f5f3efcbbfacb218
47 | ```

--------------------------------------------------------------------------------
/docs/Datasets/jester.md:
--------------------------------------------------------------------------------
1 | # Jester
2 | 
3 | [Jokes dataset](http://eigentaste.berkeley.edu/dataset/)
4 | consists of several datasets which contain joke ratings as
5 | real values ranging from -10.00 to +10.00.
6 | 
7 | Texts of the jokes are available too.
8 | 
9 | ## Stats
10 | ### Dataset 1
11 | 
12 | - 73,421 users
13 | - 100 jokes
14 | - collected between April 1999 and May 2003
15 | 
16 | ### Dataset 3
17 | 
18 | - 54,905 users
19 | - 150 jokes (50 not in Dataset 1)
20 | - collected from November 2006 to March 2015
21 | - 22 jokes have few ratings, as they were removed as of May 2009,
22 |   deemed to be out of date (e.g., Bill Clinton jokes);
23 |   their ids are: {1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20,
24 |   27, 31, 43, 51, 52, 61, 73, 80, 100, 116}.
25 | - As of May 2009, the jokes {7, 8, 13, 15, 16, 17, 18, 19} are the "gauge set"
26 |   (as discussed in the Eigentaste paper)
27 |   and the jokes {1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31, 43,
28 |   51, 52, 61, 73, 80, 100, 116} were removed
29 |   (i.e. they are never displayed or rated).
30 | 
31 | ### Dataset 4
32 | 
33 | - 7,699 users
34 | - 158 jokes
35 | - 22 of the jokes don't have ratings; their ids are:
36 |   {1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20, 27, 31,
37 |   43, 51, 52, 61, 73, 80, 100, 116}.
38 | - The jokes {1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 20,
39 |   27, 31, 43, 51, 52, 61, 73, 80, 100, 116} have been removed
40 |   (i.e. they are never displayed or rated).
41 | 
42 | ## Extra parameters
43 | 
44 | - `dataset=1`
45 | 
46 |     one of 1, 3, 4 to choose the corresponding version.
47 | 
48 | ## Structure
49 | 
50 | Dataset 1 consists of 3 matrices as provided; they are not merged together.
51 | 
52 | Ratings are in the form of a matrix with columns representing jokes.
53 | In the original data 99 meant "no rating"; it is replaced with `NaN` here.
54 | 
55 | ```text
56 | data
57 |        1      2      3  ...
58 | 0  -7.82   8.79  -9.66  ...
59 | 1   4.08  -0.29   6.36  ...
60 | 2    NaN    NaN    NaN  ...
61 | ...  ...    ...    ...  ...
62 | ```
63 | 
64 | Joke texts are available as a `pandas.Series`:
65 | 
66 | ```text
67 | jokes
68 | 1    A man visits the doctor. The doctor says "I ha...
69 | 2    This couple had an excellent relationship goin...
70 | 3    Q. What's 200 feet long and has 4 teeth?   A. ...
71 | ```

--------------------------------------------------------------------------------
/docs/Datasets/lastfm.md:
--------------------------------------------------------------------------------
1 | # Last.fm
2 | 
3 | [Last.fm-360k](http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html)
4 | contains `(user, artist, play count)` triplets collected with the Last.fm API,
5 | using the `user.getTopArtists()` method.
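Play counts are implicit feedback with a heavy tail, so models often work better on a dampened signal. A sketch of one common transformation (the `log1p` scaling is an illustration, not something the package does for you):

```python
import numpy as np
from rs_datasets import Lastfm

lastfm = Lastfm()
plays = lastfm.play_counts
plays['confidence'] = np.log1p(plays['play_count'])  # compress heavy listeners
```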
6 | 
7 | ## Stats
8 | 
9 | - 359,347 unique users
10 | - 186,642 artists with MBID
11 | - 107,373 artists without MBID
12 | - 17,559,530 total rows
13 | 
14 | ## Example
15 | ```python
16 | from rs_datasets import Lastfm
17 | lastfm = Lastfm()
18 | lastfm.info()
19 | ```
20 | ```text
21 | play_counts
22 |    user_id                                   artist_id                             artist_name      play_count
23 | 0  00000c289a1829a808ac09c00daf10bc3c4e223b  3bd73256-3905-4f3a-97e2-8b341527f805  betty blowtorch  2137
24 | 1  00000c289a1829a808ac09c00daf10bc3c4e223b  f2fb0ff0-5679-42ec-a55c-15109ce6e320  die Ärzte        1099
25 | 2  00000c289a1829a808ac09c00daf10bc3c4e223b  b3ae82c2-e60b-4551-a76d-6620f1b456aa  melissa etheridge  897
26 | 
27 | users
28 |    user_id                                   gender  age   country  signup_date
29 | 0  00000c289a1829a808ac09c00daf10bc3c4e223b  f       22.0  Germany  Feb 1, 2007
30 | 1  00001411dc427966b17297bf4d69e7e193135d89  f       NaN   Canada   Dec 4, 2007
31 | 2  00004d2ac9316e22dc007ab2243d6fcb239e707d          NaN   Germany  Sep 1, 2006
32 | ```

--------------------------------------------------------------------------------
/docs/Datasets/movielens.md:
--------------------------------------------------------------------------------
1 | # MovieLens
2 | 
3 | [MovieLens](https://grouplens.org/datasets/movielens/) is
4 | probably the most popular rs dataset out there.
5 | Contains movie ratings from the grouplens site.
6 | 
7 | Some versions provide additional information such as user info or tags.
8 | 
9 | ## Versions
10 | 
11 | The following stable versions are available:
12 | 
13 | | Version | Size  | Ratings | Users | Movies | Tags |
14 | | ------- | ----- | ------- | ----- | ------ | ---- |
15 | | 25m     | 250MB | 25m     | 162k  | 62k    | 1m   |
16 | | 20m     | 190MB | 20m     | 138k  | 27k    | 456k |
17 | | 10m     | 63MB  | 10m     | 72k   | 10k    | 100k |
18 | | 1m      | 6MB   | 1m      | 6k    | 4k     | —    |
19 | | 100k    | 5MB   | 100k    | 1k    | 1.7k   | —    |
20 | 
21 | There are also 2 versions that change over time and are not recommended for research.
22 | That is the `latest` version, which contains all the data they have at the moment (bigger than `25m`), and
23 | `small`, which is just a subset of the `latest` version. `small` is loaded by default if you don't
24 | specify a version.
25 | 
26 | ## Extra parameters
27 | 
28 | - `version='small'`
29 | 
30 |     One of `{'100k', '1m', '10m', '20m', '25m', 'small', 'latest'}`
31 | 
32 | - `read_genome=False`
33 | 
34 |     whether to read the genome tag dataset or not
35 |     (available from version 20m and up).
36 |     Not loaded by default to save memory.
37 | 
38 | ## Example
39 | ```python
40 | from rs_datasets import MovieLens
41 | ml = MovieLens('10m')
42 | ml.info()
43 | ```
44 | ```text
45 | ratings
46 |    user_id  item_id  rating  timestamp
47 | 0        1      122     5.0  838985046
48 | 1        1      185     5.0  838983525
49 | 2        1      231     5.0  838983392
50 | 
51 | items
52 |    item_id  title                    genres
53 | 0        1  Toy Story (1995)         Adventure|Animation|Children|Comedy|Fantasy
54 | 1        2  Jumanji (1995)           Adventure|Children|Fantasy
55 | 2        3  Grumpier Old Men (1995)  Comedy|Romance
56 | 
57 | tags
58 |    user_id  item_id  tag         timestamp
59 | 0       15     4973  excellent!  1215184630
60 | 1       20     1747  politics    1188263867
61 | 2       20     1747  satire      1188263867
62 | ```

--------------------------------------------------------------------------------
/docs/Datasets/msd.md:
--------------------------------------------------------------------------------
1 | # Million Song Dataset
2 | 
3 | This dataset, also known as the [Echo Nest Taste Profile Subset](http://millionsongdataset.com/tasteprofile/),
4 | is a part of the Million Song Dataset (MSD)
5 | which contains the play history of songs.
6 | Other datasets, such as preprocessed song features, can be found on the dataset site.
7 | 
8 | ## Stats
9 | 
10 | - 1,019,318 unique users
11 | - 384,546 unique songs
12 | - 48,373,586 user-song-play count triplets
13 | 
14 | ## Extra parameters
15 | 
16 | - `merge_kaggle_splits=True`
17 | 
18 |     In the MSD Challenge on [Kaggle](https://www.kaggle.com/c/msdchallenge) there were
19 |     public and private parts. By default they are merged together.
20 | 
21 | - `drop_mismatches=True`
22 | 
23 |     There is a [matching error](http://millionsongdataset.com/blog/12-2-12-fixing-matching-errors/)
24 |     between track ids and song ids in MSD. It shouldn't matter if you don't use audio features, but
25 |     by default these items are removed.
26 | 
27 | 
28 | !!! warning
29 |     The dataset is quite big: it takes about 5 minutes to load the first time,
30 |     using about 1.2GB of RAM.
31 |     If you use the default recommended parameters,
32 |     processed data is saved to disk, so that subsequent loads take about 30 seconds.
33 | 
34 | 
35 | ## Example
36 | 
37 | ```python
38 | from rs_datasets import MillionSongDataset
39 | msd = MillionSongDataset()
40 | msd.info()
41 | ```
42 | ```text
43 | train
44 |    user_id                                   item_id             play_count
45 | 0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995  1
46 | 1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9  1
47 | 2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B  2
48 | 
49 | val
50 |    user_id                                   item_id             play_count
51 | 0  0007140a3796e901f3190f12e9de6d7548d4ac4a  SONVMBN12AC9075271  1
52 | 1  0007140a3796e901f3190f12e9de6d7548d4ac4a  SOVIGZG12A6D4FB188  1
53 | 2  0007140a3796e901f3190f12e9de6d7548d4ac4a  SOZGXYF12AB0185579  2
54 | 
55 | test
56 |    user_id                                   item_id             play_count
57 | 0  00007a02388c208ea7176479f6ae06f8224355b3  SOAITVD12A6D4F824B  3
58 | 1  00007a02388c208ea7176479f6ae06f8224355b3  SONZGLW12A6D4FBBC1  1
59 | 2  00007a02388c208ea7176479f6ae06f8224355b3  SOXNWYP12A6D4FBDC4  1
60 | ```

--------------------------------------------------------------------------------
/docs/Datasets/netflix.md:
--------------------------------------------------------------------------------
1 | # Netflix
2 | 
3 | Classic dataset from the famous [Netflix Prize](https://www.kaggle.com/netflix-inc/netflix-prize-data),
4 | which took place in 2006-2009.
5 | 
6 | No ratings are available for the test data.
7 | 
8 | 
9 | ## Stats
10 | - 480,189 users
11 | - 17,770 movies
12 | - 100m ratings on a scale from 1 to 5
13 | 
14 | 
15 | ## Example
16 | 
17 | ```python
18 | from rs_datasets import Netflix
19 | netflix = Netflix()
20 | netflix.info()
21 | ```
22 | ```text
23 | movies
24 |    item_id  year    title
25 | 0        1  2003.0  Dinosaur Planet
26 | 1        2  2004.0  Isle of Man TT 2004 Review
27 | 2        3  1997.0  Character
28 | 
29 | test
30 |    item_id  user_id  timestamp
31 | 0        1  1046323  2005-12-19
32 | 1        1  1080030  2005-12-23
33 | 2        1  1830096  2005-03-14
34 | 
35 | train
36 |    item_id  user_id  rating  timestamp
37 | 0      373   643460       4  2005-01-26
38 | 1      373   349399       5  2002-11-06
39 | 2      373  1315469       2  2005-08-15
40 | ```
41 | 
42 | !!! warning
43 |     It is not recommended to read the data without a wrapper (`df = pd.read_parquet`)
44 |     when using PyCharm scientific mode.
45 |     PyCharm tries to load all 100m rows to show DataFrame info,
46 |     which causes huge memory consumption and freezes.
47 |     When loading with a wrapper (as when using this class) it doesn't
48 |     load the data until you specifically try to show it.
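A safe way to inspect the training table is to print its shape and a few rows instead of rendering the whole DataFrame (a plain-Python sketch of the point above):

```python
from rs_datasets import Netflix

netflix = Netflix()
print(netflix.train.shape)   # ~100m rows, cheap to compute
print(netflix.train.head())  # renders only a handful of rows
```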
--------------------------------------------------------------------------------
/docs/Datasets/rekko.md:
--------------------------------------------------------------------------------
1 | # Rekko
2 | 
3 | Movie data from the [Rekko Challenge](https://boosters.pro/championship/rekko_challenge/data).
4 | 
5 | 
6 | ## Stats
7 | - 9,643,012 transactions
8 | - 438,790 ratings from 0 to 10
9 | - 499,663 users with transactions
10 | - 104,563 users with ratings
11 | - 8,296 movies
12 | 
13 | ## Example
14 | 
15 | ```python
16 | from rs_datasets import Rekko
17 | rekko = Rekko()
18 | rekko.info()
19 | ```
20 | ```text
21 | transactions
22 |    item_id  user_id  consumption_mode  ts            watched_time  device_type  device_manufacturer
23 | 0     3336     5177  S                 4.430518e+07          4282            0                   50
24 | 1      481   593316  S                 4.430518e+07          2989            0                   11
25 | 2     4128   262355  S                 4.430518e+07           833            0                   50
26 | 
27 | ratings
28 |    user_id  item_id  rating  ts
29 | 0   571252     1364      10  4.430517e+07
30 | 1    63140     3037      10  4.430514e+07
31 | 2   443817     4363       8  4.430514e+07
32 | 
33 | bookmarks
34 |    user_id  item_id  ts
35 | 0   301135     7185  4.430516e+07
36 | 1   301135     4083  4.430516e+07
37 | 2   301135    10158  4.430516e+07
38 | ```
39 | Additional info is available in the `json` catalogue:
40 | ```python
41 | rekko.catalogue['8432']
42 | ```
43 | ```json
44 | {'type': 'movie',
45 |  'availability': ['purchase', 'rent', 'subscription'],
46 |  'duration': 100,
47 |  'feature_1': 9059050.751458582,
48 |  'feature_2': 0.7044612684,
49 |  'feature_3': 7,
50 |  'feature_4': 1.1212215513,
51 |  'feature_5': 0.5927161087,
52 |  'attributes': [14658,
53 |   27695,
54 |   27696,
55 |   3713,
56 |   2025,
57 |   7,
58 |   13953,
59 |   10,
60 |   42,
61 |   14,
62 |   15,
63 |   239,
64 |   20,
65 |   21,
66 |   13954,
67 |   197]}
68 | ```

--------------------------------------------------------------------------------
/docs/Datasets/retail_rocket.md:
--------------------------------------------------------------------------------
1 | # Retail Rocket
2 | 
3 | [Retail Rocket Dataset](https://www.kaggle.com/retailrocket/ecommerce-dataset).
4 | 
5 | !!! note
6 |     You have to [configure](https://github.com/Kaggle/kaggle-api#:~:text=API%20credentials,file%20containing%20your%20API%20credentials.)
7 |     a Kaggle API token for auto download.
8 | 
9 | ## Stats
10 | - 2,756,101 transactions
11 | - 1,407,580 users
12 | - 235,061 items
13 | 
14 | ## Example
15 | 
16 | ```python
17 | from rs_datasets import RetailRocket
18 | rr = RetailRocket()
19 | rr.info()
20 | ```
21 | ```text
22 | category_tree
23 |    category_id  parent_id
24 | 0         1016      213.0
25 | 1          809      169.0
26 | 2          570        9.0
27 | 
28 | log
29 |    ts             user_id  event  item_id  transaction_id
30 | 0  1433221332117   257597  view    355908             NaN
31 | 1  1433224214164   992329  view    248676             NaN
32 | 2  1433221999827   111016  view    318965             NaN
33 | 
34 | items
35 |    ts             item_id  property    value
36 | 0  1435460400000   460429  categoryid  1338
37 | 1  1441508400000   206783  888         1116713 960601 n277.200
38 | 2  1439089200000   395014  400         n552.000 639502 n720.000 424566
39 | ```

--------------------------------------------------------------------------------
/docs/Datasets/steam.md:
--------------------------------------------------------------------------------
1 | # Steam
2 | 
3 | [Steam Video Games dataset](https://www.kaggle.com/tamber/steam-video-games/data).
4 | 
5 | !!! note
6 |     You have to [configure](https://github.com/Kaggle/kaggle-api#:~:text=API%20credentials,file%20containing%20your%20API%20credentials.)
7 |     a Kaggle API token for auto download.
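As the example below shows, the log mixes `purchase` rows (`value` is always 1.0) and `play` rows (`value` is hours played). A sketch for splitting the two event types:

```python
from rs_datasets import Steam

steam = Steam()
plays = steam.data[steam.data['behavior'] == 'play']          # value = hours played
purchases = steam.data[steam.data['behavior'] == 'purchase']  # value = 1.0
```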
8 | 
9 | ## Stats
10 | - 200,000 transactions
11 | - 12,393 users
12 | - 5,155 games
13 | 
14 | ## Example
15 | 
16 | ```python
17 | from rs_datasets import Steam
18 | steam = Steam()
19 | steam.info()
20 | ```
21 | ```text
22 | data
23 |    user_id    game                        behavior  value
24 | 0  151603712  The Elder Scrolls V Skyrim  purchase    1.0
25 | 1  151603712  The Elder Scrolls V Skyrim  play      273.0
26 | 2  151603712  Fallout 4                   purchase    1.0
27 | ```

--------------------------------------------------------------------------------
/docs/Datasets/yoochoose.md:
--------------------------------------------------------------------------------
1 | # YooChoose
2 | 
3 | [YooChoose dataset](https://recsys.acm.org/recsys15/challenge/)
4 | for session-based recommendations in an e-commerce website.
5 | 
6 | 
7 | ## Stats
8 | - 33,003,944 clicks
9 | - 9,249,729 sessions
10 | - 1,150,753 purchases
11 | - 52,739 items
12 | 
13 | 
14 | ## Example
15 | 
16 | ```python
17 | from rs_datasets import YooChoose
18 | yc = YooChoose()
19 | yc.info()
20 | ```
21 | ```text
22 | log
23 |    session_id  ts                        item_id    category
24 | 0           1  2014-04-07T10:51:09.277Z  214536502  False
25 | 1           1  2014-04-07T10:54:09.868Z  214536500  False
26 | 2           1  2014-04-07T10:54:46.998Z  214536506  False
27 | 
28 | purchases
29 |    session_id  ts                        item_id    price  quantity
30 | 0      420374  2014-04-06T18:44:58.314Z  214537888  12462  1
31 | 1      420374  2014-04-06T18:44:58.325Z  214537850  10471  1
32 | 2      281626  2014-04-06T09:40:13.032Z  214535653   1883  1
33 | 
34 | test
35 |    session_id  ts                        item_id    category
36 | 0           5  2014-04-07T17:13:46.713Z  214530776  False
37 | 1           5  2014-04-07T17:20:56.973Z  214530776  False
38 | 2           5  2014-04-07T17:21:19.602Z  214530776  False
39 | ```

--------------------------------------------------------------------------------
/docs/directory.md:
--------------------------------------------------------------------------------
1 | # Managing dataset directory
2 | 
3 | Each dataset creates its own folder inside the dataset directory.
4 | 
5 | The dataset directory is determined by the following priorities:
6 | 
7 | 1. `path` parameter from initialization
8 | 2. Environment variable `RS_DATASETS`
9 | 3. Default folder, which is in your home directory: `~/.rs_datasets/`
10 | 
11 | So in the general case you don't have to do anything,
12 | and files corresponding to `dataset` will be stored in `~/.rs_datasets/dataset/`.
13 | 

--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to rs_datasets
2 | 
3 | This tool allows you to download, unpack and read
4 | recommender systems datasets into `pandas.DataFrame` as easily as `data = Dataset()`.
5 | 
6 | ## Installation
7 | ```
8 | pip install rs_datasets
9 | ```
10 | 
11 | ## Available datasets
12 | 
13 | The following datasets are available for automatic download and
14 | can be retrieved with this package.
15 | 
16 | !!! note
17 |     Check the dataset license to know the available use cases.
18 |     The authors of this package are not affiliated with dataset contents in any way.
19 | 
20 | | Dataset                                    | Users | Items | Interactions |
21 | | :----------------------------------------: | :---: | :---: | :----------: |
22 | | [Movielens](Datasets/movielens.md)         | 162k  | 62k   | up to 25m    |
23 | | [Million Song Dataset](Datasets/msd.md)    | 1m    | 385k  | 48m          |
24 | | [Netflix](Datasets/netflix.md)             | 480k  | 17.7k | 100m         |
25 | | [Goodreads](Datasets/goodreads.md)         | 800k  | 1.5m  | 225m         |
26 | | [Last.fm](Datasets/lastfm.md)              | 360k  | 290k  | 17.5m        |
27 | | [Epinions](Datasets/epinions.md)           | 49k   | 140k  | 660k         |
28 | | [Book Crossing](Datasets/bookx.md)         | 279k  | 271k  | 1.1m         |
29 | | [Jester](Datasets/jester.md)               | 73k   | 100   | 4.1m         |
30 | | [Amazon](Datasets/amazon.md)               | ?[^*] | ?[^*] | up to 32m    |
31 | | [Rekko](Datasets/rekko.md)                 | 100k, 500k | 8k | 500k, 9.6m |
32 | | [Steam](Datasets/steam.md)                 | 12k   | 5k    | 200k         |
33 | | [Anime](Datasets/anime.md)                 | 73k   | 11k   | 7.8m         |
34 | | [Retail Rocket](Datasets/retail_rocket.md) | 1.4m  | 235k  | 2.7m         |
35 | | [YooChoose](Datasets/yoochoose.md)         | 9m    | 52k   | 33m, 1m      |
36 | | [Diginetica](Datasets/diginetica.md)       | 232k  | 184k  | 1.2m, 18k    |
37 | 
38 | 
39 | 
40 | 
41 | [^*]: Their download speed is extremely slow and I wasn't patient enough to download the biggest one to check this.
42 | 
43 | 
44 | 
45 | 
46 | 
47 | ## Example of use
48 | 
49 | ```python
50 | from rs_datasets import MovieLens
51 | ml = MovieLens()
52 | ml.info()
53 | ```
54 | ```text
55 | ratings
56 |    user_id  item_id  rating  timestamp
57 | 0        1        1     4.0  964982703
58 | 1        1        3     4.0  964981247
59 | 2        1        6     4.0  964982224
60 | items
61 |    item_id  ...  genres
62 | 0        1  ...  Adventure|Animation|Children|Comedy|Fantasy
63 | 1        2  ...  Adventure|Children|Fantasy
64 | 2        3  ...  Comedy|Romance
65 | [3 rows x 3 columns]
66 | tags
67 |    user_id  item_id  tag              timestamp
68 | 0        2    60756  funny            1445714994
69 | 1        2    60756  Highly quotable  1445714996
70 | 2        2    60756  will ferrell     1445714992
71 | links
72 |    item_id  imdb_id  tmdb_id
73 | 0        1   114709    862.0
74 | 1        2   113497   8844.0
75 | 2        3   113228  15602.0
76 | ```
77 | Loaded DataFrames are available as class attributes.
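You can also control where datasets are stored (see [Managing dataset directory](directory.md)): pass `path=` explicitly, set the `RS_DATASETS` environment variable, or rely on the default `~/.rs_datasets/`. For example (the folder name here is arbitrary):

```python
import os
os.environ['RS_DATASETS'] = '/data/rs_datasets'  # applies to all datasets

from rs_datasets import MovieLens
ml = MovieLens('1m', path='/data/rs_datasets')   # explicit path takes priority
```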
78 | 

--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: rs_datasets
2 | 
3 | repo_name: 'GitHub'
4 | repo_url: 'https://github.com/Darel13712/rs_datasets'
5 | 
6 | theme:
7 |   name: 'material'
8 |   palette:
9 |     primary: 'blue'
10 |     accent: 'light-blue'
11 |   font:
12 |     text: 'Roboto'
13 |     code: 'Menlo'
14 |   icon:
15 |     logo: 'material/star'
16 | 
17 | markdown_extensions:
18 |   - admonition
19 |   - codehilite
20 |   - footnotes
21 | 
22 | extra:
23 |   social:
24 |     - icon: 'fontawesome/brands/github'
25 |       link: 'https://github.com/Darel13712/rs_datasets'

--------------------------------------------------------------------------------
/rs_datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from rs_datasets.movielens import MovieLens
2 | from rs_datasets.msd import MillionSongDataset
3 | from rs_datasets.netflix import Netflix
4 | from rs_datasets.lastfm import Lastfm
5 | from rs_datasets.epinions import Epinions
6 | from rs_datasets.goodreads import Goodreads
7 | from rs_datasets.book_crossing import BookCrossing
8 | from rs_datasets.jester import Jester
9 | from rs_datasets.amazon import Amazon
10 | from rs_datasets.rekko import Rekko
11 | from rs_datasets.steam import Steam
12 | from rs_datasets.anime import Anime
13 | from rs_datasets.retail_rocket import RetailRocket
14 | from rs_datasets.yoochoose import YooChoose
15 | from rs_datasets.diginetica import Diginetica
16 | 

--------------------------------------------------------------------------------
/rs_datasets/amazon.py:
--------------------------------------------------------------------------------
1 | from os import mkdir
2 | from os.path import join, exists
3 | 
4 | import datatable as dt
5 | 
6 | from rs_datasets.data_loader import download_url
7 | from rs_datasets.generic_dataset import Dataset, safe
8 | 
9 | categories = {
10 |     'fashion': 'AMAZON_FASHION.csv',
11 |     'beauty': 'All_Beauty.csv',
12 |     'appliances': 'Appliances.csv',
13 |     'arts': 'Arts_Crafts_and_Sewing.csv',
14 |     'automotive': 'Automotive.csv',
15 |     'books': 'Books.csv',
16 |     'cds': 'CDs_and_Vinyl.csv',
17 |     'phones': 'Cell_Phones_and_Accessories.csv',
18 |     'clothing': 'Clothing_Shoes_and_Jewelry.csv',
19 |     'music': 'Digital_Music.csv',
20 |     'electronics': 'Electronics.csv',
21 |     'cards': 'Gift_Cards.csv',
22 |     'grocery': 'Grocery_and_Gourmet_Food.csv',
23 |     'kitchen': 'Home_and_Kitchen.csv',
24 |     'scientific': 'Industrial_and_Scientific.csv',
25 |     'kindle': 'Kindle_Store.csv',
26 |     'luxury': 'Luxury_Beauty.csv',
27 |     'subscriptions': 'Magazine_Subscriptions.csv',
28 |     'movies': 'Movies_and_TV.csv',
29 |     'musical instruments': 'Musical_Instruments.csv',
30 |     'office': 'Office_Products.csv',
31 |     'garden': 'Patio_Lawn_and_Garden.csv',
32 |     'pet': 'Pet_Supplies.csv',
33 |     'pantry': 'Prime_Pantry.csv',
34 |     'software': 'Software.csv',
35 |     'sports': 'Sports_and_Outdoors.csv',
36 |     'tools': 'Tools_and_Home_Improvement.csv',
37 |     'toys': 'Toys_and_Games.csv',
38 |     'games': 'Video_Games.csv'
39 | }
40 | 
41 | 
42 | class Amazon(Dataset):
43 |     def __init__(self, category, path: str = None):
44 |         """
45 |         :param category: one of {'fashion', 'beauty', 'appliances', 'arts', 'automotive',
46 |             'books', 'cds', 'phones', 'clothing', 'music', 'electronics',
47 |             'cards', 'grocery', 'kitchen', 'scientific', 'kindle', 'luxury',
48 |             'subscriptions', 'movies', 'musical instruments', 'office', 'garden',
49 |             'pet', 'pantry', 'software', 'sports', 'tools', 'toys', 'games'}
50 |         :param path: folder which is used to download dataset to
51 |             if it does not contain dataset files.
52 |             If files are found, load them.
53 |         """
54 |         super().__init__(path)
55 |         if category not in categories:
56 |             raise ValueError(f'Dataset category must be one of {categories.keys()}')
57 |         folder = join(self.data_folder, 'amazon')
58 |         if not exists(folder):
59 |             mkdir(folder)
60 |         if not exists(join(folder, category + '.csv')):
61 |             self._download(folder, category)
62 |         self.ratings = dt.fread(
63 |             join(folder, category + '.csv'),
64 |             columns=['user_id', 'item_id', 'rating', 'timestamp']
65 |         ).to_pandas()
66 | 
67 | 
68 |     @safe
69 |     def _download(self, path, category):
70 |         self.logger.info(f'Downloading Amazon {category} ratings...')
71 |         base_url = 'https://jmcauley.ucsd.edu/data/amazon_v2/categoryFilesSmall/'
72 |         url = base_url + categories[category]
73 |         download_url(url, join(path, category + '.csv'))
74 | 

--------------------------------------------------------------------------------
/rs_datasets/anime.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from os.path import join, exists
3 | 
4 | import datatable as dt
5 | 
6 | from rs_datasets.generic_dataset import Dataset, safe
7 | 
8 | 
9 | class Anime(Dataset):
10 |     def __init__(self, path: str = None):
11 |         super().__init__(path)
12 |         folder = join(self.data_folder, 'anime')
13 |         if not exists(folder):
14 |             self._download(folder)
15 | 
16 |         self.ratings = dt.fread(
17 |             join(folder, 'rating.csv'),
18 |             columns=[
19 |                 'user_id', 'item_id', 'rating'
20 |             ],
21 |         ).to_pandas()
22 | 
23 |         self.titles = dt.fread(
24 |             join(folder, 'anime.csv'),
25 |             columns=[
26 |                 'item_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'
27 |             ],
28 |         ).to_pandas()
29 | 
30 |     @safe
31 |     def _download(self, path):
32 |         from kaggle.api.kaggle_api_extended import KaggleApi
33 |         logging.info('Downloading anime dataset...')
34 |         api = KaggleApi()
35 |         api.authenticate()
36 |         api.dataset_download_files('CooperUnion/anime-recommendations-database', path, unzip=True)
37 | 

--------------------------------------------------------------------------------
/rs_datasets/book_crossing.py:
--------------------------------------------------------------------------------
1 | from os.path import join, exists
2 | 
3 | import datatable as dt
4 | 
5 | from rs_datasets.data_loader import download_dataset
6 | from rs_datasets.generic_dataset import Dataset, safe
7 | 
8 | 
9 | class BookCrossing(Dataset):
10 |     def __init__(self, path: str = None):
11 |         """
12 |         :param path: folder which is used to download dataset to
13 |             if it does not contain dataset files.
14 |             If files are found, load them.
15 |         """
16 |         super().__init__(path)
17 |         folder = join(self.data_folder, 'bookx')
18 |         if not exists(folder):
19 |             self._download(folder)
20 | 
21 |         self.ratings = dt.fread(
22 |             join(folder, 'BX-Book-Ratings.csv'),
23 |             columns=['user_id', 'item_id', 'rating']
24 |         ).to_pandas()
25 | 
26 |         self.items = dt.fread(
27 |             join(folder, 'BX-Books.csv'),
28 |             columns=['item_id', 'title', 'author', 'year',
29 |                      'publisher', 'img_s', 'img_m', 'img_l']
30 |         ).to_pandas()
31 | 
32 |         users = dt.fread(
33 |             join(folder, 'BX-Users.csv'),
34 |             columns=['user_id', 'location', 'age']
35 |         ).to_pandas()
36 | 
37 |         users['age'] = users['age'].replace("NULL", "nan").astype('float')
38 |         self.users = users
39 | 
40 |     @safe
41 |     def _download(self, path):
42 |         self.logger.info('Downloading Book-Crossing Dataset...')
43 |         url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
44 |         download_dataset(url, join(self.data_folder, 'bookx.zip'))
45 | 

--------------------------------------------------------------------------------
/rs_datasets/data_loader/__init__.py:
--------------------------------------------------------------------------------
1 | from rs_datasets.data_loader.loaders import *
2 | from rs_datasets.data_loader.archives import *
3 | 

--------------------------------------------------------------------------------
/rs_datasets/data_loader/archives.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tarfile
3 | from os.path import splitext
4 | from tarfile import TarFile
5 | from typing import Union
6 | from zipfile import ZipFile
7 | from py7zr import SevenZipFile
8 | 
9 | 
10 | def extract(archive_name: str, manage_folder: bool = True) -> None:
11 |     """
12 |     Extract `archive_name` and put it inside a folder
13 |     if there are multiple files inside.
14 | 
15 |     :param archive_name: path to archive
16 |     :param manage_folder: check if there is a root folder in the archive:
17 |         if there is one, do not create an extra folder,
18 |         if there are just files inside, put them into a folder.
19 |         If param is set to `False`, extract "as is".
20 |     :return: None
21 |     """
22 |     if archive_name.endswith('.zip'):
23 |         archive = ZipFile(archive_name)
24 |     elif archive_name.endswith('.7z'):
25 |         archive = SevenZipFile(archive_name)
26 |     else:
27 |         try:
28 |             archive = tarfile.open(archive_name)
29 |         except Exception:
30 |             raise NotImplementedError(f'Can\'t extract {archive_name}')
31 | 
32 |     name = os.path.dirname(archive_name)
33 |     if manage_folder and not contains_dir(archive):
34 |         name = remove_extension(archive_name)
35 |         os.mkdir(name)
36 | 
37 |     archive.extractall(path=name)
38 |     archive.close()
39 | 
40 | 
41 | def rm_if_exists(filepath: str) -> None:
42 |     """
43 |     Remove file if it exists, else do nothing.
44 | 
45 |     :param filepath: path to file
46 |     :return: None
47 |     """
48 |     if os.path.exists(filepath):
49 |         os.remove(filepath)
50 | 
51 | 
52 | def contains_dir(archive: Union[ZipFile, TarFile, SevenZipFile]) -> bool:
53 |     """
54 |     Check if archive contains a root folder or just files.
55 | 
56 |     :param archive: archive file
57 |     :return: `True` if first element of archive is folder
58 |     """
59 |     if isinstance(archive, ZipFile):
60 |         contents = archive.infolist()
61 |         is_dir = contents[0].is_dir()
62 |     elif isinstance(archive, TarFile):
63 |         contents = archive.getmembers()
64 |         is_dir = contents[0].isdir()
65 |     elif isinstance(archive, SevenZipFile):
66 |         # inspect the archive listing itself rather than the local filesystem
67 |         is_dir = archive.list()[0].is_directory
68 |     else:
69 |         raise TypeError(f'Unknown archive type: {type(archive)}')
70 |     return is_dir
71 | 
72 | 
73 | def remove_extension(file: str) -> str:
74 |     """
75 |     Get file name without _last_ extension.
76 | 
77 |     :param file: string
78 |     :return: archive.tar.gz -> archive.tar
79 |     """
80 |     return splitext(file)[0]
81 | 

--------------------------------------------------------------------------------
/rs_datasets/data_loader/loaders.py:
--------------------------------------------------------------------------------
1 | from urllib.request import urlretrieve
2 | 
3 | from tqdm import tqdm
4 | 
5 | from rs_datasets.data_loader.archives import extract, rm_if_exists
6 | 
7 | 
8 | def download_dataset(
9 |         url: str, destination_path: str, manage_folder: bool = True):
10 |     """
11 |     Download dataset from the internet.
12 | 
13 |     :param url: from where
14 |     :param destination_path: where to
15 |     :param manage_folder: check if there is a root folder in the archive:
16 |         if there is one, do not create an extra folder,
17 |         if there are just files inside, put them into a folder.
18 |         If param is set to `False`, extract "as is".
19 |     :return: None
20 |     """
21 |     download_url(url, destination_path)
22 |     extract(destination_path, manage_folder)
23 |     rm_if_exists(destination_path)
24 | 
25 | 
26 | def download_url(url: str, filename: str):
27 |     """
28 |     Download something from a link.
29 | 
30 |     :param url: link
31 |     :param filename: path to save
32 |     :return: None
33 |     """
34 |     with tqdm(unit='B', unit_scale=True) as progress:
35 |         def report(chunk, chunksize, total):
36 |             progress.total = total
37 |             progress.update(chunksize)
38 |         return urlretrieve(url, filename, reporthook=report)
39 | 

--------------------------------------------------------------------------------
/rs_datasets/diginetica.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os.path import join, exists
3 | 
4 | import datatable as dt
5 | import gdown
6 | 
7 | from rs_datasets.data_loader import extract
8 | from rs_datasets.generic_dataset import Dataset, safe
9 | 
10 | 
11 | class Diginetica(Dataset):
12 |     def __init__(self, path: str = None):
13 |         """
14 |         :param path: folder which is used to download dataset to
15 |             if it does not contain dataset files.
16 |             If files are found, load them.
17 |         """
18 |         super().__init__(path)
19 |         folder = join(self.data_folder, 'diginetica')
20 |         if not exists(folder):
21 |             self._download(folder)
22 | 
23 |         self.items = dt.fread(
24 |             join(folder, 'products.csv'),
25 |             columns=['item_id', 'log2price', 'name_tokens']
26 |         ).to_pandas()
27 | 
28 |         self.categories = dt.fread(
29 |             join(folder, 'product-categories.csv'),
30 |             columns=['item_id', 'category_id']
31 |         ).to_pandas()
32 | 
33 |         self.purchases = dt.fread(
34 |             join(folder, 'train-purchases.csv'),
35 |             columns=['session_id', 'user_id', 'timeframe', 'date', 'order_id', 'item_id']
36 |         ).to_pandas()
37 | 
38 |         self.views = dt.fread(
39 |             join(folder, 'train-item-views.csv'),
40 |             columns=['session_id', 'user_id', 'item_id', 'timeframe', 'date']
41 |         ).to_pandas()
42 | 
43 |         self.queries = dt.fread(
44 |             join(folder, 'train-queries.csv'),
45 |             columns=['query_id', 'session_id', 'user_id', 'timeframe', 'duration', 'date',
46 |                      'tokens', 'category_id', 'items', 'is_test']
47 |         ).to_pandas()
48 | 
49 | 
50 | 
51 |     @safe
52 |     def _download(self, path):
53 |         self.logger.info('Downloading Diginetica Dataset...')
54 |         url = 'https://drive.google.com/uc?id=0B7XZSACQf0KdenRmMk8yVUU5LWc'
55 |         archive = join(self.data_folder, 'diginetica.zip')
56 |         gdown.download(url, archive)
57 |         extract(archive)
58 |         os.remove(archive)
59 | 

--------------------------------------------------------------------------------
/rs_datasets/epinions.py:
--------------------------------------------------------------------------------
1 | from os import mkdir
2 | from os.path import join, exists
3 | 
4 | import datatable as dt
5 | 
6 | from rs_datasets.data_loader import download_dataset
7 | from rs_datasets.generic_dataset import Dataset, safe
8 | 
9 | 
10 | class Epinions(Dataset):
11 |     def __init__(self, path: str = None):
12 |         """
13 |         :param path: folder which is used to download dataset to
14 |             if it does not contain dataset files.
15 |             If files are found, load them.
16 |         """
17 |         super().__init__(path)
18 |         folder = join(self.data_folder, 'epinions')
19 |         if not exists(folder):
20 |             self._download(folder)
21 | 
22 |         self.ratings = dt.fread(
23 |             join(folder, 'ratings_data.txt'),
24 |             columns=['user_id', 'item_id', 'rating']
25 |         ).to_pandas()
26 | 
27 |         self.trust = dt.fread(
28 |             join(folder, 'trust_data.txt'),
29 |             columns=['source_user_id', 'target_user_id', 'trust_value']
30 |         ).to_pandas()
31 | 
32 |     @safe
33 |     def _download(self, path):
34 |         self.logger.info('Downloading Epinions dataset...')
35 |         mkdir(path)
36 |         base_url = 'http://www.trustlet.org/datasets/downloaded_epinions/'
37 | 
38 |         filepath = join(path, 'ratings_data.txt.bz2')
39 |         download_dataset(
40 |             base_url + 'ratings_data.txt.bz2',
41 |             filepath,
42 |             manage_folder=False
43 |         )
44 | 
45 |         filepath = join(path, 'trust_data.txt.bz2')
46 |         download_dataset(
47 |             base_url + 'trust_data.txt.bz2',
48 |             filepath,
49 |             manage_folder=False
50 |         )
51 | 

--------------------------------------------------------------------------------
/rs_datasets/generic_dataset.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import shutil
4 | from os.path import join
5 | 
6 | from pandas import DataFrame, Series
7 | from datatable import Frame
8 | 
9 | 
10 | class Dataset:
11 |     def __init__(self, path: str = None):
12 |         data_folder = (path or os.getenv('RS_DATASETS', None) or
13 |                        self.default_folder)
14 |         if not os.path.exists(data_folder):
15 |             os.makedirs(data_folder)
16 |         self.data_folder = data_folder
17 |         logger = logging.getLogger("rs_datasets")
18 |         logger.setLevel(logging.INFO)
19 |         self.logger = logger
20 |         try:
21 |             display = __import__('IPython.core.display', globals(), locals(), ['display'])
22 |             self.display = display.display
23 |         except Exception:
24 |             self.display = print
25 | 
26 |     @property
27 |     def default_folder(self):
28 |         root = os.path.expanduser('~')
29 |         return join(root, '.rs_datasets')
30 | 
31 |     def info(self):
32 |         for name, df in self.__dict__.items():
33 |             if isinstance(df, (Frame, DataFrame, Series)):
34 |                 print(name)
35 |                 self.display(df.head(3))
36 |                 print()
37 | 
38 | 
39 | def safe(func):
40 |     def decorated(self, path, *args, **kwargs):
41 |         try:
42 |             func(self, path, *args, **kwargs)
43 |         except Exception as e:
44 |             if os.path.exists(path):
45 |                 shutil.rmtree(path)
46 |             raise e
47 |     return decorated
48 | 

--------------------------------------------------------------------------------
/rs_datasets/goodreads.py:
--------------------------------------------------------------------------------
1 | from os import mkdir
2 | from os.path import join, exists
3 | 
4 | import datatable as dt
5 | import gdown
6 | 
7 | from rs_datasets.generic_dataset import Dataset, safe
8 | 
9 | 
10 | class Goodreads(Dataset):
11 |     def __init__(self, path: str = None, read_maps: bool = False):
12 |         """
13 |         :param path: folder which is used to download dataset to
14 |             if it does not contain dataset files.
15 |             If files are found, load them.
16 |         :param read_maps: ids in interactions are encoded to save memory.
17 |             You can read the mappings, but there is no point
18 |             unless you use the rest of the dataset, which is not included here.
19 |         """
20 |         super().__init__(path)
21 |         folder = join(self.data_folder, 'goodreads')
22 |         if not exists(folder):
23 |             self._download(folder)
24 | 
25 |         self.ratings = dt.fread(
26 |             join(folder, 'goodreads_interactions.csv'),
27 |             columns=['user_id', 'item_id', 'is_read', 'rating', 'is_reviewed']
28 |         ).to_pandas()
29 | 
30 |         if read_maps:
31 |             self.books = dt.fread(
32 |                 join(folder, 'book_id_map.csv')
33 |             ).to_pandas()
34 | 
35 |             self.users = dt.fread(
36 |                 join(folder, 'user_id_map.csv')
37 |             ).to_pandas()
38 | 
39 |     @safe
40 |     def _download(self, path):
41 |         """
42 |         https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/shelves
43 |         https://github.com/MengtingWan/goodreads/blob/master/download.ipynb
44 |         """
45 |         self.logger.info('Downloading interactions from Goodreads dataset...')
46 |         mkdir(path)
47 | 
48 |         gdrive = 'https://drive.google.com/uc?id='
49 |         interactions = gdrive + '1zmylV7XW2dfQVCLeg1LbllfQtHD2KUon'
50 |         users = gdrive + '15ax-h0Oi_Oyee8gY_aAQN6odoijmiz6Q'
51 |         items = gdrive + '1CHTAaNwyzvbi1TR08MJrJ03BxA266Yxr'
52 | 
53 |         gdown.download(users, join(path, 'user_id_map.csv'))
54 |         gdown.download(items, join(path, 'book_id_map.csv'))
55 |         gdown.download(interactions, join(path, 'goodreads_interactions.csv'))
56 | 

--------------------------------------------------------------------------------
/rs_datasets/in_progress.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from os import mkdir, rename
3 | from os.path import join
4 | 
5 | from rs_datasets.data_loader.loaders import download_dataset
6 | 
7 | 
8 | def download_citeulike_a(path: str = '.'):
9 |     """
10 |     Download CiteULike-a
11 |     https://github.com/js05212/citeulike-a
12 | 
13 |     :param path: where to save
14 |     :return: None
15 |     """
16 |     logging.info('Downloading CiteULike-a dataset...')
17 |     url = 'https://github.com/js05212/citeulike-a/archive/master.zip'
18 |     download_dataset(url, join(path, 'citeulike-a.zip'))
19 |     rename(join(path, 'citeulike-a-master'), join(path, 'citeulike-a'))
20 | 
21 | 
22 | def download_hetrec(path: str = '.'):
23 |     """
24 |     Download HetRec 2011
25 |     https://grouplens.org/datasets/hetrec-2011/
26 | 
27 |     :param path: where to save
28 |     :return: None
29 |     """
30 |     logging.info('Downloading HetRec 2011 dataset...')
31 |     folder = join(path, 'hetrec')
32 |     mkdir(folder)
33 |     base_url = 'http://files.grouplens.org/datasets/hetrec2011/'
34 | 
35 |     download_dataset(
36 |         base_url + 'hetrec2011-delicious-2k.zip',
37 |         join(folder, 'delicious.zip')
38 |     )
39 |     download_dataset(
40 |         base_url + 'hetrec2011-lastfm-2k.zip',
41 |         join(folder, 'lastfm.zip')
42 |     )
43 |     download_dataset(
44 |         base_url + 'hetrec2011-movielens-2k-v2.zip',
45 |         join(folder, 'movielens.zip')
46 |     )
47 | 

--------------------------------------------------------------------------------
/rs_datasets/jester.py:
--------------------------------------------------------------------------------
1 | from os import mkdir, rename
2 | from os.path import join, exists
3 | 
4 | import pandas as pd
5 | import numpy as np
6 | 
7 | from rs_datasets.data_loader import download_dataset
8 | from rs_datasets.generic_dataset import Dataset, safe
9 | 
10 | 
11 | class Jester(Dataset):
12 |     def __init__(self, dataset: int = 1, path: str = None):
13 |         """
14 |         :param dataset: version of dataset (1, 3, 4).
15 |         :param path: folder which is used to download dataset to
16 |             if it does not contain dataset files.
17 |             If files are found, load them.
18 | """ 19 | super().__init__(path) 20 | if dataset not in {1, 3, 4}: 21 | raise ValueError('Dataset version must be one of {1, 3, 4}') 22 | folder = join(self.data_folder, 'jester') 23 | if not exists(folder): 24 | self._download(folder) 25 | 26 | if dataset == 1: 27 | data = [] 28 | for i in [1, 2, 3]: 29 | ratings = pd.read_excel(join(folder, str(dataset), f'data_{i}.xls'), header=None) 30 | ratings = self._process(ratings) 31 | data.append(ratings) 32 | self.data1 = data[0] 33 | self.data2 = data[1] 34 | self.data3 = data[2] 35 | jokes = pd.read_excel(join(folder, '3', f'jokes.xlsx'), header=None) 36 | self.jokes = self._fix_jokes(jokes) 37 | else: 38 | data = pd.read_excel(join(folder, str(dataset), f'data.xlsx'), header=None) 39 | self.data = self._process(data) 40 | jokes = pd.read_excel(join(folder, str(dataset), f'jokes.xlsx'), header=None) 41 | self.jokes = self._fix_jokes(jokes) 42 | 43 | @staticmethod 44 | def _process(df): 45 | df = df.drop(0, axis=1) 46 | df = df.replace(99, np.nan) 47 | df = df.astype(pd.SparseDtype('float', np.nan)) 48 | return df 49 | 50 | @staticmethod 51 | def _fix_jokes(df): 52 | df.index = range(1, len(df) + 1) 53 | return df[0] 54 | 55 | @safe 56 | def _download(self, path): 57 | self.logger.info('Downloading Jester dataset...') 58 | mkdir(path) 59 | base_url = 'http://eigentaste.berkeley.edu/dataset/' 60 | 61 | self.logger.info('Dataset 1...') 62 | d1 = join(path, '1') 63 | mkdir(d1) 64 | download_dataset( 65 | base_url + 'jester_dataset_1_joke_texts.zip', 66 | join(d1, 'joke_texts.zip'), 67 | manage_folder=False 68 | ) 69 | for i in [1, 2, 3]: 70 | download_dataset( 71 | base_url + f'jester_dataset_1_{i}.zip', 72 | join(d1, f'ratings_{i}.zip'), 73 | manage_folder=False 74 | ) 75 | rename(join(d1, f'jester-data-{i}.xls'), join(d1, f'data_{i}.xls')) 76 | 77 | for i in [3, 4]: 78 | self.logger.info(f'Dataset {i}...') 79 | d = join(path, str(i)) 80 | mkdir(d) 81 | download_dataset( 82 | base_url + f'Dataset{i}JokeSet.zip', 83 | join(d, 'joke_texts.zip'), 84 | manage_folder=False 85 | ) 86 | rename(join(d, f'Dataset{i}JokeSet.xlsx'), join(d, 'jokes.xlsx')) 87 | download_dataset( 88 | base_url + f'JesterDataset{i}.zip', 89 | join(d, 'ratings.zip'), 90 | manage_folder=False 91 | ) 92 | if i == 3: 93 | name = 'FINAL jester 2006-15.xls' 94 | else: 95 | name = '[final] April 2015 to Nov 30 2019 - Transformed Jester Data - .xlsx' 96 | rename(join(d, name), join(d, 'data.xlsx')) 97 | -------------------------------------------------------------------------------- /rs_datasets/lastfm.py: -------------------------------------------------------------------------------- 1 | from os import rename 2 | from os.path import join, exists 3 | 4 | import datatable as dt 5 | 6 | from rs_datasets.data_loader import download_dataset 7 | from rs_datasets.generic_dataset import Dataset, safe 8 | 9 | 10 | class Lastfm(Dataset): 11 | def __init__(self, path: str = None): 12 | super().__init__(path) 13 | folder = join(self.data_folder, 'lastfm') 14 | if not exists(folder): 15 | self._download(folder) 16 | 17 | self.play_counts = dt.fread( 18 | join(folder, 'usersha1-artmbid-artname-plays.tsv'), 19 | columns=['user_id', 'artist_id', 'artist_name', 'play_count'] 20 | ).to_pandas() 21 | 22 | self.users = dt.fread( 23 | join(folder, 'usersha1-profile.tsv'), 24 | columns=['user_id', 'gender', 'age', 'country', 'signup_date'] 25 | ).to_pandas() 26 | 27 | @safe 28 | def _download(self, path): 29 | self.logger.info('Downloading Last.fm 360k dataset...') 30 | url = 
31 |         download_dataset(url, join(self.data_folder, 'lastfm.tar.gz'))
32 |         rename(join(self.data_folder, 'lastfm-dataset-360K'), path)
33 | 
--------------------------------------------------------------------------------
/rs_datasets/movielens.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os import rename
3 | from os.path import join
4 | from typing import Tuple
5 | 
6 | import datatable as dt
7 | import pandas as pd
8 | 
9 | from rs_datasets.data_loader import download_dataset
10 | from rs_datasets.generic_dataset import Dataset, safe
11 | 
12 | rating_cols = ['user_id', 'item_id', 'rating', 'timestamp']
13 | item_cols = ['item_id', 'title', 'genres']
14 | tag_cols = ['user_id', 'item_id', 'tag', 'timestamp']
15 | user_cols = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
16 | link_cols = ['item_id', 'imdb_id', 'tmdb_id']
17 | tag_g_cols = ['tag_id', 'tag']
18 | score_cols = ['movie_id', 'tag_id', 'rating']
19 | genre_cols = ['item_id', 'title', 'release_date', 'video_release_date',
20 |               'imdb_url', 'unknown', 'Action', 'Adventure', 'Animation',
21 |               'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama',
22 |               'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
23 |               'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
24 | 
25 | class MovieLens(Dataset):
26 |     def __init__(
27 |             self,
28 |             version: str = 'small',
29 |             read_genome: bool = False,
30 |             path: str = None
31 |     ):
32 |         """
33 |         :param version: dataset version,
34 |             one of {'100k', '1m', '10m', '20m', '25m', 'small', 'latest'}
35 |         :param read_genome: whether to read the genome tag dataset
36 |             (available from version 20m and up).
37 |             It is not loaded by default to save memory.
38 |         :param path: where to read dataset from or where to download to.
39 |         """
40 |         super().__init__(path)
41 |         options = {'100k', '1m', '10m', '20m', '25m', 'small', 'latest'}
42 |         if version not in options:
43 |             raise ValueError(
44 |                 f'{version} is not supported. Available options: {options}')
45 | 
46 |         if version == 'small':
47 |             dataset = 'ml-latest-small'
48 |         else:
49 |             dataset = 'ml-' + version
50 | 
51 |         folder = join(self.data_folder, dataset)
52 |         if not os.path.exists(folder):
53 |             self._download(folder, dataset)
54 | 
55 |         if version == '100k':
56 |             (self.ratings,
57 |              self.users,
58 |              self.items) = self._read_100k(folder)
59 |         elif version == '1m':
60 |             (self.ratings,
61 |              self.users,
62 |              self.items) = self._read_1m(folder)
63 |         elif version == '10m':
64 |             (self.ratings,
65 |              self.items,
66 |              self.tags) = self._read_10m(folder)
67 |         else:
68 |             (self.ratings,
69 |              self.items,
70 |              self.tags,
71 |              self.links) = self._read_modern(folder)
72 |             if read_genome:
73 |                 (self.genome_tags,
74 |                  self.genome_scores) = self._read_genome(folder)
75 | 
76 |     @staticmethod
77 |     def _read_modern(folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
78 |         ratings = dt.fread(join(folder, 'ratings.csv'), columns=rating_cols).to_pandas()
79 |         items = dt.fread(join(folder, 'movies.csv'), columns=item_cols).to_pandas()
80 |         tags = dt.fread(join(folder, 'tags.csv'), columns=tag_cols).to_pandas()
81 |         links = dt.fread(join(folder, 'links.csv'), columns=link_cols).to_pandas()
82 |         return ratings, items, tags, links
83 | 
84 |     @staticmethod
85 |     def _read_genome(folder: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
86 |         genome_tags = dt.fread(join(folder, 'genome-tags.csv'), columns=tag_g_cols).to_pandas()
87 |         genome_scores = dt.fread(join(folder, 'genome-scores.csv'), columns=score_cols).to_pandas()
88 |         return genome_tags, genome_scores
89 | 
90 |     @staticmethod
91 |     def _read_10m(folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
92 |         ratings = dt.fread(join(folder, 'ratings.dat'), columns=rating_cols).to_pandas()
93 |         items = dt.fread(join(folder, 'movies.dat'), columns=item_cols, quotechar="").to_pandas()
94 |         tags = dt.fread(join(folder, 'tags.dat'), columns=tag_cols, quotechar="").to_pandas()
95 |         return ratings, items, tags
96 | 
97 |     @staticmethod
98 |     def _read_1m(folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
99 |         ratings = dt.fread(join(folder, 'ratings.dat'), columns=rating_cols).to_pandas()
100 |         users = dt.fread(join(folder, 'users.dat'), columns=user_cols).to_pandas()
101 |         items = dt.fread(join(folder, 'movies.dat'), columns=item_cols).to_pandas()
102 |         return ratings, users, items
103 | 
104 |     @staticmethod
105 |     def _read_100k(folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
106 |         ratings = dt.fread(join(folder, 'u.data'), columns=rating_cols).to_pandas()
107 |         users = dt.fread(join(folder, 'u.user'), columns=user_cols).to_pandas()
108 |         items = dt.fread(join(folder, 'u.item'), columns=genre_cols)
109 |         del items[:, 'video_release_date']
110 |         items = items.to_pandas()
111 |         return ratings, users, items
112 | 
113 |     @safe
114 |     def _download(self, path, dataset):
115 |         """
116 |         Download data from https://grouplens.org/datasets/movielens/
117 |         Available options include ml-20m, ml-latest-small and ml-latest;
118 |         the full list can be checked on the site.
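
        A sketch of the call flow (the cache folder name is illustrative):

        >>> ml = MovieLens('25m')   # first run calls _download('<data_folder>/ml-25m', 'ml-25m')
        >>> ml = MovieLens('25m')   # later runs find the folder and skip the download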
119 | 
120 |         :param path: where to save
121 |         :param dataset: dataset version
122 |         :return: None
123 |         """
124 |         self.logger.info('Downloading %s from grouplens...', dataset)
125 |         archive = dataset + '.zip'
126 |         url = f'http://files.grouplens.org/datasets/movielens/{archive}'
127 |         download_dataset(url, path + '.zip')
128 |         if dataset == 'ml-10m':
129 |             rename(join(self.data_folder, 'ml-10M100K'), path)
130 |             self.replace_separator(join(path, 'movies.dat'), '::', '\t')
131 |             self.replace_separator(join(path, 'ratings.dat'), '::', '\t')
132 |             self.replace_separator(join(path, 'tags.dat'), '::', '\t')
133 |         elif dataset == 'ml-1m':
134 |             self.replace_separator(join(path, 'movies.dat'), '::', '\t', 'ISO-8859-1')
135 |             self.replace_separator(join(path, 'ratings.dat'), '::', '\t')
136 |             self.replace_separator(join(path, 'users.dat'), '::', '\t')
137 | 
138 |     @staticmethod
139 |     def replace_separator(filepath: str, old: str, new: str, encoding: str = 'utf8'):
140 |         with open(filepath, 'r', encoding=encoding) as f:
141 |             newlines = []
142 |             for line in f.readlines():
143 |                 newlines.append(line.replace(old, new))
144 |         with open(filepath, 'w', encoding='utf8') as f:  # normalize to utf8 so later reads don't depend on locale
145 |             for line in newlines:
146 |                 f.write(line)
147 | 
--------------------------------------------------------------------------------
/rs_datasets/msd.py:
--------------------------------------------------------------------------------
1 | import os
2 | from os import rename
3 | from os.path import join
4 | 
5 | import datatable as dt
6 | import pandas as pd
7 | 
8 | from rs_datasets.data_loader import download_dataset, download_url
9 | from rs_datasets.generic_dataset import Dataset, safe
10 | 
11 | 
12 | class MillionSongDataset(Dataset):
13 |     def __init__(
14 |             self,
15 |             merge_kaggle_splits: bool = True,
16 |             drop_mismatches: bool = True,
17 |             path: str = None
18 |     ):
19 |         """
20 |         :param merge_kaggle_splits:
21 |             In the MSD Challenge on [Kaggle](https://www.kaggle.com/c/msdchallenge) there were
22 |             public and private parts. By default they are merged together; set
23 |             `merge_kaggle_splits` to `False` to keep them separate.
24 |         :param drop_mismatches:
25 |             There is a [matching error](http://millionsongdataset.com/blog/12-2-12-fixing-matching-errors/)
26 |             between track ids and song ids in MSD. It shouldn't matter unless you use audio features,
27 |             and these items are removed by default.
28 |         :param path: where to read dataset from or where to download to.
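
        Example (a sketch; the first call downloads several large files):

        >>> msd = MillionSongDataset()                      # merged splits, mismatches dropped
        >>> list(msd.train.columns)
        ['user_id', 'item_id', 'play_count']
        >>> raw = MillionSongDataset(merge_kaggle_splits=False)
        >>> raw.val_visible.head()                          # visible half of the Kaggle validation split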
29 | """ 30 | super().__init__(path) 31 | folder = join(self.data_folder, 'msd') 32 | if not os.path.exists(folder): 33 | self._download(folder) 34 | 35 | try_cache = merge_kaggle_splits and drop_mismatches 36 | processed = join(folder, 'clean') 37 | if try_cache and os.path.exists(processed): 38 | self.train = dt.fread(join(processed, 'train.csv')).to_pandas() 39 | self.val = dt.fread(join(processed, 'val.csv')).to_pandas() 40 | self.test = dt.fread(join(processed, 'test.csv')).to_pandas() 41 | else: 42 | eval_folder = join(folder, 'evaluation') 43 | self.train = self._read_triplets(join(folder, 44 | 'train_triplets.txt')) 45 | val_vis = self._read_triplets(join(eval_folder, 46 | 'year1_valid_triplets_visible.txt')) 47 | val_hid = self._read_triplets(join(eval_folder, 48 | 'year1_valid_triplets_hidden.txt')) 49 | test_vis = self._read_triplets(join(eval_folder, 50 | 'year1_test_triplets_visible.txt')) 51 | test_hid = self._read_triplets(join(eval_folder, 52 | 'year1_test_triplets_hidden.txt')) 53 | if drop_mismatches: 54 | mismatches = self._read_mismatches(folder) 55 | mismatches = set(mismatches.item_id) 56 | self.train = self._drop_mismatches(self.train, mismatches) 57 | val_vis = self._drop_mismatches(val_vis, mismatches) 58 | val_hid = self._drop_mismatches(val_hid, mismatches) 59 | test_vis = self._drop_mismatches(test_vis, mismatches) 60 | test_hid = self._drop_mismatches(test_hid, mismatches) 61 | 62 | if merge_kaggle_splits: 63 | self.val = pd.concat([val_vis, val_hid], ignore_index=True) 64 | self.test = pd.concat([test_vis, test_hid], ignore_index=True) 65 | else: 66 | self.val_visible = val_vis 67 | self.val_hidden = val_hid 68 | self.test_visible = test_vis 69 | self.test_hidden = test_hid 70 | 71 | if try_cache and not os.path.exists(processed): 72 | os.mkdir(processed) 73 | self.train.to_csv(join(processed, 'train.csv'), index=False) 74 | self.val.to_csv(join(processed, 'val.csv'), index=False) 75 | self.test.to_csv(join(processed, 'test.csv'), index=False) 76 | 77 | @staticmethod 78 | def _read_triplets(path): 79 | return dt.fread( 80 | path, 81 | columns=['user_id', 'item_id', 'play_count'] 82 | ).to_pandas().dropna() 83 | 84 | @staticmethod 85 | def _read_mismatches(path): 86 | name = 'sid_mismatches.txt' 87 | file = join(path, name) 88 | mismatches = [] 89 | with open(file) as f: 90 | for line in f.readlines(): 91 | song, track = line[ 92 | line.find('<') + 1: line.find('>')].split(' ') 93 | mismatches.append([song, track]) 94 | return pd.DataFrame(mismatches, columns=['item_id', 'track_id']) 95 | 96 | @staticmethod 97 | def _drop_mismatches(df, mismatches): 98 | return df[~df.item_id.isin(mismatches)] 99 | 100 | @safe 101 | def _download(self, path): 102 | """ 103 | Downloads train triplets, MSD Challenge Kaggle data 104 | (http://millionsongdataset.com/challenge/) 105 | and a list of matching errors 106 | http://millionsongdataset.com/blog/12-2-12-fixing-matching-errors/ 107 | 108 | :param path: path to save 109 | :return: None 110 | """ 111 | self.logger.info('Getting Million Song Dataset...') 112 | self.logger.info('Downloading Echo Nest Taste Subprofile train data...') 113 | base_url = 'http://millionsongdataset.com/sites/default/files/challenge/' 114 | 115 | download_dataset( 116 | base_url + 'train_triplets.txt.zip', 117 | join(self.data_folder, 'train.zip') 118 | ) 119 | rename(join(self.data_folder, 'train'), path) 120 | 121 | self.logger.info('Downloading evaluation data for MSD Challenge...') 122 | download_dataset( 123 | base_url + 
'EvalDataYear1MSDWebsite.zip', 124 | join(path, 'eval.zip') 125 | ) 126 | rename( 127 | join(path, 'EvalDataYear1MSDWebsite'), 128 | join(path, 'evaluation') 129 | ) 130 | 131 | self.logger.info('Downloading list of matching errors...') 132 | url = 'http://millionsongdataset.com/sites/default/files/tasteprofile/sid_mismatches.txt' 133 | download_url(url, join(path, 'sid_mismatches.txt')) 134 | -------------------------------------------------------------------------------- /rs_datasets/netflix.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from os import mkdir, remove, rename 3 | from os.path import exists, join 4 | 5 | import datatable as dt 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | from rs_datasets.data_loader import download_dataset, extract, rm_if_exists 10 | from rs_datasets.generic_dataset import Dataset, safe 11 | 12 | 13 | class Netflix(Dataset): 14 | def __init__(self, path: str = None): 15 | """ 16 | :param path: where to read dataset from or where to download to. 17 | """ 18 | super().__init__(path) 19 | folder = join(self.data_folder, 'netflix') 20 | if not exists(folder): 21 | self._download(folder) 22 | self._save_clean(folder) 23 | self._read_clean(folder) 24 | 25 | def _read_clean(self, folder): 26 | path = join(folder, 'clean') 27 | self.movies = pd.read_csv(join(path, 'movies.csv'), sep='\t', 28 | names=['item_id', 'year', 'title'], 29 | dtype={'item_id': 'uint16', 30 | 'year': 'float32'}) 31 | test = dt.fread( 32 | join(path, 'test.csv'), 33 | columns=['item_id', 'user_id', 'timestamp'] 34 | ).to_pandas() 35 | test['timestamp'] = pd.to_datetime(test['timestamp']) 36 | test['user_id'] = test['user_id'].astype('category') 37 | test['item_id'] = test['item_id'].astype('category') 38 | self.test = test 39 | 40 | if exists(join(path, 'train.parquet')): 41 | self.train = pd.read_parquet(join(path, 'train.parquet')) 42 | else: 43 | train = dt.fread( 44 | join(path, 'train.csv'), 45 | columns=['item_id', 'user_id', 'rating', 'timestamp'] 46 | ).to_pandas() 47 | train['timestamp'] = pd.to_datetime(train['timestamp']) 48 | train['rating'] = train['rating'].astype('uint8') 49 | train['user_id'] = train['user_id'].astype('category') 50 | train['item_id'] = train['item_id'].astype('category') 51 | self.train = train 52 | self.train.to_parquet(join(path, 'train.parquet')) 53 | remove(join(path, 'train.csv')) 54 | 55 | def _save_clean(self, raw): 56 | clean = join(raw, 'clean') 57 | mkdir(clean) 58 | self._fix_movies(raw, clean) 59 | self._fix_train(raw, clean) 60 | self._fix_test(raw, clean) 61 | 62 | @staticmethod 63 | def _fix_test(raw, clean): 64 | dest = open(join(clean, 'test.csv'), 'w') 65 | with open(join(raw, 'qualifying.txt')) as source: 66 | for line in source: 67 | if line[-2] == ':': 68 | movie_id = line[:-2] + ',' 69 | else: 70 | dest.write(movie_id + line) 71 | dest.close() 72 | 73 | def _fix_train(self, raw, clean): 74 | self.logger.info('Parsing train files') 75 | folder = join(raw, 'training_set') 76 | files = glob(join(folder, '*.txt')) 77 | dest = open(join(clean, 'train.csv'), 'w') 78 | for file in tqdm(files): 79 | with open(file) as source: 80 | for line in source: 81 | if line[-2] == ':': 82 | movie_id = line[:-2] + ',' 83 | else: 84 | dest.write(movie_id + line) 85 | dest.close() 86 | 87 | @staticmethod 88 | def _fix_movies(raw, clean): 89 | """ 90 | Comma separator also appears in movie titles, for example: 91 | `72,1974,At Home Among Strangers, A Stranger Among His Own` 92 | 
Separator is changed to tabulation for easy parsing.
93 |         """
94 |         file = join(raw, 'movie_titles.txt')
95 |         dest = open(join(clean, 'movies.csv'), 'w')
96 |         with open(file, encoding='ISO-8859-1') as f:
97 |             for line in f.readlines():
98 |                 first = line.find(',')
99 |                 second = first + 5  # the year field is always 4 characters wide
100 |                 m_id = line[:first]
101 |                 year = line[first + 1:second]
102 |                 title = line[second + 1:]
103 |                 dest.write('\t'.join([m_id, year, title]) + '\n')
104 |         dest.close()
105 | 
106 |     @safe
107 |     def _download(self, path):
108 |         self.logger.info('Downloading Netflix Prize dataset...')
109 |         url = 'https://archive.org/download/nf_prize_dataset.tar/nf_prize_dataset.tar.gz'
110 |         download_dataset(url, join(self.data_folder, 'netflix.tar.gz'))
111 |         rename(join(self.data_folder, 'download'), path)
112 |         archive = join(path, 'training_set.tar')
113 |         extract(archive)
114 |         rm_if_exists(archive)
115 | 
--------------------------------------------------------------------------------
/rs_datasets/rekko.py:
--------------------------------------------------------------------------------
1 | import json
2 | from os import rename
3 | from os.path import join, exists
4 | 
5 | import datatable as dt
6 | 
7 | from rs_datasets.data_loader import download_dataset
8 | from rs_datasets.generic_dataset import Dataset, safe
9 | 
10 | 
11 | class Rekko(Dataset):
12 |     def __init__(self, path: str = None):
13 |         super().__init__(path)
14 |         folder = join(self.data_folder, 'rekko')
15 |         if not exists(folder):
16 |             self._download(folder)
17 | 
18 |         self.transactions = dt.fread(
19 |             join(folder, 'transactions.csv'),
20 |             columns=[
21 |                 'item_id', 'user_id', 'consumption_mode', 'ts',
22 |                 'watched_time', 'device_type', 'device_manufacturer'
23 |             ],
24 |         ).to_pandas()
25 | 
26 |         self.ratings = dt.fread(
27 |             join(folder, 'ratings.csv'),
28 |             columns=['user_id', 'item_id', 'rating', 'ts']
29 |         ).to_pandas()
30 | 
31 |         self.bookmarks = dt.fread(
32 |             join(folder, 'bookmarks.csv'),
33 |             columns=['user_id', 'item_id', 'ts']
34 |         ).to_pandas()
35 | 
36 |         with open(join(folder, 'catalogue.json')) as json_file:
37 |             self.catalogue = json.load(json_file)
38 | 
39 |     @safe
40 |     def _download(self, path):
41 |         self.logger.info('Downloading rekko challenge dataset...')
42 |         archive = 'rekko_challenge_rekko_challenge_2019.zip'
43 |         url = f'https://boosters.pro/api/ch/files/pub/{archive}'
44 |         download_dataset(url, join(self.data_folder, 'rekko.zip'))
45 |         rename(join(self.data_folder, 'rekko'), path)
46 | 
--------------------------------------------------------------------------------
/rs_datasets/retail_rocket.py:
--------------------------------------------------------------------------------
1 | from os.path import join, exists
2 | 
3 | import datatable as dt
4 | import pandas as pd
5 | 
6 | from rs_datasets.generic_dataset import Dataset, safe
7 | 
8 | 
9 | class RetailRocket(Dataset):
10 |     def __init__(self, path: str = None):
11 |         super().__init__(path)
12 |         folder = join(self.data_folder, 'retail_rocket')
13 |         if not exists(folder):
14 |             self._download(folder)
15 | 
16 |         self.category_tree = dt.fread(
17 |             join(folder, 'category_tree.csv'),
18 |             columns=[
19 |                 'category_id', 'parent_id'
20 |             ],
21 |         ).to_pandas()
22 | 
23 |         self.log = dt.fread(
24 |             join(folder, 'events.csv'),
25 |             columns=[
26 |                 'ts', 'user_id', 'event', 'item_id', 'transaction_id'
27 |             ],
28 |         ).to_pandas()
29 | 
30 |         items1 = dt.fread(
31 |             join(folder, 'item_properties_part1.csv'),
32 |             columns=[
33 |                 'ts', 'item_id', 'property', 'value'
34 |             ],
35 |         ).to_pandas()
36 | 
37 |         items2 = dt.fread(
38 |             join(folder, 'item_properties_part2.csv'),
39 |             columns=[
40 |                 'ts', 'item_id', 'property', 'value'
41 |             ],
42 |         ).to_pandas()
43 | 
44 |         # DataFrame.append was removed in pandas 2.0, concatenate the two property files instead
45 |         self.items = pd.concat([items1, items2], ignore_index=True)
46 | 
47 |     @safe
48 |     def _download(self, path):
49 |         from kaggle.api.kaggle_api_extended import KaggleApi
50 |         self.logger.info('Downloading Retail Rocket dataset...')
51 |         api = KaggleApi()
52 |         api.authenticate()
53 |         api.dataset_download_files('retailrocket/ecommerce-dataset', path, unzip=True, quiet=False)
54 | 
--------------------------------------------------------------------------------
/rs_datasets/steam.py:
--------------------------------------------------------------------------------
1 | from os.path import join, exists
2 | 
3 | import datatable as dt
4 | 
5 | from rs_datasets.generic_dataset import Dataset, safe
6 | 
7 | 
8 | class Steam(Dataset):
9 |     def __init__(self, path: str = None):
10 |         super().__init__(path)
11 |         folder = join(self.data_folder, 'steam')
12 |         if not exists(folder):
13 |             self._download(folder)
14 | 
15 |         self.data = dt.fread(
16 |             join(folder, 'steam-200k.csv'),
17 |             columns=[
18 |                 'user_id', 'game', 'behavior', 'value', 'c'
19 |             ],
20 |         ).to_pandas()
21 |         self.data = self.data.drop('c', axis=1)  # drop the unused constant column
22 | 
23 |     @safe
24 |     def _download(self, path):
25 |         from kaggle.api.kaggle_api_extended import KaggleApi
26 |         self.logger.info('Downloading steam dataset...')
27 |         api = KaggleApi()
28 |         api.authenticate()
29 |         api.dataset_download_files('tamber/steam-video-games', path, unzip=True)
30 | 
--------------------------------------------------------------------------------
/rs_datasets/yoochoose.py:
--------------------------------------------------------------------------------
1 | from os.path import join, exists
2 | 
3 | import datatable as dt
4 | 
5 | from rs_datasets.data_loader import download_dataset
6 | from rs_datasets.generic_dataset import Dataset, safe
7 | 
8 | 
9 | class YooChoose(Dataset):
10 |     def __init__(self, path: str = None):
11 |         """
12 |         :param path: folder to download the dataset to
13 |             if it does not already contain the dataset files.
14 |             If the files are found, they are loaded as is.
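
        A minimal sketch of intended use (attribute names as defined below):

        >>> yc = YooChoose()
        >>> clicks = yc.log.groupby('session_id')['item_id'].agg(list)
        >>> buying_sessions = set(yc.purchases['session_id'])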
15 | """ 16 | super().__init__(path) 17 | folder = join(self.data_folder, 'yoochoose') 18 | if not exists(folder): 19 | self._download(folder) 20 | 21 | self.log = dt.fread( 22 | join(folder, 'yoochoose-clicks.dat'), 23 | columns=['session_id', 'ts', 'item_id', 'category'] 24 | ).to_pandas() 25 | 26 | self.purchases = dt.fread( 27 | join(folder, 'yoochoose-buys.dat'), 28 | columns=['session_id', 'ts', 'item_id', 'price', 'quantity'] 29 | ).to_pandas() 30 | 31 | self.test = dt.fread( 32 | join(folder, 'yoochoose-test.dat'), 33 | columns=['session_id', 'ts', 'item_id', 'category'] 34 | ).to_pandas() 35 | 36 | 37 | @safe 38 | def _download(self, path): 39 | self.logger.info('Downloading YooChoose Dataset...') 40 | url = 'https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z' 41 | download_dataset(url, join(self.data_folder, 'yoochoose.7z')) 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md', 'r') as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name='rs_datasets', 8 | version='0.5.1', 9 | author='Yan-Martin Tamm', 10 | author_email='darel142857@gmail.com', 11 | description='Tool for autodownloading recommendation systems datasets', 12 | long_description=long_description, 13 | long_description_content_type='text/markdown', 14 | url='https://github.com/Darel13712/rs_datasets', 15 | packages=setuptools.find_packages(), 16 | install_requires=[ 17 | 'datatable', 18 | 'pandas', 19 | 'gdown', 20 | 'pyarrow', 21 | 'tqdm', 22 | 'xlrd', 23 | 'kaggle', 24 | 'py7zr', 25 | 'openpyxl' 26 | ], 27 | classifiers=[ 28 | 'Programming Language :: Python :: 3', 29 | 'License :: OSI Approved :: MIT License', 30 | 'Operating System :: OS Independent', 31 | ], 32 | python_requires='>=3.6', 33 | ) 34 | --------------------------------------------------------------------------------