├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── pages.yml
    │   ├── release.yml
    │   └── test.yml
├── .gitignore
├── .yardopts
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── Steepfile
├── doc
    └── text
    │   └── news.md
├── example
    ├── aozora-bunko.rb
    ├── diamonds.rb
    ├── e-stat-japan.rb
    ├── fuel-economy.rb
    ├── house-of-councillor.rb
    ├── house-of-representative.rb
    ├── iris.rb
    ├── mnist.rb
    ├── nagoya-university-conversation-corpus.rb
    ├── ptb.rb
    ├── quora-duplicate-question-pair.rb
    ├── wikipedia-kyoto-japanese-english.rb
    └── wine.rb
├── lib
    ├── datasets.rb
    └── datasets
    │   ├── adult.rb
    │   ├── afinn.rb
    │   ├── aozora-bunko.rb
    │   ├── cache-path.rb
    │   ├── california-housing.rb
    │   ├── cifar.rb
    │   ├── cldr-plurals.rb
    │   ├── communities.rb
    │   ├── dataset.rb
    │   ├── diamonds.rb
    │   ├── dictionary.rb
    │   ├── downloader.rb
    │   ├── e-stat-japan.rb
    │   ├── error.rb
    │   ├── fashion-mnist.rb
    │   ├── fuel-economy.rb
    │   ├── geolonia.rb
    │   ├── ggplot2-dataset.rb
    │   ├── hepatitis.rb
    │   ├── house-of-councillor.rb
    │   ├── house-of-representative.rb
    │   ├── iris.rb
    │   ├── ita-corpus.rb
    │   ├── japanese-date-parser.rb
    │   ├── kuzushiji-mnist.rb
    │   ├── lazy.rb
    │   ├── libsvm-dataset-list.rb
    │   ├── libsvm.rb
    │   ├── license.rb
    │   ├── livedoor-news.rb
    │   ├── metadata.rb
    │   ├── mnist.rb
    │   ├── mushroom.rb
    │   ├── nagoya-university-conversation-corpus.rb
    │   ├── penguins.rb
    │   ├── penn-treebank.rb
    │   ├── pmjt-dataset-list.rb
    │   ├── postal-code-japan.rb
    │   ├── quora-duplicate-question-pair.rb
    │   ├── rdataset.rb
    │   ├── seaborn.rb
    │   ├── sudachi-synonym-dictionary.rb
    │   ├── table.rb
    │   ├── tar-gz-readable.rb
    │   ├── version.rb
    │   ├── wikipedia-kyoto-japanese-english.rb
    │   ├── wikipedia.rb
    │   ├── wine.rb
    │   └── zip-extractor.rb
├── red-datasets.gemspec
└── test
    ├── helper.rb
    ├── japanese-date-parser-test.rb
    ├── run-test.rb
    ├── test-adult.rb
    ├── test-afinn.rb
    ├── test-aozora-bunko.rb
    ├── test-california-housing.rb
    ├── test-cifar.rb
    ├── test-cldr-plurals.rb
    ├── test-communities.rb
    ├── test-dataset.rb
    ├── test-diamonds.rb
    ├── test-dictionary.rb
    ├── test-downloader.rb
    ├── test-e-stat-japan.rb
    ├── test-fashion-mnist.rb
    ├── test-fuel-economy.rb
    ├── test-geolonia.rb
    ├── test-hepatitis.rb
    ├── test-house-of-councillor.rb
    ├── test-house-of-representative.rb
    ├── test-iris.rb
    ├── test-ita-corpus.rb
    ├── test-kuzushiji-mnist.rb
    ├── test-libsvm-dataset-list.rb
    ├── test-libsvm.rb
    ├── test-license.rb
    ├── test-livedoor-news.rb
    ├── test-metadata.rb
    ├── test-mnist.rb
    ├── test-mushroom.rb
    ├── test-nagoya-university-conversation-corpus.rb
    ├── test-penguins.rb
    ├── test-penn-treebank.rb
    ├── test-pmjt-dataset-list.rb
    ├── test-postal-code-japan.rb
    ├── test-quora-duplicate-question-pair.rb
    ├── test-rdataset.rb
    ├── test-seaborn.rb
    ├── test-sudachi-synonym-dictionary.rb
    ├── test-table.rb
    ├── test-wikipedia-kyoto-japanese-english.rb
    ├── test-wikipedia.rb
    └── test-wine.rb


/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "github-actions"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "weekly"
7 | 


--------------------------------------------------------------------------------
/.github/workflows/pages.yml:
--------------------------------------------------------------------------------
 1 | name: Pages
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - "master"
 7 |   workflow_dispatch:
 8 | 
 9 | permissions:
10 |   contents: read
11 |   pages: write
12 |   id-token: write
13 | 
14 | concurrency:
15 |   group: "pages"
16 |   cancel-in-progress: true
17 | 
18 | jobs:
19 |   build:
20 |     runs-on: ubuntu-latest
21 |     env:
22 |       # We can invalidate the current cache by updating this.
23 |       CACHE_VERSION: "2022-10-21"
24 |     steps:
25 |       - uses: actions/checkout@v4
26 |       - uses: actions/configure-pages@v5
27 |       - uses: ruby/setup-ruby@v1
28 |         with:
29 |           ruby-version: ruby
30 |       - uses: actions/cache@v4
31 |         with:
32 |           path: |
33 |             ~/.cache/red-datasets
34 |           key: ${{ env.CACHE_VERSION }}-pages-${{ hashFiles('lib/**') }}
35 |           restore-keys: |
36 |             ${{ env.CACHE_VERSION }}-pages-
37 |       - name: Install dependencies
38 |         run: |
39 |           bundle install
40 |       - name: Generate
41 |         run: |
42 |           bundle exec rake pages
43 |       - uses: actions/upload-pages-artifact@v3
44 | 
45 |   deploy:
46 |     environment:
47 |       name: github-pages
48 |       url: ${{ steps.deployment.outputs.page_url }}
49 |     runs-on: ubuntu-latest
50 |     needs: build
51 |     steps:
52 |       - uses: actions/deploy-pages@v4
53 |         id: deployment
54 | 
55 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | on:
 3 |   push:
 4 |     tags:
 5 |       - "*"
 6 | jobs:
 7 |   github:
 8 |     name: GitHub
 9 |     runs-on: ubuntu-latest
10 |     timeout-minutes: 10
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - name: Extract release note
14 |         run: |
15 |           ruby \
16 |             -e 'print("## Red Datasets "); \
17 |                 puts(ARGF.read.split(/^## /)[1]. \
18 |                        gsub(/ {.+?}/, ""). \
19 |                        gsub(/\[(.+?)\]\[.+?\]/) {$1})' \
20 |             doc/text/news.md > release-note.md
21 |       - name: Upload to release
22 |         run: |
23 |           title=$(head -n1 release-note.md | sed -e 's/^## //')
24 |           tail -n +2 release-note.md > release-note-without-version.md
25 |           gh release create ${GITHUB_REF_NAME} \
26 |             --discussion-category Announcements \
27 |             --notes-file release-note-without-version.md \
28 |             --title "${title}"
29 |         env:
30 |           GH_TOKEN: ${{ github.token }}
31 |   rubygems:
32 |     name: RubyGems
33 |     runs-on: ubuntu-latest
34 |     timeout-minutes: 10
35 |     permissions:
36 |       id-token: write
37 |     environment: release
38 |     steps:
39 |       - uses: actions/checkout@v4
40 |       - uses: ruby/setup-ruby@v1
41 |         with:
42 |           ruby-version: ruby
43 |           bundler-cache: true
44 |       - uses: rubygems/configure-rubygems-credentials@v1.0.0
45 |       - name: Push gems
46 |         run: |
47 |           bundle exec rake release:rubygem_push
48 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on:
 4 |   push:
 5 |   pull_request:
 6 |   schedule:
 7 |     - cron: |
 8 |         0 0 * * 0
 9 | 
10 | jobs:
11 |   test:
12 |     name: "Ruby ${{ matrix.ruby-version }}: ${{ matrix.runs-on }}"
13 |     strategy:
14 |       # To avoid high frequency datasets download in a short time.
15 |       max-parallel: 1
16 |       fail-fast: false
17 |       matrix:
18 |         ruby-version:
19 |           - "3.2"
20 |           - "3.3"
21 |           - "3.4"
22 |         runs-on:
23 |           - macos-latest
24 |           - ubuntu-latest
25 |           - windows-latest
26 |     runs-on: ${{ matrix.runs-on }}
27 |     env:
28 |       # We can invalidate the current cache by updating this.
29 |       CACHE_VERSION: "2024-09-01"
30 |     steps:
31 |       - uses: actions/checkout@v4
32 |       - uses: ruby/setup-ruby@v1
33 |         with:
34 |           ruby-version: ${{ matrix.ruby-version }}
35 |       - uses: actions/cache@v4
36 |         if: |
37 |           runner.os == 'Linux'
38 |         with:
39 |           path: |
40 |             ~/.cache/red-datasets
41 |           key: ${{ env.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('lib/**') }}
42 |           restore-keys: |
43 |             ${{ env.CACHE_VERSION }}-${{ runner.os }}-
44 |       - uses: actions/cache@v4
45 |         if: |
46 |           runner.os == 'macOS'
47 |         with:
48 |           path: |
49 |             ~/Library/Caches/red-datasets
50 |           key: ${{ env.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('lib/**') }}
51 |           restore-keys: |
52 |             ${{ env.CACHE_VERSION }}-${{ runner.os }}-
53 |       - uses: actions/cache@v4
54 |         if: |
55 |           runner.os == 'Windows'
56 |         with:
57 |           path: |
58 |             ~/AppData/Local/red-datasets
59 |           key: ${{ env.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('lib/**') }}
60 |           restore-keys: |
61 |             ${{ env.CACHE_VERSION }}-${{ runner.os }}-
62 |       - name: Install dependencies
63 |         run: |
64 |           bundle install
65 |       - name: Test
66 |         env:
67 |           GH_TOKEN: ${{ github.token }}
68 |         run: |
69 |           bundle exec rake
70 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /Gemfile.lock
2 | /_site/
3 | /pkg/
4 | 


--------------------------------------------------------------------------------
/.yardopts:
--------------------------------------------------------------------------------
1 | --output-dir doc/reference/en
2 | --markup markdown
3 | --markup-provider kramdown
4 | lib/**/*.rb
5 | -
6 | doc/text/**/*
7 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
 1 | # -*- ruby -*-
 2 | 
 3 | source "https://rubygems.org/"
 4 | 
 5 | gemspec
 6 | 
 7 | # add steep and typeprof to development dependencies
 8 | group :development do
 9 |   gem "steep", require: false
10 |   gem "typeprof"
11 | end
12 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2017 Kouhei Sutou <kou@clear-code.com>
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Red Datasets
  2 | 
  3 | [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
  4 | 
  5 | ## Description
  6 | 
  7 | Red Datasets provides classes that provide common datasets such as iris dataset.
  8 | 
  9 | You can use datasets easily because you can access each dataset with multiple ways such as `#each` and Apache Arrow Record Batch.
 10 | 
 11 | ## Install
 12 | 
 13 | ```console
 14 | % gem install red-datasets
 15 | ```
 16 | 
 17 | ## Available datasets
 18 | 
 19 | * Adult Dataset
 20 | * Aozora Bunko
 21 | * California Housing
 22 | * CIFAR-10 Dataset
 23 | * CIFAR-100 Dataset
 24 | * CLDR language plural rules
 25 | * Communities and crime
 26 | * Diamonds Dataset
 27 | * E-Stat Japan
 28 | * Fashion-MNIST
 29 | * Fuel Economy Dataset
 30 | * Geolonia Japanese Addresses
 31 | * Hepatitis
 32 | * House of Councillors of Japan
 33 | * House of Representatives of Japan
 34 | * Iris Dataset
 35 | * Libsvm
 36 | * MNIST database
 37 | * Mushroom
 38 | * Penguins
 39 | * The Penn Treebank Project
 40 | * PMJT - Pre-Modern Japanese Text dataset list
 41 | * Postal Codes in Japan
 42 | * Rdatasets
 43 | * Seaborn
 44 | * Sudachi Synonym Dictionary
 45 | * Wikipedia
 46 | * Wine Dataset
 47 | 
 48 | ## Usage
 49 | 
 50 | Here is an example to access [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) by `#each`  or `Table#to_h` or `Table#fetch_values`.
 51 | 
 52 | ```ruby
 53 | require "datasets"
 54 | 
 55 | iris = Datasets::Iris.new
 56 | iris.each do |record|
 57 |   p [
 58 |      record.sepal_length,
 59 |      record.sepal_width,
 60 |      record.petal_length,
 61 |      record.petal_width,
 62 |      record.label,
 63 |   ]
 64 | end
 65 | # => [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
 66 | # => [4.9, 3.0, 1.4, 0.2, "Iris-setosa"]
 67 |   :
 68 | # => [7.0, 3.2, 4.7, 1.4, "Iris-versicolor"]
 69 | 
 70 | 
 71 | iris_hash = iris.to_table.to_h
 72 | p iris_hash[:sepal_length]
 73 | # => [5.1, 4.9, .. , 7.0, ..
 74 | p iris_hash[:sepal_width]
 75 | # => [3.5, 3.0, .. , 3.2, ..
 76 | p iris_hash[:petal_length]
 77 | # => [1.4, 1.4, .. , 4.7, ..
 78 | p iris_hash[:petal_width]
 79 | # => [0.2, 0.2, .. , 1.4, ..
 80 | p iris_hash[:label]
 81 | # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
 82 | 
 83 | 
 84 | iris_table = iris.to_table
 85 | p iris_table.fetch_values(:sepal_length, :sepal_width, :petal_length, :petal_width).transpose
 86 | # => [[5.1, 3.5, 1.4, 0.2],
 87 |       [4.9, 3.0, 1.4, 0.2],
 88 |       :
 89 |       [7.0, 3.2, 4.7, 1.4],
 90 |       :
 91 | 
 92 | p iris_table[:label]
 93 | # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
 94 | ```
 95 | 
 96 | 
 97 | Here is an example to access [The CIFAR-10/100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html) by `#each`:
 98 | 
 99 | **CIFAR-10**
100 | 
101 | ```ruby
102 | require "datasets"
103 | 
104 | cifar = Datasets::CIFAR.new(n_classes: 10, type: :train)
105 | cifar.metadata
106 | #=> #<struct Datasets::Metadata name="CIFAR-10", url="https://www.cs.toronto.edu/~kriz/cifar.html", licenses=nil, description="CIFAR-10 is 32x32 image dataset">licenses=nil, description="CIFAR-10 is 32x32 image datasets">
107 | cifar.each do |record|
108 |   p record.pixels
109 |   # => [59, 43, 50, 68, 98, 119, 139, 145, 149, 143, .....]
110 |   p record.label
111 |   # => 6
112 | end
113 | ```
114 | 
115 | **CIFAR-100**
116 | 
117 | ```ruby
118 | require "datasets"
119 | 
120 | cifar = Datasets::CIFAR.new(n_classes: 100, type: :test)
121 | cifar.metadata
122 | #=> #<struct Datasets::Metadata name="CIFAR-100", url="https://www.cs.toronto.edu/~kriz/cifar.html", licenses=nil, description="CIFAR-100 is 32x32 image dataset">
123 | cifar.each do |record|
124 |   p record.pixels
125 |   #=> [199, 196, 195, 195, 196, 197, 198, 198, 199, .....]
126 |   p record.coarse_label
127 |   #=> 10
128 |   p record.fine_label
129 |   #=> 49
130 | end
131 | ```
132 | 
133 | **MNIST**
134 | 
135 | ```ruby
136 | require "datasets"
137 | 
138 | mnist = Datasets::MNIST.new(type: :train)
139 | mnist.metadata
140 | #=> #<struct Datasets::Metadata name="MNIST-train", url="http://yann.lecun.com/exdb/mnist/", licenses=nil, description="a training set of 60,000 examples">
141 | 
142 | mnist.each do |record|
143 |   p record.pixels
144 |   # => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, .....]
145 |   p record.label
146 |   # => 5
147 | end
148 | ```
149 | 
150 | ## NArray compatibility
151 | 
152 | * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
153 | 
154 | ## How to develop Red Datasets
155 | 1. Fork https://github.com/red-data-tools/red-datasets 
156 | 2. Create a feature branch from master
157 | 3. Develop in the feature branch
158 | 4. Pull request from the feature branch to https://github.com/red-data-tools/red-datasets
159 | 
160 | ## License
161 | 
162 | The MIT license. See `LICENSE.txt` for details.
163 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | # -*- ruby -*-
 2 | 
 3 | require "rubygems"
 4 | require "bundler/gem_helper"
 5 | 
 6 | base_dir = File.join(File.dirname(__FILE__))
 7 | 
 8 | helper = Bundler::GemHelper.new(base_dir)
 9 | def helper.version_tag
10 |   version
11 | end
12 | 
13 | helper.install
14 | spec = helper.gemspec
15 | 
16 | release_task = Rake.application["release"]
17 | # We use Trusted Publishing.
18 | release_task.prerequisites.delete("build")
19 | release_task.prerequisites.delete("release:rubygem_push")
20 | release_task_comment = release_task.comment
21 | if release_task_comment
22 |   release_task.clear_comments
23 |   release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
24 | end
25 | 
26 | task default: :test
27 | 
28 | desc "Run tests"
29 | task :test do
30 |   ruby("test/run-test.rb")
31 | end
32 | 
33 | desc "Generate an artifact for GitHub Pages"
34 | task :pages do
35 |   pages_dir = "_site"
36 |   rm_rf(pages_dir)
37 |   mkdir_p(pages_dir)
38 | 
39 |   require "cgi/util"
40 |   require_relative "lib/datasets/lazy"
41 |   File.open("#{pages_dir}/index.html", "w") do |index_html|
42 |     index_html.puts(<<-HTML)
43 | <!DOCTYPE html>
44 | <html>
45 |   <head>
46 |     <meta charset="UTF-8">
47 |     <title>Red Datasets</title>
48 |     <style>
49 |       table {
50 |         margin-left: 20vw;
51 |         min-width: 50%;
52 |       }
53 |       th {
54 |         font-size: 30px;
55 |         padding: 20px;
56 |       }
57 |       td {
58 |         border-bottom: 1px solid #D9DCE0;
59 |         padding: 20px;
60 |         font-weight: bold;
61 |       }
62 |     </style>
63 |   </head>
64 |   <body>
65 |     <section>
66 |       <h1>Red Datasets</h1>
67 |       <table>
68 |         <thead>
69 |           <tr><th>Available datasets</th></tr>
70 |         </thead>
71 |         <tbody>
72 |     HTML
73 |     Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
74 |       index_html.puts(<<-HTML)
75 |           <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
76 |       HTML
77 |     end
78 |     index_html.puts(<<-HTML)
79 |         </tbody>
80 |       </table>
81 |     </section>
82 |   </body>
83 | </html>
84 |     HTML
85 |   end
86 | end
87 | 


--------------------------------------------------------------------------------
/Steepfile:
--------------------------------------------------------------------------------
1 | D = Steep::Diagnostic
2 | 
3 | target :lib do
4 |   signature "sig"
5 |   check "lib" # Directory name to check
6 | 
7 |   configure_code_diagnostics(D::Ruby.lenient) # `lenient` diagnostics setting
8 | end
9 | 


--------------------------------------------------------------------------------
/example/aozora-bunko.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require 'datasets'
 4 | 
 5 | aozora = Datasets::AozoraBunko.new
 6 | book = aozora.first
 7 | p [
 8 |   book.title_id,
 9 |   book.title,
10 |   book.title_reading,
11 |   book.title_reading_collation,
12 |   book.subtitle,
13 |   book.subtitle_reading,
14 |   book.original_title,
15 |   book.first_appearance,
16 |   book.ndc_code,
17 |   book.syllabary_spelling_type,
18 |   book.copyrighted?,
19 |   book.published_date,
20 |   book.last_updated_date,
21 |   book.detail_url,
22 |   book.person_id,
23 |   book.person_family_name,
24 |   book.person_first_name,
25 |   book.person_family_name_reading,
26 |   book.person_first_name_reading,
27 |   book.person_family_name_reading_collation,
28 |   book.person_first_name_reading_collation,
29 |   book.person_family_name_romaji,
30 |   book.person_first_name_romaji,
31 |   book.person_type,
32 |   book.person_birthday,
33 |   book.person_date_of_death,
34 |   book.person_copyrighted?,
35 |   book.original_book_name1,
36 |   book.original_book_publisher_name1,
37 |   book.original_book_first_published_date1,
38 |   book.used_version_for_registration1,
39 |   book.used_version_for_proofreading1,
40 |   book.base_of_original_book_name1,
41 |   book.base_of_original_book_publisher_name1,
42 |   book.base_of_original_book_first_published_date1,
43 |   book.original_book_name2,
44 |   book.original_book_publisher_name2,
45 |   book.original_book_first_published_date2,
46 |   book.used_version_for_registration2,
47 |   book.used_version_for_proofreading2,
48 |   book.base_of_original_book_name2,
49 |   book.base_of_original_book_publisher_name2,
50 |   book.base_of_original_book_first_published_date2,
51 |   book.registered_person_name,
52 |   book.proofreader_name,
53 |   book.text_file_url,
54 |   book.last_text_file_updated_date,
55 |   book.text_file_character_encoding,
56 |   book.text_file_character_set,
57 |   book.text_file_updating_count,
58 |   book.html_file_url,
59 |   book.last_html_file_updated_date,
60 |   book.html_file_character_encoding,
61 |   book.html_file_character_set,
62 |   book.html_file_updating_count
63 | ]
64 | 
65 | # text API can read from text_file_url field's url
66 | p book.text
67 | #=> "ウェストミンスター寺院\r\nワシントン・アーヴィング..."
68 | 
69 | # html API can read from html_file_url field's url
70 | p book.html
71 | #=> "<?xml version=\"1.0\" encoding=\"Shift_JIS\"?>\r\n..."
72 | 
73 | # remove all cached files
74 | # aozora.clear_cache!
75 | 


--------------------------------------------------------------------------------
/example/diamonds.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | diamonds = Datasets::Diamonds.new
 6 | 
 7 | diamonds.each do |record|
 8 |   p [
 9 |      record.carat,
10 |      record.cut,
11 |      record.color,
12 |      record.clarity,
13 |      record.depth,
14 |      record.table,
15 |      record.price,
16 |      record.x,
17 |      record.y,
18 |      record.z,
19 |   ]
20 |   # [0.23, "Ideal", "E", "SI2", 61.5, 55, 326, 3.95, 3.98, 2.43]
21 |   # [0.21, "Premium", "E", "SI1", 59.8, 61, 326, 3.89, 3.84, 2.31]
22 |   # ...
23 | end
24 | 


--------------------------------------------------------------------------------
/example/e-stat-japan.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby -Ku
 2 | 
 3 | require 'datasets'
 4 | 
 5 | Datasets::EStatJapan.configure do |config|
 6 |   # put your App ID for e-Stat app_id
 7 |   # see detail at https://www.e-stat.go.jp/api/api-dev/how_to_use (Japanese only)
 8 |   config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'
 9 | end
10 | 
11 | estat = Datasets::EStatJapan::StatsData.new(
12 |   '0000020201', # Ａ　人口・世帯
13 |   hierarchy_selection: 'child',
14 |   skip_nil_column: true,
15 |   skip_nil_row: false,
16 |   categories: ['A1101'] # A1101_人口総数
17 | )
18 | 
19 | # prepare for clustering
20 | indices = []
21 | rows = []
22 | map_id_name = {}
23 | estat.each do |record|
24 |   # Select Hokkaido only
25 |   next unless record.id.to_s.start_with? '01'
26 |   indices << record.id
27 |   rows << record.values
28 |   map_id_name[record.id] = record.name
29 |   p record.name, rows
30 | end
31 | 


--------------------------------------------------------------------------------
/example/fuel-economy.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | fuel_economy = Datasets::FuelEconomy.new
 6 | 
 7 | fuel_economy.each do |record|
 8 |   p [
 9 |      record.manufacturer,
10 |      record.model,
11 |      record.displacement,
12 |      record.year,
13 |      record.n_cylinders,
14 |      record.transmission,
15 |      record.drive_train,
16 |      record.city_mpg,
17 |      record.highway_mpg,
18 |      record.fuel,
19 |      record.type,
20 |   ]
21 |   # ["audi", "a4", 1.8, 1999, 4, "auto(l5)", "f", 18, 29, "p", "compact"]
22 |   # ["audi", "a4", 1.8, 1999, 4, "manual(m5)", "f", 21, 29, "p", "compact"]
23 |   # ...
24 | end
25 | 


--------------------------------------------------------------------------------
/example/house-of-councillor.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | # Bill
 6 | house_of_councillor = Datasets::HouseOfCouncillor.new
 7 | house_of_councillor.each do |record|
 8 |   # Select promulgated after 2020
 9 |   next unless 2020 <= record.promulgated_on&.year.to_i
10 | 
11 |   p record.promulgated_on, record.values
12 | end
13 | 
14 | # In-House group
15 | house_of_councillor = Datasets::HouseOfCouncillor.new(type: :in_house_group)
16 | house_of_councillor.each do |record|
17 |   p record.values
18 | end
19 | 
20 | # Member
21 | house_of_councillor = Datasets::HouseOfCouncillor.new(type: :member)
22 | house_of_councillor.each do |record|
23 |   # Select using professional name
24 |   next if record.true_name.nil?
25 | 
26 |   p [
27 |       record.professional_name,
28 |       record.true_name,
29 |       record.professional_name_reading,
30 |     ]
31 | end
32 | 
33 | # Question
34 | house_of_councillor = Datasets::HouseOfCouncillor.new(type: :question)
35 | house_of_councillor.each do |record|
36 |   # Select number of submissions greater than 1
37 |   next unless 1 < record.number_of_submissions
38 | 
39 |   p record.number_of_submissions, record.values
40 | end
41 | 


--------------------------------------------------------------------------------
/example/house-of-representative.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | house_of_representative = Datasets::HouseOfRepresentative.new
 6 | house_of_representative.each do |record|
 7 |   # Select support of one hundred or more members and promulgated
 8 |   next unless 100 <= record.supporters_of_submitted_bill.size
 9 |   next if record.promulgated_on.nil?
10 | 
11 |   p [
12 |       record.supporters_of_submitted_bill.size,
13 |       record.promulgated_on,
14 |       record.title,
15 |     ]
16 | end
17 | 


--------------------------------------------------------------------------------
/example/iris.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | iris = Datasets::Iris.new
 6 | iris.each do |record|
 7 |   p [
 8 |      record.sepal_length,
 9 |      record.sepal_width,
10 |      record.petal_length,
11 |      record.petal_width,
12 |      record.label,
13 |   ]
14 |   # [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
15 |   # [7.0, 3.2, 4.7, 1.4, "Iris-versicolor"]
16 | end
17 | 


--------------------------------------------------------------------------------
/example/mnist.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | mnist = Datasets::MNIST.new(type: :train)
 6 | mnist.each do |record|
 7 |   p record.pixels
 8 |   # => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, .....]
 9 |   p record.label
10 |   # => 5
11 | end
12 | 


--------------------------------------------------------------------------------
/example/nagoya-university-conversation-corpus.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require 'datasets'
 4 | 
 5 | nagoya_university_conversation_corpus = Datasets::NagoyaUniversityConversationCorpus.new
 6 | 
 7 | nagoya_university_conversation_corpus.each do |data|
 8 |   data.sentences.each do |sentence|
 9 |     p [
10 |         sentence.participant_id,
11 |         sentence.content
12 |       ]
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------
/example/ptb.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | require "optparse"
 5 | 
 6 | params = ARGV.getopts("n:")
 7 | 
 8 | ptb = Datasets::PennTreebank.new(type: :train)
 9 | 
10 | if params["n"]
11 |   records = ptb.take(params["n"].to_i)
12 | else
13 |   records = ptb
14 | end
15 | 
16 | records.each {|record| puts record.word }
17 | 


--------------------------------------------------------------------------------
/example/quora-duplicate-question-pair.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | question_pair = Datasets::QuoraDuplicateQuestionPair.new
 6 | question_pair.each do |pair|
 7 |   p [
 8 |     pair.id,
 9 |     pair.first_question_id,
10 |     pair.second_question_id,
11 |     pair.first_question,
12 |     pair.second_question,
13 |     pair.duplicated?
14 |   ]
15 |   # [0, 1, 2, "What is the step by step guide to invest in share market in india?", "What is the step by step guide to invest in share market?", false]
16 |   # [1, 3, 4, "What is the story of Kohinoor (Koh-i-Noor) Diamond?", "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?", false]
17 | end
18 | 


--------------------------------------------------------------------------------
/example/wikipedia-kyoto-japanese-english.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | require "datasets"
 4 | 
 5 | wikipedia_kyoto_articles =
 6 |   Datasets::WikipediaKyotoJapaneseEnglish.new(type: :article)
 7 | wikipedia_kyoto_articles.each_with_index do |article, i|
 8 |   puts("#{i}: #{article.source}")
 9 |   article.contents.each do |content|
10 |     puts("  Japanese: #{content.japanese}")
11 |     puts("  English:  #{content.english}")
12 |   end
13 | end
14 | 
15 | wikipedia_kyoto_lexicon =
16 |  Datasets::WikipediaKyotoJapaneseEnglish.new(type: :lexicon)
17 | wikipedia_kyoto_lexicon.each do |record|
18 |  puts("  Japanese: #{record.japanese}")
19 |  puts("  English:  #{record.english}")
20 | end
21 | 


--------------------------------------------------------------------------------
/example/wine.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/ruby
 2 | 
 3 | require 'datasets'
 4 | 
 5 | wine = Datasets::Wine.new
 6 | wine.each do |record|
 7 |   p [
 8 |     record.label,
 9 |     record.alcohol,
10 |     record.malic_acid,
11 |     record.ash,
12 |     record.alcalinity_of_ash,
13 |     record.n_magnesiums,
14 |     record.total_phenols,
15 |     record.total_flavonoids,
16 |     record.total_nonflavanoid_phenols,
17 |     record.total_proanthocyanins,
18 |     record.color_intensity,
19 |     record.hue,
20 |     record.optical_nucleic_acid_concentration,
21 |     record.n_prolines
22 |   ]
23 | end
24 | 


--------------------------------------------------------------------------------
/lib/datasets.rb:
--------------------------------------------------------------------------------
1 | require_relative "datasets/lazy"
2 | Datasets::LAZY_LOADER.load_all
3 | 


--------------------------------------------------------------------------------
/lib/datasets/adult.rb:
--------------------------------------------------------------------------------
 1 | require "csv"
 2 | 
 3 | require_relative "dataset"
 4 | 
 5 | module Datasets
 6 |   class Adult < Dataset
 7 |     Record = Struct.new(
 8 |       :age,
 9 |       :work_class,
10 |       :final_weight,
11 |       :education,
12 |       :n_education_years,
13 |       :marital_status,
14 |       :occupation,
15 |       :relationship,
16 |       :race,
17 |       :sex,
18 |       :capital_gain,
19 |       :capital_loss,
20 |       :hours_per_week,
21 |       :native_country,
22 |       :label
23 |     )
24 | 
25 |     def initialize(type: :train)
26 |       unless [:train, :test].include?(type)
27 |         raise ArgumentError, 'Please set type :train or :test'
28 |       end
29 | 
30 |       super()
31 |       @type = type
32 |       @metadata.id = "adult-#{@type}"
33 |       @metadata.name = "Adult: #{@type}"
34 |       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult"
35 |       @metadata.licenses = ["CC-BY-4.0"]
36 |       @metadata.description = lambda do
37 |         read_names
38 |       end
39 |     end
40 | 
41 |     def each
42 |       return to_enum(__method__) unless block_given?
43 | 
44 |       open_data do |csv|
45 |         csv.each do |row|
46 |           next if row[0].nil?
47 |           record = Record.new(*row)
48 |           yield(record)
49 |         end
50 |       end
51 |     end
52 | 
53 |     private
54 |     def open_data
55 |       case @type
56 |       when :train
57 |         ext = "data"
58 |       when :test
59 |         ext = "test"
60 |       end
61 |       data_path = cache_dir_path + "adult-#{ext}.csv"
62 |       data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}"
63 |       download(data_path, data_url)
64 | 
65 |       options = {
66 |                  converters: [:numeric, lambda {|f| f.strip}],
67 |                  skip_lines: /\A\|/,
68 |       }
69 |       CSV.open(data_path, **options) do |csv|
70 |         yield(csv)
71 |       end
72 |     end
73 | 
74 |     def read_names
75 |       names_path = cache_dir_path + "adult.names"
76 |       names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names"
77 |       download(names_path, names_url)
78 |       names_path.read
79 |     end
80 |   end
81 | end
82 | 


--------------------------------------------------------------------------------
/lib/datasets/afinn.rb:
--------------------------------------------------------------------------------
 1 | require "csv"
 2 | require_relative "zip-extractor"
 3 | 
 4 | module Datasets
 5 |   class AFINN < Dataset
 6 |     Record = Struct.new(:word,
 7 |                         :valence)
 8 | 
 9 |     def initialize
10 |       super()
11 |       @metadata.id = "afinn"
12 |       @metadata.name = "AFINN"
13 |       @metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html"
14 |       @metadata.licenses = ["ODbL-1.0"]
15 |       @metadata.description = lambda do
16 |         extract_file("AFINN/AFINN-README.txt") do |input|
17 |           readme = input.read
18 |           readme.force_encoding("UTF-8")
19 |           readme.
20 |             gsub(/^AFINN-96:.*?\n\n/m, "").
21 |             gsub(/^In Python.*$/m, "").
22 |             strip
23 |         end
24 |       end
25 |     end
26 | 
27 |     def each
28 |       return to_enum(__method__) unless block_given?
29 | 
30 |       extract_file("AFINN/AFINN-111.txt") do |input|
31 |         csv = CSV.new(input, col_sep: "\t", converters: :numeric)
32 |         csv.each do |row|
33 |           yield(Record.new(*row))
34 |         end
35 |       end
36 |     end
37 | 
38 |     private
39 |     def extract_file(file_path, &block)
40 |       data_path = cache_dir_path + "imm6010.zip"
41 |       data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip"
42 |       download(data_path, data_url)
43 | 
44 |       extractor = ZipExtractor.new(data_path)
45 |       extractor.extract_file(file_path, &block)
46 |     end
47 |   end
48 | end
49 | 


--------------------------------------------------------------------------------
/lib/datasets/aozora-bunko.rb:
--------------------------------------------------------------------------------
  1 | require_relative 'dataset'
  2 | require_relative 'zip-extractor'
  3 | 
  4 | module Datasets
  5 |   # Dataset for AozoraBunko
  6 |   class AozoraBunko < Dataset
  7 |     Book = Struct.new(
  8 |       # 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL,
  9 |       :title_id,
 10 |       :title,
 11 |       :title_reading,
 12 |       :title_reading_collation,
 13 |       :subtitle,
 14 |       :subtitle_reading,
 15 |       :original_title,
 16 |       :first_appearance,
 17 |       :ndc_code, # 分類番号(日本十進分類法の番号)
 18 |       :syllabary_spelling_type,
 19 |       :copyrighted,
 20 |       :published_date,
 21 |       :last_updated_date,
 22 |       :detail_url,
 23 |       # 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ,
 24 |       :person_id,
 25 |       :person_family_name,
 26 |       :person_first_name,
 27 |       :person_family_name_reading,
 28 |       :person_first_name_reading,
 29 |       :person_family_name_reading_collation,
 30 |       :person_first_name_reading_collation,
 31 |       :person_family_name_romaji,
 32 |       :person_first_name_romaji,
 33 |       :person_type,
 34 |       :person_birthday,
 35 |       :person_date_of_death,
 36 |       :person_copyrighted,
 37 |       # 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1,
 38 |       :original_book_name1,
 39 |       :original_book_publisher_name1,
 40 |       :original_book_first_published_date1,
 41 |       :used_version_for_registration1,
 42 |       :used_version_for_proofreading1,
 43 |       :base_of_original_book_name1,
 44 |       :base_of_original_book_publisher_name1,
 45 |       :base_of_original_book_first_published_date1,
 46 |       # 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2,
 47 |       :original_book_name2,
 48 |       :original_book_publisher_name2,
 49 |       :original_book_first_published_date2,
 50 |       :used_version_for_registration2,
 51 |       :used_version_for_proofreading2,
 52 |       :base_of_original_book_name2,
 53 |       :base_of_original_book_publisher_name2,
 54 |       :base_of_original_book_first_published_date2,
 55 |       # 入力者,校正者,
 56 |       :registered_person_name,
 57 |       :proofreader_name,
 58 |       # テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数,
 59 |       :text_file_url,
 60 |       :last_text_file_updated_date,
 61 |       :text_file_character_encoding,
 62 |       :text_file_character_set,
 63 |       :text_file_updating_count,
 64 |       # XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数
 65 |       :html_file_url,
 66 |       :last_html_file_updated_date,
 67 |       :html_file_character_encoding,
 68 |       :html_file_character_set,
 69 |       :html_file_updating_count
 70 |     )
 71 | 
 72 |     class Book
 73 |       attr_writer :cache_path
 74 | 
 75 |       def initialize(*args)
 76 |         super
 77 |         @text = nil
 78 |         @html = nil
 79 |         @cache_path = nil
 80 |       end
 81 | 
 82 |       alias_method :copyrighted?, :copyrighted
 83 |       alias_method :person_copyrighted?, :person_copyrighted
 84 | 
 85 |       def text
 86 |         return @text unless @text.nil?
 87 |         return @text if text_file_url.nil? || text_file_url.empty?
 88 | 
 89 |         # when url is not zip file, it needs to open web page by brower and has to download
 90 |         # e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
 91 |         return @text unless text_file_url.end_with?('.zip')
 92 | 
 93 |         downloader = Downloader.new(text_file_url)
 94 |         downloader.download(text_file_output_path)
 95 | 
 96 |         @text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
 97 |           input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
 98 |         end
 99 | 
100 |         @text
101 |       end
102 | 
103 |       def html
104 |         return @html unless @html.nil?
105 |         return @html if html_file_url.nil? || html_file_url.empty?
106 | 
107 |         downloader = Downloader.new(html_file_url)
108 |         downloader.download(html_file_output_path)
109 |         @html = File.read(html_file_output_path).encode(Encoding::UTF_8,
110 |                                                         normalize_encoding(html_file_character_encoding))
111 | 
112 |         @html
113 |       end
114 | 
115 |       private
116 | 
117 |       def text_file_output_path
118 |         cache_base_dir + text_file_name
119 |       end
120 | 
121 |       def html_file_output_path
122 |         cache_base_dir + html_file_name
123 |       end
124 | 
125 |       def text_file_name
126 |         text_file_url.split('/').last
127 |       end
128 | 
129 |       def html_file_name
130 |         html_file_url.split('/').last
131 |       end
132 | 
133 |       def cache_base_dir
134 |         @cache_path.base_dir + title_id + person_id
135 |       end
136 | 
137 |       def normalize_encoding(encoding)
138 |         case encoding
139 |         when 'ShiftJIS'
140 |           Encoding::Shift_JIS
141 |         when 'UTF-8'
142 |           Encoding::UTF_8
143 |         else
144 |           encoding
145 |         end
146 |       end
147 |     end
148 | 
149 |     def initialize
150 |       super()
151 | 
152 |       @metadata.id = 'aozora-bunko'
153 |       @metadata.name = 'Aozora Bunko'
154 |       @metadata.url = 'https://www.aozora.gr.jp/'
155 |       @metadata.licenses = 'CC-BY-2.1-JP'
156 |       @metadata.description = <<~DESCRIPTION
157 |         Aozora Bunko is an activity to collect free electronic books that anyone can access
158 |         on the Internet like a library. The copyrighted works and the works that are said to be
159 |         "free to read" are available after being digitized in text and XHTML (some HTML) formats.
160 |       DESCRIPTION
161 |     end
162 | 
163 |     def each
164 |       return to_enum(__method__) unless block_given?
165 | 
166 |       open_data do |csv_file_stream|
167 |         text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # file has Byte Order Mark
168 | 
169 |         CSV.parse(text, headers: true) do |row|
170 |           %w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
171 |             row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
172 |           end
173 |           book = Book.new(*row.fields)
174 |           book.cache_path = cache_path
175 | 
176 |           yield(book)
177 |         end
178 |       end
179 |     end
180 | 
181 |     private
182 | 
183 |     def open_data(&block)
184 |       data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
185 |       data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
186 |       download(data_path, data_url)
187 |       ZipExtractor.new(data_path).extract_first_file do |input|
188 |         block.call(input)
189 |       end
190 |     end
191 | 
192 |     def normalize_boolean(column_value)
193 |       column_value == 'あり'
194 |     end
195 |   end
196 | end
197 | 


--------------------------------------------------------------------------------
/lib/datasets/cache-path.rb:
--------------------------------------------------------------------------------
 1 | module Datasets
 2 |   class CachePath
 3 |     def initialize(id)
 4 |       @id = id
 5 |     end
 6 | 
 7 |     def base_dir
 8 |       Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
 9 |     end
10 | 
11 |     def remove
12 |       FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
13 |     end
14 | 
15 |     private
16 | 
17 |     def system_cache_dir
18 |       case RUBY_PLATFORM
19 |       when /mswin/, /mingw/
20 |         ENV['LOCALAPPDATA'] || '~/AppData/Local'
21 |       when /darwin/
22 |         '~/Library/Caches'
23 |       else
24 |         ENV['XDG_CACHE_HOME'] || '~/.cache'
25 |       end
26 |     end
27 |   end
28 | end
29 | 


--------------------------------------------------------------------------------
/lib/datasets/california-housing.rb:
--------------------------------------------------------------------------------
 1 | require "csv"
 2 | require_relative 'zip-extractor'
 3 | 
 4 | module Datasets
 5 |   class CaliforniaHousing < Dataset
 6 |     Record = Struct.new(:median_house_value,
 7 |                         :median_income,
 8 |                         :housing_median_age,
 9 |                         :total_rooms,
10 |                         :total_bedrooms,
11 |                         :population,
12 |                         :households,
13 |                         :latitude,
14 |                         :longitude)
15 | 
16 |     def initialize
17 |       super()
18 |       @metadata.id = "california-housing"
19 |       @metadata.name = "California Housing"
20 |       @metadata.url = "http://lib.stat.cmu.edu/datasets/"
21 |       @metadata.licenses = ["CCO"]
22 |       @metadata.description = <<-DESCRIPTION
23 | Housing information from the 1990 census used in
24 | Pace, R. Kelley and Ronald Barry,
25 | "Sparse Spatial Autoregressions",
26 | Statistics and Probability Letters, 33 (1997) 291-297.
27 | Available from http://lib.stat.cmu.edu/datasets/.
28 |       DESCRIPTION
29 |     end
30 | 
31 |     def each
32 |       return to_enum(__method__) unless block_given?
33 | 
34 |       data_path = cache_dir_path + "houses.zip"
35 |       data_url = "http://lib.stat.cmu.edu/datasets/houses.zip"
36 |       file_name = "cadata.txt"
37 |       download(data_path, data_url)
38 |       open_data(data_path, file_name) do |input|
39 |         data = +""
40 |         input.each_line do |line|
41 |           next unless line.start_with?(" ")
42 |           data << line.lstrip.gsub(/ +/, ",")
43 |         end
44 |         options = {
45 |           converters: [:numeric],
46 |         }
47 |         CSV.parse(data, **options) do |row|
48 |           yield(Record.new(*row))
49 |         end
50 |       end
51 |     end
52 | 
53 |     private
54 |     def open_data(data_path, file_name)
55 |       ZipExtractor.new(data_path).extract_first_file do |input|
56 |         yield input
57 |       end
58 |     end
59 |   end
60 | end
61 | 


--------------------------------------------------------------------------------
/lib/datasets/cifar.rb:
--------------------------------------------------------------------------------
  1 | require_relative "tar-gz-readable"
  2 | require_relative "dataset"
  3 | 
  4 | module Datasets
  5 |   class CIFAR < Dataset
  6 |     include TarGzReadable
  7 | 
  8 |     module Pixelable
  9 |       def pixels
 10 |         data.unpack("C*")
 11 |       end
 12 | 
 13 |       def to_h
 14 |         hash = super
 15 |         hash[:pixels] = pixels
 16 |         hash
 17 |       end
 18 |     end
 19 | 
 20 |     class Record10 < Struct.new(:data, :label)
 21 |       include Pixelable
 22 |     end
 23 | 
 24 |     class Record100 < Struct.new(:data, :coarse_label, :fine_label)
 25 |       include Pixelable
 26 |     end
 27 | 
 28 |     def initialize(n_classes: 10, type: :train)
 29 |       unless [10, 100].include?(n_classes)
 30 |         message = "Please set n_classes 10 or 100: #{n_classes.inspect}"
 31 |         raise ArgumentError, message
 32 |       end
 33 |       unless [:train, :test].include?(type)
 34 |         message = "Please set type :train or :test: #{type.inspect}"
 35 |         raise ArgumentError, message
 36 |       end
 37 | 
 38 |       super()
 39 | 
 40 |       @metadata.id = "cifar-#{n_classes}"
 41 |       @metadata.name = "CIFAR-#{n_classes}"
 42 |       @metadata.url = "https://www.cs.toronto.edu/~kriz/cifar.html"
 43 |       @metadata.description = "CIFAR-#{n_classes} is 32x32 image dataset"
 44 | 
 45 |       @n_classes = n_classes
 46 |       @type = type
 47 |     end
 48 | 
 49 |     def each(&block)
 50 |       return to_enum(__method__) unless block_given?
 51 | 
 52 |       data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz"
 53 |       data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz"
 54 |       download(data_path, data_url)
 55 | 
 56 |       parse_data(data_path, &block)
 57 |     end
 58 | 
 59 |     private
 60 | 
 61 |     def parse_data(data_path, &block)
 62 |       open_tar_gz(data_path) do |tar|
 63 |         target_file_names.each do |target_file_name|
 64 |           tar.seek(target_file_name) do |entry|
 65 |             parse_entry(entry, &block)
 66 |           end
 67 |         end
 68 |       end
 69 |     end
 70 | 
 71 |     def target_file_names
 72 |       case @n_classes
 73 |       when 10
 74 |         prefix = 'cifar-10-batches-bin'
 75 |         case @type
 76 |         when :train
 77 |           [
 78 |             "#{prefix}/data_batch_1.bin",
 79 |             "#{prefix}/data_batch_2.bin",
 80 |             "#{prefix}/data_batch_3.bin",
 81 |             "#{prefix}/data_batch_4.bin",
 82 |             "#{prefix}/data_batch_5.bin",
 83 |           ]
 84 |         when :test
 85 |           [
 86 |             "#{prefix}/test_batch.bin"
 87 |           ]
 88 |         end
 89 |       when 100
 90 |         prefix = "cifar-100-binary"
 91 |         case @type
 92 |         when :train
 93 |           [
 94 |             "#{prefix}/train.bin",
 95 |           ]
 96 |         when :test
 97 |           [
 98 |             "#{prefix}/test.bin",
 99 |           ]
100 |         end
101 |       end
102 |     end
103 | 
104 |     def parse_entry(entry)
105 |       case @n_classes
106 |       when 10
107 |         loop do
108 |           label = entry.read(1)
109 |           break if label.nil?
110 |           label = label.unpack("C")[0]
111 |           data = entry.read(3072)
112 |           yield Record10.new(data, label)
113 |         end
114 |       when 100
115 |         loop do
116 |           coarse_label = entry.read(1)
117 |           break if coarse_label.nil?
118 |           coarse_label = coarse_label.unpack("C")[0]
119 |           fine_label = entry.read(1).unpack("C")[0]
120 |           data = entry.read(3072)
121 |           yield Record100.new(data, coarse_label, fine_label)
122 |         end
123 |       end
124 |     end
125 |   end
126 | end
127 | 
128 | 


--------------------------------------------------------------------------------
/lib/datasets/communities.rb:
--------------------------------------------------------------------------------
  1 | require "csv"
  2 | 
  3 | require_relative "dataset"
  4 | 
  5 | module Datasets
  6 |   class Communities < Dataset
  7 |     Record = Struct.new(
  8 |       :state,
  9 |       :county,
 10 |       :community,
 11 |       :community_name,
 12 |       :fold,
 13 |       :population,
 14 |       :household_size,
 15 |       :race_percent_black,
 16 |       :race_percent_white,
 17 |       :race_percent_asian,
 18 |       :race_percent_hispanic,
 19 |       :age_percent_12_to_21,
 20 |       :age_percent_12_to_29,
 21 |       :age_percent_16_to_24,
 22 |       :age_percent_65_and_upper,
 23 |       :n_people_urban,
 24 |       :percent_people_urban,
 25 |       :median_income,
 26 |       :percent_households_with_wage,
 27 |       :percent_households_with_farm_self,
 28 |       :percent_households_with_investment_income,
 29 |       :percent_households_with_social_security,
 30 |       :percent_households_with_public_assistant,
 31 |       :percent_households_with_retire,
 32 |       :median_family_income,
 33 |       :per_capita_income,
 34 |       :per_capita_income_white,
 35 |       :per_capita_income_black,
 36 |       :per_capita_income_indian,
 37 |       :per_capita_income_asian,
 38 |       :per_capita_income_other,
 39 |       :per_capita_income_hispanic,
 40 |       :n_people_under_poverty,
 41 |       :percent_people_under_poverty,
 42 |       :percent_less_9th_grade,
 43 |       :percent_not_high_school_graduate,
 44 |       :percent_bachelors_or_more,
 45 |       :percent_unemployed,
 46 |       :percent_employed,
 47 |       :percent_employed_manufacturing,
 48 |       :percent_employed_professional_service,
 49 |       :percent_occupations_manufacturing,
 50 |       :percent_occupations_management_professional,
 51 |       :male_percent_divorced,
 52 |       :male_percent_never_married,
 53 |       :female_percent_divorced,
 54 |       :total_percent_divorced,
 55 |       :mean_persons_per_family,
 56 |       :percent_family_2_parents,
 57 |       :percent_kids_2_parents,
 58 |       :percent_young_kids_2_parents,
 59 |       :percent_teen_2_parents,
 60 |       :percent_work_mom_young_kids,
 61 |       :percent_work_mom,
 62 |       :n_illegals,
 63 |       :percent_illegals,
 64 |       :n_immigrants,
 65 |       :percent_immigrants_recent,
 66 |       :percent_immigrants_recent_5,
 67 |       :percent_immigrants_recent_8,
 68 |       :percent_immigrants_recent_10,
 69 |       :percent_population_immigranted_recent,
 70 |       :percent_population_immigranted_recent_5,
 71 |       :percent_population_immigranted_recent_8,
 72 |       :percent_population_immigranted_recent_10,
 73 |       :percent_speak_english_only,
 74 |       :percent_not_speak_english_well,
 75 |       :percent_large_households_family,
 76 |       :percent_large_households_occupied,
 77 |       :mean_persons_per_occupied_household,
 78 |       :mean_persons_per_owner_occupied_household,
 79 |       :mean_persons_per_rental_occupied_household,
 80 |       :percent_persons_owner_occupied_household,
 81 |       :percent_persons_dense_housing,
 82 |       :percent_housing_less_3_bedrooms,
 83 |       :median_n_bedrooms,
 84 |       :n_vacant_households,
 85 |       :percent_housing_occupied,
 86 |       :percent_housing_owner_occupied,
 87 |       :percent_vacant_housing_boarded,
 88 |       :percent_vacant_housing_more_6_months,
 89 |       :median_year_housing_built,
 90 |       :percent_housing_no_phone,
 91 |       :percent_housing_without_full_plumbing,
 92 |       :owner_occupied_housing_lower_quartile,
 93 |       :owner_occupied_housing_median,
 94 |       :owner_occupied_housing_higher_quartile,
 95 |       :rental_housing_lower_quartile,
 96 |       :rental_housing_median,
 97 |       :rental_housing_higher_quartile,
 98 |       :median_rent,
 99 |       :median_rent_percent_household_income,
100 |       :median_owner_cost_percent_household_income,
101 |       :median_owner_cost_percent_household_income_no_mortgage,
102 |       :n_people_shelter,
103 |       :n_people_street,
104 |       :percent_foreign_born,
105 |       :percent_born_same_state,
106 |       :percent_same_house_85,
107 |       :percent_same_city_85,
108 |       :percent_same_state_85,
109 |       :lemas_sworn_full_time,
110 |       :lemas_sworn_full_time_per_population,
111 |       :lemas_sworn_full_time_field,
112 |       :lemas_sworn_full_time_field_per_population,
113 |       :lemas_total_requests,
114 |       :lemas_total_requests_per_population,
115 |       :total_requests_per_officer,
116 |       :n_officers_per_population,
117 |       :racial_match_community_police,
118 |       :percent_police_white,
119 |       :percent_police_black,
120 |       :percent_police_hispanic,
121 |       :percent_police_asian,
122 |       :percent_police_minority,
123 |       :n_officers_assigned_drug_units,
124 |       :n_kinds_drugs_seized,
125 |       :police_average_overtime_worked,
126 |       :land_area,
127 |       :population_density,
128 |       :percent_use_public_transit,
129 |       :n_police_cars,
130 |       :n_police_operating_budget,
131 |       :lemas_percent_police_on_patrol,
132 |       :lemas_gang_unit_deployed,
133 |       :lemas_percent_office_drug_units,
134 |       :police_operating_budget_per_population,
135 |       :total_violent_crimes_per_population
136 |     )
137 | 
138 |     def initialize
139 |       super()
140 |       @metadata.id = "communities"
141 |       @metadata.name = "Communities"
142 |       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime"
143 |       @metadata.licenses = ["CC-BY-4.0"]
144 |       @metadata.description = lambda do
145 |         read_names
146 |       end
147 |     end
148 | 
149 |     def each
150 |       return to_enum(__method__) unless block_given?
151 | 
152 |       open_data do |csv|
153 |         csv.each do |row|
154 |           row = row.collect.with_index do |column, i|
155 |             if column == "?"
156 |               nil
157 |             else
158 |               case i
159 |               when 3 # communityname
160 |               # when 124 # LemasGangUnitDeploy
161 |               # 0 means NO, 1 means YES, 0.5 means Part Time
162 |               else
163 |                 column = Float(column)
164 |               end
165 |               column
166 |             end
167 |           end
168 |           record = Record.new(*row)
169 |           yield(record)
170 |         end
171 |       end
172 |     end
173 | 
174 |     private
175 |     def base_url
176 |       "https://archive.ics.uci.edu/ml/machine-learning-databases/communities"
177 |     end
178 | 
179 |     def open_data
180 |       data_path = cache_dir_path + "communities.data"
181 |       data_url = "#{base_url}/communities.data"
182 |       download(data_path, data_url)
183 |       CSV.open(data_path) do |csv|
184 |         yield(csv)
185 |       end
186 |     end
187 | 
188 |     def read_names
189 |       names_path = cache_dir_path + "communities.names"
190 |       names_url = "#{base_url}/communities.names"
191 |       download(names_path, names_url)
192 |       names_path.read
193 |     end
194 |   end
195 | end
196 | 


--------------------------------------------------------------------------------
/lib/datasets/dataset.rb:
--------------------------------------------------------------------------------
 1 | require "pathname"
 2 | 
 3 | require_relative "cache-path"
 4 | require_relative "downloader"
 5 | require_relative "error"
 6 | require_relative "metadata"
 7 | require_relative "table"
 8 | 
 9 | module Datasets
10 |   class Dataset
11 |     include Enumerable
12 | 
13 |     attr_reader :metadata
14 |     def initialize
15 |       @metadata = Metadata.new
16 |     end
17 | 
18 |     def to_table
19 |       Table.new(self)
20 |     end
21 | 
22 |     def clear_cache!
23 |       cache_path.remove
24 |     end
25 | 
26 |     private
27 | 
28 |     def cache_dir_path
29 |       cache_path.base_dir
30 |     end
31 | 
32 |     def cache_path
33 |       @cache_path ||= CachePath.new(@metadata.id)
34 |     end
35 | 
36 |     def download(output_path, url, *fallback_urls, **options, &block)
37 |       downloader = Downloader.new(url, *fallback_urls, **options)
38 |       downloader.download(output_path, &block)
39 |     end
40 | 
41 |     def extract_bz2(bz2)
42 |       case bz2
43 |       when Pathname, String
44 |         IO.pipe do |input, output|
45 |           pid = spawn("bzcat", bz2.to_s, {out: output})
46 |           begin
47 |             output.close
48 |             yield(input)
49 |           ensure
50 |             input.close
51 |             Process.waitpid(pid)
52 |           end
53 |         end
54 |       else
55 |         IO.pipe do |bz2_input, bz2_output|
56 |           IO.pipe do |plain_input, plain_output|
57 |             bz2_stop = false
58 |             bz2_thread = Thread.new do
59 |               begin
60 |                 bz2.each do |chunk|
61 |                   bz2_output.write(chunk)
62 |                   bz2_output.flush
63 |                   break if bz2_stop
64 |                 end
65 |               rescue => error
66 |                 message = "Failed to read bzcat input: " +
67 |                           "#{error.class}: #{error.message}"
68 |                 $stderr.puts(message)
69 |               ensure
70 |                 bz2_output.close
71 |               end
72 |             end
73 |             begin
74 |               pid = spawn("bzcat", {in: bz2_input, out: plain_output})
75 |               begin
76 |                 bz2_input.close
77 |                 plain_output.close
78 |                 yield(plain_input)
79 |               ensure
80 |                 plain_input.close
81 |                 Process.waitpid(pid)
82 |               end
83 |             ensure
84 |               bz2_stop = true
85 |               bz2_thread.join
86 |             end
87 |           end
88 |         end
89 |       end
90 |     end
91 |   end
92 | end
93 | 


--------------------------------------------------------------------------------
/lib/datasets/diamonds.rb:
--------------------------------------------------------------------------------
 1 | require_relative "ggplot2-dataset"
 2 | 
 3 | module Datasets
 4 |   class Diamonds < Ggplot2Dataset
 5 |     Record = Struct.new(:carat,
 6 |                         :cut,
 7 |                         :color,
 8 |                         :clarity,
 9 |                         :depth,
10 |                         :table,
11 |                         :price,
12 |                         :x,
13 |                         :y,
14 |                         :z)
15 | 
16 |     def initialize()
17 |       super("diamonds")
18 |       @metadata.id = "diamonds"
19 |       @metadata.name = "Diamonds"
20 |       @metadata.licenses = ["CC0-1.0"]
21 |     end
22 | 
23 |     COLUMN_NAME_MAPPING = {
24 |     }
25 |   end
26 | end
27 | 


--------------------------------------------------------------------------------
/lib/datasets/dictionary.rb:
--------------------------------------------------------------------------------
 1 | module Datasets
 2 |   class Dictionary
 3 |     include Enumerable
 4 | 
 5 |     def initialize(values)
 6 |       build_dictionary(values)
 7 |     end
 8 | 
 9 |     def id(value)
10 |       @value_to_id[value]
11 |     end
12 | 
13 |     def value(id)
14 |       @id_to_value[id]
15 |     end
16 | 
17 |     def ids
18 |       @id_to_value.keys
19 |     end
20 | 
21 |     def values
22 |       @id_to_value.values
23 |     end
24 | 
25 |     def each(&block)
26 |       @id_to_value.each(&block)
27 |     end
28 | 
29 |     def size
30 |       @id_to_value.size
31 |     end
32 |     alias_method :length, :size
33 | 
34 |     def encode(values)
35 |       values.collect do |value|
36 |         id(value)
37 |       end
38 |     end
39 | 
40 |     def decode(ids)
41 |       ids.collect do |id|
42 |         value(id)
43 |       end
44 |     end
45 | 
46 |     private
47 |     def build_dictionary(values)
48 |       @id_to_value = {}
49 |       @value_to_id = {}
50 |       id = 0
51 |       values.each do |value|
52 |         next if @value_to_id.key?(value)
53 |         @id_to_value[id] = value
54 |         @value_to_id[value] = id
55 |         id += 1
56 |       end
57 |     end
58 |   end
59 | end
60 | 


--------------------------------------------------------------------------------
/lib/datasets/error.rb:
--------------------------------------------------------------------------------
1 | module Datasets
2 |   class Error < StandardError
3 |   end
4 | end
5 | 


--------------------------------------------------------------------------------
/lib/datasets/fashion-mnist.rb:
--------------------------------------------------------------------------------
 1 | require_relative 'mnist'
 2 | 
 3 | module Datasets
 4 |   class FashionMNIST < MNIST
 5 |     private
 6 |     def base_urls
 7 |       [
 8 |         "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/",
 9 |       ]
10 |     end
11 | 
12 |     def dataset_name
13 |       "Fashion-MNIST"
14 |     end
15 | 
16 |     def licenses
17 |       ["MIT"]
18 |     end
19 |   end
20 | end
21 | 


--------------------------------------------------------------------------------
/lib/datasets/fuel-economy.rb:
--------------------------------------------------------------------------------
 1 | require_relative "ggplot2-dataset"
 2 | 
 3 | module Datasets
 4 |   class FuelEconomy < Ggplot2Dataset
 5 |     Record = Struct.new(:manufacturer,
 6 |                         :model,
 7 |                         :displacement,
 8 |                         :year,
 9 |                         :n_cylinders,
10 |                         :transmission,
11 |                         :drive_train,
12 |                         :city_mpg,
13 |                         :highway_mpg,
14 |                         :fuel,
15 |                         :type)
16 | 
17 |     def initialize
18 |       super("mpg")
19 |       @metadata.id = "fuel-economy"
20 |       @metadata.name = "Fuel economy"
21 |       @metadata.licenses = ["CC0-1.0"]
22 |     end
23 | 
24 |     COLUMN_NAME_MAPPING = {
25 |       "displ" => "displacement",
26 |       "cyl" => "n_cylinders",
27 |       "trans" => "transmissions",
28 |       "drv" => "drive_train",
29 |       "cty" => "city_mpg",
30 |       "hwy" => "highway_mpg",
31 |       "fl" => "fuel",
32 |       "class" => "type",
33 |     }
34 |   end
35 | end
36 | 


--------------------------------------------------------------------------------
/lib/datasets/geolonia.rb:
--------------------------------------------------------------------------------
 1 | require 'csv'
 2 | 
 3 | require_relative 'dataset'
 4 | 
 5 | module Datasets
 6 |   class Geolonia < Dataset
 7 |     Record = Struct.new(:prefecture_code,
 8 |                         :prefecture_name,
 9 |                         :prefecture_kana,
10 |                         :prefecture_romaji,
11 |                         :municipality_code,
12 |                         :municipality_name,
13 |                         :municipality_kana,
14 |                         :municipality_romaji,
15 |                         :street_name,
16 |                         :street_kana,
17 |                         :street_romaji,
18 |                         :alias,
19 |                         :latitude,
20 |                         :longitude)
21 | 
22 |     def initialize
23 |       super
24 |       @metadata.id = 'geolonia'
25 |       @metadata.name = 'Geolonia'
26 |       @metadata.url = 'https://github.com/geolonia/japanese-addresses'
27 |       @metadata.licenses = ["CC-BY-4.0"]
28 |       @metadata.description = lambda do
29 |         fetch_readme
30 |       end
31 |     end
32 | 
33 |     def each
34 |       return to_enum(__method__) unless block_given?
35 | 
36 |       open_data do |csv|
37 |         csv.readline
38 |         csv.each do |row|
39 |           record = Record.new(*row)
40 |           yield(record)
41 |         end
42 |       end
43 |     end
44 | 
45 |     private
46 |     def download_base_url
47 |       "https://raw.githubusercontent.com/geolonia/japanese-addresses/master"
48 |     end
49 | 
50 |     def open_data
51 |       data_path = cache_dir_path + 'latest.csv'
52 |       data_url = "#{download_base_url}/data/latest.csv"
53 |       download(data_path, data_url)
54 |       CSV.open(data_path) do |csv|
55 |         yield(csv)
56 |       end
57 |     end
58 | 
59 |     def fetch_readme
60 |       readme_base_name = "README.md"
61 |       readme_path = cache_dir_path + readme_base_name
62 |       readme_url = "#{download_base_url}/#{readme_base_name}"
63 |       download(readme_path, readme_url)
64 |       readme_path.read.split(/^## API/, 2)[0].strip
65 |     end
66 |   end
67 | end
68 | 


--------------------------------------------------------------------------------
/lib/datasets/ggplot2-dataset.rb:
--------------------------------------------------------------------------------
 1 | module Datasets
 2 |   class Ggplot2Dataset < Dataset
 3 |     def initialize(ggplot2_dataset_name)
 4 |       super()
 5 |       @ggplot2_dataset_name = ggplot2_dataset_name
 6 |       @metadata.url =
 7 |         "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html"
 8 |       @metadata.description = lambda do
 9 |         fetch_description
10 |       end
11 |     end
12 | 
13 |     def each
14 |       return to_enum(__method__) unless block_given?
15 | 
16 |       data_base_name = "#{@ggplot2_dataset_name}.csv"
17 |       data_path = cache_dir_path + data_base_name
18 |       data_url = "#{download_base_url}/data-raw/#{data_base_name}"
19 |       download(data_path, data_url)
20 |       CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv|
21 |         record_class = self.class::Record
22 |         csv.each do |row|
23 |           record = record_class.new(*row.fields)
24 |           yield record
25 |         end
26 |       end
27 |     end
28 | 
29 |     private
30 |     def download_base_url
31 |       "https://raw.githubusercontent.com/tidyverse/ggplot2/main"
32 |     end
33 | 
34 |     def fetch_description
35 |       data_r_base_name = "data.R"
36 |       data_r_path = cache_dir_path + data_r_base_name
37 |       data_r_url = "#{download_base_url}/R/#{data_r_base_name}"
38 |       download(data_r_path, data_r_url)
39 |       descriptions = {}
40 |       comment = +""
41 |       File.open(data_r_path) do |data_r|
42 |         data_r.each_line do |line|
43 |           case line.chomp
44 |           when /\A#'/
45 |             comment_content = Regexp.last_match.post_match
46 |             unless comment_content.empty?
47 |               comment_content = comment_content[1..-1]
48 |             end
49 |             comment << comment_content
50 |             comment << "\n"
51 |           when /\A"(.+)"\z/
52 |             name = Regexp.last_match[1]
53 |             descriptions[name] = parse_roxygen(comment.rstrip)
54 |             comment = +""
55 |           end
56 |         end
57 |         descriptions[@ggplot2_dataset_name]
58 |       end
59 |     end
60 | 
61 |     def parse_roxygen(roxygen)
62 |       column_name_mapping = self.class::COLUMN_NAME_MAPPING
63 |       roxygen
64 |         .gsub(/\\url\{(.*?)\}/, "\\1")
65 |         .gsub(/^@format /, "")
66 |         .gsub(/\\describe\{(.*)\}/m) do
67 |         content = $1
68 |         content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do
69 |           column_name = $1
70 |           description = $2
71 |           column_name = column_name_mapping[column_name] || column_name
72 |           description = description
73 |                           .gsub(/\\\$/, "$")
74 |           "* #{column_name}: #{description}"
75 |         end
76 |       end
77 |     end
78 |   end
79 | end
80 | 


--------------------------------------------------------------------------------
/lib/datasets/hepatitis.rb:
--------------------------------------------------------------------------------
  1 | require "csv"
  2 | 
  3 | require_relative "dataset"
  4 | 
  5 | module Datasets
  6 |   class Hepatitis < Dataset
  7 |     class Record < Struct.new(:label,
  8 |                               :age,
  9 |                               :sex,
 10 |                               :steroid,
 11 |                               :antivirals,
 12 |                               :fatigue,
 13 |                               :malaise,
 14 |                               :anorexia,
 15 |                               :liver_big,
 16 |                               :liver_firm,
 17 |                               :spleen_palpable,
 18 |                               :spiders,
 19 |                               :ascites,
 20 |                               :varices,
 21 |                               :bilirubin,
 22 |                               :alkaline_phosphate,
 23 |                               :sgot,
 24 |                               :albumin,
 25 |                               :protime,
 26 |                               :histology)
 27 |       def initialize(*values)
 28 |         super()
 29 |         members.zip(values) do |member, value|
 30 |           __send__("#{member}=", value)
 31 |         end
 32 |       end
 33 | 
 34 |       def label=(label)
 35 |         case label
 36 |         when "1"
 37 |           super(:die)
 38 |         when "2"
 39 |           super(:live)
 40 |         else
 41 |           super(label)
 42 |         end
 43 |       end
 44 | 
 45 |       def age=(age)
 46 |         super(normalize_integer(age))
 47 |       end
 48 | 
 49 |       def sex=(sex)
 50 |         case sex
 51 |         when "1"
 52 |           super(:male)
 53 |         when "2"
 54 |           super(:female)
 55 |         else
 56 |           super(sex)
 57 |         end
 58 |       end
 59 | 
 60 |       def steroid=(steroid)
 61 |         super(normalize_boolean(steroid))
 62 |       end
 63 | 
 64 |       def antivirals=(antivirals)
 65 |         super(normalize_boolean(antivirals))
 66 |       end
 67 | 
 68 |       def fatigue=(fatigue)
 69 |         super(normalize_boolean(fatigue))
 70 |       end
 71 | 
 72 |       def malaise=(malaise)
 73 |         super(normalize_boolean(malaise))
 74 |       end
 75 | 
 76 |       def anorexia=(anorexia)
 77 |         super(normalize_boolean(anorexia))
 78 |       end
 79 | 
 80 |       def liver_big=(liver_big)
 81 |         super(normalize_boolean(liver_big))
 82 |       end
 83 | 
 84 |       def liver_firm=(liver_firm)
 85 |         super(normalize_boolean(liver_firm))
 86 |       end
 87 | 
 88 |       def spleen_palpable=(spleen_palpable)
 89 |         super(normalize_boolean(spleen_palpable))
 90 |       end
 91 | 
 92 |       def spiders=(spiders)
 93 |         super(normalize_boolean(spiders))
 94 |       end
 95 | 
 96 |       def ascites=(ascites)
 97 |         super(normalize_boolean(ascites))
 98 |       end
 99 | 
100 |       def varices=(varices)
101 |         super(normalize_boolean(varices))
102 |       end
103 | 
104 |       def bilirubin=(bilirubin)
105 |         super(normalize_float(bilirubin))
106 |       end
107 | 
108 |       def alkaline_phosphate=(alkaline_phosphate)
109 |         super(normalize_integer(alkaline_phosphate))
110 |       end
111 | 
112 |       def sgot=(sgot)
113 |         super(normalize_integer(sgot))
114 |       end
115 | 
116 |       def albumin=(albumin)
117 |         super(normalize_float(albumin))
118 |       end
119 | 
120 |       def protime=(protime)
121 |         super(normalize_integer(protime))
122 |       end
123 | 
124 |       def histology=(histology)
125 |         super(normalize_boolean(histology))
126 |       end
127 | 
128 |       private
129 |       def normalize_boolean(value)
130 |         case value
131 |         when "?"
132 |           nil
133 |         when "1"
134 |           false
135 |         when "2"
136 |           true
137 |         else
138 |           value
139 |         end
140 |       end
141 | 
142 |       def normalize_float(value)
143 |         case value
144 |         when "?"
145 |           nil
146 |         else
147 |           Float(value)
148 |         end
149 |       end
150 | 
151 |       def normalize_integer(value)
152 |         case value
153 |         when "?"
154 |           nil
155 |         else
156 |           Integer(value, 10)
157 |         end
158 |       end
159 |     end
160 | 
161 |     def initialize
162 |       super()
163 |       @metadata.id = "hepatitis"
164 |       @metadata.name = "Hepatitis"
165 |       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis"
166 |       @metadata.licenses = ["CC-BY-4.0"]
167 |       @metadata.description = lambda do
168 |         read_names
169 |       end
170 |     end
171 | 
172 |     def each
173 |       return to_enum(__method__) unless block_given?
174 | 
175 |       open_data do |csv|
176 |         csv.each do |row|
177 |           record = Record.new(*row)
178 |           yield(record)
179 |         end
180 |       end
181 |     end
182 | 
183 |     private
184 |     def base_url
185 |       "https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis"
186 |     end
187 | 
188 |     def open_data
189 |       data_path = cache_dir_path + "hepatitis.csv"
190 |       data_url = "#{base_url}/hepatitis.data"
191 |       download(data_path, data_url)
192 |       CSV.open(data_path) do |csv|
193 |         yield(csv)
194 |       end
195 |     end
196 | 
197 |     def read_names
198 |       names_path = cache_dir_path + "hepatitis.names"
199 |       names_url = "#{base_url}/hepatitis.names"
200 |       download(names_path, names_url)
201 |       names_path.read
202 |     end
203 |   end
204 | end
205 | 


--------------------------------------------------------------------------------
/lib/datasets/house-of-representative.rb:
--------------------------------------------------------------------------------
  1 | require_relative "dataset"
  2 | require_relative "japanese-date-parser"
  3 | 
  4 | module Datasets
  5 |   class HouseOfRepresentative < Dataset
  6 |     Record = Struct.new(:carry_time,
  7 |                         :caption,
  8 |                         :type,
  9 |                         :submit_time,
 10 |                         :submit_number,
 11 |                         :title,
 12 |                         :discussion_status,
 13 |                         :progress,
 14 |                         :progress_url,
 15 |                         :text,
 16 |                         :text_url,
 17 |                         :bill_type,
 18 |                         :submitter,
 19 |                         :submitter_in_house_groups,
 20 |                         :house_of_representatives_of_accepted_bill_on_preliminary_consideration,
 21 |                         :house_of_representatives_of_preliminary_refer_on,
 22 |                         :house_of_representatives_of_preliminary_refer_commission,
 23 |                         :house_of_representatives_of_accepted_bill_on,
 24 |                         :house_of_representatives_of_refer_on,
 25 |                         :house_of_representatives_of_refer_commission,
 26 |                         :house_of_representatives_of_finished_consideration_on,
 27 |                         :house_of_representatives_of_consideration_result,
 28 |                         :house_of_representatives_of_finished_deliberation_on,
 29 |                         :house_of_representatives_of_deliberation_result,
 30 |                         :house_of_representatives_of_attitude_of_in_house_group_during_deliberation,
 31 |                         :house_of_representatives_of_support_in_house_group_during_deliberation,
 32 |                         :house_of_representatives_of_opposition_in_house_group_during_deliberation,
 33 |                         :house_of_councillors_of_accepted_bill_on_preliminary_consideration,
 34 |                         :house_of_councillors_of_preliminary_refer_on,
 35 |                         :house_of_councillors_of_preliminary_refer_commission,
 36 |                         :house_of_councillors_of_accepted_bill_on,
 37 |                         :house_of_councillors_of_refer_on,
 38 |                         :house_of_councillors_of_refer_commission,
 39 |                         :house_of_councillors_of_finished_consideration_on,
 40 |                         :house_of_councillors_of_consideration_result,
 41 |                         :house_of_councillors_of_finished_deliberation_on,
 42 |                         :house_of_councillors_of_deliberation_result,
 43 |                         :promulgated_on,
 44 |                         :law_number,
 45 |                         :submitters,
 46 |                         :supporters_of_submitted_bill)
 47 | 
 48 |     def initialize
 49 |       super()
 50 | 
 51 |       @metadata.id = "house-of-representative"
 52 |       @metadata.name = "Bill of the House of Representatives of Japan"
 53 |       @metadata.url = "https://smartnews-smri.github.io/house-of-representatives"
 54 |       @metadata.licenses = ["MIT"]
 55 |       @metadata.description = "Bill of the House of Representatives of Japan"
 56 |     end
 57 | 
 58 |     def each
 59 |       return to_enum(__method__) unless block_given?
 60 | 
 61 |       open_data do |csv|
 62 |         csv.each do |row|
 63 |           record = Record.new(*row.fields)
 64 |           yield(record)
 65 |         end
 66 |       end
 67 |     end
 68 | 
 69 |     private
 70 | 
 71 |     def open_data
 72 |       data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv"
 73 |       data_path = cache_dir_path + "gian.csv"
 74 |       download(data_path, data_url)
 75 | 
 76 |       parser = JapaneseDateParser.new
 77 |       japanese_date_converter = lambda do |field, info|
 78 |         if info.header.end_with?("年月日")
 79 |           parser.parse(field)
 80 |         else
 81 |           field
 82 |         end
 83 |       end
 84 |       array_converter = lambda do |field, info|
 85 |         case info.header
 86 |         when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者"
 87 |           parse_array(field)
 88 |         else
 89 |           field
 90 |         end
 91 |       end
 92 |       File.open(data_path) do |data_file|
 93 |         options = {
 94 |           col_sep: ",",
 95 |           headers: true,
 96 |           converters: [:integer, japanese_date_converter, array_converter],
 97 |         }
 98 |         # There are two columns within one column. To split into two columns, `#gsub` is necessary.
 99 |         yield(CSV.new(data_file.read.gsub("／", ","), **options))
100 |       end
101 |     end
102 | 
103 |     def parse_array(column_value)
104 |       column_value&.split("; ")
105 |     end
106 |   end
107 | end
108 | 


--------------------------------------------------------------------------------
/lib/datasets/iris.rb:
--------------------------------------------------------------------------------
 1 | require "csv"
 2 | 
 3 | require_relative "dataset"
 4 | 
 5 | module Datasets
 6 |   class Iris < Dataset
 7 |     Record = Struct.new(:sepal_length,
 8 |                         :sepal_width,
 9 |                         :petal_length,
10 |                         :petal_width,
11 |                         :label)
12 | 
13 |     def initialize
14 |       super()
15 |       @metadata.id = "iris"
16 |       @metadata.name = "Iris"
17 |       @metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris"
18 |       @metadata.licenses = ["CC-BY-4.0"]
19 |       @metadata.description = lambda do
20 |         read_names
21 |       end
22 |     end
23 | 
24 |     def each
25 |       return to_enum(__method__) unless block_given?
26 | 
27 |       open_data do |csv|
28 |         csv.each do |row|
29 |           next if row[0].nil?
30 |           record = Record.new(*row)
31 |           yield(record)
32 |         end
33 |       end
34 |     end
35 | 
36 |     private
37 |     def open_data
38 |       data_path = cache_dir_path + "iris.csv"
39 |       data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
40 |       download(data_path, data_url)
41 |       CSV.open(data_path, converters: [:numeric]) do |csv|
42 |         yield(csv)
43 |       end
44 |     end
45 | 
46 |     def read_names
47 |       names_path = cache_dir_path + "iris.names"
48 |       names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names"
49 |       download(names_path, names_url)
50 |       names_path.read
51 |     end
52 |   end
53 | end
54 | 


--------------------------------------------------------------------------------
/lib/datasets/ita-corpus.rb:
--------------------------------------------------------------------------------
 1 | require_relative 'dataset'
 2 | 
 3 | module Datasets
 4 |   class ITACorpus < Dataset
 5 |     Record = Struct.new(:id,
 6 |                         :sentence)
 7 | 
 8 |     def initialize(type: :emotion)
 9 |       unless [:emotion, :recitation].include?(type)
10 |         raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}"
11 |       end
12 | 
13 |       super()
14 |       @type = type
15 |       @metadata.id = 'ita-corpus'
16 |       @metadata.name = 'ITA-corpus'
17 |       @metadata.url = 'https://github.com/mmorise/ita-corpus'
18 |       @metadata.licenses = ['Unlicense']
19 |       @metadata.description = lambda do
20 |         fetch_readme
21 |       end
22 |     end
23 | 
24 |     def each(&block)
25 |       return to_enum(__method__) unless block_given?
26 | 
27 |       data_path = cache_dir_path + "#{@type}_transcript_utf8.txt"
28 |       data_url = "#{download_base_url}/#{@type}_transcript_utf8.txt"
29 |       download(data_path, data_url)
30 | 
31 |       parse_data(data_path, &block)
32 |     end
33 | 
34 |     private
35 |     def fetch_readme
36 |       readme_base_name = "README.md"
37 |       readme_path = cache_dir_path + readme_base_name
38 |       readme_url = "#{download_base_url}/#{readme_base_name}"
39 |       download(readme_path, readme_url)
40 |       readme_path.read.split(/^## ファイル構成/, 2)[0].strip
41 |     end
42 | 
43 |     def download_base_url
44 |       "https://raw.githubusercontent.com/mmorise/ita-corpus/main"
45 |     end
46 | 
47 |     def parse_data(data_path)
48 |       File.open(data_path) do |f|
49 |         f.each_line(chomp: true) do |line|
50 |           id, sentence = line.split(':', 2)
51 |           record = Record.new(id , sentence)
52 |           yield(record)
53 |         end
54 |       end
55 |     end
56 |   end
57 | end
58 | 


--------------------------------------------------------------------------------
/lib/datasets/japanese-date-parser.rb:
--------------------------------------------------------------------------------
 1 | module Datasets
 2 |   class JapaneseDateParser
 3 |     class UnsupportedEraInitialRange < Error; end
 4 | 
 5 |     ERA_INITIALS = {
 6 |       "平成" => "H",
 7 |       "令和" => "R",
 8 |     }.freeze
 9 | 
10 |     def parse(string)
11 |       case string
12 |       when nil
13 |         nil
14 |       when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/
15 |         match_data = Regexp.last_match
16 |         era_initial = ERA_INITIALS[match_data[1]]
17 |         if era_initial.nil?
18 |           message = +"era must be one of ["
19 |           message << ERA_INITIALS.keys.join(", ")
20 |           message << "]: #{match_data[1]}"
21 |           raise UnsupportedEraInitialRange, message
22 |         end
23 | 
24 |         year = match_data[2]
25 |         if year == "元"
26 |           year = "01"
27 |         else
28 |           year = year.rjust(2, "0")
29 |         end
30 |         month = match_data[3].rjust(2, "0")
31 |         day = match_data[4].rjust(2, "0")
32 |         Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}")
33 |       else
34 |         string
35 |       end
36 |     end
37 |   end
38 | end
39 | 


--------------------------------------------------------------------------------
/lib/datasets/kuzushiji-mnist.rb:
--------------------------------------------------------------------------------
 1 | require_relative 'mnist'
 2 | 
 3 | module Datasets
 4 |   class KuzushijiMNIST < MNIST
 5 |     private
 6 |     def base_urls
 7 |       [
 8 |         "http://codh.rois.ac.jp/kmnist/dataset/kmnist/",
 9 |       ]
10 |     end
11 | 
12 |     def dataset_name
13 |       "Kuzushiji-MNIST"
14 |     end
15 | 
16 |     def licenses
17 |       ["CC-BY-SA-4.0"]
18 |     end
19 |   end
20 | end
21 | 


--------------------------------------------------------------------------------
/lib/datasets/lazy.rb:
--------------------------------------------------------------------------------
 1 | require_relative "version"
 2 | 
 3 | module Datasets
 4 |   class LazyLoader
 5 |     def initialize
 6 |       @constants = {}
 7 |     end
 8 | 
 9 |     def exist?(constant_name)
10 |       @constants.key?(constant_name)
11 |     end
12 | 
13 |     def load(constant_name)
14 |       feature = @constants[constant_name]
15 |       raise LoadError, "unknown dataset: #{constant_name}" unless feature
16 |       require feature
17 |     end
18 | 
19 |     def load_all
20 |       @constants.each_value do |feature|
21 |         require feature
22 |       end
23 |     end
24 | 
25 |     def register(constant_name, feature)
26 |       @constants[constant_name] = feature
27 |     end
28 | 
29 |     def constant_names
30 |       @constants.keys
31 |     end
32 |   end
33 | 
34 |   LAZY_LOADER = LazyLoader.new
35 | 
36 |   class << self
37 |     def const_missing(name)
38 |       if LAZY_LOADER.exist?(name)
39 |         LAZY_LOADER.load(name)
40 |         const_get(name)
41 |       else
42 |         super
43 |       end
44 |     end
45 |   end
46 | 
47 |   LAZY_LOADER.register(:Adult, "datasets/adult")
48 |   LAZY_LOADER.register(:AFINN, "datasets/afinn")
49 |   LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko")
50 |   LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing")
51 |   LAZY_LOADER.register(:CIFAR, "datasets/cifar")
52 |   LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals")
53 |   LAZY_LOADER.register(:Communities, "datasets/communities")
54 |   LAZY_LOADER.register(:Diamonds, "datasets/diamonds")
55 |   LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan")
56 |   LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist")
57 |   LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy")
58 |   LAZY_LOADER.register(:Geolonia, "datasets/geolonia")
59 |   LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis")
60 |   LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor")
61 |   LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
62 |   LAZY_LOADER.register(:Iris, "datasets/iris")
63 |   LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
64 |   LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
65 |   LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
66 |   LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
67 |   LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news")
68 |   LAZY_LOADER.register(:MNIST, "datasets/mnist")
69 |   LAZY_LOADER.register(:Mushroom, "datasets/mushroom")
70 |   LAZY_LOADER.register(:NagoyaUniversityConversationCorpus,
71 |                        "datasets/nagoya-university-conversation-corpus")
72 |   LAZY_LOADER.register(:Penguins, "datasets/penguins")
73 |   LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank")
74 |   LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list")
75 |   LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan")
76 |   LAZY_LOADER.register(:QuoraDuplicateQuestionPair,
77 |                        "datasets/quora-duplicate-question-pair")
78 |   LAZY_LOADER.register(:RdatasetList, "datasets/rdataset")
79 |   # For backward compatibility
80 |   LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset")
81 |   LAZY_LOADER.register(:Rdataset, "datasets/rdataset")
82 |   # For backward compatibility
83 |   LAZY_LOADER.register(:Rdatasets, "datasets/rdataset")
84 |   LAZY_LOADER.register(:SeabornList, "datasets/seaborn")
85 |   LAZY_LOADER.register(:Seaborn, "datasets/seaborn")
86 |   LAZY_LOADER.register(:SudachiSynonymDictionary,
87 |                        "datasets/sudachi-synonym-dictionary")
88 |   LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia")
89 |   LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish,
90 |                        "datasets/wikipedia-kyoto-japanese-english")
91 |   LAZY_LOADER.register(:Wine, "datasets/wine")
92 | end
93 | 


--------------------------------------------------------------------------------
/lib/datasets/libsvm.rb:
--------------------------------------------------------------------------------
  1 | require "csv"
  2 | 
  3 | require_relative "dataset"
  4 | 
  5 | module Datasets
  6 |   class LIBSVM < Dataset
  7 |     class Record
  8 |       attr_reader :label
  9 |       attr_reader :features
 10 |       def initialize(label, features)
 11 |         @label = label
 12 |         @features = features
 13 |       end
 14 | 
 15 |       def [](index)
 16 |         @features[index]
 17 |       end
 18 | 
 19 |       def to_h
 20 |         hash = {
 21 |           label: @label,
 22 |         }
 23 |         @features.each_with_index do |feature, i|
 24 |           hash[i] = feature
 25 |         end
 26 |         hash
 27 |       end
 28 | 
 29 |       def values
 30 |         [@label] + @features
 31 |       end
 32 |     end
 33 | 
 34 |     def initialize(name,
 35 |                    note: nil,
 36 |                    default_feature_value: 0)
 37 |       super()
 38 |       @libsvm_dataset_metadata = fetch_dataset_info(name)
 39 |       @file = choose_file(note)
 40 |       @default_feature_value = default_feature_value
 41 |       @metadata.id = "libsvm-#{normalize_name(name)}"
 42 |       @metadata.name = "LIBSVM dataset: #{name}"
 43 |       @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/"
 44 |       @metadata.licenses = ["BSD-3-Clause"]
 45 |     end
 46 | 
 47 |     def each
 48 |       return to_enum(__method__) unless block_given?
 49 | 
 50 |       open_data do |input|
 51 |         n_features = @libsvm_dataset_metadata.n_features
 52 |         csv = CSV.new(input, col_sep: " ")
 53 |         csv.each do |row|
 54 |           label = parse_label(row.shift)
 55 |           features = [@default_feature_value] * n_features
 56 |           row.each do |column|
 57 |             next if column.nil?
 58 |             index, value = column.split(":", 2)
 59 |             features[Integer(index, 10) - 1] = parse_value(value)
 60 |           end
 61 |           yield(Record.new(label, features))
 62 |         end
 63 |       end
 64 |     end
 65 | 
 66 |     private
 67 |     def fetch_dataset_info(name)
 68 |       list = LIBSVMDatasetList.new
 69 |       available_datasets = []
 70 |       list.each do |record|
 71 |         available_datasets << record.name
 72 |         if record.name == name
 73 |           return record
 74 |         end
 75 |       end
 76 |       message = "unavailable LIBSVM dataset: #{name.inspect}: "
 77 |       message << "available datasets: ["
 78 |       message << available_datasets.collect(&:inspect).join(", ")
 79 |       message << "]"
 80 |       raise ArgumentError, message
 81 |     end
 82 | 
 83 |     def choose_file(note)
 84 |       files = @libsvm_dataset_metadata.files
 85 |       return files.first if note.nil?
 86 | 
 87 |       available_notes = []
 88 |       @libsvm_dataset_metadata.files.find do |file|
 89 |         return file if file.note == note
 90 |         available_notes << file.note if file.note
 91 |       end
 92 | 
 93 |       name = @libsvm_dataset_metadata.name
 94 |       message = "unavailable note: #{name}: #{note.inspect}: "
 95 |       message << "available notes: ["
 96 |       message << available_notes.collect(&:inspect).join(", ")
 97 |       message << "]"
 98 |       raise ArgumentError, message
 99 |     end
100 | 
101 |     def open_data(&block)
102 |       data_path = cache_dir_path + @file.name
103 |       download(data_path, @file.url)
104 |       if data_path.extname == ".bz2"
105 |         extract_bz2(data_path, &block)
106 |       else
107 |         data_path.open(&block)
108 |       end
109 |     end
110 | 
111 |     def normalize_name(name)
112 |       name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase
113 |     end
114 | 
115 |     def parse_label(label)
116 |       labels = label.split(",").collect do |value|
117 |         parse_value(value)
118 |       end
119 |       if labels.size == 1
120 |         labels[0]
121 |       else
122 |         labels
123 |       end
124 |     end
125 | 
126 |     def parse_value(value)
127 |       if value.include?(".")
128 |         Float(value)
129 |       else
130 |         Integer(value, 10)
131 |       end
132 |     end
133 |   end
134 | end
135 | 


--------------------------------------------------------------------------------
/lib/datasets/license.rb:
--------------------------------------------------------------------------------
 1 | module Datasets
 2 |   class License < Struct.new(:spdx_id,
 3 |                              :name,
 4 |                              :url)
 5 |     class << self
 6 |       def try_convert(value)
 7 |         case value
 8 |         when self
 9 |           value
10 |         when String
11 |           license = new
12 |           license.spdx_id = value
13 |           license
14 |         when Hash
15 |           license = new
16 |           license.spdx_id = value[:spdx_id]
17 |           license.name = value[:name]
18 |           license.url = value[:url]
19 |           license
20 |         else
21 |           nil
22 |         end
23 |       end
24 |     end
25 |   end
26 | end
27 | 


--------------------------------------------------------------------------------
/lib/datasets/livedoor-news.rb:
--------------------------------------------------------------------------------
 1 | require_relative "dataset"
 2 | require_relative "tar-gz-readable"
 3 | 
 4 | module Datasets
 5 |   class LivedoorNews < Dataset
 6 |     include TarGzReadable
 7 |     Record = Struct.new(:url,
 8 |                         :timestamp,
 9 |                         :sentence)
10 | 
11 |     def initialize(type: :topic_news)
12 |       news_list = [
13 |         :topic_news,
14 |         :sports_watch,
15 |         :it_life_hack,
16 |         :kaden_channel,
17 |         :movie_enter,
18 |         :dokujo_tsushin,
19 |         :smax,
20 |         :livedoor_homme,
21 |         :peachy
22 |       ]
23 |       unless news_list.include?(type)
24 |         valid_type_labels = news_list.collect(&:inspect).join(", ")
25 |         message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}"
26 |         raise ArgumentError, message
27 |       end
28 | 
29 |       super()
30 |       @type = type
31 |       @metadata.id = 'livedoor-news'
32 |       @metadata.name = 'livedoor-news'
33 |       @metadata.url = 'https://www.rondhuit.com/download.html#ldcc'
34 |       @metadata.licenses = ['CC-BY-ND-2.1-JP']
35 |       @metadata.description = lambda do
36 |         fetch_readme
37 |       end
38 |     end
39 | 
40 |     def each(&block)
41 |       return to_enum(__method__) unless block_given?
42 | 
43 |       data_path = download_tar_gz
44 |       parse_data(data_path, &block)
45 |     end
46 | 
47 |     private
48 |     def download_tar_gz
49 |       data_path = cache_dir_path + "livedoor-news.tar.gz"
50 |       data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz"
51 |       download(data_path, data_url)
52 |       data_path
53 |     end
54 | 
55 |     def fetch_readme
56 |       data_path = download_tar_gz
57 |       target_file_name = 'text/README.txt'
58 |       open_tar_gz(data_path) do |tar|
59 |         tar.seek(target_file_name) do |entry|
60 |           return entry.read.force_encoding("UTF-8")
61 |         end
62 |       end
63 |     end
64 | 
65 |     def parse_data(data_path, &block)
66 |       target_directory_name = "text/#{@type.to_s.gsub(/_/, '-')}"
67 |       open_tar_gz(data_path) do |tar|
68 |         tar.each do |entry|
69 |           next unless entry.file?
70 |           directory_name, base_name = File.split(entry.full_name)
71 |           next unless directory_name == target_directory_name
72 |           next if base_name == "LICENSE.txt"
73 |           url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3)
74 |           record = Record.new(url, Time.iso8601(timestamp), sentence)
75 |           yield(record)
76 |         end
77 |       end
78 |     end
79 |   end
80 | end
81 | 


--------------------------------------------------------------------------------
/lib/datasets/metadata.rb:
--------------------------------------------------------------------------------
 1 | require_relative "license"
 2 | 
 3 | module Datasets
 4 |   class Metadata < Struct.new(:id,
 5 |                               :name,
 6 |                               :url,
 7 |                               :licenses,
 8 |                               :description)
 9 |     def licenses=(licenses)
10 |       licenses = [licenses] unless licenses.is_a?(Array)
11 |       licenses = licenses.collect do |license|
12 |         l = License.try_convert(license)
13 |         if l.nil?
14 |           raise ArgumentError.new("invalid license: #{license.inspect}")
15 |         end
16 |         l
17 |       end
18 |       super(licenses)
19 |     end
20 | 
21 |     def description
22 |       description_raw = super
23 |       if description_raw.respond_to?(:call)
24 |         self.description = description_raw = description_raw.call
25 |       end
26 |       description_raw
27 |     end
28 |   end
29 | end
30 | 


--------------------------------------------------------------------------------
/lib/datasets/mnist.rb:
--------------------------------------------------------------------------------
  1 | require 'zlib'
  2 | 
  3 | require_relative "dataset"
  4 | 
  5 | module Datasets
  6 |   class MNIST < Dataset
  7 |     class Record < Struct.new(:data, :label)
  8 |       def pixels
  9 |         data.unpack("C*")
 10 |       end
 11 | 
 12 |       def to_h
 13 |         hash = super
 14 |         hash[:pixels] = pixels
 15 |         hash
 16 |       end
 17 |     end
 18 | 
 19 |     def initialize(type: :train)
 20 |       unless [:train, :test].include?(type)
 21 |         raise ArgumentError, "Please set type :train or :test: #{type.inspect}"
 22 |       end
 23 | 
 24 |       super()
 25 | 
 26 |       @metadata.id = "#{dataset_name.downcase}-#{type}"
 27 |       @metadata.name = "#{dataset_name}: #{type}"
 28 |       @metadata.url = base_urls.first
 29 |       @metadata.licenses = licenses
 30 |       @type = type
 31 | 
 32 |       case type
 33 |       when :train
 34 |         @metadata.description = "a training set of 60,000 examples"
 35 |       when :test
 36 |         @metadata.description = "a test set of 10,000 examples"
 37 |       end
 38 |     end
 39 | 
 40 |     def each(&block)
 41 |       return to_enum(__method__) unless block_given?
 42 | 
 43 |       image_path = cache_dir_path + target_file(:image)
 44 |       label_path = cache_dir_path + target_file(:label)
 45 | 
 46 |       download(image_path,
 47 |                *base_urls.collect { |base_url| base_url + target_file(:image) })
 48 |       download(label_path,
 49 |                *base_urls.collect { |base_url| base_url + target_file(:label) })
 50 | 
 51 |       open_data(image_path, label_path, &block)
 52 |     end
 53 | 
 54 |     private
 55 |     def base_urls
 56 |       [
 57 |         "https://ossci-datasets.s3.amazonaws.com/mnist/",
 58 |       ]
 59 |     end
 60 | 
 61 |     def licenses
 62 |       []
 63 |     end
 64 | 
 65 |     def open_data(image_path, label_path, &block)
 66 |       labels = parse_labels(label_path)
 67 | 
 68 |       Zlib::GzipReader.open(image_path) do |f|
 69 |         n_uint32s = 4
 70 |         n_bytes = n_uint32s * 4
 71 |         mnist_magic_number = 2051
 72 |         magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*")
 73 |         if magic != mnist_magic_number
 74 |           raise Error, "This is not #{dataset_name} image file"
 75 |         end
 76 |         n_images.times do |i|
 77 |           data = f.read(n_rows * n_cols)
 78 |           label = labels[i]
 79 |           yield Record.new(data, label)
 80 |         end
 81 |       end
 82 |     end
 83 | 
 84 |     def target_file(data)
 85 |       case @type
 86 |       when :train
 87 |         case data
 88 |         when :image
 89 |           "train-images-idx3-ubyte.gz"
 90 |         when :label
 91 |           "train-labels-idx1-ubyte.gz"
 92 |         end
 93 |       when :test
 94 |         case data
 95 |         when :image
 96 |           "t10k-images-idx3-ubyte.gz"
 97 |         when :label
 98 |           "t10k-labels-idx1-ubyte.gz"
 99 |         end
100 |       end
101 |     end
102 | 
103 |     def parse_labels(file_path)
104 |       Zlib::GzipReader.open(file_path) do |f|
105 |         n_uint32s = 4
106 |         n_bytes = n_uint32s * 2
107 |         mnist_magic_number = 2049
108 |         magic, n_labels = f.read(n_bytes).unpack('N2')
109 |         if magic != mnist_magic_number
110 |           raise Error, "This is not #{dataset_name} label file"
111 |         end
112 |         f.read(n_labels).unpack('C*')
113 |       end
114 |     end
115 | 
116 |     def dataset_name
117 |       "MNIST"
118 |     end
119 |   end
120 | end
121 | 


--------------------------------------------------------------------------------
/lib/datasets/nagoya-university-conversation-corpus.rb:
--------------------------------------------------------------------------------
  1 | require_relative 'dataset'
  2 | require_relative 'zip-extractor'
  3 | 
  4 | module Datasets
  5 |   class NagoyaUniversityConversationCorpus < Dataset
  6 |     Data = Struct.new(
  7 |       :name,
  8 |       :date,
  9 |       :place,
 10 |       :participants,
 11 |       :relationships,
 12 |       :note,
 13 |       :sentences
 14 |     )
 15 | 
 16 |     Participant = Struct.new(
 17 |       :id,
 18 |       :attribute,
 19 |       :birthplace,
 20 |       :residence
 21 |     )
 22 | 
 23 |     Sentence = Struct.new(:participant_id, :content) do
 24 |       def end?
 25 |         participant_id.nil? and content.nil?
 26 |       end
 27 |     end
 28 | 
 29 |     def initialize
 30 |       super()
 31 |       @metadata.id = 'nagoya-university-conversation-corpus'
 32 |       @metadata.name = 'Nagoya University Conversation Corpus'
 33 |       @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/'
 34 |       @metadata.licenses = ['CC-BY-NC-ND-4.0']
 35 |       @metadata.description = <<~DESCRIPTION
 36 |         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
 37 |         total about 100 hours of chatting among native speakers of Japanese,
 38 |         which is converted into text.
 39 |       DESCRIPTION
 40 |     end
 41 | 
 42 |     def each
 43 |       return to_enum(__method__) unless block_given?
 44 | 
 45 |       open_data do |input_stream|
 46 |         yield(parse_file(input_stream))
 47 |       end
 48 |     end
 49 | 
 50 |     private
 51 | 
 52 |     def open_data
 53 |       data_path = cache_dir_path + 'nucc.zip'
 54 |       data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip'
 55 |       download(data_path, data_url)
 56 | 
 57 |       extractor = ZipExtractor.new(data_path)
 58 |       extractor.extract_files do |input_stream|
 59 |         yield(input_stream)
 60 |       end
 61 |     end
 62 | 
 63 |     def parse_file(input_stream)
 64 |       data = Data.new
 65 |       participants = []
 66 |       sentences = []
 67 | 
 68 |       input_stream.each do |input|
 69 |         input.each_line(chomp: true) do |line|
 70 |           line.force_encoding('utf-8')
 71 |           if line.start_with?('＠データ')
 72 |             data.name = line[4..-1]
 73 |           elsif line.start_with?('＠収集年月日')
 74 |             # mixed cases with and without'：'
 75 |             data.date = line[6..-1].delete_prefix('：')
 76 |           elsif line.start_with?('＠場所')
 77 |             data.place = line[4..-1]
 78 |           elsif line.start_with?('＠参加者の関係')
 79 |             data.relationships = line.split('：', 2)[1]
 80 |           elsif line.start_with?('＠参加者')
 81 |             participant = Participant.new
 82 |             participant.id, profiles = line[4..-1].split('：', 2)
 83 |             participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3)
 84 | 
 85 |             participants << participant
 86 |           elsif line.start_with?('％ｃｏｍ')
 87 |             data.note = line.split('：', 2)[1]
 88 |           elsif line == '＠ＥＮＤ'
 89 |             sentence = Sentence.new
 90 |             sentence.participant_id = nil
 91 |             sentence.content = nil
 92 | 
 93 |             sentences << sentence
 94 |           else
 95 |             sentence = Sentence.new
 96 |             sentence.participant_id, sentence.content = line.split('：', 2)
 97 | 
 98 |             sentences << sentence
 99 |           end
100 |         end
101 |       end
102 | 
103 |       data.participants = participants
104 |       data.sentences = sentences
105 | 
106 |       data
107 |     end
108 |   end
109 | end
110 | 


--------------------------------------------------------------------------------
/lib/datasets/penguins.rb:
--------------------------------------------------------------------------------
  1 | require "csv"
  2 | 
  3 | require_relative "dataset"
  4 | 
  5 | module Datasets
  6 |   module PenguinsRawData
  7 |     Record = Struct.new(:study_name,
  8 |                         :sample_number,
  9 |                         :species,
 10 |                         :region,
 11 |                         :island,
 12 |                         :stage,
 13 |                         :individual_id,
 14 |                         :clutch_completion,
 15 |                         :date_egg,
 16 |                         :culmen_length_mm,
 17 |                         :culmen_depth_mm,
 18 |                         :flipper_length_mm,
 19 |                         :body_mass_g,
 20 |                         :sex,
 21 |                         :delta_15_n_permil,
 22 |                         :delta_13_c_permil,
 23 |                         :comments)
 24 |     class SpeciesBase < Dataset
 25 |       def initialize
 26 |         super
 27 |         species = self.class.name.split("::").last.downcase
 28 |         @metadata.id = "palmerpenguins-#{species}"
 29 |         package_id = http_parameters["packageid"]
 30 |         @metadata.url = "https://portal.edirepository.org/nis/mapbrowse" +
 31 |                         "?packageid=#{package_id}"
 32 |         @metadata.licenses = ["CC0-1.0"]
 33 |         @data_path = cache_dir_path + "#{species}.csv"
 34 |       end
 35 | 
 36 |       attr_reader :data_path
 37 | 
 38 |       def each
 39 |         return to_enum(__method__) unless block_given?
 40 | 
 41 |         open_data do |csv|
 42 |           csv.each do |row|
 43 |             next if row[0].nil?
 44 |             record = Record.new(*row.fields)
 45 |             yield record
 46 |           end
 47 |         end
 48 |       end
 49 | 
 50 |       private def open_data
 51 |         download(data_path,
 52 |                  "https://portal.edirepository.org/nis/dataviewer",
 53 |                  http_method: :post,
 54 |                  http_parameters: http_parameters)
 55 |         CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
 56 |           yield csv
 57 |         end
 58 |       end
 59 |     end
 60 | 
 61 |     # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86
 62 |     class Adelie < SpeciesBase
 63 |       DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze
 64 | 
 65 |       private def http_parameters
 66 |         {
 67 |           "packageid" => "knb-lter-pal.219.3",
 68 |           "entityid" => "002f3893385f710df69eeebe893144ff",
 69 |         }
 70 |       end
 71 |     end
 72 | 
 73 |     # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7
 74 |     class Chinstrap < SpeciesBase
 75 |       DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze
 76 | 
 77 |       private def http_parameters
 78 |         {
 79 |           "packageid" => "knb-lter-pal.221.2",
 80 |           "entityid" => "fe853aa8f7a59aa84cdd3197619ef462",
 81 |         }
 82 |       end
 83 |     end
 84 | 
 85 |     # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce
 86 |     class Gentoo < SpeciesBase
 87 |       DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze
 88 | 
 89 |       private def http_parameters
 90 |         {
 91 |           "packageid" => "knb-lter-pal.220.3",
 92 |           "entityid" => "e03b43c924f226486f2f0ab6709d2381",
 93 |         }
 94 |       end
 95 |     end
 96 |   end
 97 | 
 98 |   # This dataset provides the same dataset as https://github.com/allisonhorst/palmerpenguins
 99 |   class Penguins < Dataset
100 |     Record = Struct.new(:species,
101 |                         :island,
102 |                         :bill_length_mm,
103 |                         :bill_depth_mm,
104 |                         :flipper_length_mm,
105 |                         :body_mass_g,
106 |                         :sex,
107 |                         :year)
108 | 
109 |     def initialize
110 |       super
111 |       @metadata.id = "palmerpenguins"
112 |       @metadata.name = "palmerpenguins"
113 |       @metadata.url = "https://allisonhorst.github.io/palmerpenguins/"
114 |       @metadata.licenses = ["CC0"]
115 |       @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris"
116 |     end
117 | 
118 |     def each(&block)
119 |       return to_enum(__method__) unless block_given?
120 | 
121 |       species_classes = [
122 |         PenguinsRawData::Adelie,
123 |         PenguinsRawData::Chinstrap,
124 |         PenguinsRawData::Gentoo,
125 |       ]
126 | 
127 |       species_classes.each do |species_class|
128 |         species_class.new.each do |raw_record|
129 |           yield convert_record(raw_record)
130 |         end
131 |       end
132 |     end
133 | 
134 |     private def convert_record(raw_record)
135 |       Record.new(*cleanse_fields(raw_record))
136 |     end
137 | 
138 |     private def cleanse_fields(raw_record)
139 |       species = raw_record.species.split(' ')[0]
140 |       flipper_length_mm = raw_record.flipper_length_mm&.to_i
141 |       body_mass_g = raw_record.body_mass_g&.to_i
142 |       sex = normalize_sex(raw_record.sex)
143 |       year = raw_record.date_egg&.year
144 | 
145 |       [
146 |         species,
147 |         raw_record.island,
148 |         raw_record.culmen_length_mm,
149 |         raw_record.culmen_depth_mm,
150 |         flipper_length_mm,
151 |         body_mass_g,
152 |         sex,
153 |         year
154 |       ]
155 |     end
156 | 
157 |     private def normalize_sex(val)
158 |       val = val&.downcase
159 |       case val
160 |       when "female", "male", nil
161 |         val
162 |       else
163 |         nil
164 |       end
165 |     end
166 |   end
167 | end
168 | 


--------------------------------------------------------------------------------
/lib/datasets/penn-treebank.rb:
--------------------------------------------------------------------------------
 1 | require_relative "dataset"
 2 | 
 3 | module Datasets
 4 |   class PennTreebank < Dataset
 5 |     Record = Struct.new(:word)
 6 | 
 7 |     DESCRIPTION = <<~DESC
 8 |       `Penn Tree Bank <https://www.cis.upenn.edu/~treebank/>`_ is originally a
 9 |       corpus of English sentences with linguistic structure annotations. This
10 |       function uses a variant distributed at
11 |       `https://github.com/wojzaremba/lstm <https://github.com/wojzaremba/lstm>`_,
12 |       which omits the annotation and splits the dataset into three parts:
13 |       training, validation, and test.
14 |     DESC
15 | 
16 |     def initialize(type: :train)
17 |       valid_types = [:train, :test, :valid]
18 |       unless valid_types.include?(type)
19 |         valid_types_label = valid_types.collect(&:inspect).join(", ")
20 |         message = "Type must be one of [#{valid_types_label}]: #{type.inspect}"
21 |         raise ArgumentError, message
22 |       end
23 |       @type = type
24 | 
25 |       super()
26 | 
27 |       @metadata.id = "penn-treebank-#{@type}"
28 |       @metadata.name = "Penn Treebank: #{@type}"
29 |       @metadata.description = DESCRIPTION
30 |       @metadata.url = "https://github.com/wojzaremba/lstm"
31 |       @metadata.licenses = ["Apache-2.0"]
32 |     end
33 | 
34 |     def each(&block)
35 |       return to_enum(__method__) unless block_given?
36 | 
37 |       base_name = "ptb.#{@type}.txt"
38 |       data_path = cache_dir_path + base_name
39 |       base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data"
40 |       download(data_path, "#{base_url}/#{base_name}")
41 | 
42 |       parse_data(data_path, &block)
43 |     end
44 | 
45 |     private
46 |     def parse_data(data_path)
47 |       File.open(data_path) do |f|
48 |         f.each_line do |line|
49 |           line.split.each do |word|
50 |             yield(Record.new(word.strip))
51 |           end
52 |         end
53 |       end
54 |     end
55 |   end
56 | end
57 | 


--------------------------------------------------------------------------------
/lib/datasets/pmjt-dataset-list.rb:
--------------------------------------------------------------------------------
 1 | require_relative "dataset"
 2 | 
 3 | module Datasets
 4 |   class PMJTDatasetList < Dataset
 5 |     Record = Struct.new(:unit,
 6 |                         :open_data_category,
 7 |                         :tag,
 8 |                         :release_time,
 9 |                         :n_volumes,
10 |                         :type,
11 |                         :publication_year,
12 |                         :original_request_code,
13 |                         :id,
14 |                         :title,
15 |                         :text,
16 |                         :bibliographical_introduction,
17 |                         :year)
18 | 
19 |     def initialize
20 |       super()
21 |       @metadata.id = "pmjt-dataset-list"
22 |       @metadata.name = "List of pre-modern Japanese text dataset"
23 |       @metadata.url = "http://codh.rois.ac.jp/pmjt/"
24 |       @metadata.licenses = ["CC-BY-SA-4.0"]
25 |       @metadata.description = <<~DESCRIPTION
26 |         Pre-Modern Japanese Text, owned by National Institute of Japanese Literature, is released image and text data as open data.
27 |         In addition, some text has description, transcription, and tagging data.
28 |       DESCRIPTION
29 | 
30 |       @data_path = cache_dir_path + (@metadata.id + ".csv")
31 |     end
32 | 
33 |     def each(&block)
34 |       return to_enum(__method__) unless block_given?
35 | 
36 |       latest_version = "201901"
37 |       url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv"
38 |       download(@data_path, url)
39 |       CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv|
40 |         csv.each do |row|
41 |           record = create_record(row)
42 |           yield record
43 |         end
44 |       end
45 |     end
46 | 
47 |     private
48 |     def create_record(csv_row)
49 |       record = Record.new
50 |       record.unit = csv_row["(単位)"]
51 |       record.open_data_category = csv_row["オープンデータ分類"]
52 |       record.tag = csv_row["タグ"]
53 |       record.release_time = csv_row["公開時期"]
54 |       record.n_volumes = csv_row["冊数等"]
55 |       record.type = csv_row["刊・写"]
56 |       record.publication_year = csv_row["刊年・書写年"]
57 |       record.original_request_code = csv_row["原本請求記号"]
58 |       record.id = csv_row["国文研書誌ID"]
59 |       record.title = csv_row["書名（統一書名）"]
60 |       record.text = csv_row["本文"]
61 |       record.bibliographical_introduction = csv_row["解題"]
62 |       record.year = csv_row["（西暦）"]
63 | 
64 |       record
65 |     end
66 |   end
67 | end
68 | 


--------------------------------------------------------------------------------
/lib/datasets/postal-code-japan.rb:
--------------------------------------------------------------------------------
  1 | require "csv"
  2 | require "zip"
  3 | 
  4 | require_relative "dataset"
  5 | 
  6 | module Datasets
  7 |   class PostalCodeJapan < Dataset
  8 |     class Record < Struct.new(:organization_code,
  9 |                               :old_postal_code,
 10 |                               :postal_code,
 11 |                               :prefecture_reading,
 12 |                               :city_reading,
 13 |                               :address_reading,
 14 |                               :prefecture,
 15 |                               :city,
 16 |                               :address,
 17 |                               :have_multiple_postal_codes,
 18 |                               :have_address_number_per_koaza,
 19 |                               :have_chome,
 20 |                               :postal_code_is_shared,
 21 |                               :changed,
 22 |                               :change_reason)
 23 |       alias_method :have_multiple_postal_codes?,
 24 |                    :have_multiple_postal_codes
 25 |       alias_method :have_address_number_per_koaza?,
 26 |                    :have_address_number_per_koaza
 27 |       alias_method :have_chome?,
 28 |                    :have_chome
 29 |       alias_method :postal_code_is_shared?,
 30 |                    :postal_code_is_shared
 31 |       alias_method :changed?,
 32 |                    :changed
 33 |     end
 34 | 
 35 |     VALID_READINGS = [
 36 |       :lowercase,
 37 |       :uppercase,
 38 |       :romaji,
 39 |     ]
 40 |     def initialize(reading: :lowercase)
 41 |       super()
 42 |       @reading = reading
 43 |       unless VALID_READINGS.include?(@reading)
 44 |         message = +":reading must be one of ["
 45 |         message << VALID_READINGS.collect(&:inspect).join(", ")
 46 |         message << "]: #{@reading.inspect}"
 47 |         raise ArgumentError, message
 48 |       end
 49 |       @metadata.id = "postal-code-japan-#{@reading}"
 50 |       @metadata.name = "Postal code in Japan (#{@reading})"
 51 |       @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html"
 52 |       @metadata.licenses = ["CC0-1.0"]
 53 |       @metadata.description = "Postal code in Japan (reading: #{@reading})"
 54 |     end
 55 | 
 56 |     def each(&block)
 57 |       return to_enum(__method__) unless block_given?
 58 | 
 59 |       open_data do |input|
 60 |         utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932)
 61 |         options = {
 62 |           quote_char: nil,
 63 |           strip: %Q["],
 64 |         }
 65 |         if @reading == :romaji
 66 |           CSV.parse(utf8_data, **options) do |row|
 67 |             yield(Record.new(nil,
 68 |                              nil,
 69 |                              row[0],
 70 |                              row[4],
 71 |                              row[5],
 72 |                              row[6],
 73 |                              row[1],
 74 |                              row[2],
 75 |                              row[3],
 76 |                              false,
 77 |                              false,
 78 |                              false,
 79 |                              false,
 80 |                              false,
 81 |                              nil))
 82 |           end
 83 |         else
 84 |           CSV.parse(utf8_data, **options) do |row|
 85 |             yield(Record.new(row[0],
 86 |                              row[1].rstrip,
 87 |                              row[2],
 88 |                              row[3],
 89 |                              row[4],
 90 |                              row[5],
 91 |                              row[6],
 92 |                              row[7],
 93 |                              row[8],
 94 |                              (row[9] == "1"),
 95 |                              (row[10] == "1"),
 96 |                              (row[11] == "1"),
 97 |                              (row[12] == "1"),
 98 |                              (row[13] != "0"),
 99 |                              convert_change_reason(row[14])))
100 |           end
101 |         end
102 |       end
103 |     end
104 | 
105 |     private
106 |     def open_data
107 |       data_url = +"https://www.post.japanpost.jp/zipcode/dl"
108 |       case @reading
109 |       when :lowercase
110 |         data_url << "/kogaki/zip/ken_all.zip"
111 |       when :uppercase
112 |         data_url << "/oogaki/zip/ken_all.zip"
113 |       when :romaji
114 |         data_url << "/roman/KEN_ALL_ROME.zip"
115 |       end
116 |       data_path = cache_dir_path + "#{@reading}-ken-all.zip"
117 |       download(data_path, data_url)
118 | 
119 |       Zip::File.open(data_path.to_s) do |zip_file|
120 |         zip_file.each do |entry|
121 |           next unless entry.file?
122 |           entry.get_input_stream do |input|
123 |             yield(input)
124 |           end
125 |         end
126 |       end
127 |     end
128 | 
129 |     def convert_change_reason(reason)
130 |       case reason
131 |       when "0"
132 |         nil
133 |       when "1"
134 |         :new
135 |       when "2"
136 |         :japanese_addressing_system
137 |       when "3"
138 |         :land_readjustment
139 |       when "4"
140 |         :postal_district_adjustment
141 |       when "5"
142 |         :correction
143 |       when "6"
144 |         :deletion
145 |       else
146 |         :unknown
147 |       end
148 |     end
149 |   end
150 | end
151 | 


--------------------------------------------------------------------------------
/lib/datasets/quora-duplicate-question-pair.rb:
--------------------------------------------------------------------------------
 1 | require "csv"
 2 | 
 3 | require_relative "dataset"
 4 | 
 5 | module Datasets
 6 |   class QuoraDuplicateQuestionPair < Dataset
 7 |     class Record < Struct.new(:id,
 8 |                               :first_question_id,
 9 |                               :second_question_id,
10 |                               :first_question,
11 |                               :second_question,
12 |                               :duplicated)
13 |       alias_method :duplicated?, :duplicated
14 |     end
15 | 
16 |     def initialize
17 |       super()
18 |       @metadata.id = "quora-duplicate-question-pair"
19 |       @metadata.name = "Quora's duplicated question pair dataset"
20 |       @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs"
21 |       @metadata.licenses = [
22 |         {
23 |           name: "Quora's Terms of Service",
24 |           url: "https://www.quora.com/about/tos",
25 |         }
26 |       ]
27 |     end
28 | 
29 |     def each
30 |       return to_enum(__method__) unless block_given?
31 | 
32 |       open_data do |csv|
33 |         csv.each do |row|
34 |           row["is_duplicate"] = (row["is_duplicate"] == 1)
35 |           record = Record.new(*row.fields)
36 |           yield(record)
37 |         end
38 |       end
39 |     end
40 | 
41 |     private
42 |     def open_data
43 |       data_path = cache_dir_path + "quora_duplicate_questions.tsv"
44 |       data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
45 |       download(data_path, data_url)
46 |       CSV.open(data_path, col_sep: "\t", headers: true, converters: :integer) do |csv|
47 |         yield(csv)
48 |       end
49 |     end
50 |   end
51 | end
52 | 


--------------------------------------------------------------------------------
/lib/datasets/rdataset.rb:
--------------------------------------------------------------------------------
  1 | require_relative "dataset"
  2 | require_relative "tar-gz-readable"
  3 | 
  4 | module Datasets
  5 |   class RdatasetList < Dataset
  6 |     Record = Struct.new(:package,
  7 |                         :dataset,
  8 |                         :title,
  9 |                         :rows,
 10 |                         :cols,
 11 |                         :n_binary,
 12 |                         :n_character,
 13 |                         :n_factor,
 14 |                         :n_logical,
 15 |                         :n_numeric,
 16 |                         :csv,
 17 |                         :doc)
 18 | 
 19 |     def initialize
 20 |       super
 21 |       @metadata.id = "rdataset-list"
 22 |       @metadata.name = "Rdataset"
 23 |       @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/"
 24 |       @metadata.licenses = ["GPL-3"]
 25 |       @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv"
 26 |       @data_path = cache_dir_path + "datasets.csv"
 27 |     end
 28 | 
 29 |     def filter(package: nil, dataset: nil)
 30 |       return to_enum(__method__, package: package, dataset: dataset) unless block_given?
 31 | 
 32 |       conds = {}
 33 |       conds["Package"] = package if package
 34 |       conds["Item"]    = dataset if dataset
 35 |       if conds.empty?
 36 |         each_row {|row| yield Record.new(*row.fields) }
 37 |       else
 38 |         each_row do |row|
 39 |           if conds.all? {|k, v| row[k] == v }
 40 |             yield Record.new(*row.fields)
 41 |           end
 42 |         end
 43 |       end
 44 |     end
 45 | 
 46 |     def each(&block)
 47 |       filter(&block)
 48 |     end
 49 | 
 50 |     private def each_row(&block)
 51 |       download(@data_path, @data_url)
 52 |       CSV.open(@data_path, headers: :first_row, converters: :all) do |csv|
 53 |         csv.each(&block)
 54 |       end
 55 |     end
 56 |   end
 57 | 
 58 |   # For backward compatibility
 59 |   RdatasetsList = RdatasetList
 60 | 
 61 |   class Rdataset < Dataset
 62 |     def initialize(package_name, dataset_name)
 63 |       list = RdatasetList.new
 64 | 
 65 |       info = list.filter(package: package_name, dataset: dataset_name).first
 66 |       unless info
 67 |         raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}"
 68 |       end
 69 | 
 70 |       super()
 71 |       @metadata.id = "rdataset-#{package_name}-#{dataset_name}"
 72 |       @metadata.name = "Rdataset: #{package_name}: #{dataset_name}"
 73 |       @metadata.url = info.csv
 74 |       @metadata.licenses = ["GPL-3"]
 75 |       @metadata.description = info.title
 76 | 
 77 |       # Follow the original directory structure in the cache directory
 78 |       @data_path = cache_dir_path + (dataset_name + ".csv")
 79 | 
 80 |       @package_name = package_name
 81 |       @dataset_name = dataset_name
 82 |     end
 83 | 
 84 |     def each(&block)
 85 |       return to_enum(__method__) unless block_given?
 86 | 
 87 |       download(@data_path, @metadata.url)
 88 | 
 89 |       na_converter = lambda do |field|
 90 |         begin
 91 |           if field.encode(CSV::ConverterEncoding) == "NA"
 92 |             nil
 93 |           else
 94 |             field
 95 |           end
 96 |         rescue
 97 |           field
 98 |         end
 99 |       end
100 | 
101 |       inf_converter = lambda do |field|
102 |         begin
103 |           if field.encode(CSV::ConverterEncoding) == "Inf"
104 |             Float::INFINITY
105 |           else
106 |             field
107 |           end
108 |         rescue
109 |           field
110 |         end
111 |       end
112 | 
113 |       quote_preserving_converter = lambda do |field, info|
114 |         f = field.encode(CSV::ConverterEncoding)
115 |         return f if info.quoted?
116 | 
117 |         begin
118 |           begin
119 |             begin
120 |               return DateTime.parse(f) if f.match?(DateTimeMatcher)
121 |             rescue
122 |               return Integer(f)
123 |             end
124 |           rescue
125 |             return Float(f)
126 |           end
127 |         rescue
128 |           field
129 |         end
130 |       end
131 | 
132 |       table = CSV.table(@data_path,
133 |                         header_converters: [:symbol_raw],
134 |                         # quote_preserving_converter should be the last
135 |                         converters: [na_converter, inf_converter, quote_preserving_converter])
136 |       table.delete(:"") # delete 1st column for indices.
137 | 
138 |       table.each do |row|
139 |         yield row.to_h
140 |       end
141 |     end
142 |   end
143 | 
144 |   # For backward compatibility
145 |   Rdatasets = Rdataset
146 | end
147 | 


--------------------------------------------------------------------------------
/lib/datasets/seaborn.rb:
--------------------------------------------------------------------------------
 1 | require "json"
 2 | 
 3 | module Datasets
 4 |   class SeabornList < Dataset
 5 |     def initialize
 6 |       super
 7 |       @metadata.id = "seaborn-data-list"
 8 |       @metadata.name = "seaborn: data list"
 9 |       @metadata.url = "https://github.com/mwaskom/seaborn-data"
10 |       # Treat as the same license as seaborn
11 |       @metadata.licenses = ["BSD-3-Clause"]
12 |       @metadata.description = "Datasets for seaborn examples."
13 |     end
14 | 
15 |     def each(&block)
16 |       return to_enum(__method__) unless block_given?
17 | 
18 |       data_path = cache_dir_path + "trees.json"
19 |       url = "https://api.github.com/repos/mwaskom/seaborn-data/git/trees/master"
20 |       download(data_path, url)
21 | 
22 |       tree = JSON.parse(File.read(data_path))["tree"]
23 |       tree.each do |content|
24 |         path = content["path"]
25 |         next unless path.end_with?(".csv")
26 |         dataset = File.basename(path, ".csv")
27 |         record = {dataset: dataset}
28 |         yield record
29 |       end
30 |     end
31 |   end
32 | 
33 |   class Seaborn < Dataset
34 |     URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze
35 | 
36 |     def initialize(name)
37 |       super()
38 |       @metadata.id = "seaborn-#{name}"
39 |       @metadata.name = "seaborn: #{name}"
40 |       @metadata.url = URL_FORMAT % {name: name}
41 |       # @metadata.licenses = TODO
42 | 
43 |       @name = name
44 |     end
45 | 
46 |     def each(&block)
47 |       return to_enum(__method__) unless block_given?
48 | 
49 |       data_path = cache_dir_path + "#{@name}.csv"
50 |       download(data_path, @metadata.url)
51 |       CSV.open(data_path, headers: :first_row, converters: :all) do |csv|
52 |         csv.each do |row|
53 |           record = prepare_record(row)
54 |           yield record
55 |         end
56 |       end
57 |     end
58 | 
59 |     private
60 |     def prepare_record(csv_row)
61 |       record = csv_row.to_h
62 |       record.transform_keys! do |key|
63 |         if key.nil?
64 |           :index
65 |         else
66 |           key.to_sym
67 |         end
68 |       end
69 | 
70 |       # Perform the same preprocessing as seaborn's load_dataset function
71 |       preprocessor = :"preprocess_#{@name}_record"
72 |       __send__(preprocessor, record) if respond_to?(preprocessor, true)
73 | 
74 |       record
75 |     end
76 | 
77 |     # The same preprocessing as seaborn.load_dataset
78 |     def preprocess_flights_record(record)
79 |       record[:month] &&= record[:month][0,3]
80 |     end
81 | 
82 |     # The same preprocessing as seaborn.load_dataset
83 |     def preprocess_penguins_record(record)
84 |       record[:sex] &&= record[:sex].capitalize
85 |     end
86 |   end
87 | 
88 |   # For backward compatibility
89 |   SeabornData = Seaborn
90 | end
91 | 


--------------------------------------------------------------------------------
/lib/datasets/sudachi-synonym-dictionary.rb:
--------------------------------------------------------------------------------
  1 | require "csv"
  2 | 
  3 | require_relative "dataset"
  4 | 
  5 | module Datasets
  6 |   class SudachiSynonymDictionary < Dataset
  7 |     class Synonym < Struct.new(:group_id,
  8 |                                :is_noun,
  9 |                                :expansion_type,
 10 |                                :lexeme_id,
 11 |                                :form_type,
 12 |                                :acronym_type,
 13 |                                :variant_type,
 14 |                                :categories,
 15 |                                :notation)
 16 |       alias_method :noun?, :is_noun
 17 |     end
 18 | 
 19 |     def initialize
 20 |       super()
 21 |       @metadata.id = "sudachi-synonym-dictionary"
 22 |       @metadata.name = "Sudachi synonym dictionary"
 23 |       @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md"
 24 |       @metadata.licenses = ["Apache-2.0"]
 25 |       @metadata.description = lambda do
 26 |         download_description
 27 |       end
 28 |     end
 29 | 
 30 |     def each
 31 |       return to_enum(__method__) unless block_given?
 32 | 
 33 |       lexeme_id_context = {}
 34 |       open_data do |csv|
 35 |         csv.each do |row|
 36 |           group_id = row[0]
 37 |           if group_id != lexeme_id_context[:group_id]
 38 |             lexeme_id_context[:group_id] = group_id
 39 |             lexeme_id_context[:counter] = 0
 40 |           end
 41 |           is_noun = (row[1] == "1")
 42 |           expansion_type = normalize_expansion_type(row[2])
 43 |           lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context)
 44 |           form_type = normalize_form_type(row[4])
 45 |           acronym_type = normalize_acronym_type(row[5])
 46 |           variant_type = normalize_variant_type(row[6])
 47 |           categories = normalize_categories(row[7])
 48 |           notation = row[8]
 49 |           synonym = Synonym.new(group_id,
 50 |                                 is_noun,
 51 |                                 expansion_type,
 52 |                                 lexeme_id,
 53 |                                 form_type,
 54 |                                 acronym_type,
 55 |                                 variant_type,
 56 |                                 categories,
 57 |                                 notation)
 58 |           yield(synonym)
 59 |         end
 60 |       end
 61 |     end
 62 | 
 63 |     private
 64 |     def open_data
 65 |       data_path = cache_dir_path + "synonyms.txt"
 66 |       data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt"
 67 |       download(data_path, data_url)
 68 |       CSV.open(data_path,
 69 |                encoding: "UTF-8",
 70 |                skip_blanks: true) do |csv|
 71 |         yield(csv)
 72 |       end
 73 |     end
 74 | 
 75 |     def download_description
 76 |       description_path = cache_dir_path + "synonyms.md"
 77 |       description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md"
 78 |       download(description_path, description_url)
 79 |       description_path.read
 80 |     end
 81 | 
 82 |     def normalize_expansion_type(type)
 83 |       case type
 84 |       when "0", ""
 85 |         :always
 86 |       when "1"
 87 |         :expanded
 88 |       when "2"
 89 |         :never
 90 |       else
 91 |         raise Error, "unknown expansion type: #{type.inspect}"
 92 |       end
 93 |     end
 94 | 
 95 |     def normalize_lexeme_id(id, context)
 96 |       case id
 97 |       when ""
 98 |         lexeme_id_context[:counter] += 1
 99 |         lexeme_id_context[:counter]
100 |       else
101 |         # Use only the first lexeme ID.
102 |         # Example:
103 |         #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,,
104 |         #   000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,,
105 |         Integer(id.split("/").first, 10)
106 |       end
107 |     end
108 | 
109 |     def normalize_form_type(type)
110 |       case type
111 |       when "0", ""
112 |         :typical
113 |       when "1"
114 |         :translation
115 |       when "2"
116 |         :alias
117 |       when "3"
118 |         :old_name
119 |       when "4"
120 |         :misnomer
121 |       else
122 |         raise Error, "unknown form type: #{type.inspect}"
123 |       end
124 |     end
125 | 
126 |     def normalize_acronym_type(type)
127 |       case type
128 |       when "0", ""
129 |         :typical
130 |       when "1"
131 |         :alphabet
132 |       when "2"
133 |         :others
134 |       else
135 |         raise Error, "unknown acronym type: #{type.inspect}"
136 |       end
137 |     end
138 | 
139 |     def normalize_variant_type(type)
140 |       case type
141 |       when "0", ""
142 |         :typical
143 |       when "1"
144 |         :alphabet
145 |       when "2"
146 |         :general
147 |       when "3"
148 |         :misspelled
149 |       else
150 |         raise Error, "unknown variant type: #{type.inspect}"
151 |       end
152 |     end
153 | 
154 |     def normalize_categories(categories)
155 |       case categories
156 |       when ""
157 |         nil
158 |       when /\A\((.*)\)\z/
159 |         $1.split("/")
160 |       else
161 |         raise Error, "invalid categories: #{categories.inspect}"
162 |       end
163 |     end
164 |   end
165 | end
166 | 


--------------------------------------------------------------------------------
/lib/datasets/table.rb:
--------------------------------------------------------------------------------
  1 | require "datasets/dictionary"
  2 | 
  3 | module Datasets
  4 |   class Table
  5 |     class Record
  6 |       include Enumerable
  7 | 
  8 |       def initialize(table, index)
  9 |         @table = table
 10 |         @index = index
 11 |       end
 12 | 
 13 |       def [](column_name_or_column_index)
 14 |         @table[column_name_or_column_index][@index]
 15 |       end
 16 | 
 17 |       def each
 18 |         return to_enum(__method__) unless block_given?
 19 |         @table.each_column.each do |column_name, column_values|
 20 |           yield(column_name, column_values[@index])
 21 |         end
 22 |       end
 23 | 
 24 |       def values
 25 |         @table.each_column.collect do |_column_name, column_values|
 26 |           column_values[@index]
 27 |         end
 28 |       end
 29 | 
 30 |       def to_h
 31 |         hash = {}
 32 |         each do |column_name, column_value|
 33 |           hash[column_name] = column_value
 34 |         end
 35 |         hash
 36 |       end
 37 | 
 38 |       def inspect
 39 |         "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>"
 40 |       end
 41 |     end
 42 | 
 43 |     include Enumerable
 44 | 
 45 |     attr_reader :dataset
 46 |     def initialize(dataset)
 47 |       @dataset = dataset
 48 |       @dictionaries = {}
 49 |     end
 50 | 
 51 |     def n_columns
 52 |       columner_data.size
 53 |     end
 54 |     alias_method :size, :n_columns
 55 |     alias_method :length, :n_columns
 56 | 
 57 |     def n_rows
 58 |       first_column = columner_data.first
 59 |       return 0 if first_column.nil?
 60 |       first_column[1].size
 61 |     end
 62 | 
 63 |     def column_names
 64 |       columner_data.keys
 65 |     end
 66 | 
 67 |     def each_column(&block)
 68 |       columner_data.each(&block)
 69 |     end
 70 |     alias_method :each, :each_column
 71 | 
 72 |     def each_record
 73 |       return to_enum(__method__) unless block_given?
 74 |       n_rows.times do |i|
 75 |         yield(Record.new(self, i))
 76 |       end
 77 |     end
 78 | 
 79 |     def find_record(row)
 80 |       row += n_rows if row < 0
 81 |       return nil if row < 0
 82 |       return nil if row >= n_rows
 83 |       Record.new(self, row)
 84 |     end
 85 | 
 86 |     def [](name_or_index)
 87 |       case name_or_index
 88 |       when Integer
 89 |         index = name_or_index
 90 |         columner_data.each_with_index do |(_name, values), i|
 91 |           return values if i == index
 92 |         end
 93 |         nil
 94 |       else
 95 |         name = name_or_index
 96 |         columner_data[normalize_name(name)]
 97 |       end
 98 |     end
 99 | 
100 |     def dictionary_encode(name)
101 |       @dictionaries[normalize_name(name)] ||= Dictionary.new(self[name])
102 |     end
103 | 
104 |     def label_encode(name)
105 |       dictionary = dictionary_encode(name)
106 |       dictionary.encode(self[name])
107 |     end
108 | 
109 |     def fetch_values(*keys)
110 |       data = columner_data
111 |       keys.collect do |key|
112 |         if data.key?(key)
113 |           data[key]
114 |         else
115 |           raise build_key_error(key) unless block_given?
116 |           yield(key)
117 |         end
118 |       end
119 |     end
120 | 
121 |     def to_h
122 |       columns = {}
123 |       @dataset.each do |record|
124 |         record.to_h.each do |name, value|
125 |           values = (columns[name] ||= [])
126 |           values << value
127 |         end
128 |       end
129 |       columns
130 |     end
131 | 
132 |     private
133 |     begin
134 |       KeyError.new("message", receiver: self, key: :key)
135 |     rescue ArgumentError
136 |       def build_key_error(key)
137 |         KeyError.new("key not found: #{key.inspect}")
138 |       end
139 |     else
140 |       def build_key_error(key)
141 |         KeyError.new("key not found: #{key.inspect}",
142 |                      receiver: self,
143 |                      key: key)
144 |       end
145 |     end
146 | 
147 |     def columner_data
148 |       @columns ||= to_h
149 |     end
150 | 
151 |     def normalize_name(name)
152 |       name.to_sym
153 |     end
154 |   end
155 | end
156 | 


--------------------------------------------------------------------------------
/lib/datasets/tar-gz-readable.rb:
--------------------------------------------------------------------------------
 1 | require "rubygems/package"
 2 | require "zlib"
 3 | 
 4 | module Datasets
 5 |   module TarGzReadable
 6 |     def open_tar_gz(data_path)
 7 |       Zlib::GzipReader.open(data_path) do |f|
 8 |         Gem::Package::TarReader.new(f) do |tar|
 9 |           yield(tar)
10 |         end
11 |       end
12 |     end
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------
/lib/datasets/version.rb:
--------------------------------------------------------------------------------
1 | module Datasets
2 |   VERSION = "0.2.1"
3 | end
4 | 


--------------------------------------------------------------------------------
/lib/datasets/wikipedia.rb:
--------------------------------------------------------------------------------
  1 | require "rexml/streamlistener"
  2 | require "rexml/parsers/baseparser"
  3 | require "rexml/parsers/streamparser"
  4 | require "time"
  5 | 
  6 | require_relative "dataset"
  7 | 
  8 | module Datasets
  9 |   class Wikipedia < Dataset
 10 |     Contributor = Struct.new(:user_name,
 11 |                              :id)
 12 |     Revision = Struct.new(:id,
 13 |                           :parent_id,
 14 |                           :timestamp,
 15 |                           :contributor,
 16 |                           :minor,
 17 |                           :comment,
 18 |                           :model,
 19 |                           :format,
 20 |                           :text,
 21 |                           :sha1)
 22 |     Page = Struct.new(:title,
 23 |                       :namespace,
 24 |                       :id,
 25 |                       :restrictions,
 26 |                       :redirect,
 27 |                       :revision)
 28 | 
 29 |     def initialize(language: :en,
 30 |                    type: :articles)
 31 |       super()
 32 |       @language = language
 33 |       @type = type
 34 |       @metadata.id = "wikipedia-#{@language}-#{@type}"
 35 |       @metadata.name = "Wikipedia #{@type} (#{@language})"
 36 |       @metadata.url = "https://dumps.wikimedia.org/"
 37 |       @metadata.licenses = [
 38 |         "CC-BY-SA-3.0",
 39 |         "CC-BY-SA-4.0",
 40 |         "GFDL-1.3-or-later",
 41 |       ]
 42 |       @metadata.description = "Wikipedia #{@type} in #{@language}"
 43 |     end
 44 | 
 45 |     def each(&block)
 46 |       return to_enum(__method__) unless block_given?
 47 | 
 48 |       open_data do |input|
 49 |         listener = ArticlesListener.new(block)
 50 |         parser = REXML::Parsers::StreamParser.new(input, listener)
 51 |         parser.parse
 52 |       end
 53 |     end
 54 | 
 55 |     private
 56 |     def base_name
 57 |       "#{@language}wiki-latest-#{type_in_path}.xml.bz2"
 58 |     end
 59 | 
 60 |     def data_path
 61 |       cache_dir_path + base_name
 62 |     end
 63 | 
 64 |     def open_data(&block)
 65 |       data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}"
 66 |       bz2 = Enumerator.new do |yielder|
 67 |         download(data_path, data_url) do |bz2_chunk|
 68 |           yielder << bz2_chunk
 69 |         end
 70 |       end
 71 |       extract_bz2(bz2, &block)
 72 |     end
 73 | 
 74 |     def type_in_path
 75 |       case @type
 76 |       when :articles
 77 |         "pages-articles"
 78 |       else
 79 |         @type.to_s
 80 |       end
 81 |     end
 82 | 
 83 |     class ArticlesListener
 84 |       include REXML::StreamListener
 85 | 
 86 |       def initialize(block)
 87 |         @block = block
 88 |         @page = nil
 89 |         @revision = nil
 90 |         @contributor = nil
 91 |         @current_tag = nil
 92 |         @tag_stack = []
 93 |         @text_stack = [+""]
 94 |         @first_page = true
 95 |       end
 96 | 
 97 |       def tag_start(name, attributes)
 98 |         push_stacks(name)
 99 |         case name
100 |         when "page"
101 |           @page = Page.new
102 |         when "revision"
103 |           @revision = Revision.new
104 |         when "contributor"
105 |           @contributor = Contributor.new
106 |         when "redirect"
107 |           @page.redirect = attributes["title"]
108 |         end
109 |       end
110 | 
111 |       def tag_end(name)
112 |         case name
113 |         when "page"
114 |           on_page(@page)
115 |           @page = nil
116 |         when "title"
117 |           @page.title = @text_stack.last
118 |         when "ns"
119 |           @page.namespace = Integer(@text_stack.last)
120 |         when "id"
121 |           id = Integer(@text_stack.last)
122 |           case @tag_stack[-2]
123 |           when "page"
124 |             @page.id = id
125 |           when "revision"
126 |             @revision.id = id
127 |           when "contributor"
128 |             @contributor.id = id
129 |           end
130 |         when "restrictions"
131 |           @page.restrictions = @text_stack.last.split(":")
132 |         when "revision"
133 |           @page.revision = @revision
134 |           @revision = nil
135 |         when "parentid"
136 |           @revision.parent_id = Integer(@text_stack.last)
137 |         when "timestamp"
138 |           @revision.timestamp = Time.iso8601(@text_stack.last)
139 |         when "contributor"
140 |           @revision.contributor = @contributor
141 |           @contributor = nil
142 |         when "username"
143 |           @contributor.user_name = @text_stack.last
144 |         when "minor"
145 |           # TODO
146 |         when "comment"
147 |           @revision.comment = @text_stack.last
148 |         when "model"
149 |           @revision.model = @text_stack.last
150 |         when "format"
151 |           @revision.format = @text_stack.last
152 |         when "text"
153 |           @revision.text = @text_stack.last
154 |         when "sha1"
155 |           @revision.sha1 = @text_stack.last
156 |         end
157 |         pop_stacks
158 |       end
159 | 
160 |       def text(data)
161 |         @text_stack.last << data
162 |       end
163 | 
164 |       def cdata(content)
165 |         @text_stack.last << content
166 |       end
167 | 
168 |       private
169 |       def on_page(page)
170 |         @block.call(page)
171 |       end
172 | 
173 |       def push_stacks(tag)
174 |         @tag_stack << tag
175 |         @text_stack << +""
176 |       end
177 | 
178 |       def pop_stacks
179 |         @text_stack.pop
180 |         @tag_stack.pop
181 |       end
182 |     end
183 |   end
184 | end
185 | 


--------------------------------------------------------------------------------
/lib/datasets/wine.rb:
--------------------------------------------------------------------------------
 1 | require 'csv'
 2 | 
 3 | require_relative 'dataset'
 4 | 
 5 | module Datasets
 6 |   class Wine < Dataset
 7 |     Record = Struct.new(:label,
 8 |                         :alcohol,
 9 |                         :malic_acid,
10 |                         :ash,
11 |                         :alcalinity_of_ash,
12 |                         :n_magnesiums,
13 |                         :total_phenols,
14 |                         :total_flavonoids,
15 |                         :total_nonflavanoid_phenols,
16 |                         :total_proanthocyanins,
17 |                         :color_intensity,
18 |                         :hue,
19 |                         :optical_nucleic_acid_concentration,
20 |                         :n_prolines)
21 | 
22 |     def initialize
23 |       super
24 |       @metadata.id = 'wine'
25 |       @metadata.name = 'Wine'
26 |       @metadata.url = 'https://archive.ics.uci.edu/ml/datasets/wine'
27 |       @metadata.licenses = ["CC-BY-4.0"]
28 |       @metadata.description = -> { read_names }
29 |     end
30 | 
31 |     def each
32 |       return to_enum(__method__) unless block_given?
33 | 
34 |       open_data do |csv|
35 |         csv.each do |row|
36 |           next if row[0].nil?
37 |           record = Record.new(*row)
38 |           yield(record)
39 |         end
40 |       end
41 |     end
42 | 
43 |     private
44 | 
45 |     def read_names
46 |       names_path = cache_dir_path + 'wine.names'
47 |       names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
48 |       download(names_path, names_url)
49 |       names_path.read
50 |     end
51 | 
52 |     def open_data
53 |       data_path = cache_dir_path + 'wine.data'
54 |       data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
55 |       download(data_path, data_url)
56 |       CSV.open(data_path, converters: %i[numeric]) do |csv|
57 |         yield(csv)
58 |       end
59 |     end
60 |   end
61 | end
62 | 


--------------------------------------------------------------------------------
/lib/datasets/zip-extractor.rb:
--------------------------------------------------------------------------------
 1 | require 'zip'
 2 | 
 3 | module Datasets
 4 |   class ZipExtractor
 5 |     def initialize(path)
 6 |       @path = path
 7 |     end
 8 | 
 9 |     def extract_first_file
10 |       Zip::File.open(@path) do |zip_file|
11 |         zip_file.each do |entry|
12 |           next unless entry.file?
13 | 
14 |           entry.get_input_stream do |input|
15 |             return yield(input)
16 |           end
17 |         end
18 |       end
19 |       nil
20 |     end
21 | 
22 |     def extract_file(file_path)
23 |       Zip::File.open(@path) do |zip_file|
24 |         zip_file.each do |entry|
25 |           next unless entry.file?
26 |           next unless entry.name == file_path
27 | 
28 |           entry.get_input_stream do |input|
29 |             return yield(input)
30 |           end
31 |         end
32 |       end
33 |       nil
34 |     end
35 | 
36 |     def extract_files
37 |       Zip::File.open(@path) do |zip_file|
38 |         zip_file.each do |entry|
39 |           next unless entry.file?
40 | 
41 |           entry.get_input_stream do |input|
42 |             yield(input)
43 |           end
44 |         end
45 |       end
46 |     end
47 |   end
48 | end
49 | 


--------------------------------------------------------------------------------
/red-datasets.gemspec:
--------------------------------------------------------------------------------
 1 | # -*- ruby -*-
 2 | 
 3 | clean_white_space = lambda do |entry|
 4 |   entry.gsub(/(\A\n+|\n+\z)/, '') + "\n"
 5 | end
 6 | 
 7 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib"))
 8 | require "datasets/version"
 9 | 
10 | Gem::Specification.new do |spec|
11 |   spec.name = "red-datasets"
12 |   spec.version = Datasets::VERSION
13 |   spec.homepage = "https://github.com/red-data-tools/red-datasets"
14 |   spec.authors = ["tomisuker", "Kouhei Sutou"]
15 |   spec.email = ["tomisuker16@gmail.com", "kou@clear-code.com"]
16 | 
17 |   readme = File.read("README.md")
18 |   readme.force_encoding("UTF-8")
19 |   entries = readme.split(/^\#\#\s(.*)$/)
20 |   clean_white_space.call(entries[entries.index("Description") + 1])
21 |   description = clean_white_space.call(entries[entries.index("Description") + 1])
22 |   spec.summary, spec.description, = description.split(/\n\n+/, 3)
23 |   spec.license = "MIT"
24 |   spec.files = [
25 |     "README.md",
26 |     "LICENSE.txt",
27 |     "Rakefile",
28 |     "Gemfile",
29 |     "#{spec.name}.gemspec",
30 |   ]
31 |   spec.files += [".yardopts"]
32 |   spec.files += Dir.glob("lib/**/*.rb")
33 |   spec.files += Dir.glob("image/*.*")
34 |   spec.files += Dir.glob("doc/text/*")
35 |   spec.test_files += Dir.glob("test/**/*")
36 | 
37 |   spec.add_runtime_dependency("csv", ">= 3.2.4")
38 |   spec.add_runtime_dependency("rexml")
39 |   spec.add_runtime_dependency("rubyzip")
40 | 
41 |   spec.add_development_dependency("bundler")
42 |   spec.add_development_dependency("rake")
43 |   spec.add_development_dependency("test-unit")
44 |   spec.add_development_dependency("yard")
45 |   spec.add_development_dependency("kramdown")
46 | end
47 | 


--------------------------------------------------------------------------------
/test/helper.rb:
--------------------------------------------------------------------------------
 1 | require "fileutils"
 2 | require "pathname"
 3 | require "time"
 4 | require "tmpdir"
 5 | 
 6 | require "datasets"
 7 | 
 8 | require "test-unit"
 9 | 
10 | module Helper
11 |   module Sandbox
12 |     def setup_sandbox
13 |       @tmp_dir = (Pathname.new(__dir__) + "tmp").expand_path
14 |       FileUtils.mkdir_p(@tmp_dir)
15 |     end
16 | 
17 |     def teardown_sandbox
18 |       return unless defined?(@tmp_dir)
19 |       FileUtils.rm_rf(@tmp_dir)
20 |     end
21 |   end
22 | 
23 |   module PathRestorable
24 |     def restore_path(path)
25 |       unless path.exist?
26 |         return yield
27 |       end
28 | 
29 |       Dir.mktmpdir do |dir|
30 |         FileUtils.cp_r(path, dir, preserve: true)
31 |         begin
32 |           yield
33 |         ensure
34 |           FileUtils.rmtree(path, secure: true) if path.exist?
35 |           FileUtils.cp_r(Pathname(dir) + path.basename,
36 |                          path,
37 |                          preserve: true)
38 |         end
39 |       end
40 |     end
41 |   end
42 | end
43 | 


--------------------------------------------------------------------------------
/test/japanese-date-parser-test.rb:
--------------------------------------------------------------------------------
 1 | class JapaneseDateParserTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @parser = Datasets::JapaneseDateParser.new
 4 |   end
 5 | 
 6 |   data("month and day with leading a space in Heisei", ["H10.01.01", "平成10年 1月 1日"])
 7 |   data("month         with leading a space in Heisei", ["H10.01.10", "平成10年 1月10日"])
 8 |   data("          day with leading a space in Heisei", ["H10.10.01", "平成10年10月 1日"])
 9 |   data("           without leading a space in Heisei", ["H10.10.10", "平成10年10月10日"])
10 |   data("year, month and day with leading a space in Reiwa", ["R02.01.01", "令和 2年 1月 1日"])
11 |   data("year, month         with leading a space in Reiwa", ["R02.01.10", "令和 2年 1月10日"])
12 |   data("year,           day with leading a space in Reiwa", ["R02.10.01", "令和 2年10月 1日"])
13 |   data("year,            without leading a space in Reiwa", ["R02.10.10", "令和 2年10月10日"])
14 |   data("boundary within Heisei", ["H31.04.30", "平成31年 4月30日"])
15 |   data("boundary within Reiwa", ["R01.05.01", "令和元年 5月 1日"])
16 |   test("#parse") do
17 |     expected_jisx0301, japanese_date_string = data
18 |     assert_equal(expected_jisx0301, @parser.parse(japanese_date_string).jisx0301)
19 |   end
20 | 
21 |   test("unsupported era initial range") do
22 |     expected_message = "era must be one of [平成, 令和]: 昭和"
23 |     assert_raise(Datasets::JapaneseDateParser::UnsupportedEraInitialRange.new(expected_message)) do
24 |       @parser.parse("昭和元年 1月 1日")
25 |     end
26 |   end
27 | end
28 | 


--------------------------------------------------------------------------------
/test/run-test.rb:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | $VERBOSE = true
 4 | 
 5 | require "pathname"
 6 | 
 7 | base_dir = Pathname.new(__FILE__).dirname.parent.expand_path
 8 | 
 9 | lib_dir = base_dir + "lib"
10 | test_dir = base_dir + "test"
11 | 
12 | $LOAD_PATH.unshift(lib_dir.to_s)
13 | 
14 | require_relative "helper"
15 | 
16 | ARGV.unshift("--max-diff-target-string-size=#{10 * 1024}")
17 | 
18 | exit(Test::Unit::AutoRunner.run(true, test_dir.to_s))
19 | 


--------------------------------------------------------------------------------
/test/test-adult.rb:
--------------------------------------------------------------------------------
 1 | class AdultTest < Test::Unit::TestCase
 2 |   sub_test_case("train") do
 3 |     def setup
 4 |       @dataset = Datasets::Adult.new(type: :train)
 5 |     end
 6 | 
 7 |     def record(*args)
 8 |       Datasets::Adult::Record.new(*args)
 9 |     end
10 | 
11 |     test("#each") do
12 |       assert_equal({
13 |                      :age => 39,
14 |                      :work_class => "State-gov",
15 |                      :final_weight => 77516,
16 |                      :education => "Bachelors",
17 |                      :n_education_years => 13,
18 |                      :marital_status => "Never-married",
19 |                      :occupation => "Adm-clerical",
20 |                      :relationship => "Not-in-family",
21 |                      :race => "White",
22 |                      :sex => "Male",
23 |                      :capital_gain => 2174,
24 |                      :capital_loss => 0,
25 |                      :hours_per_week => 40,
26 |                      :native_country => "United-States",
27 |                      :label => "<=50K"
28 |                    },
29 |                    @dataset.each.next.to_h)
30 |     end
31 |   end
32 | 
33 |   sub_test_case("test") do
34 |     def setup
35 |       @dataset = Datasets::Adult.new(type: :test)
36 |     end
37 | 
38 |     def record(*args)
39 |       Datasets::Adult::Record.new(*args)
40 |     end
41 | 
42 |     test("#each") do
43 |       assert_equal({
44 |                      :age => 25,
45 |                      :work_class => "Private",
46 |                      :final_weight => 226802,
47 |                      :education => "11th",
48 |                      :n_education_years => 7,
49 |                      :marital_status => "Never-married",
50 |                      :occupation => "Machine-op-inspct",
51 |                      :relationship => "Own-child",
52 |                      :race => "Black",
53 |                      :sex => "Male",
54 |                      :capital_gain => 0,
55 |                      :capital_loss => 0,
56 |                      :hours_per_week => 40,
57 |                      :native_country => "United-States",
58 |                      :label => "<=50K.",
59 |                    },
60 |                    @dataset.each.next.to_h)
61 |     end
62 |   end
63 | 
64 |   sub_test_case("#metadata") do
65 |     def setup
66 |       @dataset = Datasets::Adult.new(type: :train)
67 |     end
68 | 
69 |     test("#description") do
70 |       description = @dataset.metadata.description
71 |       assert do
72 |         description.start_with?("| This data was extracted from the census bureau database found at")
73 |       end
74 |     end
75 |   end
76 | end
77 | 


--------------------------------------------------------------------------------
/test/test-afinn.rb:
--------------------------------------------------------------------------------
 1 | class AFINNTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::AFINN.new
 4 |   end
 5 | 
 6 |   test('#each') do
 7 |     records = @dataset.each.to_a
 8 |     assert_equal([
 9 |                    2477,
10 |                    {
11 |                      :valence => -2,
12 |                      :word => "abandon"
13 |                    },
14 |                    {
15 |                      :valence => 2,
16 |                      :word => "zealous"
17 |                    },
18 |                  ],
19 |                  [
20 |                    records.size,
21 |                    records[0].to_h,
22 |                    records[-1].to_h,
23 |                  ])
24 |   end
25 | 
26 |   sub_test_case('#metadata') do
27 |     test('#description') do
28 |       description = @dataset.metadata.description
29 |       assert_equal(<<-DESCRIPTION.chomp, description)
30 | AFINN is a list of English words rated for valence with an integer
31 | between minus five (negative) and plus five (positive). The words have
32 | been manually labeled by Finn Årup Nielsen in 2009-2011. The file
33 | is tab-separated. There are two versions:
34 | 
35 | AFINN-111: Newest version with 2477 words and phrases.
36 | 
37 | An evaluation of the word list is available in:
38 | 
39 | Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for
40 | sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903
41 | 
42 | The list was used in: 
43 | 
44 | Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni,
45 | Michael Etter, "Good Friends, Bad News - Affect and Virality in
46 | Twitter", The 2011 International Workshop on Social Computing,
47 | Network, and Services (SocialComNet 2011).
48 | 
49 | 
50 | This database of words is copyright protected and distributed under
51 | "Open Database License (ODbL) v1.0"
52 | http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar
53 | copyleft license.
54 | 
55 | See comments on the word list here:
56 | http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis
57 |       DESCRIPTION
58 |     end
59 |   end
60 | end
61 | 


--------------------------------------------------------------------------------
/test/test-california-housing.rb:
--------------------------------------------------------------------------------
 1 | class CaliforniaHousingTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::CaliforniaHousing.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::CaliforniaHousing::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     assert_equal({
12 |                    median_house_value: 452600.000000,
13 |                    median_income: 8.325200,
14 |                    housing_median_age: 41.000000,
15 |                    total_rooms: 880.000000,
16 |                    total_bedrooms: 129.000000,
17 |                    population: 322.000000,
18 |                    households: 126.000000,
19 |                    latitude: 37.880000,
20 |                    longitude: -122.230000,
21 |                  },
22 |                  @dataset.each.next.to_h)
23 |   end
24 | 
25 |   sub_test_case("#metadata") do
26 |     test("#description") do
27 |       description = @dataset.metadata.description
28 |       assert_equal(<<-DESCRIPTION, description)
29 | Housing information from the 1990 census used in
30 | Pace, R. Kelley and Ronald Barry,
31 | "Sparse Spatial Autoregressions",
32 | Statistics and Probability Letters, 33 (1997) 291-297.
33 | Available from http://lib.stat.cmu.edu/datasets/.
34 |       DESCRIPTION
35 |     end
36 |   end
37 | end
38 | 


--------------------------------------------------------------------------------
/test/test-dataset.rb:
--------------------------------------------------------------------------------
 1 | class TestDataset < Test::Unit::TestCase
 2 |   sub_test_case("#clear_cache!") do
 3 |     include Helper::PathRestorable
 4 | 
 5 |     def setup
 6 |       @dataset = Datasets::Iris.new
 7 |       @cache_dir_path = @dataset.send(:cache_dir_path)
 8 |     end
 9 | 
10 |     test("when the dataset is downloaded") do
11 |       @dataset.first # This ensures the dataset downloaded
12 |       existence = {before: @cache_dir_path.join("iris.csv").exist?}
13 | 
14 |       restore_path(@cache_dir_path) do
15 |         @dataset.clear_cache!
16 |         existence[:after] = @cache_dir_path.join("iris.csv").exist?
17 | 
18 |         assert_equal({before: true, after: false},
19 |                      existence)
20 |       end
21 |     end
22 | 
23 |     test("when the dataset is not downloaded") do
24 |       restore_path(@cache_dir_path) do
25 |         if @cache_dir_path.exist?
26 |           FileUtils.rmtree(@cache_dir_path.to_s, secure: true)
27 |         end
28 | 
29 |         assert_nothing_raised do
30 |           @dataset.clear_cache!
31 |         end
32 |       end
33 |     end
34 |   end
35 | end
36 | 


--------------------------------------------------------------------------------
/test/test-diamonds.rb:
--------------------------------------------------------------------------------
 1 | class DiamondsTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::Diamonds.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::Diamonds::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     assert_equal({
12 |                    carat: 0.23,
13 |                    clarity: "SI2",
14 |                    color: "E",
15 |                    cut: "Ideal",
16 |                    depth: 61.5,
17 |                    price: 326,
18 |                    table: 55.0,
19 |                    x: 3.95,
20 |                    y: 3.98,
21 |                    z: 2.43,
22 |                  },
23 |                  @dataset.each.next.to_h)
24 |   end
25 | 
26 |   sub_test_case("#metadata") do
27 |     test("#description") do
28 |       description = @dataset.metadata.description
29 |       assert_equal(<<-DESCRIPTION, description)
30 | Prices of over 50,000 round cut diamonds
31 | 
32 | A dataset containing the prices and other attributes of almost 54,000
33 |  diamonds. The variables are as follows:
34 | 
35 | A data frame with 53940 rows and 10 variables:
36 | 
37 |   * price: price in US dollars ($326--$18,823)
38 |   * carat: weight of the diamond (0.2--5.01)
39 |   * cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
40 |   * color: diamond colour, from D (best) to J (worst)
41 |   * clarity: a measurement of how clear the diamond is (I1 (worst), SI2,
42 |     SI1, VS2, VS1, VVS2, VVS1, IF (best))
43 |   * x: length in mm (0--10.74)
44 |   * y: width in mm (0--58.9)
45 |   * z: depth in mm (0--31.8)
46 |   * depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
47 |   * table: width of top of diamond relative to widest point (43--95)
48 |       DESCRIPTION
49 |     end
50 |   end
51 | end
52 | 


--------------------------------------------------------------------------------
/test/test-dictionary.rb:
--------------------------------------------------------------------------------
 1 | class DictionaryTest < Test::Unit::TestCase
 2 |   def setup
 3 |     penn_treebank = Datasets::PennTreebank.new(type: :test)
 4 |     @dictionary = penn_treebank.to_table.dictionary_encode(:word)
 5 |   end
 6 | 
 7 |   test("#id") do
 8 |     assert_equal(95, @dictionary.id("<unk>"))
 9 |   end
10 | 
11 |   test("#value") do
12 |     assert_equal("<unk>", @dictionary.value(95))
13 |   end
14 | 
15 |   test("#ids") do
16 |     assert_equal([0, 1, 2, 3, 4], @dictionary.ids.first(5))
17 |   end
18 | 
19 |   test("#values") do
20 |     assert_equal(["no", "it", "was", "n't", "black"],
21 |                  @dictionary.values.first(5))
22 |   end
23 | 
24 |   test("#each") do
25 |     assert_equal([
26 |                    [0, "no"],
27 |                    [1, "it"],
28 |                    [2, "was"],
29 |                    [3, "n't"],
30 |                    [4, "black"],
31 |                  ],
32 |                  @dictionary.each.first(5).to_a)
33 |   end
34 | 
35 |   test("#size") do
36 |     assert_equal(6048, @dictionary.size)
37 |   end
38 | 
39 |   test("#length") do
40 |     assert_equal(@dictionary.size,
41 |                  @dictionary.length)
42 |   end
43 | end
44 | 


--------------------------------------------------------------------------------
/test/test-downloader.rb:
--------------------------------------------------------------------------------
 1 | class DownloaderTest < Test::Unit::TestCase
 2 |   include Helper::Sandbox
 3 | 
 4 |   sub_test_case("#download") do
 5 |     def setup
 6 |       setup_sandbox
 7 |     end
 8 | 
 9 |     def teardown
10 |       teardown_sandbox
11 |     end
12 | 
13 |     test("too many redirection") do
14 |       first_url = "https://example.com/file"
15 |       last_url = "https://example.com/last_redirection"
16 |       expected_message = "too many redirections: #{first_url} .. #{last_url}"
17 |       output_path = @tmp_dir + "file"
18 |       downloader = Datasets::Downloader.new(first_url)
19 | 
20 |       downloader.define_singleton_method(:start_http) do |url, fallback_urls, headers|
21 |         raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}"
22 |       end
23 | 
24 |       assert_raise(Datasets::Downloader::TooManyRedirects.new(expected_message)) do
25 |         downloader.download(output_path)
26 |       end
27 |     end
28 |   end
29 | end
30 | 


--------------------------------------------------------------------------------
/test/test-fashion-mnist.rb:
--------------------------------------------------------------------------------
  1 | class FashionMNISTTest < Test::Unit::TestCase
  2 |   sub_test_case("Normal") do
  3 |     sub_test_case("train") do
  4 |       def setup
  5 |         @dataset = Datasets::FashionMNIST.new(type: :train)
  6 |       end
  7 | 
  8 |       test("#each") do
  9 |         records = @dataset.each.to_a
 10 |         assert_equal([
 11 |                        60000,
 12 |                        [
 13 |                          9,
 14 |                          784,
 15 |                          [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
 16 |                          [220, 232, 246, 0, 3, 202, 228, 224, 221, 211],
 17 |                        ],
 18 |                        [
 19 |                          5,
 20 |                          784,
 21 |                          [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
 22 |                          [180, 177, 177, 47, 101, 235, 194, 223, 232, 255],
 23 |                        ],
 24 |                      ],
 25 |                      [
 26 |                        records.size,
 27 |                        [
 28 |                          records[0].label,
 29 |                          records[0].pixels.size,
 30 |                          records[0].pixels[400, 10],
 31 |                          records[0].pixels[500, 10],
 32 |                        ],
 33 |                        [
 34 |                          records[-1].label,
 35 |                          records[-1].pixels.size,
 36 |                          records[-1].pixels[400, 10],
 37 |                          records[-1].pixels[500, 10],
 38 |                        ],
 39 |                      ])
 40 |       end
 41 | 
 42 |       test("#to_table") do
 43 |         table_data = @dataset.to_table
 44 |         assert_equal([
 45 |                        [0, 0, 0, 0, 237, 226, 217, 223, 222, 219],
 46 |                        [129, 153, 34, 0, 3, 3, 0, 3, 0, 24],
 47 |                      ],
 48 |                      [
 49 |                        table_data[:pixels][0][400, 10],
 50 |                        table_data[:pixels][-1][400, 10],
 51 |                      ])
 52 |       end
 53 | 
 54 |       sub_test_case("#metadata") do
 55 |         test("#id") do
 56 |           assert_equal("fashion-mnist-train", @dataset.metadata.id)
 57 |         end
 58 | 
 59 |         test("#name") do
 60 |           assert_equal("Fashion-MNIST: train", @dataset.metadata.name)
 61 |         end
 62 |       end
 63 |     end
 64 | 
 65 |     sub_test_case("test") do
 66 |       def setup
 67 |         @dataset = Datasets::FashionMNIST.new(type: :test)
 68 |       end
 69 | 
 70 |       test("#each") do
 71 |         records = @dataset.each.to_a
 72 |         assert_equal([
 73 |                        10000,
 74 |                        [
 75 |                          9,
 76 |                          784,
 77 |                          [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
 78 |                          [172, 161, 189, 62, 0, 68, 94, 90, 111, 114],
 79 |                        ],
 80 |                        [
 81 |                          5,
 82 |                          784,
 83 |                          [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
 84 |                          [63, 74, 72, 0, 1, 0, 0, 0, 4, 85],
 85 |                        ],
 86 |                      ],
 87 |                      [
 88 |                        records.size,
 89 |                        [
 90 |                          records[0].label,
 91 |                          records[0].pixels.size,
 92 |                          records[0].pixels[400, 10],
 93 |                          records[0].pixels[500, 10],
 94 |                        ],
 95 |                        [
 96 |                          records[-1].label,
 97 |                          records[-1].pixels.size,
 98 |                          records[-1].pixels[400, 10],
 99 |                          records[-1].pixels[500, 10],
100 |                        ],
101 |                      ])
102 |       end
103 | 
104 |       test("#to_table") do
105 |         table_data = @dataset.to_table
106 |         assert_equal([
107 |                        [1, 0, 0, 0, 98, 136, 110, 109, 110, 162],
108 |                        [45, 45, 69, 128, 100, 120, 132, 123, 135, 171],
109 |                      ],
110 |                      [
111 |                        table_data[:pixels][0][400, 10],
112 |                        table_data[:pixels][-1][400, 10],
113 |                      ])
114 |       end
115 | 
116 |       sub_test_case("#metadata") do
117 |         test("#id") do
118 |           assert_equal("fashion-mnist-test", @dataset.metadata.id)
119 |         end
120 | 
121 |         test("#name") do
122 |           assert_equal("Fashion-MNIST: test", @dataset.metadata.name)
123 |         end
124 |       end
125 |     end
126 |   end
127 | 
128 |   sub_test_case("Abnormal") do
129 |     test("invalid type") do
130 |       invalid_type = :invalid
131 |       message = "Please set type :train or :test: #{invalid_type.inspect}"
132 |       assert_raise(ArgumentError.new(message)) do
133 |         Datasets::FashionMNIST.new(type: invalid_type)
134 |       end
135 |     end
136 |   end
137 | end
138 | 


--------------------------------------------------------------------------------
/test/test-fuel-economy.rb:
--------------------------------------------------------------------------------
 1 | class FuelEconomyTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::FuelEconomy.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::FuelEconomy::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     records = @dataset.each.to_a
12 |     assert_equal([
13 |                    234,
14 |                    {
15 |                       city_mpg: 18,
16 |                       displacement: 1.8,
17 |                       drive_train: "f",
18 |                       fuel: "p",
19 |                       highway_mpg: 29,
20 |                       manufacturer: "audi",
21 |                       model: "a4",
22 |                       n_cylinders: 4,
23 |                       transmission: "auto(l5)",
24 |                       type: "compact",
25 |                       year: 1999
26 |                    },
27 |                    {
28 |                       city_mpg: 17,
29 |                       displacement: 3.6,
30 |                       drive_train: "f",
31 |                       fuel: "p",
32 |                       highway_mpg: 26,
33 |                       manufacturer: "volkswagen",
34 |                       model: "passat",
35 |                       n_cylinders: 6,
36 |                       transmission: "auto(s6)",
37 |                       type: "midsize",
38 |                       year: 2008
39 |                     },
40 |                  ],
41 |                  [
42 |                    records.size,
43 |                    records[0].to_h,
44 |                    records[-1].to_h
45 |                  ])
46 |   end
47 | 
48 |   sub_test_case("#metadata") do
49 |     test("#description") do
50 |       description = @dataset.metadata.description
51 |       assert_equal(<<-DESCRIPTION, description)
52 | Fuel economy data from 1999 to 2008 for 38 popular models of cars
53 | 
54 | This dataset contains a subset of the fuel economy data that the EPA makes
55 | available on https://fueleconomy.gov/. It contains only models which
56 | had a new release every year between 1999 and 2008 - this was used as a
57 | proxy for the popularity of the car.
58 | 
59 | A data frame with 234 rows and 11 variables:
60 | 
61 |   * manufacturer: manufacturer name
62 |   * model: model name
63 |   * displacement: engine displacement, in litres
64 |   * year: year of manufacture
65 |   * n_cylinders: number of cylinders
66 |   * transmissions: type of transmission
67 |   * drive_train: the type of drive train, where f = front-wheel drive, r = rear wheel drive, 4 = 4wd
68 |   * city_mpg: city miles per gallon
69 |   * highway_mpg: highway miles per gallon
70 |   * fuel: fuel type
71 |   * type: "type" of car
72 |       DESCRIPTION
73 |     end
74 |   end
75 | end
76 | 


--------------------------------------------------------------------------------
/test/test-geolonia.rb:
--------------------------------------------------------------------------------
 1 | class GeoloniaTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::Geolonia.new
 4 |   end
 5 | 
 6 |   test('#each') do
 7 |     assert_equal({
 8 |                    :prefecture_code => "01",
 9 |                    :prefecture_name => "北海道",
10 |                    :prefecture_kana => "ホッカイドウ",
11 |                    :prefecture_romaji => "HOKKAIDO",
12 |                    :municipality_code => "01101",
13 |                    :municipality_name => "札幌市中央区",
14 |                    :municipality_kana => "サッポロシチュウオウク",
15 |                    :municipality_romaji => "SAPPORO SHI CHUO KU",
16 |                    :street_name => "旭ケ丘一丁目",
17 |                    :street_kana => "アサヒガオカ 1",
18 |                    :street_romaji => "ASAHIGAOKA 1",
19 |                    :alias => nil,
20 |                    :latitude => "43.04223",
21 |                    :longitude => "141.319722",
22 |                  },
23 |                  @dataset.each.next.to_h)
24 |   end
25 | 
26 |   sub_test_case("#metadata") do
27 |     test("#description") do
28 |       description = @dataset.metadata.description
29 |       assert_equal([
30 |                      "# Geolonia 住所データ",
31 |                      "## 住所データ仕様",
32 |                      "### ファイルフォーマット",
33 |                      "### 列",
34 |                      "### ソート順",
35 |                    ],
36 |                    description.scan(/^#.*$/),
37 |                    description)
38 |     end
39 |   end
40 | 
41 | end
42 | 


--------------------------------------------------------------------------------
/test/test-hepatitis.rb:
--------------------------------------------------------------------------------
 1 | class HepatitisTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::Hepatitis.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::Hepatitis::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     records = @dataset.each.to_a
12 |     assert_equal([
13 |                    155,
14 |                    {
15 |                      :label => :live,
16 |                      :age => 30,
17 |                      :sex => :female,
18 |                      :steroid => false,
19 |                      :antivirals => true,
20 |                      :fatigue => true,
21 |                      :malaise => true,
22 |                      :anorexia => true,
23 |                      :liver_big => false,
24 |                      :liver_firm => true,
25 |                      :spleen_palpable => true,
26 |                      :spiders => true,
27 |                      :ascites => true,
28 |                      :varices => true,
29 |                      :bilirubin => 1.0,
30 |                      :alkaline_phosphate => 85,
31 |                      :sgot => 18,
32 |                      :albumin => 4.0,
33 |                      :protime => nil,
34 |                      :histology => false,
35 |                    },
36 |                    {
37 |                      :label => :die,
38 |                      :age => 43,
39 |                      :sex => :male,
40 |                      :steroid => true,
41 |                      :antivirals => true,
42 |                      :fatigue => false,
43 |                      :malaise => true,
44 |                      :anorexia => true,
45 |                      :liver_big => true,
46 |                      :liver_firm => true,
47 |                      :spleen_palpable => false,
48 |                      :spiders => false,
49 |                      :ascites => false,
50 |                      :varices => true,
51 |                      :bilirubin => 1.2,
52 |                      :alkaline_phosphate => 100,
53 |                      :sgot => 19,
54 |                      :albumin => 3.1,
55 |                      :protime => 42,
56 |                      :histology => true,
57 |                    }
58 |                  ],
59 |                  [
60 |                    records.size,
61 |                    records[0].to_h,
62 |                    records[-1].to_h,
63 |                  ])
64 |   end
65 | 
66 |   sub_test_case("#metadata") do
67 |     test("#description") do
68 |       description = @dataset.metadata.description
69 |       assert do
70 |         description.start_with?("1. Title: Hepatitis Domain")
71 |       end
72 |     end
73 |   end
74 | end
75 | 


--------------------------------------------------------------------------------
/test/test-house-of-representative.rb:
--------------------------------------------------------------------------------
 1 | class HouseOfRepresentativeTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::HouseOfRepresentative.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::HouseOfRepresentative::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     assert_equal(record(142,
12 |                         "衆法の一覧",
13 |                         nil,
14 |                         139,
15 |                         18,
16 |                         "市民活動促進法案",
17 |                         "成立",
18 |                         "経過",
19 |                         "https://www.shugiin.go.jp/internet/itdb_gian.nsf/html/gian/keika/5516.htm",
20 |                         nil,
21 |                         nil,
22 |                         "衆法",
23 |                         "熊代　昭彦君外四名",
24 |                         %w(自由民主党 社会民主党・市民連合 新党さきがけ),
25 |                         nil,
26 |                         nil,
27 |                         nil,
28 |                         Date.jisx0301("H10.03.04"),
29 |                         Date.jisx0301("H10.03.11"),
30 |                         "内閣",
31 |                         Date.jisx0301("H10.03.17"),
32 |                         "可決",
33 |                         Date.jisx0301("H10.03.19"),
34 |                         "可決",
35 |                         nil,
36 |                         nil,
37 |                         nil,
38 |                         nil,
39 |                         nil,
40 |                         nil,
41 |                         nil,
42 |                         Date.jisx0301("H10.01.12"),
43 |                         "労働・社会政策",
44 |                         Date.jisx0301("H10.03.03"),
45 |                         "修正",
46 |                         Date.jisx0301("H10.03.04"),
47 |                         "修正",
48 |                         Date.jisx0301("H10.03.25"),
49 |                         7,
50 |                         nil,
51 |                         nil),
52 |                  @dataset.each.next)
53 |   end
54 | end
55 | 


--------------------------------------------------------------------------------
/test/test-iris.rb:
--------------------------------------------------------------------------------
 1 | class IrisTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::Iris.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::Iris::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     records = @dataset.each.to_a
12 |     assert_equal([
13 |                    150,
14 |                    record(5.1, 3.5, 1.4, 0.2, "Iris-setosa"),
15 |                    record(5.9, 3.0, 5.1, 1.8, "Iris-virginica"),
16 |                  ],
17 |                  [
18 |                    records.size,
19 |                    records[0],
20 |                    records[-1],
21 |                  ])
22 |   end
23 | 
24 |   sub_test_case("#metadata") do
25 |     test("#description") do
26 |       description = @dataset.metadata.description
27 |       assert do
28 |         description.start_with?("1. Title: Iris Plants Database")
29 |       end
30 |     end
31 |   end
32 | end
33 | 


--------------------------------------------------------------------------------
/test/test-ita-corpus.rb:
--------------------------------------------------------------------------------
 1 | class ITACorpusTest < Test::Unit::TestCase
 2 | 
 3 |   sub_test_case("type") do  
 4 |     test("emotion") do
 5 |       dataset = Datasets::ITACorpus.new(type: :emotion)
 6 |       records = dataset.to_a
 7 |       assert_equal([
 8 |                     100,
 9 |                     {
10 |                       :id => "EMOTION100_001",
11 |                       :sentence => "えっ嘘でしょ。,エッウソデショ。"
12 |                     },
13 |                     {
14 |                       :id => "EMOTION100_100",
15 |                       :sentence => "ラーテャン。,ラーテャン。",
16 |                     },
17 |                   ],
18 |                   [
19 |                     records.size,
20 |                     records[0].to_h,
21 |                     records[-1].to_h,
22 |                   ])
23 |     end
24 | 
25 |     test("recitation") do
26 |       dataset = Datasets::ITACorpus.new(type: :recitation)
27 |       records = dataset.to_a
28 |       assert_equal([
29 |                     324,
30 |                     {
31 |                       :id => "RECITATION324_001",
32 |                       :sentence => "女の子がキッキッ嬉しそう。,オンナノコガキッキッウレシソー。"
33 |                     },
34 |                     {
35 |                       :id => "RECITATION324_324",
36 |                       :sentence => "チュクンの波長は、パツンと共通している。,チュクンノハチョーワ、パツントキョーツウシテイル。",
37 |                     },
38 |                   ],
39 |                   [
40 |                     records.size,
41 |                     records[0].to_h,
42 |                     records[-1].to_h,
43 |                   ])
44 |     end
45 | 
46 |     test("invalid") do
47 |       message = "Please set type :emotion or :recitation: :invalid"
48 |       assert_raise(ArgumentError.new(message)) do
49 |         Datasets::ITACorpus.new(type: :invalid)
50 |       end
51 |     end
52 | 
53 |   end
54 | 
55 |   sub_test_case("#metadata") do
56 |     test("#description") do
57 |       dataset = Datasets::ITACorpus.new(type: :emotion)
58 |       description = dataset.metadata.description
59 |       assert_equal([
60 |                      "# ITAコーパスの文章リスト公開用リポジトリ",
61 |                      "## ITAコーパスとは",
62 |                      "## ITAコーパスの文献情報"
63 |                    ],
64 |                    description.scan(/^#.*$/),
65 |                    description)
66 |     end
67 |   end
68 | 
69 | end
70 | 


--------------------------------------------------------------------------------
/test/test-kuzushiji-mnist.rb:
--------------------------------------------------------------------------------
  1 | class KuzushijiMNISTTest < Test::Unit::TestCase
  2 |   sub_test_case("Normal") do
  3 |     sub_test_case("train") do
  4 |       def setup
  5 |         @dataset = Datasets::KuzushijiMNIST.new(type: :train)
  6 |       end
  7 | 
  8 |       test("#each") do
  9 |         records = @dataset.each.to_a
 10 |         assert_equal([
 11 |                        60000,
 12 |                        [
 13 |                          8,
 14 |                          784,
 15 |                          [213, 233, 255, 186, 2, 0, 0, 0, 0, 0],
 16 |                          [0, 0, 0, 0, 0, 0, 0, 0, 45, 252],
 17 |                        ],
 18 |                        [
 19 |                          9,
 20 |                          784,
 21 |                          [81, 246, 254, 155, 224, 255, 230, 39, 0, 0],
 22 |                          [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 23 |                        ],
 24 |                      ],
 25 |                      [
 26 |                        records.size,
 27 |                        [
 28 |                          records[0].label,
 29 |                          records[0].pixels.size,
 30 |                          records[0].pixels[400, 10],
 31 |                          records[0].pixels[500, 10],
 32 |                        ],
 33 |                        [
 34 |                          records[-1].label,
 35 |                          records[-1].pixels.size,
 36 |                          records[-1].pixels[400, 10],
 37 |                          records[-1].pixels[500, 10],
 38 |                        ],
 39 |                      ])
 40 |       end
 41 | 
 42 |       test("#to_table") do
 43 |         table_data = @dataset.to_table
 44 |         assert_equal([
 45 |                        [213, 233, 255, 186, 2, 0, 0, 0, 0, 0],
 46 |                        [81, 246, 254, 155, 224, 255, 230, 39, 0, 0],
 47 |                      ],
 48 |                      [
 49 |                        table_data[:pixels][0][400, 10],
 50 |                        table_data[:pixels][-1][400, 10],
 51 |                      ])
 52 |       end
 53 | 
 54 |       sub_test_case("#metadata") do
 55 |         test("#id") do
 56 |           assert_equal("kuzushiji-mnist-train", @dataset.metadata.id)
 57 |         end
 58 | 
 59 |         test("#name") do
 60 |           assert_equal("Kuzushiji-MNIST: train", @dataset.metadata.name)
 61 |         end
 62 |       end
 63 |     end
 64 | 
 65 |     sub_test_case("test") do
 66 |       def setup
 67 |         @dataset = Datasets::KuzushijiMNIST.new(type: :test)
 68 |       end
 69 | 
 70 |       test("#each") do
 71 |         records = @dataset.each.to_a
 72 |         assert_equal([
 73 |                        10000,
 74 |                        [
 75 |                          2,
 76 |                          784,
 77 |                          [0, 0, 0, 0, 0, 0, 0, 0, 0, 75],
 78 |                          [44, 255, 255, 246, 119, 252, 46, 0, 70, 255],
 79 |                        ],
 80 |                        [
 81 |                          2,
 82 |                          784,
 83 |                          [0, 0, 0, 0, 0, 0, 0, 84, 255, 192],
 84 |                          [0, 0, 0, 0, 0, 23, 245, 92, 42, 254],
 85 |                        ],
 86 |                      ],
 87 |                      [
 88 |                        records.size,
 89 |                        [
 90 |                          records[0].label,
 91 |                          records[0].pixels.size,
 92 |                          records[0].pixels[400, 10],
 93 |                          records[0].pixels[500, 10],
 94 |                        ],
 95 |                        [
 96 |                          records[-1].label,
 97 |                          records[-1].pixels.size,
 98 |                          records[-1].pixels[400, 10],
 99 |                          records[-1].pixels[500, 10],
100 |                        ],
101 |                      ])
102 |       end
103 | 
104 |       test("#to_table") do
105 |         table_data = @dataset.to_table
106 |         assert_equal([
107 |                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 75],
108 |                        [0, 0, 0, 0, 0, 0, 0, 84, 255, 192],
109 |                      ],
110 |                      [
111 |                        table_data[:pixels][0][400, 10],
112 |                        table_data[:pixels][-1][400, 10],
113 |                      ])
114 |       end
115 | 
116 |       sub_test_case("#metadata") do
117 |         test("#id") do
118 |           assert_equal("kuzushiji-mnist-test", @dataset.metadata.id)
119 |         end
120 | 
121 |         test("#name") do
122 |           assert_equal("Kuzushiji-MNIST: test", @dataset.metadata.name)
123 |         end
124 |       end
125 |     end
126 |   end
127 | 
128 |   sub_test_case("Abnormal") do
129 |     test("invalid type") do
130 |       invalid_type = :invalid
131 |       message = "Please set type :train or :test: #{invalid_type.inspect}"
132 |       assert_raise(ArgumentError.new(message)) do
133 |         Datasets::KuzushijiMNIST.new(type: invalid_type)
134 |       end
135 |     end
136 |   end
137 | end
138 | 


--------------------------------------------------------------------------------
/test/test-libsvm-dataset-list.rb:
--------------------------------------------------------------------------------
 1 | class LIBSVMDatasetListTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::LIBSVMDatasetList.new
 4 |   end
 5 | 
 6 |   test("#each") do
 7 |     assert_equal({
 8 |                    name: "a1a",
 9 |                    source: "UCI / Adult",
10 |                    preprocessing:
11 |                      "The original Adult data set has 14 features, " +
12 |                      "among which six are continuous and eight are " +
13 |                      "categorical. In this data set, continuous features " +
14 |                      "are discretized into quantiles, and each quantile is " +
15 |                      "represented by a binary feature. Also, a categorical " +
16 |                      "feature with m categories is converted to m binary " +
17 |                      "features. Details on how each feature is converted " +
18 |                      "can be found in the beginning of each file from this " +
19 |                      "page. [JP98a]",
20 |                    n_classes: 2,
21 |                    n_data: 1605,
22 |                    n_features: 123,
23 |                    files: [
24 |                      {
25 |                        name: "a1a",
26 |                        url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a",
27 |                        note: nil,
28 |                      },
29 |                      {
30 |                        name: "a1a.t",
31 |                        url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t",
32 |                        note: "testing",
33 |                      }
34 |                    ],
35 |                  },
36 |                  @dataset.first.to_h)
37 |   end
38 | 
39 |   sub_test_case("#metadata") do
40 |     test("#description") do
41 |       description = @dataset.metadata.description
42 |       assert do
43 |         description.start_with?("This page contains many classification, ")
44 |       end
45 |     end
46 |   end
47 | end
48 | 


--------------------------------------------------------------------------------
/test/test-libsvm.rb:
--------------------------------------------------------------------------------
  1 | class LIBSVMDatasetTest < Test::Unit::TestCase
  2 |   test(":note") do
  3 |     dataset = Datasets::LIBSVM.new("a1a", note: "testing")
  4 |     hash = {label: -1}
  5 |     n_features = 123
  6 |     n_features.times do |i|
  7 |       hash[i] = 0
  8 |     end
  9 |     [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i|
 10 |       hash[i - 1] = 1
 11 |     end
 12 |     assert_equal(hash,
 13 |                  dataset.first.to_h)
 14 |   end
 15 | 
 16 |   test(":default_feature_value") do
 17 |     dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil)
 18 |     hash = {label: -1}
 19 |     n_features = 123
 20 |     n_features.times do |i|
 21 |       hash[i] = nil
 22 |     end
 23 |     [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
 24 |       hash[i - 1] = 1
 25 |     end
 26 |     assert_equal(hash,
 27 |                  dataset.first.to_h)
 28 |   end
 29 | 
 30 |   test("classification") do
 31 |     dataset = Datasets::LIBSVM.new("a1a")
 32 |     hash = {label: -1}
 33 |     n_features = 123
 34 |     n_features.times do |i|
 35 |       hash[i] = 0
 36 |     end
 37 |     [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i|
 38 |       hash[i - 1] = 1
 39 |     end
 40 |     assert_equal(hash,
 41 |                  dataset.first.to_h)
 42 |   end
 43 | 
 44 |   test("regression") do
 45 |     dataset = Datasets::LIBSVM.new("abalone")
 46 |     hash = {label: 15}
 47 |     n_features = 8
 48 |     n_features.times do |i|
 49 |       hash[i] = 0
 50 |     end
 51 |     [
 52 |       [1, 1],
 53 |       [2, 0.455],
 54 |       [3, 0.365],
 55 |       [4, 0.095],
 56 |       [5, 0.514],
 57 |       [6, 0.2245],
 58 |       [7, 0.101],
 59 |       [8, 0.15],
 60 |     ].each do |i, value|
 61 |       hash[i - 1] = value
 62 |     end
 63 |     assert_equal(hash,
 64 |                  dataset.first.to_h)
 65 |   end
 66 | 
 67 |   test("multi-label") do
 68 |     dataset = Datasets::LIBSVM.new("mediamill (exp1)")
 69 |     hash = {label: [65, 67, 11, 31]}
 70 |     n_features = 120
 71 |     n_features.times do |i|
 72 |       hash[i] = 0
 73 |     end
 74 |     [
 75 |       [1, 0.380877],
 76 |       [2, 0.494079],
 77 |       [3, 0.540009],
 78 |       [4, 0.422926],
 79 |       [5, 0.158318],
 80 |       [6, 0.326975],
 81 |       [7, 0.390861],
 82 |       [8, 0.527121],
 83 |       [9, 0.254052],
 84 |       [10, 0.223731],
 85 |       [11, 0.040285],
 86 |       [12, 0.141133],
 87 |       [13, 0.112249],
 88 |       [14, 0.263171],
 89 |       [15, 0.147020],
 90 |       [16, 0.472414],
 91 |       [17, 0.592614],
 92 |       [18, 0.653138],
 93 |       [19, 0.499867],
 94 |       [20, 0.196520],
 95 |       [21, 0.403892],
 96 |       [22, 0.482395],
 97 |       [23, 0.619219],
 98 |       [24, 0.320346],
 99 |       [25, 0.281251],
100 |       [26, 0.054750],
101 |       [27, 0.180459],
102 |       [28, 0.139964],
103 |       [29, 0.319925],
104 |       [30, 0.181216],
105 |       [31, 0.364294],
106 |       [32, 0.407211],
107 |       [33, 0.368926],
108 |       [34, 0.427661],
109 |       [35, 0.211391],
110 |       [36, 0.364345],
111 |       [37, 0.370710],
112 |       [38, 0.409107],
113 |       [39, 0.289299],
114 |       [40, 0.243053],
115 |       [41, 0.063121],
116 |       [42, 0.193587],
117 |       [43, 0.158755],
118 |       [44, 0.316054],
119 |       [45, 0.197410],
120 |       [46, 0.656168],
121 |       [47, 0.678760],
122 |       [48, 0.650831],
123 |       [49, 0.674636],
124 |       [50, 0.492428],
125 |       [51, 0.623887],
126 |       [52, 0.610622],
127 |       [53, 0.678219],
128 |       [54, 0.574774],
129 |       [55, 0.523073],
130 |       [56, 0.206804],
131 |       [57, 0.496294],
132 |       [58, 0.429221],
133 |       [59, 0.586611],
134 |       [60, 0.471550],
135 |       [61, 0.284480],
136 |       [62, 0.432466],
137 |       [63, 0.498075],
138 |       [64, 0.408141],
139 |       [65, 0.102713],
140 |       [66, 0.303028],
141 |       [67, 0.309501],
142 |       [68, 0.444855],
143 |       [69, 0.191727],
144 |       [70, 0.174895],
145 |       [71, 0.034143],
146 |       [72, 0.153099],
147 |       [73, 0.068318],
148 |       [74, 0.217020],
149 |       [75, 0.099688],
150 |       [76, 0.409862],
151 |       [77, 0.561918],
152 |       [78, 0.612031],
153 |       [79, 0.514471],
154 |       [80, 0.146015],
155 |       [81, 0.398807],
156 |       [82, 0.383295],
157 |       [83, 0.548485],
158 |       [84, 0.282937],
159 |       [85, 0.252712],
160 |       [86, 0.051008],
161 |       [87, 0.223110],
162 |       [88, 0.098112],
163 |       [89, 0.299672],
164 |       [90, 0.144873],
165 |       [91, 0.308488],
166 |       [92, 0.358478],
167 |       [93, 0.352077],
168 |       [94, 0.394686],
169 |       [95, 0.157513],
170 |       [96, 0.339370],
171 |       [97, 0.321558],
172 |       [98, 0.341373],
173 |       [99, 0.247969],
174 |       [100, 0.206070],
175 |       [101, 0.061001],
176 |       [102, 0.216793],
177 |       [103, 0.112389],
178 |       [104, 0.273648],
179 |       [105, 0.152745],
180 |       [106, 0.598081],
181 |       [107, 0.621687],
182 |       [108, 0.607213],
183 |       [109, 0.644025],
184 |       [110, 0.394948],
185 |       [111, 0.593651],
186 |       [112, 0.551529],
187 |       [113, 0.574392],
188 |       [114, 0.511032],
189 |       [115, 0.463997],
190 |       [116, 0.202034],
191 |       [117, 0.492341],
192 |       [118, 0.317983],
193 |       [119, 0.547807],
194 |       [120, 0.393778],
195 |     ].each do |i, value|
196 |       hash[i - 1] = value
197 |     end
198 |     assert_equal(hash,
199 |                  dataset.first.to_h)
200 |   end
201 | 
202 |   test("string") do
203 |     # TODO
204 |   end
205 | end
206 | 


--------------------------------------------------------------------------------
/test/test-license.rb:
--------------------------------------------------------------------------------
 1 | class LicenseTest < Test::Unit::TestCase
 2 |   sub_test_case(".try_convert") do
 3 |     test("String") do
 4 |       assert_equal(Datasets::License.new("Apache-2.0"),
 5 |                    Datasets::License.try_convert("Apache-2.0"))
 6 |     end
 7 | 
 8 |     test("{spdx_id:}") do
 9 |       assert_equal(Datasets::License.new("Apache-2.0"),
10 |                    Datasets::License.try_convert(spdx_id: "Apache-2.0"))
11 |     end
12 | 
13 |     test("{name:, url:}") do
14 |       license = {
15 |         name: "Quora's Terms of Service",
16 |         url: "https://www.quora.com/about/tos",
17 |       }
18 |       assert_equal(Datasets::License.new(nil,
19 |                                          "Quora's Terms of Service",
20 |                                          "https://www.quora.com/about/tos"),
21 |                    Datasets::License.try_convert(license))
22 |     end
23 |   end
24 | end
25 | 


--------------------------------------------------------------------------------
/test/test-metadata.rb:
--------------------------------------------------------------------------------
 1 | class MetadataTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @metadata = Datasets::Metadata.new
 4 |   end
 5 | 
 6 |   sub_test_case("#licenses") do
 7 |     test("String") do
 8 |       @metadata.licenses = "Apache-2.0"
 9 |       assert_equal([Datasets::License.new("Apache-2.0")],
10 |                    @metadata.licenses)
11 |     end
12 | 
13 |     test("[String]") do
14 |       @metadata.licenses = ["Apache-2.0"]
15 |       assert_equal([Datasets::License.new("Apache-2.0")],
16 |                    @metadata.licenses)
17 |     end
18 | 
19 |     test("{name:, url:}") do
20 |       @metadata.licenses = {
21 |         name: "Quora's Terms of Service",
22 |         url: "https://www.quora.com/about/tos",
23 |       }
24 |       assert_equal([Datasets::License.new(nil,
25 |                                           "Quora's Terms of Service",
26 |                                           "https://www.quora.com/about/tos")],
27 |                    @metadata.licenses)
28 |     end
29 | 
30 |     test("Symbol") do
31 |       assert_raise(ArgumentError.new("invalid license: :apache_2_0")) do
32 |         @metadata.licenses = :apache_2_0
33 |       end
34 |     end
35 |   end
36 | end
37 | 


--------------------------------------------------------------------------------
/test/test-mnist.rb:
--------------------------------------------------------------------------------
  1 | class MNISTTest < Test::Unit::TestCase
  2 |   sub_test_case("Normal") do
  3 |     sub_test_case("train") do
  4 |       def setup
  5 |         @dataset = Datasets::MNIST.new(type: :train)
  6 |       end
  7 | 
  8 |       test("#each") do
  9 |         records = @dataset.each.to_a
 10 |         assert_equal([
 11 |                        60000,
 12 |                        [
 13 |                          5,
 14 |                          784,
 15 |                          [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
 16 |                          [0, 0, 0, 0, 0, 81, 240, 253, 253, 119],
 17 |                        ],
 18 |                        [8,
 19 |                          784,
 20 |                          [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
 21 |                          [0, 0, 190, 196, 14, 2, 97, 254, 252, 146],
 22 |                        ],
 23 |                      ],
 24 |                      [
 25 |                        records.size,
 26 |                        [
 27 |                          records[0].label,
 28 |                          records[0].pixels.size,
 29 |                          records[0].pixels[200, 10],
 30 |                          records[0].pixels[400, 10],
 31 |                        ],
 32 |                        [
 33 |                          records[-1].label,
 34 |                          records[-1].pixels.size,
 35 |                          records[-1].pixels[200, 10],
 36 |                          records[-1].pixels[400, 10],
 37 |                        ],
 38 |                      ])
 39 |       end
 40 | 
 41 |       test("#to_table") do
 42 |         table_data = @dataset.to_table
 43 |         assert_equal([
 44 |                        [0, 0, 0, 49, 238, 253, 253, 253, 253, 253],
 45 |                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 62],
 46 |                      ],
 47 |                      [
 48 |                        table_data[:pixels][0][200, 10],
 49 |                        table_data[:pixels][-1][200, 10],
 50 |                      ])
 51 |       end
 52 | 
 53 |       sub_test_case("#metadata") do
 54 |         test("#id") do
 55 |           assert_equal("mnist-train", @dataset.metadata.id)
 56 |         end
 57 | 
 58 |         test("#name") do
 59 |           assert_equal("MNIST: train", @dataset.metadata.name)
 60 |         end
 61 |       end
 62 |     end
 63 | 
 64 |     sub_test_case("test") do
 65 |       def setup
 66 |         @dataset = Datasets::MNIST.new(type: :test)
 67 |       end
 68 | 
 69 |       test("#each") do
 70 |         records = @dataset.each.to_a
 71 |         assert_equal([
 72 |                        10000,
 73 |                        [
 74 |                          7,
 75 |                          784,
 76 |                          [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
 77 |                          [0, 0, 0, 0, 0, 0, 0, 0, 59, 249],
 78 |                        ],
 79 |                        [
 80 |                          6,
 81 |                          784,
 82 |                          [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
 83 |                          [253, 253, 132, 64, 0, 0, 18, 43, 157, 171],
 84 |                        ],
 85 |                      ],
 86 |                      [
 87 |                        records.size,
 88 |                        [
 89 |                          records[0].label,
 90 |                          records[0].pixels.size,
 91 |                          records[0].pixels[200, 10],
 92 |                          records[0].pixels[400, 10],
 93 |                        ],
 94 |                        [
 95 |                          records[-1].label,
 96 |                          records[-1].pixels.size,
 97 |                          records[-1].pixels[200, 10],
 98 |                          records[-1].pixels[400, 10],
 99 |                        ],
100 |                      ])
101 |       end
102 | 
103 |       test("#to_table") do
104 |         table_data = @dataset.to_table
105 |         assert_equal([
106 |                        [0, 0, 84, 185, 159, 151, 60, 36, 0, 0],
107 |                        [0, 0, 0, 0, 0, 15, 60, 60, 168, 253],
108 |                      ],
109 |                      [
110 |                        table_data[:pixels][0][200, 10],
111 |                        table_data[:pixels][-1][200, 10],
112 |                      ])
113 |       end
114 | 
115 |       sub_test_case("#metadata") do
116 |         test("#id") do
117 |           assert_equal("mnist-test", @dataset.metadata.id)
118 |         end
119 | 
120 |         test("#name") do
121 |           assert_equal("MNIST: test", @dataset.metadata.name)
122 |         end
123 |       end
124 |     end
125 |   end
126 | 
127 |   sub_test_case("Abnormal") do
128 |     test("invalid type") do
129 |       invalid_type = :invalid
130 |       message = "Please set type :train or :test: #{invalid_type.inspect}"
131 |       assert_raise(ArgumentError.new(message)) do
132 |         Datasets::MNIST.new(type: invalid_type)
133 |       end
134 |     end
135 |   end
136 | end
137 | 


--------------------------------------------------------------------------------
/test/test-mushroom.rb:
--------------------------------------------------------------------------------
 1 | class MushroomTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::Mushroom.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::Mushroom::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     records = @dataset.each.to_a
12 |     assert_equal([
13 |                    8124,
14 |                    {
15 |                      :label => "poisonous",
16 |                      :cap_shape => "convex",
17 |                      :cap_surface => "smooth",
18 |                      :cap_color => "brown",
19 |                      :bruises => "bruises",
20 |                      :odor => "pungent",
21 |                      :gill_attachment => "free",
22 |                      :gill_spacing => "close",
23 |                      :gill_size => "narrow",
24 |                      :gill_color => "black",
25 |                      :stalk_shape => "enlarging",
26 |                      :stalk_root => "equal",
27 |                      :stalk_surface_above_ring => "smooth",
28 |                      :stalk_surface_below_ring => "smooth",
29 |                      :stalk_color_above_ring => "white",
30 |                      :stalk_color_below_ring => "white",
31 |                      :veil_type => "partial",
32 |                      :veil_color => "white",
33 |                      :n_rings => 1,
34 |                      :ring_type => "pendant",
35 |                      :spore_print_color => "black",
36 |                      :population => "scattered",
37 |                      :habitat => "urban"
38 |                    },
39 |                    {
40 |                      :label => "edible",
41 |                      :cap_shape => "convex",
42 |                      :cap_surface => "smooth",
43 |                      :cap_color => "brown",
44 |                      :bruises => "no",
45 |                      :odor => "none",
46 |                      :gill_attachment => "attached",
47 |                      :gill_spacing => "close",
48 |                      :gill_size => "broad",
49 |                      :gill_color => "yellow",
50 |                      :stalk_shape => "enlarging",
51 |                      :stalk_root => "missing",
52 |                      :stalk_surface_above_ring => "smooth",
53 |                      :stalk_surface_below_ring => "smooth",
54 |                      :stalk_color_above_ring => "orange",
55 |                      :stalk_color_below_ring => "orange",
56 |                      :veil_type => "partial",
57 |                      :veil_color => "orange",
58 |                      :n_rings => 1,
59 |                      :ring_type => "pendant",
60 |                      :spore_print_color => "orange",
61 |                      :population => "clustered",
62 |                      :habitat => "leaves"
63 |                    }
64 |                  ],
65 |                  [
66 |                    records.size,
67 |                    records[0].to_h,
68 |                    records[-1].to_h
69 |                  ])
70 |   end
71 | 
72 |   sub_test_case("#metadata") do
73 |     test("#description") do
74 |       description = @dataset.metadata.description
75 |       assert do
76 |         description.start_with?("1. Title: Mushroom Database")
77 |       end
78 |     end
79 |   end
80 | end
81 | 


--------------------------------------------------------------------------------
/test/test-nagoya-university-conversation-corpus.rb:
--------------------------------------------------------------------------------
 1 | class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::NagoyaUniversityConversationCorpus.new
 4 |   end
 5 | 
 6 |   sub_test_case("each") do
 7 |     test("#sentences") do
 8 |       first_sentences = @dataset.each.next.sentences
 9 |       assert_equal([
10 |                      856,
11 |                      {
12 |                        participant_id: 'F107',
13 |                        content: '＊＊＊の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても１時間ぐらいですよね。',
14 |                      },
15 |                      {
16 |                        participant_id: nil,
17 |                        content: nil,
18 |                      },
19 |                    ],
20 |                    [
21 |                      first_sentences.size,
22 |                      first_sentences[0].to_h,
23 |                      first_sentences[-1].to_h,
24 |                    ])
25 |     end
26 | 
27 |     test("#participants") do
28 |       first_participants = @dataset.each.next.participants
29 |       assert_equal([
30 |                      4,
31 |                      {
32 |                        id: 'F107',
33 |                        attribute: '女性３０代後半',
34 |                        birthplace: '愛知県幡豆郡出身',
35 |                        residence: '愛知県幡豆郡在住',
36 |                      },
37 |                      {
38 |                        id: 'F128',
39 |                        attribute: '女性２０代前半',
40 |                        birthplace: '愛知県西尾市出身',
41 |                        residence: '西尾市在住',
42 |                      },
43 |                    ],
44 |                    [
45 |                      first_participants.size,
46 |                      first_participants[0].to_h,
47 |                      first_participants[-1].to_h,
48 |                    ])
49 |     end
50 | 
51 |     test("others") do
52 |       first_record = @dataset.each.next
53 |       assert_equal([
54 |                      '１（約３５分）',
55 |                      '２００１年１０月１６日',
56 |                      'ファミリーレストラン',
57 |                      '英会話教室の友人',
58 |                      nil,
59 |                    ],
60 |                    [
61 |                      first_record.name,
62 |                      first_record.date,
63 |                      first_record.place,
64 |                      first_record.relationships,
65 |                      first_record.note,
66 |                    ])
67 |     end
68 |   end
69 | 
70 |   sub_test_case("#metadata") do
71 |     test("#description") do
72 |       description = @dataset.metadata.description
73 |       assert_equal(<<~DESCRIPTION, description)
74 |         The "Nagoya University Conversation Corpus" is a corpus of 129 conversations,
75 |         total about 100 hours of chatting among native speakers of Japanese,
76 |         which is converted into text.
77 |       DESCRIPTION
78 |     end
79 |   end
80 | end
81 | 


--------------------------------------------------------------------------------
/test/test-penn-treebank.rb:
--------------------------------------------------------------------------------
 1 | class PennTreebankTest < Test::Unit::TestCase
 2 |   def record(*args)
 3 |     Datasets::PennTreebank::Record.new(*args)
 4 |   end
 5 | 
 6 |   sub_test_case("type") do
 7 |     test("train") do
 8 |       dataset = Datasets::PennTreebank.new(type: :train)
 9 |       records = dataset.to_a
10 |       assert_equal([
11 |                      887521,
12 |                      record("aer"),
13 |                      record("<unk>"),
14 |                    ],
15 |                    [
16 |                      records.size,
17 |                      records[0],
18 |                      records[-1],
19 |                    ])
20 |     end
21 | 
22 |     test("test") do
23 |       dataset = Datasets::PennTreebank.new(type: :test)
24 |       records = dataset.to_a
25 |       assert_equal([
26 |                      78669,
27 |                      record("no"),
28 |                      record("us"),
29 |                    ],
30 |                    [
31 |                      records.size,
32 |                      records[0],
33 |                      records[-1],
34 |                    ])
35 |     end
36 | 
37 |     test("valid") do
38 |       dataset = Datasets::PennTreebank.new(type: :valid)
39 |       records = dataset.to_a
40 |       assert_equal([
41 |                      70390,
42 |                      record("consumers"),
43 |                      record("N"),
44 |                    ],
45 |                    [
46 |                      records.size,
47 |                      records[0],
48 |                      records[-1],
49 |                    ])
50 |     end
51 | 
52 |     test("invalid") do
53 |       message = "Type must be one of [:train, :test, :valid]: :invalid"
54 |       assert_raise(ArgumentError.new(message)) do
55 |         Datasets::PennTreebank.new(type: :invalid)
56 |       end
57 |     end
58 |   end
59 | end
60 | 


--------------------------------------------------------------------------------
/test/test-pmjt-dataset-list.rb:
--------------------------------------------------------------------------------
 1 | class PMJTDatasetListTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets:: PMJTDatasetList.new
 4 |   end
 5 | 
 6 |   test("#each") do
 7 |     records = @dataset.each.to_a
 8 | 
 9 |     record_first = Datasets::PMJTDatasetList::Record.new
10 |     record_first.unit = '冊'
11 |     record_first.open_data_category = '総記'
12 |     record_first.tag = nil
13 |     record_first.release_time = 'H31.1'
14 |     record_first.n_volumes = '2'
15 |     record_first.type = '刊'
16 |     record_first.publication_year = '元禄９'
17 |     record_first.original_request_code = '９９－３７－１～２'
18 |     record_first.id = '200003090'
19 |     record_first.title = '人倫重宝記'
20 |     record_first.text = nil
21 |     record_first.bibliographical_introduction = nil
22 |     record_first.year = nil
23 | 
24 |     record_last = Datasets::PMJTDatasetList::Record.new
25 |     record_last.unit = '冊'
26 |     record_last.open_data_category = '総記'
27 |     record_last.tag = nil
28 |     record_last.release_time = 'H27.11'
29 |     record_last.n_volumes = '1'
30 |     record_last.type = '刊'
31 |     record_last.publication_year = '慶応2'
32 |     record_last.original_request_code = '４９－１７３'
33 |     record_last.id = '200021837'
34 |     record_last.title = '洋学便覧'
35 |     record_last.text = nil
36 |     record_last.bibliographical_introduction = '○'
37 |     record_last.year = '1866'
38 | 
39 |     assert_equal([
40 |                     3126,
41 |                     record_first,
42 |                     record_last
43 |                   ],
44 |                   [
45 |                     records.size,
46 |                     records[1],
47 |                     records[-1]
48 |                   ])
49 |   end
50 | end
51 | 


--------------------------------------------------------------------------------
/test/test-postal-code-japan.rb:
--------------------------------------------------------------------------------
 1 | class PostalCodeJapanTest < Test::Unit::TestCase
 2 |   test("invalid") do
 3 |     message = ":reading must be one of [:lowercase, :uppercase, :romaji]: :invalid"
 4 |     assert_raise(ArgumentError.new(message)) do
 5 |       Datasets::PostalCodeJapan.new(reading: :invalid)
 6 |     end
 7 |   end
 8 | 
 9 |   sub_test_case(":reading") do
10 |     test(":lowercase") do
11 |       dataset = Datasets::PostalCodeJapan.new(reading: :lowercase)
12 |       assert_equal({
13 |                      organization_code: "01101",
14 |                      old_postal_code: "060",
15 |                      postal_code: "0600000",
16 |                      prefecture_reading: "ﾎｯｶｲﾄﾞｳ",
17 |                      city_reading: "ｻｯﾎﾟﾛｼﾁｭｳｵｳｸ",
18 |                      address_reading: "ｲｶﾆｹｲｻｲｶﾞﾅｲﾊﾞｱｲ",
19 |                      prefecture: "北海道",
20 |                      city: "札幌市中央区",
21 |                      address: "以下に掲載がない場合",
22 |                      have_multiple_postal_codes: false,
23 |                      have_address_number_per_koaza: false,
24 |                      have_chome: false,
25 |                      postal_code_is_shared: false,
26 |                      changed: false,
27 |                      change_reason: nil,
28 |                    },
29 |                    dataset.first.to_h)
30 |     end
31 | 
32 |     test(":uppercase") do
33 |       dataset = Datasets::PostalCodeJapan.new(reading: :uppercase)
34 |       assert_equal({
35 |                      organization_code: "01101",
36 |                      old_postal_code: "060",
37 |                      postal_code: "0600000",
38 |                      prefecture_reading: "ﾎﾂｶｲﾄﾞｳ",
39 |                      city_reading: "ｻﾂﾎﾟﾛｼﾁﾕｳｵｳｸ",
40 |                      address_reading: "ｲｶﾆｹｲｻｲｶﾞﾅｲﾊﾞｱｲ",
41 |                      prefecture: "北海道",
42 |                      city: "札幌市中央区",
43 |                      address: "以下に掲載がない場合",
44 |                      have_multiple_postal_codes: false,
45 |                      have_address_number_per_koaza: false,
46 |                      have_chome: false,
47 |                      postal_code_is_shared: false,
48 |                      changed: false,
49 |                      change_reason: nil,
50 |                    },
51 |                    dataset.first.to_h)
52 |     end
53 | 
54 |     test(":romaji") do
55 |       dataset = Datasets::PostalCodeJapan.new(reading: :romaji)
56 |       assert_equal({
57 |                      organization_code: nil,
58 |                      old_postal_code: nil,
59 |                      postal_code: "0600000",
60 |                      prefecture_reading: "HOKKAIDO",
61 |                      city_reading: "SAPPORO SHI CHUO KU",
62 |                      address_reading: "IKANIKEISAIGANAIBAAI",
63 |                      prefecture: "北海道",
64 |                      city: "札幌市　中央区",
65 |                      address: "以下に掲載がない場合",
66 |                      have_multiple_postal_codes: false,
67 |                      have_address_number_per_koaza: false,
68 |                      have_chome: false,
69 |                      postal_code_is_shared: false,
70 |                      changed: false,
71 |                      change_reason: nil,
72 |                    },
73 |                    dataset.first.to_h)
74 |     end
75 |   end
76 | end
77 | 


--------------------------------------------------------------------------------
/test/test-quora-duplicate-question-pair.rb:
--------------------------------------------------------------------------------
 1 | class QuoraDuplicateQuestionPairTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::QuoraDuplicateQuestionPair.new
 4 |   end
 5 | 
 6 |   def record(*args)
 7 |     Datasets::QuoraDuplicateQuestionPair::Record.new(*args)
 8 |   end
 9 | 
10 |   test("#each") do
11 |     assert_equal(record(0,
12 |                         1,
13 |                         2,
14 |                         "What is the step by step guide to invest in share market in india?",
15 |                         "What is the step by step guide to invest in share market?",
16 |                         false),
17 |                  @dataset.each.next)
18 |   end
19 | end
20 | 


--------------------------------------------------------------------------------
/test/test-seaborn.rb:
--------------------------------------------------------------------------------
  1 | class SeabornTest < Test::Unit::TestCase
  2 |   sub_test_case("list") do
  3 |     def setup
  4 |       @dataset = Datasets::SeabornList.new
  5 |     end
  6 | 
  7 |     def test_each
  8 |       records = @dataset.each.to_a
  9 |       assert_equal([
 10 |                      {dataset: "anagrams"},
 11 |                      {dataset: "anscombe"},
 12 |                      {dataset: "attention"},
 13 |                      {dataset: "brain_networks"},
 14 |                      {dataset: "car_crashes"},
 15 |                      {dataset: "diamonds"},
 16 |                      {dataset: "dots"},
 17 |                      {dataset: "dowjones"},
 18 |                      {dataset: "exercise"},
 19 |                      {dataset: "flights"},
 20 |                      {dataset: "fmri"},
 21 |                      {dataset: "geyser"},
 22 |                      {dataset: "glue"},
 23 |                      {dataset: "healthexp"},
 24 |                      {dataset: "iris"},
 25 |                      {dataset: "mpg"},
 26 |                      {dataset: "penguins"},
 27 |                      {dataset: "planets"},
 28 |                      {dataset: "seaice"},
 29 |                      {dataset: "taxis"},
 30 |                      {dataset: "tips"},
 31 |                      {dataset: "titanic"},
 32 |                    ],
 33 |                    records)
 34 |     end
 35 |   end
 36 | 
 37 |   sub_test_case("fmri") do
 38 |     def setup
 39 |       @dataset = Datasets::Seaborn.new("fmri")
 40 |     end
 41 | 
 42 |     def test_each
 43 |       records = @dataset.each.to_a
 44 |       assert_equal([
 45 |                      1064,
 46 |                      {
 47 |                        subject: "s5",
 48 |                        timepoint: 14,
 49 |                        event: "stim",
 50 |                        region: "parietal",
 51 |                        signal: -0.0808829319505
 52 |                      },
 53 |                      {
 54 |                        subject: "s0",
 55 |                        timepoint: 0,
 56 |                        event: "cue",
 57 |                        region: "parietal",
 58 |                        signal: -0.00689923478092
 59 |                      }
 60 |                    ],
 61 |                    [
 62 |                      records.size,
 63 |                      records[1].to_h,
 64 |                      records[-1].to_h
 65 |                    ])
 66 |     end
 67 |   end
 68 | 
 69 |   sub_test_case("flights") do
 70 |     def setup
 71 |       @dataset = Datasets::Seaborn.new("flights")
 72 |     end
 73 | 
 74 |     def test_each
 75 |       records = @dataset.each.to_a
 76 |       assert_equal([
 77 |                      144,
 78 |                      {
 79 |                        year: 1949,
 80 |                        month: "Feb",
 81 |                        passengers: 118
 82 |                      },
 83 |                      {
 84 |                        year: 1960,
 85 |                        month: "Dec",
 86 |                        passengers: 432
 87 |                      }
 88 |                    ],
 89 |                    [
 90 |                      records.size,
 91 |                      records[1].to_h,
 92 |                      records[-1].to_h
 93 |                    ])
 94 |     end
 95 |   end
 96 | 
 97 |   sub_test_case("penguins") do
 98 |     def setup
 99 |       @dataset = Datasets::Seaborn.new("penguins")
100 |     end
101 | 
102 |     def test_each
103 |       records = @dataset.each.to_a
104 |       assert_equal([
105 |                      344,
106 |                      {
107 |                        species: "Adelie",
108 |                        island: "Torgersen",
109 |                        bill_length_mm: 39.5,
110 |                        bill_depth_mm: 17.4,
111 |                        flipper_length_mm: 186,
112 |                        body_mass_g: 3800,
113 |                        sex: "Female"
114 |                      },
115 |                      {
116 |                        species: "Gentoo",
117 |                        island: "Biscoe",
118 |                        bill_length_mm: 49.9,
119 |                        bill_depth_mm: 16.1,
120 |                        flipper_length_mm: 213,
121 |                        body_mass_g: 5400,
122 |                        sex: "Male"
123 |                      }
124 |                    ],
125 |                    [
126 |                      records.size,
127 |                      records[1].to_h,
128 |                      records[-1].to_h
129 |                    ])
130 |     end
131 |   end
132 | 
133 |   sub_test_case("attention") do
134 |     def setup
135 |       @dataset = Datasets::Seaborn.new("attention")
136 |     end
137 | 
138 |     def test_each
139 |       records = @dataset.to_a
140 |       assert_equal([
141 |                      60,
142 |                      {
143 |                        index: 1,
144 |                        subject: 2,
145 |                        attention: "divided",
146 |                        solutions: 1,
147 |                        score: 3.0
148 |                      },
149 |                      {
150 |                        index: 59,
151 |                        subject: 20,
152 |                        attention: "focused",
153 |                        solutions: 3,
154 |                        score: 5.0
155 |                      }
156 |                    ],
157 |                    [
158 |                      records.size,
159 |                      records[1],
160 |                      records[-1]
161 |                    ])
162 |     end
163 |   end
164 | end
165 | 


--------------------------------------------------------------------------------
/test/test-sudachi-synonym-dictionary.rb:
--------------------------------------------------------------------------------
 1 | class SudachiSynonymDictionaryTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::SudachiSynonymDictionary.new
 4 |   end
 5 | 
 6 |   test('#each') do
 7 |     assert_equal({
 8 |                    group_id: "000001",
 9 |                    is_noun: true,
10 |                    expansion_type: :always,
11 |                    lexeme_id: 1,
12 |                    form_type: :typical,
13 |                    acronym_type: :typical,
14 |                    variant_type: :typical,
15 |                    categories: [],
16 |                    notation: "曖昧",
17 |                  },
18 |                  @dataset.each.next.to_h)
19 |   end
20 | 
21 |   sub_test_case('#metadata') do
22 |     test('#description') do
23 |       description = @dataset.metadata.description
24 |       assert do
25 |         description.start_with?('# Sudachi 同義語辞書')
26 |       end
27 |     end
28 |   end
29 | end
30 | 


--------------------------------------------------------------------------------
/test/test-table.rb:
--------------------------------------------------------------------------------
  1 | class TableTest < Test::Unit::TestCase
  2 |   def setup
  3 |     @table = Datasets::Iris.new.to_table
  4 |   end
  5 | 
  6 |   test("#n_columns") do
  7 |     assert_equal(5, @table.n_columns)
  8 |   end
  9 | 
 10 |   test("#n_rows") do
 11 |     assert_equal(150, @table.n_rows)
 12 |   end
 13 | 
 14 |   test("#column_names") do
 15 |     assert_equal([
 16 |                    :sepal_length,
 17 |                    :sepal_width,
 18 |                    :petal_length,
 19 |                    :petal_width,
 20 |                    :label,
 21 |                  ],
 22 |                  @table.column_names)
 23 |   end
 24 | 
 25 |   test("#each") do
 26 |     shorten_hash = {}
 27 |     @table.each do |name, values|
 28 |       shorten_hash[name] = values.first(5)
 29 |     end
 30 |     assert_equal({
 31 |                    :label        => ["Iris-setosa"] * 5,
 32 |                    :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
 33 |                    :petal_width  => [0.2, 0.2, 0.2, 0.2, 0.2],
 34 |                    :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
 35 |                    :sepal_width  => [3.5, 3.0, 3.2, 3.1, 3.6],
 36 |                  },
 37 |                  shorten_hash)
 38 |   end
 39 | 
 40 |   test("#each_column") do
 41 |     shorten_hash = {}
 42 |     @table.each_column do |name, values|
 43 |       shorten_hash[name] = values.first(5)
 44 |     end
 45 |     assert_equal({
 46 |                    :label        => ["Iris-setosa"] * 5,
 47 |                    :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
 48 |                    :petal_width  => [0.2, 0.2, 0.2, 0.2, 0.2],
 49 |                    :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
 50 |                    :sepal_width  => [3.5, 3.0, 3.2, 3.1, 3.6],
 51 |                  },
 52 |                  shorten_hash)
 53 |   end
 54 | 
 55 |   test("#each_record") do
 56 |     records = []
 57 |     @table.each_record do |record|
 58 |       records << record
 59 |       break if records.size == 3
 60 |     end
 61 |     assert_equal([
 62 |                    {
 63 |                      label: "Iris-setosa",
 64 |                      petal_length: 1.4,
 65 |                      petal_width: 0.2,
 66 |                      sepal_length: 5.1,
 67 |                      sepal_width: 3.5,
 68 |                    },
 69 |                    {
 70 |                      label: "Iris-setosa",
 71 |                      petal_length: 1.4,
 72 |                      petal_width: 0.2,
 73 |                      sepal_length: 4.9,
 74 |                      sepal_width: 3.0,
 75 |                    },
 76 |                    {
 77 |                      label: "Iris-setosa",
 78 |                      petal_length: 1.3,
 79 |                      petal_width: 0.2,
 80 |                      sepal_length: 4.7,
 81 |                      sepal_width: 3.2,
 82 |                    },
 83 |                  ],
 84 |                  records.collect(&:to_h))
 85 |   end
 86 | 
 87 |   sub_test_case("#find_record") do
 88 |     test("positive") do
 89 |       assert_equal({
 90 |                      label: "Iris-setosa",
 91 |                      petal_length: 1.4,
 92 |                      petal_width: 0.2,
 93 |                      sepal_length: 4.9,
 94 |                      sepal_width: 3.0,
 95 |                    },
 96 |                    @table.find_record(1).to_h)
 97 |     end
 98 | 
 99 |     test("positive - over") do
100 |       assert_nil(@table.find_record(151))
101 |     end
102 | 
103 |     test("negative") do
104 |       assert_equal({
105 |                      label: "Iris-virginica",
106 |                      petal_length: 5.1,
107 |                      petal_width: 1.8,
108 |                      sepal_length: 5.9,
109 |                      sepal_width: 3.0,
110 |                    },
111 |                    @table.find_record(-1).to_h)
112 |     end
113 | 
114 |     test("negative - over") do
115 |       assert_nil(@table.find_record(-151))
116 |     end
117 |   end
118 | 
119 |   sub_test_case("#[]") do
120 |     test("index") do
121 |       assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
122 |                    @table[2].first(5))
123 |     end
124 | 
125 |     test("name") do
126 |       assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
127 |                    @table[:petal_length].first(5))
128 |     end
129 |   end
130 | 
131 |   test("#dictionary_encode") do
132 |     assert_equal([
133 |                    [0, "Iris-setosa"],
134 |                    [1, "Iris-versicolor"],
135 |                    [2, "Iris-virginica"],
136 |                  ],
137 |                  @table.dictionary_encode(:label).to_a)
138 |   end
139 | 
140 |   test("#label_encode") do
141 |     label_encoded_labels = @table.label_encode(:label)
142 |     labels = @table[:label]
143 |     assert_equal([0, 1, 2],
144 |                  [
145 |                    label_encoded_labels[labels.find_index("Iris-setosa")],
146 |                    label_encoded_labels[labels.find_index("Iris-versicolor")],
147 |                    label_encoded_labels[labels.find_index("Iris-virginica")],
148 |                  ])
149 |   end
150 | 
151 |   sub_test_case("#fetch_values") do
152 |     test("found") do
153 |       values = @table.fetch_values(:petal_length, :petal_width)
154 |       assert_equal([
155 |                      [1.4, 1.4, 1.3, 1.5, 1.4],
156 |                      [0.2, 0.2, 0.2, 0.2, 0.2],
157 |                    ],
158 |                    values.collect {|v| v.first(5)})
159 |     end
160 | 
161 |     sub_test_case("not found") do
162 |       test("with block") do
163 |         values = @table.fetch_values(:petal_length, :unknown) do |key|
164 |           [key] * 5
165 |         end
166 |         assert_equal([
167 |                        [1.4, 1.4, 1.3, 1.5, 1.4],
168 |                        [:unknown] * 5,
169 |                      ],
170 |                      values.collect {|v| v.first(5)})
171 |       end
172 | 
173 |       test("without block") do
174 |         assert_raise(KeyError) do
175 |           @table.fetch_values(:unknown)
176 |         end
177 |       end
178 |     end
179 |   end
180 | 
181 |   test("#to_h") do
182 |     shorten_hash = {}
183 |     @table.to_h.each do |name, values|
184 |       shorten_hash[name] = values.first(5)
185 |     end
186 |     assert_equal({
187 |                    :label        => ["Iris-setosa"] * 5,
188 |                    :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
189 |                    :petal_width  => [0.2, 0.2, 0.2, 0.2, 0.2],
190 |                    :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
191 |                    :sepal_width  => [3.5, 3.0, 3.2, 3.1, 3.6],
192 |                  },
193 |                  shorten_hash)
194 |   end
195 | end
196 | 


--------------------------------------------------------------------------------
/test/test-wikipedia.rb:
--------------------------------------------------------------------------------
 1 | class WikipediaTest < Test::Unit::TestCase
 2 |   sub_test_case("en") do
 3 |     sub_test_case("articles") do
 4 |       def setup
 5 |         @dataset = Datasets::Wikipedia.new(language: :en,
 6 |                                            type: :articles)
 7 |       end
 8 | 
 9 |       test("#each") do
10 |         contributor = Datasets::Wikipedia::Contributor.new("Asparagusus", 43603280)
11 |         revision = Datasets::Wikipedia::Revision.new
12 |         revision.id = 1219062925
13 |         revision.parent_id = 1219062840
14 |         revision.timestamp = Time.iso8601("2024-04-15T14:38:04Z")
15 |         revision.contributor = contributor
16 |         revision.comment = "Restored revision 1002250816 by [[Special:Contributions/Elli|Elli]] ([[User talk:Elli|talk]]): Unexplained redirect breaking"
17 |         revision.model = "wikitext"
18 |         revision.format = "text/x-wiki"
19 |         revision.text = <<-TEXT.chomp
20 | #REDIRECT [[Computer accessibility]]
21 | 
22 | {{rcat shell|
23 | {{R from move}}
24 | {{R from CamelCase}}
25 | {{R unprintworthy}}
26 | }}
27 |         TEXT
28 |         revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
29 |         page = Datasets::Wikipedia::Page.new
30 |         page.title = "AccessibleComputing"
31 |         page.namespace = 0
32 |         page.id = 10
33 |         page.restrictions = nil
34 |         page.redirect = "Computer accessibility"
35 |         page.revision = revision
36 |         assert_equal(page, @dataset.each.first)
37 |       end
38 | 
39 |       sub_test_case("#metadata") do
40 |         test("#id") do
41 |           assert_equal("wikipedia-en-articles",
42 |                        @dataset.metadata.id)
43 |         end
44 | 
45 |         test("#name") do
46 |           assert_equal("Wikipedia articles (en)",
47 |                        @dataset.metadata.name)
48 |         end
49 | 
50 |         test("#description") do
51 |           assert_equal("Wikipedia articles in en",
52 |                        @dataset.metadata.description)
53 |         end
54 |       end
55 |     end
56 |   end
57 | end
58 | 


--------------------------------------------------------------------------------
/test/test-wine.rb:
--------------------------------------------------------------------------------
 1 | class WineTest < Test::Unit::TestCase
 2 |   def setup
 3 |     @dataset = Datasets::Wine.new
 4 |   end
 5 | 
 6 |   test('#each') do
 7 |     records = @dataset.each.to_a
 8 |     assert_equal([
 9 |                    178,
10 |                    {
11 |                      :alcalinity_of_ash => 15.6,
12 |                      :alcohol => 14.23,
13 |                      :ash => 2.43,
14 |                      :label => 1,
15 |                      :color_intensity => 5.64,
16 |                      :hue => 1.04,
17 |                      :malic_acid => 1.71,
18 |                      :total_flavonoids => 3.06,
19 |                      :n_magnesiums => 127,
20 |                      :total_nonflavanoid_phenols => 0.28,
21 |                      :total_proanthocyanins => 2.29,
22 |                      :n_prolines => 1065,
23 |                      :optical_nucleic_acid_concentration => 3.92,
24 |                      :total_phenols => 2.8
25 |                    },
26 |                    {
27 |                      :alcalinity_of_ash => 24.5,
28 |                      :alcohol => 14.13,
29 |                      :ash => 2.74,
30 |                      :label => 3,
31 |                      :color_intensity => 9.2,
32 |                      :hue => 0.61,
33 |                      :malic_acid => 4.1,
34 |                      :total_flavonoids => 0.76,
35 |                      :n_magnesiums => 96,
36 |                      :total_nonflavanoid_phenols => 0.56,
37 |                      :total_proanthocyanins => 1.35,
38 |                      :n_prolines => 560,
39 |                      :optical_nucleic_acid_concentration => 1.6,
40 |                      :total_phenols => 2.05,
41 |                    },
42 |                  ],
43 |                  [
44 |                    records.size,
45 |                    records[0].to_h,
46 |                    records[-1].to_h,
47 |                  ])
48 |   end
49 | 
50 |   sub_test_case('#metadata') do
51 |     test('#description') do
52 |       description = @dataset.metadata.description
53 |       assert do
54 |         description.start_with?('1. Title of Database: Wine recognition data')
55 |       end
56 |     end
57 |   end
58 | end
59 | 


--------------------------------------------------------------------------------