├── .github ├── dependabot.yml └── workflows │ ├── pages.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .yardopts ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── Steepfile ├── doc └── text │ └── news.md ├── example ├── aozora-bunko.rb ├── diamonds.rb ├── e-stat-japan.rb ├── fuel-economy.rb ├── house-of-councillor.rb ├── house-of-representative.rb ├── iris.rb ├── mnist.rb ├── nagoya-university-conversation-corpus.rb ├── ptb.rb ├── quora-duplicate-question-pair.rb ├── wikipedia-kyoto-japanese-english.rb └── wine.rb ├── lib ├── datasets.rb └── datasets │ ├── adult.rb │ ├── afinn.rb │ ├── aozora-bunko.rb │ ├── cache-path.rb │ ├── california-housing.rb │ ├── cifar.rb │ ├── cldr-plurals.rb │ ├── communities.rb │ ├── dataset.rb │ ├── diamonds.rb │ ├── dictionary.rb │ ├── downloader.rb │ ├── e-stat-japan.rb │ ├── error.rb │ ├── fashion-mnist.rb │ ├── fuel-economy.rb │ ├── geolonia.rb │ ├── ggplot2-dataset.rb │ ├── hepatitis.rb │ ├── house-of-councillor.rb │ ├── house-of-representative.rb │ ├── iris.rb │ ├── ita-corpus.rb │ ├── japanese-date-parser.rb │ ├── kuzushiji-mnist.rb │ ├── lazy.rb │ ├── libsvm-dataset-list.rb │ ├── libsvm.rb │ ├── license.rb │ ├── livedoor-news.rb │ ├── metadata.rb │ ├── mnist.rb │ ├── mushroom.rb │ ├── nagoya-university-conversation-corpus.rb │ ├── penguins.rb │ ├── penn-treebank.rb │ ├── pmjt-dataset-list.rb │ ├── postal-code-japan.rb │ ├── quora-duplicate-question-pair.rb │ ├── rdataset.rb │ ├── seaborn.rb │ ├── sudachi-synonym-dictionary.rb │ ├── table.rb │ ├── tar-gz-readable.rb │ ├── version.rb │ ├── wikipedia-kyoto-japanese-english.rb │ ├── wikipedia.rb │ ├── wine.rb │ └── zip-extractor.rb ├── red-datasets.gemspec └── test ├── helper.rb ├── japanese-date-parser-test.rb ├── run-test.rb ├── test-adult.rb ├── test-afinn.rb ├── test-aozora-bunko.rb ├── test-california-housing.rb ├── test-cifar.rb ├── test-cldr-plurals.rb ├── test-communities.rb ├── test-dataset.rb ├── test-diamonds.rb ├── test-dictionary.rb ├── test-downloader.rb ├── test-e-stat-japan.rb ├── test-fashion-mnist.rb ├── test-fuel-economy.rb ├── test-geolonia.rb ├── test-hepatitis.rb ├── test-house-of-councillor.rb ├── test-house-of-representative.rb ├── test-iris.rb ├── test-ita-corpus.rb ├── test-kuzushiji-mnist.rb ├── test-libsvm-dataset-list.rb ├── test-libsvm.rb ├── test-license.rb ├── test-livedoor-news.rb ├── test-metadata.rb ├── test-mnist.rb ├── test-mushroom.rb ├── test-nagoya-university-conversation-corpus.rb ├── test-penguins.rb ├── test-penn-treebank.rb ├── test-pmjt-dataset-list.rb ├── test-postal-code-japan.rb ├── test-quora-duplicate-question-pair.rb ├── test-rdataset.rb ├── test-seaborn.rb ├── test-sudachi-synonym-dictionary.rb ├── test-table.rb ├── test-wikipedia-kyoto-japanese-english.rb ├── test-wikipedia.rb └── test-wine.rb /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | -------------------------------------------------------------------------------- /.github/workflows/pages.yml: -------------------------------------------------------------------------------- 1 | name: Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - "master" 7 | workflow_dispatch: 8 | 9 | permissions: 10 | contents: read 11 | pages: write 12 | id-token: write 13 | 14 | concurrency: 15 | group: "pages" 16 | cancel-in-progress: true 17 | 18 | jobs: 19 | build: 20 | runs-on: ubuntu-latest 21 | env: 22 | # We can 
invalidate the current cache by updating this. 23 | CACHE_VERSION: "2022-10-21" 24 | steps: 25 | - uses: actions/checkout@v4 26 | - uses: actions/configure-pages@v5 27 | - uses: ruby/setup-ruby@v1 28 | with: 29 | ruby-version: ruby 30 | - uses: actions/cache@v4 31 | with: 32 | path: | 33 | ~/.cache/red-datasets 34 | key: ${{ env.CACHE_VERSION }}-pages-${{ hashFiles('lib/**') }} 35 | restore-keys: | 36 | ${{ env.CACHE_VERSION }}-pages- 37 | - name: Install dependencies 38 | run: | 39 | bundle install 40 | - name: Generate 41 | run: | 42 | bundle exec rake pages 43 | - uses: actions/upload-pages-artifact@v3 44 | 45 | deploy: 46 | environment: 47 | name: github-pages 48 | url: ${{ steps.deployment.outputs.page_url }} 49 | runs-on: ubuntu-latest 50 | needs: build 51 | steps: 52 | - uses: actions/deploy-pages@v4 53 | id: deployment 54 | 55 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: 3 | push: 4 | tags: 5 | - "*" 6 | jobs: 7 | github: 8 | name: GitHub 9 | runs-on: ubuntu-latest 10 | timeout-minutes: 10 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Extract release note 14 | run: | 15 | ruby \ 16 | -e 'print("## Red Datasets "); \ 17 | puts(ARGF.read.split(/^## /)[1]. \ 18 | gsub(/ {.+?}/, ""). \ 19 | gsub(/\[(.+?)\]\[.+?\]/) {$1})' \ 20 | doc/text/news.md > release-note.md 21 | - name: Upload to release 22 | run: | 23 | title=$(head -n1 release-note.md | sed -e 's/^## //') 24 | tail -n +2 release-note.md > release-note-without-version.md 25 | gh release create ${GITHUB_REF_NAME} \ 26 | --discussion-category Announcements \ 27 | --notes-file release-note-without-version.md \ 28 | --title "${title}" 29 | env: 30 | GH_TOKEN: ${{ github.token }} 31 | rubygems: 32 | name: RubyGems 33 | runs-on: ubuntu-latest 34 | timeout-minutes: 10 35 | permissions: 36 | id-token: write 37 | environment: release 38 | steps: 39 | - uses: actions/checkout@v4 40 | - uses: ruby/setup-ruby@v1 41 | with: 42 | ruby-version: ruby 43 | bundler-cache: true 44 | - uses: rubygems/configure-rubygems-credentials@v1.0.0 45 | - name: Push gems 46 | run: | 47 | bundle exec rake release:rubygem_push 48 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: 7 | - cron: | 8 | 0 0 * * 0 9 | 10 | jobs: 11 | test: 12 | name: "Ruby ${{ matrix.ruby-version }}: ${{ matrix.runs-on }}" 13 | strategy: 14 | # To avoid high frequency datasets download in a short time. 15 | max-parallel: 1 16 | fail-fast: false 17 | matrix: 18 | ruby-version: 19 | - "3.2" 20 | - "3.3" 21 | - "3.4" 22 | runs-on: 23 | - macos-latest 24 | - ubuntu-latest 25 | - windows-latest 26 | runs-on: ${{ matrix.runs-on }} 27 | env: 28 | # We can invalidate the current cache by updating this. 
29 | CACHE_VERSION: "2024-09-01" 30 | steps: 31 | - uses: actions/checkout@v4 32 | - uses: ruby/setup-ruby@v1 33 | with: 34 | ruby-version: ${{ matrix.ruby-version }} 35 | - uses: actions/cache@v4 36 | if: | 37 | runner.os == 'Linux' 38 | with: 39 | path: | 40 | ~/.cache/red-datasets 41 | key: ${{ env.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('lib/**') }} 42 | restore-keys: | 43 | ${{ env.CACHE_VERSION }}-${{ runner.os }}- 44 | - uses: actions/cache@v4 45 | if: | 46 | runner.os == 'macOS' 47 | with: 48 | path: | 49 | ~/Library/Caches/red-datasets 50 | key: ${{ env.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('lib/**') }} 51 | restore-keys: | 52 | ${{ env.CACHE_VERSION }}-${{ runner.os }}- 53 | - uses: actions/cache@v4 54 | if: | 55 | runner.os == 'Windows' 56 | with: 57 | path: | 58 | ~/AppData/Local/red-datasets 59 | key: ${{ env.CACHE_VERSION }}-${{ runner.os }}-${{ hashFiles('lib/**') }} 60 | restore-keys: | 61 | ${{ env.CACHE_VERSION }}-${{ runner.os }}- 62 | - name: Install dependencies 63 | run: | 64 | bundle install 65 | - name: Test 66 | env: 67 | GH_TOKEN: ${{ github.token }} 68 | run: | 69 | bundle exec rake 70 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /Gemfile.lock 2 | /_site/ 3 | /pkg/ 4 | -------------------------------------------------------------------------------- /.yardopts: -------------------------------------------------------------------------------- 1 | --output-dir doc/reference/en 2 | --markup markdown 3 | --markup-provider kramdown 4 | lib/**/*.rb 5 | - 6 | doc/text/**/* 7 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | source "https://rubygems.org/" 4 | 5 | gemspec 6 | 7 | # add steep and typeprof to development dependencies 8 | group :development do 9 | gem "steep", require: false 10 | gem "typeprof" 11 | end 12 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2017 Kouhei Sutou 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Red Datasets
2 | 
3 | [![Gem Version](https://badge.fury.io/rb/red-datasets.svg)](https://badge.fury.io/rb/red-datasets)
4 | 
5 | ## Description
6 | 
7 | Red Datasets provides classes for common datasets such as the Iris dataset.
8 | 
9 | You can use datasets easily because you can access each dataset in multiple ways such as `#each` and Apache Arrow Record Batch.
10 | 
11 | ## Install
12 | 
13 | ```console
14 | % gem install red-datasets
15 | ```
16 | 
17 | ## Available datasets
18 | 
19 | * Adult Dataset
20 | * Aozora Bunko
21 | * California Housing
22 | * CIFAR-10 Dataset
23 | * CIFAR-100 Dataset
24 | * CLDR language plural rules
25 | * Communities and crime
26 | * Diamonds Dataset
27 | * E-Stat Japan
28 | * Fashion-MNIST
29 | * Fuel Economy Dataset
30 | * Geolonia Japanese Addresses
31 | * Hepatitis
32 | * House of Councillors of Japan
33 | * House of Representatives of Japan
34 | * Iris Dataset
35 | * Libsvm
36 | * MNIST database
37 | * Mushroom
38 | * Penguins
39 | * The Penn Treebank Project
40 | * PMJT - Pre-Modern Japanese Text dataset list
41 | * Postal Codes in Japan
42 | * Rdatasets
43 | * Seaborn
44 | * Sudachi Synonym Dictionary
45 | * Wikipedia
46 | * Wine Dataset
47 | 
48 | ## Usage
49 | 
50 | Here is an example to access [Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris) by `#each`, `Table#to_h`, or `Table#fetch_values`.
51 | 
52 | ```ruby
53 | require "datasets"
54 | 
55 | iris = Datasets::Iris.new
56 | iris.each do |record|
57 |   p [
58 |     record.sepal_length,
59 |     record.sepal_width,
60 |     record.petal_length,
61 |     record.petal_width,
62 |     record.label,
63 |   ]
64 | end
65 | # => [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
66 | # => [4.9, 3.0, 1.4, 0.2, "Iris-setosa"]
67 | :
68 | # => [7.0, 3.2, 4.7, 1.4, "Iris-versicolor"]
69 | 
70 | 
71 | iris_hash = iris.to_table.to_h
72 | p iris_hash[:sepal_length]
73 | # => [5.1, 4.9, .. , 7.0, ..
74 | p iris_hash[:sepal_width]
75 | # => [3.5, 3.0, .. , 3.2, ..
76 | p iris_hash[:petal_length]
77 | # => [1.4, 1.4, .. , 4.7, ..
78 | p iris_hash[:petal_width]
79 | # => [0.2, 0.2, .. , 1.4, ..
80 | p iris_hash[:label]
81 | # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
82 | 
83 | 
84 | iris_table = iris.to_table
85 | p iris_table.fetch_values(:sepal_length, :sepal_width, :petal_length, :petal_width).transpose
86 | # => [[5.1, 3.5, 1.4, 0.2],
87 |      [4.9, 3.0, 1.4, 0.2],
88 |      :
89 |      [7.0, 3.2, 4.7, 1.4],
90 |      :
91 | 
92 | p iris_table[:label]
93 | # => ["Iris-setosa", "Iris-setosa", .. , "Iris-versicolor", ..
94 | ```
95 | 
96 | 
97 | Here is an example to access [The CIFAR-10/100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html) by `#each`:
98 | 
99 | **CIFAR-10**
100 | 
101 | ```ruby
102 | require "datasets"
103 | 
104 | cifar = Datasets::CIFAR.new(n_classes: 10, type: :train)
105 | cifar.metadata
106 | #=> #<struct Datasets::Metadata id="cifar-10", name="CIFAR-10", url="https://www.cs.toronto.edu/~kriz/cifar.html", licenses=nil, description="CIFAR-10 is 32x32 image datasets">
107 | cifar.each do |record|
108 |   p record.pixels
109 |   # => [59, 43, 50, 68, 98, 119, 139, 145, 149, 143, .....]
110 |   p record.label
111 |   # => 6
112 | end
113 | ```
114 | 
115 | **CIFAR-100**
116 | 
117 | ```ruby
118 | require "datasets"
119 | 
120 | cifar = Datasets::CIFAR.new(n_classes: 100, type: :test)
121 | cifar.metadata
122 | #=> #<struct Datasets::Metadata id="cifar-100", name="CIFAR-100", url="https://www.cs.toronto.edu/~kriz/cifar.html", licenses=nil, description="CIFAR-100 is 32x32 image dataset">
123 | cifar.each do |record|
124 |   p record.pixels
125 |   #=> [199, 196, 195, 195, 196, 197, 198, 198, 199, .....]
126 |   p record.coarse_label
127 |   #=> 10
128 |   p record.fine_label
129 |   #=> 49
130 | end
131 | ```
132 | 
133 | **MNIST**
134 | 
135 | ```ruby
136 | require "datasets"
137 | 
138 | mnist = Datasets::MNIST.new(type: :train)
139 | mnist.metadata
140 | #=> #<struct Datasets::Metadata id="mnist-train", name="MNIST: train", ...>
141 | 
142 | mnist.each do |record|
143 |   p record.pixels
144 |   # => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, .....]
145 |   p record.label
146 |   # => 5
147 | end
148 | ```
149 | 
150 | ## NArray compatibility
151 | 
152 | * [red-datasets-numo-narray](https://github.com/red-data-tools/red-datasets-numo-narray)
153 | 
154 | ## How to develop Red Datasets
155 | 1. Fork https://github.com/red-data-tools/red-datasets
156 | 2. Create a feature branch from master
157 | 3. Develop in the feature branch
158 | 4. Open a pull request from the feature branch to https://github.com/red-data-tools/red-datasets
159 | 
160 | ## License
161 | 
162 | The MIT license. See `LICENSE.txt` for details.
163 | 
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | # -*- ruby -*-
2 | 
3 | require "rubygems"
4 | require "bundler/gem_helper"
5 | 
6 | base_dir = File.join(File.dirname(__FILE__))
7 | 
8 | helper = Bundler::GemHelper.new(base_dir)
9 | def helper.version_tag
10 |   version
11 | end
12 | 
13 | helper.install
14 | spec = helper.gemspec
15 | 
16 | release_task = Rake.application["release"]
17 | # We use Trusted Publishing.
18 | release_task.prerequisites.delete("build")
19 | release_task.prerequisites.delete("release:rubygem_push")
20 | release_task_comment = release_task.comment
21 | if release_task_comment
22 |   release_task.clear_comments
23 |   release_task.comment = release_task_comment.gsub(/ and build.*$/, "")
24 | end
25 | 
26 | task default: :test
27 | 
28 | desc "Run tests"
29 | task :test do
30 |   ruby("test/run-test.rb")
31 | end
32 | 
33 | desc "Generate an artifact for GitHub Pages"
34 | task :pages do
35 |   pages_dir = "_site"
36 |   rm_rf(pages_dir)
37 |   mkdir_p(pages_dir)
38 | 
39 |   require "cgi/util"
40 |   require_relative "lib/datasets/lazy"
41 |   File.open("#{pages_dir}/index.html", "w") do |index_html|
42 |     index_html.puts(<<-HTML)
43 | <!DOCTYPE html>
44 | <html>
45 |   <head>
46 |     <meta charset="utf-8">
47 |     <title>Red Datasets</title>
48 |     <style>
63 |     </style>
64 |   </head>
65 |   <body>
66 |     <h1>Red Datasets</h1>
67 |     <table>
68 |       <thead>
69 |         <tr><th>Available datasets</th></tr>
70 |       </thead>
71 |       <tbody>
72 |     HTML
73 |     Datasets::LAZY_LOADER.constant_names.sort.each do |constant_name|
74 |       index_html.puts(<<-HTML)
75 |         <tr><td>#{CGI.escapeHTML("Datasets::#{constant_name}")}</td></tr>
76 |       HTML
77 |     end
78 |     index_html.puts(<<-HTML)
79 |       </tbody>
80 |     </table>
81 |   </body>
82 | </html>
83 | 
84 |     HTML
85 |   end
86 | end
87 | 
--------------------------------------------------------------------------------
/Steepfile:
--------------------------------------------------------------------------------
1 | D = Steep::Diagnostic
2 | 
3 | target :lib do
4 |   signature "sig"
5 |   check "lib" # Directory name to check
6 | 
7 |   configure_code_diagnostics(D::Ruby.lenient) # `lenient` diagnostics setting
8 | end
9 | 
--------------------------------------------------------------------------------
/example/aozora-bunko.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | 
3 | require 'datasets'
4 | 
5 | aozora = Datasets::AozoraBunko.new
6 | book = aozora.first
7 | p [
8 |   book.title_id,
9 |   book.title,
10 |   book.title_reading,
11 |   book.title_reading_collation,
12 |   book.subtitle,
13 |   book.subtitle_reading,
14 |   book.original_title,
15 |   book.first_appearance,
16 |   book.ndc_code,
17 |   book.syllabary_spelling_type,
18 |   book.copyrighted?,
19 |   book.published_date,
20 |   book.last_updated_date,
21 |   book.detail_url,
22 |   book.person_id,
23 |   book.person_family_name,
24 |   book.person_first_name,
25 |   book.person_family_name_reading,
26 |   book.person_first_name_reading,
27 |   book.person_family_name_reading_collation,
28 |   book.person_first_name_reading_collation,
29 |   book.person_family_name_romaji,
30 |   book.person_first_name_romaji,
31 |   book.person_type,
32 |   book.person_birthday,
33 |   book.person_date_of_death,
34 |   book.person_copyrighted?,
35 |   book.original_book_name1,
36 |   book.original_book_publisher_name1,
37 |   book.original_book_first_published_date1,
38 |   book.used_version_for_registration1,
39 |   book.used_version_for_proofreading1,
40 |   book.base_of_original_book_name1,
41 |   book.base_of_original_book_publisher_name1,
42 |   book.base_of_original_book_first_published_date1,
43 |   book.original_book_name2,
44 |   book.original_book_publisher_name2,
45 |   book.original_book_first_published_date2,
46 |   book.used_version_for_registration2,
47 |   book.used_version_for_proofreading2,
48 |   book.base_of_original_book_name2,
49 |   book.base_of_original_book_publisher_name2,
50 |   book.base_of_original_book_first_published_date2,
51 |   book.registered_person_name,
52 |   book.proofreader_name,
53 |   book.text_file_url,
54 |   book.last_text_file_updated_date,
55 |   book.text_file_character_encoding,
56 |   book.text_file_character_set,
57 |   book.text_file_updating_count,
58 |   book.html_file_url,
59 |   book.last_html_file_updated_date,
60 |   book.html_file_character_encoding,
61 |   book.html_file_character_set,
62 |   book.html_file_updating_count
63 | ]
64 | 
65 | # The text API reads the book body from the URL in the text_file_url field.
66 | p book.text
67 | #=> "ウェストミンスター寺院\r\nワシントン・アーヴィング..."
68 | 
69 | # The html API reads the book body from the URL in the html_file_url field.
70 | p book.html
71 | #=> "<?xml version=\"1.0\" encoding=\"Shift_JIS\"?>\r\n..."
72 | 
73 | # remove all cached files
74 | # aozora.clear_cache!
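
The example above dumps every field of the first book. As a companion, here is a minimal sketch (not part of the repository) that reuses the same accessors to find public-domain books that ship an HTML version; the early `break` is only there to keep the demo cheap:

```ruby
require "datasets"

aozora = Datasets::AozoraBunko.new
aozora.each do |book|
  # Skip books that are still under copyright or that have no HTML version.
  next if book.copyrighted?
  next if book.html_file_url.nil? || book.html_file_url.empty?
  puts("#{book.title} / #{book.person_family_name}#{book.person_first_name}")
  break # one match is enough for a demo
end
```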
75 | -------------------------------------------------------------------------------- /example/diamonds.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | diamonds = Datasets::Diamonds.new 6 | 7 | diamonds.each do |record| 8 | p [ 9 | record.carat, 10 | record.cut, 11 | record.color, 12 | record.clarity, 13 | record.depth, 14 | record.table, 15 | record.price, 16 | record.x, 17 | record.y, 18 | record.z, 19 | ] 20 | # [0.23, "Ideal", "E", "SI2", 61.5, 55, 326, 3.95, 3.98, 2.43] 21 | # [0.21, "Premium", "E", "SI1", 59.8, 61, 326, 3.89, 3.84, 2.31] 22 | # ... 23 | end 24 | -------------------------------------------------------------------------------- /example/e-stat-japan.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby -Ku 2 | 3 | require 'datasets' 4 | 5 | Datasets::EStatJapan.configure do |config| 6 | # put your App ID for e-Stat app_id 7 | # see detail at https://www.e-stat.go.jp/api/api-dev/how_to_use (Japanese only) 8 | config.app_id = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' 9 | end 10 | 11 | estat = Datasets::EStatJapan::StatsData.new( 12 | '0000020201', # A 人口・世帯 13 | hierarchy_selection: 'child', 14 | skip_nil_column: true, 15 | skip_nil_row: false, 16 | categories: ['A1101'] # A1101_人口総数 17 | ) 18 | 19 | # prepare for clustering 20 | indices = [] 21 | rows = [] 22 | map_id_name = {} 23 | estat.each do |record| 24 | # Select Hokkaido only 25 | next unless record.id.to_s.start_with? '01' 26 | indices << record.id 27 | rows << record.values 28 | map_id_name[record.id] = record.name 29 | p record.name, rows 30 | end 31 | -------------------------------------------------------------------------------- /example/fuel-economy.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | fuel_economy = Datasets::FuelEconomy.new 6 | 7 | fuel_economy.each do |record| 8 | p [ 9 | record.manufacturer, 10 | record.model, 11 | record.displacement, 12 | record.year, 13 | record.n_cylinders, 14 | record.transmission, 15 | record.drive_train, 16 | record.city_mpg, 17 | record.highway_mpg, 18 | record.fuel, 19 | record.type, 20 | ] 21 | # ["audi", "a4", 1.8, 1999, 4, "auto(l5)", "f", 18, 29, "p", "compact"] 22 | # ["audi", "a4", 1.8, 1999, 4, "manual(m5)", "f", 21, 29, "p", "compact"] 23 | # ... 24 | end 25 | -------------------------------------------------------------------------------- /example/house-of-councillor.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | # Bill 6 | house_of_councillor = Datasets::HouseOfCouncillor.new 7 | house_of_councillor.each do |record| 8 | # Select promulgated after 2020 9 | next unless 2020 <= record.promulgated_on&.year.to_i 10 | 11 | p record.promulgated_on, record.values 12 | end 13 | 14 | # In-House group 15 | house_of_councillor = Datasets::HouseOfCouncillor.new(type: :in_house_group) 16 | house_of_councillor.each do |record| 17 | p record.values 18 | end 19 | 20 | # Member 21 | house_of_councillor = Datasets::HouseOfCouncillor.new(type: :member) 22 | house_of_councillor.each do |record| 23 | # Select using professional name 24 | next if record.true_name.nil? 
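  # (true_name is filled only when it differs from the displayed professional name)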
25 | 26 | p [ 27 | record.professional_name, 28 | record.true_name, 29 | record.professional_name_reading, 30 | ] 31 | end 32 | 33 | # Question 34 | house_of_councillor = Datasets::HouseOfCouncillor.new(type: :question) 35 | house_of_councillor.each do |record| 36 | # Select number of submissions greater than 1 37 | next unless 1 < record.number_of_submissions 38 | 39 | p record.number_of_submissions, record.values 40 | end 41 | -------------------------------------------------------------------------------- /example/house-of-representative.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | house_of_representative = Datasets::HouseOfRepresentative.new 6 | house_of_representative.each do |record| 7 | # Select support of one hundred or more members and promulgated 8 | next unless 100 <= record.supporters_of_submitted_bill.size 9 | next if record.promulgated_on.nil? 10 | 11 | p [ 12 | record.supporters_of_submitted_bill.size, 13 | record.promulgated_on, 14 | record.title, 15 | ] 16 | end 17 | -------------------------------------------------------------------------------- /example/iris.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | iris = Datasets::Iris.new 6 | iris.each do |record| 7 | p [ 8 | record.sepal_length, 9 | record.sepal_width, 10 | record.petal_length, 11 | record.petal_width, 12 | record.label, 13 | ] 14 | # [5.1, 3.5, 1.4, 0.2, "Iris-setosa"] 15 | # [7.0, 3.2, 4.7, 1.4, "Iris-versicolor"] 16 | end 17 | -------------------------------------------------------------------------------- /example/mnist.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | mnist = Datasets::MNIST.new(type: :train) 6 | mnist.each do |record| 7 | p record.pixels 8 | # => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, .....] 
9 | p record.label 10 | # => 5 11 | end 12 | -------------------------------------------------------------------------------- /example/nagoya-university-conversation-corpus.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'datasets' 4 | 5 | nagoya_university_conversation_corpus = Datasets::NagoyaUniversityConversationCorpus.new 6 | 7 | nagoya_university_conversation_corpus.each do |data| 8 | data.sentences.each do |sentence| 9 | p [ 10 | sentence.participant_id, 11 | sentence.content 12 | ] 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /example/ptb.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | require "optparse" 5 | 6 | params = ARGV.getopts("n:") 7 | 8 | ptb = Datasets::PennTreebank.new(type: :train) 9 | 10 | if params["n"] 11 | records = ptb.take(params["n"].to_i) 12 | else 13 | records = ptb 14 | end 15 | 16 | records.each {|record| puts record.word } 17 | -------------------------------------------------------------------------------- /example/quora-duplicate-question-pair.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | question_pair = Datasets::QuoraDuplicateQuestionPair.new 6 | question_pair.each do |pair| 7 | p [ 8 | pair.id, 9 | pair.first_question_id, 10 | pair.second_question_id, 11 | pair.first_question, 12 | pair.second_question, 13 | pair.duplicated? 14 | ] 15 | # [0, 1, 2, "What is the step by step guide to invest in share market in india?", "What is the step by step guide to invest in share market?", false] 16 | # [1, 3, 4, "What is the story of Kohinoor (Koh-i-Noor) Diamond?", "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?", false] 17 | end 18 | -------------------------------------------------------------------------------- /example/wikipedia-kyoto-japanese-english.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require "datasets" 4 | 5 | wikipedia_kyoto_articles = 6 | Datasets::WikipediaKyotoJapaneseEnglish.new(type: :article) 7 | wikipedia_kyoto_articles.each_with_index do |article, i| 8 | puts("#{i}: #{article.source}") 9 | article.contents.each do |content| 10 | puts(" Japanese: #{content.japanese}") 11 | puts(" English: #{content.english}") 12 | end 13 | end 14 | 15 | wikipedia_kyoto_lexicon = 16 | Datasets::WikipediaKyotoJapaneseEnglish.new(type: :lexicon) 17 | wikipedia_kyoto_lexicon.each do |record| 18 | puts(" Japanese: #{record.japanese}") 19 | puts(" English: #{record.english}") 20 | end 21 | -------------------------------------------------------------------------------- /example/wine.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'datasets' 4 | 5 | wine = Datasets::Wine.new 6 | wine.each do |record| 7 | p [ 8 | record.label, 9 | record.alcohol, 10 | record.malic_acid, 11 | record.ash, 12 | record.alcalinity_of_ash, 13 | record.n_magnesiums, 14 | record.total_phenols, 15 | record.total_flavonoids, 16 | record.total_nonflavanoid_phenols, 17 | record.total_proanthocyanins, 18 | record.color_intensity, 19 | record.hue, 20 | record.optical_nucleic_acid_concentration, 21 | record.n_prolines 22 | ] 23 | end 24 | 
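
The example files above all follow the same `#each` pattern. For column-wise work, the `Table` API from the README also supports simple aggregation; a minimal sketch, assuming the Iris dataset shown earlier:

```ruby
require "datasets"

# Table#[] returns a whole column as an Array, so plain Ruby can aggregate it.
table = Datasets::Iris.new.to_table
sepal_lengths = table[:sepal_length]
puts(sepal_lengths.sum / sepal_lengths.size.to_f)
# => ~5.84 (mean sepal length over the 150 records)
```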
-------------------------------------------------------------------------------- /lib/datasets.rb: -------------------------------------------------------------------------------- 1 | require_relative "datasets/lazy" 2 | Datasets::LAZY_LOADER.load_all 3 | -------------------------------------------------------------------------------- /lib/datasets/adult.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class Adult < Dataset 7 | Record = Struct.new( 8 | :age, 9 | :work_class, 10 | :final_weight, 11 | :education, 12 | :n_education_years, 13 | :marital_status, 14 | :occupation, 15 | :relationship, 16 | :race, 17 | :sex, 18 | :capital_gain, 19 | :capital_loss, 20 | :hours_per_week, 21 | :native_country, 22 | :label 23 | ) 24 | 25 | def initialize(type: :train) 26 | unless [:train, :test].include?(type) 27 | raise ArgumentError, 'Please set type :train or :test' 28 | end 29 | 30 | super() 31 | @type = type 32 | @metadata.id = "adult-#{@type}" 33 | @metadata.name = "Adult: #{@type}" 34 | @metadata.url = "https://archive.ics.uci.edu/ml/datasets/adult" 35 | @metadata.licenses = ["CC-BY-4.0"] 36 | @metadata.description = lambda do 37 | read_names 38 | end 39 | end 40 | 41 | def each 42 | return to_enum(__method__) unless block_given? 43 | 44 | open_data do |csv| 45 | csv.each do |row| 46 | next if row[0].nil? 47 | record = Record.new(*row) 48 | yield(record) 49 | end 50 | end 51 | end 52 | 53 | private 54 | def open_data 55 | case @type 56 | when :train 57 | ext = "data" 58 | when :test 59 | ext = "test" 60 | end 61 | data_path = cache_dir_path + "adult-#{ext}.csv" 62 | data_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.#{ext}" 63 | download(data_path, data_url) 64 | 65 | options = { 66 | converters: [:numeric, lambda {|f| f.strip}], 67 | skip_lines: /\A\|/, 68 | } 69 | CSV.open(data_path, **options) do |csv| 70 | yield(csv) 71 | end 72 | end 73 | 74 | def read_names 75 | names_path = cache_dir_path + "adult.names" 76 | names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names" 77 | download(names_path, names_url) 78 | names_path.read 79 | end 80 | end 81 | end 82 | -------------------------------------------------------------------------------- /lib/datasets/afinn.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | require_relative "zip-extractor" 3 | 4 | module Datasets 5 | class AFINN < Dataset 6 | Record = Struct.new(:word, 7 | :valence) 8 | 9 | def initialize 10 | super() 11 | @metadata.id = "afinn" 12 | @metadata.name = "AFINN" 13 | @metadata.url = "http://www2.imm.dtu.dk/pubdb/pubs/6010-full.html" 14 | @metadata.licenses = ["ODbL-1.0"] 15 | @metadata.description = lambda do 16 | extract_file("AFINN/AFINN-README.txt") do |input| 17 | readme = input.read 18 | readme.force_encoding("UTF-8") 19 | readme. 20 | gsub(/^AFINN-96:.*?\n\n/m, ""). 21 | gsub(/^In Python.*$/m, ""). 22 | strip 23 | end 24 | end 25 | end 26 | 27 | def each 28 | return to_enum(__method__) unless block_given? 
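      # (without a block, to_enum exposes the records as a lazy Enumerator)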
29 | 30 | extract_file("AFINN/AFINN-111.txt") do |input| 31 | csv = CSV.new(input, col_sep: "\t", converters: :numeric) 32 | csv.each do |row| 33 | yield(Record.new(*row)) 34 | end 35 | end 36 | end 37 | 38 | private 39 | def extract_file(file_path, &block) 40 | data_path = cache_dir_path + "imm6010.zip" 41 | data_url = "http://www2.imm.dtu.dk/pubdb/edoc/imm6010.zip" 42 | download(data_path, data_url) 43 | 44 | extractor = ZipExtractor.new(data_path) 45 | extractor.extract_file(file_path, &block) 46 | end 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /lib/datasets/aozora-bunko.rb: -------------------------------------------------------------------------------- 1 | require_relative 'dataset' 2 | require_relative 'zip-extractor' 3 | 4 | module Datasets 5 | # Dataset for AozoraBunko 6 | class AozoraBunko < Dataset 7 | Book = Struct.new( 8 | # 作品ID,作品名,作品名読み,ソート用読み,副題,副題読み,原題,初出,分類番号,文字遣い種別,作品著作権フラグ,公開日,最終更新日,図書カードURL, 9 | :title_id, 10 | :title, 11 | :title_reading, 12 | :title_reading_collation, 13 | :subtitle, 14 | :subtitle_reading, 15 | :original_title, 16 | :first_appearance, 17 | :ndc_code, # 分類番号(日本十進分類法の番号) 18 | :syllabary_spelling_type, 19 | :copyrighted, 20 | :published_date, 21 | :last_updated_date, 22 | :detail_url, 23 | # 人物ID, 姓,名,姓読み,名読み,姓読みソート用,名読みソート用,姓ローマ字,名ローマ字,役割フラグ,生年月日,没年月日,人物著作権フラグ, 24 | :person_id, 25 | :person_family_name, 26 | :person_first_name, 27 | :person_family_name_reading, 28 | :person_first_name_reading, 29 | :person_family_name_reading_collation, 30 | :person_first_name_reading_collation, 31 | :person_family_name_romaji, 32 | :person_first_name_romaji, 33 | :person_type, 34 | :person_birthday, 35 | :person_date_of_death, 36 | :person_copyrighted, 37 | # 底本名1,底本出版社名1,底本初版発行年1,入力に使用した版1,校正に使用した版1,底本の親本名1,底本の親本出版社名1,底本の親本初版発行年1, 38 | :original_book_name1, 39 | :original_book_publisher_name1, 40 | :original_book_first_published_date1, 41 | :used_version_for_registration1, 42 | :used_version_for_proofreading1, 43 | :base_of_original_book_name1, 44 | :base_of_original_book_publisher_name1, 45 | :base_of_original_book_first_published_date1, 46 | # 底本名2,底本出版社名2,底本初版発行年2,入力に使用した版2,校正に使用した版2,底本の親本名2,底本の親本出版社名2,底本の親本初版発行年2, 47 | :original_book_name2, 48 | :original_book_publisher_name2, 49 | :original_book_first_published_date2, 50 | :used_version_for_registration2, 51 | :used_version_for_proofreading2, 52 | :base_of_original_book_name2, 53 | :base_of_original_book_publisher_name2, 54 | :base_of_original_book_first_published_date2, 55 | # 入力者,校正者, 56 | :registered_person_name, 57 | :proofreader_name, 58 | # テキストファイルURL,テキストファイル最終更新日,テキストファイル符号化方式,テキストファイル文字集合,テキストファイル修正回数, 59 | :text_file_url, 60 | :last_text_file_updated_date, 61 | :text_file_character_encoding, 62 | :text_file_character_set, 63 | :text_file_updating_count, 64 | # XHTML/HTMLファイルURL,XHTML/HTMLファイル最終更新日,XHTML/HTMLファイル符号化方式,XHTML/HTMLファイル文字集合,XHTML/HTMLファイル修正回数 65 | :html_file_url, 66 | :last_html_file_updated_date, 67 | :html_file_character_encoding, 68 | :html_file_character_set, 69 | :html_file_updating_count 70 | ) 71 | 72 | class Book 73 | attr_writer :cache_path 74 | 75 | def initialize(*args) 76 | super 77 | @text = nil 78 | @html = nil 79 | @cache_path = nil 80 | end 81 | 82 | alias_method :copyrighted?, :copyrighted 83 | alias_method :person_copyrighted?, :person_copyrighted 84 | 85 | def text 86 | return @text unless @text.nil? 87 | return @text if text_file_url.nil? || text_file_url.empty? 
88 | 
89 |       # When the URL is not a ZIP file, the page must be opened in a browser and the file downloaded manually,
90 |       # e.g. https://mega.nz/file/6tMxgAjZ#PglDDyJL0syRhnULqK0qhTMC7cktsgqwObj5fY_knpE
91 |       return @text unless text_file_url.end_with?('.zip')
92 | 
93 |       downloader = Downloader.new(text_file_url)
94 |       downloader.download(text_file_output_path)
95 | 
96 |       @text = ZipExtractor.new(text_file_output_path).extract_first_file do |input|
97 |         input.read.encode(Encoding::UTF_8, normalize_encoding(text_file_character_encoding))
98 |       end
99 | 
100 |       @text
101 |     end
102 | 
103 |     def html
104 |       return @html unless @html.nil?
105 |       return @html if html_file_url.nil? || html_file_url.empty?
106 | 
107 |       downloader = Downloader.new(html_file_url)
108 |       downloader.download(html_file_output_path)
109 |       @html = File.read(html_file_output_path).encode(Encoding::UTF_8,
110 |                                                       normalize_encoding(html_file_character_encoding))
111 | 
112 |       @html
113 |     end
114 | 
115 |     private
116 | 
117 |     def text_file_output_path
118 |       cache_base_dir + text_file_name
119 |     end
120 | 
121 |     def html_file_output_path
122 |       cache_base_dir + html_file_name
123 |     end
124 | 
125 |     def text_file_name
126 |       text_file_url.split('/').last
127 |     end
128 | 
129 |     def html_file_name
130 |       html_file_url.split('/').last
131 |     end
132 | 
133 |     def cache_base_dir
134 |       @cache_path.base_dir + title_id + person_id
135 |     end
136 | 
137 |     def normalize_encoding(encoding)
138 |       case encoding
139 |       when 'ShiftJIS'
140 |         Encoding::Shift_JIS
141 |       when 'UTF-8'
142 |         Encoding::UTF_8
143 |       else
144 |         encoding
145 |       end
146 |     end
147 |   end
148 | 
149 |   def initialize
150 |     super()
151 | 
152 |     @metadata.id = 'aozora-bunko'
153 |     @metadata.name = 'Aozora Bunko'
154 |     @metadata.url = 'https://www.aozora.gr.jp/'
155 |     @metadata.licenses = 'CC-BY-2.1-JP'
156 |     @metadata.description = <<~DESCRIPTION
157 |       Aozora Bunko is an activity to collect free electronic books that anyone can access
158 |       on the Internet like a library. Works whose copyright has expired and works that are
159 |       said to be "free to read" are available after being digitized in text and XHTML (some HTML) formats.
160 |     DESCRIPTION
161 |   end
162 | 
163 |   def each
164 |     return to_enum(__method__) unless block_given?
165 | 
166 |     open_data do |csv_file_stream|
167 |       text = csv_file_stream.read.force_encoding(Encoding::UTF_8) # the file has a Byte Order Mark
168 | 
169 |       CSV.parse(text, headers: true) do |row|
170 |         %w[作品著作権フラグ 人物著作権フラグ].each do |boolean_column_name|
171 |           row[boolean_column_name] = normalize_boolean(row[boolean_column_name])
172 |         end
173 |         book = Book.new(*row.fields)
174 |         book.cache_path = cache_path
175 | 
176 |         yield(book)
177 |       end
178 |     end
179 |   end
180 | 
181 |   private
182 | 
183 |   def open_data(&block)
184 |     data_path = cache_dir_path + 'list_person_all_extended_utf8.zip'
185 |     data_url = "https://www.aozora.gr.jp/index_pages/#{data_path.basename}"
186 |     download(data_path, data_url)
187 |     ZipExtractor.new(data_path).extract_first_file do |input|
188 |       block.call(input)
189 |     end
190 |   end
191 | 
192 |   def normalize_boolean(column_value)
193 |     column_value == 'あり'
194 |   end
195 | end
196 | end
197 | 
--------------------------------------------------------------------------------
/lib/datasets/cache-path.rb:
--------------------------------------------------------------------------------
1 | module Datasets
2 |   class CachePath
3 |     def initialize(id)
4 |       @id = id
5 |     end
6 | 
7 |     def base_dir
8 |       Pathname(system_cache_dir).expand_path + 'red-datasets' + @id
9 |     end
10 | 
11 |     def remove
12 |       FileUtils.rmtree(base_dir.to_s, secure: true) if base_dir.exist?
13 |     end
14 | 
15 |     private
16 | 
17 |     def system_cache_dir
18 |       case RUBY_PLATFORM
19 |       when /mswin/, /mingw/
20 |         ENV['LOCALAPPDATA'] || '~/AppData/Local'
21 |       when /darwin/
22 |         '~/Library/Caches'
23 |       else
24 |         ENV['XDG_CACHE_HOME'] || '~/.cache'
25 |       end
26 |     end
27 |   end
28 | end
29 | 
--------------------------------------------------------------------------------
/lib/datasets/california-housing.rb:
--------------------------------------------------------------------------------
1 | require "csv"
2 | require_relative 'zip-extractor'
3 | 
4 | module Datasets
5 |   class CaliforniaHousing < Dataset
6 |     Record = Struct.new(:median_house_value,
7 |                         :median_income,
8 |                         :housing_median_age,
9 |                         :total_rooms,
10 |                         :total_bedrooms,
11 |                         :population,
12 |                         :households,
13 |                         :latitude,
14 |                         :longitude)
15 | 
16 |     def initialize
17 |       super()
18 |       @metadata.id = "california-housing"
19 |       @metadata.name = "California Housing"
20 |       @metadata.url = "http://lib.stat.cmu.edu/datasets/"
21 |       @metadata.licenses = ["CC0-1.0"]
22 |       @metadata.description = <<-DESCRIPTION
23 | Housing information from the 1990 census used in
24 | Pace, R. Kelley and Ronald Barry,
25 | "Sparse Spatial Autoregressions",
26 | Statistics and Probability Letters, 33 (1997) 291-297.
27 | Available from http://lib.stat.cmu.edu/datasets/.
28 |       DESCRIPTION
29 |     end
30 | 
31 |     def each
32 |       return to_enum(__method__) unless block_given?
33 | 34 | data_path = cache_dir_path + "houses.zip" 35 | data_url = "http://lib.stat.cmu.edu/datasets/houses.zip" 36 | file_name = "cadata.txt" 37 | download(data_path, data_url) 38 | open_data(data_path, file_name) do |input| 39 | data = +"" 40 | input.each_line do |line| 41 | next unless line.start_with?(" ") 42 | data << line.lstrip.gsub(/ +/, ",") 43 | end 44 | options = { 45 | converters: [:numeric], 46 | } 47 | CSV.parse(data, **options) do |row| 48 | yield(Record.new(*row)) 49 | end 50 | end 51 | end 52 | 53 | private 54 | def open_data(data_path, file_name) 55 | ZipExtractor.new(data_path).extract_first_file do |input| 56 | yield input 57 | end 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /lib/datasets/cifar.rb: -------------------------------------------------------------------------------- 1 | require_relative "tar-gz-readable" 2 | require_relative "dataset" 3 | 4 | module Datasets 5 | class CIFAR < Dataset 6 | include TarGzReadable 7 | 8 | module Pixelable 9 | def pixels 10 | data.unpack("C*") 11 | end 12 | 13 | def to_h 14 | hash = super 15 | hash[:pixels] = pixels 16 | hash 17 | end 18 | end 19 | 20 | class Record10 < Struct.new(:data, :label) 21 | include Pixelable 22 | end 23 | 24 | class Record100 < Struct.new(:data, :coarse_label, :fine_label) 25 | include Pixelable 26 | end 27 | 28 | def initialize(n_classes: 10, type: :train) 29 | unless [10, 100].include?(n_classes) 30 | message = "Please set n_classes 10 or 100: #{n_classes.inspect}" 31 | raise ArgumentError, message 32 | end 33 | unless [:train, :test].include?(type) 34 | message = "Please set type :train or :test: #{type.inspect}" 35 | raise ArgumentError, message 36 | end 37 | 38 | super() 39 | 40 | @metadata.id = "cifar-#{n_classes}" 41 | @metadata.name = "CIFAR-#{n_classes}" 42 | @metadata.url = "https://www.cs.toronto.edu/~kriz/cifar.html" 43 | @metadata.description = "CIFAR-#{n_classes} is 32x32 image dataset" 44 | 45 | @n_classes = n_classes 46 | @type = type 47 | end 48 | 49 | def each(&block) 50 | return to_enum(__method__) unless block_given? 51 | 52 | data_path = cache_dir_path + "cifar-#{@n_classes}.tar.gz" 53 | data_url = "https://www.cs.toronto.edu/~kriz/cifar-#{@n_classes}-binary.tar.gz" 54 | download(data_path, data_url) 55 | 56 | parse_data(data_path, &block) 57 | end 58 | 59 | private 60 | 61 | def parse_data(data_path, &block) 62 | open_tar_gz(data_path) do |tar| 63 | target_file_names.each do |target_file_name| 64 | tar.seek(target_file_name) do |entry| 65 | parse_entry(entry, &block) 66 | end 67 | end 68 | end 69 | end 70 | 71 | def target_file_names 72 | case @n_classes 73 | when 10 74 | prefix = 'cifar-10-batches-bin' 75 | case @type 76 | when :train 77 | [ 78 | "#{prefix}/data_batch_1.bin", 79 | "#{prefix}/data_batch_2.bin", 80 | "#{prefix}/data_batch_3.bin", 81 | "#{prefix}/data_batch_4.bin", 82 | "#{prefix}/data_batch_5.bin", 83 | ] 84 | when :test 85 | [ 86 | "#{prefix}/test_batch.bin" 87 | ] 88 | end 89 | when 100 90 | prefix = "cifar-100-binary" 91 | case @type 92 | when :train 93 | [ 94 | "#{prefix}/train.bin", 95 | ] 96 | when :test 97 | [ 98 | "#{prefix}/test.bin", 99 | ] 100 | end 101 | end 102 | end 103 | 104 | def parse_entry(entry) 105 | case @n_classes 106 | when 10 107 | loop do 108 | label = entry.read(1) 109 | break if label.nil? 
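          # (read(1) returns nil at EOF; each CIFAR-10 record is 1 label byte followed by 3072 = 32x32x3 pixel bytes)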
110 | label = label.unpack("C")[0] 111 | data = entry.read(3072) 112 | yield Record10.new(data, label) 113 | end 114 | when 100 115 | loop do 116 | coarse_label = entry.read(1) 117 | break if coarse_label.nil? 118 | coarse_label = coarse_label.unpack("C")[0] 119 | fine_label = entry.read(1).unpack("C")[0] 120 | data = entry.read(3072) 121 | yield Record100.new(data, coarse_label, fine_label) 122 | end 123 | end 124 | end 125 | end 126 | end 127 | 128 | -------------------------------------------------------------------------------- /lib/datasets/communities.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class Communities < Dataset 7 | Record = Struct.new( 8 | :state, 9 | :county, 10 | :community, 11 | :community_name, 12 | :fold, 13 | :population, 14 | :household_size, 15 | :race_percent_black, 16 | :race_percent_white, 17 | :race_percent_asian, 18 | :race_percent_hispanic, 19 | :age_percent_12_to_21, 20 | :age_percent_12_to_29, 21 | :age_percent_16_to_24, 22 | :age_percent_65_and_upper, 23 | :n_people_urban, 24 | :percent_people_urban, 25 | :median_income, 26 | :percent_households_with_wage, 27 | :percent_households_with_farm_self, 28 | :percent_households_with_investment_income, 29 | :percent_households_with_social_security, 30 | :percent_households_with_public_assistant, 31 | :percent_households_with_retire, 32 | :median_family_income, 33 | :per_capita_income, 34 | :per_capita_income_white, 35 | :per_capita_income_black, 36 | :per_capita_income_indian, 37 | :per_capita_income_asian, 38 | :per_capita_income_other, 39 | :per_capita_income_hispanic, 40 | :n_people_under_poverty, 41 | :percent_people_under_poverty, 42 | :percent_less_9th_grade, 43 | :percent_not_high_school_graduate, 44 | :percent_bachelors_or_more, 45 | :percent_unemployed, 46 | :percent_employed, 47 | :percent_employed_manufacturing, 48 | :percent_employed_professional_service, 49 | :percent_occupations_manufacturing, 50 | :percent_occupations_management_professional, 51 | :male_percent_divorced, 52 | :male_percent_never_married, 53 | :female_percent_divorced, 54 | :total_percent_divorced, 55 | :mean_persons_per_family, 56 | :percent_family_2_parents, 57 | :percent_kids_2_parents, 58 | :percent_young_kids_2_parents, 59 | :percent_teen_2_parents, 60 | :percent_work_mom_young_kids, 61 | :percent_work_mom, 62 | :n_illegals, 63 | :percent_illegals, 64 | :n_immigrants, 65 | :percent_immigrants_recent, 66 | :percent_immigrants_recent_5, 67 | :percent_immigrants_recent_8, 68 | :percent_immigrants_recent_10, 69 | :percent_population_immigranted_recent, 70 | :percent_population_immigranted_recent_5, 71 | :percent_population_immigranted_recent_8, 72 | :percent_population_immigranted_recent_10, 73 | :percent_speak_english_only, 74 | :percent_not_speak_english_well, 75 | :percent_large_households_family, 76 | :percent_large_households_occupied, 77 | :mean_persons_per_occupied_household, 78 | :mean_persons_per_owner_occupied_household, 79 | :mean_persons_per_rental_occupied_household, 80 | :percent_persons_owner_occupied_household, 81 | :percent_persons_dense_housing, 82 | :percent_housing_less_3_bedrooms, 83 | :median_n_bedrooms, 84 | :n_vacant_households, 85 | :percent_housing_occupied, 86 | :percent_housing_owner_occupied, 87 | :percent_vacant_housing_boarded, 88 | :percent_vacant_housing_more_6_months, 89 | :median_year_housing_built, 90 | :percent_housing_no_phone, 91 | :percent_housing_without_full_plumbing, 92 
| :owner_occupied_housing_lower_quartile, 93 | :owner_occupied_housing_median, 94 | :owner_occupied_housing_higher_quartile, 95 | :rental_housing_lower_quartile, 96 | :rental_housing_median, 97 | :rental_housing_higher_quartile, 98 | :median_rent, 99 | :median_rent_percent_household_income, 100 | :median_owner_cost_percent_household_income, 101 | :median_owner_cost_percent_household_income_no_mortgage, 102 | :n_people_shelter, 103 | :n_people_street, 104 | :percent_foreign_born, 105 | :percent_born_same_state, 106 | :percent_same_house_85, 107 | :percent_same_city_85, 108 | :percent_same_state_85, 109 | :lemas_sworn_full_time, 110 | :lemas_sworn_full_time_per_population, 111 | :lemas_sworn_full_time_field, 112 | :lemas_sworn_full_time_field_per_population, 113 | :lemas_total_requests, 114 | :lemas_total_requests_per_population, 115 | :total_requests_per_officer, 116 | :n_officers_per_population, 117 | :racial_match_community_police, 118 | :percent_police_white, 119 | :percent_police_black, 120 | :percent_police_hispanic, 121 | :percent_police_asian, 122 | :percent_police_minority, 123 | :n_officers_assigned_drug_units, 124 | :n_kinds_drugs_seized, 125 | :police_average_overtime_worked, 126 | :land_area, 127 | :population_density, 128 | :percent_use_public_transit, 129 | :n_police_cars, 130 | :n_police_operating_budget, 131 | :lemas_percent_police_on_patrol, 132 | :lemas_gang_unit_deployed, 133 | :lemas_percent_office_drug_units, 134 | :police_operating_budget_per_population, 135 | :total_violent_crimes_per_population 136 | ) 137 | 138 | def initialize 139 | super() 140 | @metadata.id = "communities" 141 | @metadata.name = "Communities" 142 | @metadata.url = "https://archive.ics.uci.edu/ml/datasets/communities+and+crime" 143 | @metadata.licenses = ["CC-BY-4.0"] 144 | @metadata.description = lambda do 145 | read_names 146 | end 147 | end 148 | 149 | def each 150 | return to_enum(__method__) unless block_given? 151 | 152 | open_data do |csv| 153 | csv.each do |row| 154 | row = row.collect.with_index do |column, i| 155 | if column == "?" 
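              # ("?" marks a missing value in the source data, so keep it as nil)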
156 | nil 157 | else 158 | case i 159 | when 3 # communityname 160 | # when 124 # LemasGangUnitDeploy 161 | # 0 means NO, 1 means YES, 0.5 means Part Time 162 | else 163 | column = Float(column) 164 | end 165 | column 166 | end 167 | end 168 | record = Record.new(*row) 169 | yield(record) 170 | end 171 | end 172 | end 173 | 174 | private 175 | def base_url 176 | "https://archive.ics.uci.edu/ml/machine-learning-databases/communities" 177 | end 178 | 179 | def open_data 180 | data_path = cache_dir_path + "communities.data" 181 | data_url = "#{base_url}/communities.data" 182 | download(data_path, data_url) 183 | CSV.open(data_path) do |csv| 184 | yield(csv) 185 | end 186 | end 187 | 188 | def read_names 189 | names_path = cache_dir_path + "communities.names" 190 | names_url = "#{base_url}/communities.names" 191 | download(names_path, names_url) 192 | names_path.read 193 | end 194 | end 195 | end 196 | -------------------------------------------------------------------------------- /lib/datasets/dataset.rb: -------------------------------------------------------------------------------- 1 | require "pathname" 2 | 3 | require_relative "cache-path" 4 | require_relative "downloader" 5 | require_relative "error" 6 | require_relative "metadata" 7 | require_relative "table" 8 | 9 | module Datasets 10 | class Dataset 11 | include Enumerable 12 | 13 | attr_reader :metadata 14 | def initialize 15 | @metadata = Metadata.new 16 | end 17 | 18 | def to_table 19 | Table.new(self) 20 | end 21 | 22 | def clear_cache! 23 | cache_path.remove 24 | end 25 | 26 | private 27 | 28 | def cache_dir_path 29 | cache_path.base_dir 30 | end 31 | 32 | def cache_path 33 | @cache_path ||= CachePath.new(@metadata.id) 34 | end 35 | 36 | def download(output_path, url, *fallback_urls, **options, &block) 37 | downloader = Downloader.new(url, *fallback_urls, **options) 38 | downloader.download(output_path, &block) 39 | end 40 | 41 | def extract_bz2(bz2) 42 | case bz2 43 | when Pathname, String 44 | IO.pipe do |input, output| 45 | pid = spawn("bzcat", bz2.to_s, {out: output}) 46 | begin 47 | output.close 48 | yield(input) 49 | ensure 50 | input.close 51 | Process.waitpid(pid) 52 | end 53 | end 54 | else 55 | IO.pipe do |bz2_input, bz2_output| 56 | IO.pipe do |plain_input, plain_output| 57 | bz2_stop = false 58 | bz2_thread = Thread.new do 59 | begin 60 | bz2.each do |chunk| 61 | bz2_output.write(chunk) 62 | bz2_output.flush 63 | break if bz2_stop 64 | end 65 | rescue => error 66 | message = "Failed to read bzcat input: " + 67 | "#{error.class}: #{error.message}" 68 | $stderr.puts(message) 69 | ensure 70 | bz2_output.close 71 | end 72 | end 73 | begin 74 | pid = spawn("bzcat", {in: bz2_input, out: plain_output}) 75 | begin 76 | bz2_input.close 77 | plain_output.close 78 | yield(plain_input) 79 | ensure 80 | plain_input.close 81 | Process.waitpid(pid) 82 | end 83 | ensure 84 | bz2_stop = true 85 | bz2_thread.join 86 | end 87 | end 88 | end 89 | end 90 | end 91 | end 92 | end 93 | -------------------------------------------------------------------------------- /lib/datasets/diamonds.rb: -------------------------------------------------------------------------------- 1 | require_relative "ggplot2-dataset" 2 | 3 | module Datasets 4 | class Diamonds < Ggplot2Dataset 5 | Record = Struct.new(:carat, 6 | :cut, 7 | :color, 8 | :clarity, 9 | :depth, 10 | :table, 11 | :price, 12 | :x, 13 | :y, 14 | :z) 15 | 16 | def initialize() 17 | super("diamonds") 18 | @metadata.id = "diamonds" 19 | @metadata.name = "Diamonds" 20 | @metadata.licenses = 
["CC0-1.0"] 21 | end 22 | 23 | COLUMN_NAME_MAPPING = { 24 | } 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/datasets/dictionary.rb: -------------------------------------------------------------------------------- 1 | module Datasets 2 | class Dictionary 3 | include Enumerable 4 | 5 | def initialize(values) 6 | build_dictionary(values) 7 | end 8 | 9 | def id(value) 10 | @value_to_id[value] 11 | end 12 | 13 | def value(id) 14 | @id_to_value[id] 15 | end 16 | 17 | def ids 18 | @id_to_value.keys 19 | end 20 | 21 | def values 22 | @id_to_value.values 23 | end 24 | 25 | def each(&block) 26 | @id_to_value.each(&block) 27 | end 28 | 29 | def size 30 | @id_to_value.size 31 | end 32 | alias_method :length, :size 33 | 34 | def encode(values) 35 | values.collect do |value| 36 | id(value) 37 | end 38 | end 39 | 40 | def decode(ids) 41 | ids.collect do |id| 42 | value(id) 43 | end 44 | end 45 | 46 | private 47 | def build_dictionary(values) 48 | @id_to_value = {} 49 | @value_to_id = {} 50 | id = 0 51 | values.each do |value| 52 | next if @value_to_id.key?(value) 53 | @id_to_value[id] = value 54 | @value_to_id[value] = id 55 | id += 1 56 | end 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/datasets/error.rb: -------------------------------------------------------------------------------- 1 | module Datasets 2 | class Error < StandardError 3 | end 4 | end 5 | -------------------------------------------------------------------------------- /lib/datasets/fashion-mnist.rb: -------------------------------------------------------------------------------- 1 | require_relative 'mnist' 2 | 3 | module Datasets 4 | class FashionMNIST < MNIST 5 | private 6 | def base_urls 7 | [ 8 | "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/", 9 | ] 10 | end 11 | 12 | def dataset_name 13 | "Fashion-MNIST" 14 | end 15 | 16 | def licenses 17 | ["MIT"] 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/datasets/fuel-economy.rb: -------------------------------------------------------------------------------- 1 | require_relative "ggplot2-dataset" 2 | 3 | module Datasets 4 | class FuelEconomy < Ggplot2Dataset 5 | Record = Struct.new(:manufacturer, 6 | :model, 7 | :displacement, 8 | :year, 9 | :n_cylinders, 10 | :transmission, 11 | :drive_train, 12 | :city_mpg, 13 | :highway_mpg, 14 | :fuel, 15 | :type) 16 | 17 | def initialize 18 | super("mpg") 19 | @metadata.id = "fuel-economy" 20 | @metadata.name = "Fuel economy" 21 | @metadata.licenses = ["CC0-1.0"] 22 | end 23 | 24 | COLUMN_NAME_MAPPING = { 25 | "displ" => "displacement", 26 | "cyl" => "n_cylinders", 27 | "trans" => "transmissions", 28 | "drv" => "drive_train", 29 | "cty" => "city_mpg", 30 | "hwy" => "highway_mpg", 31 | "fl" => "fuel", 32 | "class" => "type", 33 | } 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/datasets/geolonia.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | 3 | require_relative 'dataset' 4 | 5 | module Datasets 6 | class Geolonia < Dataset 7 | Record = Struct.new(:prefecture_code, 8 | :prefecture_name, 9 | :prefecture_kana, 10 | :prefecture_romaji, 11 | :municipality_code, 12 | :municipality_name, 13 | :municipality_kana, 14 | :municipality_romaji, 15 | :street_name, 16 | :street_kana, 17 | :street_romaji, 18 | :alias, 19 | 
:latitude, 20 | :longitude) 21 | 22 | def initialize 23 | super 24 | @metadata.id = 'geolonia' 25 | @metadata.name = 'Geolonia' 26 | @metadata.url = 'https://github.com/geolonia/japanese-addresses' 27 | @metadata.licenses = ["CC-BY-4.0"] 28 | @metadata.description = lambda do 29 | fetch_readme 30 | end 31 | end 32 | 33 | def each 34 | return to_enum(__method__) unless block_given? 35 | 36 | open_data do |csv| 37 | csv.readline 38 | csv.each do |row| 39 | record = Record.new(*row) 40 | yield(record) 41 | end 42 | end 43 | end 44 | 45 | private 46 | def download_base_url 47 | "https://raw.githubusercontent.com/geolonia/japanese-addresses/master" 48 | end 49 | 50 | def open_data 51 | data_path = cache_dir_path + 'latest.csv' 52 | data_url = "#{download_base_url}/data/latest.csv" 53 | download(data_path, data_url) 54 | CSV.open(data_path) do |csv| 55 | yield(csv) 56 | end 57 | end 58 | 59 | def fetch_readme 60 | readme_base_name = "README.md" 61 | readme_path = cache_dir_path + readme_base_name 62 | readme_url = "#{download_base_url}/#{readme_base_name}" 63 | download(readme_path, readme_url) 64 | readme_path.read.split(/^## API/, 2)[0].strip 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/datasets/ggplot2-dataset.rb: -------------------------------------------------------------------------------- 1 | module Datasets 2 | class Ggplot2Dataset < Dataset 3 | def initialize(ggplot2_dataset_name) 4 | super() 5 | @ggplot2_dataset_name = ggplot2_dataset_name 6 | @metadata.url = 7 | "https://ggplot2.tidyverse.org/reference/#{@ggplot2_dataset_name}.html" 8 | @metadata.description = lambda do 9 | fetch_description 10 | end 11 | end 12 | 13 | def each 14 | return to_enum(__method__) unless block_given? 15 | 16 | data_base_name = "#{@ggplot2_dataset_name}.csv" 17 | data_path = cache_dir_path + data_base_name 18 | data_url = "#{download_base_url}/data-raw/#{data_base_name}" 19 | download(data_path, data_url) 20 | CSV.open(data_path, headers: :first_row, converters: :numeric) do |csv| 21 | record_class = self.class::Record 22 | csv.each do |row| 23 | record = record_class.new(*row.fields) 24 | yield record 25 | end 26 | end 27 | end 28 | 29 | private 30 | def download_base_url 31 | "https://raw.githubusercontent.com/tidyverse/ggplot2/main" 32 | end 33 | 34 | def fetch_description 35 | data_r_base_name = "data.R" 36 | data_r_path = cache_dir_path + data_r_base_name 37 | data_r_url = "#{download_base_url}/R/#{data_r_base_name}" 38 | download(data_r_path, data_r_url) 39 | descriptions = {} 40 | comment = +"" 41 | File.open(data_r_path) do |data_r| 42 | data_r.each_line do |line| 43 | case line.chomp 44 | when /\A#'/ 45 | comment_content = Regexp.last_match.post_match 46 | unless comment_content.empty? 
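            # (drop the single space that follows the "#'" roxygen marker)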
47 | comment_content = comment_content[1..-1] 48 | end 49 | comment << comment_content 50 | comment << "\n" 51 | when /\A"(.+)"\z/ 52 | name = Regexp.last_match[1] 53 | descriptions[name] = parse_roxygen(comment.rstrip) 54 | comment = +"" 55 | end 56 | end 57 | descriptions[@ggplot2_dataset_name] 58 | end 59 | end 60 | 61 | def parse_roxygen(roxygen) 62 | column_name_mapping = self.class::COLUMN_NAME_MAPPING 63 | roxygen 64 | .gsub(/\\url\{(.*?)\}/, "\\1") 65 | .gsub(/^@format /, "") 66 | .gsub(/\\describe\{(.*)\}/m) do 67 | content = $1 68 | content.gsub(/\\item\{(.*?)\}\{(.*?)\}/m) do 69 | column_name = $1 70 | description = $2 71 | column_name = column_name_mapping[column_name] || column_name 72 | description = description 73 | .gsub(/\\\$/, "$") 74 | "* #{column_name}: #{description}" 75 | end 76 | end 77 | end 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /lib/datasets/hepatitis.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class Hepatitis < Dataset 7 | class Record < Struct.new(:label, 8 | :age, 9 | :sex, 10 | :steroid, 11 | :antivirals, 12 | :fatigue, 13 | :malaise, 14 | :anorexia, 15 | :liver_big, 16 | :liver_firm, 17 | :spleen_palpable, 18 | :spiders, 19 | :ascites, 20 | :varices, 21 | :bilirubin, 22 | :alkaline_phosphate, 23 | :sgot, 24 | :albumin, 25 | :protime, 26 | :histology) 27 | def initialize(*values) 28 | super() 29 | members.zip(values) do |member, value| 30 | __send__("#{member}=", value) 31 | end 32 | end 33 | 34 | def label=(label) 35 | case label 36 | when "1" 37 | super(:die) 38 | when "2" 39 | super(:live) 40 | else 41 | super(label) 42 | end 43 | end 44 | 45 | def age=(age) 46 | super(normalize_integer(age)) 47 | end 48 | 49 | def sex=(sex) 50 | case sex 51 | when "1" 52 | super(:male) 53 | when "2" 54 | super(:female) 55 | else 56 | super(sex) 57 | end 58 | end 59 | 60 | def steroid=(steroid) 61 | super(normalize_boolean(steroid)) 62 | end 63 | 64 | def antivirals=(antivirals) 65 | super(normalize_boolean(antivirals)) 66 | end 67 | 68 | def fatigue=(fatigue) 69 | super(normalize_boolean(fatigue)) 70 | end 71 | 72 | def malaise=(malaise) 73 | super(normalize_boolean(malaise)) 74 | end 75 | 76 | def anorexia=(anorexia) 77 | super(normalize_boolean(anorexia)) 78 | end 79 | 80 | def liver_big=(liver_big) 81 | super(normalize_boolean(liver_big)) 82 | end 83 | 84 | def liver_firm=(liver_firm) 85 | super(normalize_boolean(liver_firm)) 86 | end 87 | 88 | def spleen_palpable=(spleen_palpable) 89 | super(normalize_boolean(spleen_palpable)) 90 | end 91 | 92 | def spiders=(spiders) 93 | super(normalize_boolean(spiders)) 94 | end 95 | 96 | def ascites=(ascites) 97 | super(normalize_boolean(ascites)) 98 | end 99 | 100 | def varices=(varices) 101 | super(normalize_boolean(varices)) 102 | end 103 | 104 | def bilirubin=(bilirubin) 105 | super(normalize_float(bilirubin)) 106 | end 107 | 108 | def alkaline_phosphate=(alkaline_phosphate) 109 | super(normalize_integer(alkaline_phosphate)) 110 | end 111 | 112 | def sgot=(sgot) 113 | super(normalize_integer(sgot)) 114 | end 115 | 116 | def albumin=(albumin) 117 | super(normalize_float(albumin)) 118 | end 119 | 120 | def protime=(protime) 121 | super(normalize_integer(protime)) 122 | end 123 | 124 | def histology=(histology) 125 | super(normalize_boolean(histology)) 126 | end 127 | 128 | private 129 | def normalize_boolean(value) 130 | case value 131 | when "?" 
132 | nil 133 | when "1" 134 | false 135 | when "2" 136 | true 137 | else 138 | value 139 | end 140 | end 141 | 142 | def normalize_float(value) 143 | case value 144 | when "?" 145 | nil 146 | else 147 | Float(value) 148 | end 149 | end 150 | 151 | def normalize_integer(value) 152 | case value 153 | when "?" 154 | nil 155 | else 156 | Integer(value, 10) 157 | end 158 | end 159 | end 160 | 161 | def initialize 162 | super() 163 | @metadata.id = "hepatitis" 164 | @metadata.name = "Hepatitis" 165 | @metadata.url = "https://archive.ics.uci.edu/ml/datasets/hepatitis" 166 | @metadata.licenses = ["CC-BY-4.0"] 167 | @metadata.description = lambda do 168 | read_names 169 | end 170 | end 171 | 172 | def each 173 | return to_enum(__method__) unless block_given? 174 | 175 | open_data do |csv| 176 | csv.each do |row| 177 | record = Record.new(*row) 178 | yield(record) 179 | end 180 | end 181 | end 182 | 183 | private 184 | def base_url 185 | "https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis" 186 | end 187 | 188 | def open_data 189 | data_path = cache_dir_path + "hepatitis.csv" 190 | data_url = "#{base_url}/hepatitis.data" 191 | download(data_path, data_url) 192 | CSV.open(data_path) do |csv| 193 | yield(csv) 194 | end 195 | end 196 | 197 | def read_names 198 | names_path = cache_dir_path + "hepatitis.names" 199 | names_url = "#{base_url}/hepatitis.names" 200 | download(names_path, names_url) 201 | names_path.read 202 | end 203 | end 204 | end 205 | -------------------------------------------------------------------------------- /lib/datasets/house-of-representative.rb: -------------------------------------------------------------------------------- 1 | require_relative "dataset" 2 | require_relative "japanese-date-parser" 3 | 4 | module Datasets 5 | class HouseOfRepresentative < Dataset 6 | Record = Struct.new(:carry_time, 7 | :caption, 8 | :type, 9 | :submit_time, 10 | :submit_number, 11 | :title, 12 | :discussion_status, 13 | :progress, 14 | :progress_url, 15 | :text, 16 | :text_url, 17 | :bill_type, 18 | :submitter, 19 | :submitter_in_house_groups, 20 | :house_of_representatives_of_accepted_bill_on_preliminary_consideration, 21 | :house_of_representatives_of_preliminary_refer_on, 22 | :house_of_representatives_of_preliminary_refer_commission, 23 | :house_of_representatives_of_accepted_bill_on, 24 | :house_of_representatives_of_refer_on, 25 | :house_of_representatives_of_refer_commission, 26 | :house_of_representatives_of_finished_consideration_on, 27 | :house_of_representatives_of_consideration_result, 28 | :house_of_representatives_of_finished_deliberation_on, 29 | :house_of_representatives_of_deliberation_result, 30 | :house_of_representatives_of_attitude_of_in_house_group_during_deliberation, 31 | :house_of_representatives_of_support_in_house_group_during_deliberation, 32 | :house_of_representatives_of_opposition_in_house_group_during_deliberation, 33 | :house_of_councillors_of_accepted_bill_on_preliminary_consideration, 34 | :house_of_councillors_of_preliminary_refer_on, 35 | :house_of_councillors_of_preliminary_refer_commission, 36 | :house_of_councillors_of_accepted_bill_on, 37 | :house_of_councillors_of_refer_on, 38 | :house_of_councillors_of_refer_commission, 39 | :house_of_councillors_of_finished_consideration_on, 40 | :house_of_councillors_of_consideration_result, 41 | :house_of_councillors_of_finished_deliberation_on, 42 | :house_of_councillors_of_deliberation_result, 43 | :promulgated_on, 44 | :law_number, 45 | :submitters, 46 | :supporters_of_submitted_bill) 47 | 48 | 
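# Usage sketch (illustrative annotation, not part of the upstream file):
# each yielded Record is one bill, with the Japanese-era date columns
# already parsed into Date values by open_data below.
#
#   Datasets::HouseOfRepresentative.new.each do |bill|
#     p [bill.title, bill.promulgated_on, bill.law_number]
#   end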
def initialize 49 | super() 50 | 51 | @metadata.id = "house-of-representative" 52 | @metadata.name = "Bill of the House of Representatives of Japan" 53 | @metadata.url = "https://smartnews-smri.github.io/house-of-representatives" 54 | @metadata.licenses = ["MIT"] 55 | @metadata.description = "Bill of the House of Representatives of Japan" 56 | end 57 | 58 | def each 59 | return to_enum(__method__) unless block_given? 60 | 61 | open_data do |csv| 62 | csv.each do |row| 63 | record = Record.new(*row.fields) 64 | yield(record) 65 | end 66 | end 67 | end 68 | 69 | private 70 | 71 | def open_data 72 | data_url = "https://raw.githubusercontent.com/smartnews-smri/house-of-representatives/main/data/gian.csv" 73 | data_path = cache_dir_path + "gian.csv" 74 | download(data_path, data_url) 75 | 76 | parser = JapaneseDateParser.new 77 | japanese_date_converter = lambda do |field, info| 78 | if info.header.end_with?("年月日") 79 | parser.parse(field) 80 | else 81 | field 82 | end 83 | end 84 | array_converter = lambda do |field, info| 85 | case info.header 86 | when "議案提出会派", "衆議院審議時賛成会派", "衆議院審議時反対会派", "議案提出者一覧", "議案提出の賛成者" 87 | parse_array(field) 88 | else 89 | field 90 | end 91 | end 92 | File.open(data_path) do |data_file| 93 | options = { 94 | col_sep: ",", 95 | headers: true, 96 | converters: [:integer, japanese_date_converter, array_converter], 97 | } 98 | # Some fields pack two values into one column separated by "/". Replacing "/" with "," lets CSV split them into separate columns. 99 | yield(CSV.new(data_file.read.gsub("/", ","), **options)) 100 | end 101 | end 102 | 103 | def parse_array(column_value) 104 | column_value&.split("; ") 105 | end 106 | end 107 | end 108 | -------------------------------------------------------------------------------- /lib/datasets/iris.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class Iris < Dataset 7 | Record = Struct.new(:sepal_length, 8 | :sepal_width, 9 | :petal_length, 10 | :petal_width, 11 | :label) 12 | 13 | def initialize 14 | super() 15 | @metadata.id = "iris" 16 | @metadata.name = "Iris" 17 | @metadata.url = "https://archive.ics.uci.edu/ml/datasets/Iris" 18 | @metadata.licenses = ["CC-BY-4.0"] 19 | @metadata.description = lambda do 20 | read_names 21 | end 22 | end 23 | 24 | def each 25 | return to_enum(__method__) unless block_given? 26 | 27 | open_data do |csv| 28 | csv.each do |row| 29 | next if row[0].nil?
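# iris.data ends with an empty line that CSV yields as an empty row;
# the guard above skips it so every Record has all five fields.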
30 | record = Record.new(*row) 31 | yield(record) 32 | end 33 | end 34 | end 35 | 36 | private 37 | def open_data 38 | data_path = cache_dir_path + "iris.csv" 39 | data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data" 40 | download(data_path, data_url) 41 | CSV.open(data_path, converters: [:numeric]) do |csv| 42 | yield(csv) 43 | end 44 | end 45 | 46 | def read_names 47 | names_path = cache_dir_path + "iris.names" 48 | names_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names" 49 | download(names_path, names_url) 50 | names_path.read 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/datasets/ita-corpus.rb: -------------------------------------------------------------------------------- 1 | require_relative 'dataset' 2 | 3 | module Datasets 4 | class ITACorpus < Dataset 5 | Record = Struct.new(:id, 6 | :sentence) 7 | 8 | def initialize(type: :emotion) 9 | unless [:emotion, :recitation].include?(type) 10 | raise ArgumentError, "Please set type :emotion or :recitation: #{type.inspect}" 11 | end 12 | 13 | super() 14 | @type = type 15 | @metadata.id = 'ita-corpus' 16 | @metadata.name = 'ITA-corpus' 17 | @metadata.url = 'https://github.com/mmorise/ita-corpus' 18 | @metadata.licenses = ['Unlicense'] 19 | @metadata.description = lambda do 20 | fetch_readme 21 | end 22 | end 23 | 24 | def each(&block) 25 | return to_enum(__method__) unless block_given? 26 | 27 | data_path = cache_dir_path + "#{@type}_transcript_utf8.txt" 28 | data_url = "#{download_base_url}/#{@type}_transcript_utf8.txt" 29 | download(data_path, data_url) 30 | 31 | parse_data(data_path, &block) 32 | end 33 | 34 | private 35 | def fetch_readme 36 | readme_base_name = "README.md" 37 | readme_path = cache_dir_path + readme_base_name 38 | readme_url = "#{download_base_url}/#{readme_base_name}" 39 | download(readme_path, readme_url) 40 | readme_path.read.split(/^## ファイル構成/, 2)[0].strip 41 | end 42 | 43 | def download_base_url 44 | "https://raw.githubusercontent.com/mmorise/ita-corpus/main" 45 | end 46 | 47 | def parse_data(data_path) 48 | File.open(data_path) do |f| 49 | f.each_line(chomp: true) do |line| 50 | id, sentence = line.split(':', 2) 51 | record = Record.new(id, sentence) 52 | yield(record) 53 | end 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/datasets/japanese-date-parser.rb: -------------------------------------------------------------------------------- 1 | module Datasets 2 | class JapaneseDateParser 3 | class UnsupportedEraInitialRange < Error; end 4 | 5 | ERA_INITIALS = { 6 | "平成" => "H", 7 | "令和" => "R", 8 | }.freeze 9 | 10 | def parse(string) 11 | case string 12 | when nil 13 | nil 14 | when /\A(平成|令和|..)\s*(\d{1,2}|元)年\s*(\d{1,2})月\s*(\d{1,2})日\z/ 15 | match_data = Regexp.last_match 16 | era_initial = ERA_INITIALS[match_data[1]] 17 | if era_initial.nil?
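# The pattern above also matches any two leading characters as an era
# name, so unsupported eras (e.g. 昭和) land here; report the supported
# ones instead of guessing.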
18 | message = +"era must be one of [" 19 | message << ERA_INITIALS.keys.join(", ") 20 | message << "]: #{match_data[1]}" 21 | raise UnsupportedEraInitialRange, message 22 | end 23 | 24 | year = match_data[2] 25 | if year == "元" 26 | year = "01" 27 | else 28 | year = year.rjust(2, "0") 29 | end 30 | month = match_data[3].rjust(2, "0") 31 | day = match_data[4].rjust(2, "0") 32 | Date.jisx0301("#{era_initial}#{year}.#{month}.#{day}") 33 | else 34 | string 35 | end 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /lib/datasets/kuzushiji-mnist.rb: -------------------------------------------------------------------------------- 1 | require_relative 'mnist' 2 | 3 | module Datasets 4 | class KuzushijiMNIST < MNIST 5 | private 6 | def base_urls 7 | [ 8 | "http://codh.rois.ac.jp/kmnist/dataset/kmnist/", 9 | ] 10 | end 11 | 12 | def dataset_name 13 | "Kuzushiji-MNIST" 14 | end 15 | 16 | def licenses 17 | ["CC-BY-SA-4.0"] 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/datasets/lazy.rb: -------------------------------------------------------------------------------- 1 | require_relative "version" 2 | 3 | module Datasets 4 | class LazyLoader 5 | def initialize 6 | @constants = {} 7 | end 8 | 9 | def exist?(constant_name) 10 | @constants.key?(constant_name) 11 | end 12 | 13 | def load(constant_name) 14 | feature = @constants[constant_name] 15 | raise LoadError, "unknown dataset: #{constant_name}" unless feature 16 | require feature 17 | end 18 | 19 | def load_all 20 | @constants.each_value do |feature| 21 | require feature 22 | end 23 | end 24 | 25 | def register(constant_name, feature) 26 | @constants[constant_name] = feature 27 | end 28 | 29 | def constant_names 30 | @constants.keys 31 | end 32 | end 33 | 34 | LAZY_LOADER = LazyLoader.new 35 | 36 | class << self 37 | def const_missing(name) 38 | if LAZY_LOADER.exist?(name) 39 | LAZY_LOADER.load(name) 40 | const_get(name) 41 | else 42 | super 43 | end 44 | end 45 | end 46 | 47 | LAZY_LOADER.register(:Adult, "datasets/adult") 48 | LAZY_LOADER.register(:AFINN, "datasets/afinn") 49 | LAZY_LOADER.register(:AozoraBunko, "datasets/aozora-bunko") 50 | LAZY_LOADER.register(:CaliforniaHousing, "datasets/california-housing") 51 | LAZY_LOADER.register(:CIFAR, "datasets/cifar") 52 | LAZY_LOADER.register(:CLDRPlurals, "datasets/cldr-plurals") 53 | LAZY_LOADER.register(:Communities, "datasets/communities") 54 | LAZY_LOADER.register(:Diamonds, "datasets/diamonds") 55 | LAZY_LOADER.register(:EStatJapan, "datasets/e-stat-japan") 56 | LAZY_LOADER.register(:FashionMNIST, "datasets/fashion-mnist") 57 | LAZY_LOADER.register(:FuelEconomy, "datasets/fuel-economy") 58 | LAZY_LOADER.register(:Geolonia, "datasets/geolonia") 59 | LAZY_LOADER.register(:Hepatitis, "datasets/hepatitis") 60 | LAZY_LOADER.register(:HouseOfCouncillor, "datasets/house-of-councillor") 61 | LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative") 62 | LAZY_LOADER.register(:Iris, "datasets/iris") 63 | LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus") 64 | LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist") 65 | LAZY_LOADER.register(:LIBSVM, "datasets/libsvm") 66 | LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list") 67 | LAZY_LOADER.register(:LivedoorNews, "datasets/livedoor-news") 68 | LAZY_LOADER.register(:MNIST, "datasets/mnist") 69 | LAZY_LOADER.register(:Mushroom, "datasets/mushroom") 70 | 
LAZY_LOADER.register(:NagoyaUniversityConversationCorpus, 71 | "datasets/nagoya-university-conversation-corpus") 72 | LAZY_LOADER.register(:Penguins, "datasets/penguins") 73 | LAZY_LOADER.register(:PennTreebank, "datasets/penn-treebank") 74 | LAZY_LOADER.register(:PMJTDatasetList, "datasets/pmjt-dataset-list") 75 | LAZY_LOADER.register(:PostalCodeJapan, "datasets/postal-code-japan") 76 | LAZY_LOADER.register(:QuoraDuplicateQuestionPair, 77 | "datasets/quora-duplicate-question-pair") 78 | LAZY_LOADER.register(:RdatasetList, "datasets/rdataset") 79 | # For backward compatibility 80 | LAZY_LOADER.register(:RdatasetsList, "datasets/rdataset") 81 | LAZY_LOADER.register(:Rdataset, "datasets/rdataset") 82 | # For backward compatibility 83 | LAZY_LOADER.register(:Rdatasets, "datasets/rdataset") 84 | LAZY_LOADER.register(:SeabornList, "datasets/seaborn") 85 | LAZY_LOADER.register(:Seaborn, "datasets/seaborn") 86 | LAZY_LOADER.register(:SudachiSynonymDictionary, 87 | "datasets/sudachi-synonym-dictionary") 88 | LAZY_LOADER.register(:Wikipedia, "datasets/wikipedia") 89 | LAZY_LOADER.register(:WikipediaKyotoJapaneseEnglish, 90 | "datasets/wikipedia-kyoto-japanese-english") 91 | LAZY_LOADER.register(:Wine, "datasets/wine") 92 | end 93 | -------------------------------------------------------------------------------- /lib/datasets/libsvm.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class LIBSVM < Dataset 7 | class Record 8 | attr_reader :label 9 | attr_reader :features 10 | def initialize(label, features) 11 | @label = label 12 | @features = features 13 | end 14 | 15 | def [](index) 16 | @features[index] 17 | end 18 | 19 | def to_h 20 | hash = { 21 | label: @label, 22 | } 23 | @features.each_with_index do |feature, i| 24 | hash[i] = feature 25 | end 26 | hash 27 | end 28 | 29 | def values 30 | [@label] + @features 31 | end 32 | end 33 | 34 | def initialize(name, 35 | note: nil, 36 | default_feature_value: 0) 37 | super() 38 | @libsvm_dataset_metadata = fetch_dataset_info(name) 39 | @file = choose_file(note) 40 | @default_feature_value = default_feature_value 41 | @metadata.id = "libsvm-#{normalize_name(name)}" 42 | @metadata.name = "LIBSVM dataset: #{name}" 43 | @metadata.url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/" 44 | @metadata.licenses = ["BSD-3-Clause"] 45 | end 46 | 47 | def each 48 | return to_enum(__method__) unless block_given? 49 | 50 | open_data do |input| 51 | n_features = @libsvm_dataset_metadata.n_features 52 | csv = CSV.new(input, col_sep: " ") 53 | csv.each do |row| 54 | label = parse_label(row.shift) 55 | features = [@default_feature_value] * n_features 56 | row.each do |column| 57 | next if column.nil? 
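# Each remaining column is a sparse "index:value" pair; LIBSVM indices
# are 1-based, hence the -1 below when filling the dense feature array.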
58 | index, value = column.split(":", 2) 59 | features[Integer(index, 10) - 1] = parse_value(value) 60 | end 61 | yield(Record.new(label, features)) 62 | end 63 | end 64 | end 65 | 66 | private 67 | def fetch_dataset_info(name) 68 | list = LIBSVMDatasetList.new 69 | available_datasets = [] 70 | list.each do |record| 71 | available_datasets << record.name 72 | if record.name == name 73 | return record 74 | end 75 | end 76 | message = "unavailable LIBSVM dataset: #{name.inspect}: " 77 | message << "available datasets: [" 78 | message << available_datasets.collect(&:inspect).join(", ") 79 | message << "]" 80 | raise ArgumentError, message 81 | end 82 | 83 | def choose_file(note) 84 | files = @libsvm_dataset_metadata.files 85 | return files.first if note.nil? 86 | 87 | available_notes = [] 88 | @libsvm_dataset_metadata.files.find do |file| 89 | return file if file.note == note 90 | available_notes << file.note if file.note 91 | end 92 | 93 | name = @libsvm_dataset_metadata.name 94 | message = "unavailable note: #{name}: #{note.inspect}: " 95 | message << "available notes: [" 96 | message << available_notes.collect(&:inspect).join(", ") 97 | message << "]" 98 | raise ArgumentError, message 99 | end 100 | 101 | def open_data(&block) 102 | data_path = cache_dir_path + @file.name 103 | download(data_path, @file.url) 104 | if data_path.extname == ".bz2" 105 | extract_bz2(data_path, &block) 106 | else 107 | data_path.open(&block) 108 | end 109 | end 110 | 111 | def normalize_name(name) 112 | name.gsub(/[()]/, "").gsub(/[ _;]+/, "-").downcase 113 | end 114 | 115 | def parse_label(label) 116 | labels = label.split(",").collect do |value| 117 | parse_value(value) 118 | end 119 | if labels.size == 1 120 | labels[0] 121 | else 122 | labels 123 | end 124 | end 125 | 126 | def parse_value(value) 127 | if value.include?(".") 128 | Float(value) 129 | else 130 | Integer(value, 10) 131 | end 132 | end 133 | end 134 | end 135 | -------------------------------------------------------------------------------- /lib/datasets/license.rb: -------------------------------------------------------------------------------- 1 | module Datasets 2 | class License < Struct.new(:spdx_id, 3 | :name, 4 | :url) 5 | class << self 6 | def try_convert(value) 7 | case value 8 | when self 9 | value 10 | when String 11 | license = new 12 | license.spdx_id = value 13 | license 14 | when Hash 15 | license = new 16 | license.spdx_id = value[:spdx_id] 17 | license.name = value[:name] 18 | license.url = value[:url] 19 | license 20 | else 21 | nil 22 | end 23 | end 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/datasets/livedoor-news.rb: -------------------------------------------------------------------------------- 1 | require_relative "dataset" 2 | require_relative "tar-gz-readable" 3 | 4 | module Datasets 5 | class LivedoorNews < Dataset 6 | include TarGzReadable 7 | Record = Struct.new(:url, 8 | :timestamp, 9 | :sentence) 10 | 11 | def initialize(type: :topic_news) 12 | news_list = [ 13 | :topic_news, 14 | :sports_watch, 15 | :it_life_hack, 16 | :kaden_channel, 17 | :movie_enter, 18 | :dokujo_tsushin, 19 | :smax, 20 | :livedoor_homme, 21 | :peachy 22 | ] 23 | unless news_list.include?(type) 24 | valid_type_labels = news_list.collect(&:inspect).join(", ") 25 | message = ":type must be one of [#{valid_type_labels}]: #{type.inspect}" 26 | raise ArgumentError, message 27 | end 28 | 29 | super() 30 | @type = type 31 | @metadata.id = 'livedoor-news' 32 | @metadata.name = 
'livedoor-news' 33 | @metadata.url = 'https://www.rondhuit.com/download.html#ldcc' 34 | @metadata.licenses = ['CC-BY-ND-2.1-JP'] 35 | @metadata.description = lambda do 36 | fetch_readme 37 | end 38 | end 39 | 40 | def each(&block) 41 | return to_enum(__method__) unless block_given? 42 | 43 | data_path = download_tar_gz 44 | parse_data(data_path, &block) 45 | end 46 | 47 | private 48 | def download_tar_gz 49 | data_path = cache_dir_path + "livedoor-news.tar.gz" 50 | data_url = "https://www.rondhuit.com/download/ldcc-20140209.tar.gz" 51 | download(data_path, data_url) 52 | data_path 53 | end 54 | 55 | def fetch_readme 56 | data_path = download_tar_gz 57 | target_file_name = 'text/README.txt' 58 | open_tar_gz(data_path) do |tar| 59 | tar.seek(target_file_name) do |entry| 60 | return entry.read.force_encoding("UTF-8") 61 | end 62 | end 63 | end 64 | 65 | def parse_data(data_path, &block) 66 | target_directory_name = "text/#{@type.to_s.gsub(/_/, '-')}" 67 | open_tar_gz(data_path) do |tar| 68 | tar.each do |entry| 69 | next unless entry.file? 70 | directory_name, base_name = File.split(entry.full_name) 71 | next unless directory_name == target_directory_name 72 | next if base_name == "LICENSE.txt" 73 | url, timestamp, sentence = entry.read.force_encoding("UTF-8").split("\n", 3) 74 | record = Record.new(url, Time.iso8601(timestamp), sentence) 75 | yield(record) 76 | end 77 | end 78 | end 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /lib/datasets/metadata.rb: -------------------------------------------------------------------------------- 1 | require_relative "license" 2 | 3 | module Datasets 4 | class Metadata < Struct.new(:id, 5 | :name, 6 | :url, 7 | :licenses, 8 | :description) 9 | def licenses=(licenses) 10 | licenses = [licenses] unless licenses.is_a?(Array) 11 | licenses = licenses.collect do |license| 12 | l = License.try_convert(license) 13 | if l.nil? 14 | raise ArgumentError.new("invalid license: #{license.inspect}") 15 | end 16 | l 17 | end 18 | super(licenses) 19 | end 20 | 21 | def description 22 | description_raw = super 23 | if description_raw.respond_to?(:call) 24 | self.description = description_raw = description_raw.call 25 | end 26 | description_raw 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/datasets/mnist.rb: -------------------------------------------------------------------------------- 1 | require 'zlib' 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class MNIST < Dataset 7 | class Record < Struct.new(:data, :label) 8 | def pixels 9 | data.unpack("C*") 10 | end 11 | 12 | def to_h 13 | hash = super 14 | hash[:pixels] = pixels 15 | hash 16 | end 17 | end 18 | 19 | def initialize(type: :train) 20 | unless [:train, :test].include?(type) 21 | raise ArgumentError, "Please set type :train or :test: #{type.inspect}" 22 | end 23 | 24 | super() 25 | 26 | @metadata.id = "#{dataset_name.downcase}-#{type}" 27 | @metadata.name = "#{dataset_name}: #{type}" 28 | @metadata.url = base_urls.first 29 | @metadata.licenses = licenses 30 | @type = type 31 | 32 | case type 33 | when :train 34 | @metadata.description = "a training set of 60,000 examples" 35 | when :test 36 | @metadata.description = "a test set of 10,000 examples" 37 | end 38 | end 39 | 40 | def each(&block) 41 | return to_enum(__method__) unless block_given? 
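# MNIST ships images and labels as separate gzipped IDX files; fetch
# both, then open_data joins them by position while parsing.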
42 | 43 | image_path = cache_dir_path + target_file(:image) 44 | label_path = cache_dir_path + target_file(:label) 45 | 46 | download(image_path, 47 | *base_urls.collect { |base_url| base_url + target_file(:image) }) 48 | download(label_path, 49 | *base_urls.collect { |base_url| base_url + target_file(:label) }) 50 | 51 | open_data(image_path, label_path, &block) 52 | end 53 | 54 | private 55 | def base_urls 56 | [ 57 | "https://ossci-datasets.s3.amazonaws.com/mnist/", 58 | ] 59 | end 60 | 61 | def licenses 62 | [] 63 | end 64 | 65 | def open_data(image_path, label_path, &block) 66 | labels = parse_labels(label_path) 67 | 68 | Zlib::GzipReader.open(image_path) do |f| 69 | n_uint32s = 4 70 | n_bytes = n_uint32s * 4 71 | mnist_magic_number = 2051 72 | magic, n_images, n_rows, n_cols = f.read(n_bytes).unpack("N*") 73 | if magic != mnist_magic_number 74 | raise Error, "This is not a #{dataset_name} image file" 75 | end 76 | n_images.times do |i| 77 | data = f.read(n_rows * n_cols) 78 | label = labels[i] 79 | yield Record.new(data, label) 80 | end 81 | end 82 | end 83 | 84 | def target_file(data) 85 | case @type 86 | when :train 87 | case data 88 | when :image 89 | "train-images-idx3-ubyte.gz" 90 | when :label 91 | "train-labels-idx1-ubyte.gz" 92 | end 93 | when :test 94 | case data 95 | when :image 96 | "t10k-images-idx3-ubyte.gz" 97 | when :label 98 | "t10k-labels-idx1-ubyte.gz" 99 | end 100 | end 101 | end 102 | 103 | def parse_labels(file_path) 104 | Zlib::GzipReader.open(file_path) do |f| 105 | n_uint32s = 2 106 | n_bytes = n_uint32s * 4 107 | mnist_magic_number = 2049 108 | magic, n_labels = f.read(n_bytes).unpack('N2') 109 | if magic != mnist_magic_number 110 | raise Error, "This is not a #{dataset_name} label file" 111 | end 112 | f.read(n_labels).unpack('C*') 113 | end 114 | end 115 | 116 | def dataset_name 117 | "MNIST" 118 | end 119 | end 120 | end 121 | -------------------------------------------------------------------------------- /lib/datasets/nagoya-university-conversation-corpus.rb: -------------------------------------------------------------------------------- 1 | require_relative 'dataset' 2 | require_relative 'zip-extractor' 3 | 4 | module Datasets 5 | class NagoyaUniversityConversationCorpus < Dataset 6 | Data = Struct.new( 7 | :name, 8 | :date, 9 | :place, 10 | :participants, 11 | :relationships, 12 | :note, 13 | :sentences 14 | ) 15 | 16 | Participant = Struct.new( 17 | :id, 18 | :attribute, 19 | :birthplace, 20 | :residence 21 | ) 22 | 23 | Sentence = Struct.new(:participant_id, :content) do 24 | def end? 25 | participant_id.nil? and content.nil? 26 | end 27 | end 28 | 29 | def initialize 30 | super() 31 | @metadata.id = 'nagoya-university-conversation-corpus' 32 | @metadata.name = 'Nagoya University Conversation Corpus' 33 | @metadata.url = 'https://mmsrv.ninjal.ac.jp/nucc/' 34 | @metadata.licenses = ['CC-BY-NC-ND-4.0'] 35 | @metadata.description = <<~DESCRIPTION 36 | The "Nagoya University Conversation Corpus" is a corpus of 129 conversations, 37 | a total of about 100 hours of chat among native speakers of Japanese, 38 | transcribed into text. 39 | DESCRIPTION 40 | end 41 | 42 | def each 43 | return to_enum(__method__) unless block_given?
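# nucc.zip holds one plain-text file per conversation; parse_file turns
# each file into a single Data struct that is yielded whole.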
44 | 45 | open_data do |input_stream| 46 | yield(parse_file(input_stream)) 47 | end 48 | end 49 | 50 | private 51 | 52 | def open_data 53 | data_path = cache_dir_path + 'nucc.zip' 54 | data_url = 'https://mmsrv.ninjal.ac.jp/nucc/nucc.zip' 55 | download(data_path, data_url) 56 | 57 | extractor = ZipExtractor.new(data_path) 58 | extractor.extract_files do |input_stream| 59 | yield(input_stream) 60 | end 61 | end 62 | 63 | def parse_file(input_stream) 64 | data = Data.new 65 | participants = [] 66 | sentences = [] 67 | 68 | input_stream.each do |input| 69 | input.each_line(chomp: true) do |line| 70 | line.force_encoding('utf-8') 71 | if line.start_with?('@データ') 72 | data.name = line[4..-1] 73 | elsif line.start_with?('@収集年月日') 74 | # mixed cases with and without ':' 75 | data.date = line[6..-1].delete_prefix(':') 76 | elsif line.start_with?('@場所') 77 | data.place = line[4..-1] 78 | elsif line.start_with?('@参加者の関係') 79 | data.relationships = line.split(':', 2)[1] 80 | elsif line.start_with?('@参加者') 81 | participant = Participant.new 82 | participant.id, profiles = line[4..-1].split(':', 2) 83 | participant.attribute, participant.birthplace, participant.residence = profiles.split('、', 3) 84 | 85 | participants << participant 86 | elsif line.start_with?('%com') 87 | data.note = line.split(':', 2)[1] 88 | elsif line == '@END' 89 | sentence = Sentence.new 90 | sentence.participant_id = nil 91 | sentence.content = nil 92 | 93 | sentences << sentence 94 | else 95 | sentence = Sentence.new 96 | sentence.participant_id, sentence.content = line.split(':', 2) 97 | 98 | sentences << sentence 99 | end 100 | end 101 | end 102 | 103 | data.participants = participants 104 | data.sentences = sentences 105 | 106 | data 107 | end 108 | end 109 | end 110 | -------------------------------------------------------------------------------- /lib/datasets/penguins.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | module PenguinsRawData 7 | Record = Struct.new(:study_name, 8 | :sample_number, 9 | :species, 10 | :region, 11 | :island, 12 | :stage, 13 | :individual_id, 14 | :clutch_completion, 15 | :date_egg, 16 | :culmen_length_mm, 17 | :culmen_depth_mm, 18 | :flipper_length_mm, 19 | :body_mass_g, 20 | :sex, 21 | :delta_15_n_permil, 22 | :delta_13_c_permil, 23 | :comments) 24 | class SpeciesBase < Dataset 25 | def initialize 26 | super 27 | species = self.class.name.split("::").last.downcase 28 | @metadata.id = "palmerpenguins-#{species}" 29 | package_id = http_parameters["packageid"] 30 | @metadata.url = "https://portal.edirepository.org/nis/mapbrowse" + 31 | "?packageid=#{package_id}" 32 | @metadata.licenses = ["CC0-1.0"] 33 | @data_path = cache_dir_path + "#{species}.csv" 34 | end 35 | 36 | attr_reader :data_path 37 | 38 | def each 39 | return to_enum(__method__) unless block_given? 40 | 41 | open_data do |csv| 42 | csv.each do |row| 43 | next if row[0].nil?
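# Fields arrive already type-converted by CSV (converters: :all); the
# guard above drops the blank row at the end of each species CSV.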
44 | record = Record.new(*row.fields) 45 | yield record 46 | end 47 | end 48 | end 49 | 50 | private def open_data 51 | download(data_path, 52 | "https://portal.edirepository.org/nis/dataviewer", 53 | http_method: :post, 54 | http_parameters: http_parameters) 55 | CSV.open(data_path, headers: :first_row, converters: :all) do |csv| 56 | yield csv 57 | end 58 | end 59 | end 60 | 61 | # Adelie penguin data from: https://doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86 62 | class Adelie < SpeciesBase 63 | DOI = "doi.org/10.6073/pasta/abc50eed9138b75f54eaada0841b9b86".freeze 64 | 65 | private def http_parameters 66 | { 67 | "packageid" => "knb-lter-pal.219.3", 68 | "entityid" => "002f3893385f710df69eeebe893144ff", 69 | } 70 | end 71 | end 72 | 73 | # Chinstrap penguin data from: https://doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7 74 | class Chinstrap < SpeciesBase 75 | DOI = "doi.org/10.6073/pasta/409c808f8fc9899d02401bdb04580af7".freeze 76 | 77 | private def http_parameters 78 | { 79 | "packageid" => "knb-lter-pal.221.2", 80 | "entityid" => "fe853aa8f7a59aa84cdd3197619ef462", 81 | } 82 | end 83 | end 84 | 85 | # Gentoo penguin data from: https://doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce 86 | class Gentoo < SpeciesBase 87 | DOI = "doi.org/10.6073/pasta/2b1cff60f81640f182433d23e68541ce".freeze 88 | 89 | private def http_parameters 90 | { 91 | "packageid" => "knb-lter-pal.220.3", 92 | "entityid" => "e03b43c924f226486f2f0ab6709d2381", 93 | } 94 | end 95 | end 96 | end 97 | 98 | # This class provides the same dataset as https://github.com/allisonhorst/palmerpenguins 99 | class Penguins < Dataset 100 | Record = Struct.new(:species, 101 | :island, 102 | :bill_length_mm, 103 | :bill_depth_mm, 104 | :flipper_length_mm, 105 | :body_mass_g, 106 | :sex, 107 | :year) 108 | 109 | def initialize 110 | super 111 | @metadata.id = "palmerpenguins" 112 | @metadata.name = "palmerpenguins" 113 | @metadata.url = "https://allisonhorst.github.io/palmerpenguins/" 114 | @metadata.licenses = ["CC0-1.0"] 115 | @metadata.description = "A great dataset for data exploration & visualization, as an alternative to iris" 116 | end 117 | 118 | def each(&block) 119 | return to_enum(__method__) unless block_given?
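# The unified dataset is simply the three per-species raw datasets
# enumerated in order; convert_record maps each raw row onto the
# public Record layout.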
120 | 121 | species_classes = [ 122 | PenguinsRawData::Adelie, 123 | PenguinsRawData::Chinstrap, 124 | PenguinsRawData::Gentoo, 125 | ] 126 | 127 | species_classes.each do |species_class| 128 | species_class.new.each do |raw_record| 129 | yield convert_record(raw_record) 130 | end 131 | end 132 | end 133 | 134 | private def convert_record(raw_record) 135 | Record.new(*cleanse_fields(raw_record)) 136 | end 137 | 138 | private def cleanse_fields(raw_record) 139 | species = raw_record.species.split(' ')[0] 140 | flipper_length_mm = raw_record.flipper_length_mm&.to_i 141 | body_mass_g = raw_record.body_mass_g&.to_i 142 | sex = normalize_sex(raw_record.sex) 143 | year = raw_record.date_egg&.year 144 | 145 | [ 146 | species, 147 | raw_record.island, 148 | raw_record.culmen_length_mm, 149 | raw_record.culmen_depth_mm, 150 | flipper_length_mm, 151 | body_mass_g, 152 | sex, 153 | year 154 | ] 155 | end 156 | 157 | private def normalize_sex(val) 158 | val = val&.downcase 159 | case val 160 | when "female", "male", nil 161 | val 162 | else 163 | nil 164 | end 165 | end 166 | end 167 | end 168 | -------------------------------------------------------------------------------- /lib/datasets/penn-treebank.rb: -------------------------------------------------------------------------------- 1 | require_relative "dataset" 2 | 3 | module Datasets 4 | class PennTreebank < Dataset 5 | Record = Struct.new(:word) 6 | 7 | DESCRIPTION = <<~DESC 8 | Penn Tree Bank is originally a 9 | corpus of English sentences with linguistic structure annotations. This 10 | class uses a variant distributed at 11 | https://github.com/wojzaremba/lstm, 12 | which omits the annotation and splits the dataset into three parts: 13 | training, validation, and test. 14 | DESC 15 | 16 | def initialize(type: :train) 17 | valid_types = [:train, :test, :valid] 18 | unless valid_types.include?(type) 19 | valid_types_label = valid_types.collect(&:inspect).join(", ") 20 | message = "Type must be one of [#{valid_types_label}]: #{type.inspect}" 21 | raise ArgumentError, message 22 | end 23 | @type = type 24 | 25 | super() 26 | 27 | @metadata.id = "penn-treebank-#{@type}" 28 | @metadata.name = "Penn Treebank: #{@type}" 29 | @metadata.description = DESCRIPTION 30 | @metadata.url = "https://github.com/wojzaremba/lstm" 31 | @metadata.licenses = ["Apache-2.0"] 32 | end 33 | 34 | def each(&block) 35 | return to_enum(__method__) unless block_given?
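# One file per split (ptb.train.txt, ptb.valid.txt, ptb.test.txt);
# parse_data yields one Record per whitespace-separated token.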
36 | 37 | base_name = "ptb.#{@type}.txt" 38 | data_path = cache_dir_path + base_name 39 | base_url = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data" 40 | download(data_path, "#{base_url}/#{base_name}") 41 | 42 | parse_data(data_path, &block) 43 | end 44 | 45 | private 46 | def parse_data(data_path) 47 | File.open(data_path) do |f| 48 | f.each_line do |line| 49 | line.split.each do |word| 50 | yield(Record.new(word.strip)) 51 | end 52 | end 53 | end 54 | end 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/datasets/pmjt-dataset-list.rb: -------------------------------------------------------------------------------- 1 | require_relative "dataset" 2 | 3 | module Datasets 4 | class PMJTDatasetList < Dataset 5 | Record = Struct.new(:unit, 6 | :open_data_category, 7 | :tag, 8 | :release_time, 9 | :n_volumes, 10 | :type, 11 | :publication_year, 12 | :original_request_code, 13 | :id, 14 | :title, 15 | :text, 16 | :bibliographical_introduction, 17 | :year) 18 | 19 | def initialize 20 | super() 21 | @metadata.id = "pmjt-dataset-list" 22 | @metadata.name = "List of pre-modern Japanese text dataset" 23 | @metadata.url = "http://codh.rois.ac.jp/pmjt/" 24 | @metadata.licenses = ["CC-BY-SA-4.0"] 25 | @metadata.description = <<~DESCRIPTION 26 | Pre-Modern Japanese Text, owned by the National Institute of Japanese Literature, is released as open data in image and text form. 27 | In addition, some texts have description, transcription, and tagging data. 28 | DESCRIPTION 29 | 30 | @data_path = cache_dir_path + (@metadata.id + ".csv") 31 | end 32 | 33 | def each(&block) 34 | return to_enum(__method__) unless block_given? 35 | 36 | latest_version = "201901" 37 | url = "http://codh.rois.ac.jp/pmjt/list/pmjt-dataset-list-#{latest_version}.csv" 38 | download(@data_path, url) 39 | CSV.open(@data_path, headers: :first_row, encoding: "Windows-31J:UTF-8") do |csv| 40 | csv.each do |row| 41 | record = create_record(row) 42 | yield record 43 | end 44 | end 45 | end 46 | 47 | private 48 | def create_record(csv_row) 49 | record = Record.new 50 | record.unit = csv_row["(単位)"] 51 | record.open_data_category = csv_row["オープンデータ分類"] 52 | record.tag = csv_row["タグ"] 53 | record.release_time = csv_row["公開時期"] 54 | record.n_volumes = csv_row["冊数等"] 55 | record.type = csv_row["刊・写"] 56 | record.publication_year = csv_row["刊年・書写年"] 57 | record.original_request_code = csv_row["原本請求記号"] 58 | record.id = csv_row["国文研書誌ID"] 59 | record.title = csv_row["書名(統一書名)"] 60 | record.text = csv_row["本文"] 61 | record.bibliographical_introduction = csv_row["解題"] 62 | record.year = csv_row["(西暦)"] 63 | 64 | record 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/datasets/postal-code-japan.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | require "zip" 3 | 4 | require_relative "dataset" 5 | 6 | module Datasets 7 | class PostalCodeJapan < Dataset 8 | class Record < Struct.new(:organization_code, 9 | :old_postal_code, 10 | :postal_code, 11 | :prefecture_reading, 12 | :city_reading, 13 | :address_reading, 14 | :prefecture, 15 | :city, 16 | :address, 17 | :have_multiple_postal_codes, 18 | :have_address_number_per_koaza, 19 | :have_chome, 20 | :postal_code_is_shared, 21 | :changed, 22 | :change_reason) 23 | alias_method :have_multiple_postal_codes?, 24 | :have_multiple_postal_codes 25 | alias_method :have_address_number_per_koaza?, 26 |
:have_address_number_per_koaza 27 | alias_method :have_chome?, 28 | :have_chome 29 | alias_method :postal_code_is_shared?, 30 | :postal_code_is_shared 31 | alias_method :changed?, 32 | :changed 33 | end 34 | 35 | VALID_READINGS = [ 36 | :lowercase, 37 | :uppercase, 38 | :romaji, 39 | ] 40 | def initialize(reading: :lowercase) 41 | super() 42 | @reading = reading 43 | unless VALID_READINGS.include?(@reading) 44 | message = +":reading must be one of [" 45 | message << VALID_READINGS.collect(&:inspect).join(", ") 46 | message << "]: #{@reading.inspect}" 47 | raise ArgumentError, message 48 | end 49 | @metadata.id = "postal-code-japan-#{@reading}" 50 | @metadata.name = "Postal code in Japan (#{@reading})" 51 | @metadata.url = "https://www.post.japanpost.jp/zipcode/download.html" 52 | @metadata.licenses = ["CC0-1.0"] 53 | @metadata.description = "Postal code in Japan (reading: #{@reading})" 54 | end 55 | 56 | def each(&block) 57 | return to_enum(__method__) unless block_given? 58 | 59 | open_data do |input| 60 | utf8_data = input.read.encode(Encoding::UTF_8, Encoding::CP932) 61 | options = { 62 | quote_char: nil, 63 | strip: %Q["], 64 | } 65 | if @reading == :romaji 66 | CSV.parse(utf8_data, **options) do |row| 67 | yield(Record.new(nil, 68 | nil, 69 | row[0], 70 | row[4], 71 | row[5], 72 | row[6], 73 | row[1], 74 | row[2], 75 | row[3], 76 | false, 77 | false, 78 | false, 79 | false, 80 | false, 81 | nil)) 82 | end 83 | else 84 | CSV.parse(utf8_data, **options) do |row| 85 | yield(Record.new(row[0], 86 | row[1].rstrip, 87 | row[2], 88 | row[3], 89 | row[4], 90 | row[5], 91 | row[6], 92 | row[7], 93 | row[8], 94 | (row[9] == "1"), 95 | (row[10] == "1"), 96 | (row[11] == "1"), 97 | (row[12] == "1"), 98 | (row[13] != "0"), 99 | convert_change_reason(row[14]))) 100 | end 101 | end 102 | end 103 | end 104 | 105 | private 106 | def open_data 107 | data_url = +"https://www.post.japanpost.jp/zipcode/dl" 108 | case @reading 109 | when :lowercase 110 | data_url << "/kogaki/zip/ken_all.zip" 111 | when :uppercase 112 | data_url << "/oogaki/zip/ken_all.zip" 113 | when :romaji 114 | data_url << "/roman/KEN_ALL_ROME.zip" 115 | end 116 | data_path = cache_dir_path + "#{@reading}-ken-all.zip" 117 | download(data_path, data_url) 118 | 119 | Zip::File.open(data_path.to_s) do |zip_file| 120 | zip_file.each do |entry| 121 | next unless entry.file? 
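# The archive contains a single CP932-encoded CSV; stream each file
# entry to the caller, which transcodes it to UTF-8 before parsing.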
122 | entry.get_input_stream do |input| 123 | yield(input) 124 | end 125 | end 126 | end 127 | end 128 | 129 | def convert_change_reason(reason) 130 | case reason 131 | when "0" 132 | nil 133 | when "1" 134 | :new 135 | when "2" 136 | :japanese_addressing_system 137 | when "3" 138 | :land_readjustment 139 | when "4" 140 | :postal_district_adjustment 141 | when "5" 142 | :correction 143 | when "6" 144 | :deletion 145 | else 146 | :unknown 147 | end 148 | end 149 | end 150 | end 151 | -------------------------------------------------------------------------------- /lib/datasets/quora-duplicate-question-pair.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class QuoraDuplicateQuestionPair < Dataset 7 | class Record < Struct.new(:id, 8 | :first_question_id, 9 | :second_question_id, 10 | :first_question, 11 | :second_question, 12 | :duplicated) 13 | alias_method :duplicated?, :duplicated 14 | end 15 | 16 | def initialize 17 | super() 18 | @metadata.id = "quora-duplicate-question-pair" 19 | @metadata.name = "Quora's duplicated question pair dataset" 20 | @metadata.url = "https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs" 21 | @metadata.licenses = [ 22 | { 23 | name: "Quora's Terms of Service", 24 | url: "https://www.quora.com/about/tos", 25 | } 26 | ] 27 | end 28 | 29 | def each 30 | return to_enum(__method__) unless block_given? 31 | 32 | open_data do |csv| 33 | csv.each do |row| 34 | row["is_duplicate"] = (row["is_duplicate"] == 1) 35 | record = Record.new(*row.fields) 36 | yield(record) 37 | end 38 | end 39 | end 40 | 41 | private 42 | def open_data 43 | data_path = cache_dir_path + "quora_duplicate_questions.tsv" 44 | data_url = "https://qim.fs.quoracdn.net/quora_duplicate_questions.tsv" 45 | download(data_path, data_url) 46 | CSV.open(data_path, col_sep: "\t", headers: true, converters: :integer) do |csv| 47 | yield(csv) 48 | end 49 | end 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /lib/datasets/rdataset.rb: -------------------------------------------------------------------------------- 1 | require_relative "dataset" 2 | require_relative "tar-gz-readable" 3 | 4 | module Datasets 5 | class RdatasetList < Dataset 6 | Record = Struct.new(:package, 7 | :dataset, 8 | :title, 9 | :rows, 10 | :cols, 11 | :n_binary, 12 | :n_character, 13 | :n_factor, 14 | :n_logical, 15 | :n_numeric, 16 | :csv, 17 | :doc) 18 | 19 | def initialize 20 | super 21 | @metadata.id = "rdataset-list" 22 | @metadata.name = "Rdataset" 23 | @metadata.url = "https://vincentarelbundock.github.io/Rdatasets/" 24 | @metadata.licenses = ["GPL-3"] 25 | @data_url = "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/datasets.csv" 26 | @data_path = cache_dir_path + "datasets.csv" 27 | end 28 | 29 | def filter(package: nil, dataset: nil) 30 | return to_enum(__method__, package: package, dataset: dataset) unless block_given? 31 | 32 | conds = {} 33 | conds["Package"] = package if package 34 | conds["Item"] = dataset if dataset 35 | if conds.empty? 36 | each_row {|row| yield Record.new(*row.fields) } 37 | else 38 | each_row do |row| 39 | if conds.all? 
{|k, v| row[k] == v } 40 | yield Record.new(*row.fields) 41 | end 42 | end 43 | end 44 | end 45 | 46 | def each(&block) 47 | filter(&block) 48 | end 49 | 50 | private def each_row(&block) 51 | download(@data_path, @data_url) 52 | CSV.open(@data_path, headers: :first_row, converters: :all) do |csv| 53 | csv.each(&block) 54 | end 55 | end 56 | end 57 | 58 | # For backward compatibility 59 | RdatasetsList = RdatasetList 60 | 61 | class Rdataset < Dataset 62 | def initialize(package_name, dataset_name) 63 | list = RdatasetList.new 64 | 65 | info = list.filter(package: package_name, dataset: dataset_name).first 66 | unless info 67 | raise ArgumentError, "Unable to locate dataset #{package_name}/#{dataset_name}" 68 | end 69 | 70 | super() 71 | @metadata.id = "rdataset-#{package_name}-#{dataset_name}" 72 | @metadata.name = "Rdataset: #{package_name}: #{dataset_name}" 73 | @metadata.url = info.csv 74 | @metadata.licenses = ["GPL-3"] 75 | @metadata.description = info.title 76 | 77 | # Follow the original directory structure in the cache directory 78 | @data_path = cache_dir_path + (dataset_name + ".csv") 79 | 80 | @package_name = package_name 81 | @dataset_name = dataset_name 82 | end 83 | 84 | def each(&block) 85 | return to_enum(__method__) unless block_given? 86 | 87 | download(@data_path, @metadata.url) 88 | 89 | na_converter = lambda do |field| 90 | begin 91 | if field.encode(CSV::ConverterEncoding) == "NA" 92 | nil 93 | else 94 | field 95 | end 96 | rescue 97 | field 98 | end 99 | end 100 | 101 | inf_converter = lambda do |field| 102 | begin 103 | if field.encode(CSV::ConverterEncoding) == "Inf" 104 | Float::INFINITY 105 | else 106 | field 107 | end 108 | rescue 109 | field 110 | end 111 | end 112 | 113 | quote_preserving_converter = lambda do |field, info| 114 | f = field.encode(CSV::ConverterEncoding) 115 | return f if info.quoted? 116 | 117 | begin 118 | begin 119 | begin 120 | return DateTime.parse(f) if f.match?(CSV::DateTimeMatcher) 121 | rescue 122 | return Integer(f) 123 | end 124 | rescue 125 | return Float(f) 126 | end 127 | rescue 128 | field 129 | end 130 | end 131 | 132 | table = CSV.table(@data_path, 133 | header_converters: [:symbol_raw], 134 | # quote_preserving_converter must come last 135 | converters: [na_converter, inf_converter, quote_preserving_converter]) 136 | table.delete(:"") # Drop the unnamed first column that holds row indices. 137 | 138 | table.each do |row| 139 | yield row.to_h 140 | end 141 | end 142 | end 143 | 144 | # For backward compatibility 145 | Rdatasets = Rdataset 146 | end 147 | -------------------------------------------------------------------------------- /lib/datasets/seaborn.rb: -------------------------------------------------------------------------------- 1 | require "json" 2 | 3 | module Datasets 4 | class SeabornList < Dataset 5 | def initialize 6 | super 7 | @metadata.id = "seaborn-data-list" 8 | @metadata.name = "seaborn: data list" 9 | @metadata.url = "https://github.com/mwaskom/seaborn-data" 10 | # Treat as the same license as seaborn 11 | @metadata.licenses = ["BSD-3-Clause"] 12 | @metadata.description = "Datasets for seaborn examples." 13 | end 14 | 15 | def each(&block) 16 | return to_enum(__method__) unless block_given?
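# Dataset discovery goes through the GitHub Git Trees API: every *.csv
# blob in the repository tree counts as one dataset.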
17 | 18 | data_path = cache_dir_path + "trees.json" 19 | url = "https://api.github.com/repos/mwaskom/seaborn-data/git/trees/master" 20 | download(data_path, url) 21 | 22 | tree = JSON.parse(File.read(data_path))["tree"] 23 | tree.each do |content| 24 | path = content["path"] 25 | next unless path.end_with?(".csv") 26 | dataset = File.basename(path, ".csv") 27 | record = {dataset: dataset} 28 | yield record 29 | end 30 | end 31 | end 32 | 33 | class Seaborn < Dataset 34 | URL_FORMAT = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/%{name}.csv".freeze 35 | 36 | def initialize(name) 37 | super() 38 | @metadata.id = "seaborn-#{name}" 39 | @metadata.name = "seaborn: #{name}" 40 | @metadata.url = URL_FORMAT % {name: name} 41 | # @metadata.licenses = TODO 42 | 43 | @name = name 44 | end 45 | 46 | def each(&block) 47 | return to_enum(__method__) unless block_given? 48 | 49 | data_path = cache_dir_path + "#{@name}.csv" 50 | download(data_path, @metadata.url) 51 | CSV.open(data_path, headers: :first_row, converters: :all) do |csv| 52 | csv.each do |row| 53 | record = prepare_record(row) 54 | yield record 55 | end 56 | end 57 | end 58 | 59 | private 60 | def prepare_record(csv_row) 61 | record = csv_row.to_h 62 | record.transform_keys! do |key| 63 | if key.nil? 64 | :index 65 | else 66 | key.to_sym 67 | end 68 | end 69 | 70 | # Perform the same preprocessing as seaborn's load_dataset function 71 | preprocessor = :"preprocess_#{@name}_record" 72 | __send__(preprocessor, record) if respond_to?(preprocessor, true) 73 | 74 | record 75 | end 76 | 77 | # The same preprocessing as seaborn.load_dataset 78 | def preprocess_flights_record(record) 79 | record[:month] &&= record[:month][0,3] 80 | end 81 | 82 | # The same preprocessing as seaborn.load_dataset 83 | def preprocess_penguins_record(record) 84 | record[:sex] &&= record[:sex].capitalize 85 | end 86 | end 87 | 88 | # For backward compatibility 89 | SeabornData = Seaborn 90 | end 91 | -------------------------------------------------------------------------------- /lib/datasets/sudachi-synonym-dictionary.rb: -------------------------------------------------------------------------------- 1 | require "csv" 2 | 3 | require_relative "dataset" 4 | 5 | module Datasets 6 | class SudachiSynonymDictionary < Dataset 7 | class Synonym < Struct.new(:group_id, 8 | :is_noun, 9 | :expansion_type, 10 | :lexeme_id, 11 | :form_type, 12 | :acronym_type, 13 | :variant_type, 14 | :categories, 15 | :notation) 16 | alias_method :noun?, :is_noun 17 | end 18 | 19 | def initialize 20 | super() 21 | @metadata.id = "sudachi-synonym-dictionary" 22 | @metadata.name = "Sudachi synonym dictionary" 23 | @metadata.url = "https://github.com/WorksApplications/SudachiDict/blob/develop/docs/synonyms.md" 24 | @metadata.licenses = ["Apache-2.0"] 25 | @metadata.description = lambda do 26 | download_description 27 | end 28 | end 29 | 30 | def each 31 | return to_enum(__method__) unless block_given? 
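# Rows in one synonym group share a group ID (column 0); the context
# hash below numbers lexemes within the current group whenever the
# lexeme ID column is empty.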
32 | 33 | lexeme_id_context = {} 34 | open_data do |csv| 35 | csv.each do |row| 36 | group_id = row[0] 37 | if group_id != lexeme_id_context[:group_id] 38 | lexeme_id_context[:group_id] = group_id 39 | lexeme_id_context[:counter] = 0 40 | end 41 | is_noun = (row[1] == "1") 42 | expansion_type = normalize_expansion_type(row[2]) 43 | lexeme_id = normalize_lexeme_id(row[3], lexeme_id_context) 44 | form_type = normalize_form_type(row[4]) 45 | acronym_type = normalize_acronym_type(row[5]) 46 | variant_type = normalize_variant_type(row[6]) 47 | categories = normalize_categories(row[7]) 48 | notation = row[8] 49 | synonym = Synonym.new(group_id, 50 | is_noun, 51 | expansion_type, 52 | lexeme_id, 53 | form_type, 54 | acronym_type, 55 | variant_type, 56 | categories, 57 | notation) 58 | yield(synonym) 59 | end 60 | end 61 | end 62 | 63 | private 64 | def open_data 65 | data_path = cache_dir_path + "synonyms.txt" 66 | data_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/src/main/text/synonyms.txt" 67 | download(data_path, data_url) 68 | CSV.open(data_path, 69 | encoding: "UTF-8", 70 | skip_blanks: true) do |csv| 71 | yield(csv) 72 | end 73 | end 74 | 75 | def download_description 76 | description_path = cache_dir_path + "synonyms.md" 77 | description_url = "https://raw.githubusercontent.com/WorksApplications/SudachiDict/develop/docs/synonyms.md" 78 | download(description_path, description_url) 79 | description_path.read 80 | end 81 | 82 | def normalize_expansion_type(type) 83 | case type 84 | when "0", "" 85 | :always 86 | when "1" 87 | :expanded 88 | when "2" 89 | :never 90 | else 91 | raise Error, "unknown expansion type: #{type.inspect}" 92 | end 93 | end 94 | 95 | def normalize_lexeme_id(id, context) 96 | case id 97 | when "" 98 | context[:counter] += 1 99 | context[:counter] 100 | else 101 | # Use only the first lexeme ID.
102 | # Example: 103 | # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネットゲー,, 104 | # 000116,1,0,1/2,0,2,0,(IT/娯楽),ネトゲ,, 105 | Integer(id.split("/").first, 10) 106 | end 107 | end 108 | 109 | def normalize_form_type(type) 110 | case type 111 | when "0", "" 112 | :typical 113 | when "1" 114 | :translation 115 | when "2" 116 | :alias 117 | when "3" 118 | :old_name 119 | when "4" 120 | :misnomer 121 | else 122 | raise Error, "unknown form type: #{type.inspect}" 123 | end 124 | end 125 | 126 | def normalize_acronym_type(type) 127 | case type 128 | when "0", "" 129 | :typical 130 | when "1" 131 | :alphabet 132 | when "2" 133 | :others 134 | else 135 | raise Error, "unknown acronym type: #{type.inspect}" 136 | end 137 | end 138 | 139 | def normalize_variant_type(type) 140 | case type 141 | when "0", "" 142 | :typical 143 | when "1" 144 | :alphabet 145 | when "2" 146 | :general 147 | when "3" 148 | :misspelled 149 | else 150 | raise Error, "unknown variant type: #{type.inspect}" 151 | end 152 | end 153 | 154 | def normalize_categories(categories) 155 | case categories 156 | when "" 157 | nil 158 | when /\A\((.*)\)\z/ 159 | $1.split("/") 160 | else 161 | raise Error, "invalid categories: #{categories.inspect}" 162 | end 163 | end 164 | end 165 | end 166 | -------------------------------------------------------------------------------- /lib/datasets/table.rb: -------------------------------------------------------------------------------- 1 | require "datasets/dictionary" 2 | 3 | module Datasets 4 | class Table 5 | class Record 6 | include Enumerable 7 | 8 | def initialize(table, index) 9 | @table = table 10 | @index = index 11 | end 12 | 13 | def [](column_name_or_column_index) 14 | @table[column_name_or_column_index][@index] 15 | end 16 | 17 | def each 18 | return to_enum(__method__) unless block_given? 19 | @table.each_column.each do |column_name, column_values| 20 | yield(column_name, column_values[@index]) 21 | end 22 | end 23 | 24 | def values 25 | @table.each_column.collect do |_column_name, column_values| 26 | column_values[@index] 27 | end 28 | end 29 | 30 | def to_h 31 | hash = {} 32 | each do |column_name, column_value| 33 | hash[column_name] = column_value 34 | end 35 | hash 36 | end 37 | 38 | def inspect 39 | "#<#{self.class.name} #{@table.dataset.metadata.name}[#{@index}] #{to_h.inspect}>" 40 | end 41 | end 42 | 43 | include Enumerable 44 | 45 | attr_reader :dataset 46 | def initialize(dataset) 47 | @dataset = dataset 48 | @dictionaries = {} 49 | end 50 | 51 | def n_columns 52 | columner_data.size 53 | end 54 | alias_method :size, :n_columns 55 | alias_method :length, :n_columns 56 | 57 | def n_rows 58 | first_column = columner_data.first 59 | return 0 if first_column.nil? 60 | first_column[1].size 61 | end 62 | 63 | def column_names 64 | columner_data.keys 65 | end 66 | 67 | def each_column(&block) 68 | columner_data.each(&block) 69 | end 70 | alias_method :each, :each_column 71 | 72 | def each_record 73 | return to_enum(__method__) unless block_given? 
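# A Record is a lazy row view over the columnar data; field values are
# looked up by row index only when accessed.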
74 | n_rows.times do |i| 75 | yield(Record.new(self, i)) 76 | end 77 | end 78 | 79 | def find_record(row) 80 | row += n_rows if row < 0 81 | return nil if row < 0 82 | return nil if row >= n_rows 83 | Record.new(self, row) 84 | end 85 | 86 | def [](name_or_index) 87 | case name_or_index 88 | when Integer 89 | index = name_or_index 90 | columner_data.each_with_index do |(_name, values), i| 91 | return values if i == index 92 | end 93 | nil 94 | else 95 | name = name_or_index 96 | columner_data[normalize_name(name)] 97 | end 98 | end 99 | 100 | def dictionary_encode(name) 101 | @dictionaries[normalize_name(name)] ||= Dictionary.new(self[name]) 102 | end 103 | 104 | def label_encode(name) 105 | dictionary = dictionary_encode(name) 106 | dictionary.encode(self[name]) 107 | end 108 | 109 | def fetch_values(*keys) 110 | data = columner_data 111 | keys.collect do |key| 112 | if data.key?(key) 113 | data[key] 114 | else 115 | raise build_key_error(key) unless block_given? 116 | yield(key) 117 | end 118 | end 119 | end 120 | 121 | def to_h 122 | columns = {} 123 | @dataset.each do |record| 124 | record.to_h.each do |name, value| 125 | values = (columns[name] ||= []) 126 | values << value 127 | end 128 | end 129 | columns 130 | end 131 | 132 | private 133 | begin 134 | KeyError.new("message", receiver: self, key: :key) 135 | rescue ArgumentError 136 | def build_key_error(key) 137 | KeyError.new("key not found: #{key.inspect}") 138 | end 139 | else 140 | def build_key_error(key) 141 | KeyError.new("key not found: #{key.inspect}", 142 | receiver: self, 143 | key: key) 144 | end 145 | end 146 | 147 | def columner_data 148 | @columns ||= to_h 149 | end 150 | 151 | def normalize_name(name) 152 | name.to_sym 153 | end 154 | end 155 | end 156 | -------------------------------------------------------------------------------- /lib/datasets/tar-gz-readable.rb: -------------------------------------------------------------------------------- 1 | require "rubygems/package" 2 | require "zlib" 3 | 4 | module Datasets 5 | module TarGzReadable 6 | def open_tar_gz(data_path) 7 | Zlib::GzipReader.open(data_path) do |f| 8 | Gem::Package::TarReader.new(f) do |tar| 9 | yield(tar) 10 | end 11 | end 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /lib/datasets/version.rb: -------------------------------------------------------------------------------- 1 | module Datasets 2 | VERSION = "0.2.1" 3 | end 4 | -------------------------------------------------------------------------------- /lib/datasets/wikipedia.rb: -------------------------------------------------------------------------------- 1 | require "rexml/streamlistener" 2 | require "rexml/parsers/baseparser" 3 | require "rexml/parsers/streamparser" 4 | require "time" 5 | 6 | require_relative "dataset" 7 | 8 | module Datasets 9 | class Wikipedia < Dataset 10 | Contributor = Struct.new(:user_name, 11 | :id) 12 | Revision = Struct.new(:id, 13 | :parent_id, 14 | :timestamp, 15 | :contributor, 16 | :minor, 17 | :comment, 18 | :model, 19 | :format, 20 | :text, 21 | :sha1) 22 | Page = Struct.new(:title, 23 | :namespace, 24 | :id, 25 | :restrictions, 26 | :redirect, 27 | :revision) 28 | 29 | def initialize(language: :en, 30 | type: :articles) 31 | super() 32 | @language = language 33 | @type = type 34 | @metadata.id = "wikipedia-#{@language}-#{@type}" 35 | @metadata.name = "Wikipedia #{@type} (#{@language})" 36 | @metadata.url = "https://dumps.wikimedia.org/" 37 | @metadata.licenses = [ 38 | "CC-BY-SA-3.0", 39 | 
"CC-BY-SA-4.0", 40 | "GFDL-1.3-or-later", 41 | ] 42 | @metadata.description = "Wikipedia #{@type} in #{@language}" 43 | end 44 | 45 | def each(&block) 46 | return to_enum(__method__) unless block_given? 47 | 48 | open_data do |input| 49 | listener = ArticlesListener.new(block) 50 | parser = REXML::Parsers::StreamParser.new(input, listener) 51 | parser.parse 52 | end 53 | end 54 | 55 | private 56 | def base_name 57 | "#{@language}wiki-latest-#{type_in_path}.xml.bz2" 58 | end 59 | 60 | def data_path 61 | cache_dir_path + base_name 62 | end 63 | 64 | def open_data(&block) 65 | data_url = "https://dumps.wikimedia.org/#{@language}wiki/latest/#{base_name}" 66 | bz2 = Enumerator.new do |yielder| 67 | download(data_path, data_url) do |bz2_chunk| 68 | yielder << bz2_chunk 69 | end 70 | end 71 | extract_bz2(bz2, &block) 72 | end 73 | 74 | def type_in_path 75 | case @type 76 | when :articles 77 | "pages-articles" 78 | else 79 | @type.to_s 80 | end 81 | end 82 | 83 | class ArticlesListener 84 | include REXML::StreamListener 85 | 86 | def initialize(block) 87 | @block = block 88 | @page = nil 89 | @revision = nil 90 | @contributor = nil 91 | @current_tag = nil 92 | @tag_stack = [] 93 | @text_stack = [+""] 94 | @first_page = true 95 | end 96 | 97 | def tag_start(name, attributes) 98 | push_stacks(name) 99 | case name 100 | when "page" 101 | @page = Page.new 102 | when "revision" 103 | @revision = Revision.new 104 | when "contributor" 105 | @contributor = Contributor.new 106 | when "redirect" 107 | @page.redirect = attributes["title"] 108 | end 109 | end 110 | 111 | def tag_end(name) 112 | case name 113 | when "page" 114 | on_page(@page) 115 | @page = nil 116 | when "title" 117 | @page.title = @text_stack.last 118 | when "ns" 119 | @page.namespace = Integer(@text_stack.last) 120 | when "id" 121 | id = Integer(@text_stack.last) 122 | case @tag_stack[-2] 123 | when "page" 124 | @page.id = id 125 | when "revision" 126 | @revision.id = id 127 | when "contributor" 128 | @contributor.id = id 129 | end 130 | when "restrictions" 131 | @page.restrictions = @text_stack.last.split(":") 132 | when "revision" 133 | @page.revision = @revision 134 | @revision = nil 135 | when "parentid" 136 | @revision.parent_id = Integer(@text_stack.last) 137 | when "timestamp" 138 | @revision.timestamp = Time.iso8601(@text_stack.last) 139 | when "contributor" 140 | @revision.contributor = @contributor 141 | @contributor = nil 142 | when "username" 143 | @contributor.user_name = @text_stack.last 144 | when "minor" 145 | # TODO 146 | when "comment" 147 | @revision.comment = @text_stack.last 148 | when "model" 149 | @revision.model = @text_stack.last 150 | when "format" 151 | @revision.format = @text_stack.last 152 | when "text" 153 | @revision.text = @text_stack.last 154 | when "sha1" 155 | @revision.sha1 = @text_stack.last 156 | end 157 | pop_stacks 158 | end 159 | 160 | def text(data) 161 | @text_stack.last << data 162 | end 163 | 164 | def cdata(content) 165 | @text_stack.last << content 166 | end 167 | 168 | private 169 | def on_page(page) 170 | @block.call(page) 171 | end 172 | 173 | def push_stacks(tag) 174 | @tag_stack << tag 175 | @text_stack << +"" 176 | end 177 | 178 | def pop_stacks 179 | @text_stack.pop 180 | @tag_stack.pop 181 | end 182 | end 183 | end 184 | end 185 | -------------------------------------------------------------------------------- /lib/datasets/wine.rb: -------------------------------------------------------------------------------- 1 | require 'csv' 2 | 3 | require_relative 'dataset' 4 | 5 | module 
Datasets 6 | class Wine < Dataset 7 | Record = Struct.new(:label, 8 | :alcohol, 9 | :malic_acid, 10 | :ash, 11 | :alcalinity_of_ash, 12 | :n_magnesiums, 13 | :total_phenols, 14 | :total_flavonoids, 15 | :total_nonflavanoid_phenols, 16 | :total_proanthocyanins, 17 | :color_intensity, 18 | :hue, 19 | :optical_nucleic_acid_concentration, 20 | :n_prolines) 21 | 22 | def initialize 23 | super 24 | @metadata.id = 'wine' 25 | @metadata.name = 'Wine' 26 | @metadata.url = 'https://archive.ics.uci.edu/ml/datasets/wine' 27 | @metadata.licenses = ["CC-BY-4.0"] 28 | @metadata.description = -> { read_names } 29 | end 30 | 31 | def each 32 | return to_enum(__method__) unless block_given? 33 | 34 | open_data do |csv| 35 | csv.each do |row| 36 | next if row[0].nil? 37 | record = Record.new(*row) 38 | yield(record) 39 | end 40 | end 41 | end 42 | 43 | private 44 | 45 | def read_names 46 | names_path = cache_dir_path + 'wine.names' 47 | names_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names' 48 | download(names_path, names_url) 49 | names_path.read 50 | end 51 | 52 | def open_data 53 | data_path = cache_dir_path + 'wine.data' 54 | data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data' 55 | download(data_path, data_url) 56 | CSV.open(data_path, converters: %i[numeric]) do |csv| 57 | yield(csv) 58 | end 59 | end 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /lib/datasets/zip-extractor.rb: -------------------------------------------------------------------------------- 1 | require 'zip' 2 | 3 | module Datasets 4 | class ZipExtractor 5 | def initialize(path) 6 | @path = path 7 | end 8 | 9 | def extract_first_file 10 | Zip::File.open(@path) do |zip_file| 11 | zip_file.each do |entry| 12 | next unless entry.file? 13 | 14 | entry.get_input_stream do |input| 15 | return yield(input) 16 | end 17 | end 18 | end 19 | nil 20 | end 21 | 22 | def extract_file(file_path) 23 | Zip::File.open(@path) do |zip_file| 24 | zip_file.each do |entry| 25 | next unless entry.file? 26 | next unless entry.name == file_path 27 | 28 | entry.get_input_stream do |input| 29 | return yield(input) 30 | end 31 | end 32 | end 33 | nil 34 | end 35 | 36 | def extract_files 37 | Zip::File.open(@path) do |zip_file| 38 | zip_file.each do |entry| 39 | next unless entry.file? 
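# Unlike extract_first_file above, stream every regular file entry to the block.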
40 | 41 | entry.get_input_stream do |input| 42 | yield(input) 43 | end 44 | end 45 | end 46 | end 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /red-datasets.gemspec: -------------------------------------------------------------------------------- 1 | # -*- ruby -*- 2 | 3 | clean_white_space = lambda do |entry| 4 | entry.gsub(/(\A\n+|\n+\z)/, '') + "\n" 5 | end 6 | 7 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), "lib")) 8 | require "datasets/version" 9 | 10 | Gem::Specification.new do |spec| 11 | spec.name = "red-datasets" 12 | spec.version = Datasets::VERSION 13 | spec.homepage = "https://github.com/red-data-tools/red-datasets" 14 | spec.authors = ["tomisuker", "Kouhei Sutou"] 15 | spec.email = ["tomisuker16@gmail.com", "kou@clear-code.com"] 16 | 17 | readme = File.read("README.md") 18 | readme.force_encoding("UTF-8") 19 | entries = readme.split(/^\#\#\s(.*)$/) 20 | clean_white_space.call(entries[entries.index("Description") + 1]) 21 | description = clean_white_space.call(entries[entries.index("Description") + 1]) 22 | spec.summary, spec.description, = description.split(/\n\n+/, 3) 23 | spec.license = "MIT" 24 | spec.files = [ 25 | "README.md", 26 | "LICENSE.txt", 27 | "Rakefile", 28 | "Gemfile", 29 | "#{spec.name}.gemspec", 30 | ] 31 | spec.files += [".yardopts"] 32 | spec.files += Dir.glob("lib/**/*.rb") 33 | spec.files += Dir.glob("image/*.*") 34 | spec.files += Dir.glob("doc/text/*") 35 | spec.test_files += Dir.glob("test/**/*") 36 | 37 | spec.add_runtime_dependency("csv", ">= 3.2.4") 38 | spec.add_runtime_dependency("rexml") 39 | spec.add_runtime_dependency("rubyzip") 40 | 41 | spec.add_development_dependency("bundler") 42 | spec.add_development_dependency("rake") 43 | spec.add_development_dependency("test-unit") 44 | spec.add_development_dependency("yard") 45 | spec.add_development_dependency("kramdown") 46 | end 47 | -------------------------------------------------------------------------------- /test/helper.rb: -------------------------------------------------------------------------------- 1 | require "fileutils" 2 | require "pathname" 3 | require "time" 4 | require "tmpdir" 5 | 6 | require "datasets" 7 | 8 | require "test-unit" 9 | 10 | module Helper 11 | module Sandbox 12 | def setup_sandbox 13 | @tmp_dir = (Pathname.new(__dir__) + "tmp").expand_path 14 | FileUtils.mkdir_p(@tmp_dir) 15 | end 16 | 17 | def teardown_sandbox 18 | return unless defined?(@tmp_dir) 19 | FileUtils.rm_rf(@tmp_dir) 20 | end 21 | end 22 | 23 | module PathRestorable 24 | def restore_path(path) 25 | unless path.exist? 26 | return yield 27 | end 28 | 29 | Dir.mktmpdir do |dir| 30 | FileUtils.cp_r(path, dir, preserve: true) 31 | begin 32 | yield 33 | ensure 34 | FileUtils.rmtree(path, secure: true) if path.exist? 
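# Copy the preserved snapshot back so the original cache survives the test.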
35 | FileUtils.cp_r(Pathname(dir) + path.basename, 36 | path, 37 | preserve: true) 38 | end 39 | end 40 | end 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /test/japanese-date-parser-test.rb: -------------------------------------------------------------------------------- 1 | class JapaneseDateParserTest < Test::Unit::TestCase 2 | def setup 3 | @parser = Datasets::JapaneseDateParser.new 4 | end 5 | 6 | data("month and day with leading a space in Heisei", ["H10.01.01", "平成10年 1月 1日"]) 7 | data("month with leading a space in Heisei", ["H10.01.10", "平成10年 1月10日"]) 8 | data(" day with leading a space in Heisei", ["H10.10.01", "平成10年10月 1日"]) 9 | data(" without leading a space in Heisei", ["H10.10.10", "平成10年10月10日"]) 10 | data("year, month and day with leading a space in Reiwa", ["R02.01.01", "令和 2年 1月 1日"]) 11 | data("year, month with leading a space in Reiwa", ["R02.01.10", "令和 2年 1月10日"]) 12 | data("year, day with leading a space in Reiwa", ["R02.10.01", "令和 2年10月 1日"]) 13 | data("year, without leading a space in Reiwa", ["R02.10.10", "令和 2年10月10日"]) 14 | data("boundary within Heisei", ["H31.04.30", "平成31年 4月30日"]) 15 | data("boundary within Reiwa", ["R01.05.01", "令和元年 5月 1日"]) 16 | test("#parse") do 17 | expected_jisx0301, japanese_date_string = data 18 | assert_equal(expected_jisx0301, @parser.parse(japanese_date_string).jisx0301) 19 | end 20 | 21 | test("unsupported era initial range") do 22 | expected_message = "era must be one of [平成, 令和]: 昭和" 23 | assert_raise(Datasets::JapaneseDateParser::UnsupportedEraInitialRange.new(expected_message)) do 24 | @parser.parse("昭和元年 1月 1日") 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /test/run-test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | $VERBOSE = true 4 | 5 | require "pathname" 6 | 7 | base_dir = Pathname.new(__FILE__).dirname.parent.expand_path 8 | 9 | lib_dir = base_dir + "lib" 10 | test_dir = base_dir + "test" 11 | 12 | $LOAD_PATH.unshift(lib_dir.to_s) 13 | 14 | require_relative "helper" 15 | 16 | ARGV.unshift("--max-diff-target-string-size=#{10 * 1024}") 17 | 18 | exit(Test::Unit::AutoRunner.run(true, test_dir.to_s)) 19 | -------------------------------------------------------------------------------- /test/test-adult.rb: -------------------------------------------------------------------------------- 1 | class AdultTest < Test::Unit::TestCase 2 | sub_test_case("train") do 3 | def setup 4 | @dataset = Datasets::Adult.new(type: :train) 5 | end 6 | 7 | def record(*args) 8 | Datasets::Adult::Record.new(*args) 9 | end 10 | 11 | test("#each") do 12 | assert_equal({ 13 | :age => 39, 14 | :work_class => "State-gov", 15 | :final_weight => 77516, 16 | :education => "Bachelors", 17 | :n_education_years => 13, 18 | :marital_status => "Never-married", 19 | :occupation => "Adm-clerical", 20 | :relationship => "Not-in-family", 21 | :race => "White", 22 | :sex => "Male", 23 | :capital_gain => 2174, 24 | :capital_loss => 0, 25 | :hours_per_week => 40, 26 | :native_country => "United-States", 27 | :label => "<=50K" 28 | }, 29 | @dataset.each.next.to_h) 30 | end 31 | end 32 | 33 | sub_test_case("test") do 34 | def setup 35 | @dataset = Datasets::Adult.new(type: :test) 36 | end 37 | 38 | def record(*args) 39 | Datasets::Adult::Record.new(*args) 40 | end 41 | 42 | test("#each") do 43 | assert_equal({ 44 | :age => 25, 45 | :work_class => "Private", 46 | 
:final_weight => 226802, 47 | :education => "11th", 48 | :n_education_years => 7, 49 | :marital_status => "Never-married", 50 | :occupation => "Machine-op-inspct", 51 | :relationship => "Own-child", 52 | :race => "Black", 53 | :sex => "Male", 54 | :capital_gain => 0, 55 | :capital_loss => 0, 56 | :hours_per_week => 40, 57 | :native_country => "United-States", 58 | :label => "<=50K.", 59 | }, 60 | @dataset.each.next.to_h) 61 | end 62 | end 63 | 64 | sub_test_case("#metadata") do 65 | def setup 66 | @dataset = Datasets::Adult.new(type: :train) 67 | end 68 | 69 | test("#description") do 70 | description = @dataset.metadata.description 71 | assert do 72 | description.start_with?("| This data was extracted from the census bureau database found at") 73 | end 74 | end 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /test/test-afinn.rb: -------------------------------------------------------------------------------- 1 | class AFINNTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::AFINN.new 4 | end 5 | 6 | test('#each') do 7 | records = @dataset.each.to_a 8 | assert_equal([ 9 | 2477, 10 | { 11 | :valence => -2, 12 | :word => "abandon" 13 | }, 14 | { 15 | :valence => 2, 16 | :word => "zealous" 17 | }, 18 | ], 19 | [ 20 | records.size, 21 | records[0].to_h, 22 | records[-1].to_h, 23 | ]) 24 | end 25 | 26 | sub_test_case('#metadata') do 27 | test('#description') do 28 | description = @dataset.metadata.description 29 | assert_equal(<<-DESCRIPTION.chomp, description) 30 | AFINN is a list of English words rated for valence with an integer 31 | between minus five (negative) and plus five (positive). The words have 32 | been manually labeled by Finn Årup Nielsen in 2009-2011. The file 33 | is tab-separated. There are two versions: 34 | 35 | AFINN-111: Newest version with 2477 words and phrases. 36 | 37 | An evaluation of the word list is available in: 38 | 39 | Finn Årup Nielsen, "A new ANEW: Evaluation of a word list for 40 | sentiment analysis in microblogs", http://arxiv.org/abs/1103.2903 41 | 42 | The list was used in: 43 | 44 | Lars Kai Hansen, Adam Arvidsson, Finn Årup Nielsen, Elanor Colleoni, 45 | Michael Etter, "Good Friends, Bad News - Affect and Virality in 46 | Twitter", The 2011 International Workshop on Social Computing, 47 | Network, and Services (SocialComNet 2011). 48 | 49 | 50 | This database of words is copyright protected and distributed under 51 | "Open Database License (ODbL) v1.0" 52 | http://www.opendatacommons.org/licenses/odbl/1.0/ or a similar 53 | copyleft license. 
54 | 55 | See comments on the word list here: 56 | http://fnielsen.posterous.com/old-anew-a-sentiment-about-sentiment-analysis 57 | DESCRIPTION 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /test/test-california-housing.rb: -------------------------------------------------------------------------------- 1 | class CaliforniaHousingTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::CaliforniaHousing.new 4 | end 5 | 6 | def record(*args) 7 | Datasets::CaliforniaHousing::Record.new(*args) 8 | end 9 | 10 | test("#each") do 11 | assert_equal({ 12 | median_house_value: 452600.000000, 13 | median_income: 8.325200, 14 | housing_median_age: 41.000000, 15 | total_rooms: 880.000000, 16 | total_bedrooms: 129.000000, 17 | population: 322.000000, 18 | households: 126.000000, 19 | latitude: 37.880000, 20 | longitude: -122.230000, 21 | }, 22 | @dataset.each.next.to_h) 23 | end 24 | 25 | sub_test_case("#metadata") do 26 | test("#description") do 27 | description = @dataset.metadata.description 28 | assert_equal(<<-DESCRIPTION, description) 29 | Housing information from the 1990 census used in 30 | Pace, R. Kelley and Ronald Barry, 31 | "Sparse Spatial Autoregressions", 32 | Statistics and Probability Letters, 33 (1997) 291-297. 33 | Available from http://lib.stat.cmu.edu/datasets/. 34 | DESCRIPTION 35 | end 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /test/test-dataset.rb: -------------------------------------------------------------------------------- 1 | class TestDataset < Test::Unit::TestCase 2 | sub_test_case("#clear_cache!") do 3 | include Helper::PathRestorable 4 | 5 | def setup 6 | @dataset = Datasets::Iris.new 7 | @cache_dir_path = @dataset.send(:cache_dir_path) 8 | end 9 | 10 | test("when the dataset is downloaded") do 11 | @dataset.first # This ensures the dataset downloaded 12 | existence = {before: @cache_dir_path.join("iris.csv").exist?} 13 | 14 | restore_path(@cache_dir_path) do 15 | @dataset.clear_cache! 16 | existence[:after] = @cache_dir_path.join("iris.csv").exist? 17 | 18 | assert_equal({before: true, after: false}, 19 | existence) 20 | end 21 | end 22 | 23 | test("when the dataset is not downloaded") do 24 | restore_path(@cache_dir_path) do 25 | if @cache_dir_path.exist? 26 | FileUtils.rmtree(@cache_dir_path.to_s, secure: true) 27 | end 28 | 29 | assert_nothing_raised do 30 | @dataset.clear_cache! 31 | end 32 | end 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /test/test-diamonds.rb: -------------------------------------------------------------------------------- 1 | class DiamondsTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::Diamonds.new 4 | end 5 | 6 | def record(*args) 7 | Datasets::Diamonds::Record.new(*args) 8 | end 9 | 10 | test("#each") do 11 | assert_equal({ 12 | carat: 0.23, 13 | clarity: "SI2", 14 | color: "E", 15 | cut: "Ideal", 16 | depth: 61.5, 17 | price: 326, 18 | table: 55.0, 19 | x: 3.95, 20 | y: 3.98, 21 | z: 2.43, 22 | }, 23 | @dataset.each.next.to_h) 24 | end 25 | 26 | sub_test_case("#metadata") do 27 | test("#description") do 28 | description = @dataset.metadata.description 29 | assert_equal(<<-DESCRIPTION, description) 30 | Prices of over 50,000 round cut diamonds 31 | 32 | A dataset containing the prices and other attributes of almost 54,000 33 | diamonds. 
The variables are as follows: 34 | 35 | A data frame with 53940 rows and 10 variables: 36 | 37 | * price: price in US dollars ($326--$18,823) 38 | * carat: weight of the diamond (0.2--5.01) 39 | * cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal) 40 | * color: diamond colour, from D (best) to J (worst) 41 | * clarity: a measurement of how clear the diamond is (I1 (worst), SI2, 42 | SI1, VS2, VS1, VVS2, VVS1, IF (best)) 43 | * x: length in mm (0--10.74) 44 | * y: width in mm (0--58.9) 45 | * z: depth in mm (0--31.8) 46 | * depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79) 47 | * table: width of top of diamond relative to widest point (43--95) 48 | DESCRIPTION 49 | end 50 | end 51 | end 52 | -------------------------------------------------------------------------------- /test/test-dictionary.rb: -------------------------------------------------------------------------------- 1 | class DictionaryTest < Test::Unit::TestCase 2 | def setup 3 | penn_treebank = Datasets::PennTreebank.new(type: :test) 4 | @dictionary = penn_treebank.to_table.dictionary_encode(:word) 5 | end 6 | 7 | test("#id") do 8 | assert_equal(95, @dictionary.id("<unk>")) 9 | end 10 | 11 | test("#value") do 12 | assert_equal("<unk>", @dictionary.value(95)) 13 | end 14 | 15 | test("#ids") do 16 | assert_equal([0, 1, 2, 3, 4], @dictionary.ids.first(5)) 17 | end 18 | 19 | test("#values") do 20 | assert_equal(["no", "it", "was", "n't", "black"], 21 | @dictionary.values.first(5)) 22 | end 23 | 24 | test("#each") do 25 | assert_equal([ 26 | [0, "no"], 27 | [1, "it"], 28 | [2, "was"], 29 | [3, "n't"], 30 | [4, "black"], 31 | ], 32 | @dictionary.each.first(5).to_a) 33 | end 34 | 35 | test("#size") do 36 | assert_equal(6048, @dictionary.size) 37 | end 38 | 39 | test("#length") do 40 | assert_equal(@dictionary.size, 41 | @dictionary.length) 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /test/test-downloader.rb: -------------------------------------------------------------------------------- 1 | class DownloaderTest < Test::Unit::TestCase 2 | include Helper::Sandbox 3 | 4 | sub_test_case("#download") do 5 | def setup 6 | setup_sandbox 7 | end 8 | 9 | def teardown 10 | teardown_sandbox 11 | end 12 | 13 | test("too many redirection") do 14 | first_url = "https://example.com/file" 15 | last_url = "https://example.com/last_redirection" 16 | expected_message = "too many redirections: #{first_url} ..
#{last_url}" 17 | output_path = @tmp_dir + "file" 18 | downloader = Datasets::Downloader.new(first_url) 19 | 20 | downloader.define_singleton_method(:start_http) do |url, fallback_urls, headers| 21 | raise Datasets::Downloader::TooManyRedirects, "too many redirections: #{last_url}" 22 | end 23 | 24 | assert_raise(Datasets::Downloader::TooManyRedirects.new(expected_message)) do 25 | downloader.download(output_path) 26 | end 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /test/test-fashion-mnist.rb: -------------------------------------------------------------------------------- 1 | class FashionMNISTTest < Test::Unit::TestCase 2 | sub_test_case("Normal") do 3 | sub_test_case("train") do 4 | def setup 5 | @dataset = Datasets::FashionMNIST.new(type: :train) 6 | end 7 | 8 | test("#each") do 9 | records = @dataset.each.to_a 10 | assert_equal([ 11 | 60000, 12 | [ 13 | 9, 14 | 784, 15 | [0, 0, 0, 0, 237, 226, 217, 223, 222, 219], 16 | [220, 232, 246, 0, 3, 202, 228, 224, 221, 211], 17 | ], 18 | [ 19 | 5, 20 | 784, 21 | [129, 153, 34, 0, 3, 3, 0, 3, 0, 24], 22 | [180, 177, 177, 47, 101, 235, 194, 223, 232, 255], 23 | ], 24 | ], 25 | [ 26 | records.size, 27 | [ 28 | records[0].label, 29 | records[0].pixels.size, 30 | records[0].pixels[400, 10], 31 | records[0].pixels[500, 10], 32 | ], 33 | [ 34 | records[-1].label, 35 | records[-1].pixels.size, 36 | records[-1].pixels[400, 10], 37 | records[-1].pixels[500, 10], 38 | ], 39 | ]) 40 | end 41 | 42 | test("#to_table") do 43 | table_data = @dataset.to_table 44 | assert_equal([ 45 | [0, 0, 0, 0, 237, 226, 217, 223, 222, 219], 46 | [129, 153, 34, 0, 3, 3, 0, 3, 0, 24], 47 | ], 48 | [ 49 | table_data[:pixels][0][400, 10], 50 | table_data[:pixels][-1][400, 10], 51 | ]) 52 | end 53 | 54 | sub_test_case("#metadata") do 55 | test("#id") do 56 | assert_equal("fashion-mnist-train", @dataset.metadata.id) 57 | end 58 | 59 | test("#name") do 60 | assert_equal("Fashion-MNIST: train", @dataset.metadata.name) 61 | end 62 | end 63 | end 64 | 65 | sub_test_case("test") do 66 | def setup 67 | @dataset = Datasets::FashionMNIST.new(type: :test) 68 | end 69 | 70 | test("#each") do 71 | records = @dataset.each.to_a 72 | assert_equal([ 73 | 10000, 74 | [ 75 | 9, 76 | 784, 77 | [1, 0, 0, 0, 98, 136, 110, 109, 110, 162], 78 | [172, 161, 189, 62, 0, 68, 94, 90, 111, 114], 79 | ], 80 | [ 81 | 5, 82 | 784, 83 | [45, 45, 69, 128, 100, 120, 132, 123, 135, 171], 84 | [63, 74, 72, 0, 1, 0, 0, 0, 4, 85], 85 | ], 86 | ], 87 | [ 88 | records.size, 89 | [ 90 | records[0].label, 91 | records[0].pixels.size, 92 | records[0].pixels[400, 10], 93 | records[0].pixels[500, 10], 94 | ], 95 | [ 96 | records[-1].label, 97 | records[-1].pixels.size, 98 | records[-1].pixels[400, 10], 99 | records[-1].pixels[500, 10], 100 | ], 101 | ]) 102 | end 103 | 104 | test("#to_table") do 105 | table_data = @dataset.to_table 106 | assert_equal([ 107 | [1, 0, 0, 0, 98, 136, 110, 109, 110, 162], 108 | [45, 45, 69, 128, 100, 120, 132, 123, 135, 171], 109 | ], 110 | [ 111 | table_data[:pixels][0][400, 10], 112 | table_data[:pixels][-1][400, 10], 113 | ]) 114 | end 115 | 116 | sub_test_case("#metadata") do 117 | test("#id") do 118 | assert_equal("fashion-mnist-test", @dataset.metadata.id) 119 | end 120 | 121 | test("#name") do 122 | assert_equal("Fashion-MNIST: test", @dataset.metadata.name) 123 | end 124 | end 125 | end 126 | end 127 | 128 | sub_test_case("Abnormal") do 129 | test("invalid type") do 130 | invalid_type = :invalid 131 | message = "Please 
set type :train or :test: #{invalid_type.inspect}" 132 | assert_raise(ArgumentError.new(message)) do 133 | Datasets::FashionMNIST.new(type: invalid_type) 134 | end 135 | end 136 | end 137 | end 138 | -------------------------------------------------------------------------------- /test/test-fuel-economy.rb: -------------------------------------------------------------------------------- 1 | class FuelEconomyTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::FuelEconomy.new 4 | end 5 | 6 | def record(*args) 7 | Datasets::FuelEconomy::Record.new(*args) 8 | end 9 | 10 | test("#each") do 11 | records = @dataset.each.to_a 12 | assert_equal([ 13 | 234, 14 | { 15 | city_mpg: 18, 16 | displacement: 1.8, 17 | drive_train: "f", 18 | fuel: "p", 19 | highway_mpg: 29, 20 | manufacturer: "audi", 21 | model: "a4", 22 | n_cylinders: 4, 23 | transmission: "auto(l5)", 24 | type: "compact", 25 | year: 1999 26 | }, 27 | { 28 | city_mpg: 17, 29 | displacement: 3.6, 30 | drive_train: "f", 31 | fuel: "p", 32 | highway_mpg: 26, 33 | manufacturer: "volkswagen", 34 | model: "passat", 35 | n_cylinders: 6, 36 | transmission: "auto(s6)", 37 | type: "midsize", 38 | year: 2008 39 | }, 40 | ], 41 | [ 42 | records.size, 43 | records[0].to_h, 44 | records[-1].to_h 45 | ]) 46 | end 47 | 48 | sub_test_case("#metadata") do 49 | test("#description") do 50 | description = @dataset.metadata.description 51 | assert_equal(<<-DESCRIPTION, description) 52 | Fuel economy data from 1999 to 2008 for 38 popular models of cars 53 | 54 | This dataset contains a subset of the fuel economy data that the EPA makes 55 | available on https://fueleconomy.gov/. It contains only models which 56 | had a new release every year between 1999 and 2008 - this was used as a 57 | proxy for the popularity of the car. 
58 | 59 | A data frame with 234 rows and 11 variables: 60 | 61 | * manufacturer: manufacturer name 62 | * model: model name 63 | * displacement: engine displacement, in litres 64 | * year: year of manufacture 65 | * n_cylinders: number of cylinders 66 | * transmissions: type of transmission 67 | * drive_train: the type of drive train, where f = front-wheel drive, r = rear wheel drive, 4 = 4wd 68 | * city_mpg: city miles per gallon 69 | * highway_mpg: highway miles per gallon 70 | * fuel: fuel type 71 | * type: "type" of car 72 | DESCRIPTION 73 | end 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /test/test-geolonia.rb: -------------------------------------------------------------------------------- 1 | class GeoloniaTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::Geolonia.new 4 | end 5 | 6 | test('#each') do 7 | assert_equal({ 8 | :prefecture_code => "01", 9 | :prefecture_name => "北海道", 10 | :prefecture_kana => "ホッカイドウ", 11 | :prefecture_romaji => "HOKKAIDO", 12 | :municipality_code => "01101", 13 | :municipality_name => "札幌市中央区", 14 | :municipality_kana => "サッポロシチュウオウク", 15 | :municipality_romaji => "SAPPORO SHI CHUO KU", 16 | :street_name => "旭ケ丘一丁目", 17 | :street_kana => "アサヒガオカ 1", 18 | :street_romaji => "ASAHIGAOKA 1", 19 | :alias => nil, 20 | :latitude => "43.04223", 21 | :longitude => "141.319722", 22 | }, 23 | @dataset.each.next.to_h) 24 | end 25 | 26 | sub_test_case("#metadata") do 27 | test("#description") do 28 | description = @dataset.metadata.description 29 | assert_equal([ 30 | "# Geolonia 住所データ", 31 | "## 住所データ仕様", 32 | "### ファイルフォーマット", 33 | "### 列", 34 | "### ソート順", 35 | ], 36 | description.scan(/^#.*$/), 37 | description) 38 | end 39 | end 40 | 41 | end 42 | -------------------------------------------------------------------------------- /test/test-hepatitis.rb: -------------------------------------------------------------------------------- 1 | class HepatitisTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::Hepatitis.new 4 | end 5 | 6 | def record(*args) 7 | Datasets::Hepatitis::Record.new(*args) 8 | end 9 | 10 | test("#each") do 11 | records = @dataset.each.to_a 12 | assert_equal([ 13 | 155, 14 | { 15 | :label => :live, 16 | :age => 30, 17 | :sex => :female, 18 | :steroid => false, 19 | :antivirals => true, 20 | :fatigue => true, 21 | :malaise => true, 22 | :anorexia => true, 23 | :liver_big => false, 24 | :liver_firm => true, 25 | :spleen_palpable => true, 26 | :spiders => true, 27 | :ascites => true, 28 | :varices => true, 29 | :bilirubin => 1.0, 30 | :alkaline_phosphate => 85, 31 | :sgot => 18, 32 | :albumin => 4.0, 33 | :protime => nil, 34 | :histology => false, 35 | }, 36 | { 37 | :label => :die, 38 | :age => 43, 39 | :sex => :male, 40 | :steroid => true, 41 | :antivirals => true, 42 | :fatigue => false, 43 | :malaise => true, 44 | :anorexia => true, 45 | :liver_big => true, 46 | :liver_firm => true, 47 | :spleen_palpable => false, 48 | :spiders => false, 49 | :ascites => false, 50 | :varices => true, 51 | :bilirubin => 1.2, 52 | :alkaline_phosphate => 100, 53 | :sgot => 19, 54 | :albumin => 3.1, 55 | :protime => 42, 56 | :histology => true, 57 | } 58 | ], 59 | [ 60 | records.size, 61 | records[0].to_h, 62 | records[-1].to_h, 63 | ]) 64 | end 65 | 66 | sub_test_case("#metadata") do 67 | test("#description") do 68 | description = @dataset.metadata.description 69 | assert do 70 | description.start_with?("1. 
Title: Hepatitis Domain") 71 | end 72 | end 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /test/test-house-of-representative.rb: -------------------------------------------------------------------------------- 1 | class HouseOfRepresentativeTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::HouseOfRepresentative.new 4 | end 5 | 6 | def record(*args) 7 | Datasets::HouseOfRepresentative::Record.new(*args) 8 | end 9 | 10 | test("#each") do 11 | assert_equal(record(142, 12 | "衆法の一覧", 13 | nil, 14 | 139, 15 | 18, 16 | "市民活動促進法案", 17 | "成立", 18 | "経過", 19 | "https://www.shugiin.go.jp/internet/itdb_gian.nsf/html/gian/keika/5516.htm", 20 | nil, 21 | nil, 22 | "衆法", 23 | "熊代 昭彦君外四名", 24 | %w(自由民主党 社会民主党・市民連合 新党さきがけ), 25 | nil, 26 | nil, 27 | nil, 28 | Date.jisx0301("H10.03.04"), 29 | Date.jisx0301("H10.03.11"), 30 | "内閣", 31 | Date.jisx0301("H10.03.17"), 32 | "可決", 33 | Date.jisx0301("H10.03.19"), 34 | "可決", 35 | nil, 36 | nil, 37 | nil, 38 | nil, 39 | nil, 40 | nil, 41 | nil, 42 | Date.jisx0301("H10.01.12"), 43 | "労働・社会政策", 44 | Date.jisx0301("H10.03.03"), 45 | "修正", 46 | Date.jisx0301("H10.03.04"), 47 | "修正", 48 | Date.jisx0301("H10.03.25"), 49 | 7, 50 | nil, 51 | nil), 52 | @dataset.each.next) 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /test/test-iris.rb: -------------------------------------------------------------------------------- 1 | class IrisTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::Iris.new 4 | end 5 | 6 | def record(*args) 7 | Datasets::Iris::Record.new(*args) 8 | end 9 | 10 | test("#each") do 11 | records = @dataset.each.to_a 12 | assert_equal([ 13 | 150, 14 | record(5.1, 3.5, 1.4, 0.2, "Iris-setosa"), 15 | record(5.9, 3.0, 5.1, 1.8, "Iris-virginica"), 16 | ], 17 | [ 18 | records.size, 19 | records[0], 20 | records[-1], 21 | ]) 22 | end 23 | 24 | sub_test_case("#metadata") do 25 | test("#description") do 26 | description = @dataset.metadata.description 27 | assert do 28 | description.start_with?("1. 
Title: Iris Plants Database") 29 | end 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /test/test-ita-corpus.rb: -------------------------------------------------------------------------------- 1 | class ITACorpusTest < Test::Unit::TestCase 2 | 3 | sub_test_case("type") do 4 | test("emotion") do 5 | dataset = Datasets::ITACorpus.new(type: :emotion) 6 | records = dataset.to_a 7 | assert_equal([ 8 | 100, 9 | { 10 | :id => "EMOTION100_001", 11 | :sentence => "えっ嘘でしょ。,エッウソデショ。" 12 | }, 13 | { 14 | :id => "EMOTION100_100", 15 | :sentence => "ラーテャン。,ラーテャン。", 16 | }, 17 | ], 18 | [ 19 | records.size, 20 | records[0].to_h, 21 | records[-1].to_h, 22 | ]) 23 | end 24 | 25 | test("recitation") do 26 | dataset = Datasets::ITACorpus.new(type: :recitation) 27 | records = dataset.to_a 28 | assert_equal([ 29 | 324, 30 | { 31 | :id => "RECITATION324_001", 32 | :sentence => "女の子がキッキッ嬉しそう。,オンナノコガキッキッウレシソー。" 33 | }, 34 | { 35 | :id => "RECITATION324_324", 36 | :sentence => "チュクンの波長は、パツンと共通している。,チュクンノハチョーワ、パツントキョーツウシテイル。", 37 | }, 38 | ], 39 | [ 40 | records.size, 41 | records[0].to_h, 42 | records[-1].to_h, 43 | ]) 44 | end 45 | 46 | test("invalid") do 47 | message = "Please set type :emotion or :recitation: :invalid" 48 | assert_raise(ArgumentError.new(message)) do 49 | Datasets::ITACorpus.new(type: :invalid) 50 | end 51 | end 52 | 53 | end 54 | 55 | sub_test_case("#metadata") do 56 | test("#description") do 57 | dataset = Datasets::ITACorpus.new(type: :emotion) 58 | description = dataset.metadata.description 59 | assert_equal([ 60 | "# ITAコーパスの文章リスト公開用リポジトリ", 61 | "## ITAコーパスとは", 62 | "## ITAコーパスの文献情報" 63 | ], 64 | description.scan(/^#.*$/), 65 | description) 66 | end 67 | end 68 | 69 | end 70 | -------------------------------------------------------------------------------- /test/test-kuzushiji-mnist.rb: -------------------------------------------------------------------------------- 1 | class KuzushijiMNISTTest < Test::Unit::TestCase 2 | sub_test_case("Normal") do 3 | sub_test_case("train") do 4 | def setup 5 | @dataset = Datasets::KuzushijiMNIST.new(type: :train) 6 | end 7 | 8 | test("#each") do 9 | records = @dataset.each.to_a 10 | assert_equal([ 11 | 60000, 12 | [ 13 | 8, 14 | 784, 15 | [213, 233, 255, 186, 2, 0, 0, 0, 0, 0], 16 | [0, 0, 0, 0, 0, 0, 0, 0, 45, 252], 17 | ], 18 | [ 19 | 9, 20 | 784, 21 | [81, 246, 254, 155, 224, 255, 230, 39, 0, 0], 22 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 23 | ], 24 | ], 25 | [ 26 | records.size, 27 | [ 28 | records[0].label, 29 | records[0].pixels.size, 30 | records[0].pixels[400, 10], 31 | records[0].pixels[500, 10], 32 | ], 33 | [ 34 | records[-1].label, 35 | records[-1].pixels.size, 36 | records[-1].pixels[400, 10], 37 | records[-1].pixels[500, 10], 38 | ], 39 | ]) 40 | end 41 | 42 | test("#to_table") do 43 | table_data = @dataset.to_table 44 | assert_equal([ 45 | [213, 233, 255, 186, 2, 0, 0, 0, 0, 0], 46 | [81, 246, 254, 155, 224, 255, 230, 39, 0, 0], 47 | ], 48 | [ 49 | table_data[:pixels][0][400, 10], 50 | table_data[:pixels][-1][400, 10], 51 | ]) 52 | end 53 | 54 | sub_test_case("#metadata") do 55 | test("#id") do 56 | assert_equal("kuzushiji-mnist-train", @dataset.metadata.id) 57 | end 58 | 59 | test("#name") do 60 | assert_equal("Kuzushiji-MNIST: train", @dataset.metadata.name) 61 | end 62 | end 63 | end 64 | 65 | sub_test_case("test") do 66 | def setup 67 | @dataset = Datasets::KuzushijiMNIST.new(type: :test) 68 | end 69 | 70 | test("#each") do 71 | records = @dataset.each.to_a 72 | assert_equal([ 
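# Expected: record count, then [label, pixel count, two pixel slices] for the first and last records.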
73 | 10000, 74 | [ 75 | 2, 76 | 784, 77 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 75], 78 | [44, 255, 255, 246, 119, 252, 46, 0, 70, 255], 79 | ], 80 | [ 81 | 2, 82 | 784, 83 | [0, 0, 0, 0, 0, 0, 0, 84, 255, 192], 84 | [0, 0, 0, 0, 0, 23, 245, 92, 42, 254], 85 | ], 86 | ], 87 | [ 88 | records.size, 89 | [ 90 | records[0].label, 91 | records[0].pixels.size, 92 | records[0].pixels[400, 10], 93 | records[0].pixels[500, 10], 94 | ], 95 | [ 96 | records[-1].label, 97 | records[-1].pixels.size, 98 | records[-1].pixels[400, 10], 99 | records[-1].pixels[500, 10], 100 | ], 101 | ]) 102 | end 103 | 104 | test("#to_table") do 105 | table_data = @dataset.to_table 106 | assert_equal([ 107 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 75], 108 | [0, 0, 0, 0, 0, 0, 0, 84, 255, 192], 109 | ], 110 | [ 111 | table_data[:pixels][0][400, 10], 112 | table_data[:pixels][-1][400, 10], 113 | ]) 114 | end 115 | 116 | sub_test_case("#metadata") do 117 | test("#id") do 118 | assert_equal("kuzushiji-mnist-test", @dataset.metadata.id) 119 | end 120 | 121 | test("#name") do 122 | assert_equal("Kuzushiji-MNIST: test", @dataset.metadata.name) 123 | end 124 | end 125 | end 126 | end 127 | 128 | sub_test_case("Abnormal") do 129 | test("invalid type") do 130 | invalid_type = :invalid 131 | message = "Please set type :train or :test: #{invalid_type.inspect}" 132 | assert_raise(ArgumentError.new(message)) do 133 | Datasets::KuzushijiMNIST.new(type: invalid_type) 134 | end 135 | end 136 | end 137 | end 138 | -------------------------------------------------------------------------------- /test/test-libsvm-dataset-list.rb: -------------------------------------------------------------------------------- 1 | class LIBSVMDatasetListTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::LIBSVMDatasetList.new 4 | end 5 | 6 | test("#each") do 7 | assert_equal({ 8 | name: "a1a", 9 | source: "UCI / Adult", 10 | preprocessing: 11 | "The original Adult data set has 14 features, " + 12 | "among which six are continuous and eight are " + 13 | "categorical. In this data set, continuous features " + 14 | "are discretized into quantiles, and each quantile is " + 15 | "represented by a binary feature. Also, a categorical " + 16 | "feature with m categories is converted to m binary " + 17 | "features. Details on how each feature is converted " + 18 | "can be found in the beginning of each file from this " + 19 | "page. 
[JP98a]", 20 | n_classes: 2, 21 | n_data: 1605, 22 | n_features: 123, 23 | files: [ 24 | { 25 | name: "a1a", 26 | url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a", 27 | note: nil, 28 | }, 29 | { 30 | name: "a1a.t", 31 | url: "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/a1a.t", 32 | note: "testing", 33 | } 34 | ], 35 | }, 36 | @dataset.first.to_h) 37 | end 38 | 39 | sub_test_case("#metadata") do 40 | test("#description") do 41 | description = @dataset.metadata.description 42 | assert do 43 | description.start_with?("This page contains many classification, ") 44 | end 45 | end 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /test/test-libsvm.rb: -------------------------------------------------------------------------------- 1 | class LIBSVMDatasetTest < Test::Unit::TestCase 2 | test(":note") do 3 | dataset = Datasets::LIBSVM.new("a1a", note: "testing") 4 | hash = {label: -1} 5 | n_features = 123 6 | n_features.times do |i| 7 | hash[i] = 0 8 | end 9 | [5, 7, 14, 19, 39, 40, 51, 63, 67, 73, 74, 76, 78, 83].each do |i| 10 | hash[i - 1] = 1 11 | end 12 | assert_equal(hash, 13 | dataset.first.to_h) 14 | end 15 | 16 | test(":default_feature_value") do 17 | dataset = Datasets::LIBSVM.new("a1a", default_feature_value: nil) 18 | hash = {label: -1} 19 | n_features = 123 20 | n_features.times do |i| 21 | hash[i] = nil 22 | end 23 | [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i| 24 | hash[i - 1] = 1 25 | end 26 | assert_equal(hash, 27 | dataset.first.to_h) 28 | end 29 | 30 | test("classification") do 31 | dataset = Datasets::LIBSVM.new("a1a") 32 | hash = {label: -1} 33 | n_features = 123 34 | n_features.times do |i| 35 | hash[i] = 0 36 | end 37 | [3, 11, 14, 19, 39, 42, 55, 64, 67, 73, 75, 76, 80, 83].each do |i| 38 | hash[i - 1] = 1 39 | end 40 | assert_equal(hash, 41 | dataset.first.to_h) 42 | end 43 | 44 | test("regression") do 45 | dataset = Datasets::LIBSVM.new("abalone") 46 | hash = {label: 15} 47 | n_features = 8 48 | n_features.times do |i| 49 | hash[i] = 0 50 | end 51 | [ 52 | [1, 1], 53 | [2, 0.455], 54 | [3, 0.365], 55 | [4, 0.095], 56 | [5, 0.514], 57 | [6, 0.2245], 58 | [7, 0.101], 59 | [8, 0.15], 60 | ].each do |i, value| 61 | hash[i - 1] = value 62 | end 63 | assert_equal(hash, 64 | dataset.first.to_h) 65 | end 66 | 67 | test("multi-label") do 68 | dataset = Datasets::LIBSVM.new("mediamill (exp1)") 69 | hash = {label: [65, 67, 11, 31]} 70 | n_features = 120 71 | n_features.times do |i| 72 | hash[i] = 0 73 | end 74 | [ 75 | [1, 0.380877], 76 | [2, 0.494079], 77 | [3, 0.540009], 78 | [4, 0.422926], 79 | [5, 0.158318], 80 | [6, 0.326975], 81 | [7, 0.390861], 82 | [8, 0.527121], 83 | [9, 0.254052], 84 | [10, 0.223731], 85 | [11, 0.040285], 86 | [12, 0.141133], 87 | [13, 0.112249], 88 | [14, 0.263171], 89 | [15, 0.147020], 90 | [16, 0.472414], 91 | [17, 0.592614], 92 | [18, 0.653138], 93 | [19, 0.499867], 94 | [20, 0.196520], 95 | [21, 0.403892], 96 | [22, 0.482395], 97 | [23, 0.619219], 98 | [24, 0.320346], 99 | [25, 0.281251], 100 | [26, 0.054750], 101 | [27, 0.180459], 102 | [28, 0.139964], 103 | [29, 0.319925], 104 | [30, 0.181216], 105 | [31, 0.364294], 106 | [32, 0.407211], 107 | [33, 0.368926], 108 | [34, 0.427661], 109 | [35, 0.211391], 110 | [36, 0.364345], 111 | [37, 0.370710], 112 | [38, 0.409107], 113 | [39, 0.289299], 114 | [40, 0.243053], 115 | [41, 0.063121], 116 | [42, 0.193587], 117 | [43, 0.158755], 118 | [44, 0.316054], 119 | [45, 0.197410], 120 | [46, 
0.656168], 121 | [47, 0.678760], 122 | [48, 0.650831], 123 | [49, 0.674636], 124 | [50, 0.492428], 125 | [51, 0.623887], 126 | [52, 0.610622], 127 | [53, 0.678219], 128 | [54, 0.574774], 129 | [55, 0.523073], 130 | [56, 0.206804], 131 | [57, 0.496294], 132 | [58, 0.429221], 133 | [59, 0.586611], 134 | [60, 0.471550], 135 | [61, 0.284480], 136 | [62, 0.432466], 137 | [63, 0.498075], 138 | [64, 0.408141], 139 | [65, 0.102713], 140 | [66, 0.303028], 141 | [67, 0.309501], 142 | [68, 0.444855], 143 | [69, 0.191727], 144 | [70, 0.174895], 145 | [71, 0.034143], 146 | [72, 0.153099], 147 | [73, 0.068318], 148 | [74, 0.217020], 149 | [75, 0.099688], 150 | [76, 0.409862], 151 | [77, 0.561918], 152 | [78, 0.612031], 153 | [79, 0.514471], 154 | [80, 0.146015], 155 | [81, 0.398807], 156 | [82, 0.383295], 157 | [83, 0.548485], 158 | [84, 0.282937], 159 | [85, 0.252712], 160 | [86, 0.051008], 161 | [87, 0.223110], 162 | [88, 0.098112], 163 | [89, 0.299672], 164 | [90, 0.144873], 165 | [91, 0.308488], 166 | [92, 0.358478], 167 | [93, 0.352077], 168 | [94, 0.394686], 169 | [95, 0.157513], 170 | [96, 0.339370], 171 | [97, 0.321558], 172 | [98, 0.341373], 173 | [99, 0.247969], 174 | [100, 0.206070], 175 | [101, 0.061001], 176 | [102, 0.216793], 177 | [103, 0.112389], 178 | [104, 0.273648], 179 | [105, 0.152745], 180 | [106, 0.598081], 181 | [107, 0.621687], 182 | [108, 0.607213], 183 | [109, 0.644025], 184 | [110, 0.394948], 185 | [111, 0.593651], 186 | [112, 0.551529], 187 | [113, 0.574392], 188 | [114, 0.511032], 189 | [115, 0.463997], 190 | [116, 0.202034], 191 | [117, 0.492341], 192 | [118, 0.317983], 193 | [119, 0.547807], 194 | [120, 0.393778], 195 | ].each do |i, value| 196 | hash[i - 1] = value 197 | end 198 | assert_equal(hash, 199 | dataset.first.to_h) 200 | end 201 | 202 | test("string") do 203 | # TODO 204 | end 205 | end 206 | -------------------------------------------------------------------------------- /test/test-license.rb: -------------------------------------------------------------------------------- 1 | class LicenseTest < Test::Unit::TestCase 2 | sub_test_case(".try_convert") do 3 | test("String") do 4 | assert_equal(Datasets::License.new("Apache-2.0"), 5 | Datasets::License.try_convert("Apache-2.0")) 6 | end 7 | 8 | test("{spdx_id:}") do 9 | assert_equal(Datasets::License.new("Apache-2.0"), 10 | Datasets::License.try_convert(spdx_id: "Apache-2.0")) 11 | end 12 | 13 | test("{name:, url:}") do 14 | license = { 15 | name: "Quora's Terms of Service", 16 | url: "https://www.quora.com/about/tos", 17 | } 18 | assert_equal(Datasets::License.new(nil, 19 | "Quora's Terms of Service", 20 | "https://www.quora.com/about/tos"), 21 | Datasets::License.try_convert(license)) 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /test/test-metadata.rb: -------------------------------------------------------------------------------- 1 | class MetadataTest < Test::Unit::TestCase 2 | def setup 3 | @metadata = Datasets::Metadata.new 4 | end 5 | 6 | sub_test_case("#licenses") do 7 | test("String") do 8 | @metadata.licenses = "Apache-2.0" 9 | assert_equal([Datasets::License.new("Apache-2.0")], 10 | @metadata.licenses) 11 | end 12 | 13 | test("[String]") do 14 | @metadata.licenses = ["Apache-2.0"] 15 | assert_equal([Datasets::License.new("Apache-2.0")], 16 | @metadata.licenses) 17 | end 18 | 19 | test("{name:, url:}") do 20 | @metadata.licenses = { 21 | name: "Quora's Terms of Service", 22 | url: "https://www.quora.com/about/tos", 23 | } 24 | 
assert_equal([Datasets::License.new(nil, 25 | "Quora's Terms of Service", 26 | "https://www.quora.com/about/tos")], 27 | @metadata.licenses) 28 | end 29 | 30 | test("Symbol") do 31 | assert_raise(ArgumentError.new("invalid license: :apache_2_0")) do 32 | @metadata.licenses = :apache_2_0 33 | end 34 | end 35 | end 36 | end 37 | -------------------------------------------------------------------------------- /test/test-mnist.rb: -------------------------------------------------------------------------------- 1 | class MNISTTest < Test::Unit::TestCase 2 | sub_test_case("Normal") do 3 | sub_test_case("train") do 4 | def setup 5 | @dataset = Datasets::MNIST.new(type: :train) 6 | end 7 | 8 | test("#each") do 9 | records = @dataset.each.to_a 10 | assert_equal([ 11 | 60000, 12 | [ 13 | 5, 14 | 784, 15 | [0, 0, 0, 49, 238, 253, 253, 253, 253, 253], 16 | [0, 0, 0, 0, 0, 81, 240, 253, 253, 119], 17 | ], 18 | [8, 19 | 784, 20 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 62], 21 | [0, 0, 190, 196, 14, 2, 97, 254, 252, 146], 22 | ], 23 | ], 24 | [ 25 | records.size, 26 | [ 27 | records[0].label, 28 | records[0].pixels.size, 29 | records[0].pixels[200, 10], 30 | records[0].pixels[400, 10], 31 | ], 32 | [ 33 | records[-1].label, 34 | records[-1].pixels.size, 35 | records[-1].pixels[200, 10], 36 | records[-1].pixels[400, 10], 37 | ], 38 | ]) 39 | end 40 | 41 | test("#to_table") do 42 | table_data = @dataset.to_table 43 | assert_equal([ 44 | [0, 0, 0, 49, 238, 253, 253, 253, 253, 253], 45 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 62], 46 | ], 47 | [ 48 | table_data[:pixels][0][200, 10], 49 | table_data[:pixels][-1][200, 10], 50 | ]) 51 | end 52 | 53 | sub_test_case("#metadata") do 54 | test("#id") do 55 | assert_equal("mnist-train", @dataset.metadata.id) 56 | end 57 | 58 | test("#name") do 59 | assert_equal("MNIST: train", @dataset.metadata.name) 60 | end 61 | end 62 | end 63 | 64 | sub_test_case("test") do 65 | def setup 66 | @dataset = Datasets::MNIST.new(type: :test) 67 | end 68 | 69 | test("#each") do 70 | records = @dataset.each.to_a 71 | assert_equal([ 72 | 10000, 73 | [ 74 | 7, 75 | 784, 76 | [0, 0, 84, 185, 159, 151, 60, 36, 0, 0], 77 | [0, 0, 0, 0, 0, 0, 0, 0, 59, 249], 78 | ], 79 | [ 80 | 6, 81 | 784, 82 | [0, 0, 0, 0, 0, 15, 60, 60, 168, 253], 83 | [253, 253, 132, 64, 0, 0, 18, 43, 157, 171], 84 | ], 85 | ], 86 | [ 87 | records.size, 88 | [ 89 | records[0].label, 90 | records[0].pixels.size, 91 | records[0].pixels[200, 10], 92 | records[0].pixels[400, 10], 93 | ], 94 | [ 95 | records[-1].label, 96 | records[-1].pixels.size, 97 | records[-1].pixels[200, 10], 98 | records[-1].pixels[400, 10], 99 | ], 100 | ]) 101 | end 102 | 103 | test("#to_table") do 104 | table_data = @dataset.to_table 105 | assert_equal([ 106 | [0, 0, 84, 185, 159, 151, 60, 36, 0, 0], 107 | [0, 0, 0, 0, 0, 15, 60, 60, 168, 253], 108 | ], 109 | [ 110 | table_data[:pixels][0][200, 10], 111 | table_data[:pixels][-1][200, 10], 112 | ]) 113 | end 114 | 115 | sub_test_case("#metadata") do 116 | test("#id") do 117 | assert_equal("mnist-test", @dataset.metadata.id) 118 | end 119 | 120 | test("#name") do 121 | assert_equal("MNIST: test", @dataset.metadata.name) 122 | end 123 | end 124 | end 125 | end 126 | 127 | sub_test_case("Abnormal") do 128 | test("invalid type") do 129 | invalid_type = :invalid 130 | message = "Please set type :train or :test: #{invalid_type.inspect}" 131 | assert_raise(ArgumentError.new(message)) do 132 | Datasets::MNIST.new(type: invalid_type) 133 | end 134 | end 135 | end 136 | end 137 | 
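(A minimal usage sketch of the Enumerable/Table API that the MNIST-family tests above exercise. The expected values in the comments mirror the train-set assertions; require "datasets" is the same entry point test/helper.rb uses.)

require "datasets"

mnist = Datasets::MNIST.new(type: :train)
record = mnist.first              # records are Structs with #label and #pixels
record.label                      # => 5
record.pixels.size                # => 784 (28x28 grayscale values, 0-255)

table = mnist.to_table            # column-oriented view (Datasets::Table)
table[:pixels][0][200, 10]        # => [0, 0, 0, 49, 238, 253, 253, 253, 253, 253]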
-------------------------------------------------------------------------------- /test/test-mushroom.rb: -------------------------------------------------------------------------------- 1 | class MushroomTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::Mushroom.new 4 | end 5 | 6 | def record(*args) 7 | Datasets::Mushroom::Record.new(*args) 8 | end 9 | 10 | test("#each") do 11 | records = @dataset.each.to_a 12 | assert_equal([ 13 | 8124, 14 | { 15 | :label => "poisonous", 16 | :cap_shape => "convex", 17 | :cap_surface => "smooth", 18 | :cap_color => "brown", 19 | :bruises => "bruises", 20 | :odor => "pungent", 21 | :gill_attachment => "free", 22 | :gill_spacing => "close", 23 | :gill_size => "narrow", 24 | :gill_color => "black", 25 | :stalk_shape => "enlarging", 26 | :stalk_root => "equal", 27 | :stalk_surface_above_ring => "smooth", 28 | :stalk_surface_below_ring => "smooth", 29 | :stalk_color_above_ring => "white", 30 | :stalk_color_below_ring => "white", 31 | :veil_type => "partial", 32 | :veil_color => "white", 33 | :n_rings => 1, 34 | :ring_type => "pendant", 35 | :spore_print_color => "black", 36 | :population => "scattered", 37 | :habitat => "urban" 38 | }, 39 | { 40 | :label => "edible", 41 | :cap_shape => "convex", 42 | :cap_surface => "smooth", 43 | :cap_color => "brown", 44 | :bruises => "no", 45 | :odor => "none", 46 | :gill_attachment => "attached", 47 | :gill_spacing => "close", 48 | :gill_size => "broad", 49 | :gill_color => "yellow", 50 | :stalk_shape => "enlarging", 51 | :stalk_root => "missing", 52 | :stalk_surface_above_ring => "smooth", 53 | :stalk_surface_below_ring => "smooth", 54 | :stalk_color_above_ring => "orange", 55 | :stalk_color_below_ring => "orange", 56 | :veil_type => "partial", 57 | :veil_color => "orange", 58 | :n_rings => 1, 59 | :ring_type => "pendant", 60 | :spore_print_color => "orange", 61 | :population => "clustered", 62 | :habitat => "leaves" 63 | } 64 | ], 65 | [ 66 | records.size, 67 | records[0].to_h, 68 | records[-1].to_h 69 | ]) 70 | end 71 | 72 | sub_test_case("#metadata") do 73 | test("#description") do 74 | description = @dataset.metadata.description 75 | assert do 76 | description.start_with?("1. 
Title: Mushroom Database") 77 | end 78 | end 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /test/test-nagoya-university-conversation-corpus.rb: -------------------------------------------------------------------------------- 1 | class NagoyaUniversityConversationCorpusTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::NagoyaUniversityConversationCorpus.new 4 | end 5 | 6 | sub_test_case("each") do 7 | test("#sentences") do 8 | first_sentences = @dataset.each.next.sentences 9 | assert_equal([ 10 | 856, 11 | { 12 | participant_id: 'F107', 13 | content: '***の町というのはちいちゃくって、城壁がこう町全体をぐるっと回ってて、それが城壁の上を歩いても1時間ぐらいですよね。', 14 | }, 15 | { 16 | participant_id: nil, 17 | content: nil, 18 | }, 19 | ], 20 | [ 21 | first_sentences.size, 22 | first_sentences[0].to_h, 23 | first_sentences[-1].to_h, 24 | ]) 25 | end 26 | 27 | test("#participants") do 28 | first_participants = @dataset.each.next.participants 29 | assert_equal([ 30 | 4, 31 | { 32 | id: 'F107', 33 | attribute: '女性30代後半', 34 | birthplace: '愛知県幡豆郡出身', 35 | residence: '愛知県幡豆郡在住', 36 | }, 37 | { 38 | id: 'F128', 39 | attribute: '女性20代前半', 40 | birthplace: '愛知県西尾市出身', 41 | residence: '西尾市在住', 42 | }, 43 | ], 44 | [ 45 | first_participants.size, 46 | first_participants[0].to_h, 47 | first_participants[-1].to_h, 48 | ]) 49 | end 50 | 51 | test("others") do 52 | first_record = @dataset.each.next 53 | assert_equal([ 54 | '1(約35分)', 55 | '2001年10月16日', 56 | 'ファミリーレストラン', 57 | '英会話教室の友人', 58 | nil, 59 | ], 60 | [ 61 | first_record.name, 62 | first_record.date, 63 | first_record.place, 64 | first_record.relationships, 65 | first_record.note, 66 | ]) 67 | end 68 | end 69 | 70 | sub_test_case("#metadata") do 71 | test("#description") do 72 | description = @dataset.metadata.description 73 | assert_equal(<<~DESCRIPTION, description) 74 | The "Nagoya University Conversation Corpus" is a corpus of 129 conversations, 75 | total about 100 hours of chatting among native speakers of Japanese, 76 | which is converted into text. 
77 | DESCRIPTION 78 | end 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /test/test-penn-treebank.rb: -------------------------------------------------------------------------------- 1 | class PennTreebankTest < Test::Unit::TestCase 2 | def record(*args) 3 | Datasets::PennTreebank::Record.new(*args) 4 | end 5 | 6 | sub_test_case("type") do 7 | test("train") do 8 | dataset = Datasets::PennTreebank.new(type: :train) 9 | records = dataset.to_a 10 | assert_equal([ 11 | 887521, 12 | record("aer"), 13 | record("<unk>"), 14 | ], 15 | [ 16 | records.size, 17 | records[0], 18 | records[-1], 19 | ]) 20 | end 21 | 22 | test("test") do 23 | dataset = Datasets::PennTreebank.new(type: :test) 24 | records = dataset.to_a 25 | assert_equal([ 26 | 78669, 27 | record("no"), 28 | record("us"), 29 | ], 30 | [ 31 | records.size, 32 | records[0], 33 | records[-1], 34 | ]) 35 | end 36 | 37 | test("valid") do 38 | dataset = Datasets::PennTreebank.new(type: :valid) 39 | records = dataset.to_a 40 | assert_equal([ 41 | 70390, 42 | record("consumers"), 43 | record("N"), 44 | ], 45 | [ 46 | records.size, 47 | records[0], 48 | records[-1], 49 | ]) 50 | end 51 | 52 | test("invalid") do 53 | message = "Type must be one of [:train, :test, :valid]: :invalid" 54 | assert_raise(ArgumentError.new(message)) do 55 | Datasets::PennTreebank.new(type: :invalid) 56 | end 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /test/test-pmjt-dataset-list.rb: -------------------------------------------------------------------------------- 1 | class PMJTDatasetListTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::PMJTDatasetList.new 4 | end 5 | 6 | test("#each") do 7 | records = @dataset.each.to_a 8 | 9 | record_first = Datasets::PMJTDatasetList::Record.new 10 | record_first.unit = '冊' 11 | record_first.open_data_category = '総記' 12 | record_first.tag = nil 13 | record_first.release_time = 'H31.1' 14 | record_first.n_volumes = '2' 15 | record_first.type = '刊' 16 | record_first.publication_year = '元禄9' 17 | record_first.original_request_code = '99-37-1~2' 18 | record_first.id = '200003090' 19 | record_first.title = '人倫重宝記' 20 | record_first.text = nil 21 | record_first.bibliographical_introduction = nil 22 | record_first.year = nil 23 | 24 | record_last = Datasets::PMJTDatasetList::Record.new 25 | record_last.unit = '冊' 26 | record_last.open_data_category = '総記' 27 | record_last.tag = nil 28 | record_last.release_time = 'H27.11' 29 | record_last.n_volumes = '1' 30 | record_last.type = '刊' 31 | record_last.publication_year = '慶応2' 32 | record_last.original_request_code = '49-173' 33 | record_last.id = '200021837' 34 | record_last.title = '洋学便覧' 35 | record_last.text = nil 36 | record_last.bibliographical_introduction = '○' 37 | record_last.year = '1866' 38 | 39 | assert_equal([ 40 | 3126, 41 | record_first, 42 | record_last 43 | ], 44 | [ 45 | records.size, 46 | records[1], 47 | records[-1] 48 | ]) 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /test/test-postal-code-japan.rb: -------------------------------------------------------------------------------- 1 | class PostalCodeJapanTest < Test::Unit::TestCase 2 | test("invalid") do 3 | message = ":reading must be one of [:lowercase, :uppercase, :romaji]: :invalid" 4 | assert_raise(ArgumentError.new(message)) do 5 | Datasets::PostalCodeJapan.new(reading: :invalid) 6 | end 7 | end 8 | 9 |
--------------------------------------------------------------------------------
/test/test-postal-code-japan.rb:
--------------------------------------------------------------------------------
1 | class PostalCodeJapanTest < Test::Unit::TestCase
2 |   test("invalid") do
3 |     message = ":reading must be one of [:lowercase, :uppercase, :romaji]: :invalid"
4 |     assert_raise(ArgumentError.new(message)) do
5 |       Datasets::PostalCodeJapan.new(reading: :invalid)
6 |     end
7 |   end
8 |
9 |   sub_test_case(":reading") do
10 |     test(":lowercase") do
11 |       dataset = Datasets::PostalCodeJapan.new(reading: :lowercase)
12 |       assert_equal({
13 |         organization_code: "01101",
14 |         old_postal_code: "060",
15 |         postal_code: "0600000",
16 |         prefecture_reading: "ホッカイドウ",
17 |         city_reading: "サッポロシチュウオウク",
18 |         address_reading: "イカニケイサイガナイバアイ",
19 |         prefecture: "北海道",
20 |         city: "札幌市中央区",
21 |         address: "以下に掲載がない場合",
22 |         have_multiple_postal_codes: false,
23 |         have_address_number_per_koaza: false,
24 |         have_chome: false,
25 |         postal_code_is_shared: false,
26 |         changed: false,
27 |         change_reason: nil,
28 |       },
29 |       dataset.first.to_h)
30 |     end
31 |
32 |     test(":uppercase") do
33 |       dataset = Datasets::PostalCodeJapan.new(reading: :uppercase)
34 |       assert_equal({
35 |         organization_code: "01101",
36 |         old_postal_code: "060",
37 |         postal_code: "0600000",
38 |         prefecture_reading: "ホツカイドウ",
39 |         city_reading: "サツポロシチユウオウク",
40 |         address_reading: "イカニケイサイガナイバアイ",
41 |         prefecture: "北海道",
42 |         city: "札幌市中央区",
43 |         address: "以下に掲載がない場合",
44 |         have_multiple_postal_codes: false,
45 |         have_address_number_per_koaza: false,
46 |         have_chome: false,
47 |         postal_code_is_shared: false,
48 |         changed: false,
49 |         change_reason: nil,
50 |       },
51 |       dataset.first.to_h)
52 |     end
53 |
54 |     test(":romaji") do
55 |       dataset = Datasets::PostalCodeJapan.new(reading: :romaji)
56 |       assert_equal({
57 |         organization_code: nil,
58 |         old_postal_code: nil,
59 |         postal_code: "0600000",
60 |         prefecture_reading: "HOKKAIDO",
61 |         city_reading: "SAPPORO SHI CHUO KU",
62 |         address_reading: "IKANIKEISAIGANAIBAAI",
63 |         prefecture: "北海道",
64 |         city: "札幌市 中央区",
65 |         address: "以下に掲載がない場合",
66 |         have_multiple_postal_codes: false,
67 |         have_address_number_per_koaza: false,
68 |         have_chome: false,
69 |         postal_code_is_shared: false,
70 |         changed: false,
71 |         change_reason: nil,
72 |       },
73 |       dataset.first.to_h)
74 |     end
75 |   end
76 | end
77 |
--------------------------------------------------------------------------------
/test/test-quora-duplicate-question-pair.rb:
--------------------------------------------------------------------------------
1 | class QuoraDuplicateQuestionPairTest < Test::Unit::TestCase
2 |   def setup
3 |     @dataset = Datasets::QuoraDuplicateQuestionPair.new
4 |   end
5 |
6 |   def record(*args)
7 |     Datasets::QuoraDuplicateQuestionPair::Record.new(*args)
8 |   end
9 |
10 |   test("#each") do
11 |     assert_equal(record(0,
12 |       1,
13 |       2,
14 |       "What is the step by step guide to invest in share market in india?",
15 |       "What is the step by step guide to invest in share market?",
16 |       false),
17 |       @dataset.each.next)
18 |   end
19 | end
20 |
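As the assertions above show, reading: selects how readings are rendered (:lowercase, :uppercase, or :romaji) and affects which fields are populated. A hedged sketch using only accessors visible in the test:

require "datasets"

# Hedged sketch: print romaji readings for the first few records.
postal_codes = Datasets::PostalCodeJapan.new(reading: :romaji)
postal_codes.each_with_index do |record, i|
  puts "#{record.postal_code} #{record.prefecture_reading} #{record.city_reading}"
  break if i >= 2 # stop early; the full table is large
end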
"taxis"}, 30 | {dataset: "tips"}, 31 | {dataset: "titanic"}, 32 | ], 33 | records) 34 | end 35 | end 36 | 37 | sub_test_case("fmri") do 38 | def setup 39 | @dataset = Datasets::Seaborn.new("fmri") 40 | end 41 | 42 | def test_each 43 | records = @dataset.each.to_a 44 | assert_equal([ 45 | 1064, 46 | { 47 | subject: "s5", 48 | timepoint: 14, 49 | event: "stim", 50 | region: "parietal", 51 | signal: -0.0808829319505 52 | }, 53 | { 54 | subject: "s0", 55 | timepoint: 0, 56 | event: "cue", 57 | region: "parietal", 58 | signal: -0.00689923478092 59 | } 60 | ], 61 | [ 62 | records.size, 63 | records[1].to_h, 64 | records[-1].to_h 65 | ]) 66 | end 67 | end 68 | 69 | sub_test_case("flights") do 70 | def setup 71 | @dataset = Datasets::Seaborn.new("flights") 72 | end 73 | 74 | def test_each 75 | records = @dataset.each.to_a 76 | assert_equal([ 77 | 144, 78 | { 79 | year: 1949, 80 | month: "Feb", 81 | passengers: 118 82 | }, 83 | { 84 | year: 1960, 85 | month: "Dec", 86 | passengers: 432 87 | } 88 | ], 89 | [ 90 | records.size, 91 | records[1].to_h, 92 | records[-1].to_h 93 | ]) 94 | end 95 | end 96 | 97 | sub_test_case("penguins") do 98 | def setup 99 | @dataset = Datasets::Seaborn.new("penguins") 100 | end 101 | 102 | def test_each 103 | records = @dataset.each.to_a 104 | assert_equal([ 105 | 344, 106 | { 107 | species: "Adelie", 108 | island: "Torgersen", 109 | bill_length_mm: 39.5, 110 | bill_depth_mm: 17.4, 111 | flipper_length_mm: 186, 112 | body_mass_g: 3800, 113 | sex: "Female" 114 | }, 115 | { 116 | species: "Gentoo", 117 | island: "Biscoe", 118 | bill_length_mm: 49.9, 119 | bill_depth_mm: 16.1, 120 | flipper_length_mm: 213, 121 | body_mass_g: 5400, 122 | sex: "Male" 123 | } 124 | ], 125 | [ 126 | records.size, 127 | records[1].to_h, 128 | records[-1].to_h 129 | ]) 130 | end 131 | end 132 | 133 | sub_test_case("attention") do 134 | def setup 135 | @dataset = Datasets::Seaborn.new("attention") 136 | end 137 | 138 | def test_each 139 | records = @dataset.to_a 140 | assert_equal([ 141 | 60, 142 | { 143 | index: 1, 144 | subject: 2, 145 | attention: "divided", 146 | solutions: 1, 147 | score: 3.0 148 | }, 149 | { 150 | index: 59, 151 | subject: 20, 152 | attention: "focused", 153 | solutions: 3, 154 | score: 5.0 155 | } 156 | ], 157 | [ 158 | records.size, 159 | records[1], 160 | records[-1] 161 | ]) 162 | end 163 | end 164 | end 165 | -------------------------------------------------------------------------------- /test/test-sudachi-synonym-dictionary.rb: -------------------------------------------------------------------------------- 1 | class SudachiSynonymDictionaryTest < Test::Unit::TestCase 2 | def setup 3 | @dataset = Datasets::SudachiSynonymDictionary.new 4 | end 5 | 6 | test('#each') do 7 | assert_equal({ 8 | group_id: "000001", 9 | is_noun: true, 10 | expansion_type: :always, 11 | lexeme_id: 1, 12 | form_type: :typical, 13 | acronym_type: :typical, 14 | variant_type: :typical, 15 | categories: [], 16 | notation: "曖昧", 17 | }, 18 | @dataset.each.next.to_h) 19 | end 20 | 21 | sub_test_case('#metadata') do 22 | test('#description') do 23 | description = @dataset.metadata.description 24 | assert do 25 | description.start_with?('# Sudachi 同義語辞書') 26 | end 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /test/test-table.rb: -------------------------------------------------------------------------------- 1 | class TableTest < Test::Unit::TestCase 2 | def setup 3 | @table = Datasets::Iris.new.to_table 4 | end 5 | 6 | 
--------------------------------------------------------------------------------
/test/test-table.rb:
--------------------------------------------------------------------------------
1 | class TableTest < Test::Unit::TestCase
2 |   def setup
3 |     @table = Datasets::Iris.new.to_table
4 |   end
5 |
6 |   test("#n_columns") do
7 |     assert_equal(5, @table.n_columns)
8 |   end
9 |
10 |   test("#n_rows") do
11 |     assert_equal(150, @table.n_rows)
12 |   end
13 |
14 |   test("#column_names") do
15 |     assert_equal([
16 |       :sepal_length,
17 |       :sepal_width,
18 |       :petal_length,
19 |       :petal_width,
20 |       :label,
21 |     ],
22 |     @table.column_names)
23 |   end
24 |
25 |   test("#each") do
26 |     shorten_hash = {}
27 |     @table.each do |name, values|
28 |       shorten_hash[name] = values.first(5)
29 |     end
30 |     assert_equal({
31 |       :label => ["Iris-setosa"] * 5,
32 |       :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
33 |       :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
34 |       :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
35 |       :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
36 |     },
37 |     shorten_hash)
38 |   end
39 |
40 |   test("#each_column") do
41 |     shorten_hash = {}
42 |     @table.each_column do |name, values|
43 |       shorten_hash[name] = values.first(5)
44 |     end
45 |     assert_equal({
46 |       :label => ["Iris-setosa"] * 5,
47 |       :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
48 |       :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
49 |       :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
50 |       :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
51 |     },
52 |     shorten_hash)
53 |   end
54 |
55 |   test("#each_record") do
56 |     records = []
57 |     @table.each_record do |record|
58 |       records << record
59 |       break if records.size == 3
60 |     end
61 |     assert_equal([
62 |       {
63 |         label: "Iris-setosa",
64 |         petal_length: 1.4,
65 |         petal_width: 0.2,
66 |         sepal_length: 5.1,
67 |         sepal_width: 3.5,
68 |       },
69 |       {
70 |         label: "Iris-setosa",
71 |         petal_length: 1.4,
72 |         petal_width: 0.2,
73 |         sepal_length: 4.9,
74 |         sepal_width: 3.0,
75 |       },
76 |       {
77 |         label: "Iris-setosa",
78 |         petal_length: 1.3,
79 |         petal_width: 0.2,
80 |         sepal_length: 4.7,
81 |         sepal_width: 3.2,
82 |       },
83 |     ],
84 |     records.collect(&:to_h))
85 |   end
86 |
87 |   sub_test_case("#find_record") do
88 |     test("positive") do
89 |       assert_equal({
90 |         label: "Iris-setosa",
91 |         petal_length: 1.4,
92 |         petal_width: 0.2,
93 |         sepal_length: 4.9,
94 |         sepal_width: 3.0,
95 |       },
96 |       @table.find_record(1).to_h)
97 |     end
98 |
99 |     test("positive - over") do
100 |       assert_nil(@table.find_record(151))
101 |     end
102 |
103 |     test("negative") do
104 |       assert_equal({
105 |         label: "Iris-virginica",
106 |         petal_length: 5.1,
107 |         petal_width: 1.8,
108 |         sepal_length: 5.9,
109 |         sepal_width: 3.0,
110 |       },
111 |       @table.find_record(-1).to_h)
112 |     end
113 |
114 |     test("negative - over") do
115 |       assert_nil(@table.find_record(-151))
116 |     end
117 |   end
118 |
119 |   sub_test_case("#[]") do
120 |     test("index") do
121 |       assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
122 |         @table[2].first(5))
123 |     end
124 |
125 |     test("name") do
126 |       assert_equal([1.4, 1.4, 1.3, 1.5, 1.4],
127 |         @table[:petal_length].first(5))
128 |     end
129 |   end
130 |
131 |   test("#dictionary_encode") do
132 |     assert_equal([
133 |       [0, "Iris-setosa"],
134 |       [1, "Iris-versicolor"],
135 |       [2, "Iris-virginica"],
136 |     ],
137 |     @table.dictionary_encode(:label).to_a)
138 |   end
139 |
140 |   test("#label_encode") do
141 |     label_encoded_labels = @table.label_encode(:label)
142 |     labels = @table[:label]
143 |     assert_equal([0, 1, 2],
144 |       [
145 |         label_encoded_labels[labels.find_index("Iris-setosa")],
146 |         label_encoded_labels[labels.find_index("Iris-versicolor")],
147 |         label_encoded_labels[labels.find_index("Iris-virginica")],
148 |       ])
149 |   end
150 |
151 |   sub_test_case("#fetch_values") do
152 |     test("found") do
153 |       values = @table.fetch_values(:petal_length, :petal_width)
154 |       assert_equal([
155 |         [1.4, 1.4, 1.3, 1.5, 1.4],
156 |         [0.2, 0.2, 0.2, 0.2, 0.2],
157 |       ],
158 |       values.collect {|v| v.first(5)})
159 |     end
160 |
161 |     sub_test_case("not found") do
162 |       test("with block") do
163 |         values = @table.fetch_values(:petal_length, :unknown) do |key|
164 |           [key] * 5
165 |         end
166 |         assert_equal([
167 |           [1.4, 1.4, 1.3, 1.5, 1.4],
168 |           [:unknown] * 5,
169 |         ],
170 |         values.collect {|v| v.first(5)})
171 |       end
172 |
173 |       test("without block") do
174 |         assert_raise(KeyError) do
175 |           @table.fetch_values(:unknown)
176 |         end
177 |       end
178 |     end
179 |   end
180 |
181 |   test("#to_h") do
182 |     shorten_hash = {}
183 |     @table.to_h.each do |name, values|
184 |       shorten_hash[name] = values.first(5)
185 |     end
186 |     assert_equal({
187 |       :label => ["Iris-setosa"] * 5,
188 |       :petal_length => [1.4, 1.4, 1.3, 1.5, 1.4],
189 |       :petal_width => [0.2, 0.2, 0.2, 0.2, 0.2],
190 |       :sepal_length => [5.1, 4.9, 4.7, 4.6, 5.0],
191 |       :sepal_width => [3.5, 3.0, 3.2, 3.1, 3.6],
192 |     },
193 |     shorten_hash)
194 |   end
195 | end
196 |
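The Table API covered above turns any dataset into column-oriented data. A short sketch restricted to methods the test demonstrates:

require "datasets"

table = Datasets::Iris.new.to_table
p table.column_names            # => [:sepal_length, :sepal_width, ...]
p table[:petal_length].first(5) # column access by name
p table.find_record(0).to_h     # row access by index
p table.label_encode(:label).first(5)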
--------------------------------------------------------------------------------
/test/test-wikipedia.rb:
--------------------------------------------------------------------------------
1 | class WikipediaTest < Test::Unit::TestCase
2 |   sub_test_case("en") do
3 |     sub_test_case("articles") do
4 |       def setup
5 |         @dataset = Datasets::Wikipedia.new(language: :en,
6 |                                            type: :articles)
7 |       end
8 |
9 |       test("#each") do
10 |         contributor = Datasets::Wikipedia::Contributor.new("Asparagusus", 43603280)
11 |         revision = Datasets::Wikipedia::Revision.new
12 |         revision.id = 1219062925
13 |         revision.parent_id = 1219062840
14 |         revision.timestamp = Time.iso8601("2024-04-15T14:38:04Z")
15 |         revision.contributor = contributor
16 |         revision.comment = "Restored revision 1002250816 by [[Special:Contributions/Elli|Elli]] ([[User talk:Elli|talk]]): Unexplained redirect breaking"
17 |         revision.model = "wikitext"
18 |         revision.format = "text/x-wiki"
19 |         revision.text = <<-TEXT.chomp
20 | #REDIRECT [[Computer accessibility]]
21 |
22 | {{rcat shell|
23 | {{R from move}}
24 | {{R from CamelCase}}
25 | {{R unprintworthy}}
26 | }}
27 |         TEXT
28 |         revision.sha1 = "kmysdltgexdwkv2xsml3j44jb56dxvn"
29 |         page = Datasets::Wikipedia::Page.new
30 |         page.title = "AccessibleComputing"
31 |         page.namespace = 0
32 |         page.id = 10
33 |         page.restrictions = nil
34 |         page.redirect = "Computer accessibility"
35 |         page.revision = revision
36 |         assert_equal(page, @dataset.each.first)
37 |       end
38 |
39 |       sub_test_case("#metadata") do
40 |         test("#id") do
41 |           assert_equal("wikipedia-en-articles",
42 |             @dataset.metadata.id)
43 |         end
44 |
45 |         test("#name") do
46 |           assert_equal("Wikipedia articles (en)",
47 |             @dataset.metadata.name)
48 |         end
49 |
50 |         test("#description") do
51 |           assert_equal("Wikipedia articles in en",
52 |             @dataset.metadata.description)
53 |         end
54 |       end
55 |     end
56 |   end
57 | end
58 |
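Note that iterating this dataset streams a full Wikipedia dump, which is very large, so real runs should stop early. A hedged sketch using only the page accessors that appear in the test:

require "datasets"

# Hedged sketch: fetch only the first page to avoid streaming the whole dump.
wikipedia = Datasets::Wikipedia.new(language: :en, type: :articles)
wikipedia.each do |page|
  puts "#{page.id}: #{page.title}"
  break
end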
--------------------------------------------------------------------------------
/test/test-wine.rb:
--------------------------------------------------------------------------------
1 | class WineTest < Test::Unit::TestCase
2 |   def setup
3 |     @dataset = Datasets::Wine.new
4 |   end
5 |
6 |   test('#each') do
7 |     records = @dataset.each.to_a
8 |     assert_equal([
9 |       178,
10 |       {
11 |         :alcalinity_of_ash => 15.6,
12 |         :alcohol => 14.23,
13 |         :ash => 2.43,
14 |         :label => 1,
15 |         :color_intensity => 5.64,
16 |         :hue => 1.04,
17 |         :malic_acid => 1.71,
18 |         :total_flavonoids => 3.06,
19 |         :n_magnesiums => 127,
20 |         :total_nonflavanoid_phenols => 0.28,
21 |         :total_proanthocyanins => 2.29,
22 |         :n_prolines => 1065,
23 |         :optical_nucleic_acid_concentration => 3.92,
24 |         :total_phenols => 2.8
25 |       },
26 |       {
27 |         :alcalinity_of_ash => 24.5,
28 |         :alcohol => 14.13,
29 |         :ash => 2.74,
30 |         :label => 3,
31 |         :color_intensity => 9.2,
32 |         :hue => 0.61,
33 |         :malic_acid => 4.1,
34 |         :total_flavonoids => 0.76,
35 |         :n_magnesiums => 96,
36 |         :total_nonflavanoid_phenols => 0.56,
37 |         :total_proanthocyanins => 1.35,
38 |         :n_prolines => 560,
39 |         :optical_nucleic_acid_concentration => 1.6,
40 |         :total_phenols => 2.05,
41 |       },
42 |     ],
43 |     [
44 |       records.size,
45 |       records[0].to_h,
46 |       records[-1].to_h,
47 |     ])
48 |   end
49 |
50 |   sub_test_case('#metadata') do
51 |     test('#description') do
52 |       description = @dataset.metadata.description
53 |       assert do
54 |         description.start_with?('1. Title of Database: Wine recognition data')
55 |       end
56 |     end
57 |   end
58 | end
59 |
--------------------------------------------------------------------------------